Imago/app/services/ocr_service.py
Bruno Charest cc99fea20a
Some checks failed
CI / Lint & Format (push) Has been cancelled
CI / Tests (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
Add comprehensive test suite for image processing and related services
- Implement tests for database generator to ensure proper session handling.
- Create tests for EXIF extraction and conversion functions.
- Add tests for image-related endpoints, ensuring proper data retrieval and isolation between clients.
- Develop tests for OCR functionality, including language detection and text extraction.
- Introduce tests for the image processing pipeline, covering success and failure scenarios.
- Validate rate limiting functionality and ensure independent counters for different clients.
- Implement scraper tests to verify HTML content fetching and error handling.
- Add unit tests for various services, including storage and filename generation.
- Establish worker entry point for ARQ to handle background image processing tasks.
2026-02-24 11:22:10 -05:00

108 lines
3.1 KiB
Python

"""
Service OCR — extraction de texte via Tesseract
"""
import logging
from pathlib import Path
from PIL import Image as PILImage
from app.config import settings
logger = logging.getLogger(__name__)
try:
import pytesseract
_ocr_import_error: Exception | None = None
except Exception as e:
pytesseract = None
_ocr_import_error = e
# Characters removed from both ends of a token before the stop-word lookup,
# so that "the," or "(le)" still count as "the" / "le".
_TOKEN_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}<>«»“”‘’…-–—"

# Common French function words.
_FR_STOP_WORDS = frozenset(
    {"le", "la", "les", "de", "du", "des", "un", "une", "et", "en", "est", "que"}
)
# Common English function words.
_EN_STOP_WORDS = frozenset(
    {"the", "is", "are", "and", "or", "of", "to", "in", "a", "an", "for", "with"}
)


def _detect_language(text: str) -> str:
    """Roughly guess the language of *text* from common function words.

    The text is lower-cased and split on whitespace; punctuation attached to
    either end of a token is stripped, and the distinct tokens are compared
    against small French and English stop-word sets.

    Args:
        text: Text to classify (may be empty).

    Returns:
        ``"fr"`` or ``"en"`` depending on which set has more distinct matches
        (ties go to ``"fr"``), or ``"unknown"`` when the text is empty or
        matches neither set.
    """
    if not text:
        return "unknown"

    # A set is used on purpose: each stop word counts once, however often it
    # appears in the text.
    words = {
        token.strip(_TOKEN_EDGE_PUNCTUATION) for token in text.lower().split()
    }
    fr_score = len(words & _FR_STOP_WORDS)
    en_score = len(words & _EN_STOP_WORDS)

    if fr_score == 0 and en_score == 0:
        return "unknown"
    return "fr" if fr_score >= en_score else "en"
def _mean_confidence(raw_confidences: list) -> float:
    """Average per-box OCR confidences and scale the result to the 0..1 range.

    Entries may be ints, floats or strings. Negative values (the ``-1``
    entries the original code ignored) and entries that cannot be parsed as
    a number are skipped.

    Returns:
        The mean of the remaining values divided by 100 and rounded to three
        decimals, or ``0.0`` when no usable value is left.
    """
    scores = []
    for raw in raw_confidences:
        try:
            # float() rather than int(): a decimal string such as "96.43"
            # would make int() raise ValueError.
            score = float(raw)
        except (TypeError, ValueError):
            continue
        if score >= 0:
            scores.append(score)

    if not scores:
        return 0.0
    return round(sum(scores) / len(scores) / 100, 3)


def extract_text(file_path: str) -> dict:
    """Extract the text of an image with Tesseract OCR.

    Args:
        file_path: Path of the image file to analyse.

    Returns:
        A dict with the keys ``text``, ``language``, ``confidence`` and
        ``has_text``. When OCR is disabled in the settings, pytesseract is
        not importable, the file does not exist, the OCR step fails, or the
        recognised text is 3 characters or shorter, the first three keys are
        ``None`` and ``has_text`` is ``False``. Failures inside the OCR step
        are logged rather than raised.
    """
    result = {
        "text": None,
        "language": None,
        "confidence": None,
        "has_text": False,
    }

    if not settings.OCR_ENABLED:
        return result

    if pytesseract is None:
        logger.warning("ocr.unavailable", extra={"error": str(_ocr_import_error)})
        return result

    path = Path(file_path)
    if not path.exists():
        return result

    try:
        # Tesseract configuration: use the configured binary when one is set.
        if settings.TESSERACT_CMD:
            pytesseract.pytesseract.tesseract_cmd = settings.TESSERACT_CMD

        with PILImage.open(path) as img:
            # Convert to RGB unless the image is already RGB or greyscale.
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")

            # First pass: per-box data, used only for the confidence score.
            data = pytesseract.image_to_data(
                img,
                lang=settings.OCR_LANGUAGES,
                output_type=pytesseract.Output.DICT,
            )
            avg_confidence = _mean_confidence(data["conf"])

            # Second pass: the recognised text itself, trimmed.
            raw_text = pytesseract.image_to_string(
                img,
                lang=settings.OCR_LANGUAGES,
            ).strip()

        # Results of 3 characters or fewer are treated as "no text".
        if len(raw_text) > 3:
            result["text"] = raw_text
            result["has_text"] = True
            result["confidence"] = avg_confidence
            result["language"] = _detect_language(raw_text)

    except pytesseract.TesseractNotFoundError:
        logger.warning("ocr.tesseract_not_found")
    except Exception as e:
        logger.error("ocr.extraction_error", extra={"file": file_path, "error": str(e)})

    return result