""" Service OCR — extraction de texte via Tesseract """ import logging import io from pathlib import Path from PIL import Image as PILImage from app.config import settings from app.services.storage_backend import get_storage_backend logger = logging.getLogger(__name__) try: import pytesseract _ocr_import_error: Exception | None = None except Exception as e: pytesseract = None _ocr_import_error = e def _detect_language(text: str) -> str: """Détection grossière de la langue à partir du texte extrait.""" if not text: return "unknown" # Mots communs français fr_words = {"le", "la", "les", "de", "du", "des", "un", "une", "et", "en", "est", "que"} # Mots communs anglais en_words = {"the", "is", "are", "and", "or", "of", "to", "in", "a", "an", "for", "with"} words = set(text.lower().split()) fr_score = len(words & fr_words) en_score = len(words & en_words) if fr_score == 0 and en_score == 0: return "unknown" return "fr" if fr_score >= en_score else "en" async def extract_text(file_path: str) -> dict: """ Extrait le texte d'une image via Tesseract OCR. Supporte Local et S3 via StorageBackend (lecture en mémoire). """ result = { "text": None, "language": None, "confidence": None, "has_text": False, } if not settings.OCR_ENABLED: return result if pytesseract is None: logger.warning("ocr.unavailable", extra={"error": str(_ocr_import_error)}) return result try: # Lecture via le backend backend = get_storage_backend() image_bytes = await backend.get_bytes(file_path) # Configuration Tesseract if settings.TESSERACT_CMD: pytesseract.pytesseract.tesseract_cmd = settings.TESSERACT_CMD with PILImage.open(io.BytesIO(image_bytes)) as img: # Convertit en RGB si nécessaire if img.mode not in ("RGB", "L"): img = img.convert("RGB") # Extraction avec données de confiance data = pytesseract.image_to_data( img, lang=settings.OCR_LANGUAGES, output_type=pytesseract.Output.DICT, ) # Calcul de la confiance moyenne (on ignore les -1) confidences = [ int(c) for c in data["conf"] if str(c).strip() not in ("-1", "") ] avg_confidence = ( round(sum(confidences) / len(confidences) / 100, 3) if confidences else 0.0 ) # Texte nettoyé raw_text = pytesseract.image_to_string( img, lang=settings.OCR_LANGUAGES, ).strip() if raw_text and len(raw_text) > 3: result["text"] = raw_text result["has_text"] = True result["confidence"] = avg_confidence result["language"] = _detect_language(raw_text) else: result["has_text"] = False except pytesseract.TesseractNotFoundError: logger.warning("ocr.tesseract_not_found") except Exception as e: logger.error("ocr.extraction_error", extra={"file": file_path, "error": str(e)}) return result