# Imago/app/services/ocr_service.py

"""
OCR service — text extraction via Tesseract
"""
import logging
import io
from pathlib import Path
from PIL import Image as PILImage
from app.config import settings
from app.services.storage_backend import get_storage_backend

logger = logging.getLogger(__name__)

# pytesseract is treated as an optional dependency: if importing it fails for
# any reason, the module stays importable, `pytesseract` is set to None and the
# error is remembered so that extract_text() can log it and return an empty
# result instead of crashing.
try:
    import pytesseract
    _ocr_import_error: Exception | None = None
except Exception as e:
    pytesseract = None
    _ocr_import_error = e


def _detect_language(text: str) -> str:
    """Détection grossière de la langue à partir du texte extrait."""
    if not text:
        return "unknown"

    # Mots communs français
    fr_words = {"le", "la", "les", "de", "du", "des", "un", "une", "et", "en", "est", "que"}
    # Mots communs anglais
    en_words = {"the", "is", "are", "and", "or", "of", "to", "in", "a", "an", "for", "with"}

    words = set(text.lower().split())
    fr_score = len(words & fr_words)
    en_score = len(words & en_words)

    if fr_score == 0 and en_score == 0:
        return "unknown"
    return "fr" if fr_score >= en_score else "en"


def _mean_confidence(raw_confidences) -> float:
    """Average per-entry OCR confidences and scale the result to 0.0-1.0.

    Entries may be ints, floats or numeric strings (``int("96.5")`` would
    raise, so values are parsed with ``float``). Negative entries (the ``-1``
    placeholder) and blank or non-numeric entries are skipped.

    Returns:
        The mean of the remaining values divided by 100 and rounded to three
        decimals, or ``0.0`` when no usable value is left.
    """
    scores = []
    for raw in raw_confidences:
        try:
            score = float(raw)
        except (TypeError, ValueError):
            continue
        if score >= 0:
            scores.append(score)

    if not scores:
        return 0.0
    return round(sum(scores) / len(scores) / 100, 3)


async def extract_text(file_path: str) -> dict:
    """Extract text from an image with Tesseract OCR.

    The image bytes are fetched through the configured storage backend and
    decoded in memory, so no local file path is needed.

    Args:
        file_path: Key/path of the image, passed as-is to the storage
            backend's ``get_bytes``.

    Returns:
        A dict with the keys ``text``, ``language``, ``confidence`` and
        ``has_text``. ``has_text`` is True (and the other keys are filled)
        only when the stripped OCR output is longer than 3 characters.
        When OCR is disabled, pytesseract is unavailable, or reading /
        recognizing the image fails, the default result is returned
        (``None`` values, ``has_text`` False); failures are logged rather
        than raised.
    """
    result = {
        "text": None,
        "language": None,
        "confidence": None,
        "has_text": False,
    }

    if not settings.OCR_ENABLED:
        return result

    if pytesseract is None:
        logger.warning("ocr.unavailable", extra={"error": str(_ocr_import_error)})
        return result

    try:
        # Read the image through the storage backend.
        backend = get_storage_backend()
        image_bytes = await backend.get_bytes(file_path)

        # Point pytesseract at a custom Tesseract binary when configured.
        if settings.TESSERACT_CMD:
            pytesseract.pytesseract.tesseract_cmd = settings.TESSERACT_CMD

        # NOTE(review): the pytesseract calls below are synchronous and run
        # directly inside this coroutine — consider offloading them if
        # event-loop latency matters.
        with PILImage.open(io.BytesIO(image_bytes)) as img:
            # Any mode other than RGB / grayscale is converted to RGB first.
            ocr_input = img if img.mode in ("RGB", "L") else img.convert("RGB")

            # Per-entry data, used only for the confidence values.
            data = pytesseract.image_to_data(
                ocr_input,
                lang=settings.OCR_LANGUAGES,
                output_type=pytesseract.Output.DICT,
            )

            # Plain text output, trimmed of surrounding whitespace.
            raw_text = pytesseract.image_to_string(
                ocr_input,
                lang=settings.OCR_LANGUAGES,
            ).strip()

        avg_confidence = _mean_confidence(data["conf"])

        # Very short outputs (3 characters or fewer) are treated as "no text".
        if len(raw_text) > 3:
            result["text"] = raw_text
            result["has_text"] = True
            result["confidence"] = avg_confidence
            result["language"] = _detect_language(raw_text)

    except pytesseract.TesseractNotFoundError:
        logger.warning("ocr.tesseract_not_found")
    except Exception as e:
        logger.error("ocr.extraction_error", extra={"file": file_path, "error": str(e)})

    return result