Imago/app/services/ocr_service.py

110 lines
3.3 KiB
Python

"""
Service OCR — extraction de texte via Tesseract
"""
import asyncio
import io
import logging
from pathlib import Path

from PIL import Image as PILImage

from app.config import settings
from app.services.storage_backend import get_storage_backend
logger = logging.getLogger(__name__)
# pytesseract is treated as optional: if importing it fails for any reason,
# the module stays importable, `pytesseract` is set to None and the error is
# remembered so extract_text can report why OCR is unavailable.
try:
    import pytesseract
    _ocr_import_error: Exception | None = None
except Exception as e:
    pytesseract = None
    # Kept for the "ocr.unavailable" warning logged by extract_text.
    _ocr_import_error = e
def _detect_language(text: str) -> str:
    """Roughly guess whether *text* is French or English.

    The guess counts how many distinct common function words of each
    language occur in the text. Punctuation surrounding a token is ignored,
    so ``"the,"`` or ``"(le)"`` still count as ``"the"`` / ``"le"``.

    Args:
        text: Text extracted by OCR; may be empty or ``None``.

    Returns:
        ``"fr"`` or ``"en"``, or ``"unknown"`` when the text is empty or
        contains none of the known words. Ties are resolved as ``"fr"``.
    """
    if not text:
        return "unknown"

    # Common French words
    fr_words = {"le", "la", "les", "de", "du", "des", "un", "une", "et", "en", "est", "que"}
    # Common English words
    en_words = {"the", "is", "are", "and", "or", "of", "to", "in", "a", "an", "for", "with"}

    # OCR output keeps punctuation glued to words; strip it from both ends of
    # each token, otherwise "the." or "(le)" would never match the sets above.
    punctuation = ".,;:!?\"'()[]{}<>«»…-–—"
    words = {token.strip(punctuation) for token in text.lower().split()}

    fr_score = len(words & fr_words)
    en_score = len(words & en_words)
    if fr_score == 0 and en_score == 0:
        return "unknown"
    return "fr" if fr_score >= en_score else "en"
def _mean_confidence(raw_confidences) -> float:
    """Return the mean word confidence scaled from 0-100 down to 0.0-1.0.

    Depending on the Tesseract / pytesseract versions in use, the ``conf``
    column may contain ints, numeric strings, decimal strings such as
    ``"96.48"`` or floats, so every value is parsed with ``float``. Negative
    values (Tesseract reports -1 for entries without a confidence) and
    unparseable values are skipped.

    Args:
        raw_confidences: Iterable of raw ``conf`` values.

    Returns:
        The mean confidence rounded to 3 decimals, or ``0.0`` when no usable
        value is present.
    """
    scores = []
    for raw in raw_confidences:
        try:
            score = float(raw)
        except (TypeError, ValueError):
            continue
        if score >= 0:
            scores.append(score)
    if not scores:
        return 0.0
    return round(sum(scores) / len(scores) / 100, 3)


def _run_tesseract(image_bytes: bytes) -> tuple[str, float]:
    """Run Tesseract on an in-memory image and return ``(text, confidence)``.

    This is blocking work (pytesseract is called synchronously), so it is
    meant to be executed off the event loop.
    """
    # Tesseract configuration
    if settings.TESSERACT_CMD:
        pytesseract.pytesseract.tesseract_cmd = settings.TESSERACT_CMD

    with PILImage.open(io.BytesIO(image_bytes)) as source:
        # Convert to RGB unless the image is already RGB or greyscale ("L").
        img = source if source.mode in ("RGB", "L") else source.convert("RGB")

        # Per-word data, used only for the confidence values.
        data = pytesseract.image_to_data(
            img,
            lang=settings.OCR_LANGUAGES,
            output_type=pytesseract.Output.DICT,
        )
        # Plain text output, stripped of surrounding whitespace.
        raw_text = pytesseract.image_to_string(
            img,
            lang=settings.OCR_LANGUAGES,
        ).strip()

    return raw_text, _mean_confidence(data["conf"])


async def extract_text(file_path: str) -> dict:
    """Extract the text of an image with Tesseract OCR.

    The image bytes are read in memory through the backend returned by
    ``get_storage_backend()``; the OCR itself runs in a worker thread so the
    event loop is not blocked while Tesseract is working.

    Args:
        file_path: Path/key of the image, as understood by the storage backend.

    Returns:
        A dict with the keys ``text``, ``language``, ``confidence`` and
        ``has_text``. When OCR is disabled, pytesseract is unavailable, an
        error occurs, or the extracted text is 3 characters or shorter, the
        defaults are returned (``None`` values and ``has_text=False``).
        Errors are logged, never raised.
    """
    result = {
        "text": None,
        "language": None,
        "confidence": None,
        "has_text": False,
    }

    if not settings.OCR_ENABLED:
        return result
    if pytesseract is None:
        logger.warning("ocr.unavailable", extra={"error": str(_ocr_import_error)})
        return result

    try:
        # Read through the storage backend
        backend = get_storage_backend()
        image_bytes = await backend.get_bytes(file_path)
        raw_text, avg_confidence = await asyncio.to_thread(_run_tesseract, image_bytes)
    except pytesseract.TesseractNotFoundError:
        logger.warning("ocr.tesseract_not_found")
        return result
    except Exception as e:
        # Best effort: OCR failures must not break the caller, but keep the
        # traceback in the log so the cause can be diagnosed.
        logger.exception("ocr.extraction_error", extra={"file": file_path, "error": str(e)})
        return result

    # Very short outputs (3 characters or fewer) are not reported as text.
    if len(raw_text) > 3:
        result["text"] = raw_text
        result["has_text"] = True
        result["confidence"] = avg_confidence
        result["language"] = _detect_language(raw_text)
    return result