110 lines
3.3 KiB
Python
110 lines
3.3 KiB
Python
"""
|
|
Service OCR — extraction de texte via Tesseract
|
|
"""
|
|
import logging
|
|
import io
|
|
from pathlib import Path
|
|
from PIL import Image as PILImage
|
|
from app.config import settings
|
|
from app.services.storage_backend import get_storage_backend
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# pytesseract is treated as an optional dependency: any failure raised while
# importing it is recorded instead of propagated, so this module can still be
# imported when the package is missing or broken.
try:
    import pytesseract

    _ocr_import_error: Exception | None = None
except Exception as e:
    # Leave a None placeholder and keep the error; extract_text checks the
    # former and logs the latter.
    pytesseract = None
    _ocr_import_error = e
|
|
|
|
|
|
def _detect_language(text: str) -> str:
|
|
"""Détection grossière de la langue à partir du texte extrait."""
|
|
if not text:
|
|
return "unknown"
|
|
|
|
# Mots communs français
|
|
fr_words = {"le", "la", "les", "de", "du", "des", "un", "une", "et", "en", "est", "que"}
|
|
# Mots communs anglais
|
|
en_words = {"the", "is", "are", "and", "or", "of", "to", "in", "a", "an", "for", "with"}
|
|
|
|
words = set(text.lower().split())
|
|
fr_score = len(words & fr_words)
|
|
en_score = len(words & en_words)
|
|
|
|
if fr_score == 0 and en_score == 0:
|
|
return "unknown"
|
|
return "fr" if fr_score >= en_score else "en"
|
|
|
|
|
|
def _mean_confidence(raw_confidences) -> float:
    """Average the usable confidence values and divide the mean by 100.

    Values that cannot be parsed as a number and negative values (the ``-1``
    entries reported for rows that carry no recognised word) are skipped.
    Parsing goes through ``float`` so that both integer values and decimal
    strings such as ``"96.06"`` are accepted.

    Returns:
        The mean rounded to three decimals, or ``0.0`` when nothing is usable.
    """
    scores = []
    for raw in raw_confidences:
        try:
            score = float(raw)
        except (TypeError, ValueError):
            continue
        if score >= 0:
            scores.append(score)
    if not scores:
        return 0.0
    return round(sum(scores) / len(scores) / 100, 3)


def _run_ocr(image_bytes: bytes) -> tuple[str, float]:
    """Decode *image_bytes* and run Tesseract on it (blocking).

    Returns:
        A ``(stripped_text, mean_confidence)`` tuple.
    """
    with PILImage.open(io.BytesIO(image_bytes)) as img:
        # Only RGB and greyscale images are passed through unchanged; every
        # other mode is converted to RGB first.
        ocr_input = img if img.mode in ("RGB", "L") else img.convert("RGB")

        # Word-level data, used only for the confidence figures.
        data = pytesseract.image_to_data(
            ocr_input,
            lang=settings.OCR_LANGUAGES,
            output_type=pytesseract.Output.DICT,
        )
        # Plain text output, used as the extracted text.
        raw_text = pytesseract.image_to_string(
            ocr_input,
            lang=settings.OCR_LANGUAGES,
        ).strip()

    return raw_text, _mean_confidence(data["conf"])


async def extract_text(file_path: str) -> dict:
    """Extract the text of an image with Tesseract OCR.

    The image is read fully into memory through the storage backend returned
    by ``get_storage_backend()``, then decoded and processed in a worker
    thread so the blocking Tesseract calls do not stall the event loop.

    Args:
        file_path: Path/key of the image, passed to ``backend.get_bytes``.

    Returns:
        A dict with the keys ``text``, ``language``, ``confidence`` and
        ``has_text``. When OCR is disabled, pytesseract is unavailable, an
        error occurs, or the stripped text is 3 characters or shorter, the
        defaults are returned (``None`` values and ``has_text=False``).
        Errors are logged and never raised to the caller.
    """
    # Local import: asyncio is only needed by this function.
    import asyncio

    result = {
        "text": None,
        "language": None,
        "confidence": None,
        "has_text": False,
    }

    if not settings.OCR_ENABLED:
        return result

    if pytesseract is None:
        logger.warning("ocr.unavailable", extra={"error": str(_ocr_import_error)})
        return result

    try:
        # Read the image through the storage backend.
        backend = get_storage_backend()
        image_bytes = await backend.get_bytes(file_path)

        # Point pytesseract at the configured binary, if one is set.
        if settings.TESSERACT_CMD:
            pytesseract.pytesseract.tesseract_cmd = settings.TESSERACT_CMD

        raw_text, avg_confidence = await asyncio.to_thread(_run_ocr, image_bytes)

        # Very short results are ignored and the defaults are kept.
        if len(raw_text) > 3:
            result["text"] = raw_text
            result["has_text"] = True
            result["confidence"] = avg_confidence
            result["language"] = _detect_language(raw_text)

    except pytesseract.TesseractNotFoundError:
        logger.warning("ocr.tesseract_not_found")
    except Exception as e:
        logger.error("ocr.extraction_error", extra={"file": file_path, "error": str(e)})

    return result
|