- Implement tests for database generator to ensure proper session handling. - Create tests for EXIF extraction and conversion functions. - Add tests for image-related endpoints, ensuring proper data retrieval and isolation between clients. - Develop tests for OCR functionality, including language detection and text extraction. - Introduce tests for the image processing pipeline, covering success and failure scenarios. - Validate rate limiting functionality and ensure independent counters for different clients. - Implement scraper tests to verify HTML content fetching and error handling. - Add unit tests for various services, including storage and filename generation. - Establish worker entry point for ARQ to handle background image processing tasks.
108 lines
3.1 KiB
Python
108 lines
3.1 KiB
Python
"""
|
|
OCR service — text extraction via Tesseract
|
|
"""
|
|
import logging
|
|
from pathlib import Path
|
|
from PIL import Image as PILImage
|
|
from app.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
try:
|
|
import pytesseract
|
|
_ocr_import_error: Exception | None = None
|
|
except Exception as e:
|
|
pytesseract = None
|
|
_ocr_import_error = e
|
|
|
|
|
|
def _detect_language(text: str) -> str:
|
|
"""Détection grossière de la langue à partir du texte extrait."""
|
|
if not text:
|
|
return "unknown"
|
|
|
|
# Mots communs français
|
|
fr_words = {"le", "la", "les", "de", "du", "des", "un", "une", "et", "en", "est", "que"}
|
|
# Mots communs anglais
|
|
en_words = {"the", "is", "are", "and", "or", "of", "to", "in", "a", "an", "for", "with"}
|
|
|
|
words = set(text.lower().split())
|
|
fr_score = len(words & fr_words)
|
|
en_score = len(words & en_words)
|
|
|
|
if fr_score == 0 and en_score == 0:
|
|
return "unknown"
|
|
return "fr" if fr_score >= en_score else "en"
|
|
|
|
|
|
def _mean_confidence(raw_values) -> float:
    """Average Tesseract confidence values and divide the mean by 100.

    Entries that are negative (Tesseract reports ``-1`` for rows without a
    confidence) or that cannot be parsed as a number are ignored. Values are
    parsed with ``float`` rather than ``int`` because confidences may arrive
    as float strings such as ``"96.5"``, on which ``int()`` raises.

    Returns:
        The mean confidence divided by 100 and rounded to 3 decimals, or
        ``0.0`` when no usable value is present.
    """
    scores = []
    for raw in raw_values:
        try:
            score = float(raw)
        except (TypeError, ValueError):
            continue
        # ``>= 0`` also rejects NaN, since every comparison with NaN is False.
        if score >= 0:
            scores.append(score)

    if not scores:
        return 0.0
    return round(sum(scores) / len(scores) / 100, 3)


def extract_text(file_path: str) -> dict:
    """Extract the text of an image with Tesseract OCR.

    Args:
        file_path: Path of the image file to analyse.

    Returns:
        A dict with the keys ``"text"``, ``"language"``, ``"confidence"`` and
        ``"has_text"``. When OCR is disabled, pytesseract is unavailable, the
        file does not exist, extraction fails, or the extracted text is 3
        characters or shorter, the dict keeps its defaults (``None`` values
        and ``has_text`` set to ``False``). Errors are logged, never raised.
    """
    result = {
        "text": None,
        "language": None,
        "confidence": None,
        "has_text": False,
    }

    if not settings.OCR_ENABLED:
        return result

    if pytesseract is None:
        logger.warning("ocr.unavailable", extra={"error": str(_ocr_import_error)})
        return result

    path = Path(file_path)
    if not path.exists():
        return result

    try:
        # Tesseract configuration: use the configured binary when one is set.
        if settings.TESSERACT_CMD:
            pytesseract.pytesseract.tesseract_cmd = settings.TESSERACT_CMD

        with PILImage.open(path) as img:
            # Convert to RGB unless the image is already RGB or greyscale.
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")

            # First pass: structured output, used only for confidences.
            data = pytesseract.image_to_data(
                img,
                lang=settings.OCR_LANGUAGES,
                output_type=pytesseract.Output.DICT,
            )
            avg_confidence = _mean_confidence(data["conf"])

            # Second pass: the plain text itself, stripped of outer whitespace.
            raw_text = pytesseract.image_to_string(
                img,
                lang=settings.OCR_LANGUAGES,
            ).strip()

        # Results of 3 characters or fewer are treated as "no text".
        if len(raw_text) > 3:
            result["text"] = raw_text
            result["has_text"] = True
            result["confidence"] = avg_confidence
            result["language"] = _detect_language(raw_text)

    except pytesseract.TesseractNotFoundError:
        logger.warning("ocr.tesseract_not_found")
    except Exception as e:
        # Best effort: any other failure is logged and defaults are returned.
        logger.error("ocr.extraction_error", extra={"file": file_path, "error": str(e)})

    return result
|