- Implement tests for the database generator to ensure proper session handling.
- Create tests for EXIF extraction and conversion functions.
- Add tests for image-related endpoints, ensuring proper data retrieval and isolation between clients.
- Develop tests for OCR functionality, including language detection and text extraction.
- Introduce tests for the image processing pipeline, covering success and failure scenarios.
- Validate rate-limiting functionality and ensure independent counters for different clients.
- Implement scraper tests to verify HTML content fetching and error handling.
- Add unit tests for various services, including storage and filename generation.
- Establish the worker entry point for ARQ to handle background image-processing tasks.
192 lines
5.7 KiB
Python
192 lines
5.7 KiB
Python
"""
|
|
Worker ARQ — traitement asynchrone des images via Redis.
|
|
|
|
Lance avec : python worker.py
|
|
|
|
Fonctionnalités :
|
|
- File persistante Redis (survit aux redémarrages)
|
|
- Retry automatique avec backoff exponentiel
|
|
- Queues prioritaires (premium / standard)
|
|
- Dead-letter : marquage error après max_tries
|
|
"""
|
|
import logging
from datetime import datetime, timezone
from urllib.parse import urlsplit

from arq import Retry, cron, func
from arq.connections import RedisSettings
from sqlalchemy import select

from app.config import settings
from app.database import AsyncSessionLocal
from app.models.image import Image, ProcessingStatus
from app.services.pipeline import process_image_pipeline
|
|
|
|
# Module-level logger, namespaced by this module's import path.
logger = logging.getLogger(__name__)

# Exponential backoff: delay before each retry attempt, in seconds.
# Attempt N uses RETRY_DELAYS[min(N - 1, len - 1)].
RETRY_DELAYS = [1, 4, 16]
|
|
|
|
|
|
async def process_image_task(ctx: dict, image_id: int, client_id: str) -> str:
    """Process one image through the EXIF -> OCR -> AI pipeline (ARQ task).

    Args:
        ctx: ARQ job context (provides ``job_try``, ``redis``, ...).
        image_id: ID of the image to process.
        client_id: ID of the owning client (used for logging only).

    Returns:
        A short status string: ``"OK ..."`` on success, ``"DEAD_LETTER ..."``
        once retries are exhausted.

    Raises:
        arq.worker.Retry: when the pipeline fails and retries remain; the
            ``defer`` argument applies the exponential backoff from
            RETRY_DELAYS.
    """
    job_try = ctx.get("job_try", 1)
    redis = ctx.get("redis")

    logger.info(
        "worker.job.started",
        extra={"image_id": image_id, "client_id": client_id, "job_try": job_try},
    )

    async with AsyncSessionLocal() as db:
        try:
            await process_image_pipeline(image_id, db, redis=redis)
        except Exception as e:
            max_tries = settings.WORKER_MAX_TRIES
            logger.error(
                "worker.job.failed",
                extra={
                    "image_id": image_id,
                    "client_id": client_id,
                    "job_try": job_try,
                    "max_tries": max_tries,
                    "error": str(e),
                },
                exc_info=True,
            )

            if job_try >= max_tries:
                # Dead-letter: mark the image as permanently failed.
                await _mark_image_error(db, image_id, str(e), job_try)
                logger.error(
                    "worker.job.dead_letter",
                    extra={
                        "image_id": image_id,
                        "client_id": client_id,
                        "total_tries": job_try,
                    },
                )
                return f"DEAD_LETTER image_id={image_id} after {job_try} tries"

            # Retry with exponential backoff.
            delay_idx = min(job_try - 1, len(RETRY_DELAYS) - 1)
            retry_delay = RETRY_DELAYS[delay_idx]
            logger.warning(
                "worker.job.retry_scheduled",
                extra={
                    "image_id": image_id,
                    "retry_in_seconds": retry_delay,
                    "next_try": job_try + 1,
                },
            )
            # BUG FIX: a bare `raise` does NOT make arq retry — arq only
            # re-enqueues jobs that raise its Retry exception, so the backoff
            # computed above was logged but never applied. Retry(defer=...)
            # actually schedules the next attempt after `retry_delay` seconds.
            raise Retry(defer=retry_delay) from e

        logger.info(
            "worker.job.completed",
            extra={"image_id": image_id, "client_id": client_id},
        )
        return f"OK image_id={image_id}"
|
|
|
|
|
|
async def _mark_image_error(
    db, image_id: int, error_msg: str, total_tries: int
) -> None:
    """Permanently flag an image as failed once all retries are exhausted.

    Looks the image up by primary key; silently does nothing if the row no
    longer exists (e.g. deleted while the job was queued).
    """
    lookup = await db.execute(select(Image).where(Image.id == image_id))
    image = lookup.scalar_one_or_none()
    if image is None:
        return

    image.processing_status = ProcessingStatus.ERROR
    image.processing_error = f"Échec après {total_tries} tentatives : {error_msg}"
    image.processing_done_at = datetime.now(timezone.utc)
    await db.commit()
|
|
|
|
|
|
async def on_startup(ctx: dict) -> None:
    """ARQ lifecycle hook: runs once when the worker process boots."""
    startup_info = {"max_jobs": settings.WORKER_MAX_JOBS}
    logger.info("worker.startup", extra=startup_info)
|
|
|
|
|
|
async def on_shutdown(ctx: dict) -> None:
    """ARQ lifecycle hook: runs once as the worker process stops."""
    logger.info("worker.shutdown")
|
|
|
|
|
|
async def on_job_start(ctx: dict) -> None:
    """ARQ hook fired before every job.

    Intentionally a no-op: per-job logging lives in process_image_task.
    """
|
|
|
|
|
|
async def on_job_end(ctx: dict) -> None:
    """ARQ hook fired after every job.

    Intentionally a no-op: per-job logging lives in process_image_task.
    """
|
|
|
|
|
|
def _parse_redis_settings() -> RedisSettings:
    """Translate settings.REDIS_URL into arq RedisSettings.

    Accepts ``redis://[user][:password@]host[:port][/db]`` as well as
    ``rediss://`` (TLS) URLs.

    BUG FIX: the previous hand-rolled parser stripped the ``rediss://``
    scheme but never enabled TLS, and treated a bare ``user@host`` userinfo
    as a password. urlsplit parses both cases correctly.

    Returns:
        RedisSettings with host, port, password, database and ssl populated
        (defaults: localhost:6379, db 0, no TLS).
    """
    parts = urlsplit(settings.REDIS_URL)

    # Path component is "/<db>" when present; empty path means db 0.
    db_path = parts.path.lstrip("/")
    database = int(db_path) if db_path else 0

    return RedisSettings(
        host=parts.hostname or "localhost",
        port=parts.port or 6379,
        password=parts.password,
        database=database,
        ssl=parts.scheme == "rediss",
    )
|
|
|
|
|
|
class WorkerSettings:
    """ARQ worker configuration (pass to `arq worker.WorkerSettings`)."""

    # Single registered task; the explicit name keeps enqueue calls stable
    # even if the function is moved or renamed.
    functions = [func(process_image_task, name="process_image_task")]
    redis_settings = _parse_redis_settings()
    max_jobs = settings.WORKER_MAX_JOBS
    job_timeout = settings.WORKER_JOB_TIMEOUT
    # Retrying is enabled; the actual attempt count is governed by max_tries.
    retry_jobs = True
    max_tries = settings.WORKER_MAX_TRIES
    queue_name = "standard"  # Default queue
    on_startup = on_startup
    on_shutdown = on_shutdown
    on_job_start = on_job_start
    on_job_end = on_job_end

    # The worker is meant to listen on both queues.
    # NOTE(review): arq's Worker takes a single `queue_name`; confirm that a
    # `queues` attribute is actually honored by the arq version in use.
    queues = ["standard", "premium"]
|