Imago/app/workers/image_worker.py
Bruno Charest cc99fea20a
Some checks failed
CI / Lint & Format (push) Has been cancelled
CI / Tests (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
Add comprehensive test suite for image processing and related services
- Implement tests for database generator to ensure proper session handling.
- Create tests for EXIF extraction and conversion functions.
- Add tests for image-related endpoints, ensuring proper data retrieval and isolation between clients.
- Develop tests for OCR functionality, including language detection and text extraction.
- Introduce tests for the image processing pipeline, covering success and failure scenarios.
- Validate rate limiting functionality and ensure independent counters for different clients.
- Implement scraper tests to verify HTML content fetching and error handling.
- Add unit tests for various services, including storage and filename generation.
- Establish worker entry point for ARQ to handle background image processing tasks.
2026-02-24 11:22:10 -05:00

192 lines
5.7 KiB
Python

"""
Worker ARQ — traitement asynchrone des images via Redis.
Lance avec : python worker.py
Fonctionnalités :
- File persistante Redis (survit aux redémarrages)
- Retry automatique avec backoff exponentiel
- Queues prioritaires (premium / standard)
- Dead-letter : marquage error après max_tries
"""
import logging
from datetime import datetime, timezone
from arq import cron, func
from arq.connections import RedisSettings
from app.config import settings
from app.database import AsyncSessionLocal
from app.models.image import Image, ProcessingStatus
from app.services.pipeline import process_image_pipeline
from sqlalchemy import select
logger = logging.getLogger(__name__)
# Backoff exponentiel : délais entre tentatives (en secondes)
RETRY_DELAYS = [1, 4, 16]
async def process_image_task(ctx: dict, image_id: int, client_id: str) -> str:
    """
    ARQ task: process an image through the EXIF → OCR → AI pipeline.

    Args:
        ctx: ARQ context (contains job_try, redis, etc.)
        image_id: ID of the image to process
        client_id: ID of the owning client

    Returns:
        Status string: "OK ..." on success, "DEAD_LETTER ..." once the
        retry budget is exhausted.

    Raises:
        Exception: the pipeline error is re-raised while retries remain,
        so ARQ reschedules the job automatically.
    """
    job_try = ctx.get("job_try", 1)
    redis = ctx.get("redis")
    logger.info(
        "worker.job.started",
        extra={"image_id": image_id, "client_id": client_id, "job_try": job_try},
    )
    async with AsyncSessionLocal() as db:
        try:
            await process_image_pipeline(image_id, db, redis=redis)
            logger.info(
                "worker.job.completed",
                extra={"image_id": image_id, "client_id": client_id},
            )
            return f"OK image_id={image_id}"
        except Exception as e:
            max_tries = settings.WORKER_MAX_TRIES
            logger.error(
                "worker.job.failed",
                extra={
                    "image_id": image_id,
                    "client_id": client_id,
                    "job_try": job_try,
                    "max_tries": max_tries,
                    "error": str(e),
                },
                exc_info=True,
            )
            # The failed pipeline may have left the session in an aborted
            # transaction; roll back so the session is usable again before
            # the dead-letter update (otherwise SQLAlchemy raises
            # PendingRollbackError on the next execute/commit).
            await db.rollback()
            if job_try >= max_tries:
                # Dead-letter: mark the image as permanently failed
                await _mark_image_error(db, image_id, str(e), job_try)
                logger.error(
                    "worker.job.dead_letter",
                    extra={
                        "image_id": image_id,
                        "client_id": client_id,
                        "total_tries": job_try,
                    },
                )
                return f"DEAD_LETTER image_id={image_id} after {job_try} tries"
            # Retry with exponential backoff (delay is informational here;
            # the actual rescheduling is done by ARQ on re-raise).
            delay_idx = min(job_try - 1, len(RETRY_DELAYS) - 1)
            retry_delay = RETRY_DELAYS[delay_idx]
            logger.warning(
                "worker.job.retry_scheduled",
                extra={
                    "image_id": image_id,
                    "retry_in_seconds": retry_delay,
                    "next_try": job_try + 1,
                },
            )
            raise  # ARQ reschedules automatically
async def _mark_image_error(
    db, image_id: int, error_msg: str, total_tries: int
) -> None:
    """Flag an image as permanently failed once all retries are exhausted.

    Looks the image up by primary key; silently does nothing (and does not
    commit) when no matching row exists.
    """
    row = await db.execute(select(Image).where(Image.id == image_id))
    target = row.scalar_one_or_none()
    if target is None:
        return
    target.processing_status = ProcessingStatus.ERROR
    target.processing_error = f"Échec après {total_tries} tentatives : {error_msg}"
    target.processing_done_at = datetime.now(timezone.utc)
    await db.commit()
async def on_startup(ctx: dict) -> None:
    """ARQ hook: runs once when the worker process boots."""
    boot_info = {"max_jobs": settings.WORKER_MAX_JOBS}
    logger.info("worker.startup", extra=boot_info)
async def on_shutdown(ctx: dict) -> None:
    """ARQ hook: runs once when the worker process stops."""
    logger.info("worker.shutdown")
async def on_job_start(ctx: dict) -> None:
    """ARQ hook: invoked just before each job runs.

    Intentionally a no-op — per-job logging lives in process_image_task.
    """
    return None
async def on_job_end(ctx: dict) -> None:
    """ARQ hook: invoked after each job finishes.

    Intentionally a no-op — per-job logging lives in process_image_task.
    """
    return None
def _parse_redis_settings() -> RedisSettings:
"""Parse REDIS_URL en RedisSettings ARQ."""
url = settings.REDIS_URL
# redis://[:password@]host[:port][/db]
if url.startswith("redis://"):
url = url[8:]
elif url.startswith("rediss://"):
url = url[9:]
password = None
host = "localhost"
port = 6379
database = 0
# Parse password
if "@" in url:
auth_part, url = url.rsplit("@", 1)
if ":" in auth_part:
password = auth_part.split(":", 1)[1]
else:
password = auth_part
# Parse host:port/db
if "/" in url:
host_port, db_str = url.split("/", 1)
if db_str:
database = int(db_str)
else:
host_port = url
if ":" in host_port:
host, port_str = host_port.rsplit(":", 1)
if port_str:
port = int(port_str)
else:
host = host_port
return RedisSettings(
host=host or "localhost",
port=port,
password=password,
database=database,
)
class WorkerSettings:
    """ARQ worker configuration (discovered by the `arq` CLI entry point)."""
    # Tasks this worker can execute, registered under an explicit job name.
    functions = [func(process_image_task, name="process_image_task")]
    # Redis connection derived from settings.REDIS_URL.
    redis_settings = _parse_redis_settings()
    # Maximum number of jobs processed concurrently.
    max_jobs = settings.WORKER_MAX_JOBS
    # Per-job timeout (seconds) before ARQ aborts the job.
    job_timeout = settings.WORKER_JOB_TIMEOUT
    # Re-enqueue failed jobs until max_tries is reached.
    retry_jobs = True
    max_tries = settings.WORKER_MAX_TRIES
    queue_name = "standard"  # Default queue
    # Lifecycle hooks (defined at module level above).
    on_startup = on_startup
    on_shutdown = on_shutdown
    on_job_start = on_job_start
    on_job_end = on_job_end
    # The worker listens to both queues.
    # NOTE(review): arq's WorkerSettings documents a single `queue_name`
    # attribute; confirm that this `queues` list is actually honoured by
    # the arq version in use and not silently ignored.
    queues = ["standard", "premium"]