Imago/app/services/scraper.py
Bruno Charest cc99fea20a
Some checks failed
CI / Lint & Format (push) Has been cancelled
CI / Tests (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
Add comprehensive test suite for image processing and related services
- Implement tests for database generator to ensure proper session handling.
- Create tests for EXIF extraction and conversion functions.
- Add tests for image-related endpoints, ensuring proper data retrieval and isolation between clients.
- Develop tests for OCR functionality, including language detection and text extraction.
- Introduce tests for the image processing pipeline, covering success and failure scenarios.
- Validate rate limiting functionality and ensure independent counters for different clients.
- Implement scraper tests to verify HTML content fetching and error handling.
- Add unit tests for various services, including storage and filename generation.
- Establish worker entry point for ARQ to handle background image processing tasks.
2026-02-24 11:22:10 -05:00

71 lines
2.0 KiB
Python

"""
Service de scraping — extraction de contenu web pour résumés AI
"""
from typing import Optional
import httpx
from bs4 import BeautifulSoup
# Default HTTP request headers for the scraper: a browser-compatible
# User-Agent string that still identifies the client as a bot.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ShaarliBot/1.0)"}
# Paragraphs whose stripped text is this many characters or fewer are
# skipped when assembling the main text.
_MIN_PARAGRAPH_CHARS = 30


def _error_detail(exc: Exception) -> str:
    """Return a non-empty description of *exc*.

    Falls back to the exception class name when ``str(exc)`` is empty, so the
    reported error is never a blank (falsy) string.
    """
    return str(exc) or type(exc).__name__


def _extract_description(soup) -> Optional[str]:
    """Return the page description taken from ``<meta>`` tags, or None.

    ``name="description"`` is tried first, then ``property="og:description"``.
    A tag whose ``content`` attribute is missing or blank is skipped so the
    next candidate still gets a chance.
    """
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        tag = soup.find("meta", attrs=attrs)
        content = tag.get("content") if tag else None
        if content and content.strip():
            return content.strip()
    return None


def _extract_main_text(soup, max_chars: int) -> Optional[str]:
    """Return the joined paragraph text of the main content, or None.

    Mutates *soup*: script, style, nav, footer, header and aside elements are
    removed before the text is collected.
    """
    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
        tag.decompose()

    # Prefer semantic containers; fall back to the whole body.
    container = soup.find("article") or soup.find("main") or soup.find("body")
    if container is None:
        return None

    # Read each paragraph's text once, then drop the short fragments.
    paragraphs = (p.get_text(strip=True) for p in container.find_all("p"))
    text = " ".join(t for t in paragraphs if len(t) > _MIN_PARAGRAPH_CHARS)
    return text[:max_chars] or None


async def fetch_page_content(
    url: str,
    *,
    timeout: float = 15.0,
    max_chars: int = 5000,
) -> dict:
    """Fetch a URL and extract its title, meta description and main text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout passed to the HTTP client, in seconds.
        max_chars: Maximum length of the returned main text (non-negative).

    Returns:
        A dict with the keys ``url``, ``title``, ``description``, ``text``
        and ``error``. Fields that could not be extracted are None. On
        failure ``error`` holds a non-empty message; fields extracted before
        the failure keep their values.

    Exceptions derived from ``Exception`` are not propagated; they are
    reported through the ``error`` key instead.
    """
    result = {
        "url": url,
        "title": None,
        "description": None,
        "text": None,
        "error": None,
    }
    try:
        async with httpx.AsyncClient(
            headers=HEADERS,
            timeout=timeout,
            follow_redirects=True,
        ) as client:
            response = await client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            title_tag = soup.find("title")
            result["title"] = title_tag.get_text(strip=True) if title_tag else None

            # The description must be read before the text extraction
            # prunes elements from the tree.
            result["description"] = _extract_description(soup)
            result["text"] = _extract_main_text(soup, max_chars)
    except httpx.HTTPStatusError as e:
        result["error"] = f"HTTP {e.response.status_code}"
    except httpx.RequestError as e:
        result["error"] = f"Connexion impossible : {_error_detail(e)}"
    except Exception as e:
        # Best-effort boundary: any other failure (e.g. while parsing) is
        # reported to the caller rather than raised.
        result["error"] = _error_detail(e)
    return result