- Implement tests for the database generator to ensure proper session handling.
- Create tests for EXIF extraction and conversion functions.
- Add tests for image-related endpoints, ensuring proper data retrieval and isolation between clients.
- Develop tests for OCR functionality, including language detection and text extraction.
- Introduce tests for the image processing pipeline, covering success and failure scenarios.
- Validate rate-limiting functionality and ensure independent counters for different clients.
- Implement scraper tests to verify HTML content fetching and error handling.
- Add unit tests for various services, including storage and filename generation.
- Establish a worker entry point for ARQ to handle background image-processing tasks.
71 lines
2.0 KiB
Python
"""
|
|
Service de scraping — extraction de contenu web pour résumés AI
|
|
"""
|
|
from typing import Optional
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Default HTTP headers sent with every scraping request; the explicit
# bot User-Agent identifies this service to the sites it fetches.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; ShaarliBot/1.0)"
    )
}
|
|
|
|
|
|
async def fetch_page_content(
    url: str,
    *,
    max_text_len: int = 5000,
    min_paragraph_len: int = 30,
) -> dict:
    """Fetch a URL and extract its title, meta description and main text.

    Args:
        url: Address of the page to scrape.
        max_text_len: Maximum number of characters kept from the main text.
        min_paragraph_len: Paragraphs whose stripped text is this long or
            shorter are discarded (filters out boilerplate snippets).

    Returns:
        A dict with keys ``url``, ``title``, ``description``, ``text`` and
        ``error``. On any failure the extraction keys stay ``None`` and
        ``error`` holds a human-readable message — this function never raises.
    """
    result = {
        "url": url,
        "title": None,
        "description": None,
        "text": None,
        "error": None,
    }

    try:
        async with httpx.AsyncClient(
            headers=HEADERS,
            timeout=15.0,
            follow_redirects=True,
        ) as client:
            response = await client.get(url)
            response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Page title
        title_tag = soup.find("title")
        result["title"] = title_tag.get_text(strip=True) if title_tag else None

        # Meta description, falling back to the Open Graph variant
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if not meta_desc:
            meta_desc = soup.find("meta", attrs={"property": "og:description"})
        result["description"] = meta_desc.get("content", "") if meta_desc else None

        # Strip non-content elements before extracting the main text
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # Prefer semantic containers, then fall back to the whole body
        main = soup.find("article") or soup.find("main") or soup.find("body")
        if main:
            # Compute get_text once per paragraph (the original evaluated it
            # twice: once for the length filter, once for the join).
            chunks = []
            for p in main.find_all("p"):
                paragraph = p.get_text(strip=True)
                if len(paragraph) > min_paragraph_len:
                    chunks.append(paragraph)
            text = " ".join(chunks)
            result["text"] = text[:max_text_len] if text else None

    except httpx.HTTPStatusError as e:
        result["error"] = f"HTTP {e.response.status_code}"
    except httpx.RequestError as e:
        result["error"] = f"Connexion impossible : {str(e)}"
    except Exception as e:
        # Best-effort scraper: never propagate, report the error instead.
        result["error"] = str(e)

    return result
|