"""
Scraping service — web content extraction for AI summaries.
"""
from typing import Optional

import httpx
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; ShaarliBot/1.0)"
    )
}

# Timeout, in seconds, passed to the HTTP client.
_REQUEST_TIMEOUT = 15.0
# Elements removed from the tree before the main text is collected.
_NOISE_TAGS = ["script", "style", "nav", "footer", "header", "aside"]
# Paragraphs whose normalized text is not longer than this are skipped.
_MIN_PARAGRAPH_CHARS = 30
# Upper bound on the number of characters kept in the extracted text.
_MAX_TEXT_CHARS = 5000


def _extract_description(soup: BeautifulSoup) -> Optional[str]:
    """Return the page description taken from its ``<meta>`` tags.

    ``<meta name="description">`` is tried first, then
    ``<meta property="og:description">``. The first non-empty ``content``
    wins. Returns ``""`` when a matching tag exists but none carries
    content, and ``None`` when neither tag is present.
    """
    found_tag = False
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        tag = soup.find("meta", attrs=attrs)
        if tag is None:
            continue
        found_tag = True
        content = (tag.get("content") or "").strip()
        if content:
            return content
    return "" if found_tag else None


def _extract_main_text(soup: BeautifulSoup) -> Optional[str]:
    """Return the concatenated paragraph text of the page, or ``None``.

    Mutates ``soup``: the elements listed in ``_NOISE_TAGS`` are removed
    before paragraphs are collected.
    """
    for tag in soup(_NOISE_TAGS):
        tag.decompose()

    # Prefer semantic containers, falling back to the whole body.
    container = soup.find("article") or soup.find("main") or soup.find("body")
    if container is None:
        return None

    chunks = []
    for paragraph in container.find_all("p"):
        # get_text(strip=True) would join fragments with an empty separator
        # and glue words around inline tags together; collapsing whitespace
        # on the raw text keeps the original word boundaries.
        text = " ".join(paragraph.get_text().split())
        if len(text) > _MIN_PARAGRAPH_CHARS:
            chunks.append(text)

    joined = " ".join(chunks)
    return joined[:_MAX_TEXT_CHARS] if joined else None


async def fetch_page_content(url: str) -> dict:
    """
    Fetch a URL and extract its title, meta description and main text.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``url``, ``title``, ``description``, ``text``
        and ``error``. Fields that could not be extracted are ``None``.
        Failures are never raised: ``error`` holds a short message instead
        (``"HTTP <status>"`` for HTTP error responses, a connection message
        for transport errors, or the exception text otherwise).
    """
    result = {
        "url": url,
        "title": None,
        "description": None,
        "text": None,
        "error": None,
    }

    try:
        async with httpx.AsyncClient(
            headers=HEADERS,
            timeout=_REQUEST_TIMEOUT,
            follow_redirects=True,
        ) as client:
            response = await client.get(url)
            response.raise_for_status()
            html = response.text

        # Parsing happens after the client is closed so the connection is
        # not held open during CPU-bound work.
        soup = BeautifulSoup(html, "html.parser")

        title_tag = soup.find("title")
        result["title"] = title_tag.get_text(strip=True) if title_tag else None

        result["description"] = _extract_description(soup)

        # Must run last: it strips elements from the tree.
        result["text"] = _extract_main_text(soup)

    except httpx.HTTPStatusError as e:
        result["error"] = f"HTTP {e.response.status_code}"
    except httpx.RequestError as e:
        result["error"] = f"Connexion impossible : {e}"
    except Exception as e:
        # Deliberate best-effort boundary: report the failure in the result
        # instead of propagating it to the caller.
        result["error"] = str(e)

    return result