# Imago/tests/test_scraper_extended.py

import pytest
import respx
from httpx import Response
from app.services.scraper import fetch_page_content

@pytest.mark.asyncio
@respx.mock
async def test_fetch_page_content_success():
    """Check the happy path: a mocked 200 page yields title, description and text.

    The mocked document carries a ``<title>``, a ``description`` meta tag and
    two ``<p>`` elements inside an ``<article>``. The test expects the title
    and description to come back verbatim, the first paragraph to appear in
    the extracted text, and the ``error`` field to be ``None``.
    """
    page_url = "https://example.com"
    page_html = """
    <html>
        <head><title>Test Title</title><meta name="description" content="Test Description"></head>
        <body>
            <article>
                <p>This is a long enough paragraph to be captured by the scraper logic which requires > 30 chars.</p>
                <p>Another long paragraph that should be joined with the previous one for the final text.</p>
            </article>
        </body>
    </html>
    """
    # Register the route first so the scraper's GET never leaves the process.
    ok_response = Response(status_code=200, text=page_html)
    route = respx.get(page_url)
    route.mock(return_value=ok_response)

    scraped = await fetch_page_content(page_url)

    assert scraped["title"] == "Test Title"
    assert scraped["description"] == "Test Description"
    assert "This is a long enough paragraph" in scraped["text"]
    assert scraped["error"] is None

@pytest.mark.asyncio
@respx.mock
async def test_fetch_page_content_http_error():
    """Check that a mocked 404 reply is reported as the error ``"HTTP 404"``."""
    missing_url = "https://example.com/404"
    not_found = Response(status_code=404)
    route = respx.get(missing_url)
    route.mock(return_value=not_found)

    outcome = await fetch_page_content(missing_url)

    assert outcome["error"] == "HTTP 404"

@pytest.mark.asyncio
@respx.mock
async def test_fetch_page_content_request_error():
    """Check that an exception raised during the request surfaces in ``error``.

    The mocked route raises ``Exception("Connection reset")``; the test expects
    that message to be contained in the returned ``error`` value.
    """
    unreachable_url = "https://broken.url"
    transport_failure = Exception("Connection reset")
    # side_effect makes the mocked route raise instead of returning a response.
    route = respx.get(unreachable_url)
    route.mock(side_effect=transport_failure)

    outcome = await fetch_page_content(unreachable_url)

    assert "Connection reset" in outcome["error"]