# ObsiGate/tests/test_search.py

# tests/test_search.py — Tests for the search engine
import pytest
from backend.search import (
    normalize_text,
    tokenize,
    _normalize_tag_filter,
    _extract_snippet,
    _escape_html,
    _highlight_terms,
    _extract_highlighted_snippet,
    _extract_regex_snippet,
    get_all_tags,
    suggest_titles,
    suggest_tags,
    search,
    advanced_search,
)


# ═══════════════════════════════════════════════════════════════════
# normalize_text
# ═══════════════════════════════════════════════════════════════════

class TestNormalizeText:
    """Unit tests for ``normalize_text``: blank input, case folding, accent removal."""

    def test_empty_string(self):
        """An empty string and ``None`` both normalize to an empty string."""
        for blank in ("", None):
            assert normalize_text(blank) == ""

    def test_lowercase(self):
        """Upper-case letters are folded to lower case."""
        lowered = normalize_text("Python")
        assert lowered == "python"

    def test_accent_stripping(self):
        """French diacritics are dropped and the text is lower-cased."""
        expectations = (
            ("Éléphant", "elephant"),
            ("crème brûlée", "creme brulee"),
            ("café", "cafe"),
        )
        for raw, plain in expectations:
            assert normalize_text(raw) == plain

    def test_german_umlauts(self):
        """The umlaut in the German city name is reduced to its base letter."""
        # Expected outcome when ü is decomposed into u + combining diaeresis
        # and the combining mark is then discarded.
        assert normalize_text("München") == "munchen"

    def test_mixed(self):
        """Accented words inside a punctuated sentence come out unaccented."""
        normalized = normalize_text("Déjà vu – ça va ?")
        # Only substrings are checked: the exact output depends on how the
        # Unicode decomposition treats the dash and the spacing.
        for fragment in ("deja", "ca"):
            assert fragment in normalized


# ═══════════════════════════════════════════════════════════════════
# tokenize
# ═══════════════════════════════════════════════════════════════════

class TestTokenize:
    """Unit tests for ``tokenize``: splitting text into normalized word tokens."""

    def test_simple(self):
        """Two space-separated words give two tokens, in order."""
        assert tokenize("hello world") == ["hello", "world"]

    def test_accents(self):
        """Tokens come back with their accents removed."""
        expected = ["creme", "brulee"]
        assert tokenize("crème brûlée") == expected

    def test_punctuation_stripped(self):
        """Commas, exclamation marks and question marks do not end up in tokens."""
        expected = ["hello", "world", "how", "are", "you"]
        assert tokenize("hello, world! how are you?") == expected

    def test_numbers_and_underscores(self):
        """Digits and underscores are kept as part of a token."""
        expected = ["test_123", "file_v2"]
        assert tokenize("test_123 file_v2") == expected

    def test_french_text(self):
        """A plain French sentence is lower-cased and split word by word."""
        sentence = "Python est un langage de programmation"
        expected = ["python", "est", "un", "langage", "de", "programmation"]
        assert tokenize(sentence) == expected


# ═══════════════════════════════════════════════════════════════════
# Tag filter
# ═══════════════════════════════════════════════════════════════════

class TestNormalizeTagFilter:
    """Unit tests for ``_normalize_tag_filter``: parsing a comma-separated tag string."""

    def test_empty(self):
        """``None`` and an empty string both produce an empty list."""
        for missing in (None, ""):
            assert _normalize_tag_filter(missing) == []

    def test_single(self):
        """A single tag name becomes a one-element list."""
        parsed = _normalize_tag_filter("python")
        assert parsed == ["python"]

    def test_multiple(self):
        """Comma-separated tags are split, keeping their order."""
        parsed = _normalize_tag_filter("python,docker")
        assert parsed == ["python", "docker"]

    def test_with_hash(self):
        """A leading ``#`` on a tag is removed."""
        cases = (
            ("#python", ["python"]),
            ("#python, #docker", ["python", "docker"]),
        )
        for raw, expected in cases:
            assert _normalize_tag_filter(raw) == expected

    def test_whitespace(self):
        """Whitespace around each tag is trimmed."""
        parsed = _normalize_tag_filter(" python , docker ")
        assert parsed == ["python", "docker"]


# ═══════════════════════════════════════════════════════════════════
# Snippets
# ═══════════════════════════════════════════════════════════════════

class TestExtractSnippet:
    """Unit tests for ``_extract_snippet``: plain-text excerpt around a query."""

    def test_finds_query(self):
        """The excerpt contains the query when the query occurs in the content."""
        alphabet = "abcdefghijklmnopqrstuvwxyz"
        haystack = alphabet * 10
        excerpt = _extract_snippet(haystack, "klmno", context_chars=10)
        assert "klmno" in excerpt

    def test_fallback_when_not_found(self):
        """With no match, the excerpt stays within the fallback length."""
        excerpt = _extract_snippet("short content here", "zzznotfound")
        # Upper bound: first 200 characters plus a three-character "..." suffix.
        longest_allowed = 203
        assert len(excerpt) <= longest_allowed

    def test_prefix_suffix(self):
        """A match deep inside long content is wrapped in "..." on both sides."""
        marker = "..."
        haystack = "".join(("x" * 300, "TARGET", "y" * 300))
        excerpt = _extract_snippet(haystack, "TARGET", context_chars=10)
        assert excerpt.startswith(marker)
        assert excerpt.endswith(marker)


class TestEscapeHTML:
    """Unit tests for ``_escape_html``: escaping of HTML-significant characters."""

    def test_plain(self):
        """Text without special characters is returned unchanged."""
        escaped = _escape_html("hello")
        assert escaped == "hello"

    def test_tags(self):
        """Angle brackets become ``&lt;`` and ``&gt;``."""
        escaped = _escape_html("<script>")
        assert escaped == "&lt;script&gt;"

    def test_ampersand(self):
        """An ampersand becomes ``&amp;``."""
        escaped = _escape_html("a & b")
        assert escaped == "a &amp; b"

    def test_quotes(self):
        """Double quotes become ``&quot;``."""
        escaped = _escape_html('say "hello"')
        assert escaped == 'say &quot;hello&quot;'


class TestHighlightTerms:
    """Unit tests for ``_highlight_terms``: wrapping matched terms in ``<mark>``."""

    def test_single_match(self):
        """A term present in the text is highlighted and still readable."""
        highlighted = _highlight_terms("hello world", ["hello"], 10)
        for expected_piece in ("<mark>", "hello"):
            assert expected_piece in highlighted

    def test_no_match(self):
        """A term absent from the text adds no ``<mark>`` tag."""
        highlighted = _highlight_terms("hello world", ["zzz"], 10)
        assert "<mark>" not in highlighted

    def test_accent_match(self):
        """An unaccented term still highlights its accented form in the text."""
        # The search term is given already normalized ("creme"), while the
        # text keeps its accents ("crème").
        highlighted = _highlight_terms("crème brûlée", ["creme"], 10)
        assert "<mark>" in highlighted


class TestExtractHighlightedSnippet:
    """Unit tests for ``_extract_highlighted_snippet``."""

    def test_basic(self):
        """A term that occurs in long content yields a snippet with ``<mark>``."""
        long_text = "Le Python est un langage moderne. " * 20
        terms = ["python"]
        excerpt = _extract_highlighted_snippet(long_text, terms)
        assert "<mark>" in excerpt

    def test_empty(self):
        """Empty content gives ``""``; an empty term list gives the content back."""
        no_content = _extract_highlighted_snippet("", ["test"])
        assert no_content == ""
        no_terms = _extract_highlighted_snippet("content", [])
        assert no_terms == "content"


class TestExtractRegexSnippet:
    """Unit tests for ``_extract_regex_snippet``: highlighting of regex matches."""

    def test_basic(self):
        """An e-mail pattern that matches the content produces a ``<mark>`` tag."""
        text = "Email: test@example.com contact@site.fr"
        email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        excerpt = _extract_regex_snippet(text, email_pattern)
        assert "<mark>" in excerpt

    def test_invalid_regex(self):
        """A malformed pattern must not raise and must not highlight anything."""
        # "[invalid" has an unterminated character class.
        excerpt = _extract_regex_snippet("some content", r"[invalid")
        assert "<mark>" not in excerpt


# ═══════════════════════════════════════════════════════════════════
# Integration: search APIs (require index)
# ═══════════════════════════════════════════════════════════════════

class TestSearchAPI:
    """Integration tests for ``GET /api/search`` driven through the ``client`` fixture."""

    def test_search_python(self, client):
        """Searching ``python`` across all vaults returns at least two hits."""
        response = client.get("/api/search?q=python&vault=all")
        assert response.status_code == 200
        payload = response.json()
        # Two hits expected from the fixture notes: note1.md + projet.md
        assert payload["count"] >= 2

    def test_search_docker(self, client):
        """Searching ``docker`` across all vaults returns at least one hit."""
        response = client.get("/api/search?q=docker&vault=all")
        assert response.status_code == 200
        payload = response.json()
        assert payload["count"] >= 1

    def test_search_accent_insensitive(self, client):
        """Search for 'python' should find 'Python' (case insensitive)."""
        # NOTE(review): despite the test name, no accented query is sent here;
        # this exercises case-insensitivity with the same query as
        # test_search_python — confirm whether an accent case is wanted.
        response = client.get("/api/search?q=python&vault=all")
        assert response.status_code == 200
        payload = response.json()
        assert payload["count"] >= 1


class TestAdvancedSearchAPI:
    """Integration tests for ``GET /api/advanced-search``.

    Every test decodes the response through ``_check``, which skips (rather
    than fails) the test when the endpoint does not answer with a JSON body.
    """

    def _check(self, resp, min_total=0):
        """Return the decoded JSON body of *resp*, skipping the test if unusable.

        Args:
            resp: Response object exposing ``status_code``, ``text`` and ``json()``.
            min_total: When non-zero, additionally assert that
                ``data["total"]`` is at least this value. The default of 0
                performs no check on the payload.

        Returns:
            The decoded JSON payload.
        """
        # NOTE(review): a non-200 status is skipped, not failed, so a server
        # error on this endpoint will not turn the suite red.
        if resp.status_code != 200 or not resp.text.strip():
            pytest.skip(f"Advanced search returned {resp.status_code}, body: {resp.text[:100]}")
        try:
            data = resp.json()
        except ValueError:
            # JSON decoding failures are ValueError subclasses; anything else
            # is a genuine error and should surface instead of being skipped.
            pytest.skip(f"Advanced search non-JSON response: {resp.text[:200]}")
        if min_total:
            assert data["total"] >= min_total
        return data

    def test_basic(self, client):
        """A ``python`` query returns results carrying the expected fields."""
        resp = client.get("/api/advanced-search?q=python&vault=all")
        data = self._check(resp, min_total=1)
        assert len(data["results"]) > 0
        # Check structure of the first result
        first = data["results"][0]
        for key in ("title", "score", "snippet", "vault", "path"):
            assert key in first

    def test_pagination(self, client):
        """``limit=1`` caps the number of returned results at one."""
        resp = client.get("/api/advanced-search?q=python&limit=1&offset=0")
        data = self._check(resp)
        assert len(data["results"]) <= 1

    def test_facets(self, client):
        """The payload includes a ``facets`` entry."""
        resp = client.get("/api/advanced-search?q=python&vault=all")
        data = self._check(resp)
        assert "facets" in data

    def test_empty_query(self, client):
        """An empty query reports a total of zero results."""
        resp = client.get("/api/advanced-search?q=")
        data = self._check(resp)
        # Empty query should return 0 results
        assert data["total"] == 0


class TestSuggestAPI:
    """Integration tests for the ``/api/suggest`` and ``/api/suggest-tags`` endpoints."""

    def test_suggest_titles(self, client):
        """The prefix ``intro`` yields at least one title suggestion."""
        resp = client.get("/api/suggest?q=intro&vault=all")
        assert resp.status_code == 200
        data = resp.json()
        assert len(data["suggestions"]) >= 1

    def test_suggest_tags(self, client):
        """Tag suggestions come back as a list of entries each carrying a ``tag``.

        The test is skipped, not failed, when the endpoint does not answer
        with a non-empty JSON body.
        """
        resp = client.get("/api/suggest-tags?q=py&vault=all")
        if resp.status_code != 200 or not resp.text.strip():
            pytest.skip(f"Suggest tags returned {resp.status_code}")
        try:
            data = resp.json()
        except ValueError:
            # JSON decoding failures are ValueError subclasses.
            pytest.skip(f"Suggest tags non-JSON: {resp.text[:100]}")
        suggestions = data["suggestions"]
        # An empty list is acceptable; only the shape of the response is checked.
        assert isinstance(suggestions, list)
        assert all("tag" in suggestion for suggestion in suggestions)


class TestTagsAPI:
    """Integration tests for ``GET /api/tags`` driven through the ``client`` fixture."""

    def test_all_tags(self, client):
        """The all-vault listing contains the tags used by the fixture notes."""
        response = client.get("/api/tags?vault=all")
        assert response.status_code == 200
        payload = response.json()
        for expected_tag in ("python", "docker", "tutorial"):
            assert expected_tag in payload["tags"]

    def test_filter_by_vault(self, client):
        """Restricting to a single vault still returns ``tags`` as a dict."""
        response = client.get("/api/tags?vault=TestVault")
        assert response.status_code == 200
        payload = response.json()
        assert isinstance(payload["tags"], dict)