# tests/test_search.py — Tests for the search engine import pytest from backend.search import ( normalize_text, tokenize, _normalize_tag_filter, _extract_snippet, _escape_html, _highlight_terms, _extract_highlighted_snippet, _extract_regex_snippet, get_all_tags, suggest_titles, suggest_tags, search, advanced_search, ) # ═══════════════════════════════════════════════════════════════════ # normalize_text # ═══════════════════════════════════════════════════════════════════ class TestNormalizeText: def test_empty_string(self): assert normalize_text("") == "" assert normalize_text(None) == "" def test_lowercase(self): assert normalize_text("Python") == "python" def test_accent_stripping(self): assert normalize_text("Éléphant") == "elephant" assert normalize_text("crème brûlée") == "creme brulee" assert normalize_text("café") == "cafe" def test_german_umlauts(self): # NFD: ü → u + combining diaeresis result = normalize_text("München") assert result == "munchen" def test_mixed(self): result = normalize_text("Déjà vu – ça va ?") # NFKD decomposes… the result depends on the Unicode decomposition assert "deja" in result assert "ca" in result # ═══════════════════════════════════════════════════════════════════ # tokenize # ═══════════════════════════════════════════════════════════════════ class TestTokenize: def test_simple(self): tokens = tokenize("hello world") assert tokens == ["hello", "world"] def test_accents(self): tokens = tokenize("crème brûlée") assert tokens == ["creme", "brulee"] def test_punctuation_stripped(self): tokens = tokenize("hello, world! how are you?") assert tokens == ["hello", "world", "how", "are", "you"] def test_numbers_and_underscores(self): tokens = tokenize("test_123 file_v2") assert tokens == ["test_123", "file_v2"] def test_french_text(self): tokens = tokenize("Python est un langage de programmation") assert tokens == ["python", "est", "un", "langage", "de", "programmation"] # ═══════════════════════════════════════════════════════════════════ # Tag filter # ═══════════════════════════════════════════════════════════════════ class TestNormalizeTagFilter: def test_empty(self): assert _normalize_tag_filter(None) == [] assert _normalize_tag_filter("") == [] def test_single(self): assert _normalize_tag_filter("python") == ["python"] def test_multiple(self): assert _normalize_tag_filter("python,docker") == ["python", "docker"] def test_with_hash(self): assert _normalize_tag_filter("#python") == ["python"] assert _normalize_tag_filter("#python, #docker") == ["python", "docker"] def test_whitespace(self): assert _normalize_tag_filter(" python , docker ") == ["python", "docker"] # ═══════════════════════════════════════════════════════════════════ # Snippets # ═══════════════════════════════════════════════════════════════════ class TestExtractSnippet: def test_finds_query(self): content = "abcdefghijklmnopqrstuvwxyz" * 10 snippet = _extract_snippet(content, "klmno", context_chars=10) assert "klmno" in snippet def test_fallback_when_not_found(self): content = "short content here" snippet = _extract_snippet(content, "zzznotfound") assert len(snippet) <= 203 # first 200 + "..." def test_prefix_suffix(self): content = "x" * 300 + "TARGET" + "y" * 300 snippet = _extract_snippet(content, "TARGET", context_chars=10) assert snippet.startswith("...") assert snippet.endswith("...") class TestEscapeHTML: def test_plain(self): assert _escape_html("hello") == "hello" def test_tags(self): assert _escape_html("