feat: French stemming (snowballstemmer) — 'recettes' matches 'recette', 'mangeons' matches 'manger'

Add a French Snowball stemmer to the tokenization pipeline:
- Index both the original tokens AND their stems in InvertedIndex
- Query terms are also stemmed before lookup
- Stemmed forms accumulate TF from all original forms
- Lazy-init singleton pattern for the stemmer
2026-05-28 13:15:37 -04:00 · 2026-05-28 13:15:37 -04:00 · 23fa003422
commit 23fa003422
parent 271a463d6d
2 changed files with 59 additions and 1 deletions
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -9,4 +9,5 @@ watchdog>=4.0.0
 argon2-cffi>=23.1.0
 python-jose>=3.3.0
 sortedcontainers>=2.4.0
+snowballstemmer>=2.2.0
 weasyprint>=60.0
--- a/backend/search.py
+++ b/backend/search.py
@ -7,6 +7,8 @@ import unicodedata
 from collections import defaultdict
 from typing import List, Dict, Any, Optional, Tuple

+from snowballstemmer import stemmer as _snowball_stemmer
+
 from backend import indexer as _indexer
 from backend.indexer import index

@ -28,6 +30,28 @@ SUGGEST_LIMIT = 10         # Default max suggestions returned
 # Regex to tokenize text into alphanumeric words (Unicode-aware)
 _WORD_RE = re.compile(r"[\w]+", re.UNICODE)

+# French stemmer (lazy-init singleton)
+_FR_STEMMER: object | None = None
+
+
+def _get_stemmer():
+    """Return a cached French snowball stemmer instance."""
+    global _FR_STEMMER
+    if _FR_STEMMER is None:
+        _FR_STEMMER = _snowball_stemmer("french")
+    return _FR_STEMMER
+
+
+def stem_token(token: str) -> str:
+    """Reduce a word to its French stem (e.g. 'mangeons' → 'mang')."""
+    return _get_stemmer().stemWord(token)
+
+
+def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
+    """Return the set of unique stems for a list of tokens."""
+    s = _get_stemmer()
+    return {s.stemWord(t) for t in tokens}
+

 # ---------------------------------------------------------------------------
 # Accent / Unicode normalization helpers
@ -381,8 +405,17 @@ class InvertedIndex:
                tf: Dict[str, int] = defaultdict(int)
                for token in tokens:
                    tf[token] += 1
+                # Also index stemmed forms (French stemming)
+                stems = stem_tokens(list(tf.keys()))
                for token, freq in tf.items():
                    self.word_index[token][doc_key] = freq
+                for stem in stems:
+                    # Accumulate frequency for stem (sum of all forms mapping to same stem)
+                    stem_freq = self.word_index[stem].get(doc_key, 0)
+                    for token, freq in tf.items():
+                        if stem_token(token) == stem:
+                            stem_freq += freq
+                    self.word_index[stem][doc_key] = stem_freq

            # --- Tag indexes ---
            for tag in vault_data.get("tags", {}):
@ -446,10 +479,20 @@ class InvertedIndex:
        for token in tokens:
            if token:
                tf[token] += 1
+        # Also compute stems
+        stems = stem_tokens(list(tf.keys()))
        for token, freq in tf.items():
            if not self.word_index.get(token):
                self._sorted_tokens.add(token)
            self.word_index[token][doc_key] = freq
+        for stem in stems:
+            stem_freq = self.word_index[stem].get(doc_key, 0)
+            for token, freq in tf.items():
+                if stem_token(token) == stem:
+                    stem_freq += freq
+            if not self.word_index.get(stem):
+                self._sorted_tokens.add(stem)
+            self.word_index[stem][doc_key] = stem_freq

    def remove_document(self, vault_name: str, path: str):
        """Remove a single document incrementally."""
@ -501,7 +544,10 @@ class InvertedIndex:
        # Word index
        content = file_info.get("content", "")
        full_text = title + " " + content
-        for token in set(tokenize(full_text)):
+        tokens_to_remove = set(tokenize(full_text))
+        # Also compute stems to clean up
+        stems_to_clean = stem_tokens(tokens_to_remove)
+        for token in tokens_to_remove:
            if not token:
                continue
            wi = self.word_index.get(token)
@ -511,6 +557,14 @@ class InvertedIndex:
                    del self.word_index[token]
                    if not skip_sorted_cleanup:
                        self._sorted_tokens.discard(token)
+        for stem in stems_to_clean:
+            wi = self.word_index.get(stem)
+            if wi:
+                wi.pop(doc_key, None)
+                if not wi:
+                    del self.word_index[stem]
+                    if not skip_sorted_cleanup:
+                        self._sorted_tokens.discard(stem)

    def idf(self, term: str) -> float:
        """Inverse Document Frequency for a term.
@ -939,6 +993,9 @@ def advanced_search(
    query_terms = []
    for t in query_terms_raw:
        query_terms.extend(tokenize(t))
+    # Also add stemmed forms for French stemming support
+    query_stems = stem_tokens(query_terms)
+    query_terms.extend(stem for stem in query_stems if stem not in query_terms)
    has_terms = len(query_terms) > 0

    if not has_terms and not all_tags and not parsed["title"] and not parsed["path"] and not parsed["ext"]: