From 23fa0034229d2b6c71ce11a443e8f2164d6c9453 Mon Sep 17 00:00:00 2001
From: Bruno Charest <bruno.charest@gmail.com>
Date: Thu, 28 May 2026 13:15:37 -0400
Subject: [PATCH] =?UTF-8?q?feat:=20French=20stemming=20(snowballstemmer)?=
 =?UTF-8?q?=20=E2=80=94=20'recettes'=20matches=20'recette',=20'mangeons'?=
 =?UTF-8?q?=20matches=20'manger'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add French snowball stemmer to tokenization pipeline:
- Index both original tokens AND their stems in InvertedIndex
- Query terms are also stemmed before lookup
- Stemmed forms accumulate TF from all original forms
- Lazy-init singleton pattern for stemmer
---
 backend/requirements.txt |  1 +
 backend/search.py        | 59 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index fcc8362..82eb51c 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -9,4 +9,5 @@ watchdog>=4.0.0
 argon2-cffi>=23.1.0
 python-jose>=3.3.0
 sortedcontainers>=2.4.0
+snowballstemmer>=2.2.0
 weasyprint>=60.0
diff --git a/backend/search.py b/backend/search.py
index 16b9d3a..61e4135 100644
--- a/backend/search.py
+++ b/backend/search.py
@@ -7,6 +7,8 @@ import unicodedata
 from collections import defaultdict
 from typing import List, Dict, Any, Optional, Tuple
 
+from snowballstemmer import stemmer as _snowball_stemmer
+
 from backend import indexer as _indexer
 from backend.indexer import index
 
@@ -28,6 +30,28 @@ SUGGEST_LIMIT = 10         # Default max suggestions returned
 # Regex to tokenize text into alphanumeric words (Unicode-aware)
 _WORD_RE = re.compile(r"[\w]+", re.UNICODE)
 
+# French stemmer (lazy-init singleton)
+_FR_STEMMER: object | None = None
+
+
+def _get_stemmer():
+    """Return the module-level French Snowball stemmer, creating it on the first call."""
+    global _FR_STEMMER
+    if _FR_STEMMER is None:  # nothing cached yet: build once and keep it in the module global
+        _FR_STEMMER = _snowball_stemmer("french")
+    return _FR_STEMMER
+
+
+def stem_token(token: str) -> str:
+    """Return the stem the cached French Snowball stemmer produces for ``token``."""
+    return _get_stemmer().stemWord(token)  # NOTE(review): confirm 'mangeons' → 'mang' with the library
+
+
+def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
+    """Collect the distinct French stems of the given tokens (list or set) into a set."""
+    stem_word = _get_stemmer().stemWord
+    return set(map(stem_word, tokens))
+
 
 # ---------------------------------------------------------------------------
 # Accent / Unicode normalization helpers
@@ -381,8 +405,17 @@ class InvertedIndex:
                 tf: Dict[str, int] = defaultdict(int)
                 for token in tokens:
                     tf[token] += 1
+                # Also index stemmed forms (French stemming)
+                stems = stem_tokens(list(tf.keys()))
                 for token, freq in tf.items():
                     self.word_index[token][doc_key] = freq
+                for stem in stems:
+                    # Sum TF of all forms per stem. NOTE(review): a token equal to its stem is counted twice.
+                    stem_freq = self.word_index[stem].get(doc_key, 0)
+                    for token, freq in tf.items():
+                        if stem_token(token) == stem:
+                            stem_freq += freq
+                    self.word_index[stem][doc_key] = stem_freq
 
             # --- Tag indexes ---
             for tag in vault_data.get("tags", {}):
@@ -446,10 +479,20 @@ class InvertedIndex:
         for token in tokens:
             if token:
                 tf[token] += 1
+        # Also compute stems
+        stems = stem_tokens(list(tf.keys()))
         for token, freq in tf.items():
             if not self.word_index.get(token):
                 self._sorted_tokens.add(token)
             self.word_index[token][doc_key] = freq
+        for stem in stems:
+            stem_freq = self.word_index[stem].get(doc_key, 0)
+            for token, freq in tf.items():
+                if stem_token(token) == stem:
+                    stem_freq += freq
+            if not self.word_index.get(stem):
+                self._sorted_tokens.add(stem)
+            self.word_index[stem][doc_key] = stem_freq
 
     def remove_document(self, vault_name: str, path: str):
         """Remove a single document incrementally."""
@@ -501,7 +544,10 @@ class InvertedIndex:
         # Word index
         content = file_info.get("content", "")
         full_text = title + " " + content
-        for token in set(tokenize(full_text)):
+        tokens_to_remove = set(tokenize(full_text))
+        # Also compute stems to clean up
+        stems_to_clean = stem_tokens(tokens_to_remove)
+        for token in tokens_to_remove:
             if not token:
                 continue
             wi = self.word_index.get(token)
@@ -511,6 +557,14 @@ class InvertedIndex:
                     del self.word_index[token]
                     if not skip_sorted_cleanup:
                         self._sorted_tokens.discard(token)
+        for stem in stems_to_clean:
+            wi = self.word_index.get(stem)
+            if wi:
+                wi.pop(doc_key, None)
+                if not wi:
+                    del self.word_index[stem]
+                    if not skip_sorted_cleanup:
+                        self._sorted_tokens.discard(stem)
 
     def idf(self, term: str) -> float:
         """Inverse Document Frequency for a term.
@@ -939,6 +993,9 @@ def advanced_search(
     query_terms = []
     for t in query_terms_raw:
         query_terms.extend(tokenize(t))
+    # Also add stemmed forms for French stemming support
+    query_stems = stem_tokens(query_terms)
+    query_terms.extend(stem for stem in query_stems if stem not in query_terms)
     has_terms = len(query_terms) > 0
 
     if not has_terms and not all_tags and not parsed["title"] and not parsed["path"] and not parsed["ext"]: