From 23fa0034229d2b6c71ce11a443e8f2164d6c9453 Mon Sep 17 00:00:00 2001
From: Bruno Charest
Date: Thu, 28 May 2026 13:15:37 -0400
Subject: [PATCH] =?UTF-8?q?feat:=20French=20stemming=20(snowballstemmer)?=
 =?UTF-8?q?=20=E2=80=94=20'recettes'=20matches=20'recette',=20'mangeons'?=
 =?UTF-8?q?=20matches=20'manger'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add French snowball stemmer to tokenization pipeline:
- Index both original tokens AND their stems in InvertedIndex
- Query terms are also stemmed before lookup
- Stemmed forms accumulate TF from all original forms
- Lazy-init singleton pattern for stemmer
---
Review notes (this section, up to the first "diff --git", is not part of
the commit message):
* Stem postings are now assigned from a single-pass per-stem sum instead
  of being added on top of the existing posting. Previously a stem that
  was also a surface token of the same document had that token's count
  added twice, and every token was re-stemmed once per stem.
* Line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch; check the context lines
  against backend/search.py before applying. The post-image blob id on
  the search.py "index" line predates these edits.
* NOTE(review): advanced_search appends the stems to query_terms; confirm
  the matcher treats them as alternatives and not as additional required
  terms, otherwise 'recettes' still will not match a note that only
  contains 'recette'.

 backend/requirements.txt |  1 +
 backend/search.py        | 61 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index fcc8362..82eb51c 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -9,4 +9,5 @@ watchdog>=4.0.0
 argon2-cffi>=23.1.0
 python-jose>=3.3.0
 sortedcontainers>=2.4.0
+snowballstemmer>=2.2.0
 weasyprint>=60.0
diff --git a/backend/search.py b/backend/search.py
index 16b9d3a..61e4135 100644
--- a/backend/search.py
+++ b/backend/search.py
@@ -7,6 +7,8 @@ import unicodedata
 from collections import defaultdict
 from typing import List, Dict, Any, Optional, Tuple
 
+from snowballstemmer import stemmer as _snowball_stemmer
+
 from backend import indexer as _indexer
 from backend.indexer import index
 
@@ -28,6 +30,28 @@ SUGGEST_LIMIT = 10 # Default max suggestions returned
 # Regex to tokenize text into alphanumeric words (Unicode-aware)
 _WORD_RE = re.compile(r"[\w]+", re.UNICODE)
 
+# French stemmer (lazy-init singleton)
+_FR_STEMMER: object | None = None
+
+
+def _get_stemmer():
+    """Return a cached French snowball stemmer instance."""
+    global _FR_STEMMER
+    if _FR_STEMMER is None:
+        _FR_STEMMER = _snowball_stemmer("french")
+    return _FR_STEMMER
+
+
+def stem_token(token: str) -> str:
+    """Reduce a word to its French stem (e.g. 'mangeons' → 'mang')."""
+    return _get_stemmer().stemWord(token)
+
+
+def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
+    """Return the set of unique stems for a list of tokens."""
+    s = _get_stemmer()
+    return {s.stemWord(t) for t in tokens}
+
 
 # ---------------------------------------------------------------------------
 # Accent / Unicode normalization helpers
@@ -381,8 +405,18 @@ class InvertedIndex:
                 tf: Dict[str, int] = defaultdict(int)
                 for token in tokens:
                     tf[token] += 1
+                # Also index stemmed forms (French stemming). Each stem's TF is
+                # the sum over every surface form that reduces to it.
+                stem_tf: Dict[str, int] = defaultdict(int)
+                for token, freq in tf.items():
+                    stem_tf[stem_token(token)] += freq
                 for token, freq in tf.items():
                     self.word_index[token][doc_key] = freq
+                # Assign rather than add: when a stem is also a surface token,
+                # its posting was just written above and adding to it would
+                # count that token twice.
+                for stem, freq in stem_tf.items():
+                    self.word_index[stem][doc_key] = freq
 
             # --- Tag indexes ---
             for tag in vault_data.get("tags", {}):
@@ -446,10 +480,21 @@ class InvertedIndex:
         for token in tokens:
             if token:
                 tf[token] += 1
+        # Also compute stems: each stem's TF is the sum over every surface
+        # form that reduces to it.
+        stem_tf: Dict[str, int] = defaultdict(int)
+        for token, freq in tf.items():
+            stem_tf[stem_token(token)] += freq
         for token, freq in tf.items():
             if not self.word_index.get(token):
                 self._sorted_tokens.add(token)
             self.word_index[token][doc_key] = freq
+        # Assign rather than add, so a stem that is also a surface token is
+        # not counted twice on top of the posting written above.
+        for stem, freq in stem_tf.items():
+            if not self.word_index.get(stem):
+                self._sorted_tokens.add(stem)
+            self.word_index[stem][doc_key] = freq
 
     def remove_document(self, vault_name: str, path: str):
         """Remove a single document incrementally."""
@@ -501,7 +546,10 @@ class InvertedIndex:
         # Word index
         content = file_info.get("content", "")
         full_text = title + " " + content
-        for token in set(tokenize(full_text)):
+        tokens_to_remove = set(tokenize(full_text))
+        # Also compute stems to clean up
+        stems_to_clean = stem_tokens(tokens_to_remove)
+        for token in tokens_to_remove:
             if not token:
                 continue
             wi = self.word_index.get(token)
@@ -511,6 +559,14 @@ class InvertedIndex:
                     del self.word_index[token]
                     if not skip_sorted_cleanup:
                         self._sorted_tokens.discard(token)
+        for stem in stems_to_clean:
+            wi = self.word_index.get(stem)
+            if wi:
+                wi.pop(doc_key, None)
+                if not wi:
+                    del self.word_index[stem]
+                    if not skip_sorted_cleanup:
+                        self._sorted_tokens.discard(stem)
 
     def idf(self, term: str) -> float:
         """Inverse Document Frequency for a term.
@@ -939,6 +995,9 @@ def advanced_search(
     query_terms = []
     for t in query_terms_raw:
         query_terms.extend(tokenize(t))
+    # Also add stemmed forms for French stemming support
+    query_stems = stem_tokens(query_terms)
+    query_terms.extend(stem for stem in query_stems if stem not in query_terms)
     has_terms = len(query_terms) > 0
 
     if not has_terms and not all_tags and not parsed["title"] and not parsed["path"] and not parsed["ext"]: