feat: French stemming (snowballstemmer) — 'recettes' matches 'recette', 'mangeons' matches 'manger'
Some checks failed
CI / test (push) Has been cancelled
CI / security (push) Has been cancelled
CI / build (push) Has been cancelled
CI / lint (push) Has been cancelled

Add French snowball stemmer to tokenization pipeline:
- Index both original tokens AND their stems in InvertedIndex
- Query terms are also stemmed before lookup
- Stemmed forms accumulate TF from all original forms
- Lazy-init singleton pattern for stemmer
This commit is contained in:
Bruno Charest 2026-05-28 13:15:37 -04:00
parent 271a463d6d
commit 23fa003422
2 changed files with 59 additions and 1 deletions

View File

@ -9,4 +9,5 @@ watchdog>=4.0.0
argon2-cffi>=23.1.0
python-jose>=3.3.0
sortedcontainers>=2.4.0
snowballstemmer>=2.2.0
weasyprint>=60.0

View File

@ -7,6 +7,8 @@ import unicodedata
from collections import defaultdict
from typing import List, Dict, Any, Optional, Tuple
from snowballstemmer import stemmer as _snowball_stemmer
from backend import indexer as _indexer
from backend.indexer import index
@ -28,6 +30,28 @@ SUGGEST_LIMIT = 10 # Default max suggestions returned
# Regex to tokenize text into alphanumeric words (Unicode-aware)
_WORD_RE = re.compile(r"[\w]+", re.UNICODE)
# Module-level cache for the French Snowball stemmer.  It stays ``None``
# until ``_get_stemmer()`` is called for the first time.
_FR_STEMMER: object | None = None


def _get_stemmer():
    """Return the shared French Snowball stemmer, creating it on first use.

    The instance is stored in the module-level ``_FR_STEMMER`` so that
    later calls reuse it instead of constructing a new stemmer.
    """
    global _FR_STEMMER
    cached = _FR_STEMMER
    if cached is not None:
        return cached
    # First call: build the stemmer and remember it for subsequent calls.
    cached = _snowball_stemmer("french")
    _FR_STEMMER = cached
    return cached
def stem_token(token: str) -> str:
    """Return the French Snowball stem of a single word.

    The result is whatever ``stemWord`` of the cached French stemmer
    produces for *token*; it is a stem, not necessarily a dictionary word.

    NOTE(review): the original example read 'mangeons' -> 'mang'; confirm
    the exact output against the installed snowballstemmer version.
    """
    french_stemmer = _get_stemmer()
    return french_stemmer.stemWord(token)
def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
    """Collect the distinct French stems of *tokens*.

    Every token is passed through the cached stemmer's ``stemWord``.
    Tokens that share a stem collapse into one entry because the result
    is a set; an empty input yields an empty set.
    """
    # Bind the method once so the loop does not repeat the attribute lookup.
    stem_word = _get_stemmer().stemWord
    unique_stems: set[str] = set()
    for word in tokens:
        unique_stems.add(stem_word(word))
    return unique_stems
# ---------------------------------------------------------------------------
# Accent / Unicode normalization helpers
@ -381,8 +405,17 @@ class InvertedIndex:
tf: Dict[str, int] = defaultdict(int)
for token in tokens:
tf[token] += 1
# Also index stemmed forms (French stemming)
stems = stem_tokens(list(tf.keys()))
for token, freq in tf.items():
self.word_index[token][doc_key] = freq
for stem in stems:
# Accumulate frequency for stem (sum of all forms mapping to same stem)
stem_freq = self.word_index[stem].get(doc_key, 0)
for token, freq in tf.items():
if stem_token(token) == stem:
stem_freq += freq
self.word_index[stem][doc_key] = stem_freq
# --- Tag indexes ---
for tag in vault_data.get("tags", {}):
@ -446,10 +479,20 @@ class InvertedIndex:
for token in tokens:
if token:
tf[token] += 1
# Also compute stems
stems = stem_tokens(list(tf.keys()))
for token, freq in tf.items():
if not self.word_index.get(token):
self._sorted_tokens.add(token)
self.word_index[token][doc_key] = freq
for stem in stems:
stem_freq = self.word_index[stem].get(doc_key, 0)
for token, freq in tf.items():
if stem_token(token) == stem:
stem_freq += freq
if not self.word_index.get(stem):
self._sorted_tokens.add(stem)
self.word_index[stem][doc_key] = stem_freq
def remove_document(self, vault_name: str, path: str):
"""Remove a single document incrementally."""
@ -501,7 +544,10 @@ class InvertedIndex:
# Word index
content = file_info.get("content", "")
full_text = title + " " + content
for token in set(tokenize(full_text)):
tokens_to_remove = set(tokenize(full_text))
# Also compute stems to clean up
stems_to_clean = stem_tokens(tokens_to_remove)
for token in tokens_to_remove:
if not token:
continue
wi = self.word_index.get(token)
@ -511,6 +557,14 @@ class InvertedIndex:
del self.word_index[token]
if not skip_sorted_cleanup:
self._sorted_tokens.discard(token)
for stem in stems_to_clean:
wi = self.word_index.get(stem)
if wi:
wi.pop(doc_key, None)
if not wi:
del self.word_index[stem]
if not skip_sorted_cleanup:
self._sorted_tokens.discard(stem)
def idf(self, term: str) -> float:
"""Inverse Document Frequency for a term.
@ -939,6 +993,9 @@ def advanced_search(
query_terms = []
for t in query_terms_raw:
query_terms.extend(tokenize(t))
# Also add stemmed forms for French stemming support
query_stems = stem_tokens(query_terms)
query_terms.extend(stem for stem in query_stems if stem not in query_terms)
has_terms = len(query_terms) > 0
if not has_terms and not all_tags and not parsed["title"] and not parsed["path"] and not parsed["ext"]: