feat: French stemming (snowballstemmer) — 'recettes' matches 'recette', 'mangeons' matches 'manger'
Add the French Snowball stemmer to the tokenization pipeline:
- Index both original tokens AND their stems in InvertedIndex
- Query terms are also stemmed before lookup
- Stemmed forms accumulate TF from all original forms
- Lazy-init singleton pattern for the stemmer
This commit is contained in:
parent
271a463d6d
commit
23fa003422
@ -9,4 +9,5 @@ watchdog>=4.0.0
|
||||
argon2-cffi>=23.1.0
|
||||
python-jose>=3.3.0
|
||||
sortedcontainers>=2.4.0
|
||||
snowballstemmer>=2.2.0
|
||||
weasyprint>=60.0
|
||||
|
||||
@ -7,6 +7,8 @@ import unicodedata
|
||||
from collections import defaultdict
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
from snowballstemmer import stemmer as _snowball_stemmer
|
||||
|
||||
from backend import indexer as _indexer
|
||||
from backend.indexer import index
|
||||
|
||||
@ -28,6 +30,28 @@ SUGGEST_LIMIT = 10 # Default max suggestions returned
|
||||
# Regex to tokenize text into alphanumeric words (Unicode-aware)
|
||||
_WORD_RE = re.compile(r"[\w]+", re.UNICODE)
|
||||
|
||||
# French stemmer (lazy-init singleton)
|
||||
_FR_STEMMER: object | None = None
|
||||
|
||||
|
||||
def _get_stemmer():
    """Return the shared French Snowball stemmer, creating it on first use.

    The instance is kept in the module-level ``_FR_STEMMER`` variable so
    that every later call hands back the same object instead of building
    a new stemmer.
    """
    global _FR_STEMMER
    # Fast path: the stemmer was already built by an earlier call.
    if _FR_STEMMER is not None:
        return _FR_STEMMER
    _FR_STEMMER = _snowball_stemmer("french")
    return _FR_STEMMER
|
||||
|
||||
|
||||
def stem_token(token: str) -> str:
    """Return the French Snowball stem of ``token`` (e.g. 'mangeons' → 'mang').

    The word is passed unchanged to the cached stemmer's ``stemWord``.
    """
    french_stemmer = _get_stemmer()
    return french_stemmer.stemWord(token)
|
||||
|
||||
|
||||
def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
    """Collect the distinct French stems of every word in ``tokens``.

    Tokens that reduce to the same stem collapse into a single entry of
    the returned set.
    """
    # Fetch the cached stemmer once and reuse its bound method for all tokens.
    stem_word = _get_stemmer().stemWord
    return set(map(stem_word, tokens))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Accent / Unicode normalization helpers
|
||||
@ -381,8 +405,17 @@ class InvertedIndex:
|
||||
tf: Dict[str, int] = defaultdict(int)
|
||||
for token in tokens:
|
||||
tf[token] += 1
|
||||
# Also index stemmed forms (French stemming)
|
||||
stems = stem_tokens(list(tf.keys()))
|
||||
for token, freq in tf.items():
|
||||
self.word_index[token][doc_key] = freq
|
||||
for stem in stems:
|
||||
# Accumulate frequency for stem (sum of all forms mapping to same stem)
|
||||
stem_freq = self.word_index[stem].get(doc_key, 0)
|
||||
for token, freq in tf.items():
|
||||
if stem_token(token) == stem:
|
||||
stem_freq += freq
|
||||
self.word_index[stem][doc_key] = stem_freq
|
||||
|
||||
# --- Tag indexes ---
|
||||
for tag in vault_data.get("tags", {}):
|
||||
@ -446,10 +479,20 @@ class InvertedIndex:
|
||||
for token in tokens:
|
||||
if token:
|
||||
tf[token] += 1
|
||||
# Also compute stems
|
||||
stems = stem_tokens(list(tf.keys()))
|
||||
for token, freq in tf.items():
|
||||
if not self.word_index.get(token):
|
||||
self._sorted_tokens.add(token)
|
||||
self.word_index[token][doc_key] = freq
|
||||
for stem in stems:
|
||||
stem_freq = self.word_index[stem].get(doc_key, 0)
|
||||
for token, freq in tf.items():
|
||||
if stem_token(token) == stem:
|
||||
stem_freq += freq
|
||||
if not self.word_index.get(stem):
|
||||
self._sorted_tokens.add(stem)
|
||||
self.word_index[stem][doc_key] = stem_freq
|
||||
|
||||
def remove_document(self, vault_name: str, path: str):
|
||||
"""Remove a single document incrementally."""
|
||||
@ -501,7 +544,10 @@ class InvertedIndex:
|
||||
# Word index
|
||||
content = file_info.get("content", "")
|
||||
full_text = title + " " + content
|
||||
for token in set(tokenize(full_text)):
|
||||
tokens_to_remove = set(tokenize(full_text))
|
||||
# Also compute stems to clean up
|
||||
stems_to_clean = stem_tokens(tokens_to_remove)
|
||||
for token in tokens_to_remove:
|
||||
if not token:
|
||||
continue
|
||||
wi = self.word_index.get(token)
|
||||
@ -511,6 +557,14 @@ class InvertedIndex:
|
||||
del self.word_index[token]
|
||||
if not skip_sorted_cleanup:
|
||||
self._sorted_tokens.discard(token)
|
||||
for stem in stems_to_clean:
|
||||
wi = self.word_index.get(stem)
|
||||
if wi:
|
||||
wi.pop(doc_key, None)
|
||||
if not wi:
|
||||
del self.word_index[stem]
|
||||
if not skip_sorted_cleanup:
|
||||
self._sorted_tokens.discard(stem)
|
||||
|
||||
def idf(self, term: str) -> float:
|
||||
"""Inverse Document Frequency for a term.
|
||||
@ -939,6 +993,9 @@ def advanced_search(
|
||||
query_terms = []
|
||||
for t in query_terms_raw:
|
||||
query_terms.extend(tokenize(t))
|
||||
# Also add stemmed forms for French stemming support
|
||||
query_stems = stem_tokens(query_terms)
|
||||
query_terms.extend(stem for stem in query_stems if stem not in query_terms)
|
||||
has_terms = len(query_terms) > 0
|
||||
|
||||
if not has_terms and not all_tags and not parsed["title"] and not parsed["path"] and not parsed["ext"]:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user