fix: stem_token/stem_tokens — try/except sur crash snowballstemmer (IndexError sur tokens exotiques)
All checks were successful
CI / lint (push) Successful in 14s
CI / security (push) Successful in 8s
CI / test (push) Successful in 16s
CI / build (push) Successful in 2s

This commit is contained in:
Bruno Charest 2026-05-29 13:33:04 -04:00
parent 10fbeccb5f
commit fe1a2be364

View File

@ -44,13 +44,22 @@ def _get_stemmer():
def stem_token(token: str) -> str:
    """Return the French stem of a single word (e.g. 'mangeons' -> 'mang').

    Stemming is best-effort: if obtaining the stemmer or stemming the word
    raises any ``Exception``, the token is returned unchanged.
    """
    try:
        stem = _get_stemmer().stemWord(token)
    except Exception:
        # Fallback: keep the original token if the stemmer crashes.
        return token
    else:
        return stem
def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
    """Return the set of unique stems for a collection of tokens.

    Each token is stemmed through ``stem_token``, so a token that makes the
    stemmer crash is kept as-is instead of aborting the whole batch.

    Args:
        tokens: The words to stem; duplicates collapse in the result.

    Returns:
        The unique stems, with the raw token standing in for any word
        that could not be stemmed.
    """
    # Delegate to stem_token rather than calling stemWord directly:
    # stem_token already falls back to the raw token on any Exception,
    # so no additional try/except is needed around each call here.
    return {stem_token(t) for t in tokens}
# ---------------------------------------------------------------------------