fix: stem_token/stem_tokens — try/except sur crash snowballstemmer (IndexError sur tokens exotiques)
This commit is contained in:
parent
10fbeccb5f
commit
fe1a2be364
@ -44,13 +44,22 @@ def _get_stemmer():
|
||||
|
||||
def stem_token(token: str) -> str:
    """Reduce a word to its French stem (e.g. 'mangeons' → 'mang').

    Args:
        token: The word to stem.

    Returns:
        The stem produced by the stemmer, or ``token`` unchanged if the
        stemming call raises.
    """
    try:
        return _get_stemmer().stemWord(token)
    except Exception:
        # Best-effort fallback: the stemmer has been seen to crash (e.g. with
        # IndexError) on unusual tokens, so keep the original token instead
        # of propagating the failure to callers.
        return token
|
||||
|
||||
|
||||
def stem_tokens(tokens: list[str] | set[str]) -> set[str]:
    """Return the set of unique stems for a collection of tokens.

    Args:
        tokens: The words to stem.

    Returns:
        A set containing one stem per input token (duplicates collapse).
        A token whose stemming fails is added to the set unchanged.
    """
    result = set()
    for t in tokens:
        try:
            stemmed = stem_token(t)
        except Exception:
            # Defensive: one bad token must not abort the whole batch, so
            # fall back to the raw token.
            stemmed = t
        result.add(stemmed)
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user