perf: O(T) stemming instead of O(S×T) — fixes 15min index freeze
All checks were successful
CI / lint (push) Successful in 14s
CI / security (push) Successful in 9s
CI / test (push) Successful in 18s
CI / build (push) Successful in 4s

Replace the nested stem loop (S stems × T tokens) with a single-pass
stem-frequency map. For a file with 100 unique tokens, this cuts the work
from up to 10,000 iterations to 100. Critical for large vaults.
This commit is contained in:
Bruno Charest 2026-05-28 15:23:51 -04:00
parent 25cfd7cc56
commit 1b9ba69c52

View File

@ -406,16 +406,17 @@ class InvertedIndex:
for token in tokens: for token in tokens:
tf[token] += 1 tf[token] += 1
# Also index stemmed forms (French stemming) # Also index stemmed forms (French stemming)
stems = stem_tokens(list(tf.keys())) # Compute stem frequencies in one pass instead of O(S×T)
stem_freqs: Dict[str, int] = defaultdict(int)
for token, freq in tf.items():
stemmed = stem_token(token)
if stemmed != token: # only index stem if different
stem_freqs[stemmed] += freq
for token, freq in tf.items(): for token, freq in tf.items():
self.word_index[token][doc_key] = freq self.word_index[token][doc_key] = freq
for stem in stems: for stem, freq in stem_freqs.items():
# Accumulate frequency for stem (sum of all forms mapping to same stem) existing = self.word_index[stem].get(doc_key, 0)
stem_freq = self.word_index[stem].get(doc_key, 0) self.word_index[stem][doc_key] = existing + freq
for token, freq in tf.items():
if stem_token(token) == stem:
stem_freq += freq
self.word_index[stem][doc_key] = stem_freq
# --- Tag indexes --- # --- Tag indexes ---
for tag in vault_data.get("tags", {}): for tag in vault_data.get("tags", {}):
@ -479,20 +480,21 @@ class InvertedIndex:
for token in tokens: for token in tokens:
if token: if token:
tf[token] += 1 tf[token] += 1
# Also compute stems # Also compute stems in one pass
stems = stem_tokens(list(tf.keys())) stem_freqs: Dict[str, int] = defaultdict(int)
for token, freq in tf.items():
stemmed = stem_token(token)
if stemmed != token:
stem_freqs[stemmed] += freq
for token, freq in tf.items(): for token, freq in tf.items():
if not self.word_index.get(token): if not self.word_index.get(token):
self._sorted_tokens.add(token) self._sorted_tokens.add(token)
self.word_index[token][doc_key] = freq self.word_index[token][doc_key] = freq
for stem in stems: for stem, freq in stem_freqs.items():
stem_freq = self.word_index[stem].get(doc_key, 0)
for token, freq in tf.items():
if stem_token(token) == stem:
stem_freq += freq
if not self.word_index.get(stem): if not self.word_index.get(stem):
self._sorted_tokens.add(stem) self._sorted_tokens.add(stem)
self.word_index[stem][doc_key] = stem_freq existing = self.word_index[stem].get(doc_key, 0)
self.word_index[stem][doc_key] = existing + freq
def remove_document(self, vault_name: str, path: str): def remove_document(self, vault_name: str, path: str):
"""Remove a single document incrementally.""" """Remove a single document incrementally."""