perf: O(T) stemming instead of O(S×T) — fixes 15min index freeze
Replace the doubly nested stem loop (S unique stems × T unique tokens) with a single pass over the tokens that builds a stem-frequency map. For 100 unique tokens per file, this cuts up to 10,000 iterations down to 100 per file. Critical for large vaults.
This commit is contained in:
parent
25cfd7cc56
commit
1b9ba69c52
@ -406,16 +406,17 @@ class InvertedIndex:
|
|||||||
for token in tokens:
|
for token in tokens:
|
||||||
tf[token] += 1
|
tf[token] += 1
|
||||||
# Also index stemmed forms (French stemming)
|
# Also index stemmed forms (French stemming)
|
||||||
stems = stem_tokens(list(tf.keys()))
|
# Compute stem frequencies in one pass instead of O(S×T)
|
||||||
|
stem_freqs: Dict[str, int] = defaultdict(int)
|
||||||
|
for token, freq in tf.items():
|
||||||
|
stemmed = stem_token(token)
|
||||||
|
if stemmed != token: # only index stem if different
|
||||||
|
stem_freqs[stemmed] += freq
|
||||||
for token, freq in tf.items():
|
for token, freq in tf.items():
|
||||||
self.word_index[token][doc_key] = freq
|
self.word_index[token][doc_key] = freq
|
||||||
for stem in stems:
|
for stem, freq in stem_freqs.items():
|
||||||
# Accumulate frequency for stem (sum of all forms mapping to same stem)
|
existing = self.word_index[stem].get(doc_key, 0)
|
||||||
stem_freq = self.word_index[stem].get(doc_key, 0)
|
self.word_index[stem][doc_key] = existing + freq
|
||||||
for token, freq in tf.items():
|
|
||||||
if stem_token(token) == stem:
|
|
||||||
stem_freq += freq
|
|
||||||
self.word_index[stem][doc_key] = stem_freq
|
|
||||||
|
|
||||||
# --- Tag indexes ---
|
# --- Tag indexes ---
|
||||||
for tag in vault_data.get("tags", {}):
|
for tag in vault_data.get("tags", {}):
|
||||||
@ -479,20 +480,21 @@ class InvertedIndex:
|
|||||||
for token in tokens:
|
for token in tokens:
|
||||||
if token:
|
if token:
|
||||||
tf[token] += 1
|
tf[token] += 1
|
||||||
# Also compute stems
|
# Also compute stems in one pass
|
||||||
stems = stem_tokens(list(tf.keys()))
|
stem_freqs: Dict[str, int] = defaultdict(int)
|
||||||
|
for token, freq in tf.items():
|
||||||
|
stemmed = stem_token(token)
|
||||||
|
if stemmed != token:
|
||||||
|
stem_freqs[stemmed] += freq
|
||||||
for token, freq in tf.items():
|
for token, freq in tf.items():
|
||||||
if not self.word_index.get(token):
|
if not self.word_index.get(token):
|
||||||
self._sorted_tokens.add(token)
|
self._sorted_tokens.add(token)
|
||||||
self.word_index[token][doc_key] = freq
|
self.word_index[token][doc_key] = freq
|
||||||
for stem in stems:
|
for stem, freq in stem_freqs.items():
|
||||||
stem_freq = self.word_index[stem].get(doc_key, 0)
|
|
||||||
for token, freq in tf.items():
|
|
||||||
if stem_token(token) == stem:
|
|
||||||
stem_freq += freq
|
|
||||||
if not self.word_index.get(stem):
|
if not self.word_index.get(stem):
|
||||||
self._sorted_tokens.add(stem)
|
self._sorted_tokens.add(stem)
|
||||||
self.word_index[stem][doc_key] = stem_freq
|
existing = self.word_index[stem].get(doc_key, 0)
|
||||||
|
self.word_index[stem][doc_key] = existing + freq
|
||||||
|
|
||||||
def remove_document(self, vault_name: str, path: str):
|
def remove_document(self, vault_name: str, path: str):
|
||||||
"""Remove a single document incrementally."""
|
"""Remove a single document incrementally."""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user