From 1b9ba69c52f9324884e1e6bf81be394141b7724a Mon Sep 17 00:00:00 2001 From: Bruno Charest Date: Thu, 28 May 2026 15:23:51 -0400 Subject: [PATCH] =?UTF-8?q?perf:=20O(T)=20stemming=20instead=20of=20O(S?= =?UTF-8?q?=C3=97T)=20=E2=80=94=20fixes=2015min=20index=20freeze?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace double-nested stem loop (stems × tokens) with single-pass stem frequency map. For 100 unique tokens per file: 10,000 iterations → 100 iterations per file. Critical for large vaults. --- backend/search.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/backend/search.py b/backend/search.py index 61e4135..096b2c9 100644 --- a/backend/search.py +++ b/backend/search.py @@ -406,16 +406,17 @@ class InvertedIndex: for token in tokens: tf[token] += 1 # Also index stemmed forms (French stemming) - stems = stem_tokens(list(tf.keys())) + # Compute stem frequencies in one pass instead of O(S×T) + stem_freqs: Dict[str, int] = defaultdict(int) + for token, freq in tf.items(): + stemmed = stem_token(token) + if stemmed != token: # only index stem if different + stem_freqs[stemmed] += freq for token, freq in tf.items(): self.word_index[token][doc_key] = freq - for stem in stems: - # Accumulate frequency for stem (sum of all forms mapping to same stem) - stem_freq = self.word_index[stem].get(doc_key, 0) - for token, freq in tf.items(): - if stem_token(token) == stem: - stem_freq += freq - self.word_index[stem][doc_key] = stem_freq + for stem, freq in stem_freqs.items(): + existing = self.word_index[stem].get(doc_key, 0) + self.word_index[stem][doc_key] = existing + freq # --- Tag indexes --- for tag in vault_data.get("tags", {}): @@ -479,20 +480,21 @@ class InvertedIndex: for token in tokens: if token: tf[token] += 1 - # Also compute stems - stems = stem_tokens(list(tf.keys())) + # Also compute stems in one pass + stem_freqs: Dict[str, int] = defaultdict(int) + for token, freq in tf.items(): + stemmed = stem_token(token) + if stemmed != token: + stem_freqs[stemmed] += freq for token, freq in tf.items(): if not self.word_index.get(token): self._sorted_tokens.add(token) self.word_index[token][doc_key] = freq - for stem in stems: - stem_freq = self.word_index[stem].get(doc_key, 0) - for token, freq in tf.items(): - if stem_token(token) == stem: - stem_freq += freq + for stem, freq in stem_freqs.items(): if not self.word_index.get(stem): self._sorted_tokens.add(stem) - self.word_index[stem][doc_key] = stem_freq + existing = self.word_index[stem].get(doc_key, 0) + self.word_index[stem][doc_key] = existing + freq def remove_document(self, vault_name: str, path: str): """Remove a single document incrementally."""