From 1b9ba69c52f9324884e1e6bf81be394141b7724a Mon Sep 17 00:00:00 2001
From: Bruno Charest <bruno.charest@gmail.com>
Date: Thu, 28 May 2026 15:23:51 -0400
Subject: [PATCH] =?UTF-8?q?perf:=20O(T)=20stemming=20instead=20of=20O(S?=
 =?UTF-8?q?=C3=97T)=20=E2=80=94=20fixes=2015min=20index=20freeze?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

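Replace the doubly nested stem loop (stems × tokens, calling stem_token
on every pair) with a single-pass stem frequency map. For a file with
100 unique tokens this cuts up to 10,000 stem_token calls down to 100
per file. Critical for large vaults.

Behavior note: a token that is already its own stem is now skipped when
accumulating stem frequencies, because its exact-form index entry
already records that count.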
---
 backend/search.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/backend/search.py b/backend/search.py
index 61e4135..096b2c9 100644
--- a/backend/search.py
+++ b/backend/search.py
@@ -406,16 +406,17 @@ class InvertedIndex:
                 for token in tokens:
                     tf[token] += 1
                 # Also index stemmed forms (French stemming)
-                stems = stem_tokens(list(tf.keys()))
+                # Compute stem frequencies in one pass instead of O(S×T)
+                stem_freqs: Dict[str, int] = defaultdict(int)
+                for token, freq in tf.items():
+                    stemmed = stem_token(token)
+                    if stemmed != token:  # only index stem if different
+                        stem_freqs[stemmed] += freq
                 for token, freq in tf.items():
                     self.word_index[token][doc_key] = freq
-                for stem in stems:
-                    # Accumulate frequency for stem (sum of all forms mapping to same stem)
-                    stem_freq = self.word_index[stem].get(doc_key, 0)
-                    for token, freq in tf.items():
-                        if stem_token(token) == stem:
-                            stem_freq += freq
-                    self.word_index[stem][doc_key] = stem_freq
+                for stem, freq in stem_freqs.items():
+                    existing = self.word_index[stem].get(doc_key, 0)
+                    self.word_index[stem][doc_key] = existing + freq
 
             # --- Tag indexes ---
             for tag in vault_data.get("tags", {}):
@@ -479,20 +480,21 @@ class InvertedIndex:
         for token in tokens:
             if token:
                 tf[token] += 1
-        # Also compute stems
-        stems = stem_tokens(list(tf.keys()))
+        # Also compute stems in one pass
+        stem_freqs: Dict[str, int] = defaultdict(int)
+        for token, freq in tf.items():
+            stemmed = stem_token(token)
+            if stemmed != token:
+                stem_freqs[stemmed] += freq
         for token, freq in tf.items():
             if not self.word_index.get(token):
                 self._sorted_tokens.add(token)
             self.word_index[token][doc_key] = freq
-        for stem in stems:
-            stem_freq = self.word_index[stem].get(doc_key, 0)
-            for token, freq in tf.items():
-                if stem_token(token) == stem:
-                    stem_freq += freq
+        for stem, freq in stem_freqs.items():
             if not self.word_index.get(stem):
                 self._sorted_tokens.add(stem)
-            self.word_index[stem][doc_key] = stem_freq
+            existing = self.word_index[stem].get(doc_key, 0)
+            self.word_index[stem][doc_key] = existing + freq
 
     def remove_document(self, vault_name: str, path: str):
         """Remove a single document incrementally."""