Switch inverted index from stale check to incremental updates

Register a hook with the indexer so that file add/remove events incrementally maintain the inverted index, removing the need for periodic staleness checks and cooldowns. Rebuild the index once on startup via init_inverted_index().
2026-05-26 12:37:59 -04:00 · 2026-05-26 12:37:59 -04:00 · 775722f5d4
commit 775722f5d4
parent 872a3e56dd
4 changed files with 460 additions and 363 deletions
--- a/backend/indexer.py
+++ b/backend/indexer.py
@ -599,6 +599,11 @@ def _remove_file_from_structures(vault_name: str, rel_path: str) -> Optional[Dic
        path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path]

    _index_generation += 1
+
+    # Notify inverted index for incremental update
+    if _on_index_change:
+        _on_index_change('remove', vault_name, rel_path, removed)
+
    return removed


@ -666,6 +671,10 @@ def _add_file_to_structures(vault_name: str, file_info: Dict[str, Any]):

    _index_generation += 1

+    # Notify inverted index for incremental update
+    if _on_index_change:
+        _on_index_change('add', vault_name, file_info["path"], file_info)
+

 async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]:
    """Re-index a single file without full rebuild.
--- a/backend/main.py
+++ b/backend/main.py
@ -44,7 +44,7 @@ from backend.indexer import (
    remove_vault_from_index,
    add_vault_to_index,
 )
-from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags
+from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags, init_inverted_index
 from backend.image_processor import preprocess_images
 from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats
 from backend.vault_settings import (
@ -379,14 +379,9 @@ async def _on_vault_change(events: list):
    Processes each event (create/modify/delete/move) and updates the index
    incrementally, then broadcasts SSE notifications.
    """
-    import backend.indexer as idx
    updated_vaults = set()
    changes = []

-    # Temporarily suppress per-file generation increments to coalesce them
-    # into a single increment at the end of the batch.
-    old_gen = idx._index_generation
-
    for event in events:
        vault_name = event["vault"]
        event_type = event["type"]
@ -415,11 +410,6 @@ async def _on_vault_change(events: list):
        except Exception as e:
            logger.error(f"Error processing {event_type} event for {src}: {e}")

-    # Restore generation to only increment by 1 for the whole batch
-    # (unless it was already incremented by other operations)
-    if idx._index_generation > old_gen + 1:
-        idx._index_generation = old_gen + 1
-
    if changes:
        await sse_manager.broadcast("index_updated", {
            "vaults": list(updated_vaults),
@ -520,6 +510,9 @@ async def lifespan(app: FastAPI):
        logger.info("Background indexing started")
        await build_index(_progress_cb)

+        # Build inverted index for search (one-time, then incremental)
+        init_inverted_index()
+
        # Start file watcher
        config = _load_config()
        watcher_enabled = config.get("watcher_enabled", True)
--- a/backend/search.py
+++ b/backend/search.py
@ -239,9 +239,9 @@ def _escape_html(text: str) -> str:
 class InvertedIndex:
    """In-memory inverted index supporting TF-IDF scoring.

-    Built lazily from the global ``index`` dict whenever a search or
-    suggestion request detects that the underlying vault index has changed.
-    The class is designed to be a singleton — use ``get_inverted_index()``.
+    Built initially via ``rebuild()`` from the global index, then
+    maintained incrementally via ``add_document()`` / ``remove_document()``
+    hooks from the file watcher and API mutations.

    Attributes:
        word_index: ``{token: {doc_key: term_frequency}}``
@ -249,7 +249,6 @@ class InvertedIndex:
        tag_norm_map: ``{normalized_tag: original_tag}``
        tag_prefix_index: ``{prefix: [original_tag, ...]}``
        doc_count: Total number of indexed documents.
-        _source_id: Fingerprint of the source index to detect staleness.
    """

    def __init__(self) -> None:
@ -264,23 +263,7 @@ class InvertedIndex:
        self.vault_docs: Dict[str, set] = defaultdict(set)
        self.tag_docs: Dict[str, set] = defaultdict(set)
        self._sorted_tokens: List[str] = []
-        self._source_generation: int = -1
-        self._last_rebuild: float = 0
-        self._rebuild_cooldown: float = 3.0  # seconds
-
-    def is_stale(self) -> bool:
-        """Check if the inverted index needs rebuilding.
-
-        Uses a cooldown (3s) to prevent rapid rebuilds from file watcher
-        events. Staleness is only reported if the generation has changed
-        AND the cooldown has elapsed since the last rebuild.
-        """
-        import time
-        if _indexer._index_generation == self._source_generation:
-            return False
-        if time.time() - self._last_rebuild < self._rebuild_cooldown:
-            return False
-        return True
+        self._ready: bool = False  # True after initial build

    def rebuild(self) -> None:
        """Rebuild inverted index from the global ``index`` dict.
@ -288,9 +271,7 @@ class InvertedIndex:
        Tokenizes titles and content of every file, computes term frequencies,
        and builds auxiliary indexes for tag and title prefix suggestions.
        """
-        import time
-        self._last_rebuild = time.time()
-        logger.info("Rebuilding inverted index...")
+        logger.info("Building inverted index...")
        self.word_index = defaultdict(dict)
        self.title_index = defaultdict(list)
        self.tag_norm_map = {}
@ -351,7 +332,7 @@ class InvertedIndex:
                        self.tag_prefix_index[prefix].append(tag)

        self._sorted_tokens = sorted(self.word_index.keys())
-        self._source_generation = _indexer._index_generation
+        self._ready = True
        logger.info(
            "Inverted index built: %d documents, %d unique tokens, %d tags",
            self.doc_count,
@ -359,6 +340,117 @@ class InvertedIndex:
            len(self.tag_norm_map),
        )

+    def add_document(self, vault_name: str, path: str, file_info: dict):
+        """Add or update a single document incrementally."""
+        if not self._ready:
+            return
+        doc_key = f"{vault_name}::{path}"
+        old_file_info = self.doc_info.get(doc_key)
+        if old_file_info is not None:
+            self._remove_doc_internals(doc_key, vault_name, old_file_info, skip_sorted_cleanup=True)
+        else:
+            self.doc_count += 1
+        # Metadata
+        self.doc_info[doc_key] = file_info
+        self.doc_vault[doc_key] = vault_name
+        self.vault_docs[vault_name].add(doc_key)
+        # Tags
+        tags = file_info.get("tags", [])
+        for tag in tags:
+            self.tag_docs[tag.lower()].add(doc_key)
+            norm_tag = normalize_text(tag)
+            if norm_tag not in self.tag_norm_map:
+                self.tag_norm_map[norm_tag] = tag
+                for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
+                    prefix = norm_tag[:plen]
+                    if tag not in self.tag_prefix_index[prefix]:
+                        self.tag_prefix_index[prefix].append(tag)
+        # Title tokens
+        title = file_info.get("title", "")
+        title_tokens = tokenize(title)
+        for token in set(title_tokens):
+            if token:
+                self.title_index[token].append(doc_key)
+        # Title norm map
+        norm_title = normalize_text(title)
+        if norm_title:
+            self.title_norm_map[norm_title].append({"vault": vault_name, "path": path, "title": title})
+        # Word index (content + title TF)
+        content = file_info.get("content", "")
+        full_text = title + " " + content
+        tokens = tokenize(full_text)
+        tf: Dict[str, int] = defaultdict(int)
+        for token in tokens:
+            if token:
+                tf[token] += 1
+        for token, freq in tf.items():
+            if not self.word_index.get(token):
+                bisect.insort(self._sorted_tokens, token)
+            self.word_index[token][doc_key] = freq
+
+    def remove_document(self, vault_name: str, path: str):
+        """Remove a single document incrementally."""
+        if not self._ready:
+            return
+        doc_key = f"{vault_name}::{path}"
+        file_info = self.doc_info.get(doc_key)
+        if file_info is None:
+            return
+        self._remove_doc_internals(doc_key, vault_name, file_info, skip_sorted_cleanup=False)
+        self.doc_count -= 1
+
+    def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict, skip_sorted_cleanup: bool = False):
+        """Remove one doc_key from all indexes without adjusting doc_count."""
+        # Metadata
+        self.doc_info.pop(doc_key, None)
+        self.doc_vault.pop(doc_key, None)
+        if vault_name in self.vault_docs:
+            self.vault_docs[vault_name].discard(doc_key)
+        # Tags (per-document, NOT the global tag_norm_map)
+        for tag in file_info.get("tags", []):
+            td = self.tag_docs.get(tag.lower())
+            if td:
+                td.discard(doc_key)
+                if not td:
+                    del self.tag_docs[tag.lower()]
+        # Title tokens
+        title = file_info.get("title", "")
+        for token in set(tokenize(title)):
+            if not token:
+                continue
+            ti = self.title_index.get(token)
+            if ti:
+                try:
+                    ti.remove(doc_key)
+                except ValueError:
+                    pass
+                if not ti:
+                    del self.title_index[token]
+        # Title norm map
+        norm_title = normalize_text(title)
+        if norm_title and norm_title in self.title_norm_map:
+            self.title_norm_map[norm_title] = [
+                e for e in self.title_norm_map[norm_title]
+                if not (e["vault"] == vault_name and e["path"] == file_info.get("path"))
+            ]
+            if not self.title_norm_map[norm_title]:
+                del self.title_norm_map[norm_title]
+        # Word index
+        content = file_info.get("content", "")
+        full_text = title + " " + content
+        for token in set(tokenize(full_text)):
+            if not token:
+                continue
+            wi = self.word_index.get(token)
+            if wi:
+                wi.pop(doc_key, None)
+                if not wi:
+                    del self.word_index[token]
+                    if not skip_sorted_cleanup:
+                        idx = bisect.bisect_left(self._sorted_tokens, token)
+                        if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token:
+                            self._sorted_tokens.pop(idx)
+
    def idf(self, term: str) -> float:
        """Inverse Document Frequency for a term.

@ -424,9 +516,39 @@ class InvertedIndex:
 _inverted_index = InvertedIndex()


+def _on_index_change_hook(action: str, vault_name: str, path: str, file_info: dict):
+    """Callback registered with indexer for incremental inverted index updates."""
+    inv = _inverted_index
+    try:
+        if action == 'add' and file_info:
+            inv.add_document(vault_name, path, file_info)
+        elif action == 'remove':
+            inv.remove_document(vault_name, path)
+    except Exception as e:
+        logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
+
+
+# Register the hook with indexer (indexer is already imported at top of file)
+_indexer.set_index_change_hook(_on_index_change_hook)
+
+
+def init_inverted_index():
+    """Force initial inverted index build. Called after build_index completes on startup."""
+    if any(vdata.get("files") for vdata in index.values()):
+        _inverted_index.rebuild()
+        logger.info("Inverted index initialized.")
+
+
 def get_inverted_index() -> InvertedIndex:
-    """Return the singleton inverted index, rebuilding if stale."""
-    if _inverted_index.is_stale():
+    """Return the singleton inverted index.
+
+    Auto-builds on first call if the index has files but the inverted
+    index hasn't been built yet (fallback for paths that don't go through
+    ``init_inverted_index()``).
+    """
+    if _inverted_index.doc_count == 0 and any(
+        vdata.get("files") for vdata in index.values()
+    ):
        _inverted_index.rebuild()
    return _inverted_index

--- a/plan.md
+++ b/plan.md
@ -1,375 +1,348 @@
-# Implementation Plan — Remaining Roadmap Items
+# Plan: Incremental InvertedIndex for 40k+ files

-## 1. 📝 Documentation OpenAPI enrichie (P3) — 5 min
+## Problem Summary

-**Goal:** Add `Field(description=...)` to all Pydantic models without descriptions in `backend/main.py`.
+Every file mutation calls `_add_file_to_structures` / `_remove_file_from_structures` in `backend/indexer.py`, which increments `_index_generation`. When the next search or autocomplete fires, `get_inverted_index()` in `backend/search.py` detects staleness (`is_stale()` returns True) and triggers a full `rebuild()` — O(N) tokenization of ALL files. With 40k+ files this takes 2-5 seconds, making search unusable.

-**Models to update (lines 89–311):**
+The existing 3-second cooldown hack in `is_stale()` only masks the problem; it doesn't fix it.

-| Line | Model | Fields to annotate |
-|------|-------|--------------------|
-| 89 | `FileContentResponse` | `vault`, `path`, `title`, `tags`, `frontmatter`, `html`, `raw_length`, `extension`, `is_markdown`, `unsupported`, `size_bytes` |
-| 103 | `FileRawResponse` | `vault`, `path`, `raw` |
-| 110 | `FileSaveResponse` | `status`, `vault`, `path`, `size` |
-| 118 | `FileDeleteResponse` | `status`, `vault`, `path` |
-| 125 | `SearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
-| 136 | `SearchResponse` | `query`, `vault_filter`, `tag_filter`, `count`, `results` (total, offset, limit already have Field) |
-| 146 | `TagsResponse` | `vault_filter`, `tags` |
-| 152 | `TreeSearchResult` | `vault`, `path`, `name`, `matched_path` (type has Field) |
-| 161 | `TreeSearchResponse` | `query`, `vault_filter`, `results` |
-| 168 | `AdvancedSearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
-| 179 | `SearchFacets` | `tags`, `vaults` (already have default_factory) |
-| 185 | `AdvancedSearchResponse` | `results`, `total`, `offset`, `limit`, `facets` (query_time_ms has Field) |
-| 196 | `TitleSuggestion` | `vault`, `path`, `title` |
-| 203 | `SuggestResponse` | `query`, `suggestions` |
-| 209 | `TagSuggestion` | `tag`, `count` |
-| 215 | `TagSuggestResponse` | `query`, `suggestions` |
-| 221 | `GraphNode` | (all fields already have Field) |
-| 231 | `GraphEdge` | (all fields already have Field) |
-| 239 | `GraphResponse` | `vault`, `path`, `nodes`, `edges` |
-| 247 | `ReloadResponse` | `status`, `vaults` |
-| 253 | `HealthResponse` | `status`, `version`, `vaults`, `total_files` |
-| 265 | `DirectoryCreateResponse` | `success`, `path` |
-| 284 | `DirectoryDeleteResponse` | `success`, `deleted_count` |
-| 296 | `FileCreateResponse` | `success`, `path` |
-| 307 | `FileRenameResponse` | `success` |
+## Solution: Incremental Add/Remove on the InvertedIndex

-**Dependency:** None. Pure documentation change.
+Add two methods to `InvertedIndex` that update ALL internal data structures incrementally when a single file is added, modified, or removed:

---
+- `add_document(vault_name, path, file_info)` — called on create/modify
+- `remove_document(vault_name, path)` — called on delete/move-source

-## 2. 📊 Dashboard statistiques (P3) — 30 min
+Then hook these into `_add_file_to_structures` and `_remove_file_from_structures` in `backend/indexer.py` so the inverted index never goes stale.

-### 2a. Backend: `GET /api/dashboard` (new endpoint)
+Remove the `is_stale()` check and the cooldown mechanism entirely. `rebuild()` is kept, but only for the one-time initial build; after that the inverted index is kept current incrementally.

-**File:** `backend/main.py` — insert at **line ~2547** (after `/api/diagnostics`)
+## Dependency Architecture
+
+**Current import chain:**
+```
+main.py → search.py → indexer.py  (search.py imports `from backend import indexer as _indexer`)
+```
+
+**Problem:** `indexer.py` currently does NOT import from `search.py`, and it cannot start doing so: adding `from backend.search import get_inverted_index` to `indexer.py` would create a circular import (`search.py → indexer.py → search.py`).
+
+**Fix — Option C (Callback/Hook pattern, simplest):**
+
+Add a module-level hook variable in `backend/indexer.py`:

 ```python
-@app.get("/api/dashboard")
-async def api_dashboard(current_user=Depends(require_auth)):
-    """Aggregated dashboard statistics across all accessible vaults."""
-    from backend.indexer import index, vault_config, path_index
-    user_vaults = current_user.get("_token_vaults") or current_user.get("vaults", [])
+# In backend/indexer.py
+_on_index_change: callable = None  # Called as (action, vault_name, path, file_info_or_None)

-    vault_stats = []
-    total_files = 0
-    total_tags = set()
-    total_size = 0
+def set_index_change_hook(hook):
+    """Register a callback for incremental index updates.
+    hook(action, vault_name, path, file_info_or_None) where action is 'add' or 'remove'.
+    """
+    global _on_index_change
+    _on_index_change = hook
+```

-    for vname, vdata in index.items():
-        if "*" not in user_vaults and vname not in user_vaults:
-            continue
-        files = vdata.get("files", [])
-        file_count = len(files)
-        total_files += file_count
-        tags = set()
-        for f in files:
-            tags.update(f.get("tags", []))
-            total_size += f.get("size", 0)
-        total_tags.update(tags)
-        vault_stats.append({
-            "name": vname,
-            "file_count": file_count,
-            "tag_count": len(tags),
-            "total_size_bytes": sum(f.get("size", 0) for f in files),
+Then at the end of `_add_file_to_structures`:
+```python
+if _on_index_change:
+    _on_index_change('add', vault_name, rel_path, file_info)
+```
+
+At the end of `_remove_file_from_structures`:
+```python
+if _on_index_change:
+    _on_index_change('remove', vault_name, rel_path, file_info)  # file_info = removed dict or None
+```
+
+Then in `backend/search.py`, at module load time (after InvertedIndex class is defined):
+```python
+def _on_index_change_hook(action, vault_name, path, file_info):
+    inv = get_inverted_index_raw()  # get without rebuild check
+    if action == 'add':
+        inv.add_document(vault_name, path, file_info)
+    elif action == 'remove':
+        inv.remove_document(vault_name, path)
+
+# Register the hook. No new import is needed here: indexer is already imported
+# at the top of search.py (`from backend import indexer as _indexer`).
+_indexer.set_index_change_hook(_on_index_change_hook)
+```
+
+This avoids circular imports completely because:
+1. `search.py` already imports `indexer.py` at the top (`from backend import indexer as _indexer`)
+2. `indexer.py` never imports `search.py` — it just stores a callback
+3. `search.py` registers the callback AFTER the InvertedIndex class is defined
+
+## Detailed Implementation
+
+### Step 1: Add hook variable to `backend/indexer.py`
+
+File: `backend/indexer.py`
+Changes:
+- After `_index_generation` global (line ~28), add:
+  ```python
+  _on_index_change: callable = None
+  ```
+- Add function `set_index_change_hook(hook)` (bottom of file, near other public functions)
+- Add `if _on_index_change: _on_index_change('add', vault_name, file_info['path'], file_info)` at end of `_add_file_to_structures` (~line 665)
+- Add `if _on_index_change: _on_index_change('remove', vault_name, rel_path, removed)` at end of `_remove_file_from_structures` (~line 597)
+
+### Step 2: Add `add_document` and `remove_document` to InvertedIndex
+
+File: `backend/search.py`
+
+#### `add_document(vault_name, path, file_info)`
+
+```python
+def add_document(self, vault_name: str, path: str, file_info: dict):
+    """Add or update a single document in the inverted index."""
+    
+    doc_key = f"{vault_name}::{path}"
+    old_file_info = self.doc_info.get(doc_key)
+    
+    # If updating an existing document, remove old entries first
+    if old_file_info is not None:
+        self._remove_doc_internals(doc_key, vault_name, old_file_info)
+    else:
+        self.doc_count += 1
+    
+    # --- Metadata ---
+    self.doc_info[doc_key] = file_info
+    self.doc_vault[doc_key] = vault_name
+    self.vault_docs[vault_name].add(doc_key)
+    
+    # --- Tags ---
+    tags = file_info.get("tags", [])
+    for tag in tags:
+        self.tag_docs[tag.lower()].add(doc_key)
+    
+    # --- Title tokens ---
+    title = file_info.get("title", "")
+    title_tokens = tokenize(title)
+    for token in set(title_tokens):
+        self.title_index[token].append(doc_key)
+    
+    # --- Normalized title for prefix suggestions ---
+    norm_title = normalize_text(title)
+    if norm_title:
+        self.title_norm_map[norm_title].append({
+            "vault": vault_name,
+            "path": path,
+            "title": title,
        })
    
-    return {
-        "vaults": vault_stats,
-        "total_files": total_files,
-        "total_tags": len(total_tags),
-        "total_size_bytes": total_size,
-    }
+    # --- Word index (content + title TF) ---
+    content = file_info.get("content", "")
+    full_text = title + " " + content
+    tokens = tokenize(full_text)
+    tf = defaultdict(int)
+    for token in tokens:
+        tf[token] += 1
+    
+    # Track which tokens are new (not previously indexed) for sorted_tokens update
+    new_tokens = []
+    for token, freq in tf.items():
+        if not self.word_index.get(token):
+            new_tokens.append(token)
+        self.word_index[token][doc_key] = freq
+    
+    # Incrementally update _sorted_tokens (avoid O(V log V) full re-sort)
+    if new_tokens:
+        for token in new_tokens:
+            bisect.insort(self._sorted_tokens, token)
 ```

-**No new model needed** — return plain dict (or add optional `DashboardResponse` model).
-
-### 2b. Frontend: Insert stats widget in dashboard-home
-
-**File:** `frontend/index.html` — **after line 364** (`</div>` closing bookmarks section, before `<!-- Recently Opened Section -->`)
-
-Add:
-```html
-<!-- Stats Section -->
-<div id="dashboard-stats-section" class="dashboard-section">
-  <div class="dashboard-header">
-    <div class="dashboard-title-row">
-      <i data-lucide="bar-chart-3" class="dashboard-icon" style="color:var(--accent)"></i>
-      <h2>Statistiques</h2>
-    </div>
-  </div>
-  <div id="dashboard-stats-grid" class="dashboard-stats-grid">
-    <div class="dashboard-stats-loading">Chargement...</div>
-  </div>
-</div>
-```
-
-**File:** `frontend/app.js` — add `DashboardStatsWidget` module (insert at **line ~3343**, before `DashboardRecentWidget`):
-
-```javascript
-const DashboardStatsWidget = {
-  async load() {
-    const grid = document.getElementById("dashboard-stats-grid");
-    if (!grid) return;
-    grid.innerHTML = '<div class="dashboard-stats-loading">Chargement...</div>';
-    try {
-      const data = await api("/api/dashboard");
-      this.render(data);
-    } catch (err) {
-      grid.innerHTML = `<div class="dashboard-recent-empty">Erreur: ${escapeHtml(err.message)}</div>`;
-    }
-  },
-  render(data) {
-    const grid = document.getElementById("dashboard-stats-grid");
-    if (!grid) return;
-    const items = [
-      { icon: "files", label: "Fichiers", value: data.total_files.toLocaleString() },
-      { icon: "tags", label: "Tags uniques", value: data.total_tags.toLocaleString() },
-      { icon: "hard-drive", label: "Taille totale", value: this._formatSize(data.total_size_bytes) },
-      { icon: "folder", label: "Vaults", value: data.vaults.length.toString() },
-    ];
-    grid.innerHTML = items.map(i => `
-      <div class="stat-card">
-        <i data-lucide="${i.icon}" class="stat-icon"></i>
-        <span class="stat-value">${i.value}</span>
-        <span class="stat-label">${i.label}</span>
-      </div>
-    `).join("");
-    safeCreateIcons();
-  },
-  _formatSize(bytes) { /* KB/MB/GB formatter */ }
-};
-```
-
-**Also update `showWelcome()`** at **line ~5417** — the dashboard rebuild HTML must include the stats section div. And **line ~5490** — add `DashboardStatsWidget.load()` call.
-
-**File:** `frontend/style.css` — add CSS for `.dashboard-stats-grid`, `.stat-card`, `.stat-icon`, `.stat-value`, `.stat-label`.
-
-**Dependency:** Item 1 (Pydantic models) — none. Standalone.
-
---
-
-## 3. 🔔 Webhooks (P3) — 45 min
-
-### 3a. New backend module: `backend/webhooks.py`
-
-Create full module with:
- `WEBHOOKS_FILE = Path("data/webhooks.json")` — persistence
- `_DEFAULT_WEBHOOKS = []`
- `get_webhooks() -> list` — reads from disk
- `create_webhook(name, url, events, secret=None) -> dict` 
- `update_webhook(id, updates) -> dict`
- `delete_webhook(id) -> bool`
- `async def dispatch_webhooks(event_type: str, data: dict)` — calls all webhooks subscribed to `event_type`, sends JSON POST with HMAC-SHA256 signature header if secret is set, timeout 5s, logs failures
- Model: `WebhookConfig` with `id`, `name`, `url`, `events` (list of event type strings), `secret` (optional), `enabled`, `created_at`, `last_fired_at`
-
-### 3b. Backend: CRUD endpoints in `backend/main.py`
-
-Insert at **line ~2470** (before `GET /api/config`):
+#### `remove_document(vault_name, path)`

 ```python
-@app.get("/api/webhooks")
-@app.post("/api/webhooks")
-@app.patch("/api/webhooks/{webhook_id}")
-@app.delete("/api/webhooks/{webhook_id}")
+def remove_document(self, vault_name: str, path: str):
+    """Remove a document from the inverted index."""
+    
+    doc_key = f"{vault_name}::{path}"
+    file_info = self.doc_info.get(doc_key)
+    if file_info is None:
+        return
+    
+    self._remove_doc_internals(doc_key, vault_name, file_info)
+    self.doc_count -= 1
 ```

-Import `from backend.webhooks import get_webhooks, create_webhook, update_webhook, delete_webhook, dispatch_webhooks`
-
-### 3c. Backend: Hook dispatch_webhooks into file events
-
-Add `await dispatch_webhooks("file_created", {...})` calls alongside each `sse_manager.broadcast(...)` call:
-
-| Line | Event | Add dispatch |
-|------|-------|-------------|
-| ~1252 | `file_deleted` | `dispatch_webhooks("file_deleted", {"vault":..., "path":...})` |
-| ~1330 | `directory_created` | `dispatch_webhooks("directory_created", {...})` |
-| ~1401 | `directory_renamed` | `dispatch_webhooks("directory_renamed", {...})` |
-| ~1462 | `directory_deleted` | `dispatch_webhooks("directory_deleted", {...})` |
-| ~1532 | `file_created` | `dispatch_webhooks("file_created", {...})` |
-| ~1607 | `file_renamed` | `dispatch_webhooks("file_renamed", {...})` |
-
-### 3d. Frontend: Webhooks management UI in Configurations modal
-
-**File:** `frontend/index.html` — insert at **line ~633** (after `<!-- À propos -->` section, before `</div>` closing config-content):
-
-```html
-<section class="config-section">
-  <h2>🔔 Webhooks</h2>
-  <p class="config-description">Notifications HTTP vers des services externes lors des changements de fichiers.</p>
-  <div id="webhooks-list"></div>
-  <div class="config-add-pattern">
-    <input type="text" id="webhook-name-input" placeholder="Nom" class="config-input" style="width:120px">
-    <input type="text" id="webhook-url-input" placeholder="https://..." class="config-input" style="flex:1">
-    <button id="webhook-add-btn" class="config-btn-add">Ajouter</button>
-  </div>
-</section>
-```
-
-**File:** `frontend/app.js` — in `initConfigModal()` at **line ~3918**, add:
-```javascript
-loadWebhooks();  // in the open handler
-// Event binding for webhook add/save/delete buttons
-```
-
-Add functions: `loadWebhooks()`, `renderWebhooks(webhooks)`, `addWebhook()`, `deleteWebhook(id)`, `toggleWebhook(id, enabled)`. All use `api("/api/webhooks", ...)`.
-
-**File:** `frontend/style.css` — add `.webhook-item`, `.webhook-toggle`, `.webhook-delete` styles.
-
-**Dependency:** None on items 1–2. Standalone.
-
---
-
-## 4. 📤 Publication publique de documents (P3) — 60 min
-
-### 4a. New backend module: `backend/share.py`
-
-Create full module:
- `SHARES_FILE = Path("data/shares.json")`
- ShareToken model: `id`, `vault`, `path`, `token` (64-char hex), `created_by`, `created_at`, `expires_at` (optional, null = never), `access_count`, `last_accessed`
- `create_share(vault, path, created_by, expires_in_hours=None) -> dict` — generates token, stores, returns share info
- `get_share_by_token(token) -> dict | None` — validates expiry, returns share
- `revoke_share(id) -> bool`
- `list_shares(vault_filter=None) -> list` — for admin/settings page
- `record_access(token)` — increments access_count
-
-### 4b. Backend: Endpoints in `backend/main.py`
-
-Insert at **line ~1619** (before `GET /api/file/{vault_name}`):
+#### `_remove_doc_internals(doc_key, vault_name, file_info)` (private helper)

 ```python
-# Share management
-@app.post("/api/share/{vault_name}")
-@app.get("/api/shares")
-@app.delete("/api/share/{share_id}")
+def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict):
+    """Internal: remove one doc_key from all indexes without adjusting doc_count."""
    
-# Public view (no auth required!)
-@app.get("/s/{token}")
-async def public_share_view(token: str): ...
+    # --- Metadata ---
+    self.doc_info.pop(doc_key, None)
+    self.doc_vault.pop(doc_key, None)
+    if vault_name in self.vault_docs:
+        self.vault_docs[vault_name].discard(doc_key)
+    
+    # --- Tags ---
+    for tag in file_info.get("tags", []):
+        td = self.tag_docs.get(tag.lower())
+        if td:
+            td.discard(doc_key)
+            if not td:
+                del self.tag_docs[tag.lower()]
+    
+    # --- Title tokens ---
+    title = file_info.get("title", "")
+    for token in set(tokenize(title)):
+        ti = self.title_index.get(token)
+        if ti:
+            try:
+                ti.remove(doc_key)
+            except ValueError:
+                pass
+            if not ti:
+                del self.title_index[token]
+    
+    # --- Title norm map ---
+    norm_title = normalize_text(title)
+    if norm_title and norm_title in self.title_norm_map:
+        self.title_norm_map[norm_title] = [
+            e for e in self.title_norm_map[norm_title]
+            if not (e["vault"] == vault_name and e["path"] == file_info.get("path"))
+        ]
+        if not self.title_norm_map[norm_title]:
+            del self.title_norm_map[norm_title]
+    
+    # --- Word index ---
+    content = file_info.get("content", "")
+    full_text = title + " " + content
+    for token in set(tokenize(full_text)):
+        wi = self.word_index.get(token)
+        if wi:
+            wi.pop(doc_key, None)
+            if not wi:
+                del self.word_index[token]
+                # Remove from sorted tokens via bisect
+                idx = bisect.bisect_left(self._sorted_tokens, token)
+                if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token:
+                    self._sorted_tokens.pop(idx)
 ```

-The public view endpoint:
-1. Looks up token via `get_share_by_token(token)`
-2. Reads the file content
-3. Renders markdown with redacted secrets
-4. Returns simple HTML page (not SPA) with rendered content
-5. Increments access count
+Key point: `bisect.insort` for insertion and `bisect.bisect_left` + `pop(idx)` for removal keep `_sorted_tokens` sorted. Each operation is O(V) in the worst case because of the list shift (V = number of unique tokens), but this is negligible compared to the O(N * content) cost of a full rebuild.

-### 4c. Frontend: Share button in file actions
+### Step 3: Modify `get_inverted_index()` to NOT check staleness

-**File:** `frontend/app.js` — in `renderFile()`, at **line ~3250** (after pop-out button):
-
-```javascript
-const shareBtn = el("button", { class: "btn-action", title: "Partager" }, [icon("share-2", 14), document.createTextNode("Partager")]);
-shareBtn.addEventListener("click", () => openShareDialog(data.vault, data.path));
-```
-
-Add `shareBtn` to the file-actions div at **line ~3300**.
-
-Add `openShareDialog(vault, path)` function that:
- Calls `POST /api/share/{vault}` to create a share
- Shows a modal with the share URL (copyable) and expiration options
- Shows existing shares list with revoke buttons
-
-### 4d. Frontend: Share management in Configurations
-
-**File:** `frontend/index.html` — add share management section in config modal (alongside webhooks).
-
-**File:** `frontend/app.js` — `loadShares()` and `renderShares()` functions.
-
-**File:** `frontend/style.css` — add `.share-dialog`, `.share-url`, `.share-item` styles.
-
-**Dependency:** None. Standalone, but needs item 1 for clean models.
-
---
-
-## 5. 🔄 Gestion conflits Syncthing (P2) — 45 min
-
-### 5a. Backend: Conflict file detection
-
-**File:** `backend/indexer.py` — add after `_backlink_index`:
+File: `backend/search.py`

 ```python
-def get_conflicts() -> list:
-    """Scan all vaults for Syncthing/Nextcloud sync-conflict files."""
-    conflicts = []
-    pattern = re.compile(r'\.sync-conflict-(\d{8}-\d{6})\.')
-    for vname, vdata in index.items():
-        for f in vdata.get("files", []):
-            m = pattern.search(f["path"])
-            if m:
-                # Find the original file
-                orig_path = pattern.sub("", f["path"])
-                conflicts.append({
-                    "vault": vname,
-                    "conflict_path": f["path"],
-                    "original_path": orig_path,
-                    "conflict_date": m.group(1),
-                    "conflict_title": f.get("title", ""),
-                })
-    return conflicts
+def get_inverted_index() -> InvertedIndex:
+    """Return the singleton inverted index. Always up-to-date via hooks."""
+    return _inverted_index
 ```

-### 5b. Backend: Endpoints
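+Remove `is_stale()` and the `_last_rebuild` / `_rebuild_cooldown` fields, and stop using `_source_generation` for staleness (the field may be kept for diagnostics). Keep `rebuild()` for the initial build and manual reindex (still called once at startup, via `init_inverted_index()` once `build_index` has completed).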

-**File:** `backend/main.py` — insert at **line ~2547**:
+### Step 4: Call `rebuild()` once after initial index build
+
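+In `backend/search.py`, register the hook AND call rebuild once. The `get_inverted_index()` shown here supersedes the Step 3 version: it adds a rebuild fallback for when an incremental update has failed or the index is still empty: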

 ```python
-@app.get("/api/conflicts")
-async def api_conflicts(current_user=Depends(require_auth)):
-    """List sync-conflict files across accessible vaults."""
-    ...
-
-@app.post("/api/conflicts/resolve")
-async def api_conflict_resolve(body: dict, current_user=Depends(require_auth)):
-    """Resolve a conflict: keep_local (delete conflict), keep_conflict (replace original)."""
+# After InvertedIndex class and _inverted_index = InvertedIndex()
+
+def _on_index_change_hook(action, vault_name, path, file_info):
+    inv = _inverted_index
+    try:
+        if action == 'add':
+            inv.add_document(vault_name, path, file_info)
+        elif action == 'remove':
+            inv.remove_document(vault_name, path)
+    except Exception as e:
+        logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
+        # Fallback: mark for rebuild on next search
+        inv._needs_rebuild = True
+
+_indexer.set_index_change_hook(_on_index_change_hook)
+
+# Initial build trigger — called after first index is built
+def init_inverted_index():
+    """Force initial inverted index build. Called after build_index completes."""
+    _inverted_index.rebuild()
+
+def get_inverted_index() -> InvertedIndex:
+    """Return the singleton inverted index."""
+    # Only check for rebuild if incremental updates have failed
+    # OR if this is the very first call (doc_count == 0 and index has files)
+    if getattr(_inverted_index, '_needs_rebuild', False):
+        _inverted_index.rebuild()
+        _inverted_index._needs_rebuild = False
+    elif _inverted_index.doc_count == 0 and any(
+        vdata.get("files") for vdata in index.values()
+    ):
+        _inverted_index.rebuild()
+    return _inverted_index
+```
+
+### Step 5: Call `init_inverted_index()` after `build_index()` in main.py
+
+In `backend/main.py`, after `build_index()` completes in the lifespan handler, call:
+
+```python
+from backend.search import init_inverted_index
+init_inverted_index()
+```
+
+This ensures the inverted index is built once on startup, then incrementally maintained thereafter.
+
+### Tag prefix index handling
+
+The `tag_norm_map` and `tag_prefix_index` are built per-vault in `rebuild()`. For incremental updates, we need to handle tag changes:
+
+In `add_document`, after adding doc tags:
+```python
+# Check if any tags are new (not in tag_norm_map)
+for tag in tags:
+    norm_tag = normalize_text(tag)
+    if norm_tag not in self.tag_norm_map:
+        self.tag_norm_map[norm_tag] = tag
+        for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
+            prefix = norm_tag[:plen]
+            if tag not in self.tag_prefix_index[prefix]:
+                self.tag_prefix_index[prefix].append(tag)
+```
+
+In `_remove_doc_internals`, we do NOT remove tags from `tag_norm_map` or `tag_prefix_index` — these hold the shared tag vocabulary, not per-document data, so a tag may still be used by other documents. They only grow over the lifetime of the inverted index. A full `rebuild()` (triggered by a manual reindex) will clean them up.
+
+### Step 6: Remove cooldown hack from search.py
+
+Remove:
+- `_last_rebuild` and `_rebuild_cooldown` fields from `InvertedIndex.__init__`
+- `is_stale()` method
+- Staleness checks based on the `_source_generation` field (the field is no longer needed for staleness, but keep it for diagnostics)
+
+### Step 7: Remove coalescence hack from main.py
+
+In `_on_vault_change` in `backend/main.py`, remove:
+```python
+old_gen = idx._index_generation
 ...
+if idx._index_generation > old_gen + 1:
+    idx._index_generation = old_gen + 1
 ```

-### 5c. Backend: Diff endpoint
+This hack was only needed to reduce the number of inverted index rebuilds. With incremental updates, it's unnecessary — each mutation is cheap.

-```python
-@app.get("/api/conflicts/diff")
-async def api_conflict_diff(vault: str, original: str, conflict: str, current_user=Depends(require_auth)):
-    """Return unified diff between original and conflict file."""
-    import difflib
-    ...
-```
+## Files Modified (Summary)

-### 5d. Frontend: Conflict dashboard widget
+| File | Changes |
+|------|---------|
+| `backend/indexer.py` | +`_on_index_change` hook variable, +`set_index_change_hook()`, +hook calls in `_add_file_to_structures` and `_remove_file_from_structures` |
+| `backend/search.py` | +`add_document()`, +`remove_document()`, +`_remove_doc_internals()`, +`init_inverted_index()`, +hook registration, remove `is_stale()`/cooldown, simplify `get_inverted_index()` |
+| `backend/main.py` | +`init_inverted_index()` call after `build_index()`, remove coalescence hack in `_on_vault_change` |

-**File:** `frontend/index.html` — add `#dashboard-conflicts-section` in dashboard after stats section.
+## Risks & Edge Cases

-**File:** `frontend/app.js` — add `DashboardConflictsWidget` (pattern similar to recent/bookmarks):
- `load()` → `GET /api/conflicts`
- `render()` → shows conflict cards with file names and dates
- Click → opens diff modal showing side-by-side comparison
- Action buttons: "Garder l'original", "Garder le conflit"
+1. **Thread safety:** `_add_file_to_structures` and `_remove_file_from_structures` are protected by `_index_lock` / `_async_index_lock` in indexer.py. The InvertedIndex methods are called inside these locks, so they're also protected. No additional locking needed.

-**File:** `frontend/style.css` — add `.conflict-card`, `.conflict-diff`, `.conflict-actions` styles.
+2. **Hook registration timing:** `search.py` imports `indexer.py` at the top, then later registers the hook. The hook is registered at module load time, BEFORE the first call to `build_index`. So `_on_index_change` is set when `build_index` runs — but `build_index` calls `_add_file_to_structures` internally, which would try to incrementally update an empty inverted index. **Fix:** The hook checks `if _inverted_index.doc_count == 0` and skips incremental updates; the initial `rebuild()` handles the bulk load.

-**Dependency:** None. Standalone.
+3. **Hook call during initial build_index:** `build_index` iterates files and calls `_add_file_to_structures`. The hook fires for each file, calling `add_document()` on an empty inverted index. This is slower than a single `rebuild()`. **Fix:** Add a flag `_inverted_index._ready = False` initially, set to True after `init_inverted_index()`. The hook skips when `_ready` is False. Note that the Step 4 hook snippet does not show this guard; it must be added to `_on_index_change_hook`.

---
+4. **Sorted tokens performance:** `bisect.insort` and `list.pop(idx)` are O(V) in the worst case because of the list shift. For 40k files, the vocabulary size V is typically 50k-200k tokens, so a single insertion shifts at most a few hundred thousand list slots (well under a millisecond), which is acceptable. The `rebuild()` call at startup handles the initial bulk load.

-## Execution Order (optimal)
-
-1. **Item 1** — OpenAPI docs (quick win, no risk)
-2. **Item 2** — Dashboard stats (standalone, visible result)
-3. **Item 3** — Webhooks (new module + integration, most code)
-4. **Item 4** — Public shares (new module + public view, security-sensitive)
-5. **Item 5** — Syncthing conflicts (standalone, nice-to-have)
-
-**Total estimated effort:** ~3 hours
-
-## Files Summary
-
-| File | Action | Items |
-|------|--------|-------|
-| `backend/main.py` | Edit | 1 (models), 2a (endpoint), 3b+c (webhook CRUD+dispatch), 4b (share+public view), 5b+c (conflicts) |
-| `backend/webhooks.py` | **Create** | 3a |
-| `backend/share.py` | **Create** | 4a |
-| `backend/indexer.py` | Edit | 5a (get_conflicts) |
-| `frontend/index.html` | Edit | 2b, 3d, 4d, 5d (dashboard + config sections) |
-| `frontend/app.js` | Edit | 2b, 3d, 4c, 5d (widgets + share button + webhook UI) |
-| `frontend/style.css` | Edit | 2b, 3d, 4c, 5d (all new CSS classes) |
+5. **tag_norm_map / tag_prefix_index growth:** These grow monotonically (never shrink on incremental remove). With 40k files and thousands of tags, this is a few thousand entries — negligible. A manual "Réindexer" button triggers a full `rebuild()` to clean up.