Switch inverted index from stale check to incremental updates

Register a hook with the indexer so that file add/remove events incrementally maintain the inverted index, removing the need for periodic staleness checks and cooldowns. Rebuild the index once on startup via init_inverted_index().
2026-05-26 12:37:59 -04:00 · 2026-05-26 12:37:59 -04:00 · 775722f5d4
commit 775722f5d4
parent 872a3e56dd
4 changed files with 460 additions and 363 deletions
--- a/backend/indexer.py
+++ b/backend/indexer.py
@ -599,6 +599,11 @@ def _remove_file_from_structures(vault_name: str, rel_path: str) -> Optional[Dic
        path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path]
    _index_generation += 1
    # Notify inverted index for incremental update
    if _on_index_change:
        _on_index_change('remove', vault_name, rel_path, removed)
    return removed
@ -666,6 +671,10 @@ def _add_file_to_structures(vault_name: str, file_info: Dict[str, Any]):
    _index_generation += 1
    # Notify inverted index for incremental update
    if _on_index_change:
        _on_index_change('add', vault_name, file_info["path"], file_info)
 async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]:
    """Re-index a single file without full rebuild.
--- a/backend/main.py
+++ b/backend/main.py
@ -44,7 +44,7 @@ from backend.indexer import (
    remove_vault_from_index,
    add_vault_to_index,
 )
-from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags
+from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags, init_inverted_index
 from backend.image_processor import preprocess_images
 from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats
 from backend.vault_settings import (
@ -379,14 +379,9 @@ async def _on_vault_change(events: list):
    Processes each event (create/modify/delete/move) and updates the index
    incrementally, then broadcasts SSE notifications.
    """
    import backend.indexer as idx
    updated_vaults = set()
    changes = []
    # Temporarily suppress per-file generation increments to coalesce them
    # into a single increment at the end of the batch.
    old_gen = idx._index_generation
    for event in events:
        vault_name = event["vault"]
        event_type = event["type"]
@ -415,11 +410,6 @@ async def _on_vault_change(events: list):
        except Exception as e:
            logger.error(f"Error processing {event_type} event for {src}: {e}")
    # Restore generation to only increment by 1 for the whole batch
    # (unless it was already incremented by other operations)
    if idx._index_generation > old_gen + 1:
        idx._index_generation = old_gen + 1
    if changes:
        await sse_manager.broadcast("index_updated", {
            "vaults": list(updated_vaults),
@ -520,6 +510,9 @@ async def lifespan(app: FastAPI):
        logger.info("Background indexing started")
        await build_index(_progress_cb)
        # Build inverted index for search (one-time, then incremental)
        init_inverted_index()
        # Start file watcher
        config = _load_config()
        watcher_enabled = config.get("watcher_enabled", True)
--- a/backend/search.py
+++ b/backend/search.py
@ -239,9 +239,9 @@ def _escape_html(text: str) -> str:
 class InvertedIndex:
    """In-memory inverted index supporting TF-IDF scoring.
-    Built lazily from the global ``index`` dict whenever a search or
+    Built initially via ``rebuild()`` from the global index, then
-    suggestion request detects that the underlying vault index has changed.
+    maintained incrementally via ``add_document()`` / ``remove_document()``
-    The class is designed to be a singleton — use ``get_inverted_index()``.
+    hooks from the file watcher and API mutations.
    Attributes:
        word_index: ``{token: {doc_key: term_frequency}}``
@ -249,7 +249,6 @@ class InvertedIndex:
        tag_norm_map: ``{normalized_tag: original_tag}``
        tag_prefix_index: ``{prefix: [original_tag, ...]}``
        doc_count: Total number of indexed documents.
        _source_id: Fingerprint of the source index to detect staleness.
    """
    def __init__(self) -> None:
@ -264,23 +263,7 @@ class InvertedIndex:
        self.vault_docs: Dict[str, set] = defaultdict(set)
        self.tag_docs: Dict[str, set] = defaultdict(set)
        self._sorted_tokens: List[str] = []
-        self._source_generation: int = -1
+        self._ready: bool = False  # True after initial build
        self._last_rebuild: float = 0
        self._rebuild_cooldown: float = 3.0  # seconds
    def is_stale(self) -> bool:
        """Report whether a rebuild of the inverted index is currently due.

        Returns ``False`` when the indexer's generation counter still equals
        the value stored in ``_source_generation``, or when fewer than
        ``_rebuild_cooldown`` seconds have passed since ``_last_rebuild``.
        Returns ``True`` only when the generation differs and the cooldown
        has elapsed.
        """
        import time

        generation_unchanged = (
            _indexer._index_generation == self._source_generation
        )
        if generation_unchanged:
            return False
        # The cooldown throttles rebuilds when many changes arrive quickly.
        elapsed = time.time() - self._last_rebuild
        return not (elapsed < self._rebuild_cooldown)
    def rebuild(self) -> None:
        """Rebuild inverted index from the global ``index`` dict.
@ -288,9 +271,7 @@ class InvertedIndex:
        Tokenizes titles and content of every file, computes term frequencies,
        and builds auxiliary indexes for tag and title prefix suggestions.
        """
-        import time
+        logger.info("Building inverted index...")
        self._last_rebuild = time.time()
        logger.info("Rebuilding inverted index...")
        self.word_index = defaultdict(dict)
        self.title_index = defaultdict(list)
        self.tag_norm_map = {}
@ -351,7 +332,7 @@ class InvertedIndex:
                        self.tag_prefix_index[prefix].append(tag)
        self._sorted_tokens = sorted(self.word_index.keys())
-        self._source_generation = _indexer._index_generation
+        self._ready = True
        logger.info(
            "Inverted index built: %d documents, %d unique tokens, %d tags",
            self.doc_count,
@ -359,6 +340,117 @@ class InvertedIndex:
            len(self.tag_norm_map),
        )
    def add_document(self, vault_name: str, path: str, file_info: dict) -> None:
        """Add a single document to the index, or replace it if already present.

        Does nothing until the initial ``rebuild()`` has set ``_ready``.
        The document is keyed as ``"{vault_name}::{path}"``. When that key is
        already indexed, the previous entry is fully purged first, so calling
        this for a modified file acts as an in-place update and leaves
        ``doc_count`` unchanged.

        Args:
            vault_name: Name of the vault the file belongs to.
            path: Path of the file, used together with ``vault_name`` to form
                the document key.
            file_info: File record; the ``"tags"``, ``"title"`` and
                ``"content"`` keys are read (each optional).
        """
        if not self._ready:
            return
        doc_key = f"{vault_name}::{path}"
        old_file_info = self.doc_info.get(doc_key)
        if old_file_info is not None:
            # The purge must also clean _sorted_tokens. If that cleanup is
            # skipped, a token whose only posting belonged to this document
            # is dropped from word_index but stays in _sorted_tokens; the
            # "new token" check below then insorts it again, producing a
            # duplicate on every re-index, and tokens no longer present in
            # the document would never be removed from the sorted list.
            self._remove_doc_internals(doc_key, vault_name, old_file_info, skip_sorted_cleanup=False)
        else:
            self.doc_count += 1
        # Metadata
        self.doc_info[doc_key] = file_info
        self.doc_vault[doc_key] = vault_name
        self.vault_docs[vault_name].add(doc_key)
        # Tags
        tags = file_info.get("tags", [])
        for tag in tags:
            self.tag_docs[tag.lower()].add(doc_key)
            norm_tag = normalize_text(tag)
            # Only the first spelling seen for a normalized tag is registered
            # in the suggestion maps.
            if norm_tag not in self.tag_norm_map:
                self.tag_norm_map[norm_tag] = tag
                for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
                    prefix = norm_tag[:plen]
                    if tag not in self.tag_prefix_index[prefix]:
                        self.tag_prefix_index[prefix].append(tag)
        # Title tokens (deduplicated so each doc appears once per token)
        title = file_info.get("title", "")
        title_tokens = tokenize(title)
        for token in set(title_tokens):
            if token:
                self.title_index[token].append(doc_key)
        # Title norm map
        norm_title = normalize_text(title)
        if norm_title:
            self.title_norm_map[norm_title].append({"vault": vault_name, "path": path, "title": title})
        # Word index (content + title TF)
        content = file_info.get("content", "")
        full_text = title + " " + content
        tokens = tokenize(full_text)
        tf: Dict[str, int] = defaultdict(int)
        for token in tokens:
            if token:
                tf[token] += 1
        for token, freq in tf.items():
            # A token with no postings is absent from _sorted_tokens (removal
            # keeps the two in sync), so it must be inserted in sorted order.
            if not self.word_index.get(token):
                bisect.insort(self._sorted_tokens, token)
            self.word_index[token][doc_key] = freq
    def remove_document(self, vault_name: str, path: str):
        """Remove a single document incrementally."""
        if not self._ready:
            return
        doc_key = f"{vault_name}::{path}"
        file_info = self.doc_info.get(doc_key)
        if file_info is None:
            return
        self._remove_doc_internals(doc_key, vault_name, file_info, skip_sorted_cleanup=False)
        self.doc_count -= 1
    def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict, skip_sorted_cleanup: bool = False):
        """Purge ``doc_key`` from every per-document structure.

        ``doc_count`` is left untouched; callers adjust it themselves. The
        global ``tag_norm_map`` is not modified here either.

        Args:
            doc_key: Key of the document being purged.
            vault_name: Vault the document belongs to.
            file_info: The record that was indexed for this document; its
                ``"tags"``, ``"title"``, ``"content"`` and ``"path"`` keys are
                read to locate the entries to delete.
            skip_sorted_cleanup: When true, tokens that lose their last
                posting are still deleted from ``word_index`` but are left in
                ``_sorted_tokens``.
        """
        # --- Metadata ---
        self.doc_info.pop(doc_key, None)
        self.doc_vault.pop(doc_key, None)
        vault_members = self.vault_docs.get(vault_name)
        if vault_members is not None:
            vault_members.discard(doc_key)

        # --- Tags (per-document postings only) ---
        for tag in file_info.get("tags", []):
            tag_key = tag.lower()
            holders = self.tag_docs.get(tag_key)
            if not holders:
                continue
            holders.discard(doc_key)
            if not holders:
                del self.tag_docs[tag_key]

        # --- Title tokens ---
        title = file_info.get("title", "")
        for tok in set(tokenize(title)):
            if not tok:
                continue
            title_postings = self.title_index.get(tok)
            if not title_postings:
                continue
            if doc_key in title_postings:
                title_postings.remove(doc_key)
            if not title_postings:
                del self.title_index[tok]

        # --- Title norm map ---
        norm_title = normalize_text(title)
        if norm_title and norm_title in self.title_norm_map:
            survivors = [
                entry for entry in self.title_norm_map[norm_title]
                if entry["vault"] != vault_name or entry["path"] != file_info.get("path")
            ]
            if survivors:
                self.title_norm_map[norm_title] = survivors
            else:
                del self.title_norm_map[norm_title]

        # --- Word index ---
        content = file_info.get("content", "")
        for tok in set(tokenize(title + " " + content)):
            if not tok:
                continue
            postings = self.word_index.get(tok)
            if not postings:
                continue
            postings.pop(doc_key, None)
            if postings:
                continue
            # Last posting gone: drop the token itself.
            del self.word_index[tok]
            if skip_sorted_cleanup:
                continue
            pos = bisect.bisect_left(self._sorted_tokens, tok)
            if pos < len(self._sorted_tokens) and self._sorted_tokens[pos] == tok:
                self._sorted_tokens.pop(pos)
    def idf(self, term: str) -> float:
        """Inverse Document Frequency for a term.
@ -424,9 +516,39 @@ class InvertedIndex:
 _inverted_index = InvertedIndex()
 def _on_index_change_hook(action: str, vault_name: str, path: str, file_info: dict):
    """Apply an indexer change notification to the singleton inverted index.

    ``'add'`` with a truthy ``file_info`` adds or updates the document;
    ``'remove'`` deletes it (``file_info`` is not used). Any other
    combination is ignored. Failures are logged as warnings and never
    propagate to the caller.
    """
    target = _inverted_index
    try:
        if action == 'remove':
            target.remove_document(vault_name, path)
        elif action == 'add' and file_info:
            target.add_document(vault_name, path, file_info)
    except Exception as e:
        logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
 # Register the hook with indexer (indexer is already imported at top of file)
 _indexer.set_index_change_hook(_on_index_change_hook)
 def init_inverted_index():
    """Build the inverted index once, provided at least one vault has files.

    Intended to run after ``build_index`` completes on startup. When no vault
    in the global ``index`` reports any files, nothing is built or logged.
    """
    has_files = False
    for vdata in index.values():
        if vdata.get("files"):
            has_files = True
            break
    if not has_files:
        return
    _inverted_index.rebuild()
    logger.info("Inverted index initialized.")
 def get_inverted_index() -> InvertedIndex:
-    """Return the singleton inverted index, rebuilding if stale."""
+    """Return the singleton inverted index.
-    if _inverted_index.is_stale():
+
    Auto-builds on first call if the index has files but the inverted
    index hasn't been built yet (fallback for paths that don't go through
    ``init_inverted_index()``).
    """
    if _inverted_index.doc_count == 0 and any(
        vdata.get("files") for vdata in index.values()
    ):
        _inverted_index.rebuild()
    return _inverted_index
--- a/plan.md
+++ b/plan.md
@ -1,375 +1,348 @@
-# Implementation Plan — Remaining Roadmap Items
+# Plan: Incremental InvertedIndex for 40k+ files
-## 1. 📝 Documentation OpenAPI enrichie (P3) — 5 min
+## Problem Summary
-**Goal:** Add `Field(description=...)` to all Pydantic models without descriptions in `backend/main.py`.
+Every file mutation calls `_add_file_to_structures` / `_remove_file_from_structures` in `backend/indexer.py`, which increments `_index_generation`. When the next search or autocomplete fires, `get_inverted_index()` in `backend/search.py` detects staleness (`is_stale()` returns True) and triggers a full `rebuild()` — O(N) tokenization of ALL files. With 40k+ files this takes 2-5 seconds, making search unusable.
-**Models to update (lines 89–311):**
+The existing 3-second cooldown hack in `is_stale()` only masks the problem; it doesn't fix it.
-| Line | Model | Fields to annotate |
+## Solution: Incremental Add/Remove on the InvertedIndex
 |------|-------|--------------------|
 | 89 | `FileContentResponse` | `vault`, `path`, `title`, `tags`, `frontmatter`, `html`, `raw_length`, `extension`, `is_markdown`, `unsupported`, `size_bytes` |
 | 103 | `FileRawResponse` | `vault`, `path`, `raw` |
 | 110 | `FileSaveResponse` | `status`, `vault`, `path`, `size` |
 | 118 | `FileDeleteResponse` | `status`, `vault`, `path` |
 | 125 | `SearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
 | 136 | `SearchResponse` | `query`, `vault_filter`, `tag_filter`, `count`, `results` (total, offset, limit already have Field) |
 | 146 | `TagsResponse` | `vault_filter`, `tags` |
 | 152 | `TreeSearchResult` | `vault`, `path`, `name`, `matched_path` (type has Field) |
 | 161 | `TreeSearchResponse` | `query`, `vault_filter`, `results` |
 | 168 | `AdvancedSearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
 | 179 | `SearchFacets` | `tags`, `vaults` (already have default_factory) |
 | 185 | `AdvancedSearchResponse` | `results`, `total`, `offset`, `limit`, `facets` (query_time_ms has Field) |
 | 196 | `TitleSuggestion` | `vault`, `path`, `title` |
 | 203 | `SuggestResponse` | `query`, `suggestions` |
 | 209 | `TagSuggestion` | `tag`, `count` |
 | 215 | `TagSuggestResponse` | `query`, `suggestions` |
 | 221 | `GraphNode` | (all fields already have Field) |
 | 231 | `GraphEdge` | (all fields already have Field) |
 | 239 | `GraphResponse` | `vault`, `path`, `nodes`, `edges` |
 | 247 | `ReloadResponse` | `status`, `vaults` |
 | 253 | `HealthResponse` | `status`, `version`, `vaults`, `total_files` |
 | 265 | `DirectoryCreateResponse` | `success`, `path` |
 | 284 | `DirectoryDeleteResponse` | `success`, `deleted_count` |
 | 296 | `FileCreateResponse` | `success`, `path` |
 | 307 | `FileRenameResponse` | `success` |
-**Dependency:** None. Pure documentation change.
+Add two methods to `InvertedIndex` that update ALL internal data structures incrementally when a single file is added, modified, or removed:
---
+- `add_document(vault_name, path, file_info)` — called on create/modify
 - `remove_document(vault_name, path)` — called on delete/move-source
-## 2. 📊 Dashboard statistiques (P3) — 30 min
+Then hook these into `_add_file_to_structures` and `_remove_file_from_structures` in `backend/indexer.py` so the inverted index never goes stale.
-### 2a. Backend: `GET /api/dashboard` (new endpoint)
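+Remove the `is_stale()` check and the cooldown mechanism entirely. `rebuild()` is kept only for the one-time initial build on startup; after that, the inverted index is kept current by the incremental hooks.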
-**File:** `backend/main.py` — insert at **line ~2547** (after `/api/diagnostics`)
+## Dependency Architecture
 **Current import chain:**
 ```
 main.py → search.py → indexer.py  (search.py imports `from backend import indexer as _indexer`)
 ```
 **Problem:** `indexer.py` currently does NOT import from `search.py`. If we add `from backend.search import get_inverted_index` to indexer.py, we create a circular import: `search.py → indexer.py → search.py`.
 **Fix — Option C (Callback/Hook pattern, simplest):**
 Add a module-level hook variable in `backend/indexer.py`:
 ```python
-@app.get("/api/dashboard")
+# In backend/indexer.py
-async def api_dashboard(current_user=Depends(require_auth)):
+_on_index_change: callable = None  # Called as (action, vault_name, path, file_info_or_None)
    """Aggregated dashboard statistics across all accessible vaults."""
    from backend.indexer import index, vault_config, path_index
    user_vaults = current_user.get("_token_vaults") or current_user.get("vaults", [])
-    vault_stats = []
+def set_index_change_hook(hook):
-    total_files = 0
+    """Register a callback for incremental index updates.
-    total_tags = set()
+    hook(action, vault_name, path, file_info_or_None) where action is 'add' or 'remove'.
-    total_size = 0
+    """
    global _on_index_change
    _on_index_change = hook
 ```
-    for vname, vdata in index.items():
+Then at the end of `_add_file_to_structures`:
-        if "*" not in user_vaults and vname not in user_vaults:
+```python
-            continue
+if _on_index_change:
-        files = vdata.get("files", [])
+    _on_index_change('add', vault_name, rel_path, file_info)
-        file_count = len(files)
+```
-        total_files += file_count
+
-        tags = set()
+At the end of `_remove_file_from_structures`:
-        for f in files:
+```python
-            tags.update(f.get("tags", []))
+if _on_index_change:
-            total_size += f.get("size", 0)
+    _on_index_change('remove', vault_name, rel_path, file_info)  # file_info = removed dict or None
-        total_tags.update(tags)
+```
-        vault_stats.append({
+
-            "name": vname,
+Then in `backend/search.py`, at module load time (after InvertedIndex class is defined):
-            "file_count": file_count,
+```python
-            "tag_count": len(tags),
+def _on_index_change_hook(action, vault_name, path, file_info):
-            "total_size_bytes": sum(f.get("size", 0) for f in files),
+    inv = get_inverted_index_raw()  # get without rebuild check
    if action == 'add':
        inv.add_document(vault_name, path, file_info)
    elif action == 'remove':
        inv.remove_document(vault_name, path)
 # Register the hook — this triggers an import of indexer, but indexer is already imported
 # by the time this line runs (since search.py does `from backend import indexer as _indexer` above)
 _indexer.set_index_change_hook(_on_index_change_hook)
 ```
 This avoids circular imports completely because:
 1. `search.py` already imports `indexer.py` at the top (`from backend import indexer as _indexer`)
 2. `indexer.py` never imports `search.py` — it just stores a callback
 3. `search.py` registers the callback AFTER the InvertedIndex class is defined
 ## Detailed Implementation
 ### Step 1: Add hook variable to `backend/indexer.py`
 File: `backend/indexer.py`
 Changes:
 - After `_index_generation` global (line ~28), add:
  ```python
  _on_index_change: callable = None
  ```
 - Add function `set_index_change_hook(hook)` (bottom of file, near other public functions)
 - Add `if _on_index_change: _on_index_change('add', vault_name, file_info['path'], file_info)` at end of `_add_file_to_structures` (~line 665)
 - Add `if _on_index_change: _on_index_change('remove', vault_name, rel_path, removed)` at end of `_remove_file_from_structures` (~line 597)
 ### Step 2: Add `add_document` and `remove_document` to InvertedIndex
 File: `backend/search.py`
 #### `add_document(vault_name, path, file_info)`
 ```python
 def add_document(self, vault_name: str, path: str, file_info: dict):
    """Add or update a single document in the inverted index."""
    doc_key = f"{vault_name}::{path}"
    old_file_info = self.doc_info.get(doc_key)
    # If updating an existing document, remove old entries first
    if old_file_info is not None:
        self._remove_doc_internals(doc_key, vault_name, old_file_info)
    else:
        self.doc_count += 1
    # --- Metadata ---
    self.doc_info[doc_key] = file_info
    self.doc_vault[doc_key] = vault_name
    self.vault_docs[vault_name].add(doc_key)
    # --- Tags ---
    tags = file_info.get("tags", [])
    for tag in tags:
        self.tag_docs[tag.lower()].add(doc_key)
    # --- Title tokens ---
    title = file_info.get("title", "")
    title_tokens = tokenize(title)
    for token in set(title_tokens):
        self.title_index[token].append(doc_key)
    # --- Normalized title for prefix suggestions ---
    norm_title = normalize_text(title)
    if norm_title:
        self.title_norm_map[norm_title].append({
            "vault": vault_name,
            "path": path,
            "title": title,
        })
-    return {
+    # --- Word index (content + title TF) ---
-        "vaults": vault_stats,
+    content = file_info.get("content", "")
-        "total_files": total_files,
+    full_text = title + " " + content
-        "total_tags": len(total_tags),
+    tokens = tokenize(full_text)
-        "total_size_bytes": total_size,
+    tf = defaultdict(int)
-    }
+    for token in tokens:
        tf[token] += 1
    # Track which tokens are new (not previously indexed) for sorted_tokens update
    new_tokens = []
    for token, freq in tf.items():
        if not self.word_index.get(token):
            new_tokens.append(token)
        self.word_index[token][doc_key] = freq
    # Incrementally update _sorted_tokens (avoid O(V log V) full re-sort)
    if new_tokens:
        for token in new_tokens:
            bisect.insort(self._sorted_tokens, token)
 ```
-**No new model needed** — return plain dict (or add optional `DashboardResponse` model).
+#### `remove_document(vault_name, path)`
 ### 2b. Frontend: Insert stats widget in dashboard-home
 **File:** `frontend/index.html` — **after line 364** (`</div>` closing bookmarks section, before `<!-- Recently Opened Section -->`)
 Add:
 ```html
 <!-- Stats Section -->
 <div id="dashboard-stats-section" class="dashboard-section">
  <div class="dashboard-header">
    <div class="dashboard-title-row">
      <i data-lucide="bar-chart-3" class="dashboard-icon" style="color:var(--accent)"></i>
      <h2>Statistiques</h2>
    </div>
  </div>
  <div id="dashboard-stats-grid" class="dashboard-stats-grid">
    <div class="dashboard-stats-loading">Chargement...</div>
  </div>
 </div>
 ```
 **File:** `frontend/app.js` — add `DashboardStatsWidget` module (insert at **line ~3343**, before `DashboardRecentWidget`):
 ```javascript
 const DashboardStatsWidget = {
  async load() {
    const grid = document.getElementById("dashboard-stats-grid");
    if (!grid) return;
    grid.innerHTML = '<div class="dashboard-stats-loading">Chargement...</div>';
    try {
      const data = await api("/api/dashboard");
      this.render(data);
    } catch (err) {
      grid.innerHTML = `<div class="dashboard-recent-empty">Erreur: ${escapeHtml(err.message)}</div>`;
    }
  },
  render(data) {
    const grid = document.getElementById("dashboard-stats-grid");
    if (!grid) return;
    const items = [
      { icon: "files", label: "Fichiers", value: data.total_files.toLocaleString() },
      { icon: "tags", label: "Tags uniques", value: data.total_tags.toLocaleString() },
      { icon: "hard-drive", label: "Taille totale", value: this._formatSize(data.total_size_bytes) },
      { icon: "folder", label: "Vaults", value: data.vaults.length.toString() },
    ];
    grid.innerHTML = items.map(i => `
      <div class="stat-card">
        <i data-lucide="${i.icon}" class="stat-icon"></i>
        <span class="stat-value">${i.value}</span>
        <span class="stat-label">${i.label}</span>
      </div>
    `).join("");
    safeCreateIcons();
  },
  _formatSize(bytes) { /* KB/MB/GB formatter */ }
 };
 ```
 **Also update `showWelcome()`** at **line ~5417** — the dashboard rebuild HTML must include the stats section div. And **line ~5490** — add `DashboardStatsWidget.load()` call.
 **File:** `frontend/style.css` — add CSS for `.dashboard-stats-grid`, `.stat-card`, `.stat-icon`, `.stat-value`, `.stat-label`.
 **Dependency:** Item 1 (Pydantic models) — none. Standalone.
 ---
 ## 3. 🔔 Webhooks (P3) — 45 min
 ### 3a. New backend module: `backend/webhooks.py`
 Create full module with:
 - `WEBHOOKS_FILE = Path("data/webhooks.json")` — persistence
 - `_DEFAULT_WEBHOOKS = []`
 - `get_webhooks() -> list` — reads from disk
 - `create_webhook(name, url, events, secret=None) -> dict` 
 - `update_webhook(id, updates) -> dict`
 - `delete_webhook(id) -> bool`
 - `async def dispatch_webhooks(event_type: str, data: dict)` — calls all webhooks subscribed to `event_type`, sends JSON POST with HMAC-SHA256 signature header if secret is set, timeout 5s, logs failures
 - Model: `WebhookConfig` with `id`, `name`, `url`, `events` (list of event type strings), `secret` (optional), `enabled`, `created_at`, `last_fired_at`
 ### 3b. Backend: CRUD endpoints in `backend/main.py`
 Insert at **line ~2470** (before `GET /api/config`):
 ```python
-@app.get("/api/webhooks")
+def remove_document(self, vault_name: str, path: str):
-@app.post("/api/webhooks")
+    """Remove a document from the inverted index."""
-@app.patch("/api/webhooks/{webhook_id}")
+    
-@app.delete("/api/webhooks/{webhook_id}")
+    doc_key = f"{vault_name}::{path}"
    file_info = self.doc_info.get(doc_key)
    if file_info is None:
        return
    self._remove_doc_internals(doc_key, vault_name, file_info)
    self.doc_count -= 1
 ```
-Import `from backend.webhooks import get_webhooks, create_webhook, update_webhook, delete_webhook, dispatch_webhooks`
+#### `_remove_doc_internals(doc_key, vault_name, file_info)` (private helper)
 ### 3c. Backend: Hook dispatch_webhooks into file events
 Add `await dispatch_webhooks("file_created", {...})` calls alongside each `sse_manager.broadcast(...)` call:
 | Line | Event | Add dispatch |
 |------|-------|-------------|
 | ~1252 | `file_deleted` | `dispatch_webhooks("file_deleted", {"vault":..., "path":...})` |
 | ~1330 | `directory_created` | `dispatch_webhooks("directory_created", {...})` |
 | ~1401 | `directory_renamed` | `dispatch_webhooks("directory_renamed", {...})` |
 | ~1462 | `directory_deleted` | `dispatch_webhooks("directory_deleted", {...})` |
 | ~1532 | `file_created` | `dispatch_webhooks("file_created", {...})` |
 | ~1607 | `file_renamed` | `dispatch_webhooks("file_renamed", {...})` |
 ### 3d. Frontend: Webhooks management UI in Configurations modal
 **File:** `frontend/index.html` — insert at **line ~633** (after `<!-- À propos -->` section, before `</div>` closing config-content):
 ```html
 <section class="config-section">
  <h2>🔔 Webhooks</h2>
  <p class="config-description">Notifications HTTP vers des services externes lors des changements de fichiers.</p>
  <div id="webhooks-list"></div>
  <div class="config-add-pattern">
    <input type="text" id="webhook-name-input" placeholder="Nom" class="config-input" style="width:120px">
    <input type="text" id="webhook-url-input" placeholder="https://..." class="config-input" style="flex:1">
    <button id="webhook-add-btn" class="config-btn-add">Ajouter</button>
  </div>
 </section>
 ```
 **File:** `frontend/app.js` — in `initConfigModal()` at **line ~3918**, add:
 ```javascript
 loadWebhooks();  // in the open handler
 // Event binding for webhook add/save/delete buttons
 ```
 Add functions: `loadWebhooks()`, `renderWebhooks(webhooks)`, `addWebhook()`, `deleteWebhook(id)`, `toggleWebhook(id, enabled)`. All use `api("/api/webhooks", ...)`.
 **File:** `frontend/style.css` — add `.webhook-item`, `.webhook-toggle`, `.webhook-delete` styles.
 **Dependency:** None on items 1–2. Standalone.
 ---
 ## 4. 📤 Publication publique de documents (P3) — 60 min
 ### 4a. New backend module: `backend/share.py`
 Create full module:
 - `SHARES_FILE = Path("data/shares.json")`
 - ShareToken model: `id`, `vault`, `path`, `token` (64-char hex), `created_by`, `created_at`, `expires_at` (optional, null = never), `access_count`, `last_accessed`
 - `create_share(vault, path, created_by, expires_in_hours=None) -> dict` — generates token, stores, returns share info
 - `get_share_by_token(token) -> dict | None` — validates expiry, returns share
 - `revoke_share(id) -> bool`
 - `list_shares(vault_filter=None) -> list` — for admin/settings page
 - `record_access(token)` — increments access_count
 ### 4b. Backend: Endpoints in `backend/main.py`
 Insert at **line ~1619** (before `GET /api/file/{vault_name}`):
 ```python
-# Share management
+def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict):
-@app.post("/api/share/{vault_name}")
+    """Internal: remove one doc_key from all indexes without adjusting doc_count."""
@app.get("/api/shares")
@app.delete("/api/share/{share_id}")
-# Public view (no auth required!)
+    # --- Metadata ---
-@app.get("/s/{token}")
+    self.doc_info.pop(doc_key, None)
-async def public_share_view(token: str): ...
+    self.doc_vault.pop(doc_key, None)
    if vault_name in self.vault_docs:
        self.vault_docs[vault_name].discard(doc_key)
    # --- Tags ---
    for tag in file_info.get("tags", []):
        td = self.tag_docs.get(tag.lower())
        if td:
            td.discard(doc_key)
            if not td:
                del self.tag_docs[tag.lower()]
    # --- Title tokens ---
    title = file_info.get("title", "")
    for token in set(tokenize(title)):
        ti = self.title_index.get(token)
        if ti:
            try:
                ti.remove(doc_key)
            except ValueError:
                pass
            if not ti:
                del self.title_index[token]
    # --- Title norm map ---
    norm_title = normalize_text(title)
    if norm_title and norm_title in self.title_norm_map:
        self.title_norm_map[norm_title] = [
            e for e in self.title_norm_map[norm_title]
            if not (e["vault"] == vault_name and e["path"] == file_info.get("path"))
        ]
        if not self.title_norm_map[norm_title]:
            del self.title_norm_map[norm_title]
    # --- Word index ---
    content = file_info.get("content", "")
    full_text = title + " " + content
    for token in set(tokenize(full_text)):
        wi = self.word_index.get(token)
        if wi:
            wi.pop(doc_key, None)
            if not wi:
                del self.word_index[token]
                # Remove from sorted tokens via bisect
                idx = bisect.bisect_left(self._sorted_tokens, token)
                if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token:
                    self._sorted_tokens.pop(idx)
 ```
-The public view endpoint:
+Key: `bisect.insort` for insertion and `bisect.bisect_left` + `pop(idx)` for removal keep `_sorted_tokens` sorted. Each operation is O(V) in the worst case (list shift), but this is negligible compared to an O(N * content) rebuild.
 1. Looks up token via `get_share_by_token(token)`
 2. Reads the file content
 3. Renders markdown with redacted secrets
 4. Returns simple HTML page (not SPA) with rendered content
 5. Increments access count
-### 4c. Frontend: Share button in file actions
+### Step 3: Modify `get_inverted_index()` to NOT check staleness
-**File:** `frontend/app.js` — in `renderFile()`, at **line ~3250** (after pop-out button):
+File: `backend/search.py`
 ```javascript
 const shareBtn = el("button", { class: "btn-action", title: "Partager" }, [icon("share-2", 14), document.createTextNode("Partager")]);
 shareBtn.addEventListener("click", () => openShareDialog(data.vault, data.path));
 ```
 Add `shareBtn` to the file-actions div at **line ~3300**.
 Add `openShareDialog(vault, path)` function that:
 - Calls `POST /api/share/{vault}` to create a share
 - Shows a modal with the share URL (copyable) and expiration options
 - Shows existing shares list with revoke buttons
 ### 4d. Frontend: Share management in Configurations
 **File:** `frontend/index.html` — add share management section in config modal (alongside webhooks).
 **File:** `frontend/app.js` — `loadShares()` and `renderShares()` functions.
 **File:** `frontend/style.css` — add `.share-dialog`, `.share-url`, `.share-item` styles.
 **Dependency:** Soft dependency on item 1 (needed for clean models); otherwise standalone.
 ---
 ## 5. 🔄 Gestion conflits Syncthing (P2) — 45 min
 ### 5a. Backend: Conflict file detection
 **File:** `backend/indexer.py` — add after `_backlink_index`:
 ```python
-def get_conflicts() -> list:
+def get_inverted_index() -> InvertedIndex:
-    """Scan all vaults for Syncthing/Nextcloud sync-conflict files."""
+    """Return the singleton inverted index. Always up-to-date via hooks."""
-    conflicts = []
+    return _inverted_index
    pattern = re.compile(r'\.sync-conflict-(\d{8}-\d{6})\.')
    for vname, vdata in index.items():
        for f in vdata.get("files", []):
            m = pattern.search(f["path"])
            if m:
                # Find the original file
                orig_path = pattern.sub("", f["path"])
                conflicts.append({
                    "vault": vname,
                    "conflict_path": f["path"],
                    "original_path": orig_path,
                    "conflict_date": m.group(1),
                    "conflict_title": f.get("title", ""),
                })
    return conflicts
 ```
-### 5b. Backend: Endpoints
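+Remove `is_stale()` and the `_last_rebuild` / `_rebuild_cooldown` fields, and stop using `_source_generation` for staleness checks (Step 6 keeps that field for diagnostics only). Keep `rebuild()` for the initial build and manual reindex (it is still called once at startup, via `init_inverted_index()` after `build_index` completes).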
-**File:** `backend/main.py` — insert at **line ~2547**:
+### Step 4: Call `rebuild()` once after initial index build
 In `backend/search.py`, register the hook AND call rebuild once:
 ```python
-@app.get("/api/conflicts")
+# After InvertedIndex class and _inverted_index = InvertedIndex()
-async def api_conflicts(current_user=Depends(require_auth)):
+
-    """List sync-conflict files across accessible vaults."""
+def _on_index_change_hook(action, vault_name, path, file_info):
-    ...
+    inv = _inverted_index
-
+    try:
-@app.post("/api/conflicts/resolve")
+        if action == 'add':
-async def api_conflict_resolve(body: dict, current_user=Depends(require_auth)):
+            inv.add_document(vault_name, path, file_info)
-    """Resolve a conflict: keep_local (delete conflict), keep_conflict (replace original)."""
+        elif action == 'remove':
            inv.remove_document(vault_name, path)
    except Exception as e:
        logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
        # Fallback: mark for rebuild on next search
        inv._needs_rebuild = True
 _indexer.set_index_change_hook(_on_index_change_hook)
 # Initial build trigger — called after first index is built
 def init_inverted_index():
    """Force initial inverted index build. Called after build_index completes."""
    _inverted_index.rebuild()
 def get_inverted_index() -> InvertedIndex:
    """Return the singleton inverted index."""
    # Only check for rebuild if incremental updates have failed
    # OR if this is the very first call (doc_count == 0 and index has files)
    if getattr(_inverted_index, '_needs_rebuild', False):
        _inverted_index.rebuild()
        _inverted_index._needs_rebuild = False
    elif _inverted_index.doc_count == 0 and any(
        vdata.get("files") for vdata in index.values()
    ):
        _inverted_index.rebuild()
    return _inverted_index
 ```
 ### Step 5: Call `init_inverted_index()` after `build_index()` in main.py
 In `backend/main.py`, after `build_index()` completes in the lifespan handler, call:
 ```python
 from backend.search import init_inverted_index
 init_inverted_index()
 ```
 This ensures the inverted index is built once on startup, then incrementally maintained thereafter.
 ### Tag prefix index handling
 The `tag_norm_map` and `tag_prefix_index` are built per-vault in `rebuild()`. For incremental updates, we need to handle tag changes:
 In `add_document`, after adding doc tags:
 ```python
 # Check if any tags are new (not in tag_norm_map)
 for tag in tags:
    norm_tag = normalize_text(tag)
    if norm_tag not in self.tag_norm_map:
        self.tag_norm_map[norm_tag] = tag
        for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
            prefix = norm_tag[:plen]
            if tag not in self.tag_prefix_index[prefix]:
                self.tag_prefix_index[prefix].append(tag)
 ```
 In `_remove_doc_internals`, we do NOT remove tags from `tag_norm_map` or `tag_prefix_index` — these hold the tag vocabulary (built per-vault in `rebuild()`), not per-document data, so they only grow over the lifetime of the inverted index. A full `rebuild()` (triggered by a manual reindex) will clean them up.
 ### Step 6: Remove cooldown hack from search.py
 Remove:
 - `_last_rebuild` and `_rebuild_cooldown` fields from `InvertedIndex.__init__`
 - `is_stale()` method
 - Staleness use of the `_source_generation` field (the field is no longer needed for staleness, but the field itself is kept for diagnostics)
 ### Step 7: Remove coalescence hack from main.py
 In `_on_vault_change` in `backend/main.py`, remove:
 ```python
 old_gen = idx._index_generation
 ...
 if idx._index_generation > old_gen + 1:
    idx._index_generation = old_gen + 1
 ```
-### 5c. Backend: Diff endpoint
+This hack was only needed to reduce the number of inverted index rebuilds. With incremental updates, it's unnecessary — each mutation is cheap.
-```python
+## Files Modified (Summary)
@app.get("/api/conflicts/diff")
 async def api_conflict_diff(vault: str, original: str, conflict: str, current_user=Depends(require_auth)):
    """Return unified diff between original and conflict file."""
    import difflib
    ...
 ```
-### 5d. Frontend: Conflict dashboard widget
+| File | Changes |
 |------|---------|
 | `backend/indexer.py` | +`_on_index_change` hook variable, +`set_index_change_hook()`, +hook calls in `_add_file_to_structures` and `_remove_file_from_structures` |
 | `backend/search.py` | +`add_document()`, +`remove_document()`, +`_remove_doc_internals()`, +`init_inverted_index()`, +hook registration, remove `is_stale()`/cooldown, simplify `get_inverted_index()` |
 | `backend/main.py` | +`init_inverted_index()` call after `build_index()`, remove coalescence hack in `_on_vault_change` |
-**File:** `frontend/index.html` — add `#dashboard-conflicts-section` in dashboard after stats section.
+## Risks & Edge Cases
-**File:** `frontend/app.js` — add `DashboardConflictsWidget` (pattern similar to recent/bookmarks):
+1. **Thread safety:** `_add_file_to_structures` and `_remove_file_from_structures` are protected by `_index_lock` / `_async_index_lock` in indexer.py. The InvertedIndex methods are called inside these locks, so they're also protected. No additional locking needed.
 - `load()` → `GET /api/conflicts`
 - `render()` → shows conflict cards with file names and dates
 - Click → opens diff modal showing side-by-side comparison
 - Action buttons: "Garder l'original", "Garder le conflit"
-**File:** `frontend/style.css` — add `.conflict-card`, `.conflict-diff`, `.conflict-actions` styles.
+2. **Hook registration timing:** `search.py` imports `indexer.py` at the top, then later registers the hook. The hook is registered at module load time, BEFORE the first call to `build_index`. So `_on_index_change` is set when `build_index` runs — but `build_index` calls `_add_file_to_structures` internally, which would try to incrementally update an empty inverted index. **Fix:** The hook checks `if _inverted_index.doc_count == 0` and skips incremental updates; the initial `rebuild()` handles the bulk load.
-**Dependency:** None. Standalone.
+3. **Hook call during initial build_index:** (Same scenario as item 2, viewed as a performance cost.) `build_index` iterates files and calls `_add_file_to_structures`. The hook fires for each file, calling `add_document()` on an empty inverted index, which is slower than a single `rebuild()`. **Fix:** Add a flag `_inverted_index._ready = False` initially, set to `True` after `init_inverted_index()`. The hook skips when `_ready` is `False`. Note: the `_on_index_change_hook` snippet in Step 4 does not yet include this guard and must be updated accordingly.
---
+4. **Sorted tokens performance:** `bisect.insort` and `list.pop(idx)` are O(V) worst case for large V. For 40k files, the vocabulary size V is typically 50k-200k tokens. O(V) for a single insertion is ~0.001ms, acceptable. The rebuild() call at startup handles the initial bulk.
-## Execution Order (optimal)
+5. **tag_norm_map / tag_prefix_index growth:** These grow monotonically (never shrink on incremental remove). With 40k files and thousands of tags, this is a few thousand entries — negligible. A manual "Réindexer" button triggers a full `rebuild()` to clean up.
 1. **Item 1** — OpenAPI docs (quick win, no risk)
 2. **Item 2** — Dashboard stats (standalone, visible result)
 3. **Item 3** — Webhooks (new module + integration, most code)
 4. **Item 4** — Public shares (new module + public view, security-sensitive)
 5. **Item 5** — Syncthing conflicts (standalone, nice-to-have)
 **Total estimated effort:** ~3 hours
 ## Files Summary
 | File | Action | Items |
 |------|--------|-------|
 | `backend/main.py` | Edit | 1 (models), 2a (endpoint), 3b+c (webhook CRUD+dispatch), 4b (share+public view), 5b+c (conflicts) |
 | `backend/webhooks.py` | **Create** | 3a |
 | `backend/share.py` | **Create** | 4a |
 | `backend/indexer.py` | Edit | 5a (get_conflicts) |
 | `frontend/index.html` | Edit | 2b, 3d, 4d, 5d (dashboard + config sections) |
 | `frontend/app.js` | Edit | 2b, 3d, 4c, 5d (widgets + share button + webhook UI) |
 | `frontend/style.css` | Edit | 2b, 3d, 4c, 5d (all new CSS classes) |