From 775722f5d4156294bcee729ed7f3a291e845c87a Mon Sep 17 00:00:00 2001 From: Bruno Charest Date: Tue, 26 May 2026 12:37:59 -0400 Subject: [PATCH] Switch inverted index from stale check to incremental updates Register a hook with the indexer so that file add/remove events incrementally maintain the inverted index, removing the need for periodic staleness checks and cooldowns. Rebuild the index once on startup via init_inverted_index(). --- backend/indexer.py | 9 + backend/main.py | 15 +- backend/search.py | 176 +++++++++++-- plan.md | 623 ++++++++++++++++++++++----------------------- 4 files changed, 460 insertions(+), 363 deletions(-) diff --git a/backend/indexer.py b/backend/indexer.py index de24d76..806d637 100644 --- a/backend/indexer.py +++ b/backend/indexer.py @@ -599,6 +599,11 @@ def _remove_file_from_structures(vault_name: str, rel_path: str) -> Optional[Dic path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path] _index_generation += 1 + + # Notify inverted index for incremental update + if _on_index_change: + _on_index_change('remove', vault_name, rel_path, removed) + return removed @@ -666,6 +671,10 @@ def _add_file_to_structures(vault_name: str, file_info: Dict[str, Any]): _index_generation += 1 + # Notify inverted index for incremental update + if _on_index_change: + _on_index_change('add', vault_name, file_info["path"], file_info) + async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]: """Re-index a single file without full rebuild. 
diff --git a/backend/main.py b/backend/main.py index a973d9c..52c96c4 100644 --- a/backend/main.py +++ b/backend/main.py @@ -44,7 +44,7 @@ from backend.indexer import ( remove_vault_from_index, add_vault_to_index, ) -from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags +from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags, init_inverted_index from backend.image_processor import preprocess_images from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats from backend.vault_settings import ( @@ -379,14 +379,9 @@ async def _on_vault_change(events: list): Processes each event (create/modify/delete/move) and updates the index incrementally, then broadcasts SSE notifications. """ - import backend.indexer as idx updated_vaults = set() changes = [] - # Temporarily suppress per-file generation increments to coalesce them - # into a single increment at the end of the batch. - old_gen = idx._index_generation - for event in events: vault_name = event["vault"] event_type = event["type"] @@ -415,11 +410,6 @@ async def _on_vault_change(events: list): except Exception as e: logger.error(f"Error processing {event_type} event for {src}: {e}") - # Restore generation to only increment by 1 for the whole batch - # (unless it was already incremented by other operations) - if idx._index_generation > old_gen + 1: - idx._index_generation = old_gen + 1 - if changes: await sse_manager.broadcast("index_updated", { "vaults": list(updated_vaults), @@ -520,6 +510,9 @@ async def lifespan(app: FastAPI): logger.info("Background indexing started") await build_index(_progress_cb) + # Build inverted index for search (one-time, then incremental) + init_inverted_index() + # Start file watcher config = _load_config() watcher_enabled = config.get("watcher_enabled", True) diff --git a/backend/search.py b/backend/search.py index 4ee1a57..3ff27e4 100644 --- a/backend/search.py +++ 
b/backend/search.py @@ -239,9 +239,9 @@ def _escape_html(text: str) -> str: class InvertedIndex: """In-memory inverted index supporting TF-IDF scoring. - Built lazily from the global ``index`` dict whenever a search or - suggestion request detects that the underlying vault index has changed. - The class is designed to be a singleton β€” use ``get_inverted_index()``. + Built initially via ``rebuild()`` from the global index, then + maintained incrementally via ``add_document()`` / ``remove_document()`` + hooks from the file watcher and API mutations. Attributes: word_index: ``{token: {doc_key: term_frequency}}`` @@ -249,7 +249,6 @@ class InvertedIndex: tag_norm_map: ``{normalized_tag: original_tag}`` tag_prefix_index: ``{prefix: [original_tag, ...]}`` doc_count: Total number of indexed documents. - _source_id: Fingerprint of the source index to detect staleness. """ def __init__(self) -> None: @@ -264,23 +263,7 @@ class InvertedIndex: self.vault_docs: Dict[str, set] = defaultdict(set) self.tag_docs: Dict[str, set] = defaultdict(set) self._sorted_tokens: List[str] = [] - self._source_generation: int = -1 - self._last_rebuild: float = 0 - self._rebuild_cooldown: float = 3.0 # seconds - - def is_stale(self) -> bool: - """Check if the inverted index needs rebuilding. - - Uses a cooldown (3s) to prevent rapid rebuilds from file watcher - events. Staleness is only reported if the generation has changed - AND the cooldown has elapsed since the last rebuild. - """ - import time - if _indexer._index_generation == self._source_generation: - return False - if time.time() - self._last_rebuild < self._rebuild_cooldown: - return False - return True + self._ready: bool = False # True after initial build def rebuild(self) -> None: """Rebuild inverted index from the global ``index`` dict. @@ -288,9 +271,7 @@ class InvertedIndex: Tokenizes titles and content of every file, computes term frequencies, and builds auxiliary indexes for tag and title prefix suggestions. 
""" - import time - self._last_rebuild = time.time() - logger.info("Rebuilding inverted index...") + logger.info("Building inverted index...") self.word_index = defaultdict(dict) self.title_index = defaultdict(list) self.tag_norm_map = {} @@ -351,7 +332,7 @@ class InvertedIndex: self.tag_prefix_index[prefix].append(tag) self._sorted_tokens = sorted(self.word_index.keys()) - self._source_generation = _indexer._index_generation + self._ready = True logger.info( "Inverted index built: %d documents, %d unique tokens, %d tags", self.doc_count, @@ -359,6 +340,117 @@ class InvertedIndex: len(self.tag_norm_map), ) + def add_document(self, vault_name: str, path: str, file_info: dict): + """Add or update a single document incrementally.""" + if not self._ready: + return + doc_key = f"{vault_name}::{path}" + old_file_info = self.doc_info.get(doc_key) + if old_file_info is not None: + self._remove_doc_internals(doc_key, vault_name, old_file_info, skip_sorted_cleanup=True) + else: + self.doc_count += 1 + # Metadata + self.doc_info[doc_key] = file_info + self.doc_vault[doc_key] = vault_name + self.vault_docs[vault_name].add(doc_key) + # Tags + tags = file_info.get("tags", []) + for tag in tags: + self.tag_docs[tag.lower()].add(doc_key) + norm_tag = normalize_text(tag) + if norm_tag not in self.tag_norm_map: + self.tag_norm_map[norm_tag] = tag + for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1): + prefix = norm_tag[:plen] + if tag not in self.tag_prefix_index[prefix]: + self.tag_prefix_index[prefix].append(tag) + # Title tokens + title = file_info.get("title", "") + title_tokens = tokenize(title) + for token in set(title_tokens): + if token: + self.title_index[token].append(doc_key) + # Title norm map + norm_title = normalize_text(title) + if norm_title: + self.title_norm_map[norm_title].append({"vault": vault_name, "path": path, "title": title}) + # Word index (content + title TF) + content = file_info.get("content", "") + full_text = title + " " + content + tokens = 
tokenize(full_text) + tf: Dict[str, int] = defaultdict(int) + for token in tokens: + if token: + tf[token] += 1 + for token, freq in tf.items(): + if not self.word_index.get(token): + bisect.insort(self._sorted_tokens, token) + self.word_index[token][doc_key] = freq + + def remove_document(self, vault_name: str, path: str): + """Remove a single document incrementally.""" + if not self._ready: + return + doc_key = f"{vault_name}::{path}" + file_info = self.doc_info.get(doc_key) + if file_info is None: + return + self._remove_doc_internals(doc_key, vault_name, file_info, skip_sorted_cleanup=False) + self.doc_count -= 1 + + def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict, skip_sorted_cleanup: bool = False): + """Remove one doc_key from all indexes without adjusting doc_count.""" + # Metadata + self.doc_info.pop(doc_key, None) + self.doc_vault.pop(doc_key, None) + if vault_name in self.vault_docs: + self.vault_docs[vault_name].discard(doc_key) + # Tags (per-document, NOT the global tag_norm_map) + for tag in file_info.get("tags", []): + td = self.tag_docs.get(tag.lower()) + if td: + td.discard(doc_key) + if not td: + del self.tag_docs[tag.lower()] + # Title tokens + title = file_info.get("title", "") + for token in set(tokenize(title)): + if not token: + continue + ti = self.title_index.get(token) + if ti: + try: + ti.remove(doc_key) + except ValueError: + pass + if not ti: + del self.title_index[token] + # Title norm map + norm_title = normalize_text(title) + if norm_title and norm_title in self.title_norm_map: + self.title_norm_map[norm_title] = [ + e for e in self.title_norm_map[norm_title] + if not (e["vault"] == vault_name and e["path"] == file_info.get("path")) + ] + if not self.title_norm_map[norm_title]: + del self.title_norm_map[norm_title] + # Word index + content = file_info.get("content", "") + full_text = title + " " + content + for token in set(tokenize(full_text)): + if not token: + continue + wi = 
self.word_index.get(token) + if wi: + wi.pop(doc_key, None) + if not wi: + del self.word_index[token] + if not skip_sorted_cleanup: + idx = bisect.bisect_left(self._sorted_tokens, token) + if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token: + self._sorted_tokens.pop(idx) + def idf(self, term: str) -> float: """Inverse Document Frequency for a term. @@ -424,9 +516,39 @@ class InvertedIndex: _inverted_index = InvertedIndex() +def _on_index_change_hook(action: str, vault_name: str, path: str, file_info: dict): + """Callback registered with indexer for incremental inverted index updates.""" + inv = _inverted_index + try: + if action == 'add' and file_info: + inv.add_document(vault_name, path, file_info) + elif action == 'remove': + inv.remove_document(vault_name, path) + except Exception as e: + logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}") + + +# Register the hook with indexer (indexer is already imported at top of file) +_indexer.set_index_change_hook(_on_index_change_hook) + + +def init_inverted_index(): + """Force initial inverted index build. Called after build_index completes on startup.""" + if any(vdata.get("files") for vdata in index.values()): + _inverted_index.rebuild() + logger.info("Inverted index initialized.") + + def get_inverted_index() -> InvertedIndex: - """Return the singleton inverted index, rebuilding if stale.""" - if _inverted_index.is_stale(): + """Return the singleton inverted index. + + Auto-builds on first call if the index has files but the inverted + index hasn't been built yet (fallback for paths that don't go through + ``init_inverted_index()``). 
+ """ + if _inverted_index.doc_count == 0 and any( + vdata.get("files") for vdata in index.values() + ): _inverted_index.rebuild() return _inverted_index diff --git a/plan.md b/plan.md index df32609..73acc35 100644 --- a/plan.md +++ b/plan.md @@ -1,375 +1,348 @@ -# Implementation Plan — Remaining Roadmap Items +# Plan: Incremental InvertedIndex for 40k+ files -## 1. 📝 Documentation OpenAPI enrichie (P3) — 5 min +## Problem Summary -**Goal:** Add `Field(description=...)` to all Pydantic models without descriptions in `backend/main.py`. +Every file mutation calls `_add_file_to_structures` / `_remove_file_from_structures` in `backend/indexer.py`, which increments `_index_generation`. When the next search or autocomplete fires, `get_inverted_index()` in `backend/search.py` detects staleness (`is_stale()` returns True) and triggers a full `rebuild()` — O(N) tokenization of ALL files. With 40k+ files this takes 2-5 seconds, making search unusable. -**Models to update (lines 89–311):** +The existing 3-second cooldown hack in `is_stale()` only masks the problem; it doesn't fix it. 
-| Line | Model | Fields to annotate | -|------|-------|--------------------| -| 89 | `FileContentResponse` | `vault`, `path`, `title`, `tags`, `frontmatter`, `html`, `raw_length`, `extension`, `is_markdown`, `unsupported`, `size_bytes` | -| 103 | `FileRawResponse` | `vault`, `path`, `raw` | -| 110 | `FileSaveResponse` | `status`, `vault`, `path`, `size` | -| 118 | `FileDeleteResponse` | `status`, `vault`, `path` | -| 125 | `SearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` | -| 136 | `SearchResponse` | `query`, `vault_filter`, `tag_filter`, `count`, `results` (total, offset, limit already have Field) | -| 146 | `TagsResponse` | `vault_filter`, `tags` | -| 152 | `TreeSearchResult` | `vault`, `path`, `name`, `matched_path` (type has Field) | -| 161 | `TreeSearchResponse` | `query`, `vault_filter`, `results` | -| 168 | `AdvancedSearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` | -| 179 | `SearchFacets` | `tags`, `vaults` (already have default_factory) | -| 185 | `AdvancedSearchResponse` | `results`, `total`, `offset`, `limit`, `facets` (query_time_ms has Field) | -| 196 | `TitleSuggestion` | `vault`, `path`, `title` | -| 203 | `SuggestResponse` | `query`, `suggestions` | -| 209 | `TagSuggestion` | `tag`, `count` | -| 215 | `TagSuggestResponse` | `query`, `suggestions` | -| 221 | `GraphNode` | (all fields already have Field) | -| 231 | `GraphEdge` | (all fields already have Field) | -| 239 | `GraphResponse` | `vault`, `path`, `nodes`, `edges` | -| 247 | `ReloadResponse` | `status`, `vaults` | -| 253 | `HealthResponse` | `status`, `version`, `vaults`, `total_files` | -| 265 | `DirectoryCreateResponse` | `success`, `path` | -| 284 | `DirectoryDeleteResponse` | `success`, `deleted_count` | -| 296 | `FileCreateResponse` | `success`, `path` | -| 307 | `FileRenameResponse` | `success` | +## Solution: Incremental Add/Remove on the InvertedIndex -**Dependency:** None. Pure documentation change. 
+Add two methods to `InvertedIndex` that update ALL internal data structures incrementally when a single file is added, modified, or removed: ---- +- `add_document(vault_name, path, file_info)` β€” called on create/modify +- `remove_document(vault_name, path)` β€” called on delete/move-source -## 2. πŸ“Š Dashboard statistiques (P3) β€” 30 min +Then hook these into `_add_file_to_structures` and `_remove_file_from_structures` in `backend/indexer.py` so the inverted index never goes stale. -### 2a. Backend: `GET /api/dashboard` (new endpoint) +Remove the `is_stale()` / `rebuild()` / cooldown mechanism entirely. The inverted index is always current. -**File:** `backend/main.py` β€” insert at **line ~2547** (after `/api/diagnostics`) +## Dependency Architecture + +**Current import chain:** +``` +main.py β†’ search.py β†’ indexer.py (search.py imports `from backend import indexer as _indexer`) +``` + +**Problem:** `indexer.py` currently does NOT import from `search.py`. If we add `from backend.search import get_inverted_index` to indexer.py, we create a circular import: `search.py β†’ indexer.py β†’ search.py`. + +**Fix β€” Option C (Callback/Hook pattern, simplest):** + +Add a module-level hook variable in `backend/indexer.py`: ```python -@app.get("/api/dashboard") -async def api_dashboard(current_user=Depends(require_auth)): - """Aggregated dashboard statistics across all accessible vaults.""" - from backend.indexer import index, vault_config, path_index - user_vaults = current_user.get("_token_vaults") or current_user.get("vaults", []) +# In backend/indexer.py +_on_index_change: callable = None # Called as (action, vault_name, path, file_info_or_None) - vault_stats = [] - total_files = 0 - total_tags = set() - total_size = 0 +def set_index_change_hook(hook): + """Register a callback for incremental index updates. + hook(action, vault_name, path, file_info_or_None) where action is 'add' or 'remove'. 
+ """ + global _on_index_change + _on_index_change = hook +``` - for vname, vdata in index.items(): - if "*" not in user_vaults and vname not in user_vaults: - continue - files = vdata.get("files", []) - file_count = len(files) - total_files += file_count - tags = set() - for f in files: - tags.update(f.get("tags", [])) - total_size += f.get("size", 0) - total_tags.update(tags) - vault_stats.append({ - "name": vname, - "file_count": file_count, - "tag_count": len(tags), - "total_size_bytes": sum(f.get("size", 0) for f in files), +Then at the end of `_add_file_to_structures`: +```python +if _on_index_change: + _on_index_change('add', vault_name, rel_path, file_info) +``` + +At the end of `_remove_file_from_structures`: +```python +if _on_index_change: + _on_index_change('remove', vault_name, rel_path, file_info) # file_info = removed dict or None +``` + +Then in `backend/search.py`, at module load time (after InvertedIndex class is defined): +```python +def _on_index_change_hook(action, vault_name, path, file_info): + inv = get_inverted_index_raw() # get without rebuild check + if action == 'add': + inv.add_document(vault_name, path, file_info) + elif action == 'remove': + inv.remove_document(vault_name, path) + +# Register the hook β€” this triggers an import of indexer, but indexer is already imported +# by the time this line runs (since search.py does `from backend import indexer as _indexer` above) +_indexer.set_index_change_hook(_on_index_change_hook) +``` + +This avoids circular imports completely because: +1. `search.py` already imports `indexer.py` at the top (`from backend import indexer as _indexer`) +2. `indexer.py` never imports `search.py` β€” it just stores a callback +3. 
`search.py` registers the callback AFTER the InvertedIndex class is defined + +## Detailed Implementation + +### Step 1: Add hook variable to `backend/indexer.py` + +File: `backend/indexer.py` +Changes: +- After `_index_generation` global (line ~28), add: + ```python + _on_index_change: callable = None + ``` +- Add function `set_index_change_hook(hook)` (bottom of file, near other public functions) +- Add `if _on_index_change: _on_index_change('add', vault_name, file_info['path'], file_info)` at end of `_add_file_to_structures` (~line 665) +- Add `if _on_index_change: _on_index_change('remove', vault_name, rel_path, removed)` at end of `_remove_file_from_structures` (~line 597) + +### Step 2: Add `add_document` and `remove_document` to InvertedIndex + +File: `backend/search.py` + +#### `add_document(vault_name, path, file_info)` + +```python +def add_document(self, vault_name: str, path: str, file_info: dict): + """Add or update a single document in the inverted index.""" + + doc_key = f"{vault_name}::{path}" + old_file_info = self.doc_info.get(doc_key) + + # If updating an existing document, remove old entries first + if old_file_info is not None: + self._remove_doc_internals(doc_key, vault_name, old_file_info) + else: + self.doc_count += 1 + + # --- Metadata --- + self.doc_info[doc_key] = file_info + self.doc_vault[doc_key] = vault_name + self.vault_docs[vault_name].add(doc_key) + + # --- Tags --- + tags = file_info.get("tags", []) + for tag in tags: + self.tag_docs[tag.lower()].add(doc_key) + + # --- Title tokens --- + title = file_info.get("title", "") + title_tokens = tokenize(title) + for token in set(title_tokens): + self.title_index[token].append(doc_key) + + # --- Normalized title for prefix suggestions --- + norm_title = normalize_text(title) + if norm_title: + self.title_norm_map[norm_title].append({ + "vault": vault_name, + "path": path, + "title": title, }) - - return { - "vaults": vault_stats, - "total_files": total_files, - "total_tags": 
len(total_tags), - "total_size_bytes": total_size, - } + + # --- Word index (content + title TF) --- + content = file_info.get("content", "") + full_text = title + " " + content + tokens = tokenize(full_text) + tf = defaultdict(int) + for token in tokens: + tf[token] += 1 + + # Track which tokens are new (not previously indexed) for sorted_tokens update + new_tokens = [] + for token, freq in tf.items(): + if not self.word_index.get(token): + new_tokens.append(token) + self.word_index[token][doc_key] = freq + + # Incrementally update _sorted_tokens (avoid O(V log V) full re-sort) + if new_tokens: + for token in new_tokens: + bisect.insort(self._sorted_tokens, token) ``` -**No new model needed** β€” return plain dict (or add optional `DashboardResponse` model). - -### 2b. Frontend: Insert stats widget in dashboard-home - -**File:** `frontend/index.html` β€” **after line 364** (`` closing bookmarks section, before ``) - -Add: -```html - -
-
-
- -

Statistiques

-
-
-
-
Chargement...
-
-
-``` - -**File:** `frontend/app.js` — add `DashboardStatsWidget` module (insert at **line ~3343**, before `DashboardRecentWidget`): - -```javascript -const DashboardStatsWidget = { - async load() { - const grid = document.getElementById("dashboard-stats-grid"); - if (!grid) return; - grid.innerHTML = '
Chargement...
'; - try { - const data = await api("/api/dashboard"); - this.render(data); - } catch (err) { - grid.innerHTML = `
Erreur: ${escapeHtml(err.message)}
`; - } - }, - render(data) { - const grid = document.getElementById("dashboard-stats-grid"); - if (!grid) return; - const items = [ - { icon: "files", label: "Fichiers", value: data.total_files.toLocaleString() }, - { icon: "tags", label: "Tags uniques", value: data.total_tags.toLocaleString() }, - { icon: "hard-drive", label: "Taille totale", value: this._formatSize(data.total_size_bytes) }, - { icon: "folder", label: "Vaults", value: data.vaults.length.toString() }, - ]; - grid.innerHTML = items.map(i => ` -
- - ${i.value} - ${i.label} -
- `).join(""); - safeCreateIcons(); - }, - _formatSize(bytes) { /* KB/MB/GB formatter */ } -}; -``` - -**Also update `showWelcome()`** at **line ~5417** β€” the dashboard rebuild HTML must include the stats section div. And **line ~5490** β€” add `DashboardStatsWidget.load()` call. - -**File:** `frontend/style.css` β€” add CSS for `.dashboard-stats-grid`, `.stat-card`, `.stat-icon`, `.stat-value`, `.stat-label`. - -**Dependency:** Item 1 (Pydantic models) β€” none. Standalone. - ---- - -## 3. πŸ”” Webhooks (P3) β€” 45 min - -### 3a. New backend module: `backend/webhooks.py` - -Create full module with: -- `WEBHOOKS_FILE = Path("data/webhooks.json")` β€” persistence -- `_DEFAULT_WEBHOOKS = []` -- `get_webhooks() -> list` β€” reads from disk -- `create_webhook(name, url, events, secret=None) -> dict` -- `update_webhook(id, updates) -> dict` -- `delete_webhook(id) -> bool` -- `async def dispatch_webhooks(event_type: str, data: dict)` β€” calls all webhooks subscribed to `event_type`, sends JSON POST with HMAC-SHA256 signature header if secret is set, timeout 5s, logs failures -- Model: `WebhookConfig` with `id`, `name`, `url`, `events` (list of event type strings), `secret` (optional), `enabled`, `created_at`, `last_fired_at` - -### 3b. Backend: CRUD endpoints in `backend/main.py` - -Insert at **line ~2470** (before `GET /api/config`): +#### `remove_document(vault_name, path)` ```python -@app.get("/api/webhooks") -@app.post("/api/webhooks") -@app.patch("/api/webhooks/{webhook_id}") -@app.delete("/api/webhooks/{webhook_id}") +def remove_document(self, vault_name: str, path: str): + """Remove a document from the inverted index.""" + + doc_key = f"{vault_name}::{path}" + file_info = self.doc_info.get(doc_key) + if file_info is None: + return + + self._remove_doc_internals(doc_key, vault_name, file_info) + self.doc_count -= 1 ``` -Import `from backend.webhooks import get_webhooks, create_webhook, update_webhook, delete_webhook, dispatch_webhooks` - -### 3c. 
Backend: Hook dispatch_webhooks into file events - -Add `await dispatch_webhooks("file_created", {...})` calls alongside each `sse_manager.broadcast(...)` call: - -| Line | Event | Add dispatch | -|------|-------|-------------| -| ~1252 | `file_deleted` | `dispatch_webhooks("file_deleted", {"vault":..., "path":...})` | -| ~1330 | `directory_created` | `dispatch_webhooks("directory_created", {...})` | -| ~1401 | `directory_renamed` | `dispatch_webhooks("directory_renamed", {...})` | -| ~1462 | `directory_deleted` | `dispatch_webhooks("directory_deleted", {...})` | -| ~1532 | `file_created` | `dispatch_webhooks("file_created", {...})` | -| ~1607 | `file_renamed` | `dispatch_webhooks("file_renamed", {...})` | - -### 3d. Frontend: Webhooks management UI in Configurations modal - -**File:** `frontend/index.html` — insert at **line ~633** (after `` section, before `` closing config-content): - -```html
-

🔔 Webhooks

-

Notifications HTTP vers des services externes lors des changements de fichiers.

-
-
- - - -
-
-``` - -**File:** `frontend/app.js` β€” in `initConfigModal()` at **line ~3918**, add: -```javascript -loadWebhooks(); // in the open handler -// Event binding for webhook add/save/delete buttons -``` - -Add functions: `loadWebhooks()`, `renderWebhooks(webhooks)`, `addWebhook()`, `deleteWebhook(id)`, `toggleWebhook(id, enabled)`. All use `api("/api/webhooks", ...)`. - -**File:** `frontend/style.css` β€” add `.webhook-item`, `.webhook-toggle`, `.webhook-delete` styles. - -**Dependency:** None on items 1–2. Standalone. - ---- - -## 4. πŸ“€ Publication publique de documents (P3) β€” 60 min - -### 4a. New backend module: `backend/share.py` - -Create full module: -- `SHARES_FILE = Path("data/shares.json")` -- ShareToken model: `id`, `vault`, `path`, `token` (64-char hex), `created_by`, `created_at`, `expires_at` (optional, null = never), `access_count`, `last_accessed` -- `create_share(vault, path, created_by, expires_in_hours=None) -> dict` β€” generates token, stores, returns share info -- `get_share_by_token(token) -> dict | None` β€” validates expiry, returns share -- `revoke_share(id) -> bool` -- `list_shares(vault_filter=None) -> list` β€” for admin/settings page -- `record_access(token)` β€” increments access_count - -### 4b. Backend: Endpoints in `backend/main.py` - -Insert at **line ~1619** (before `GET /api/file/{vault_name}`): +#### `_remove_doc_internals(doc_key, vault_name, file_info)` (private helper) ```python -# Share management -@app.post("/api/share/{vault_name}") -@app.get("/api/shares") -@app.delete("/api/share/{share_id}") - -# Public view (no auth required!) -@app.get("/s/{token}") -async def public_share_view(token: str): ... 
+def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict): + """Internal: remove one doc_key from all indexes without adjusting doc_count.""" + + # --- Metadata --- + self.doc_info.pop(doc_key, None) + self.doc_vault.pop(doc_key, None) + if vault_name in self.vault_docs: + self.vault_docs[vault_name].discard(doc_key) + + # --- Tags --- + for tag in file_info.get("tags", []): + td = self.tag_docs.get(tag.lower()) + if td: + td.discard(doc_key) + if not td: + del self.tag_docs[tag.lower()] + + # --- Title tokens --- + title = file_info.get("title", "") + for token in set(tokenize(title)): + ti = self.title_index.get(token) + if ti: + try: + ti.remove(doc_key) + except ValueError: + pass + if not ti: + del self.title_index[token] + + # --- Title norm map --- + norm_title = normalize_text(title) + if norm_title and norm_title in self.title_norm_map: + self.title_norm_map[norm_title] = [ + e for e in self.title_norm_map[norm_title] + if not (e["vault"] == vault_name and e["path"] == file_info.get("path")) + ] + if not self.title_norm_map[norm_title]: + del self.title_norm_map[norm_title] + + # --- Word index --- + content = file_info.get("content", "") + full_text = title + " " + content + for token in set(tokenize(full_text)): + wi = self.word_index.get(token) + if wi: + wi.pop(doc_key, None) + if not wi: + del self.word_index[token] + # Remove from sorted tokens via bisect + idx = bisect.bisect_left(self._sorted_tokens, token) + if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token: + self._sorted_tokens.pop(idx) ``` -The public view endpoint: -1. Looks up token via `get_share_by_token(token)` -2. Reads the file content -3. Renders markdown with redacted secrets -4. Returns simple HTML page (not SPA) with rendered content -5. 
Increments access count +Key: `bisect.insort` for insertion and `bisect.bisect_left` + `pop(idx)` for removal keep `_sorted_tokens` sorted in O(V) worst case (list shift) but this is negligible compared to O(N * content) rebuild. -### 4c. Frontend: Share button in file actions +### Step 3: Modify `get_inverted_index()` to NOT check staleness -**File:** `frontend/app.js` β€” in `renderFile()`, at **line ~3250** (after pop-out button): - -```javascript -const shareBtn = el("button", { class: "btn-action", title: "Partager" }, [icon("share-2", 14), document.createTextNode("Partager")]); -shareBtn.addEventListener("click", () => openShareDialog(data.vault, data.path)); -``` - -Add `shareBtn` to the file-actions div at **line ~3300**. - -Add `openShareDialog(vault, path)` function that: -- Calls `POST /api/share/{vault}` to create a share -- Shows a modal with the share URL (copyable) and expiration options -- Shows existing shares list with revoke buttons - -### 4d. Frontend: Share management in Configurations - -**File:** `frontend/index.html` β€” add share management section in config modal (alongside webhooks). - -**File:** `frontend/app.js` β€” `loadShares()` and `renderShares()` functions. - -**File:** `frontend/style.css` β€” add `.share-dialog`, `.share-url`, `.share-item` styles. - -**Dependency:** None. Standalone, but needs item 1 for clean models. - ---- - -## 5. πŸ”„ Gestion conflits Syncthing (P2) β€” 45 min - -### 5a. 
Backend: Conflict file detection - -**File:** `backend/indexer.py` β€” add after `_backlink_index`: +File: `backend/search.py` ```python -def get_conflicts() -> list: - """Scan all vaults for Syncthing/Nextcloud sync-conflict files.""" - conflicts = [] - pattern = re.compile(r'\.sync-conflict-(\d{8}-\d{6})\.') - for vname, vdata in index.items(): - for f in vdata.get("files", []): - m = pattern.search(f["path"]) - if m: - # Find the original file - orig_path = pattern.sub("", f["path"]) - conflicts.append({ - "vault": vname, - "conflict_path": f["path"], - "original_path": orig_path, - "conflict_date": m.group(1), - "conflict_title": f.get("title", ""), - }) - return conflicts +def get_inverted_index() -> InvertedIndex: + """Return the singleton inverted index. Always up-to-date via hooks.""" + return _inverted_index ``` -### 5b. Backend: Endpoints +Remove `is_stale()` and the `_source_generation` / `_last_rebuild` / `_rebuild_cooldown` fields. Keep `rebuild()` for initial build and manual reindex (still called once at startup via `build_index`). -**File:** `backend/main.py` β€” insert at **line ~2547**: +### Step 4: Call `rebuild()` once after initial index build + +In `backend/search.py`, register the hook AND call rebuild once: ```python -@app.get("/api/conflicts") -async def api_conflicts(current_user=Depends(require_auth)): - """List sync-conflict files across accessible vaults.""" - ... +# After InvertedIndex class and _inverted_index = InvertedIndex() -@app.post("/api/conflicts/resolve") -async def api_conflict_resolve(body: dict, current_user=Depends(require_auth)): - """Resolve a conflict: keep_local (delete conflict), keep_conflict (replace original).""" - ... 
+def _on_index_change_hook(action, vault_name, path, file_info): + inv = _inverted_index + try: + if action == 'add': + inv.add_document(vault_name, path, file_info) + elif action == 'remove': + inv.remove_document(vault_name, path) + except Exception as e: + logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}") + # Fallback: mark for rebuild on next search + inv._needs_rebuild = True + +_indexer.set_index_change_hook(_on_index_change_hook) + +# Initial build trigger β€” called after first index is built +def init_inverted_index(): + """Force initial inverted index build. Called after build_index completes.""" + _inverted_index.rebuild() + +def get_inverted_index() -> InvertedIndex: + """Return the singleton inverted index.""" + # Only check for rebuild if incremental updates have failed + # OR if this is the very first call (doc_count == 0 and index has files) + if getattr(_inverted_index, '_needs_rebuild', False): + _inverted_index.rebuild() + _inverted_index._needs_rebuild = False + elif _inverted_index.doc_count == 0 and any( + vdata.get("files") for vdata in index.values() + ): + _inverted_index.rebuild() + return _inverted_index ``` -### 5c. Backend: Diff endpoint +### Step 5: Call `init_inverted_index()` from `build_index` in main.py + +In `backend/main.py`, after `build_index()` completes in the lifespan handler, call: ```python -@app.get("/api/conflicts/diff") -async def api_conflict_diff(vault: str, original: str, conflict: str, current_user=Depends(require_auth)): - """Return unified diff between original and conflict file.""" - import difflib - ... +from backend.search import init_inverted_index +init_inverted_index() ``` -### 5d. Frontend: Conflict dashboard widget +This ensures the inverted index is built once on startup, then incrementally maintained thereafter. -**File:** `frontend/index.html` β€” add `#dashboard-conflicts-section` in dashboard after stats section. 
+### Tag prefix index handling -**File:** `frontend/app.js` — add `DashboardConflictsWidget` (pattern similar to recent/bookmarks): -- `load()` → `GET /api/conflicts` -- `render()` → shows conflict cards with file names and dates -- Click → opens diff modal showing side-by-side comparison -- Action buttons: "Garder l'original", "Garder le conflit" +The `tag_norm_map` and `tag_prefix_index` are built per-vault in `rebuild()`. For incremental updates, we need to handle tag changes: -**File:** `frontend/style.css` — add `.conflict-card`, `.conflict-diff`, `.conflict-actions` styles. +In `add_document`, after adding doc tags: +```python +# Check if any tags are new (not in tag_norm_map) +for tag in tags: + norm_tag = normalize_text(tag) + if norm_tag not in self.tag_norm_map: + self.tag_norm_map[norm_tag] = tag + for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1): + prefix = norm_tag[:plen] + if tag not in self.tag_prefix_index[prefix]: + self.tag_prefix_index[prefix].append(tag) +``` -**Dependency:** None. Standalone. +In `_remove_doc_internals`, we do NOT remove tags from `tag_norm_map` or `tag_prefix_index` — these hold the shared tag vocabulary (per-vault), not per-document state. They only grow over the lifetime of the inverted index. A full `rebuild()` on manual reindex will clean them up. ---- +### Step 6: Remove cooldown hack from search.py -## Execution Order (optimal) +Remove: +- `_last_rebuild` and `_rebuild_cooldown` fields from `InvertedIndex.__init__` +- `is_stale()` method +- Staleness use of the `_source_generation` field (no longer needed for staleness, but keep the field for diagnostics) -1. **Item 1** — OpenAPI docs (quick win, no risk) -2. **Item 2** — Dashboard stats (standalone, visible result) -3. **Item 3** — Webhooks (new module + integration, most code) -4. **Item 4** — Public shares (new module + public view, security-sensitive) -5.
**Item 5** — Syncthing conflicts (standalone, nice-to-have) +### Step 7: Remove coalescence hack from main.py -**Total estimated effort:** ~3 hours +In `_on_vault_change` in `backend/main.py`, remove: +```python +old_gen = idx._index_generation +... +if idx._index_generation > old_gen + 1: + idx._index_generation = old_gen + 1 +``` -## Files Summary +This hack was only needed to reduce the number of inverted index rebuilds. With incremental updates, it's unnecessary — each mutation is cheap. -| File | Action | Items | -|------|--------|-------| -| `backend/main.py` | Edit | 1 (models), 2a (endpoint), 3b+c (webhook CRUD+dispatch), 4b (share+public view), 5b+c (conflicts) | -| `backend/webhooks.py` | **Create** | 3a | -| `backend/share.py` | **Create** | 4a | -| `backend/indexer.py` | Edit | 5a (get_conflicts) | -| `frontend/index.html` | Edit | 2b, 3d, 4d, 5d (dashboard + config sections) | -| `frontend/app.js` | Edit | 2b, 3d, 4c, 5d (widgets + share button + webhook UI) | -| `frontend/style.css` | Edit | 2b, 3d, 4c, 5d (all new CSS classes) | +## Files Modified (Summary) + +| File | Changes | +|------|---------| +| `backend/indexer.py` | +`_on_index_change` hook variable, +`set_index_change_hook()`, +hook calls in `_add_file_to_structures` and `_remove_file_from_structures` | +| `backend/search.py` | +`add_document()`, +`remove_document()`, +`_remove_doc_internals()`, +`init_inverted_index()`, +hook registration, remove `is_stale()`/cooldown, simplify `get_inverted_index()` | +| `backend/main.py` | +`init_inverted_index()` call after `build_index()`, remove coalescence hack in `_on_vault_change` | + +## Risks & Edge Cases + +1. **Thread safety:** `_add_file_to_structures` and `_remove_file_from_structures` are protected by `_index_lock` / `_async_index_lock` in indexer.py. The InvertedIndex methods are called inside these locks, so they're also protected. No additional locking is needed for the hook-driven updates; the fallback `rebuild()` in `get_inverted_index()` is triggered from the search path, so confirm that it is covered by the same locks. + +2.
**Hook registration timing:** `search.py` imports `indexer.py` at the top, then later registers the hook. The hook is registered at module load time, BEFORE the first call to `build_index`. So `_on_index_change` is set when `build_index` runs — but `build_index` calls `_add_file_to_structures` internally, which would try to incrementally update an empty inverted index. **Fix:** The hook checks `if _inverted_index.doc_count == 0` and skips incremental updates (the explicit `_ready` flag described in item 3 supersedes this check and also covers a genuinely empty index); the initial `rebuild()` handles the bulk load. + +3. **Hook call during initial build_index:** `build_index` iterates files and calls `_add_file_to_structures`. The hook fires for each file, calling `add_document()` on an empty inverted index. This is slower than a single `rebuild()`. **Fix:** Add a flag `_inverted_index._ready = False` initially, set to True after `init_inverted_index()`. The hook skips when `_ready` is False. + +4. **Sorted tokens performance:** `bisect.insort` and `list.pop(idx)` are O(V) worst case for large V. For 40k files, the vocabulary size V is typically 50k-200k tokens. The O(V) cost of a single insertion is one contiguous shift of the token list, which should stay well under 1 ms at this size (to be confirmed by measurement), acceptable. The `rebuild()` call at startup handles the initial bulk load. + +5. **tag_norm_map / tag_prefix_index growth:** These grow monotonically (never shrink on incremental remove). With 40k files and thousands of tags, this is a few thousand entries — negligible. A manual "Réindexer" button triggers a full `rebuild()` to clean up.