diff --git a/backend/indexer.py b/backend/indexer.py index de24d76..806d637 100644 --- a/backend/indexer.py +++ b/backend/indexer.py @@ -599,6 +599,11 @@ def _remove_file_from_structures(vault_name: str, rel_path: str) -> Optional[Dic path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path] _index_generation += 1 + + # Notify inverted index for incremental update + if _on_index_change: + _on_index_change('remove', vault_name, rel_path, removed) + return removed @@ -666,6 +671,10 @@ def _add_file_to_structures(vault_name: str, file_info: Dict[str, Any]): _index_generation += 1 + # Notify inverted index for incremental update + if _on_index_change: + _on_index_change('add', vault_name, file_info["path"], file_info) + async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]: """Re-index a single file without full rebuild. diff --git a/backend/main.py b/backend/main.py index a973d9c..52c96c4 100644 --- a/backend/main.py +++ b/backend/main.py @@ -44,7 +44,7 @@ from backend.indexer import ( remove_vault_from_index, add_vault_to_index, ) -from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags +from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags, init_inverted_index from backend.image_processor import preprocess_images from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats from backend.vault_settings import ( @@ -379,14 +379,9 @@ async def _on_vault_change(events: list): Processes each event (create/modify/delete/move) and updates the index incrementally, then broadcasts SSE notifications. """ - import backend.indexer as idx updated_vaults = set() changes = [] - # Temporarily suppress per-file generation increments to coalesce them - # into a single increment at the end of the batch. 
- old_gen = idx._index_generation - for event in events: vault_name = event["vault"] event_type = event["type"] @@ -415,11 +410,6 @@ async def _on_vault_change(events: list): except Exception as e: logger.error(f"Error processing {event_type} event for {src}: {e}") - # Restore generation to only increment by 1 for the whole batch - # (unless it was already incremented by other operations) - if idx._index_generation > old_gen + 1: - idx._index_generation = old_gen + 1 - if changes: await sse_manager.broadcast("index_updated", { "vaults": list(updated_vaults), @@ -520,6 +510,9 @@ async def lifespan(app: FastAPI): logger.info("Background indexing started") await build_index(_progress_cb) + # Build inverted index for search (one-time, then incremental) + init_inverted_index() + # Start file watcher config = _load_config() watcher_enabled = config.get("watcher_enabled", True) diff --git a/backend/search.py b/backend/search.py index 4ee1a57..3ff27e4 100644 --- a/backend/search.py +++ b/backend/search.py @@ -239,9 +239,9 @@ def _escape_html(text: str) -> str: class InvertedIndex: """In-memory inverted index supporting TF-IDF scoring. - Built lazily from the global ``index`` dict whenever a search or - suggestion request detects that the underlying vault index has changed. - The class is designed to be a singleton β use ``get_inverted_index()``. + Built initially via ``rebuild()`` from the global index, then + maintained incrementally via ``add_document()`` / ``remove_document()`` + hooks from the file watcher and API mutations. Attributes: word_index: ``{token: {doc_key: term_frequency}}`` @@ -249,7 +249,6 @@ class InvertedIndex: tag_norm_map: ``{normalized_tag: original_tag}`` tag_prefix_index: ``{prefix: [original_tag, ...]}`` doc_count: Total number of indexed documents. - _source_id: Fingerprint of the source index to detect staleness. 
""" def __init__(self) -> None: @@ -264,23 +263,7 @@ class InvertedIndex: self.vault_docs: Dict[str, set] = defaultdict(set) self.tag_docs: Dict[str, set] = defaultdict(set) self._sorted_tokens: List[str] = [] - self._source_generation: int = -1 - self._last_rebuild: float = 0 - self._rebuild_cooldown: float = 3.0 # seconds - - def is_stale(self) -> bool: - """Check if the inverted index needs rebuilding. - - Uses a cooldown (3s) to prevent rapid rebuilds from file watcher - events. Staleness is only reported if the generation has changed - AND the cooldown has elapsed since the last rebuild. - """ - import time - if _indexer._index_generation == self._source_generation: - return False - if time.time() - self._last_rebuild < self._rebuild_cooldown: - return False - return True + self._ready: bool = False # True after initial build def rebuild(self) -> None: """Rebuild inverted index from the global ``index`` dict. @@ -288,9 +271,7 @@ class InvertedIndex: Tokenizes titles and content of every file, computes term frequencies, and builds auxiliary indexes for tag and title prefix suggestions. 
""" - import time - self._last_rebuild = time.time() - logger.info("Rebuilding inverted index...") + logger.info("Building inverted index...") self.word_index = defaultdict(dict) self.title_index = defaultdict(list) self.tag_norm_map = {} @@ -351,7 +332,7 @@ class InvertedIndex: self.tag_prefix_index[prefix].append(tag) self._sorted_tokens = sorted(self.word_index.keys()) - self._source_generation = _indexer._index_generation + self._ready = True logger.info( "Inverted index built: %d documents, %d unique tokens, %d tags", self.doc_count, @@ -359,6 +340,117 @@ class InvertedIndex: len(self.tag_norm_map), ) + def add_document(self, vault_name: str, path: str, file_info: dict): + """Add or update a single document incrementally.""" + if not self._ready: + return + doc_key = f"{vault_name}::{path}" + old_file_info = self.doc_info.get(doc_key) + if old_file_info is not None: + self._remove_doc_internals(doc_key, vault_name, old_file_info, skip_sorted_cleanup=True) + else: + self.doc_count += 1 + # Metadata + self.doc_info[doc_key] = file_info + self.doc_vault[doc_key] = vault_name + self.vault_docs[vault_name].add(doc_key) + # Tags + tags = file_info.get("tags", []) + for tag in tags: + self.tag_docs[tag.lower()].add(doc_key) + norm_tag = normalize_text(tag) + if norm_tag not in self.tag_norm_map: + self.tag_norm_map[norm_tag] = tag + for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1): + prefix = norm_tag[:plen] + if tag not in self.tag_prefix_index[prefix]: + self.tag_prefix_index[prefix].append(tag) + # Title tokens + title = file_info.get("title", "") + title_tokens = tokenize(title) + for token in set(title_tokens): + if token: + self.title_index[token].append(doc_key) + # Title norm map + norm_title = normalize_text(title) + if norm_title: + self.title_norm_map[norm_title].append({"vault": vault_name, "path": path, "title": title}) + # Word index (content + title TF) + content = file_info.get("content", "") + full_text = title + " " + content + tokens = 
tokenize(full_text) + tf: Dict[str, int] = defaultdict(int) + for token in tokens: + if token: + tf[token] += 1 + for token, freq in tf.items(): + if not self.word_index.get(token): + bisect.insort(self._sorted_tokens, token) + self.word_index[token][doc_key] = freq + + def remove_document(self, vault_name: str, path: str): + """Remove a single document incrementally.""" + if not self._ready: + return + doc_key = f"{vault_name}::{path}" + file_info = self.doc_info.get(doc_key) + if file_info is None: + return + self._remove_doc_internals(doc_key, vault_name, file_info, skip_sorted_cleanup=False) + self.doc_count -= 1 + + def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict, skip_sorted_cleanup: bool = False): + """Remove one doc_key from all indexes without adjusting doc_count.""" + # Metadata + self.doc_info.pop(doc_key, None) + self.doc_vault.pop(doc_key, None) + if vault_name in self.vault_docs: + self.vault_docs[vault_name].discard(doc_key) + # Tags (per-document, NOT the global tag_norm_map) + for tag in file_info.get("tags", []): + td = self.tag_docs.get(tag.lower()) + if td: + td.discard(doc_key) + if not td: + del self.tag_docs[tag.lower()] + # Title tokens + title = file_info.get("title", "") + for token in set(tokenize(title)): + if not token: + continue + ti = self.title_index.get(token) + if ti: + try: + ti.remove(doc_key) + except ValueError: + pass + if not ti: + del self.title_index[token] + # Title norm map + norm_title = normalize_text(title) + if norm_title and norm_title in self.title_norm_map: + self.title_norm_map[norm_title] = [ + e for e in self.title_norm_map[norm_title] + if not (e["vault"] == vault_name and e["path"] == file_info.get("path")) + ] + if not self.title_norm_map[norm_title]: + del self.title_norm_map[norm_title] + # Word index + content = file_info.get("content", "") + full_text = title + " " + content + for token in set(tokenize(full_text)): + if not token: + continue + wi = 
self.word_index.get(token) + if wi: + wi.pop(doc_key, None) + if not wi: + del self.word_index[token] + if not skip_sorted_cleanup: + idx = bisect.bisect_left(self._sorted_tokens, token) + if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token: + self._sorted_tokens.pop(idx) + def idf(self, term: str) -> float: """Inverse Document Frequency for a term. @@ -424,9 +516,39 @@ class InvertedIndex: _inverted_index = InvertedIndex() +def _on_index_change_hook(action: str, vault_name: str, path: str, file_info: dict): + """Callback registered with indexer for incremental inverted index updates.""" + inv = _inverted_index + try: + if action == 'add' and file_info: + inv.add_document(vault_name, path, file_info) + elif action == 'remove': + inv.remove_document(vault_name, path) + except Exception as e: + logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}") + + +# Register the hook with indexer (indexer is already imported at top of file) +_indexer.set_index_change_hook(_on_index_change_hook) + + +def init_inverted_index(): + """Force initial inverted index build. Called after build_index completes on startup.""" + if any(vdata.get("files") for vdata in index.values()): + _inverted_index.rebuild() + logger.info("Inverted index initialized.") + + def get_inverted_index() -> InvertedIndex: - """Return the singleton inverted index, rebuilding if stale.""" - if _inverted_index.is_stale(): + """Return the singleton inverted index. + + Auto-builds on first call if the index has files but the inverted + index hasn't been built yet (fallback for paths that don't go through + ``init_inverted_index()``). 
+ """ + if _inverted_index.doc_count == 0 and any( + vdata.get("files") for vdata in index.values() + ): _inverted_index.rebuild() return _inverted_index diff --git a/plan.md b/plan.md index df32609..73acc35 100644 --- a/plan.md +++ b/plan.md @@ -1,375 +1,348 @@ -# Implementation Plan — Remaining Roadmap Items +# Plan: Incremental InvertedIndex for 40k+ files -## 1. π Documentation OpenAPI enrichie (P3) — 5 min +## Problem Summary -**Goal:** Add `Field(description=...)` to all Pydantic models without descriptions in `backend/main.py`. +Every file mutation calls `_add_file_to_structures` / `_remove_file_from_structures` in `backend/indexer.py`, which increments `_index_generation`. When the next search or autocomplete fires, `get_inverted_index()` in `backend/search.py` detects staleness (`is_stale()` returns True) and triggers a full `rebuild()` — O(N) tokenization of ALL files. With 40k+ files this takes 2-5 seconds, making search unusable. -**Models to update (lines 89–311):** +The existing 3-second cooldown hack in `is_stale()` only masks the problem; it doesn't fix it.
-| Line | Model | Fields to annotate | -|------|-------|--------------------| -| 89 | `FileContentResponse` | `vault`, `path`, `title`, `tags`, `frontmatter`, `html`, `raw_length`, `extension`, `is_markdown`, `unsupported`, `size_bytes` | -| 103 | `FileRawResponse` | `vault`, `path`, `raw` | -| 110 | `FileSaveResponse` | `status`, `vault`, `path`, `size` | -| 118 | `FileDeleteResponse` | `status`, `vault`, `path` | -| 125 | `SearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` | -| 136 | `SearchResponse` | `query`, `vault_filter`, `tag_filter`, `count`, `results` (total, offset, limit already have Field) | -| 146 | `TagsResponse` | `vault_filter`, `tags` | -| 152 | `TreeSearchResult` | `vault`, `path`, `name`, `matched_path` (type has Field) | -| 161 | `TreeSearchResponse` | `query`, `vault_filter`, `results` | -| 168 | `AdvancedSearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` | -| 179 | `SearchFacets` | `tags`, `vaults` (already have default_factory) | -| 185 | `AdvancedSearchResponse` | `results`, `total`, `offset`, `limit`, `facets` (query_time_ms has Field) | -| 196 | `TitleSuggestion` | `vault`, `path`, `title` | -| 203 | `SuggestResponse` | `query`, `suggestions` | -| 209 | `TagSuggestion` | `tag`, `count` | -| 215 | `TagSuggestResponse` | `query`, `suggestions` | -| 221 | `GraphNode` | (all fields already have Field) | -| 231 | `GraphEdge` | (all fields already have Field) | -| 239 | `GraphResponse` | `vault`, `path`, `nodes`, `edges` | -| 247 | `ReloadResponse` | `status`, `vaults` | -| 253 | `HealthResponse` | `status`, `version`, `vaults`, `total_files` | -| 265 | `DirectoryCreateResponse` | `success`, `path` | -| 284 | `DirectoryDeleteResponse` | `success`, `deleted_count` | -| 296 | `FileCreateResponse` | `success`, `path` | -| 307 | `FileRenameResponse` | `success` | +## Solution: Incremental Add/Remove on the InvertedIndex -**Dependency:** None. Pure documentation change. 
+Add two methods to `InvertedIndex` that update ALL internal data structures incrementally when a single file is added, modified, or removed: ---- +- `add_document(vault_name, path, file_info)` — called on create/modify +- `remove_document(vault_name, path)` — called on delete/move-source -## 2. π Dashboard statistiques (P3) — 30 min +Then hook these into `_add_file_to_structures` and `_remove_file_from_structures` in `backend/indexer.py` so the inverted index never goes stale. -### 2a. Backend: `GET /api/dashboard` (new endpoint) +Remove the `is_stale()` staleness check and the cooldown mechanism entirely; `rebuild()` is kept only for the initial build. After that, the inverted index is always current. -**File:** `backend/main.py` — insert at **line ~2547** (after `/api/diagnostics`) +## Dependency Architecture + +**Current import chain:** +``` +main.py → search.py → indexer.py (search.py imports `from backend import indexer as _indexer`) +``` + +**Problem:** `indexer.py` currently does NOT import from `search.py`. If we add `from backend.search import get_inverted_index` to indexer.py, we create a circular import: `search.py → indexer.py → search.py`. + +**Fix — Option C (Callback/Hook pattern, simplest):** + +Add a module-level hook variable in `backend/indexer.py`: ```python -@app.get("/api/dashboard") -async def api_dashboard(current_user=Depends(require_auth)): - """Aggregated dashboard statistics across all accessible vaults.""" - from backend.indexer import index, vault_config, path_index - user_vaults = current_user.get("_token_vaults") or current_user.get("vaults", []) +# In backend/indexer.py +_on_index_change: callable = None # Called as (action, vault_name, path, file_info_or_None) - vault_stats = [] - total_files = 0 - total_tags = set() - total_size = 0 +def set_index_change_hook(hook): + """Register a callback for incremental index updates. + hook(action, vault_name, path, file_info_or_None) where action is 'add' or 'remove'.
+ """ + global _on_index_change + _on_index_change = hook +``` - for vname, vdata in index.items(): - if "*" not in user_vaults and vname not in user_vaults: - continue - files = vdata.get("files", []) - file_count = len(files) - total_files += file_count - tags = set() - for f in files: - tags.update(f.get("tags", [])) - total_size += f.get("size", 0) - total_tags.update(tags) - vault_stats.append({ - "name": vname, - "file_count": file_count, - "tag_count": len(tags), - "total_size_bytes": sum(f.get("size", 0) for f in files), +Then at the end of `_add_file_to_structures`: +```python +if _on_index_change: + _on_index_change('add', vault_name, rel_path, file_info) +``` + +At the end of `_remove_file_from_structures`: +```python +if _on_index_change: + _on_index_change('remove', vault_name, rel_path, file_info) # file_info = removed dict or None +``` + +Then in `backend/search.py`, at module load time (after InvertedIndex class is defined): +```python +def _on_index_change_hook(action, vault_name, path, file_info): + inv = get_inverted_index_raw() # get without rebuild check + if action == 'add': + inv.add_document(vault_name, path, file_info) + elif action == 'remove': + inv.remove_document(vault_name, path) + +# Register the hook — this triggers an import of indexer, but indexer is already imported +# by the time this line runs (since search.py does `from backend import indexer as _indexer` above) +_indexer.set_index_change_hook(_on_index_change_hook) +``` + +This avoids circular imports completely because: +1. `search.py` already imports `indexer.py` at the top (`from backend import indexer as _indexer`) +2. `indexer.py` never imports `search.py` — it just stores a callback +3. 
`search.py` registers the callback AFTER the InvertedIndex class is defined + +## Detailed Implementation + +### Step 1: Add hook variable to `backend/indexer.py` + +File: `backend/indexer.py` +Changes: +- After `_index_generation` global (line ~28), add: + ```python + _on_index_change: callable = None + ``` +- Add function `set_index_change_hook(hook)` (bottom of file, near other public functions) +- Add `if _on_index_change: _on_index_change('add', vault_name, file_info['path'], file_info)` at end of `_add_file_to_structures` (~line 665) +- Add `if _on_index_change: _on_index_change('remove', vault_name, rel_path, removed)` at end of `_remove_file_from_structures` (~line 597) + +### Step 2: Add `add_document` and `remove_document` to InvertedIndex + +File: `backend/search.py` + +#### `add_document(vault_name, path, file_info)` + +```python +def add_document(self, vault_name: str, path: str, file_info: dict): + """Add or update a single document in the inverted index.""" + + doc_key = f"{vault_name}::{path}" + old_file_info = self.doc_info.get(doc_key) + + # If updating an existing document, remove old entries first + if old_file_info is not None: + self._remove_doc_internals(doc_key, vault_name, old_file_info) + else: + self.doc_count += 1 + + # --- Metadata --- + self.doc_info[doc_key] = file_info + self.doc_vault[doc_key] = vault_name + self.vault_docs[vault_name].add(doc_key) + + # --- Tags --- + tags = file_info.get("tags", []) + for tag in tags: + self.tag_docs[tag.lower()].add(doc_key) + + # --- Title tokens --- + title = file_info.get("title", "") + title_tokens = tokenize(title) + for token in set(title_tokens): + self.title_index[token].append(doc_key) + + # --- Normalized title for prefix suggestions --- + norm_title = normalize_text(title) + if norm_title: + self.title_norm_map[norm_title].append({ + "vault": vault_name, + "path": path, + "title": title, }) - - return { - "vaults": vault_stats, - "total_files": total_files, - "total_tags": 
len(total_tags), - "total_size_bytes": total_size, - } + + # --- Word index (content + title TF) --- + content = file_info.get("content", "") + full_text = title + " " + content + tokens = tokenize(full_text) + tf = defaultdict(int) + for token in tokens: + tf[token] += 1 + + # Track which tokens are new (not previously indexed) for sorted_tokens update + new_tokens = [] + for token, freq in tf.items(): + if not self.word_index.get(token): + new_tokens.append(token) + self.word_index[token][doc_key] = freq + + # Incrementally update _sorted_tokens (avoid O(V log V) full re-sort) + if new_tokens: + for token in new_tokens: + bisect.insort(self._sorted_tokens, token) ``` -**No new model needed** — return plain dict (or add optional `DashboardResponse` model). - -### 2b. Frontend: Insert stats widget in dashboard-home - -**File:** `frontend/index.html` — **after line 364** (`` closing bookmarks section, before ``) - -Add: -```html - -
Notifications HTTP vers des services externes lors des changements de fichiers.
- -