Switch inverted index from stale check to incremental updates

Register a hook with the indexer so that file add/remove events
incrementally maintain the inverted index, removing the need for
periodic staleness checks and cooldowns. Rebuild the index once on
startup via init_inverted_index().
This commit is contained in:
Bruno Charest 2026-05-26 12:37:59 -04:00
parent 872a3e56dd
commit 775722f5d4
4 changed files with 460 additions and 363 deletions

View File

@ -599,6 +599,11 @@ def _remove_file_from_structures(vault_name: str, rel_path: str) -> Optional[Dic
path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path] path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path]
_index_generation += 1 _index_generation += 1
# Notify inverted index for incremental update
if _on_index_change:
_on_index_change('remove', vault_name, rel_path, removed)
return removed return removed
@ -666,6 +671,10 @@ def _add_file_to_structures(vault_name: str, file_info: Dict[str, Any]):
_index_generation += 1 _index_generation += 1
# Notify inverted index for incremental update
if _on_index_change:
_on_index_change('add', vault_name, file_info["path"], file_info)
async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]: async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]:
"""Re-index a single file without full rebuild. """Re-index a single file without full rebuild.

View File

@ -44,7 +44,7 @@ from backend.indexer import (
remove_vault_from_index, remove_vault_from_index,
add_vault_to_index, add_vault_to_index,
) )
from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags, init_inverted_index
from backend.image_processor import preprocess_images from backend.image_processor import preprocess_images
from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats
from backend.vault_settings import ( from backend.vault_settings import (
@ -379,14 +379,9 @@ async def _on_vault_change(events: list):
Processes each event (create/modify/delete/move) and updates the index Processes each event (create/modify/delete/move) and updates the index
incrementally, then broadcasts SSE notifications. incrementally, then broadcasts SSE notifications.
""" """
import backend.indexer as idx
updated_vaults = set() updated_vaults = set()
changes = [] changes = []
# Temporarily suppress per-file generation increments to coalesce them
# into a single increment at the end of the batch.
old_gen = idx._index_generation
for event in events: for event in events:
vault_name = event["vault"] vault_name = event["vault"]
event_type = event["type"] event_type = event["type"]
@ -415,11 +410,6 @@ async def _on_vault_change(events: list):
except Exception as e: except Exception as e:
logger.error(f"Error processing {event_type} event for {src}: {e}") logger.error(f"Error processing {event_type} event for {src}: {e}")
# Restore generation to only increment by 1 for the whole batch
# (unless it was already incremented by other operations)
if idx._index_generation > old_gen + 1:
idx._index_generation = old_gen + 1
if changes: if changes:
await sse_manager.broadcast("index_updated", { await sse_manager.broadcast("index_updated", {
"vaults": list(updated_vaults), "vaults": list(updated_vaults),
@ -520,6 +510,9 @@ async def lifespan(app: FastAPI):
logger.info("Background indexing started") logger.info("Background indexing started")
await build_index(_progress_cb) await build_index(_progress_cb)
# Build inverted index for search (one-time, then incremental)
init_inverted_index()
# Start file watcher # Start file watcher
config = _load_config() config = _load_config()
watcher_enabled = config.get("watcher_enabled", True) watcher_enabled = config.get("watcher_enabled", True)

View File

@ -239,9 +239,9 @@ def _escape_html(text: str) -> str:
class InvertedIndex: class InvertedIndex:
"""In-memory inverted index supporting TF-IDF scoring. """In-memory inverted index supporting TF-IDF scoring.
Built lazily from the global ``index`` dict whenever a search or Built initially via ``rebuild()`` from the global index, then
suggestion request detects that the underlying vault index has changed. maintained incrementally via ``add_document()`` / ``remove_document()``
The class is designed to be a singleton — use ``get_inverted_index()``. hooks from the file watcher and API mutations.
Attributes: Attributes:
word_index: ``{token: {doc_key: term_frequency}}`` word_index: ``{token: {doc_key: term_frequency}}``
@ -249,7 +249,6 @@ class InvertedIndex:
tag_norm_map: ``{normalized_tag: original_tag}`` tag_norm_map: ``{normalized_tag: original_tag}``
tag_prefix_index: ``{prefix: [original_tag, ...]}`` tag_prefix_index: ``{prefix: [original_tag, ...]}``
doc_count: Total number of indexed documents. doc_count: Total number of indexed documents.
_source_id: Fingerprint of the source index to detect staleness.
""" """
def __init__(self) -> None: def __init__(self) -> None:
@ -264,23 +263,7 @@ class InvertedIndex:
self.vault_docs: Dict[str, set] = defaultdict(set) self.vault_docs: Dict[str, set] = defaultdict(set)
self.tag_docs: Dict[str, set] = defaultdict(set) self.tag_docs: Dict[str, set] = defaultdict(set)
self._sorted_tokens: List[str] = [] self._sorted_tokens: List[str] = []
self._source_generation: int = -1 self._ready: bool = False # True after initial build
self._last_rebuild: float = 0
self._rebuild_cooldown: float = 3.0 # seconds
def is_stale(self) -> bool:
"""Check if the inverted index needs rebuilding.
Uses a cooldown (3s) to prevent rapid rebuilds from file watcher
events. Staleness is only reported if the generation has changed
AND the cooldown has elapsed since the last rebuild.
"""
import time
if _indexer._index_generation == self._source_generation:
return False
if time.time() - self._last_rebuild < self._rebuild_cooldown:
return False
return True
def rebuild(self) -> None: def rebuild(self) -> None:
"""Rebuild inverted index from the global ``index`` dict. """Rebuild inverted index from the global ``index`` dict.
@ -288,9 +271,7 @@ class InvertedIndex:
Tokenizes titles and content of every file, computes term frequencies, Tokenizes titles and content of every file, computes term frequencies,
and builds auxiliary indexes for tag and title prefix suggestions. and builds auxiliary indexes for tag and title prefix suggestions.
""" """
import time logger.info("Building inverted index...")
self._last_rebuild = time.time()
logger.info("Rebuilding inverted index...")
self.word_index = defaultdict(dict) self.word_index = defaultdict(dict)
self.title_index = defaultdict(list) self.title_index = defaultdict(list)
self.tag_norm_map = {} self.tag_norm_map = {}
@ -351,7 +332,7 @@ class InvertedIndex:
self.tag_prefix_index[prefix].append(tag) self.tag_prefix_index[prefix].append(tag)
self._sorted_tokens = sorted(self.word_index.keys()) self._sorted_tokens = sorted(self.word_index.keys())
self._source_generation = _indexer._index_generation self._ready = True
logger.info( logger.info(
"Inverted index built: %d documents, %d unique tokens, %d tags", "Inverted index built: %d documents, %d unique tokens, %d tags",
self.doc_count, self.doc_count,
@ -359,6 +340,117 @@ class InvertedIndex:
len(self.tag_norm_map), len(self.tag_norm_map),
) )
def add_document(self, vault_name: str, path: str, file_info: dict):
    """Add or update a single document incrementally.

    Does nothing until the initial build has set ``_ready``. When the
    document key is already present, its previous entries are removed from
    every structure before the new ones are written, so ``doc_count`` only
    grows for genuinely new documents.

    Args:
        vault_name: Name of the vault that owns the document.
        path: Document path; combined with ``vault_name`` as
            ``"{vault_name}::{path}"`` to form the document key.
        file_info: Document record. The keys read here are ``"tags"``,
            ``"title"`` and ``"content"``; each is optional.
    """
    if not self._ready:
        return

    doc_key = f"{vault_name}::{path}"
    old_file_info = self.doc_info.get(doc_key)
    if old_file_info is not None:
        # Run the full cleanup, including ``_sorted_tokens``. Skipping it
        # left tokens in ``_sorted_tokens`` after they were deleted from
        # ``word_index``; the insert below then added them a second time,
        # and tokens no longer present in the new text were never dropped.
        self._remove_doc_internals(doc_key, vault_name, old_file_info, skip_sorted_cleanup=False)
    else:
        self.doc_count += 1

    # Metadata
    self.doc_info[doc_key] = file_info
    self.doc_vault[doc_key] = vault_name
    self.vault_docs[vault_name].add(doc_key)

    # Tags
    tags = file_info.get("tags", [])
    for tag in tags:
        self.tag_docs[tag.lower()].add(doc_key)
        norm_tag = normalize_text(tag)
        if norm_tag not in self.tag_norm_map:
            self.tag_norm_map[norm_tag] = tag
        # NOTE(review): the reviewed text had lost its indentation, so it is
        # unclear whether this prefix loop ran for every tag or only for
        # newly registered ones. It is kept at tag level because the
        # membership guard makes repeated runs harmless — confirm.
        for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
            prefix = norm_tag[:plen]
            if tag not in self.tag_prefix_index[prefix]:
                self.tag_prefix_index[prefix].append(tag)

    # Title tokens
    title = file_info.get("title", "")
    title_tokens = tokenize(title)
    for token in set(title_tokens):
        if token:
            self.title_index[token].append(doc_key)

    # Title norm map
    norm_title = normalize_text(title)
    if norm_title:
        self.title_norm_map[norm_title].append({"vault": vault_name, "path": path, "title": title})

    # Word index (content + title TF)
    content = file_info.get("content", "")
    full_text = title + " " + content
    tokens = tokenize(full_text)
    tf: Dict[str, int] = defaultdict(int)
    for token in tokens:
        if token:
            tf[token] += 1

    for token, freq in tf.items():
        if not self.word_index.get(token):
            # Insert only when the token is really absent, so the sorted
            # list can never accumulate duplicates.
            pos = bisect.bisect_left(self._sorted_tokens, token)
            if pos == len(self._sorted_tokens) or self._sorted_tokens[pos] != token:
                self._sorted_tokens.insert(pos, token)
        self.word_index[token][doc_key] = freq
def remove_document(self, vault_name: str, path: str):
    """Remove a single document incrementally.

    Does nothing before the initial build has set ``_ready`` or when the
    document key ``"{vault_name}::{path}"`` is not in ``doc_info``.
    Otherwise the document is removed from every structure, including the
    sorted token list, and ``doc_count`` is decremented.
    """
    if self._ready:
        doc_key = f"{vault_name}::{path}"
        known_info = self.doc_info.get(doc_key)
        if known_info is not None:
            self._remove_doc_internals(doc_key, vault_name, known_info, skip_sorted_cleanup=False)
            self.doc_count -= 1
def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict, skip_sorted_cleanup: bool = False):
    """Remove one doc_key from all indexes without adjusting doc_count.

    Args:
        doc_key: Key of the document to remove.
        vault_name: Vault the document belongs to.
        file_info: The record the document was indexed with; its
            ``"tags"``, ``"title"``, ``"content"`` and ``"path"`` entries
            are used to locate what has to be removed.
        skip_sorted_cleanup: When true, tokens whose ``word_index`` entry
            becomes empty are still deleted from ``word_index`` but are
            left in ``_sorted_tokens``.
    """
    # Per-document metadata
    self.doc_info.pop(doc_key, None)
    self.doc_vault.pop(doc_key, None)
    if vault_name in self.vault_docs:
        self.vault_docs[vault_name].discard(doc_key)

    # Tag membership only; tag_norm_map and tag_prefix_index are not touched.
    for tag in file_info.get("tags", []):
        tag_key = tag.lower()
        members = self.tag_docs.get(tag_key)
        if not members:
            continue
        members.discard(doc_key)
        if not members:
            del self.tag_docs[tag_key]

    # Title tokens
    title = file_info.get("title", "")
    for token in set(tokenize(title)):
        if not token:
            continue
        title_docs = self.title_index.get(token)
        if not title_docs:
            continue
        if doc_key in title_docs:
            title_docs.remove(doc_key)
        if not title_docs:
            del self.title_index[token]

    # Normalized-title entries
    norm_title = normalize_text(title)
    if norm_title and norm_title in self.title_norm_map:
        remaining = [
            entry for entry in self.title_norm_map[norm_title]
            if entry["vault"] != vault_name or entry["path"] != file_info.get("path")
        ]
        if remaining:
            self.title_norm_map[norm_title] = remaining
        else:
            del self.title_norm_map[norm_title]

    # Word index, plus the sorted token list unless the caller opted out
    content = file_info.get("content", "")
    for token in set(tokenize(title + " " + content)):
        if not token:
            continue
        postings = self.word_index.get(token)
        if not postings:
            continue
        postings.pop(doc_key, None)
        if postings:
            continue
        del self.word_index[token]
        if skip_sorted_cleanup:
            continue
        pos = bisect.bisect_left(self._sorted_tokens, token)
        if pos < len(self._sorted_tokens) and self._sorted_tokens[pos] == token:
            self._sorted_tokens.pop(pos)
def idf(self, term: str) -> float: def idf(self, term: str) -> float:
"""Inverse Document Frequency for a term. """Inverse Document Frequency for a term.
@ -424,9 +516,39 @@ class InvertedIndex:
_inverted_index = InvertedIndex() _inverted_index = InvertedIndex()
def _on_index_change_hook(action: str, vault_name: str, path: str, file_info: dict):
    """Forward an indexer change notification to the inverted index.

    ``'remove'`` removes the document. ``'add'`` adds or updates it, but
    only when ``file_info`` is truthy. Any other action is ignored. An
    exception raised by the update is logged as a warning and not
    propagated to the caller.
    """
    target = _inverted_index
    try:
        if action == 'remove':
            target.remove_document(vault_name, path)
        elif action == 'add' and file_info:
            target.add_document(vault_name, path, file_info)
    except Exception as e:
        logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
# Register the hook with indexer (indexer is already imported at top of file)
_indexer.set_index_change_hook(_on_index_change_hook)
def init_inverted_index():
"""Force initial inverted index build. Called after build_index completes on startup.

The rebuild only runs when at least one vault in the global ``index``
has a non-empty ``"files"`` entry.
"""
# Skip the rebuild while no vault has any files.
if any(vdata.get("files") for vdata in index.values()):
_inverted_index.rebuild()
# NOTE(review): indentation was lost in this view, so it is unclear whether
# this message is logged only after a rebuild or unconditionally — confirm.
logger.info("Inverted index initialized.")
def get_inverted_index() -> InvertedIndex: def get_inverted_index() -> InvertedIndex:
"""Return the singleton inverted index, rebuilding if stale.""" """Return the singleton inverted index.
if _inverted_index.is_stale():
Auto-builds on first call if the index has files but the inverted
index hasn't been built yet (fallback for paths that don't go through
``init_inverted_index()``).
"""
if _inverted_index.doc_count == 0 and any(
vdata.get("files") for vdata in index.values()
):
_inverted_index.rebuild() _inverted_index.rebuild()
return _inverted_index return _inverted_index

631
plan.md
View File

@ -1,375 +1,348 @@
# Implementation Plan — Remaining Roadmap Items # Plan: Incremental InvertedIndex for 40k+ files
## 1. 📝 Documentation OpenAPI enrichie (P3) — 5 min ## Problem Summary
**Goal:** Add `Field(description=...)` to all Pydantic models without descriptions in `backend/main.py`. Every file mutation calls `_add_file_to_structures` / `_remove_file_from_structures` in `backend/indexer.py`, which increments `_index_generation`. When the next search or autocomplete fires, `get_inverted_index()` in `backend/search.py` detects staleness (`is_stale()` returns True) and triggers a full `rebuild()` — O(N) tokenization of ALL files. With 40k+ files this takes 2-5 seconds, making search unusable.
**Models to update (lines 89–311):** The existing 3-second cooldown hack in `is_stale()` only masks the problem; it doesn't fix it.
| Line | Model | Fields to annotate | ## Solution: Incremental Add/Remove on the InvertedIndex
|------|-------|--------------------|
| 89 | `FileContentResponse` | `vault`, `path`, `title`, `tags`, `frontmatter`, `html`, `raw_length`, `extension`, `is_markdown`, `unsupported`, `size_bytes` |
| 103 | `FileRawResponse` | `vault`, `path`, `raw` |
| 110 | `FileSaveResponse` | `status`, `vault`, `path`, `size` |
| 118 | `FileDeleteResponse` | `status`, `vault`, `path` |
| 125 | `SearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
| 136 | `SearchResponse` | `query`, `vault_filter`, `tag_filter`, `count`, `results` (total, offset, limit already have Field) |
| 146 | `TagsResponse` | `vault_filter`, `tags` |
| 152 | `TreeSearchResult` | `vault`, `path`, `name`, `matched_path` (type has Field) |
| 161 | `TreeSearchResponse` | `query`, `vault_filter`, `results` |
| 168 | `AdvancedSearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
| 179 | `SearchFacets` | `tags`, `vaults` (already have default_factory) |
| 185 | `AdvancedSearchResponse` | `results`, `total`, `offset`, `limit`, `facets` (query_time_ms has Field) |
| 196 | `TitleSuggestion` | `vault`, `path`, `title` |
| 203 | `SuggestResponse` | `query`, `suggestions` |
| 209 | `TagSuggestion` | `tag`, `count` |
| 215 | `TagSuggestResponse` | `query`, `suggestions` |
| 221 | `GraphNode` | (all fields already have Field) |
| 231 | `GraphEdge` | (all fields already have Field) |
| 239 | `GraphResponse` | `vault`, `path`, `nodes`, `edges` |
| 247 | `ReloadResponse` | `status`, `vaults` |
| 253 | `HealthResponse` | `status`, `version`, `vaults`, `total_files` |
| 265 | `DirectoryCreateResponse` | `success`, `path` |
| 284 | `DirectoryDeleteResponse` | `success`, `deleted_count` |
| 296 | `FileCreateResponse` | `success`, `path` |
| 307 | `FileRenameResponse` | `success` |
**Dependency:** None. Pure documentation change. Add two methods to `InvertedIndex` that update ALL internal data structures incrementally when a single file is added, modified, or removed:
--- - `add_document(vault_name, path, file_info)` — called on create/modify
- `remove_document(vault_name, path)` — called on delete/move-source
## 2. 📊 Dashboard statistiques (P3) — 30 min Then hook these into `_add_file_to_structures` and `_remove_file_from_structures` in `backend/indexer.py` so the inverted index never goes stale.
### 2a. Backend: `GET /api/dashboard` (new endpoint) Remove the `is_stale()` / `rebuild()` / cooldown mechanism entirely. The inverted index is always current.
**File:** `backend/main.py` — insert at **line ~2547** (after `/api/diagnostics`) ## Dependency Architecture
**Current import chain:**
```
main.py → search.py → indexer.py (search.py imports `from backend import indexer as _indexer`)
```
**Problem:** `indexer.py` currently does NOT import from `search.py`. If we add `from backend.search import get_inverted_index` to indexer.py, we create a circular import: `search.py → indexer.py → search.py`.
**Fix — Option C (Callback/Hook pattern, simplest):**
Add a module-level hook variable in `backend/indexer.py`:
```python ```python
@app.get("/api/dashboard") # In backend/indexer.py
async def api_dashboard(current_user=Depends(require_auth)): _on_index_change: callable = None # Called as (action, vault_name, path, file_info_or_None)
"""Aggregated dashboard statistics across all accessible vaults."""
from backend.indexer import index, vault_config, path_index
user_vaults = current_user.get("_token_vaults") or current_user.get("vaults", [])
vault_stats = [] def set_index_change_hook(hook):
total_files = 0 """Register a callback for incremental index updates.
total_tags = set() hook(action, vault_name, path, file_info_or_None) where action is 'add' or 'remove'.
total_size = 0 """
global _on_index_change
_on_index_change = hook
```
for vname, vdata in index.items(): Then at the end of `_add_file_to_structures`:
if "*" not in user_vaults and vname not in user_vaults: ```python
continue if _on_index_change:
files = vdata.get("files", []) _on_index_change('add', vault_name, rel_path, file_info)
file_count = len(files) ```
total_files += file_count
tags = set() At the end of `_remove_file_from_structures`:
for f in files: ```python
tags.update(f.get("tags", [])) if _on_index_change:
total_size += f.get("size", 0) _on_index_change('remove', vault_name, rel_path, file_info) # file_info = removed dict or None
total_tags.update(tags) ```
vault_stats.append({
"name": vname, Then in `backend/search.py`, at module load time (after InvertedIndex class is defined):
"file_count": file_count, ```python
"tag_count": len(tags), def _on_index_change_hook(action, vault_name, path, file_info):
"total_size_bytes": sum(f.get("size", 0) for f in files), inv = get_inverted_index_raw() # get without rebuild check
if action == 'add':
inv.add_document(vault_name, path, file_info)
elif action == 'remove':
inv.remove_document(vault_name, path)
# Register the hook — this triggers an import of indexer, but indexer is already imported
# by the time this line runs (since search.py does `from backend import indexer as _indexer` above)
_indexer.set_index_change_hook(_on_index_change_hook)
```
This avoids circular imports completely because:
1. `search.py` already imports `indexer.py` at the top (`from backend import indexer as _indexer`)
2. `indexer.py` never imports `search.py` — it just stores a callback
3. `search.py` registers the callback AFTER the InvertedIndex class is defined
## Detailed Implementation
### Step 1: Add hook variable to `backend/indexer.py`
File: `backend/indexer.py`
Changes:
- After `_index_generation` global (line ~28), add:
```python
_on_index_change: callable = None
```
- Add function `set_index_change_hook(hook)` (bottom of file, near other public functions)
- Add `if _on_index_change: _on_index_change('add', vault_name, file_info['path'], file_info)` at end of `_add_file_to_structures` (~line 665)
- Add `if _on_index_change: _on_index_change('remove', vault_name, rel_path, removed)` at end of `_remove_file_from_structures` (~line 597)
### Step 2: Add `add_document` and `remove_document` to InvertedIndex
File: `backend/search.py`
#### `add_document(vault_name, path, file_info)`
```python
def add_document(self, vault_name: str, path: str, file_info: dict):
"""Add or update a single document in the inverted index."""
doc_key = f"{vault_name}::{path}"
old_file_info = self.doc_info.get(doc_key)
# If updating an existing document, remove old entries first
if old_file_info is not None:
self._remove_doc_internals(doc_key, vault_name, old_file_info)
else:
self.doc_count += 1
# --- Metadata ---
self.doc_info[doc_key] = file_info
self.doc_vault[doc_key] = vault_name
self.vault_docs[vault_name].add(doc_key)
# --- Tags ---
tags = file_info.get("tags", [])
for tag in tags:
self.tag_docs[tag.lower()].add(doc_key)
# --- Title tokens ---
title = file_info.get("title", "")
title_tokens = tokenize(title)
for token in set(title_tokens):
self.title_index[token].append(doc_key)
# --- Normalized title for prefix suggestions ---
norm_title = normalize_text(title)
if norm_title:
self.title_norm_map[norm_title].append({
"vault": vault_name,
"path": path,
"title": title,
}) })
return { # --- Word index (content + title TF) ---
"vaults": vault_stats, content = file_info.get("content", "")
"total_files": total_files, full_text = title + " " + content
"total_tags": len(total_tags), tokens = tokenize(full_text)
"total_size_bytes": total_size, tf = defaultdict(int)
} for token in tokens:
tf[token] += 1
# Track which tokens are new (not previously indexed) for sorted_tokens update
new_tokens = []
for token, freq in tf.items():
if not self.word_index.get(token):
new_tokens.append(token)
self.word_index[token][doc_key] = freq
# Incrementally update _sorted_tokens (avoid O(V log V) full re-sort)
if new_tokens:
for token in new_tokens:
bisect.insort(self._sorted_tokens, token)
``` ```
**No new model needed** — return plain dict (or add optional `DashboardResponse` model). #### `remove_document(vault_name, path)`
### 2b. Frontend: Insert stats widget in dashboard-home
**File:** `frontend/index.html` — **after line 364** (`</div>` closing bookmarks section, before `<!-- Recently Opened Section -->`)
Add:
```html
<!-- Stats Section -->
<div id="dashboard-stats-section" class="dashboard-section">
<div class="dashboard-header">
<div class="dashboard-title-row">
<i data-lucide="bar-chart-3" class="dashboard-icon" style="color:var(--accent)"></i>
<h2>Statistiques</h2>
</div>
</div>
<div id="dashboard-stats-grid" class="dashboard-stats-grid">
<div class="dashboard-stats-loading">Chargement...</div>
</div>
</div>
```
**File:** `frontend/app.js` — add `DashboardStatsWidget` module (insert at **line ~3343**, before `DashboardRecentWidget`):
```javascript
const DashboardStatsWidget = {
async load() {
const grid = document.getElementById("dashboard-stats-grid");
if (!grid) return;
grid.innerHTML = '<div class="dashboard-stats-loading">Chargement...</div>';
try {
const data = await api("/api/dashboard");
this.render(data);
} catch (err) {
grid.innerHTML = `<div class="dashboard-recent-empty">Erreur: ${escapeHtml(err.message)}</div>`;
}
},
render(data) {
const grid = document.getElementById("dashboard-stats-grid");
if (!grid) return;
const items = [
{ icon: "files", label: "Fichiers", value: data.total_files.toLocaleString() },
{ icon: "tags", label: "Tags uniques", value: data.total_tags.toLocaleString() },
{ icon: "hard-drive", label: "Taille totale", value: this._formatSize(data.total_size_bytes) },
{ icon: "folder", label: "Vaults", value: data.vaults.length.toString() },
];
grid.innerHTML = items.map(i => `
<div class="stat-card">
<i data-lucide="${i.icon}" class="stat-icon"></i>
<span class="stat-value">${i.value}</span>
<span class="stat-label">${i.label}</span>
</div>
`).join("");
safeCreateIcons();
},
_formatSize(bytes) { /* KB/MB/GB formatter */ }
};
```
**Also update `showWelcome()`** at **line ~5417** — the dashboard rebuild HTML must include the stats section div. And **line ~5490** — add `DashboardStatsWidget.load()` call.
**File:** `frontend/style.css` — add CSS for `.dashboard-stats-grid`, `.stat-card`, `.stat-icon`, `.stat-value`, `.stat-label`.
**Dependency:** Item 1 (Pydantic models) — none. Standalone.
---
## 3. 🔔 Webhooks (P3) — 45 min
### 3a. New backend module: `backend/webhooks.py`
Create full module with:
- `WEBHOOKS_FILE = Path("data/webhooks.json")` — persistence
- `_DEFAULT_WEBHOOKS = []`
- `get_webhooks() -> list` — reads from disk
- `create_webhook(name, url, events, secret=None) -> dict`
- `update_webhook(id, updates) -> dict`
- `delete_webhook(id) -> bool`
- `async def dispatch_webhooks(event_type: str, data: dict)` — calls all webhooks subscribed to `event_type`, sends JSON POST with HMAC-SHA256 signature header if secret is set, timeout 5s, logs failures
- Model: `WebhookConfig` with `id`, `name`, `url`, `events` (list of event type strings), `secret` (optional), `enabled`, `created_at`, `last_fired_at`
### 3b. Backend: CRUD endpoints in `backend/main.py`
Insert at **line ~2470** (before `GET /api/config`):
```python ```python
@app.get("/api/webhooks") def remove_document(self, vault_name: str, path: str):
@app.post("/api/webhooks") """Remove a document from the inverted index."""
@app.patch("/api/webhooks/{webhook_id}")
@app.delete("/api/webhooks/{webhook_id}") doc_key = f"{vault_name}::{path}"
file_info = self.doc_info.get(doc_key)
if file_info is None:
return
self._remove_doc_internals(doc_key, vault_name, file_info)
self.doc_count -= 1
``` ```
Import `from backend.webhooks import get_webhooks, create_webhook, update_webhook, delete_webhook, dispatch_webhooks` #### `_remove_doc_internals(doc_key, vault_name, file_info)` (private helper)
### 3c. Backend: Hook dispatch_webhooks into file events
Add `await dispatch_webhooks("file_created", {...})` calls alongside each `sse_manager.broadcast(...)` call:
| Line | Event | Add dispatch |
|------|-------|-------------|
| ~1252 | `file_deleted` | `dispatch_webhooks("file_deleted", {"vault":..., "path":...})` |
| ~1330 | `directory_created` | `dispatch_webhooks("directory_created", {...})` |
| ~1401 | `directory_renamed` | `dispatch_webhooks("directory_renamed", {...})` |
| ~1462 | `directory_deleted` | `dispatch_webhooks("directory_deleted", {...})` |
| ~1532 | `file_created` | `dispatch_webhooks("file_created", {...})` |
| ~1607 | `file_renamed` | `dispatch_webhooks("file_renamed", {...})` |
### 3d. Frontend: Webhooks management UI in Configurations modal
**File:** `frontend/index.html` — insert at **line ~633** (after `<!-- À propos -->` section, before `</div>` closing config-content):
```html
<section class="config-section">
<h2>🔔 Webhooks</h2>
<p class="config-description">Notifications HTTP vers des services externes lors des changements de fichiers.</p>
<div id="webhooks-list"></div>
<div class="config-add-pattern">
<input type="text" id="webhook-name-input" placeholder="Nom" class="config-input" style="width:120px">
<input type="text" id="webhook-url-input" placeholder="https://..." class="config-input" style="flex:1">
<button id="webhook-add-btn" class="config-btn-add">Ajouter</button>
</div>
</section>
```
**File:** `frontend/app.js` — in `initConfigModal()` at **line ~3918**, add:
```javascript
loadWebhooks(); // in the open handler
// Event binding for webhook add/save/delete buttons
```
Add functions: `loadWebhooks()`, `renderWebhooks(webhooks)`, `addWebhook()`, `deleteWebhook(id)`, `toggleWebhook(id, enabled)`. All use `api("/api/webhooks", ...)`.
**File:** `frontend/style.css` — add `.webhook-item`, `.webhook-toggle`, `.webhook-delete` styles.
**Dependency:** None on items 1–2. Standalone.
---
## 4. 📤 Publication publique de documents (P3) — 60 min
### 4a. New backend module: `backend/share.py`
Create full module:
- `SHARES_FILE = Path("data/shares.json")`
- ShareToken model: `id`, `vault`, `path`, `token` (64-char hex), `created_by`, `created_at`, `expires_at` (optional, null = never), `access_count`, `last_accessed`
- `create_share(vault, path, created_by, expires_in_hours=None) -> dict` — generates token, stores, returns share info
- `get_share_by_token(token) -> dict | None` — validates expiry, returns share
- `revoke_share(id) -> bool`
- `list_shares(vault_filter=None) -> list` — for admin/settings page
- `record_access(token)` — increments access_count
### 4b. Backend: Endpoints in `backend/main.py`
Insert at **line ~1619** (before `GET /api/file/{vault_name}`):
```python ```python
# Share management def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict):
@app.post("/api/share/{vault_name}") """Internal: remove one doc_key from all indexes without adjusting doc_count."""
@app.get("/api/shares")
@app.delete("/api/share/{share_id}")
# Public view (no auth required!) # --- Metadata ---
@app.get("/s/{token}") self.doc_info.pop(doc_key, None)
async def public_share_view(token: str): ... self.doc_vault.pop(doc_key, None)
if vault_name in self.vault_docs:
self.vault_docs[vault_name].discard(doc_key)
# --- Tags ---
for tag in file_info.get("tags", []):
td = self.tag_docs.get(tag.lower())
if td:
td.discard(doc_key)
if not td:
del self.tag_docs[tag.lower()]
# --- Title tokens ---
title = file_info.get("title", "")
for token in set(tokenize(title)):
ti = self.title_index.get(token)
if ti:
try:
ti.remove(doc_key)
except ValueError:
pass
if not ti:
del self.title_index[token]
# --- Title norm map ---
norm_title = normalize_text(title)
if norm_title and norm_title in self.title_norm_map:
self.title_norm_map[norm_title] = [
e for e in self.title_norm_map[norm_title]
if not (e["vault"] == vault_name and e["path"] == file_info.get("path"))
]
if not self.title_norm_map[norm_title]:
del self.title_norm_map[norm_title]
# --- Word index ---
content = file_info.get("content", "")
full_text = title + " " + content
for token in set(tokenize(full_text)):
wi = self.word_index.get(token)
if wi:
wi.pop(doc_key, None)
if not wi:
del self.word_index[token]
# Remove from sorted tokens via bisect
idx = bisect.bisect_left(self._sorted_tokens, token)
if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token:
self._sorted_tokens.pop(idx)
``` ```
The public view endpoint: Key: `bisect.insort` for insertion and `bisect.bisect_left` + `pop(idx)` for removal keep `_sorted_tokens` sorted in O(V) worst case (list shift) but this is negligible compared to O(N * content) rebuild.
1. Looks up token via `get_share_by_token(token)`
2. Reads the file content
3. Renders markdown with redacted secrets
4. Returns simple HTML page (not SPA) with rendered content
5. Increments access count
### 4c. Frontend: Share button in file actions ### Step 3: Modify `get_inverted_index()` to NOT check staleness
**File:** `frontend/app.js` — in `renderFile()`, at **line ~3250** (after pop-out button): File: `backend/search.py`
```javascript
const shareBtn = el("button", { class: "btn-action", title: "Partager" }, [icon("share-2", 14), document.createTextNode("Partager")]);
shareBtn.addEventListener("click", () => openShareDialog(data.vault, data.path));
```
Add `shareBtn` to the file-actions div at **line ~3300**.
Add `openShareDialog(vault, path)` function that:
- Calls `POST /api/share/{vault}` to create a share
- Shows a modal with the share URL (copyable) and expiration options
- Shows existing shares list with revoke buttons
### 4d. Frontend: Share management in Configurations
**File:** `frontend/index.html` — add share management section in config modal (alongside webhooks).
**File:** `frontend/app.js` — `loadShares()` and `renderShares()` functions.
**File:** `frontend/style.css` — add `.share-dialog`, `.share-url`, `.share-item` styles.
**Dependency:** None. Standalone, but needs item 1 for clean models.
---
## 5. 🔄 Gestion conflits Syncthing (P2) — 45 min
### 5a. Backend: Conflict file detection
**File:** `backend/indexer.py` — add after `_backlink_index`:
```python ```python
def get_conflicts() -> list: def get_inverted_index() -> InvertedIndex:
"""Scan all vaults for Syncthing/Nextcloud sync-conflict files.""" """Return the singleton inverted index. Always up-to-date via hooks."""
conflicts = [] return _inverted_index
pattern = re.compile(r'\.sync-conflict-(\d{8}-\d{6})\.')
for vname, vdata in index.items():
for f in vdata.get("files", []):
m = pattern.search(f["path"])
if m:
# Find the original file
orig_path = pattern.sub("", f["path"])
conflicts.append({
"vault": vname,
"conflict_path": f["path"],
"original_path": orig_path,
"conflict_date": m.group(1),
"conflict_title": f.get("title", ""),
})
return conflicts
``` ```
### 5b. Backend: Endpoints Remove `is_stale()` and the `_source_generation` / `_last_rebuild` / `_rebuild_cooldown` fields. Keep `rebuild()` for the initial build and manual reindex (it is still called once at startup, via `init_inverted_index()` after `build_index` completes).
**File:** `backend/main.py` — insert at **line ~2547**: ### Step 4: Call `rebuild()` once after initial index build
In `backend/search.py`, register the hook AND call rebuild once:
```python ```python
@app.get("/api/conflicts") # After InvertedIndex class and _inverted_index = InvertedIndex()
async def api_conflicts(current_user=Depends(require_auth)):
"""List sync-conflict files across accessible vaults.""" def _on_index_change_hook(action, vault_name, path, file_info):
... inv = _inverted_index
try:
@app.post("/api/conflicts/resolve") if action == 'add':
async def api_conflict_resolve(body: dict, current_user=Depends(require_auth)): inv.add_document(vault_name, path, file_info)
"""Resolve a conflict: keep_local (delete conflict), keep_conflict (replace original).""" elif action == 'remove':
inv.remove_document(vault_name, path)
except Exception as e:
logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
# Fallback: mark for rebuild on next search
inv._needs_rebuild = True
_indexer.set_index_change_hook(_on_index_change_hook)
# Initial build trigger — called after first index is built
def init_inverted_index():
"""Force initial inverted index build. Called after build_index completes."""
_inverted_index.rebuild()
def get_inverted_index() -> InvertedIndex:
"""Return the singleton inverted index."""
# Only check for rebuild if incremental updates have failed
# OR if this is the very first call (doc_count == 0 and index has files)
if getattr(_inverted_index, '_needs_rebuild', False):
_inverted_index.rebuild()
_inverted_index._needs_rebuild = False
elif _inverted_index.doc_count == 0 and any(
vdata.get("files") for vdata in index.values()
):
_inverted_index.rebuild()
return _inverted_index
```
### Step 5: Call `init_inverted_index()` from `build_index` in main.py
In `backend/main.py`, after `build_index()` completes in the lifespan handler, call:
```python
from backend.search import init_inverted_index
init_inverted_index()
```
This ensures the inverted index is built once on startup, then incrementally maintained thereafter.
### Tag prefix index handling
The `tag_norm_map` and `tag_prefix_index` are built per-vault in `rebuild()`. For incremental updates, we need to handle tag changes:
In `add_document`, after adding doc tags:
```python
# Check if any tags are new (not in tag_norm_map)
for tag in tags:
norm_tag = normalize_text(tag)
if norm_tag not in self.tag_norm_map:
self.tag_norm_map[norm_tag] = tag
for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
prefix = norm_tag[:plen]
if tag not in self.tag_prefix_index[prefix]:
self.tag_prefix_index[prefix].append(tag)
```
In `_remove_doc_internals`, we do NOT remove tags from `tag_norm_map` or `tag_prefix_index` — these hold the shared tag vocabulary (per-vault), not per-document data, so a tag may still be in use by other documents. As a result they only grow between rebuilds. A full `rebuild()` (triggered by a manual reindex) will clean them up.
### Step 6: Remove cooldown hack from search.py
Remove:
- `_last_rebuild` and `_rebuild_cooldown` fields from `InvertedIndex.__init__`
- `is_stale()` method
- The staleness use of the `_source_generation` field (it is no longer needed for staleness checks; the field itself may be kept for diagnostics only)
### Step 7: Remove coalescence hack from main.py
In `_on_vault_change` in `backend/main.py`, remove:
```python
old_gen = idx._index_generation
... ...
if idx._index_generation > old_gen + 1:
idx._index_generation = old_gen + 1
``` ```
### 5c. Backend: Diff endpoint This hack was only needed to reduce the number of inverted index rebuilds. With incremental updates, it's unnecessary — each mutation is cheap.
```python ## Files Modified (Summary)
@app.get("/api/conflicts/diff")
async def api_conflict_diff(vault: str, original: str, conflict: str, current_user=Depends(require_auth)):
"""Return unified diff between original and conflict file."""
import difflib
...
```
### 5d. Frontend: Conflict dashboard widget | File | Changes |
|------|---------|
| `backend/indexer.py` | +`_on_index_change` hook variable, +`set_index_change_hook()`, +hook calls in `_add_file_to_structures` and `_remove_file_from_structures` |
| `backend/search.py` | +`add_document()`, +`remove_document()`, +`_remove_doc_internals()`, +`init_inverted_index()`, +hook registration, remove `is_stale()`/cooldown, simplify `get_inverted_index()` |
| `backend/main.py` | +`init_inverted_index()` call after `build_index()`, remove coalescence hack in `_on_vault_change` |
**File:** `frontend/index.html` — add `#dashboard-conflicts-section` in dashboard after stats section. ## Risks & Edge Cases
**File:** `frontend/app.js` — add `DashboardConflictsWidget` (pattern similar to recent/bookmarks): 1. **Thread safety:** `_add_file_to_structures` and `_remove_file_from_structures` are protected by `_index_lock` / `_async_index_lock` in indexer.py. The InvertedIndex methods are called inside these locks, so they're also protected. No additional locking needed.
- `load()` → `GET /api/conflicts`
- `render()` → shows conflict cards with file names and dates
- Click → opens diff modal showing side-by-side comparison
- Action buttons: "Garder l'original", "Garder le conflit"
**File:** `frontend/style.css` — add `.conflict-card`, `.conflict-diff`, `.conflict-actions` styles. 2. **Hook registration timing:** `search.py` imports `indexer.py` at the top, then later registers the hook. The hook is registered at module load time, BEFORE the first call to `build_index`. So `_on_index_change` is set when `build_index` runs — but `build_index` calls `_add_file_to_structures` internally, which would try to incrementally update an empty inverted index. **Fix:** The hook checks `if _inverted_index.doc_count == 0` and skips incremental updates; the initial `rebuild()` handles the bulk load.
**Dependency:** None. Standalone. 3. **Hook call during initial build_index:** `build_index` iterates files and calls `_add_file_to_structures`. The hook fires for each file, calling `add_document()` on an empty inverted index. This is slower than a single `rebuild()`. **Fix:** Add a flag `_inverted_index._ready = False` initially, set to True after `init_inverted_index()`. The hook skips when `_ready` is False.
--- 4. **Sorted tokens performance:** `bisect.insort` and `list.pop(idx)` are O(V) in the worst case because of the list shift. For 40k files, the vocabulary size V is typically 50k-200k tokens, so a single insertion or removal takes well under a millisecond, which is acceptable. The `rebuild()` call at startup handles the initial bulk load.
## Execution Order (optimal) 5. **tag_norm_map / tag_prefix_index growth:** These grow monotonically (never shrink on incremental remove). With 40k files and thousands of tags, this is a few thousand entries — negligible. A manual "Réindexer" button triggers a full `rebuild()` to clean up.
1. **Item 1** — OpenAPI docs (quick win, no risk)
2. **Item 2** — Dashboard stats (standalone, visible result)
3. **Item 3** — Webhooks (new module + integration, most code)
4. **Item 4** — Public shares (new module + public view, security-sensitive)
5. **Item 5** — Syncthing conflicts (standalone, nice-to-have)
**Total estimated effort:** ~3 hours
## Files Summary
| File | Action | Items |
|------|--------|-------|
| `backend/main.py` | Edit | 1 (models), 2a (endpoint), 3b+c (webhook CRUD+dispatch), 4b (share+public view), 5b+c (conflicts) |
| `backend/webhooks.py` | **Create** | 3a |
| `backend/share.py` | **Create** | 4a |
| `backend/indexer.py` | Edit | 5a (get_conflicts) |
| `frontend/index.html` | Edit | 2b, 3d, 4d, 5d (dashboard + config sections) |
| `frontend/app.js` | Edit | 2b, 3d, 4c, 5d (widgets + share button + webhook UI) |
| `frontend/style.css` | Edit | 2b, 3d, 4c, 5d (all new CSS classes) |