Switch inverted index from stale check to incremental updates

Register a hook with the indexer so that file add/remove events
incrementally maintain the inverted index, removing the need for
periodic staleness checks and cooldowns. Rebuild the index once on
startup via init_inverted_index().
This commit is contained in:
Bruno Charest 2026-05-26 12:37:59 -04:00
parent 872a3e56dd
commit 775722f5d4
4 changed files with 460 additions and 363 deletions

View File

@ -599,6 +599,11 @@ def _remove_file_from_structures(vault_name: str, rel_path: str) -> Optional[Dic
path_index[vault_name] = [p for p in path_index[vault_name] if p["path"] != rel_path]
_index_generation += 1
# Notify inverted index for incremental update
if _on_index_change:
_on_index_change('remove', vault_name, rel_path, removed)
return removed
@ -666,6 +671,10 @@ def _add_file_to_structures(vault_name: str, file_info: Dict[str, Any]):
_index_generation += 1
# Notify inverted index for incremental update
if _on_index_change:
_on_index_change('add', vault_name, file_info["path"], file_info)
async def update_single_file(vault_name: str, abs_file_path: str) -> Optional[Dict[str, Any]]:
"""Re-index a single file without full rebuild.

View File

@ -44,7 +44,7 @@ from backend.indexer import (
remove_vault_from_index,
add_vault_to_index,
)
from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags
from backend.search import search, get_all_tags, advanced_search, suggest_titles, suggest_tags, init_inverted_index
from backend.image_processor import preprocess_images
from backend.attachment_indexer import rescan_vault_attachments, get_attachment_stats
from backend.vault_settings import (
@ -379,14 +379,9 @@ async def _on_vault_change(events: list):
Processes each event (create/modify/delete/move) and updates the index
incrementally, then broadcasts SSE notifications.
"""
import backend.indexer as idx
updated_vaults = set()
changes = []
# Temporarily suppress per-file generation increments to coalesce them
# into a single increment at the end of the batch.
old_gen = idx._index_generation
for event in events:
vault_name = event["vault"]
event_type = event["type"]
@ -415,11 +410,6 @@ async def _on_vault_change(events: list):
except Exception as e:
logger.error(f"Error processing {event_type} event for {src}: {e}")
# Restore generation to only increment by 1 for the whole batch
# (unless it was already incremented by other operations)
if idx._index_generation > old_gen + 1:
idx._index_generation = old_gen + 1
if changes:
await sse_manager.broadcast("index_updated", {
"vaults": list(updated_vaults),
@ -520,6 +510,9 @@ async def lifespan(app: FastAPI):
logger.info("Background indexing started")
await build_index(_progress_cb)
# Build inverted index for search (one-time, then incremental)
init_inverted_index()
# Start file watcher
config = _load_config()
watcher_enabled = config.get("watcher_enabled", True)

View File

@ -239,9 +239,9 @@ def _escape_html(text: str) -> str:
class InvertedIndex:
"""In-memory inverted index supporting TF-IDF scoring.
Built lazily from the global ``index`` dict whenever a search or
suggestion request detects that the underlying vault index has changed.
The class is designed to be a singleton use ``get_inverted_index()``.
Built initially via ``rebuild()`` from the global index, then
maintained incrementally via ``add_document()`` / ``remove_document()``
hooks from the file watcher and API mutations.
Attributes:
word_index: ``{token: {doc_key: term_frequency}}``
@ -249,7 +249,6 @@ class InvertedIndex:
tag_norm_map: ``{normalized_tag: original_tag}``
tag_prefix_index: ``{prefix: [original_tag, ...]}``
doc_count: Total number of indexed documents.
_source_id: Fingerprint of the source index to detect staleness.
"""
def __init__(self) -> None:
@ -264,23 +263,7 @@ class InvertedIndex:
self.vault_docs: Dict[str, set] = defaultdict(set)
self.tag_docs: Dict[str, set] = defaultdict(set)
self._sorted_tokens: List[str] = []
self._source_generation: int = -1
self._last_rebuild: float = 0
self._rebuild_cooldown: float = 3.0 # seconds
def is_stale(self) -> bool:
"""Check if the inverted index needs rebuilding.
Uses a cooldown (3s) to prevent rapid rebuilds from file watcher
events. Staleness is only reported if the generation has changed
AND the cooldown has elapsed since the last rebuild.
"""
import time
if _indexer._index_generation == self._source_generation:
return False
if time.time() - self._last_rebuild < self._rebuild_cooldown:
return False
return True
self._ready: bool = False # True after initial build
def rebuild(self) -> None:
"""Rebuild inverted index from the global ``index`` dict.
@ -288,9 +271,7 @@ class InvertedIndex:
Tokenizes titles and content of every file, computes term frequencies,
and builds auxiliary indexes for tag and title prefix suggestions.
"""
import time
self._last_rebuild = time.time()
logger.info("Rebuilding inverted index...")
logger.info("Building inverted index...")
self.word_index = defaultdict(dict)
self.title_index = defaultdict(list)
self.tag_norm_map = {}
@ -351,7 +332,7 @@ class InvertedIndex:
self.tag_prefix_index[prefix].append(tag)
self._sorted_tokens = sorted(self.word_index.keys())
self._source_generation = _indexer._index_generation
self._ready = True
logger.info(
"Inverted index built: %d documents, %d unique tokens, %d tags",
self.doc_count,
@ -359,6 +340,117 @@ class InvertedIndex:
len(self.tag_norm_map),
)
def add_document(self, vault_name: str, path: str, file_info: dict):
    """Add or update a single document incrementally.

    Does nothing until the initial ``rebuild()`` has marked the index as
    ready. When the document is already indexed, its previous entries are
    removed first, so the call acts as an in-place update and
    ``doc_count`` is left unchanged.

    Args:
        vault_name: Name of the vault that owns the file.
        path: Path of the file; combined with ``vault_name`` to form the
            ``"vault::path"`` document key.
        file_info: File metadata. The keys read here are ``tags``,
            ``title`` and ``content``; missing keys default to empty.
    """
    if not self._ready:
        return

    doc_key = f"{vault_name}::{path}"
    old_file_info = self.doc_info.get(doc_key)
    if old_file_info is not None:
        # Purge the previous version completely, including its entries in
        # ``_sorted_tokens``. Skipping that cleanup left tokens unique to
        # this document in the sorted list while their ``word_index`` entry
        # was deleted, so the insert below added them a second time on
        # every update, and tokens dropped by the edit were never removed.
        self._remove_doc_internals(doc_key, vault_name, old_file_info, skip_sorted_cleanup=False)
    else:
        self.doc_count += 1

    # Metadata
    self.doc_info[doc_key] = file_info
    self.doc_vault[doc_key] = vault_name
    self.vault_docs[vault_name].add(doc_key)

    # Tags
    tags = file_info.get("tags", [])
    for tag in tags:
        self.tag_docs[tag.lower()].add(doc_key)
        norm_tag = normalize_text(tag)
        if norm_tag not in self.tag_norm_map:
            # First time this normalized tag is seen: register it and all
            # of its prefixes for tag suggestions.
            self.tag_norm_map[norm_tag] = tag
            for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
                prefix = norm_tag[:plen]
                if tag not in self.tag_prefix_index[prefix]:
                    self.tag_prefix_index[prefix].append(tag)

    # Title tokens
    title = file_info.get("title", "")
    title_tokens = tokenize(title)
    for token in set(title_tokens):
        if token:
            self.title_index[token].append(doc_key)

    # Title norm map
    norm_title = normalize_text(title)
    if norm_title:
        self.title_norm_map[norm_title].append({"vault": vault_name, "path": path, "title": title})

    # Word index (content + title TF)
    content = file_info.get("content", "")
    full_text = title + " " + content
    tokens = tokenize(full_text)
    tf: Dict[str, int] = defaultdict(int)
    for token in tokens:
        if token:
            tf[token] += 1

    sorted_tokens = self._sorted_tokens
    for token, freq in tf.items():
        if not self.word_index.get(token):
            # Token has no postings yet: make sure it appears exactly once
            # in the sorted token list.
            pos = bisect.bisect_left(sorted_tokens, token)
            if pos == len(sorted_tokens) or sorted_tokens[pos] != token:
                sorted_tokens.insert(pos, token)
        self.word_index[token][doc_key] = freq
def remove_document(self, vault_name: str, path: str):
    """Drop one document from the index.

    Nothing happens when the index has not been built yet or when the
    ``"vault::path"`` key is not currently indexed.
    """
    if self._ready:
        key = f"{vault_name}::{path}"
        known = self.doc_info.get(key)
        if known is not None:
            # Full removal: also prune tokens from the sorted token list.
            self._remove_doc_internals(key, vault_name, known, skip_sorted_cleanup=False)
            self.doc_count -= 1
def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict, skip_sorted_cleanup: bool = False):
    """Purge ``doc_key`` from every per-document structure.

    ``doc_count`` is intentionally left alone; callers adjust it. The
    global tag vocabulary (``tag_norm_map``) is not modified here.

    Args:
        doc_key: ``"vault::path"`` key of the document to purge.
        vault_name: Vault the document belongs to.
        file_info: The metadata that was indexed for this document; its
            ``tags``, ``title``, ``content`` and ``path`` are read to find
            the entries to drop.
        skip_sorted_cleanup: When true, tokens whose postings become empty
            are left in ``_sorted_tokens``.
    """
    # Per-document metadata.
    self.doc_info.pop(doc_key, None)
    self.doc_vault.pop(doc_key, None)
    if vault_name in self.vault_docs:
        self.vault_docs[vault_name].discard(doc_key)

    # Tag membership (per-document, NOT the global tag_norm_map).
    for raw_tag in file_info.get("tags", []):
        tag_key = raw_tag.lower()
        members = self.tag_docs.get(tag_key)
        if not members:
            continue
        members.discard(doc_key)
        if not members:
            del self.tag_docs[tag_key]

    # Title token postings.
    title = file_info.get("title", "")
    for tok in set(tokenize(title)):
        if not tok:
            continue
        title_postings = self.title_index.get(tok)
        if not title_postings:
            continue
        if doc_key in title_postings:
            title_postings.remove(doc_key)
        if not title_postings:
            del self.title_index[tok]

    # Normalized-title suggestions: keep entries that belong to other docs.
    norm_title = normalize_text(title)
    if norm_title and norm_title in self.title_norm_map:
        doc_path = file_info.get("path")
        survivors = [
            entry for entry in self.title_norm_map[norm_title]
            if entry["vault"] != vault_name or entry["path"] != doc_path
        ]
        if survivors:
            self.title_norm_map[norm_title] = survivors
        else:
            del self.title_norm_map[norm_title]

    # Word postings (title + content).
    content = file_info.get("content", "")
    for tok in set(tokenize(title + " " + content)):
        if not tok:
            continue
        postings = self.word_index.get(tok)
        if not postings:
            continue
        postings.pop(doc_key, None)
        if postings:
            continue
        # Last document for this token is gone: drop the token itself.
        del self.word_index[tok]
        if skip_sorted_cleanup:
            continue
        pos = bisect.bisect_left(self._sorted_tokens, tok)
        if pos < len(self._sorted_tokens) and self._sorted_tokens[pos] == tok:
            self._sorted_tokens.pop(pos)
def idf(self, term: str) -> float:
"""Inverse Document Frequency for a term.
@ -424,9 +516,39 @@ class InvertedIndex:
# Module-level singleton: returned by get_inverted_index() and updated in place by the index-change hook.
_inverted_index = InvertedIndex()
def _on_index_change_hook(action: str, vault_name: str, path: str, file_info: dict):
    """Callback registered with indexer for incremental inverted index updates.

    Args:
        action: ``'add'`` or ``'remove'``; any other value is ignored.
        vault_name: Vault the changed file belongs to.
        path: Path of the changed file.
        file_info: Metadata of the file. An ``'add'`` with empty or missing
            metadata is ignored; the value is not used for ``'remove'``.

    Failures are logged and swallowed so that a problem in the search index
    never breaks the indexer operation that triggered the hook.
    """
    inv = _inverted_index
    try:
        if action == 'add' and file_info:
            inv.add_document(vault_name, path, file_info)
        elif action == 'remove':
            inv.remove_document(vault_name, path)
    except Exception as e:
        # Best-effort by design, but keep the traceback: without it a failed
        # incremental update cannot be diagnosed from the logs.
        logger.warning(
            "Inverted index incremental update failed (%s %s/%s): %s",
            action,
            vault_name,
            path,
            e,
            exc_info=True,
        )
# Register the hook with indexer (indexer is already imported at top of file).
# NOTE(review): per the accompanying plan, the indexer only stores this callback
# and never imports this module, which is what avoids a circular import — confirm.
_indexer.set_index_change_hook(_on_index_change_hook)
def init_inverted_index():
"""Force initial inverted index build. Called after build_index completes on startup.

``rebuild()`` is only invoked when at least one vault in the global
``index`` has a non-empty ``files`` entry.
"""
if any(vdata.get("files") for vdata in index.values()):
_inverted_index.rebuild()
logger.info("Inverted index initialized.")
def get_inverted_index() -> InvertedIndex:
    """Return the singleton inverted index.

    Auto-builds on first call if the index has files but the inverted
    index hasn't been built yet (fallback for paths that don't go through
    ``init_inverted_index()``).

    Returns:
        The module-level ``InvertedIndex`` instance.
    """
    # No staleness check: ``is_stale()`` no longer exists. The only trigger
    # for a rebuild here is an empty inverted index next to a non-empty
    # vault index.
    if _inverted_index.doc_count == 0 and any(
        vdata.get("files") for vdata in index.values()
    ):
        _inverted_index.rebuild()
    return _inverted_index

631
plan.md
View File

@ -1,375 +1,348 @@
# Implementation Plan — Remaining Roadmap Items
# Plan: Incremental InvertedIndex for 40k+ files
## 1. 📝 Documentation OpenAPI enrichie (P3) — 5 min
## Problem Summary
**Goal:** Add `Field(description=...)` to all Pydantic models without descriptions in `backend/main.py`.
Every file mutation calls `_add_file_to_structures` / `_remove_file_from_structures` in `backend/indexer.py`, which increments `_index_generation`. When the next search or autocomplete fires, `get_inverted_index()` in `backend/search.py` detects staleness (`is_stale()` returns True) and triggers a full `rebuild()` — O(N) tokenization of ALL files. With 40k+ files this takes 2-5 seconds, making search unusable.
**Models to update (lines 89–311):**
The existing 3-second cooldown hack in `is_stale()` only masks the problem; it doesn't fix it.
| Line | Model | Fields to annotate |
|------|-------|--------------------|
| 89 | `FileContentResponse` | `vault`, `path`, `title`, `tags`, `frontmatter`, `html`, `raw_length`, `extension`, `is_markdown`, `unsupported`, `size_bytes` |
| 103 | `FileRawResponse` | `vault`, `path`, `raw` |
| 110 | `FileSaveResponse` | `status`, `vault`, `path`, `size` |
| 118 | `FileDeleteResponse` | `status`, `vault`, `path` |
| 125 | `SearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
| 136 | `SearchResponse` | `query`, `vault_filter`, `tag_filter`, `count`, `results` (total, offset, limit already have Field) |
| 146 | `TagsResponse` | `vault_filter`, `tags` |
| 152 | `TreeSearchResult` | `vault`, `path`, `name`, `matched_path` (type has Field) |
| 161 | `TreeSearchResponse` | `query`, `vault_filter`, `results` |
| 168 | `AdvancedSearchResultItem` | `vault`, `path`, `title`, `tags`, `score`, `snippet`, `modified` |
| 179 | `SearchFacets` | `tags`, `vaults` (already have default_factory) |
| 185 | `AdvancedSearchResponse` | `results`, `total`, `offset`, `limit`, `facets` (query_time_ms has Field) |
| 196 | `TitleSuggestion` | `vault`, `path`, `title` |
| 203 | `SuggestResponse` | `query`, `suggestions` |
| 209 | `TagSuggestion` | `tag`, `count` |
| 215 | `TagSuggestResponse` | `query`, `suggestions` |
| 221 | `GraphNode` | (all fields already have Field) |
| 231 | `GraphEdge` | (all fields already have Field) |
| 239 | `GraphResponse` | `vault`, `path`, `nodes`, `edges` |
| 247 | `ReloadResponse` | `status`, `vaults` |
| 253 | `HealthResponse` | `status`, `version`, `vaults`, `total_files` |
| 265 | `DirectoryCreateResponse` | `success`, `path` |
| 284 | `DirectoryDeleteResponse` | `success`, `deleted_count` |
| 296 | `FileCreateResponse` | `success`, `path` |
| 307 | `FileRenameResponse` | `success` |
## Solution: Incremental Add/Remove on the InvertedIndex
**Dependency:** None. Pure documentation change.
Add two methods to `InvertedIndex` that update ALL internal data structures incrementally when a single file is added, modified, or removed:
---
- `add_document(vault_name, path, file_info)` — called on create/modify
- `remove_document(vault_name, path)` — called on delete/move-source
## 2. 📊 Dashboard statistiques (P3) — 30 min
Then hook these into `_add_file_to_structures` and `_remove_file_from_structures` in `backend/indexer.py` so the inverted index never goes stale.
### 2a. Backend: `GET /api/dashboard` (new endpoint)
Remove the `is_stale()` / `rebuild()` / cooldown mechanism entirely. The inverted index is always current.
**File:** `backend/main.py` — insert at **line ~2547** (after `/api/diagnostics`)
## Dependency Architecture
**Current import chain:**
```
main.py → search.py → indexer.py (search.py imports `from backend import indexer as _indexer`)
```
**Problem:** `indexer.py` currently does NOT import from `search.py`. If we add `from backend.search import get_inverted_index` to indexer.py, we create a circular import: `search.py → indexer.py → search.py`.
**Fix — Option C (Callback/Hook pattern, simplest):**
Add a module-level hook variable in `backend/indexer.py`:
```python
@app.get("/api/dashboard")
async def api_dashboard(current_user=Depends(require_auth)):
"""Aggregated dashboard statistics across all accessible vaults."""
from backend.indexer import index, vault_config, path_index
user_vaults = current_user.get("_token_vaults") or current_user.get("vaults", [])
# In backend/indexer.py
_on_index_change: callable = None # Called as (action, vault_name, path, file_info_or_None)
vault_stats = []
total_files = 0
total_tags = set()
total_size = 0
def set_index_change_hook(hook):
"""Register a callback for incremental index updates.
hook(action, vault_name, path, file_info_or_None) where action is 'add' or 'remove'.
"""
global _on_index_change
_on_index_change = hook
```
for vname, vdata in index.items():
if "*" not in user_vaults and vname not in user_vaults:
continue
files = vdata.get("files", [])
file_count = len(files)
total_files += file_count
tags = set()
for f in files:
tags.update(f.get("tags", []))
total_size += f.get("size", 0)
total_tags.update(tags)
vault_stats.append({
"name": vname,
"file_count": file_count,
"tag_count": len(tags),
"total_size_bytes": sum(f.get("size", 0) for f in files),
Then at the end of `_add_file_to_structures`:
```python
if _on_index_change:
_on_index_change('add', vault_name, file_info["path"], file_info)
```
At the end of `_remove_file_from_structures`:
```python
if _on_index_change:
_on_index_change('remove', vault_name, rel_path, file_info) # file_info = removed dict or None
```
Then in `backend/search.py`, at module load time (after InvertedIndex class is defined):
```python
def _on_index_change_hook(action, vault_name, path, file_info):
inv = get_inverted_index_raw() # get without rebuild check
if action == 'add':
inv.add_document(vault_name, path, file_info)
elif action == 'remove':
inv.remove_document(vault_name, path)
# Register the hook — this triggers an import of indexer, but indexer is already imported
# by the time this line runs (since search.py does `from backend import indexer as _indexer` above)
_indexer.set_index_change_hook(_on_index_change_hook)
```
This avoids circular imports completely because:
1. `search.py` already imports `indexer.py` at the top (`from backend import indexer as _indexer`)
2. `indexer.py` never imports `search.py` — it just stores a callback
3. `search.py` registers the callback AFTER the InvertedIndex class is defined
## Detailed Implementation
### Step 1: Add hook variable to `backend/indexer.py`
File: `backend/indexer.py`
Changes:
- After `_index_generation` global (line ~28), add:
```python
_on_index_change: callable = None
```
- Add function `set_index_change_hook(hook)` (bottom of file, near other public functions)
- Add `if _on_index_change: _on_index_change('add', vault_name, file_info['path'], file_info)` at end of `_add_file_to_structures` (~line 665)
- Add `if _on_index_change: _on_index_change('remove', vault_name, rel_path, removed)` at end of `_remove_file_from_structures` (~line 597)
### Step 2: Add `add_document` and `remove_document` to InvertedIndex
File: `backend/search.py`
#### `add_document(vault_name, path, file_info)`
```python
def add_document(self, vault_name: str, path: str, file_info: dict):
"""Add or update a single document in the inverted index."""
doc_key = f"{vault_name}::{path}"
old_file_info = self.doc_info.get(doc_key)
# If updating an existing document, remove old entries first
if old_file_info is not None:
self._remove_doc_internals(doc_key, vault_name, old_file_info)
else:
self.doc_count += 1
# --- Metadata ---
self.doc_info[doc_key] = file_info
self.doc_vault[doc_key] = vault_name
self.vault_docs[vault_name].add(doc_key)
# --- Tags ---
tags = file_info.get("tags", [])
for tag in tags:
self.tag_docs[tag.lower()].add(doc_key)
# --- Title tokens ---
title = file_info.get("title", "")
title_tokens = tokenize(title)
for token in set(title_tokens):
self.title_index[token].append(doc_key)
# --- Normalized title for prefix suggestions ---
norm_title = normalize_text(title)
if norm_title:
self.title_norm_map[norm_title].append({
"vault": vault_name,
"path": path,
"title": title,
})
return {
"vaults": vault_stats,
"total_files": total_files,
"total_tags": len(total_tags),
"total_size_bytes": total_size,
}
# --- Word index (content + title TF) ---
content = file_info.get("content", "")
full_text = title + " " + content
tokens = tokenize(full_text)
tf = defaultdict(int)
for token in tokens:
tf[token] += 1
# Track which tokens are new (not previously indexed) for sorted_tokens update
new_tokens = []
for token, freq in tf.items():
if not self.word_index.get(token):
new_tokens.append(token)
self.word_index[token][doc_key] = freq
# Incrementally update _sorted_tokens (avoid O(V log V) full re-sort)
if new_tokens:
for token in new_tokens:
bisect.insort(self._sorted_tokens, token)
```
**No new model needed** — return plain dict (or add optional `DashboardResponse` model).
### 2b. Frontend: Insert stats widget in dashboard-home
**File:** `frontend/index.html` — **after line 364** (`</div>` closing bookmarks section, before `<!-- Recently Opened Section -->`)
Add:
```html
<!-- Stats Section -->
<div id="dashboard-stats-section" class="dashboard-section">
<div class="dashboard-header">
<div class="dashboard-title-row">
<i data-lucide="bar-chart-3" class="dashboard-icon" style="color:var(--accent)"></i>
<h2>Statistiques</h2>
</div>
</div>
<div id="dashboard-stats-grid" class="dashboard-stats-grid">
<div class="dashboard-stats-loading">Chargement...</div>
</div>
</div>
```
**File:** `frontend/app.js` — add `DashboardStatsWidget` module (insert at **line ~3343**, before `DashboardRecentWidget`):
```javascript
const DashboardStatsWidget = {
async load() {
const grid = document.getElementById("dashboard-stats-grid");
if (!grid) return;
grid.innerHTML = '<div class="dashboard-stats-loading">Chargement...</div>';
try {
const data = await api("/api/dashboard");
this.render(data);
} catch (err) {
grid.innerHTML = `<div class="dashboard-recent-empty">Erreur: ${escapeHtml(err.message)}</div>`;
}
},
render(data) {
const grid = document.getElementById("dashboard-stats-grid");
if (!grid) return;
const items = [
{ icon: "files", label: "Fichiers", value: data.total_files.toLocaleString() },
{ icon: "tags", label: "Tags uniques", value: data.total_tags.toLocaleString() },
{ icon: "hard-drive", label: "Taille totale", value: this._formatSize(data.total_size_bytes) },
{ icon: "folder", label: "Vaults", value: data.vaults.length.toString() },
];
grid.innerHTML = items.map(i => `
<div class="stat-card">
<i data-lucide="${i.icon}" class="stat-icon"></i>
<span class="stat-value">${i.value}</span>
<span class="stat-label">${i.label}</span>
</div>
`).join("");
safeCreateIcons();
},
_formatSize(bytes) { /* KB/MB/GB formatter */ }
};
```
**Also update `showWelcome()`** at **line ~5417** — the dashboard rebuild HTML must include the stats section div. And **line ~5490** — add `DashboardStatsWidget.load()` call.
**File:** `frontend/style.css` — add CSS for `.dashboard-stats-grid`, `.stat-card`, `.stat-icon`, `.stat-value`, `.stat-label`.
**Dependency:** Item 1 (Pydantic models) — none. Standalone.
---
## 3. 🔔 Webhooks (P3) — 45 min
### 3a. New backend module: `backend/webhooks.py`
Create full module with:
- `WEBHOOKS_FILE = Path("data/webhooks.json")` — persistence
- `_DEFAULT_WEBHOOKS = []`
- `get_webhooks() -> list` — reads from disk
- `create_webhook(name, url, events, secret=None) -> dict`
- `update_webhook(id, updates) -> dict`
- `delete_webhook(id) -> bool`
- `async def dispatch_webhooks(event_type: str, data: dict)` — calls all webhooks subscribed to `event_type`, sends JSON POST with HMAC-SHA256 signature header if secret is set, timeout 5s, logs failures
- Model: `WebhookConfig` with `id`, `name`, `url`, `events` (list of event type strings), `secret` (optional), `enabled`, `created_at`, `last_fired_at`
### 3b. Backend: CRUD endpoints in `backend/main.py`
Insert at **line ~2470** (before `GET /api/config`):
#### `remove_document(vault_name, path)`
```python
@app.get("/api/webhooks")
@app.post("/api/webhooks")
@app.patch("/api/webhooks/{webhook_id}")
@app.delete("/api/webhooks/{webhook_id}")
def remove_document(self, vault_name: str, path: str):
"""Remove a document from the inverted index."""
doc_key = f"{vault_name}::{path}"
file_info = self.doc_info.get(doc_key)
if file_info is None:
return
self._remove_doc_internals(doc_key, vault_name, file_info)
self.doc_count -= 1
```
Import `from backend.webhooks import get_webhooks, create_webhook, update_webhook, delete_webhook, dispatch_webhooks`
### 3c. Backend: Hook dispatch_webhooks into file events
Add `await dispatch_webhooks("file_created", {...})` calls alongside each `sse_manager.broadcast(...)` call:
| Line | Event | Add dispatch |
|------|-------|-------------|
| ~1252 | `file_deleted` | `dispatch_webhooks("file_deleted", {"vault":..., "path":...})` |
| ~1330 | `directory_created` | `dispatch_webhooks("directory_created", {...})` |
| ~1401 | `directory_renamed` | `dispatch_webhooks("directory_renamed", {...})` |
| ~1462 | `directory_deleted` | `dispatch_webhooks("directory_deleted", {...})` |
| ~1532 | `file_created` | `dispatch_webhooks("file_created", {...})` |
| ~1607 | `file_renamed` | `dispatch_webhooks("file_renamed", {...})` |
### 3d. Frontend: Webhooks management UI in Configurations modal
**File:** `frontend/index.html` — insert at **line ~633** (after `<!-- À propos -->` section, before `</div>` closing config-content):
```html
<section class="config-section">
<h2>🔔 Webhooks</h2>
<p class="config-description">Notifications HTTP vers des services externes lors des changements de fichiers.</p>
<div id="webhooks-list"></div>
<div class="config-add-pattern">
<input type="text" id="webhook-name-input" placeholder="Nom" class="config-input" style="width:120px">
<input type="text" id="webhook-url-input" placeholder="https://..." class="config-input" style="flex:1">
<button id="webhook-add-btn" class="config-btn-add">Ajouter</button>
</div>
</section>
```
**File:** `frontend/app.js` — in `initConfigModal()` at **line ~3918**, add:
```javascript
loadWebhooks(); // in the open handler
// Event binding for webhook add/save/delete buttons
```
Add functions: `loadWebhooks()`, `renderWebhooks(webhooks)`, `addWebhook()`, `deleteWebhook(id)`, `toggleWebhook(id, enabled)`. All use `api("/api/webhooks", ...)`.
**File:** `frontend/style.css` — add `.webhook-item`, `.webhook-toggle`, `.webhook-delete` styles.
**Dependency:** None on items 1–2. Standalone.
---
## 4. 📤 Publication publique de documents (P3) — 60 min
### 4a. New backend module: `backend/share.py`
Create full module:
- `SHARES_FILE = Path("data/shares.json")`
- ShareToken model: `id`, `vault`, `path`, `token` (64-char hex), `created_by`, `created_at`, `expires_at` (optional, null = never), `access_count`, `last_accessed`
- `create_share(vault, path, created_by, expires_in_hours=None) -> dict` — generates token, stores, returns share info
- `get_share_by_token(token) -> dict | None` — validates expiry, returns share
- `revoke_share(id) -> bool`
- `list_shares(vault_filter=None) -> list` — for admin/settings page
- `record_access(token)` — increments access_count
### 4b. Backend: Endpoints in `backend/main.py`
Insert at **line ~1619** (before `GET /api/file/{vault_name}`):
#### `_remove_doc_internals(doc_key, vault_name, file_info)` (private helper)
```python
# Share management
@app.post("/api/share/{vault_name}")
@app.get("/api/shares")
@app.delete("/api/share/{share_id}")
def _remove_doc_internals(self, doc_key: str, vault_name: str, file_info: dict):
"""Internal: remove one doc_key from all indexes without adjusting doc_count."""
# Public view (no auth required!)
@app.get("/s/{token}")
async def public_share_view(token: str): ...
# --- Metadata ---
self.doc_info.pop(doc_key, None)
self.doc_vault.pop(doc_key, None)
if vault_name in self.vault_docs:
self.vault_docs[vault_name].discard(doc_key)
# --- Tags ---
for tag in file_info.get("tags", []):
td = self.tag_docs.get(tag.lower())
if td:
td.discard(doc_key)
if not td:
del self.tag_docs[tag.lower()]
# --- Title tokens ---
title = file_info.get("title", "")
for token in set(tokenize(title)):
ti = self.title_index.get(token)
if ti:
try:
ti.remove(doc_key)
except ValueError:
pass
if not ti:
del self.title_index[token]
# --- Title norm map ---
norm_title = normalize_text(title)
if norm_title and norm_title in self.title_norm_map:
self.title_norm_map[norm_title] = [
e for e in self.title_norm_map[norm_title]
if not (e["vault"] == vault_name and e["path"] == file_info.get("path"))
]
if not self.title_norm_map[norm_title]:
del self.title_norm_map[norm_title]
# --- Word index ---
content = file_info.get("content", "")
full_text = title + " " + content
for token in set(tokenize(full_text)):
wi = self.word_index.get(token)
if wi:
wi.pop(doc_key, None)
if not wi:
del self.word_index[token]
# Remove from sorted tokens via bisect
idx = bisect.bisect_left(self._sorted_tokens, token)
if idx < len(self._sorted_tokens) and self._sorted_tokens[idx] == token:
self._sorted_tokens.pop(idx)
```
The public view endpoint:
1. Looks up token via `get_share_by_token(token)`
2. Reads the file content
3. Renders markdown with redacted secrets
4. Returns simple HTML page (not SPA) with rendered content
5. Increments access count
Key: `bisect.insort` for insertion and `bisect.bisect_left` + `pop(idx)` for removal keep `_sorted_tokens` sorted. Each operation is O(V) in the worst case (list shift), but this is negligible compared to the O(N * content) cost of a full rebuild.
### 4c. Frontend: Share button in file actions
### Step 3: Modify `get_inverted_index()` to NOT check staleness
**File:** `frontend/app.js` — in `renderFile()`, at **line ~3250** (after pop-out button):
```javascript
const shareBtn = el("button", { class: "btn-action", title: "Partager" }, [icon("share-2", 14), document.createTextNode("Partager")]);
shareBtn.addEventListener("click", () => openShareDialog(data.vault, data.path));
```
Add `shareBtn` to the file-actions div at **line ~3300**.
Add `openShareDialog(vault, path)` function that:
- Calls `POST /api/share/{vault}` to create a share
- Shows a modal with the share URL (copyable) and expiration options
- Shows existing shares list with revoke buttons
### 4d. Frontend: Share management in Configurations
**File:** `frontend/index.html` — add share management section in config modal (alongside webhooks).
**File:** `frontend/app.js` — `loadShares()` and `renderShares()` functions.
**File:** `frontend/style.css` — add `.share-dialog`, `.share-url`, `.share-item` styles.
**Dependency:** None. Standalone, but needs item 1 for clean models.
---
## 5. 🔄 Gestion conflits Syncthing (P2) — 45 min
### 5a. Backend: Conflict file detection
**File:** `backend/indexer.py` — add after `_backlink_index`:
File: `backend/search.py`
```python
def get_conflicts() -> list:
"""Scan all vaults for Syncthing/Nextcloud sync-conflict files."""
conflicts = []
pattern = re.compile(r'\.sync-conflict-(\d{8}-\d{6})\.')
for vname, vdata in index.items():
for f in vdata.get("files", []):
m = pattern.search(f["path"])
if m:
# Find the original file
orig_path = pattern.sub("", f["path"])
conflicts.append({
"vault": vname,
"conflict_path": f["path"],
"original_path": orig_path,
"conflict_date": m.group(1),
"conflict_title": f.get("title", ""),
})
return conflicts
def get_inverted_index() -> InvertedIndex:
"""Return the singleton inverted index. Always up-to-date via hooks."""
return _inverted_index
```
### 5b. Backend: Endpoints
Remove `is_stale()` and the `_source_generation` / `_last_rebuild` / `_rebuild_cooldown` fields. Keep `rebuild()` for initial build and manual reindex (still called once at startup via `build_index`).
**File:** `backend/main.py` — insert at **line ~2547**:
### Step 4: Call `rebuild()` once after initial index build
In `backend/search.py`, register the hook AND call rebuild once:
```python
@app.get("/api/conflicts")
async def api_conflicts(current_user=Depends(require_auth)):
"""List sync-conflict files across accessible vaults."""
...
@app.post("/api/conflicts/resolve")
async def api_conflict_resolve(body: dict, current_user=Depends(require_auth)):
"""Resolve a conflict: keep_local (delete conflict), keep_conflict (replace original)."""
# After InvertedIndex class and _inverted_index = InvertedIndex()
def _on_index_change_hook(action, vault_name, path, file_info):
inv = _inverted_index
try:
if action == 'add':
inv.add_document(vault_name, path, file_info)
elif action == 'remove':
inv.remove_document(vault_name, path)
except Exception as e:
logger.warning(f"Inverted index incremental update failed ({action} {vault_name}/{path}): {e}")
# Fallback: mark for rebuild on next search
inv._needs_rebuild = True
_indexer.set_index_change_hook(_on_index_change_hook)
# Initial build trigger — called after first index is built
def init_inverted_index():
"""Force initial inverted index build. Called after build_index completes."""
_inverted_index.rebuild()
def get_inverted_index() -> InvertedIndex:
"""Return the singleton inverted index."""
# Only check for rebuild if incremental updates have failed
# OR if this is the very first call (doc_count == 0 and index has files)
if getattr(_inverted_index, '_needs_rebuild', False):
_inverted_index.rebuild()
_inverted_index._needs_rebuild = False
elif _inverted_index.doc_count == 0 and any(
vdata.get("files") for vdata in index.values()
):
_inverted_index.rebuild()
return _inverted_index
```
### Step 5: Call `init_inverted_index()` from `build_index` in main.py
In `backend/main.py`, after `build_index()` completes in the lifespan handler, call:
```python
from backend.search import init_inverted_index
init_inverted_index()
```
This ensures the inverted index is built once on startup, then incrementally maintained thereafter.
### Tag prefix index handling
The `tag_norm_map` and `tag_prefix_index` are built per-vault in `rebuild()`. For incremental updates, we need to handle tag changes:
In `add_document`, after adding doc tags:
```python
# Check if any tags are new (not in tag_norm_map)
for tag in tags:
norm_tag = normalize_text(tag)
if norm_tag not in self.tag_norm_map:
self.tag_norm_map[norm_tag] = tag
for plen in range(MIN_PREFIX_LENGTH, len(norm_tag) + 1):
prefix = norm_tag[:plen]
if tag not in self.tag_prefix_index[prefix]:
self.tag_prefix_index[prefix].append(tag)
```
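In `_remove_doc_internals`, we do NOT remove tags from `tag_norm_map` or `tag_prefix_index` — these describe the shared tag vocabulary, not any single document, so one document's removal cannot tell us whether a tag is still in use elsewhere. Between rebuilds they only grow, which means a tag whose last document was removed can still appear in prefix lookups until the next full `rebuild()` (e.g. one triggered by a manual reindex) cleans them up.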
### Step 6: Remove cooldown hack from search.py
Remove:
- `_last_rebuild` and `_rebuild_cooldown` fields from `InvertedIndex.__init__`
- `is_stale()` method
- `_source_generation` field's role in staleness checks (no longer needed for that purpose; keep the field itself only if it is still wanted for diagnostics)
### Step 7: Remove coalescence hack from main.py
In `_on_vault_change` in `backend/main.py`, remove:
```python
old_gen = idx._index_generation
...
if idx._index_generation > old_gen + 1:
idx._index_generation = old_gen + 1
```
### 5c. Backend: Diff endpoint
This hack was only needed to reduce the number of inverted index rebuilds. With incremental updates, it's unnecessary — each mutation is cheap.
```python
@app.get("/api/conflicts/diff")
async def api_conflict_diff(vault: str, original: str, conflict: str, current_user=Depends(require_auth)):
"""Return unified diff between original and conflict file."""
import difflib
...
```
## Files Modified (Summary)
### 5d. Frontend: Conflict dashboard widget
| File | Changes |
|------|---------|
| `backend/indexer.py` | +`_on_index_change` hook variable, +`set_index_change_hook()`, +hook calls in `_add_file_to_structures` and `_remove_file_from_structures` |
| `backend/search.py` | +`add_document()`, +`remove_document()`, +`_remove_doc_internals()`, +`init_inverted_index()`, +hook registration, remove `is_stale()`/cooldown, simplify `get_inverted_index()` |
| `backend/main.py` | +`init_inverted_index()` call after `build_index()`, remove coalescence hack in `_on_vault_change` |
**File:** `frontend/index.html` — add `#dashboard-conflicts-section` in dashboard after stats section.
## Risks & Edge Cases
**File:** `frontend/app.js` — add `DashboardConflictsWidget` (pattern similar to recent/bookmarks):
- `load()` → `GET /api/conflicts`
- `render()` → shows conflict cards with file names and dates
- Click → opens diff modal showing side-by-side comparison
- Action buttons: "Garder l'original", "Garder le conflit"
1. **Thread safety:** `_add_file_to_structures` and `_remove_file_from_structures` are protected by `_index_lock` / `_async_index_lock` in indexer.py. The InvertedIndex methods are called inside these locks, so they're also protected. No additional locking needed.
**File:** `frontend/style.css` — add `.conflict-card`, `.conflict-diff`, `.conflict-actions` styles.
2. **Hook registration timing:** `search.py` imports `indexer.py` at the top, then later registers the hook. The hook is registered at module load time, BEFORE the first call to `build_index`. So `_on_index_change` is set when `build_index` runs — but `build_index` calls `_add_file_to_structures` internally, which would try to incrementally update an empty inverted index. **Fix:** The hook checks `if _inverted_index.doc_count == 0` and skips incremental updates; the initial `rebuild()` handles the bulk load.
**Dependency:** None. Standalone.
3. **Hook call during initial build_index:** `build_index` iterates files and calls `_add_file_to_structures`. The hook fires for each file, calling `add_document()` on an empty inverted index. This is slower than a single `rebuild()`. **Fix:** Add a flag `_inverted_index._ready = False` initially, set to True after `init_inverted_index()`. The hook skips when `_ready` is False.
---
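4. **Sorted tokens performance:** `bisect.insort` and `list.pop(idx)` are O(V) worst case for large V, because the list elements after the insertion/removal point must be shifted. For 40k files, the vocabulary size V is typically 50k-200k tokens. Shifting that many list slots is a single contiguous memory move and should stay well under a millisecond per insertion, which is acceptable (measure if V grows much larger). The rebuild() call at startup handles the initial bulk.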
## Execution Order (optimal)
1. **Item 1** — OpenAPI docs (quick win, no risk)
2. **Item 2** — Dashboard stats (standalone, visible result)
3. **Item 3** — Webhooks (new module + integration, most code)
4. **Item 4** — Public shares (new module + public view, security-sensitive)
5. **Item 5** — Syncthing conflicts (standalone, nice-to-have)
**Total estimated effort:** ~3 hours
## Files Summary
| File | Action | Items |
|------|--------|-------|
| `backend/main.py` | Edit | 1 (models), 2a (endpoint), 3b+c (webhook CRUD+dispatch), 4b (share+public view), 5b+c (conflicts) |
| `backend/webhooks.py` | **Create** | 3a |
| `backend/share.py` | **Create** | 4a |
| `backend/indexer.py` | Edit | 5a (get_conflicts) |
| `frontend/index.html` | Edit | 2b, 3d, 4d, 5d (dashboard + config sections) |
| `frontend/app.js` | Edit | 2b, 3d, 4c, 5d (widgets + share button + webhook UI) |
| `frontend/style.css` | Edit | 2b, 3d, 4c, 5d (all new CSS classes) |
5. **tag_norm_map / tag_prefix_index growth:** These grow monotonically (never shrink on incremental remove). With 40k files and thousands of tags, this is a few thousand entries — negligible. A manual "Réindexer" button triggers a full `rebuild()` to clean up.