diff --git a/backend/indexer.py b/backend/indexer.py
index 90b6d8e..18ce1ee 100644
--- a/backend/indexer.py
+++ b/backend/indexer.py
@@ -23,6 +23,9 @@ _index_lock = threading.Lock()
 # O(1) lookup table for wikilink resolution: {filename_lower: [{vault, path}, ...]}
 _file_lookup: Dict[str, List[Dict[str, str]]] = {}
 
+# O(1) path index for tree filtering: {vault_name: [{path, name, type}, ...]}
+path_index: Dict[str, List[Dict[str, str]]] = {}
+
 # Maximum content size stored per file for in-memory search (bytes)
 SEARCH_CONTENT_LIMIT = 100_000
 
@@ -162,28 +165,50 @@ def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
         vault_path: Absolute filesystem path to the vault root.
 
     Returns:
-        Dict with keys ``files`` (list), ``tags`` (counter dict), ``path`` (str).
+        Dict with keys ``files`` (list), ``tags`` (counter dict), ``path`` (str), ``paths`` (list).
     """
     vault_root = Path(vault_path)
     files: List[Dict[str, Any]] = []
     tag_counts: Dict[str, int] = {}
+    paths: List[Dict[str, str]] = []
 
     if not vault_root.exists():
         logger.warning(f"Vault path does not exist: {vault_path}")
-        return {"files": [], "tags": {}, "path": vault_path}
+        return {"files": [], "tags": {}, "path": vault_path, "paths": []}
 
     for fpath in vault_root.rglob("*"):
-        if not fpath.is_file():
-            continue
-        # Skip hidden files and files inside hidden directories
+        # Skip hidden files and directories
         rel_parts = fpath.relative_to(vault_root).parts
         if any(part.startswith(".") for part in rel_parts):
             continue
+
+        rel_path_str = str(fpath.relative_to(vault_root)).replace("\\", "/")
+
+        # Add all paths (files and directories) to path index
+        if fpath.is_dir():
+            paths.append({
+                "path": rel_path_str,
+                "name": fpath.name,
+                "type": "directory"
+            })
+            continue
+
+        # Files only from here
+        if not fpath.is_file():
+            continue
 
         ext = fpath.suffix.lower()
         # Also match extensionless files named like Dockerfile, Makefile
         basename_lower = fpath.name.lower()
         if ext not in SUPPORTED_EXTENSIONS and basename_lower not in ("dockerfile", "makefile", "cmakelists.txt"):
             continue
+
+        # Add file to path index
+        paths.append({
+            "path": rel_path_str,
+            "name": fpath.name,
+            "type": "file"
+        })
+
         try:
             relative = fpath.relative_to(vault_root)
             stat = fpath.stat()
@@ -225,8 +250,8 @@ def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
             logger.error(f"Error indexing {fpath}: {e}")
             continue
 
-    logger.info(f"Vault '{vault_name}': indexed {len(files)} files, {len(tag_counts)} unique tags")
-    return {"files": files, "tags": tag_counts, "path": vault_path}
+    logger.info(f"Vault '{vault_name}': indexed {len(files)} files, {len(paths)} paths, {len(tag_counts)} unique tags")
+    return {"files": files, "tags": tag_counts, "path": vault_path, "paths": paths}
 
 
 async def build_index() -> None:
@@ -265,12 +290,19 @@ async def build_index() -> None:
                 new_lookup[key] = []
             new_lookup[key].append(entry)
 
+    # Build path index for tree filtering
+    new_path_index: Dict[str, List[Dict[str, str]]] = {}
+    for vname, vdata in new_index.items():
+        new_path_index[vname] = vdata.get("paths", [])
+
     # Atomic swap under lock for thread safety during concurrent reads
     with _index_lock:
         index.clear()
         index.update(new_index)
         _file_lookup.clear()
         _file_lookup.update(new_lookup)
+        path_index.clear()
+        path_index.update(new_path_index)
 
     total_files = sum(len(v["files"]) for v in index.values())
     logger.info(f"Index built: {len(index)} vaults, {total_files} total files")
diff --git a/backend/main.py b/backend/main.py
index ea36011..dab5cff 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -16,6 +16,7 @@ from backend.indexer import (
     build_index,
     reload_index,
     index,
+    path_index,
     get_vault_data,
     get_vault_names,
     find_file_in_index,
@@ -610,65 +611,40 @@ async def api_tree_search(
     q: str = Query("", description="Search query"),
     vault: str = Query("all", description="Vault filter"),
 ):
-    """Search for files and directories in the tree structure.
+    """Search for files and directories in the tree structure using pre-built index.
 
-    Searches through the file index for matching paths, returning
-    both files and their parent directories that match the query.
+    Uses the in-memory path index for instant filtering without filesystem access.
 
     Args:
         q: Search string to match against file/directory paths.
         vault: Vault name or "all" to search everywhere.
 
     Returns:
-        ``TreeSearchResponse`` with matching paths and their parent directories.
+        ``TreeSearchResponse`` with matching paths.
     """
     if not q:
         return {"query": q, "vault_filter": vault, "results": []}
 
     query_lower = q.lower()
     results = []
-    seen_paths = set()  # Avoid duplicates
 
-    vaults_to_search = [vault] if vault != "all" else list(index.keys())
+    vaults_to_search = [vault] if vault != "all" else list(path_index.keys())
 
     for vault_name in vaults_to_search:
-        vault_data = get_vault_data(vault_name)
-        if not vault_data:
-            continue
-
-        vault_root = Path(vault_data["path"])
-        if not vault_root.exists():
-            continue
+        vault_paths = path_index.get(vault_name, [])
 
-        for fpath in vault_root.rglob("*"):
-            if fpath.name.startswith("."):
-                continue
-
-            try:
-                rel_path = str(fpath.relative_to(vault_root)).replace("\\", "/")
-                path_lower = rel_path.lower()
-                name_lower = fpath.name.lower()
-
-                if query_lower not in name_lower and query_lower not in path_lower:
-                    continue
-
-                entry_type = "directory" if fpath.is_dir() else "file"
-                entry_key = f"{vault_name}:{entry_type}:{rel_path}"
-                if entry_key in seen_paths:
-                    continue
-
-                seen_paths.add(entry_key)
+        for entry in vault_paths:
+            path_lower = entry["path"].lower()
+            name_lower = entry["name"].lower()
+
+            if query_lower in name_lower or query_lower in path_lower:
                 results.append({
                     "vault": vault_name,
-                    "path": rel_path,
-                    "name": fpath.name,
-                    "type": entry_type,
-                    "matched_path": rel_path,
+                    "path": entry["path"],
+                    "name": entry["name"],
+                    "type": entry["type"],
+                    "matched_path": entry["path"],
                 })
-            except PermissionError:
-                continue
-            except Exception:
-                continue
 
     return {"query": q, "vault_filter": vault, "results": results}
 