Add fallback parser for markdown files with invalid YAML frontmatter to prevent indexing failures

This commit is contained in:
Bruno Charest 2026-03-21 11:20:03 -04:00
parent 2ed5f65a7a
commit 1213eb4781
2 changed files with 18 additions and 2 deletions

View File

@ -1,6 +1,7 @@
import os import os
import asyncio import asyncio
import logging import logging
import re
from pathlib import Path from pathlib import Path
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Dict, List, Optional, Any from typing import Dict, List, Optional, Any
@ -62,6 +63,20 @@ def _extract_title(post: frontmatter.Post, filepath: Path) -> str:
return str(title) return str(title)
# Leading ``---`` fenced block. The closing fence must be alone on its line
# (optionally followed by horizontal whitespace) so that lines which merely
# start with ``---`` inside the broken front matter are not mistaken for it.
# Any whitespace following the closing fence is swallowed as well.
_INVALID_FRONTMATTER_RE = re.compile(
    r"^---\s*\r?\n.*?\r?\n---[^\S\r\n]*(?=\r?\n|\Z)\s*",
    flags=re.DOTALL,
)


def parse_markdown_file(raw: str) -> frontmatter.Post:
    """Parse a markdown document, tolerating invalid front matter.

    The text is first handed to ``frontmatter.loads``. If that raises, a
    warning is logged and the document is returned as a ``frontmatter.Post``
    with empty metadata. In that case a leading ``---`` ... ``---`` block is
    removed from the content when one can be located; otherwise the content
    is ``raw`` unchanged.

    Args:
        raw: Full text of the markdown file.

    Returns:
        The parsed post, or a metadata-less post built from the plain content
        when the front matter could not be parsed.
    """
    try:
        return frontmatter.loads(raw)
    except Exception as exc:
        # Deliberately broad: a single malformed file must not abort the
        # caller, whatever error the front matter parser raised.
        logger.warning(
            "Invalid frontmatter detected, falling back to plain markdown parsing: %s",
            exc,
        )

    # The pattern is anchored on a leading "---", so documents that do not
    # start with a fence simply do not match and are kept whole.
    match = _INVALID_FRONTMATTER_RE.match(raw)
    content = raw[match.end():] if match else raw
    return frontmatter.Post(content)
def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]: def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
"""Synchronously scan a single vault directory.""" """Synchronously scan a single vault directory."""
vault_root = Path(vault_path) vault_root = Path(vault_path)
@ -96,7 +111,7 @@ def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
content_preview = raw[:200].strip() content_preview = raw[:200].strip()
if ext == ".md": if ext == ".md":
post = frontmatter.loads(raw) post = parse_markdown_file(raw)
tags = _extract_tags(post) tags = _extract_tags(post)
title = _extract_title(post, fpath) title = _extract_title(post, fpath)
content_preview = post.content[:200].strip() content_preview = post.content[:200].strip()

View File

@ -16,6 +16,7 @@ from backend.indexer import (
index, index,
get_vault_data, get_vault_data,
find_file_in_index, find_file_in_index,
parse_markdown_file,
SUPPORTED_EXTENSIONS, SUPPORTED_EXTENSIONS,
) )
from backend.search import search, get_all_tags from backend.search import search, get_all_tags
@ -211,7 +212,7 @@ async def api_file(vault_name: str, path: str = Query(..., description="Relative
ext = file_path.suffix.lower() ext = file_path.suffix.lower()
if ext == ".md": if ext == ".md":
post = frontmatter.loads(raw) post = parse_markdown_file(raw)
# Extract metadata # Extract metadata
tags = post.metadata.get("tags", []) tags = post.metadata.get("tags", [])