Add fallback parser for markdown files with invalid YAML frontmatter to prevent indexing failures

This commit is contained in:
Bruno Charest 2026-03-21 11:20:03 -04:00
parent 2ed5f65a7a
commit 1213eb4781
2 changed files with 18 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
import os
import asyncio
import logging
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
@@ -62,6 +63,20 @@ def _extract_title(post: frontmatter.Post, filepath: Path) -> str:
return str(title)
# Matches one leading frontmatter block: an opening ``---`` line, the raw
# (possibly invalid) metadata, and a closing ``---`` line. Each fence must
# fill its whole line (only trailing spaces/tabs allowed), so a line such as
# ``---note`` inside the metadata is not mistaken for the closing fence, and
# the indentation of the first body line is left untouched.
_FRONTMATTER_BLOCK_RE = re.compile(
    r"\A\s*---[ \t]*\r?\n.*?\r?\n---[ \t]*(?:\r?\n|\Z)",
    flags=re.DOTALL,
)


def parse_markdown_file(raw: str) -> frontmatter.Post:
    """Parse markdown frontmatter, falling back to plain content if it is invalid.

    Args:
        raw: Full text of the markdown file, including any frontmatter block.

    Returns:
        The post produced by ``frontmatter.loads`` when parsing succeeds.
        Otherwise a post built with no metadata whose content is ``raw`` with
        the leading ``---`` ... ``---`` block removed; if no such block is
        found, the content is ``raw`` unchanged.
    """
    try:
        return frontmatter.loads(raw)
    except Exception as exc:
        # Deliberately broad: this is a best-effort path whose purpose is to
        # keep one malformed file from aborting the caller, whatever error the
        # frontmatter parser raises.
        logger.warning(
            "Invalid frontmatter detected, falling back to plain markdown parsing: %s",
            exc,
        )

    # Leading whitespace before the opening fence is tolerated.
    # NOTE(review): python-frontmatter appears to strip surrounding whitespace
    # before detecting the fence, so such files can reach this path — confirm.
    match = _FRONTMATTER_BLOCK_RE.match(raw)
    content = raw[match.end():] if match else raw
    return frontmatter.Post(content)
def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
"""Synchronously scan a single vault directory."""
vault_root = Path(vault_path)
@@ -96,7 +111,7 @@ def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
content_preview = raw[:200].strip()
if ext == ".md":
post = frontmatter.loads(raw)
post = parse_markdown_file(raw)
tags = _extract_tags(post)
title = _extract_title(post, fpath)
content_preview = post.content[:200].strip()

View File

@@ -16,6 +16,7 @@ from backend.indexer import (
index,
get_vault_data,
find_file_in_index,
parse_markdown_file,
SUPPORTED_EXTENSIONS,
)
from backend.search import search, get_all_tags
@@ -211,7 +212,7 @@ async def api_file(vault_name: str, path: str = Query(..., description="Relative
ext = file_path.suffix.lower()
if ext == ".md":
post = frontmatter.loads(raw)
post = parse_markdown_file(raw)
# Extract metadata
tags = post.metadata.get("tags", [])