Add fallback parser for markdown files with invalid YAML frontmatter to prevent indexing failures
This commit is contained in:
parent
2ed5f65a7a
commit
1213eb4781
@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import Dict, List, Optional, Any
|
from typing import Dict, List, Optional, Any
|
||||||
@ -62,6 +63,20 @@ def _extract_title(post: frontmatter.Post, filepath: Path) -> str:
|
|||||||
return str(title)
|
return str(title)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_markdown_file(raw: str) -> frontmatter.Post:
    """Parse a markdown document, tolerating invalid frontmatter.

    The text is first handed to ``frontmatter.loads``. If that raises, the
    document is treated as plain markdown: a leading ``---`` ... ``---``
    block (if one can be found) is removed and the remainder is returned as
    a ``Post`` carrying no metadata, so one malformed file cannot abort
    indexing or a file request.

    Args:
        raw: Full text of the markdown file.

    Returns:
        The parsed ``Post`` on success; otherwise a ``Post`` built from the
        body alone, with no metadata passed in.
    """
    try:
        return frontmatter.loads(raw)
    except Exception as exc:
        # Deliberately broad: this is a best-effort boundary, any parser
        # failure should degrade to plain markdown instead of propagating.
        logger.warning(
            "Invalid frontmatter detected, falling back to plain markdown parsing: %s",
            exc,
        )

    # Ignore leading blank lines/whitespace when looking for the opening
    # delimiter, otherwise the broken block would be left in the content.
    # NOTE(review): assumes frontmatter.loads also strips before detecting
    # frontmatter -- confirm against the library.
    text = raw.lstrip()
    if text.startswith("---"):
        # Both delimiters must occupy a whole line (trailing spaces/tabs
        # allowed). Anchoring the closing one with ^...(newline or end of
        # text) keeps a body line such as "---abc" or "----" from being
        # mistaken for the end of the block.
        block = re.match(
            r"---[ \t]*\r?\n.*?^---[ \t]*(?:\r?\n|\Z)",
            text,
            flags=re.DOTALL | re.MULTILINE,
        )
        if block:
            # Drop blank lines after the block but keep the indentation of
            # the first real content line.
            return frontmatter.Post(text[block.end():].lstrip("\r\n"))

    # No recognizable frontmatter block: keep the document untouched.
    return frontmatter.Post(raw)
|
||||||
|
|
||||||
|
|
||||||
def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
|
def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
|
||||||
"""Synchronously scan a single vault directory."""
|
"""Synchronously scan a single vault directory."""
|
||||||
vault_root = Path(vault_path)
|
vault_root = Path(vault_path)
|
||||||
@ -96,7 +111,7 @@ def _scan_vault(vault_name: str, vault_path: str) -> Dict[str, Any]:
|
|||||||
content_preview = raw[:200].strip()
|
content_preview = raw[:200].strip()
|
||||||
|
|
||||||
if ext == ".md":
|
if ext == ".md":
|
||||||
post = frontmatter.loads(raw)
|
post = parse_markdown_file(raw)
|
||||||
tags = _extract_tags(post)
|
tags = _extract_tags(post)
|
||||||
title = _extract_title(post, fpath)
|
title = _extract_title(post, fpath)
|
||||||
content_preview = post.content[:200].strip()
|
content_preview = post.content[:200].strip()
|
||||||
|
|||||||
@ -16,6 +16,7 @@ from backend.indexer import (
|
|||||||
index,
|
index,
|
||||||
get_vault_data,
|
get_vault_data,
|
||||||
find_file_in_index,
|
find_file_in_index,
|
||||||
|
parse_markdown_file,
|
||||||
SUPPORTED_EXTENSIONS,
|
SUPPORTED_EXTENSIONS,
|
||||||
)
|
)
|
||||||
from backend.search import search, get_all_tags
|
from backend.search import search, get_all_tags
|
||||||
@ -211,7 +212,7 @@ async def api_file(vault_name: str, path: str = Query(..., description="Relative
|
|||||||
ext = file_path.suffix.lower()
|
ext = file_path.suffix.lower()
|
||||||
|
|
||||||
if ext == ".md":
|
if ext == ".md":
|
||||||
post = frontmatter.loads(raw)
|
post = parse_markdown_file(raw)
|
||||||
|
|
||||||
# Extract metadata
|
# Extract metadata
|
||||||
tags = post.metadata.get("tags", [])
|
tags = post.metadata.get("tags", [])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user