"""Rewrite Obsidian/Markdown image references to the /api/image endpoint."""

import logging
import re
from html import escape as html_escape
from pathlib import Path
from typing import Optional, Tuple
from urllib.parse import quote

from backend.attachment_indexer import resolve_image_path

logger = logging.getLogger("obsigate.image_processor")

# Image file extensions recognised by the wiki-link and standard-image passes.
_IMAGE_EXTENSIONS = r"(?:png|jpg|jpeg|gif|svg|webp|bmp|ico)"

# [<img ...>](url): group 1 is the attribute text of the tag, group 2 the link
# target.
_HTML_IMG_LINK_RE = re.compile(r'\[<img\s+([^>]*?)\s*/?>\]\(([^)]+)\)')

# ![[path/to/image.ext]] or ![[image.ext]]: group 1 is the embed target.
_WIKILINK_EMBED_RE = re.compile(
    r'!\[\[([^\]]+?\.' + _IMAGE_EXTENSIONS + r')\]\]',
    re.IGNORECASE,
)

# ![alt](path.ext): group 1 is the alt text, group 2 the path.  The path may
# contain spaces and parentheses, so it cannot simply exclude ")".  Instead it
# is matched lazily up to the first "<extension>)" and is not allowed to run
# across a "](" boundary, which keeps two images on one line from being
# swallowed as a single match.
_STANDARD_IMAGE_RE = re.compile(
    r'!\[([^\]]*)\]\(((?:(?!\]\().)+?\.' + _IMAGE_EXTENSIONS + r')\)',
    re.IGNORECASE,
)

# Targets that must not go through the vault resolver: external URLs and
# references that an earlier pass already rewrote to the image endpoint.
_EXTERNAL_URL_RE = re.compile(r'^(https?://|//)')
_ALREADY_RESOLVED_RE = re.compile(r'^/api/image/')


def _build_api_url(resolved_path: Path, vault_name: str, vault_root: Path) -> str:
    """Return the /api/image URL for a resolved image path.

    Args:
        resolved_path: Path returned by ``resolve_image_path``.
        vault_name: Name of the vault, used as the URL path segment.
        vault_root: Vault root; the emitted path is relative to it whenever
            ``resolved_path`` lies inside it.

    Returns:
        ``/api/image/<vault>?path=<path>`` with both parts percent-encoded.
    """
    try:
        target = resolved_path.relative_to(vault_root)
    except ValueError:
        # The file is outside the vault: fall back to the path as resolved.
        target = resolved_path
    normalized = str(target).replace("\\", "/")
    # Percent-encode so that spaces, parentheses, '&', '#', '+' and similar
    # characters cannot terminate the markdown link or corrupt the query.
    return (
        f"/api/image/{quote(vault_name, safe='')}"
        f"?path={quote(normalized, safe='/')}"
    )


def _not_found_placeholder(image_ref: str) -> str:
    """Return the text shown in place of an image that could not be resolved."""
    return f'[image not found: {html_escape(Path(image_ref).name)}]'


def _extract_attr(attrs: str, name: str) -> Optional[str]:
    """Return the quoted value of attribute *name* in *attrs*, or ``None``."""
    found = re.search(rf'{name}\s*=\s*["\']([^"\']+)["\']', attrs)
    return found.group(1) if found else None


def preprocess_images(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path] = None,
    attachments_path: Optional[str] = None
) -> str:
    """Preprocess markdown content to transform all Obsidian image syntaxes.

    Handles 4 image syntax formats:

    1. HTML image wrapped in a Markdown link: ``[<img ...>](url)``
    2. Obsidian wiki-link embed with full path: ``![[path/to/image.ext]]``
    3. Obsidian wiki-link embed with filename only: ``![[image.ext]]``
    4. Standard Markdown image: ``![alt](path)``

    Image paths are resolved with ``resolve_image_path`` and rewritten to use
    the ``/api/image`` endpoint.

    Args:
        content: Raw markdown content.
        vault_name: Name of the vault.
        vault_root: Absolute path to vault root.
        current_file_path: Absolute path to the current markdown file.
        attachments_path: Optional configured attachments folder.

    Returns:
        Preprocessed markdown with resolved image paths.
    """
    # Run the passes from most to least specific.  The last pass ignores
    # references that an earlier pass already pointed at /api/image.
    for process in (
        _process_html_img_in_link,
        _process_wikilink_embeds,
        _process_standard_images,
    ):
        content = process(
            content, vault_name, vault_root, current_file_path, attachments_path
        )
    return content


def _process_html_img_in_link(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ``[<img ...>](url)`` syntax.

    Transforms to ``<a href="url"><img src="/api/image/..."></a>``, carrying
    over ``width`` and ``height`` when the original tag has them.  Matches
    whose tag has no ``src``, or whose ``src`` is an external URL, are left
    untouched.

    NOTE(review): the HTML literals of this function were damaged in the
    revision under review (the pattern did not compile and the tag was built
    from an undefined name).  The markup below is a reconstruction; confirm it
    against the intended output, in particular any CSS class that the
    placeholder or tags were meant to carry.
    """

    def replace_html_img(match):
        img_attrs = match.group(1)
        link_url = match.group(2)

        src_path = _extract_attr(img_attrs, "src")
        if src_path is None:
            return match.group(0)  # No src, return unchanged
        if _EXTERNAL_URL_RE.match(src_path):
            # Remote images are not vault files; keep them as written, like
            # the standard-image pass does.
            return match.group(0)

        href = html_escape(link_url)
        resolved_path = resolve_image_path(
            src_path, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            # Image not found - keep the link, show a placeholder inside it.
            return f'<a href="{href}">{_not_found_placeholder(src_path)}</a>'

        api_url = _build_api_url(resolved_path, vault_name, vault_root)
        tag_parts = [f'<img src="{html_escape(api_url)}"']
        for attr_name in ("width", "height"):
            attr_value = _extract_attr(img_attrs, attr_name)
            if attr_value is not None:
                tag_parts.append(f' {attr_name}="{html_escape(attr_value)}"')
        tag_parts.append('>')
        img_tag = "".join(tag_parts)
        return f'<a href="{href}">{img_tag}</a>'

    return _HTML_IMG_LINK_RE.sub(replace_html_img, content)


def _process_wikilink_embeds(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ``![[image]]`` and ``![[path/to/image]]`` wiki-link embeds.

    Transforms to ``![stem](api_url)``, where ``stem`` is the file name of the
    embed target without its extension.  Unresolved embeds are replaced by an
    "image not found" placeholder.
    """

    def replace_wikilink(match):
        image_target = match.group(1).strip()

        resolved_path = resolve_image_path(
            image_target, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            return _not_found_placeholder(image_target)

        api_url = _build_api_url(resolved_path, vault_name, vault_root)
        # Transform to a standard markdown image
        return f'![{Path(image_target).stem}]({api_url})'

    return _WIKILINK_EMBED_RE.sub(replace_wikilink, content)


def _process_standard_images(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ``![alt](path)`` standard markdown images.

    Resolves the path and rewrites it to the ``/api/image`` endpoint, keeping
    the alt text.  External URLs (``http://``, ``https://``, ``//``) and
    references that already point at ``/api/image/`` are left unchanged.
    Unresolved images are replaced by an "image not found" placeholder.
    """

    def replace_standard_img(match):
        alt_text = match.group(1)
        image_path = match.group(2).strip()

        if _EXTERNAL_URL_RE.match(image_path):
            return match.group(0)  # Keep external URLs unchanged
        if _ALREADY_RESOLVED_RE.match(image_path):
            # Produced by an earlier pass; resolving it a second time would
            # treat the endpoint URL as if it were a vault path.
            return match.group(0)

        resolved_path = resolve_image_path(
            image_path, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            return _not_found_placeholder(image_path)

        api_url = _build_api_url(resolved_path, vault_name, vault_root)
        return f'![{alt_text}]({api_url})'

    return _STANDARD_IMAGE_RE.sub(replace_standard_img, content)