# ObsiGate/backend/image_processor.py

import logging
import re
from html import escape as html_escape
from pathlib import Path
from typing import Optional, Tuple
from urllib.parse import quote

from backend.attachment_indexer import resolve_image_path

logger = logging.getLogger("obsigate.image_processor")


def preprocess_images(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path] = None,
    attachments_path: Optional[str] = None
) -> str:
    """Rewrite every supported image syntax in a markdown document.

    Three passes run one after another, each receiving the output of the
    previous one:

    1. ``[<img ... src="path"/>](url)`` - HTML ``<img>`` wrapped in a link
    2. ``![[path/to/image.ext]]`` / ``![[image.ext]]`` - Obsidian embeds
    3. ``![alt](path)`` - standard markdown images

    The order goes from the most specific syntax to the most general one so
    that an earlier pass is never pre-empted by a later, broader pattern.

    Args:
        content: Raw markdown content.
        vault_name: Name of the vault.
        vault_root: Absolute path to vault root.
        current_file_path: Absolute path to the current markdown file.
        attachments_path: Optional configured attachments folder.

    Returns:
        The markdown text after all three passes have been applied.
    """
    # Most specific syntax first, most general last.
    passes = (
        _process_html_img_in_link,
        _process_wikilink_embeds,
        _process_standard_images,
    )

    rewritten = content
    for run_pass in passes:
        rewritten = run_pass(
            rewritten, vault_name, vault_root, current_file_path, attachments_path
        )
    return rewritten


def _process_html_img_in_link(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ``[<img ... src="path"/>](url)`` syntax.

    A match whose ``src`` names a local image is rewritten to
    ``<a href="url"><img src="/api/image/<vault>?path=<resolved>" width="..." height="..."/></a>``.
    Only the ``width`` and ``height`` attributes of the original tag are
    carried over. When the image cannot be resolved, the link is kept but
    wraps an ``image-not-found`` placeholder ``<span>`` instead.

    A match is returned unchanged when:

    * the tag has no ``src`` attribute, or
    * ``src`` is remote (``http://``, ``https://``, ``//``) or a ``data:``
      URI - such values are not files in the vault, so they are left alone,
      the same way standard markdown images keep external URLs.

    Args:
        content: Markdown text to scan.
        vault_name: Name of the vault; becomes a path segment of the API URL.
        vault_root: Vault root used to make the resolved path relative.
        current_file_path: Forwarded to ``resolve_image_path``.
        attachments_path: Forwarded to ``resolve_image_path``.

    Returns:
        ``content`` with every matching construct rewritten.
    """
    # Pattern: [<img ... src="path" ... />](url)
    pattern = r'\[<img\s+([^>]*?)\s*/?>\]\(([^)]+)\)'

    def attr_value(attrs, name):
        """Return the quoted value of attribute *name*, or None if absent."""
        found = re.search(name + r'\s*=\s*["\']([^"\']+)["\']', attrs)
        return found.group(1) if found else None

    def replace_html_img(match):
        img_attrs = match.group(1)
        link_url = match.group(2)

        src_path = attr_value(img_attrs, 'src')
        if src_path is None:
            return match.group(0)  # No src, return unchanged

        # Remote images and data URIs are not vault files: do not send them
        # through the filesystem resolver (which would report "not found").
        if re.match(r'(?:https?:)?//|data:', src_path, re.IGNORECASE):
            return match.group(0)

        resolved_path = resolve_image_path(
            src_path, vault_name, vault_root, current_file_path, attachments_path
        )

        if not resolved_path:
            # Image not found - show placeholder
            placeholder = f'<span class="image-not-found" title="Image not found: {html_escape(src_path)}">[image not found: {html_escape(Path(src_path).name)}]</span>'
            return f'<a href="{html_escape(link_url)}">{placeholder}</a>'

        try:
            served_path = resolved_path.relative_to(vault_root)
        except ValueError:
            # Path is outside the vault - fall back to the full path
            served_path = resolved_path

        # Percent-encode both parts so characters such as '&', '#', '+', '%'
        # or quotes in a file name can neither corrupt the query string nor
        # break out of the src="..." attribute.
        path_text = str(served_path).replace('\\', '/')
        api_url = f'/api/image/{quote(vault_name, safe="")}?path={quote(path_text, safe="/")}'

        # Build img tag, keeping only the sizing attributes
        img_tag = f'<img src="{api_url}"'
        for attr_name in ('width', 'height'):
            value = attr_value(img_attrs, attr_name)
            if value:
                img_tag += f' {attr_name}="{html_escape(value)}"'
        img_tag += ' />'

        # Wrap in link
        return f'<a href="{html_escape(link_url)}">{img_tag}</a>'

    return re.sub(pattern, replace_html_img, content)


def _build_image_api_url(vault_name: str, vault_root: Path, resolved_path: Path) -> str:
    """Return the percent-encoded ``/api/image`` URL for *resolved_path*.

    The path is made relative to ``vault_root`` when possible; a path outside
    the vault is used in full. Backslashes are normalised to ``/``. Encoding
    keeps spaces, parentheses, ``&``, ``#`` and similar characters from
    breaking either the markdown link destination or the query string.
    """
    try:
        served_path = resolved_path.relative_to(vault_root)
    except ValueError:
        # Path is outside the vault - fall back to the full path
        served_path = resolved_path
    path_text = str(served_path).replace('\\', '/')
    return f'/api/image/{quote(vault_name, safe="")}?path={quote(path_text, safe="/")}'


def _image_not_found_html(target: str) -> str:
    """Return the placeholder ``<span>`` shown for an unresolvable image."""
    return f'<span class="image-not-found" title="Image not found: {html_escape(target)}">[image not found: {html_escape(Path(target).name)}]</span>'


def _process_wikilink_embeds(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ``![[image]]`` and ``![[path/to/image]]`` wiki-link embeds.

    Only embeds whose target ends in a known image extension (matched
    case-insensitively) are touched. A resolvable embed becomes
    ``![<file stem>](/api/image/<vault>?path=<resolved>)`` with a
    percent-encoded URL; an unresolvable one becomes an ``image-not-found``
    placeholder ``<span>``.

    Args:
        content: Markdown text to scan.
        vault_name: Name of the vault; becomes a path segment of the API URL.
        vault_root: Vault root used to make the resolved path relative.
        current_file_path: Forwarded to ``resolve_image_path``.
        attachments_path: Forwarded to ``resolve_image_path``.

    Returns:
        ``content`` with every image embed rewritten.
    """
    # Pattern: ![[path/to/image.ext]] or ![[image.ext]]
    pattern = r'!\[\[([^\]]+?\.(?:png|jpg|jpeg|gif|svg|webp|bmp|ico))\]\]'

    def replace_wikilink(match):
        image_target = match.group(1).strip()

        resolved_path = resolve_image_path(
            image_target, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            return _image_not_found_html(image_target)

        # Transform to a standard markdown image, using the file stem as alt
        api_url = _build_image_api_url(vault_name, vault_root, resolved_path)
        return f'![{Path(image_target).stem}]({api_url})'

    return re.sub(pattern, replace_wikilink, content, flags=re.IGNORECASE)


def _process_standard_images(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ``![alt](path)`` standard markdown images.

    Only destinations ending in a known image extension (matched
    case-insensitively) are considered. A resolvable local path is replaced by
    a percent-encoded ``/api/image`` URL; an unresolvable one becomes an
    ``image-not-found`` placeholder ``<span>``.

    The image is returned unchanged when its destination:

    * is remote (``http://``, ``https://``, ``//``, any letter case) or a
      ``data:`` URI, or
    * already starts with ``/api/image/`` - this is what the wiki-link pass
      emits, and resolving it a second time could only lose or mis-resolve
      the image.

    Args:
        content: Markdown text to scan.
        vault_name: Name of the vault; becomes a path segment of the API URL.
        vault_root: Vault root used to make the resolved path relative.
        current_file_path: Forwarded to ``resolve_image_path``.
        attachments_path: Forwarded to ``resolve_image_path``.

    Returns:
        ``content`` with every local standard image rewritten.
    """
    # The destination may contain spaces, parentheses and emojis, so it is
    # delimited by "<image extension>)" rather than by the first ")".
    # The match is non-greedy so that two images on the same line are
    # handled separately instead of being merged into one bogus path.
    pattern = r'!\[([^\]]*)\]\((.+?\.(?:png|jpg|jpeg|gif|svg|webp|bmp|ico))\)'

    def replace_standard_img(match):
        alt_text = match.group(1)
        image_path = match.group(2).strip()

        # Keep external URLs and data URIs unchanged
        if re.match(r'(?:https?:)?//|data:', image_path, re.IGNORECASE):
            return match.group(0)

        # Already points at the image endpoint - nothing left to resolve
        if image_path.startswith('/api/image/'):
            return match.group(0)

        resolved_path = resolve_image_path(
            image_path, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            return _image_not_found_html(image_path)

        api_url = _build_image_api_url(vault_name, vault_root, resolved_path)
        return f'![{alt_text}]({api_url})'

    return re.sub(pattern, replace_standard_img, content, flags=re.IGNORECASE)