ObsiGate/backend/image_processor.py

199 lines
7.6 KiB
Python

import re
import logging
from pathlib import Path
from typing import Optional, Tuple
from html import escape as html_escape
from backend.attachment_indexer import resolve_image_path
logger = logging.getLogger("obsigate.image_processor")
def preprocess_images(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path] = None,
    attachments_path: Optional[str] = None
) -> str:
    """Rewrite every supported image syntax in a markdown document.

    The content is passed through three rewriting stages, each of which
    receives the output of the previous one:

    1. ``[<img ... src="path"/>](url)`` - HTML image wrapped in a markdown link.
    2. ``![[path/to/image.ext]]`` / ``![[image.ext]]`` - Obsidian wiki-link
       embeds, with or without a folder prefix.
    3. ``![alt](path)`` - standard markdown images.

    Each stage resolves image paths and points them at the ``/api/image``
    endpoint.

    Args:
        content: Raw markdown content.
        vault_name: Name of the vault.
        vault_root: Absolute path to vault root.
        current_file_path: Absolute path to the current markdown file.
        attachments_path: Optional configured attachments folder.

    Returns:
        The markdown after all three stages have been applied.
    """
    # Stage order is significant: most specific syntax first so that a later,
    # broader pattern does not claim text meant for an earlier one.
    stages = (
        _process_html_img_in_link,   # [<img ... src="path"/>](url)
        _process_wikilink_embeds,    # ![[image]]
        _process_standard_images,    # ![alt](path)
    )
    for stage in stages:
        content = stage(
            content, vault_name, vault_root, current_file_path, attachments_path
        )
    return content
def _process_html_img_in_link(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process [<img ... src="path"/>](url) syntax.

    Transforms to:
    <a href="url"><img src="/api/image/vault?path=resolved" width="..." height="..."/></a>

    Occurrences are returned unchanged when the <img> tag has no ``src``
    attribute or when ``src`` is an external reference (``http(s)://``,
    protocol-relative ``//`` or a ``data:`` URI) - those are not vault files,
    so sending them through the resolver would only produce a bogus
    "image not found" placeholder.

    Args:
        content: Markdown text to scan.
        vault_name: Name of the vault; becomes a path segment of the API URL.
        vault_root: Vault root; resolved paths are made relative to it when
            possible.
        current_file_path: Path of the markdown file being rendered, forwarded
            to ``resolve_image_path``.
        attachments_path: Configured attachments folder, forwarded to
            ``resolve_image_path``.

    Returns:
        The content with every matching occurrence replaced by an ``<a>``
        element wrapping either the rewritten ``<img>`` or an
        "image not found" placeholder ``<span>``.
    """
    # Function-scope import: the module header does not import urllib.
    from urllib.parse import quote

    # Pattern: [<img ... src="path" ... />](url)
    pattern = r'\[<img\s+([^>]*?)\s*/?>\]\(([^)]+)\)'

    def find_attr(name, attrs):
        """Return the quoted value of attribute *name* in *attrs*, or None."""
        # The look-behind stops "src" from matching inside "data-src"
        # (likewise "width" inside "data-width", ...).
        found = re.search(
            r'(?<![\w-])' + name + r'\s*=\s*["\']([^"\']+)["\']', attrs
        )
        return found.group(1) if found else None

    def replace_html_img(match):
        img_attrs = match.group(1)
        link_url = match.group(2)

        src_path = find_attr('src', img_attrs)
        if not src_path:
            return match.group(0)  # No src, return unchanged

        # Same policy as the standard-image processor: external sources are
        # left exactly as written.
        if re.match(r'^(https?://|//|data:)', src_path, flags=re.IGNORECASE):
            return match.group(0)

        width = find_attr('width', img_attrs)
        height = find_attr('height', img_attrs)

        resolved_path = resolve_image_path(
            src_path, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            # Image not found - show placeholder
            placeholder = f'<span class="image-not-found" title="Image not found: {html_escape(src_path)}">[image not found: {html_escape(Path(src_path).name)}]</span>'
            return f'<a href="{html_escape(link_url)}">{placeholder}</a>'

        try:
            url_path = resolved_path.relative_to(vault_root)
        except ValueError:
            # Path is outside vault - fall back to the full resolved path
            url_path = resolved_path

        # Percent-encode so that quotes, '&', '#', spaces etc. in a file name
        # can neither break out of the src="..." attribute nor corrupt the
        # query string. chr(92) is the backslash (Windows separators).
        vault_segment = quote(vault_name, safe="")
        query_path = quote(str(url_path).replace(chr(92), "/"), safe="/")
        api_url = f'/api/image/{vault_segment}?path={query_path}'

        # Build img tag with attributes
        img_tag = f'<img src="{api_url}"'
        if width:
            img_tag += f' width="{html_escape(width)}"'
        if height:
            img_tag += f' height="{html_escape(height)}"'
        img_tag += ' />'

        # Wrap in link
        return f'<a href="{html_escape(link_url)}">{img_tag}</a>'

    return re.sub(pattern, replace_html_img, content)
def _process_wikilink_embeds(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Rewrite Obsidian ``![[image.ext]]`` / ``![[path/to/image.ext]]`` embeds.

    Only targets ending in one of the listed image extensions (matched
    case-insensitively) are touched. A resolved embed becomes a standard
    markdown image, ``![<file stem>](<api url>)``; an unresolved one becomes
    an "image not found" placeholder ``<span>``.

    Args:
        content: Markdown text to scan.
        vault_name: Name of the vault, used in the API URL.
        vault_root: Vault root; resolved paths are made relative to it when
            possible.
        current_file_path: Path of the markdown file being rendered, forwarded
            to ``resolve_image_path``.
        attachments_path: Configured attachments folder, forwarded to
            ``resolve_image_path``.

    Returns:
        The content with every matching embed replaced.
    """
    embed_re = re.compile(
        r'!\[\[([^\]]+?\.(?:png|jpg|jpeg|gif|svg|webp|bmp|ico))\]\]',
        re.IGNORECASE,
    )

    def render_embed(hit):
        target = hit.group(1).strip()
        located = resolve_image_path(
            target, vault_name, vault_root, current_file_path, attachments_path
        )

        # Guard clause: nothing resolved, emit the placeholder and stop.
        if not located:
            return f'<span class="image-not-found" title="Image not found: {html_escape(target)}">[image not found: {html_escape(Path(target).name)}]</span>'

        try:
            shown = located.relative_to(vault_root)
        except ValueError:
            # Resolved file is not under the vault root; use the path as is.
            shown = located

        # chr(92) is the backslash: normalise Windows separators for the URL.
        url = f'/api/image/{vault_name}?path={str(shown).replace(chr(92), "/")}'
        return f'![{Path(target).stem}]({url})'

    return embed_re.sub(render_embed, content)
def _process_standard_images(
    content: str,
    vault_name: str,
    vault_root: Path,
    current_file_path: Optional[Path],
    attachments_path: Optional[str]
) -> str:
    """Process ![alt](path) standard markdown images.

    Resolves the path and updates it to use the /api/image endpoint. Only
    destinations ending in one of the listed image extensions (matched
    case-insensitively) are considered.

    Left unchanged:
      * external URLs (``http://``, ``https://``, ``//``);
      * destinations that already point at ``/api/image/`` - e.g. the
        markdown emitted for wiki-link embeds - so an already-resolved image
        is not sent through the resolver a second time.

    Args:
        content: Markdown text to scan.
        vault_name: Name of the vault; becomes a path segment of the API URL.
        vault_root: Vault root; resolved paths are made relative to it when
            possible.
        current_file_path: Path of the markdown file being rendered, forwarded
            to ``resolve_image_path``.
        attachments_path: Configured attachments folder, forwarded to
            ``resolve_image_path``.

    Returns:
        The content with each local image rewritten to ``![alt](<api url>)``,
        or to an "image not found" placeholder ``<span>`` when it cannot be
        resolved.
    """
    # Function-scope import: the module header does not import urllib.
    from urllib.parse import quote

    # Pattern: ![alt](path) - the path may contain spaces, parentheses and
    # emojis. The lazy ".+?" ends the path at the FIRST "<ext>)" so that two
    # images on the same line are matched separately instead of being merged
    # into one bogus path.
    pattern = r'!\[([^\]]*)\]\((.+?\.(?:png|jpg|jpeg|gif|svg|webp|bmp|ico))\)'

    def replace_standard_img(match):
        alt_text = match.group(1)
        image_path = match.group(2).strip()

        # Skip if it's already an absolute URL (http://, https://, //)
        if re.match(r'^(https?://|//)', image_path, flags=re.IGNORECASE):
            return match.group(0)  # Keep external URLs unchanged

        # Already rewritten to the image endpoint: nothing left to resolve.
        if image_path.startswith('/api/image/'):
            return match.group(0)

        resolved_path = resolve_image_path(
            image_path, vault_name, vault_root, current_file_path, attachments_path
        )
        if not resolved_path:
            # Image not found - show placeholder
            return f'<span class="image-not-found" title="Image not found: {html_escape(image_path)}">[image not found: {html_escape(Path(image_path).name)}]</span>'

        try:
            url_path = resolved_path.relative_to(vault_root)
        except ValueError:
            # Resolved file is not under the vault root; use the path as is.
            url_path = resolved_path

        # Percent-encode: raw spaces or parentheses are not valid inside a
        # markdown link destination, and '&' / '#' would corrupt the query
        # string. chr(92) is the backslash (Windows separators).
        vault_segment = quote(vault_name, safe="")
        query_path = quote(str(url_path).replace(chr(92), "/"), safe="/")
        api_url = f'/api/image/{vault_segment}?path={query_path}'
        return f'![{alt_text}]({api_url})'

    return re.sub(pattern, replace_standard_img, content, flags=re.IGNORECASE)