From aa2c05b05fba98cddf809089a48611576cb2d470 Mon Sep 17 00:00:00 2001
From: Bruno Charest
Date: Wed, 27 May 2026 08:15:39 -0400
Subject: [PATCH] Add regex search with highlighted snippet support

---
Review notes (ignored by git-am):
 * The highlight tag is assumed to be <mark>; confirm against
   _highlight_terms before applying.
 * Context-line indentation was reconstructed and the blob hashes on the
   "index" lines were not recomputed after the review changes.

 backend/search.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++-
 frontend/app.js   |  4 ++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/backend/search.py b/backend/search.py
index d7fa3ee..5a84b70 100644
--- a/backend/search.py
+++ b/backend/search.py
@@ -167,6 +167,83 @@ def _extract_highlighted_snippet(
     return prefix + highlighted + suffix
 
 
+def _extract_regex_snippet(
+    content: str,
+    pattern_text: str,
+    context_chars: int = SNIPPET_CONTEXT_CHARS,
+    max_highlights: int = MAX_SNIPPET_HIGHLIGHTS,
+) -> str:
+    """Extract a snippet and highlight actual regex matches.
+
+    Unlike ``_extract_highlighted_snippet`` which works with tokenized terms,
+    this function compiles the raw regex pattern (case-insensitively) and
+    wraps each non-empty match in ``<mark>`` tags. The snippet is centred on
+    the first such match. Match positions are taken from the full text, so
+    anchors and lookarounds behave the same inside the snippet, and a match
+    cut by the snippet boundary is highlighted up to that boundary.
+
+    Every path returns HTML-escaped text, because the frontend may assign the
+    result to ``innerHTML``.
+
+    Args:
+        content: Full text to search within.
+        pattern_text: Raw regex pattern string.
+        context_chars: Number of context characters on each side.
+        max_highlights: Maximum highlighted regions.
+
+    Returns:
+        HTML snippet string with ``<mark>`` highlights, or the escaped first
+        200 characters of *content* when the pattern is empty, invalid, or
+        has no non-empty match. Empty *content* yields ``""``.
+    """
+    if not content:
+        return ""
+    fallback = _escape_html(content[:200].strip())
+    if not pattern_text:
+        return fallback
+
+    try:
+        pattern = re.compile(pattern_text, re.IGNORECASE)
+    except re.error:
+        return fallback
+
+    # Zero-width matches (e.g. from ``a*``) give nothing to highlight.
+    hits = (m for m in pattern.finditer(content) if m.end() > m.start())
+    first = next(hits, None)
+    if first is None:
+        return fallback
+
+    # Window around the first match, with edge whitespace trimmed.
+    raw_start = max(0, first.start() - context_chars)
+    raw_end = min(len(content), first.start() + context_chars + 40)
+    prefix = "..." if raw_start > 0 else ""
+    suffix = "..." if raw_end < len(content) else ""
+    window = content[raw_start:raw_end]
+    start = raw_start + len(window) - len(window.lstrip())
+    end = start + len(window.strip())
+
+    limit = max(0, max_highlights)
+    spans = [first.span()]
+    for m in hits:
+        if m.start() >= end or len(spans) >= limit:
+            break
+        spans.append(m.span())
+
+    parts = []
+    cursor = start
+    for span_start, span_end in spans[:limit]:
+        # Clamp to the window: a match may run past the snippet boundary.
+        span_start, span_end = max(span_start, start), min(span_end, end)
+        if span_start >= span_end:
+            continue
+        parts.append(_escape_html(content[cursor:span_start]))
+        parts.append(f"<mark>{_escape_html(content[span_start:span_end])}</mark>")
+        cursor = span_end
+    parts.append(_escape_html(content[cursor:end]))
+
+    return prefix + "".join(parts) + suffix
+
+
 def _highlight_terms(text: str, terms: List[str], max_highlights: int) -> str:
     """Wrap occurrences of *terms* in *text* with ``<mark>`` tags.
 
@@ -1005,7 +1082,11 @@ def advanced_search(
         # Build highlighted snippet
         content = file_info.get("content", "")
         if has_terms:
-            snippet = _extract_highlighted_snippet(content, query_terms)
+            if regex:
+                raw_regex = " ".join(query_terms_raw) if query_terms_raw else ""
+                snippet = _extract_regex_snippet(content, raw_regex)
+            else:
+                snippet = _extract_highlighted_snippet(content, query_terms)
         else:
             snippet = _escape_html(content[:200].strip()) if content else ""
 
diff --git a/frontend/app.js b/frontend/app.js
index 90545f5..6c14cc2 100644
--- a/frontend/app.js
+++ b/frontend/app.js
@@ -5173,7 +5173,9 @@
         titleDiv.textContent = r.title;
       }
       const snippetDiv = el("div", { class: "search-result-snippet" });
-      if (query && query.trim() && r.snippet) {
+      if (r.snippet && r.snippet.includes("<mark>")) {
+        snippetDiv.innerHTML = r.snippet;
+      } else if (query && query.trim() && r.snippet) {
         highlightSearchText(snippetDiv, r.snippet, query, searchCaseSensitive);
       } else {
         snippetDiv.textContent = r.snippet || "";