ObsiViewer/src/core/search/search-parser.ts

622 lines
18 KiB
TypeScript

/**
* Obsidian-compatible search query parser
* Full operator support: path:, file:, content:, tag:, line:, block:, section:,
* task:, task-todo:, task-done:, match-case:, ignore-case:, [property], OR, AND, -, *, "", /regex/
*/
import {
SearchTerm,
SearchGroup,
SearchNode,
ParsedQuery,
SearchPredicate,
SearchContext,
SearchOptions,
SectionContent,
TaskInfo,
ParseDiagnostics,
SearchFilterDiagnostics
} from './search-parser.types';
// Re-export types for convenience
export type { SearchContext, SearchOptions, SectionContent, TaskInfo };
/**
 * Turn a raw Obsidian-style search string into a parsed query.
 *
 * A missing or whitespace-only query produces an empty AND group flagged with
 * `isEmpty: true`. Any other query is tokenized and parsed; the diagnostics then
 * carry a copy of the token list together with the filters and warnings that the
 * term parser records while it runs.
 *
 * @param query - Raw search text.
 * @param options - Optional search settings used to seed the diagnostics filters.
 * @returns The AST, the emptiness flag and the parse diagnostics.
 */
export function parseSearchQuery(query: string, options?: SearchOptions): ParsedQuery {
  // Fresh diagnostics seeded from the caller's options.
  const newDiagnostics = (tokenList: string[]): ParseDiagnostics => ({
    tokens: tokenList,
    filters: createEmptyDiagnosticsFilters(options),
    warnings: []
  });

  const isBlank = !query || !query.trim();
  if (isBlank) {
    return {
      ast: { type: 'group', operator: 'AND', terms: [] },
      isEmpty: true,
      diagnostics: newDiagnostics([])
    };
  }

  const tokens = tokenize(query);
  // The diagnostics keep their own copy so later changes to `tokens` cannot leak in.
  const diagnostics = newDiagnostics([...tokens]);
  const ast = parseTokens(tokens, options, diagnostics);

  return { ast, isEmpty: false, diagnostics };
}
/**
 * Build a predicate that tests a search context against a parsed query.
 *
 * An empty query accepts every context; otherwise the predicate evaluates the
 * query's AST against the supplied context.
 *
 * @param parsed - Result of `parseSearchQuery`.
 * @param options - Optional search settings forwarded to the evaluator.
 * @returns A function returning `true` for contexts that satisfy the query.
 */
export function queryToPredicate(parsed: ParsedQuery, options?: SearchOptions): SearchPredicate {
  const acceptAll: SearchPredicate = () => true;
  const matchAst: SearchPredicate = (context: SearchContext) =>
    evaluateNode(parsed.ast, context, options);

  return parsed.isEmpty ? acceptAll : matchAst;
}
/**
 * Split a query string into tokens.
 *
 * Each of the following is kept together as ONE token, even when it contains spaces:
 * - a double-quoted phrase, optionally negated: `"two words"`, `-"two words"`
 * - a single-quoted phrase: `'two words'`
 * - a regex literal: `/pattern/`
 * - a property expression, optionally negated: `[key]`, `[key:value]`,
 *   `[key]:value`, `[key]:"two words"`
 * - an operator with a quoted value, optionally negated: `tag:"two words"`
 *
 * Parentheses are always emitted as separate tokens. Everything else is split on
 * whitespace and parentheses.
 *
 * @param query - Raw search text.
 * @returns The tokens in source order; empty for a blank query.
 */
function tokenize(query: string): string[] {
  const tokens: string[] = [];

  // Alternatives are tried in order, so the specific forms must precede the
  // catch-all at the end.
  const tokenPattern = new RegExp(
    '\\s*(' +
      [
        '-?"[^"]*"', // (negated) double-quoted phrase
        "'[^']*'", // single-quoted phrase
        '\\/[^\\/]*\\/', // regex literal
        '\\(', // group open
        '\\)', // group close
        '-?\\[[^\\]]*\\](?::?"[^"]*"|:[^\\s()]*)?', // property, with optional value
        '-?[^\\s():"]+:"[^"]*"', // operator:"quoted value"
        '-?[^\\s()]+' // any other run of non-space, non-parenthesis characters
      ].join('|') +
      ')',
    'g'
  );

  let match: RegExpExecArray | null;
  while ((match = tokenPattern.exec(query)) !== null) {
    if (match[1]) {
      tokens.push(match[1]);
    }
  }
  return tokens;
}
/**
 * Parse a run of tokens into a group node, honouring operator precedence.
 *
 * Juxtaposition and the explicit `AND` keyword bind tighter than `OR`, so
 * `a b OR c` is read as `(a AND b) OR c`. A `(` opens a nested expression that is
 * parsed recursively and contributes a single child node, so an `OR` inside
 * parentheses affects only that group.
 *
 * `AND` / `OR` are recognised case-insensitively. Empty OR branches (leading,
 * trailing or doubled `OR`) are dropped.
 *
 * @param tokens - Full token list.
 * @param startIndex - Index of the first token to consume.
 * @param options - Search settings forwarded to `parseTerm`.
 * @param diagnostics - Diagnostics object that `parseTerm` records into.
 * @param insideParens - When true, parsing stops at the matching `)`.
 * @returns The parsed group and the index at which parsing stopped: the index of
 *          the closing `)`, or `tokens.length` when the input ran out.
 */
function parseExpression(
  tokens: string[],
  startIndex: number,
  options: SearchOptions | undefined,
  diagnostics: ParseDiagnostics,
  insideParens: boolean
): { node: SearchNode; endIndex: number } {
  // One entry per OR branch; each branch lists the nodes that are AND-ed together.
  const branches: SearchNode[][] = [[]];
  let i = startIndex;

  while (i < tokens.length) {
    const token = tokens[i];

    if (insideParens && token === ')') {
      break;
    }

    const keyword = token.toUpperCase();
    if (keyword === 'OR') {
      branches.push([]);
      i++;
      continue;
    }
    if (keyword === 'AND') {
      // AND is implicit between adjacent terms; the keyword itself carries no data.
      i++;
      continue;
    }

    const current = branches[branches.length - 1];

    if (token === '(') {
      const nested = parseExpression(tokens, i + 1, options, diagnostics, true);
      current.push(nested.node);
      // Skip past the closing parenthesis (or past the end if it is missing).
      i = nested.endIndex + 1;
      continue;
    }

    // Note: an unmatched ')' at the top level reaches parseTerm and becomes a text term.
    const term = parseTerm(token, options, diagnostics);
    if (term) {
      current.push(term);
    }
    i++;
  }

  const populated = branches.filter(branch => branch.length > 0);

  if (populated.length <= 1) {
    return {
      node: { type: 'group', operator: 'AND', terms: populated[0] ?? [] },
      endIndex: i
    };
  }

  const alternatives = populated.map((branch): SearchNode =>
    branch.length === 1 ? branch[0] : { type: 'group', operator: 'AND', terms: branch }
  );
  return {
    node: { type: 'group', operator: 'OR', terms: alternatives },
    endIndex: i
  };
}

/**
 * Parse tokens into AST
 *
 * The result is always a group node. Queries without `OR` yield a flat AND group;
 * queries with `OR` yield an OR group whose children are single terms or nested
 * AND groups.
 *
 * @param tokens - Tokens produced by the tokenizer.
 * @param options - Search settings forwarded to `parseTerm`.
 * @param diagnostics - Diagnostics object that `parseTerm` records into.
 * @returns The root group of the query.
 */
function parseTokens(tokens: string[], options: SearchOptions | undefined, diagnostics: ParseDiagnostics): SearchNode {
  return parseExpression(tokens, 0, options, diagnostics, false).node;
}

/**
 * Parse a group enclosed in parentheses
 *
 * @param tokens - Full token list.
 * @param startIndex - Index of the first token after the opening `(`.
 * @param options - Search settings forwarded to `parseTerm`.
 * @param diagnostics - Diagnostics object that `parseTerm` records into.
 * @returns The group node and `endIndex`, the index of the matching `)` (or
 *          `tokens.length` when the parenthesis is never closed).
 */
function parseGroup(
  tokens: string[],
  startIndex: number,
  options: SearchOptions | undefined,
  diagnostics: ParseDiagnostics
): { node: SearchNode; endIndex: number } {
  return parseExpression(tokens, startIndex, options, diagnostics, true);
}
/** Operators whose resulting term type is the operator name itself. */
const FIELD_OPERATOR_TYPES = [
  'path', 'file', 'content', 'tag', 'line', 'block', 'section',
  'task', 'task-todo', 'task-done'
] as const;

/** Remove one pair of enclosing double quotes and report whether they were present. */
function stripDoubleQuotes(raw: string): { text: string; quoted: boolean } {
  if (raw.length >= 2 && raw.startsWith('"') && raw.endsWith('"')) {
    return { text: raw.substring(1, raw.length - 1), quoted: true };
  }
  return { text: raw, quoted: false };
}

/** Build a property term from a key and a raw (possibly double-quoted) value. */
function buildPropertyTerm(propertyKey: string, rawValue: string, negated: boolean): SearchTerm {
  const { text, quoted } = stripDoubleQuotes(rawValue);
  return { type: 'property', value: text, propertyKey, negated, quoted, wildcard: text.includes('*') };
}

/**
 * Classify a token (with any leading '-' already removed) and build its term.
 *
 * Forms are checked from most to least specific:
 *   1. `[key]:value`             property with value
 *   2. `[key:value]` / `[key]`   property with value / property existence
 *   3. `/pattern/`               regular expression
 *   4. `"phrase"`                literal text, never re-interpreted as an operator
 *   5. `operator:value`          known operator; unknown ones add a warning and fall through
 *   6. anything else             plain text
 *
 * @returns The term plus the value to report in the negative-filter diagnostics.
 */
function classifyTerm(
  value: string,
  negated: boolean,
  diagnostics: ParseDiagnostics
): { term: SearchTerm; negativeValue: string } {
  if (value.startsWith('[')) {
    if (value.includes(']:')) {
      const closeBracket = value.indexOf(']');
      const propertyKey = value.substring(1, closeBracket);
      // +2 skips the "]:" separator.
      const term = buildPropertyTerm(propertyKey, value.substring(closeBracket + 2), negated);
      return { term, negativeValue: term.value || propertyKey };
    }
    if (value.endsWith(']')) {
      const inner = value.substring(1, value.length - 1);
      const separator = inner.indexOf(':');
      if (separator > 0) {
        const propertyKey = inner.substring(0, separator);
        const term = buildPropertyTerm(propertyKey, inner.substring(separator + 1), negated);
        return { term, negativeValue: term.value || propertyKey };
      }
      // No value given: an empty value marks a property-existence check.
      return {
        term: { type: 'property', value: '', propertyKey: inner, negated, quoted: false, wildcard: false },
        negativeValue: inner
      };
    }
  }

  if (value.startsWith('/') && value.endsWith('/') && value.length > 2) {
    const pattern = value.substring(1, value.length - 1);
    diagnostics.filters.regex = true;
    return {
      term: { type: 'regex', value: pattern, negated, quoted: false, wildcard: false },
      negativeValue: pattern
    };
  }

  const phrase = stripDoubleQuotes(value);
  if (phrase.quoted) {
    return {
      term: { type: 'text', value: phrase.text, negated, quoted: true, wildcard: phrase.text.includes('*') },
      negativeValue: phrase.text
    };
  }

  const colonIndex = value.indexOf(':');
  if (colonIndex > 0) {
    const prefix = value.substring(0, colonIndex).toLowerCase();
    const operand = stripDoubleQuotes(value.substring(colonIndex + 1));
    const wildcard = operand.text.includes('*');

    const fieldType = FIELD_OPERATOR_TYPES.find(candidate => candidate === prefix);
    if (fieldType !== undefined) {
      // Only positive path/file/tag filters are listed; negated ones are reported
      // through the negative-filter diagnostics by the caller.
      if (!negated && (fieldType === 'path' || fieldType === 'file' || fieldType === 'tag')) {
        diagnostics.filters[fieldType].push(operand.text);
      }
      return {
        term: { type: fieldType, value: operand.text, negated, quoted: operand.quoted, wildcard },
        negativeValue: operand.text
      };
    }

    if (prefix === 'match-case' || prefix === 'ignore-case') {
      const caseSensitive = prefix === 'match-case';
      diagnostics.filters.caseSensitive = caseSensitive;
      return {
        term: { type: prefix, value: operand.text, negated, quoted: operand.quoted, wildcard, caseSensitive },
        negativeValue: operand.text
      };
    }

    // Unknown operator: warn, then treat the whole token (prefix included) as text.
    diagnostics.warnings.push(`UnknownOperator:${prefix}`);
  }

  return {
    term: { type: 'text', value, negated, quoted: false, wildcard: value.includes('*') },
    negativeValue: value
  };
}

/**
 * Parse a single search term
 *
 * A leading '-' negates the term. Besides building the term, this records into
 * `diagnostics`: positive path/file/tag filter values, the regex and
 * case-sensitivity flags, unknown-operator warnings, and, for negated terms with a
 * non-empty value, an entry in `negative` and `negativeDetails`.
 *
 * @param token - One token from the tokenizer.
 * @param options - Search settings (currently not consulted here).
 * @param diagnostics - Diagnostics object updated in place.
 * @returns The parsed term, or `null` for an empty token.
 */
function parseTerm(token: string, options: SearchOptions | undefined, diagnostics: ParseDiagnostics): SearchTerm | null {
  if (!token) {
    return null;
  }

  const negated = token.startsWith('-');
  const body = negated ? token.substring(1) : token;
  const { term, negativeValue } = classifyTerm(body, negated, diagnostics);

  if (negated && negativeValue) {
    diagnostics.filters.negative.push(negativeValue);
    diagnostics.filters.negativeDetails.push({
      type: term.type,
      value: negativeValue,
      wildcard: Boolean(term.wildcard)
    });
  }

  return term;
}
/**
 * Create a fresh filter-diagnostics record with empty lists.
 *
 * The boolean flags are seeded from `options` and default to `false` when the
 * option is missing or nullish.
 *
 * @param options - Optional search settings.
 * @returns A new object; no array is shared between calls.
 */
function createEmptyDiagnosticsFilters(options?: SearchOptions): SearchFilterDiagnostics {
  const regex = options?.regexMode ?? false;
  const caseSensitive = options?.caseSensitive ?? false;
  // `wholeWord` is read through an untyped view of the options object.
  // NOTE(review): presumably it is not declared on SearchOptions; confirm against the type.
  const wholeWord = (options as any)?.wholeWord ?? false;

  const filters: SearchFilterDiagnostics = {
    tag: [],
    path: [],
    file: [],
    negative: [],
    negativeDetails: [],
    regex,
    caseSensitive,
    wholeWord
  };
  return filters;
}
/**
 * Test one AST node against a search context.
 *
 * Group nodes are delegated to `evaluateGroup`; every other node is treated as a
 * term and delegated to `evaluateTerm`.
 *
 * @param node - Group or term node.
 * @param context - Data of the note being tested.
 * @param options - Optional search settings forwarded to the evaluators.
 * @returns `true` when the node matches the context.
 */
function evaluateNode(node: SearchNode, context: SearchContext, options?: SearchOptions): boolean {
  return node.type === 'group'
    ? evaluateGroup(node, context, options)
    : evaluateTerm(node, context, options);
}
/**
 * Evaluate a group node
 *
 * An empty group matches everything. An `OR` group matches when at least one child
 * matches; any other operator is treated as `AND` and requires every child to
 * match. Children are evaluated lazily: evaluation stops at the first child that
 * decides the outcome.
 *
 * @param group - Group node with its operator and children.
 * @param context - Data of the note being tested.
 * @param options - Optional search settings forwarded to the children.
 * @returns `true` when the group matches the context.
 */
function evaluateGroup(group: SearchGroup, context: SearchContext, options?: SearchOptions): boolean {
  if (group.terms.length === 0) {
    return true;
  }

  const matches = (child: SearchNode): boolean => evaluateNode(child, context, options);

  // `some` / `every` stop as soon as the result is known, so the remaining
  // children are not evaluated needlessly.
  return group.operator === 'OR'
    ? group.terms.some(matches)
    : group.terms.every(matches);
}
/**
 * Evaluate a single term
 *
 * The term type selects which part of the context is searched:
 * file path, file name (with or without extension), content, tags (a leading '#'
 * is ignored on both sides), lines, blocks, sections (heading or content), tasks
 * (all / open / completed), a regular expression over the content, or a property.
 * Unrecognised types are searched as plain text in the content.
 *
 * Case sensitivity comes from the term when it sets `caseSensitive`, otherwise from
 * `options`, defaulting to case-insensitive. An invalid regular expression never
 * matches. The result is inverted for negated terms.
 *
 * @param term - Term to evaluate.
 * @param context - Data of the note being tested.
 * @param options - Optional search settings.
 * @returns `true` when the (possibly negated) term matches.
 */
function evaluateTerm(term: SearchTerm, context: SearchContext, options?: SearchOptions): boolean {
  const caseSensitive = term.caseSensitive ?? options?.caseSensitive ?? false;

  // Match `text` against this term's value using the term's own settings.
  const matches = (text: string): boolean =>
    matchString(text, term.value, term.wildcard, caseSensitive);

  let result = false;

  switch (term.type) {
    case 'path':
      result = matches(context.filePath);
      break;
    case 'file':
      result = matches(context.fileName) || matches(context.fileNameWithExt);
      break;
    case 'content':
      result = matches(context.content);
      break;
    case 'tag': {
      const searchTag = term.value.startsWith('#') ? term.value.substring(1) : term.value;
      result = context.tags.some(tag => {
        const cleanTag = tag.startsWith('#') ? tag.substring(1) : tag;
        return matchString(cleanTag, searchTag, term.wildcard, caseSensitive);
      });
      break;
    }
    case 'line':
      result = context.lines.some(line => matches(line));
      break;
    case 'block':
      result = context.blocks.some(block => matches(block));
      break;
    case 'section':
      result = context.sections.some(section => matches(section.content) || matches(section.heading));
      break;
    case 'task':
      result = context.tasks.some(task => matches(task.text));
      break;
    case 'task-todo':
      result = context.tasks.some(task => !task.completed && matches(task.text));
      break;
    case 'task-done':
      result = context.tasks.some(task => task.completed && matches(task.text));
      break;
    case 'match-case':
      // Content search with case sensitivity forced on.
      result = matchString(context.content, term.value, term.wildcard, true);
      break;
    case 'ignore-case':
      // Content search with case sensitivity forced off.
      result = matchString(context.content, term.value, term.wildcard, false);
      break;
    case 'regex':
      try {
        result = new RegExp(term.value, caseSensitive ? '' : 'i').test(context.content);
      } catch (e) {
        // Invalid regex, no match
        result = false;
      }
      break;
    case 'property': {
      if (term.propertyKey) {
        const propValue = context.properties[term.propertyKey];
        if (term.value === '') {
          // Property existence check
          result = propValue !== undefined;
        } else if (Array.isArray(propValue)) {
          result = propValue.some(v => matches(String(v)));
        } else {
          // `??` rather than `||`: the values 0 and false must be compared as
          // "0" / "false" instead of being collapsed to an empty string.
          result = matches(String(propValue ?? ''));
        }
      }
      break;
    }
    case 'text':
    default:
      // Search in content
      result = matches(context.content);
      break;
  }

  // Apply negation
  return term.negated ? !result : result;
}
/**
 * Match a string with optional wildcard support and case sensitivity
 *
 * Without `wildcard` this is a plain substring test. With `wildcard`, every `*` in
 * the pattern stands for any run of characters (not crossing line breaks) and all
 * other characters, including regex metacharacters such as `?`, `.` or `(`, are
 * matched literally. The pattern may match anywhere in the text.
 *
 * @param text - Text to search in.
 * @param pattern - Substring or wildcard pattern.
 * @param wildcard - Interpret `*` in the pattern as a wildcard.
 * @param caseSensitive - Compare with exact case.
 * @returns `true` when the pattern is found in the text.
 */
function matchString(text: string, pattern: string, wildcard: boolean = false, caseSensitive: boolean = false): boolean {
  if (!wildcard) {
    return caseSensitive
      ? text.includes(pattern)
      : text.toLowerCase().includes(pattern.toLowerCase());
  }

  // Escape every regex metacharacter so the literal pieces cannot be read as syntax.
  const escapeLiteral = (piece: string): string => piece.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const source = (caseSensitive ? pattern : pattern.toLowerCase())
    .split('*')
    .map(escapeLiteral)
    .join('.*');

  return new RegExp(source, caseSensitive ? '' : 'i').test(text);
}
/**
 * Classify a partially typed query for autocomplete.
 *
 * - Text starting with `[` that has no `]` yet is an unfinished property: the part
 *   before the first `:` (when that colon is not the first character) is returned
 *   as `prefix`, the rest as `value`; without such a colon everything after `[` is
 *   the `value`.
 * - `operator:value` with a recognised operator (compared case-insensitively)
 *   returns that operator, lower-cased, as both `type` and `prefix`.
 * - Anything else is `general`, with the trimmed query as `value`.
 *
 * @param query - Text typed so far.
 * @returns The detected type with its prefix and value.
 */
export function detectQueryType(query: string): {
  type: 'path' | 'file' | 'content' | 'tag' | 'line' | 'block' | 'section' | 'task' | 'task-todo' | 'task-done' | 'match-case' | 'ignore-case' | 'property' | 'general' | null;
  prefix: string;
  value: string;
} {
  const text = query.trim();

  // Unfinished property expression: opening bracket without a closing one.
  if (text.startsWith('[') && text.indexOf(']') === -1) {
    const body = text.slice(1);
    const separator = body.indexOf(':');
    return separator > 0
      ? { type: 'property', prefix: body.slice(0, separator), value: body.slice(separator + 1) }
      : { type: 'property', prefix: '', value: body };
  }

  const separator = text.indexOf(':');
  if (separator > 0) {
    const operator = text.slice(0, separator).toLowerCase();
    switch (operator) {
      case 'path':
      case 'file':
      case 'content':
      case 'tag':
      case 'line':
      case 'block':
      case 'section':
      case 'task':
      case 'task-todo':
      case 'task-done':
      case 'match-case':
      case 'ignore-case':
        return { type: operator, prefix: operator, value: text.slice(separator + 1) };
      default:
        break;
    }
  }

  return { type: 'general', prefix: '', value: text };
}