Skip to main content
Glama
query-parser.ts (12.8 kB)
/**
 * Query Parser for Full-Text Search
 *
 * Parses and sanitizes user queries for PostgreSQL ts_query format.
 * Handles boolean operators, phrase queries, and special characters.
 *
 * Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 1.2, 1.3, 1.4, 1.5, 1.6
 */

import { SearchValidationError, type ParsedQuery } from "./types";

/**
 * Query parser for converting natural language queries to PostgreSQL ts_query format.
 *
 * The pipeline implemented by {@link QueryParser.parseQuery} is:
 * validate -> sanitize -> extract include/exclude terms -> convert NOT/AND/OR keywords
 * -> convert quoted phrases -> insert implicit ANDs -> normalize whitespace.
 */
export class QueryParser {
  /** Tokens that are boolean operators (symbol or keyword form), never search terms. */
  private static readonly OPERATOR_TOKENS: ReadonlySet<string> = new Set([
    "&",
    "|",
    "!",
    "and",
    "or",
    "not",
  ]);

  /** Control characters that act as whitespace; they separate words instead of vanishing. */
  private static readonly WHITESPACE_CONTROL_CHARS = "\t\n\v\f\r";

  private readonly maxQueryLength: number;

  // Control characters to handle: null bytes (0x00), control chars (0x01-0x1F), and DEL (0x7F)
  private readonly CONTROL_CHARS_PATTERN: RegExp;

  /**
   * @param maxQueryLength - Maximum accepted query length in characters (default: 1000)
   */
  constructor(maxQueryLength: number = 1000) {
    this.maxQueryLength = maxQueryLength;

    // Built from char codes to avoid the eslint no-control-regex warning.
    // The surrounding [ ] are essential: without them the pattern is the literal
    // four-character sequence NUL, "-", 0x1F, DEL and never matches real input.
    this.CONTROL_CHARS_PATTERN = new RegExp(
      `[${String.fromCharCode(0)}-${String.fromCharCode(31)}${String.fromCharCode(127)}]`,
      "g"
    );
  }

  /**
   * Parse user query to PostgreSQL ts_query format
   *
   * @param query - User query string
   * @param _language - Text search language (default: 'english') - currently unused
   * @returns Parsed ts_query string (for backward compatibility)
   * @throws SearchValidationError if query is invalid
   */
  parse(query: string, _language: string = "english"): string {
    return this.parseQuery(query, _language).tsQuery;
  }

  /**
   * Parse user query to structured ParsedQuery with include/exclude terms
   *
   * @param query - User query string
   * @param _language - Text search language (default: 'english') - currently unused
   * @returns ParsedQuery with tsQuery, includeTerms, and excludeTerms
   * @throws SearchValidationError if query is invalid
   */
  parseQuery(query: string, _language: string = "english"): ParsedQuery {
    this.validate(query);

    let sanitized = this.sanitize(query);

    // Extract NOT terms before conversion (handles both "NOT term" and "!term")
    const { includeTerms, excludeTerms } = this.extractNotTerms(sanitized);

    // Convert AND / OR / NOT keywords to & | ! (case insensitive).
    // NOTE(review): this runs before phrase handling, so keywords inside a quoted
    // phrase (e.g. "rock and roll") are converted as well.
    sanitized = this.convertNotOperator(sanitized);

    // Handle phrase queries before implicit-AND insertion; multi-word phrases are
    // wrapped in __PHRASE__ markers so their inner spaces are left alone.
    sanitized = this.handlePhrases(sanitized);

    // Convert boolean operators (but not within phrases)
    sanitized = this.convertBooleanOperators(sanitized);

    // Drop the phrase markers, then normalize whitespace
    sanitized = sanitized.replace(/__PHRASE__/g, "");
    sanitized = sanitized.trim().replace(/\s+/g, " ");

    return {
      tsQuery: sanitized,
      includeTerms,
      excludeTerms,
    };
  }

  /**
   * Convert AND / OR / NOT keywords to ts_query operators
   *
   * PostgreSQL tsquery requires NOT (!) to be preceded by an AND (&) operator
   * unless it's at the start of the query or after OR (|) or opening parenthesis.
   *
   * @param query - Query string with potential NOT keywords
   * @returns Query string with NOT converted to & !term (no space between ! and term)
   */
  private convertNotOperator(query: string): string {
    // First convert AND/OR keywords to operators to avoid treating them as words
    let result = query.replace(/\bAND\b/gi, "&").replace(/\bOR\b/gi, "|");

    // Re-run the rewrite rules until a fixed point so chained NOTs are all converted
    let previous = "";
    while (previous !== result) {
      previous = result;

      // Case 1: NOT at start of query -> !term
      result = result.replace(/^\s*NOT\s+(\S+)/gi, "!$1");
      // Case 2: After | operator -> | !term
      result = result.replace(/\|\s*NOT\s+(\S+)/gi, "| !$1");
      // Case 3: After & operator -> & !term
      result = result.replace(/&\s*NOT\s+(\S+)/gi, "& !$1");
      // Case 4: After opening parenthesis -> (!term
      result = result.replace(/\(\s*NOT\s+(\S+)/gi, "(!$1");
      // Case 5: After !term followed by NOT -> !term & !nextterm
      result = result.replace(/(!\S+)\s+NOT\s+(\S+)/gi, "$1 & !$2");
      // Case 6: Word followed by NOT (implicit AND needed) -> word & !term
      result = result.replace(/(\w+)\s+NOT\s+(\S+)/gi, "$1 & !$2");
    }

    return result;
  }

  /**
   * Extract include and exclude terms from query
   *
   * Terms prefixed with NOT or ! are exclude terms; all other terms are include
   * terms. Terms are lower-cased, parentheses are stripped, quoted phrases are split
   * into their individual words, operators are skipped, and duplicates are removed.
   *
   * @param query - Sanitized query string
   * @returns Object with includeTerms and excludeTerms arrays
   */
  private extractNotTerms(query: string): { includeTerms: string[]; excludeTerms: string[] } {
    const includeTerms = new Set<string>();
    const excludeTerms = new Set<string>();

    // Convert NOT to ! so both spellings are handled by one pattern
    const normalizedQuery = query.replace(/\bNOT\s+/gi, "!");

    // Match: !term, !"phrase", or regular terms. Leading parentheses are skipped
    // around the optional "!" so that "(!term" and ("a phrase") are recognised.
    const termPattern = /\(*(!?)\(*(?:"([^"]+)"|(\S+))/g;

    let match: RegExpExecArray | null;
    while ((match = termPattern.exec(normalizedQuery)) !== null) {
      const isNegated = match[1] === "!";

      // Either the quoted phrase or the single word, without grouping parentheses
      const term = (match[2] ?? match[3] ?? "").toLowerCase().replace(/[()]/g, "");

      // Skip empty terms and operators in symbol (&, |, !) or word (and, or, not) form
      if (!term || QueryParser.OPERATOR_TOKENS.has(term)) {
        continue;
      }

      // Phrases contribute each of their words individually
      const target = isNegated ? excludeTerms : includeTerms;
      for (const word of term.split(/\s+/)) {
        if (word.length > 0) {
          target.add(word);
        }
      }
    }

    return {
      includeTerms: Array.from(includeTerms),
      excludeTerms: Array.from(excludeTerms),
    };
  }

  /**
   * Validate query syntax and constraints
   *
   * @param query - Query string to validate
   * @throws SearchValidationError if query is not a string, is empty or
   *   whitespace-only, or is longer than the configured maximum length
   */
  validate(query: string): void {
    if (!query || typeof query !== "string") {
      throw new SearchValidationError("Query must be a non-empty string", "query", query);
    }

    if (query.trim().length === 0) {
      throw new SearchValidationError("Query cannot be empty", "query", query);
    }

    if (query.length > this.maxQueryLength) {
      throw new SearchValidationError(
        `Query exceeds maximum length of ${this.maxQueryLength} characters`,
        "query",
        query
      );
    }
  }

  /**
   * Sanitize query by removing/escaping dangerous characters
   *
   * @param query - Query string to sanitize
   * @returns Sanitized query string
   */
  sanitize(query: string): string {
    // Strip null bytes and control characters. Whitespace-like ones (tab, newline,
    // etc.) become a plain space so the words they separate are not glued together.
    let sanitized = query.replace(this.CONTROL_CHARS_PATTERN, (ch) =>
      QueryParser.WHITESPACE_CONTROL_CHARS.includes(ch) ? " " : ""
    );

    // Handle special programming language names (C++, C#, etc.)
    // Convert to searchable terms before "+" and "#" are stripped below
    sanitized = sanitized.replace(/C\+\+/gi, "cplusplus");
    sanitized = sanitized.replace(/C#/gi, "csharp");
    sanitized = sanitized.replace(/F#/gi, "fsharp");

    // Remove potentially dangerous SQL characters (except those used in ts_query)
    // Keep: & | ! ( ) " ' - (used in boolean operators and phrases)
    // Remove: ; @ # $ % ^ * = + [ ] { } \ / < >
    sanitized = sanitized.replace(/[;@#$%^*=+[\]{}\\/<>]/g, " ");

    return sanitized;
  }

  /**
   * Convert boolean operators to ts_query format
   *
   * Handles:
   * - Implicit AND (space between words)
   * - Explicit AND (&)
   * - OR (|)
   * - NOT (!)
   * - Grouping with parentheses
   *
   * @param query - Query string with boolean operators
   * @returns Query string in ts_query format
   */
  convertBooleanOperators(query: string): string {
    // Split on operators, parentheses and protected phrases; the capture group
    // keeps those delimiters in the token list so they can be passed through.
    const tokens = query.split(/([&|!()]|__PHRASE__[^_]*__PHRASE__)/);

    const processed = tokens.map((token) => {
      // Operators, parentheses, and protected phrases are already in ts_query form
      if (["&", "|", "!", "(", ")"].includes(token.trim()) || token.includes("__PHRASE__")) {
        return token;
      }

      // For word sequences, add implicit AND so results contain all search terms
      const words = token
        .trim()
        .split(/\s+/)
        .filter((w) => w.length > 0);

      return words.length > 1 ? words.join(" & ") : token;
    });

    let converted = processed.join("");

    // Clean up multiple operators
    converted = converted.replace(/&\s*&+/g, "&");
    converted = converted.replace(/\|\s*\|+/g, "|");

    // For & | ( ) - exactly one space on both sides.
    // The trailing \s* must be a whitespace class: a bare "s*" would swallow the
    // first letter of a following term such as "search".
    converted = converted.replace(/\s*([&|()])\s*/g, " $1 ");

    // For ! - add space before but NOT after (! must be attached to term)
    converted = converted.replace(/\s*!\s*/g, " !");

    // For ) - ensure space after
    converted = converted.replace(/\)\s*/g, ") ");

    // Clean up multiple spaces
    converted = converted.replace(/\s+/g, " ");

    return converted.trim();
  }

  /**
   * Handle phrase queries with quotes
   *
   * Converts "phrase query" to phrase <-> query format for ts_query. Multi-word
   * phrases are wrapped in __PHRASE__ markers and parentheses; single-word phrases
   * are returned bare and empty phrases are dropped.
   *
   * @param query - Query string with potential phrases
   * @returns Query string with phrases converted to <-> operator
   */
  handlePhrases(query: string): string {
    const phraseRegex = /"([^"]+)"/g;

    return query.replace(phraseRegex, (_match: string, phrase: string) => {
      const words = phrase
        .trim()
        .split(/\s+/)
        .filter((w) => w.length > 0);

      if (words.length === 0) {
        return "";
      }

      if (words.length === 1) {
        return words[0];
      }

      // Join words with <-> for phrase matching
      // Use markers to protect from operator insertion
      return `__PHRASE__(${words.join(" <-> ")})__PHRASE__`;
    });
  }

  /**
   * Extract individual search terms from query (excludes NOT terms)
   *
   * Useful for highlighting and matched terms extraction.
   * Only returns terms that should be matched (not negated terms).
   *
   * @param query - Original query string (not parsed)
   * @returns Array of search terms (excluding NOT terms)
   */
  extractTerms(query: string): string[] {
    // Sanitize first to handle special characters
    return this.extractNotTerms(this.sanitize(query)).includeTerms;
  }

  /**
   * Extract all terms from a parsed ts_query string
   *
   * This extracts all terms from the ts_query format, including negated ones.
   * Used internally for backward compatibility.
   *
   * @param tsQuery - Parsed ts_query string
   * @returns Array of all terms in the query, lower-cased and de-duplicated
   */
  extractAllTerms(tsQuery: string): string[] {
    // Remove operators, parentheses, and phrase markers
    const cleaned = tsQuery.replace(/[&|!()<>-]|__PHRASE__/g, " ");

    const words = cleaned
      .split(/\s+/)
      .filter((w) => w.length > 0)
      .map((w) => w.toLowerCase());

    return Array.from(new Set(words));
  }

  /**
   * Extract only the excluded (NOT) terms from a query
   *
   * @param query - Original query string
   * @returns Array of excluded terms
   */
  extractExcludeTerms(query: string): string[] {
    // Sanitize first to handle special characters
    return this.extractNotTerms(this.sanitize(query)).excludeTerms;
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/keyurgolani/ThoughtMcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.