Skip to main content
Glama
RecommendationEngine.ts13.3 kB
import { DocumentChunkOptimized, TfidfAnalysisResult, SearchRecommendation, SuggestionStrategy } from '../types/index.js'; import { log } from './Logger.js'; export interface RecommendationEngineConfig { maxQueryTerms?: number; maxAnalysisDocuments?: number; } /** * TF-IDF based contextual search recommendation engine * Analyzes low-confidence search results and generates intelligent query suggestions */ export class RecommendationEngine { private static readonly DEFAULT_MAX_ANALYSIS_DOCUMENTS = 5; private static readonly DEFAULT_MAX_QUERY_TERMS = 8; private static readonly TFIDF_FORMULA = 'score = TF × log((docCount + 1) / (DF + 1))'; // Default expiry for generated recommendations (30 days) private static readonly RECOMMENDATION_EXPIRY_MS = 30 * 24 * 60 * 60 * 1000; private readonly maxQueryTerms: number; private readonly maxAnalysisDocuments: number; constructor(config: RecommendationEngineConfig = {}) { this.maxQueryTerms = config.maxQueryTerms ?? RecommendationEngine.DEFAULT_MAX_QUERY_TERMS; this.maxAnalysisDocuments = config.maxAnalysisDocuments ?? RecommendationEngine.DEFAULT_MAX_ANALYSIS_DOCUMENTS; // Validate configuration if (this.maxQueryTerms < 1 || this.maxQueryTerms > 100) { throw new Error(`maxQueryTerms must be between 1 and 100, got ${this.maxQueryTerms}`); } if (this.maxAnalysisDocuments < 1 || this.maxAnalysisDocuments > 50) { throw new Error(`maxAnalysisDocuments must be between 1 and 50, got ${this.maxAnalysisDocuments}`); } log.debug('RecommendationEngine initialized', { maxQueryTerms: this.maxQueryTerms, maxAnalysisDocuments: this.maxAnalysisDocuments }); } /** * Analyze low-confidence search results and generate contextual recommendations * @param query Original user query * @param results Search results (presumed low-confidence) * @param totalDocuments Total documents in the index * @param currentThreshold Current TF-IDF threshold from learning system * @returns Analysis results with suggested query modifications */ async analyzeForRecommendations( query: string, results: DocumentChunkOptimized[], totalDocuments: number, currentThreshold: number ): Promise<Omit<SearchRecommendation, 'id'> | null> { const timer = log.time('tfidf-analysis'); try { // Limit analysis to top results for performance const analysisResults = results.slice(0, this.maxAnalysisDocuments); if (analysisResults.length === 0) { log.debug('No results to analyze for recommendations'); return null; } // Extract and tokenize query terms const queryTerms = this.tokenizeQuery(query); if (queryTerms.length === 0 || queryTerms.length > this.maxQueryTerms) { log.debug('Query has no terms or too many terms for analysis', { termCount: queryTerms.length, maxAllowed: this.maxQueryTerms }); return null; } log.debug('Starting TF-IDF analysis', { query: query.substring(0, 50), resultCount: analysisResults.length, queryTerms: queryTerms.length, totalDocuments, currentThreshold }); // Calculate TF-IDF scores for each term across the result documents const tfidfResults = this.calculateTfidfAnalysis(queryTerms, analysisResults, totalDocuments); // Generate recommendations based on TF-IDF analysis const recommendation = this.generateRecommendation(query, queryTerms, tfidfResults, currentThreshold); timer(); if (recommendation) { log.info('Generated contextual recommendation', { strategy: recommendation.suggestionStrategy, confidence: recommendation.confidence, suggestedTermCount: recommendation.suggestedTerms.length }); } else { log.debug('No recommendation generated for query'); } return recommendation; } catch (error: any) { log.error('TF-IDF analysis failed', error, { query: query.substring(0, 50) }); return null; } } /** * Tokenize query into individual terms, preserving quoted phrases * @param query The search query * @returns Array of query terms */ private tokenizeQuery(query: string): string[] { const terms: string[] = []; const regex = /"([^"]*)"|(\S+)/g; let match; while ((match = regex.exec(query.toLowerCase())) !== null) { // Use quoted content if present, otherwise the unquoted term const term = match[1] || match[2]; if (term && term.length > 1) { // Skip single characters terms.push(term); } } return [...new Set(terms)]; // Remove duplicates } /** * Calculate TF-IDF analysis for query terms across result documents * @param queryTerms Tokenized query terms * @param results Top search results to analyze * @param totalDocuments Total documents in index * @returns TF-IDF analysis results for each term */ private calculateTfidfAnalysis( queryTerms: string[], results: DocumentChunkOptimized[], totalDocuments: number ): TfidfAnalysisResult[] { const termAnalysis: TfidfAnalysisResult[] = []; for (const term of queryTerms) { // Calculate document frequency (DF): how many result documents contain this term let df = 0; let totalTf = 0; for (const result of results) { const content = result.content.toLowerCase(); const occurrences = (content.match(new RegExp(`\\b${this.escapeRegex(term)}\\b`, 'g')) || []).length; if (occurrences > 0) { df++; totalTf += occurrences; } } // Calculate average TF across documents that contain the term const avgTf = df > 0 ? totalTf / df : 0; // Apply TF-IDF formula: TF × log((N + 1) / (DF + 1)) // Using +1 smoothing to avoid division by zero and reduce impact of rare terms const tfidf = avgTf * Math.log((totalDocuments + 1) / (df + 1)); // Determine if term should be removed based on TF-IDF score const shouldRemove = tfidf < 1.0; // Conservative threshold const confidence = Math.max(0, Math.min(1, tfidf / 5.0)); // Normalize confidence to 0-1 termAnalysis.push({ term, tf: avgTf, df, tfidf, shouldRemove, confidence }); } // Sort by TF-IDF score (lowest first - most likely to cause issues) return termAnalysis.sort((a, b) => a.tfidf - b.tfidf); } /** * Generate a contextual recommendation based on TF-IDF analysis * @param originalQuery The original query * @param queryTerms Tokenized terms * @param tfidfResults TF-IDF analysis results * @param currentThreshold Current learning threshold * @returns Recommendation or null if no good suggestions */ private generateRecommendation( originalQuery: string, queryTerms: string[], tfidfResults: TfidfAnalysisResult[], currentThreshold: number ): Omit<SearchRecommendation, 'id'> | null { // Safety check: never remove all terms const removableTerms = tfidfResults.filter(r => r.shouldRemove && r.tfidf < currentThreshold && !this.isEssentialTerm(r.term, originalQuery) ); if (removableTerms.length === 0) { return null; } // Find the most problematic term (lowest TF-IDF score) const mostProblematicTerm = removableTerms[0]; // Strategy 1: Term removal - safest approach if (this.canSafelyRemoveTerm(mostProblematicTerm.term, queryTerms)) { const suggestedTerms = queryTerms.filter(t => t !== mostProblematicTerm.term); return { query: originalQuery, suggestedTerms, suggestionStrategy: SuggestionStrategy.TERM_REMOVAL, tfidfThreshold: currentThreshold, confidence: mostProblematicTerm.confidence, generatedAt: new Date(), expiresAt: new Date(Date.now() + RecommendationEngine.RECOMMENDATION_EXPIRY_MS), // 30 days totalDocuments: 0, // Will be set by caller analyzedDocuments: tfidfResults.length }; } // Strategy 2: Term refinement - replace with more specific term const refinedSuggestion = this.generateRefinedTerm(mostProblematicTerm.term, tfidfResults); if (refinedSuggestion) { return { query: originalQuery, suggestedTerms: [refinedSuggestion], suggestionStrategy: SuggestionStrategy.TERM_REFINEMENT, tfidfThreshold: currentThreshold, confidence: 0.7, // Moderate confidence for refinements generatedAt: new Date(), expiresAt: new Date(Date.now() + RecommendationEngine.RECOMMENDATION_EXPIRY_MS), totalDocuments: 0, analyzedDocuments: tfidfResults.length }; } // Strategy 3: Contextual addition - add clarifying terms const contextualTerms = this.suggestContextualTerms(mostProblematicTerm.term, tfidfResults); if (contextualTerms.length > 0) { const combinedTerms = [...queryTerms, ...contextualTerms]; return { query: originalQuery, suggestedTerms: combinedTerms, suggestionStrategy: SuggestionStrategy.CONTEXTUAL_ADDITION, tfidfThreshold: currentThreshold, confidence: 0.6, // Lower confidence for additions generatedAt: new Date(), expiresAt: new Date(Date.now() + RecommendationEngine.RECOMMENDATION_EXPIRY_MS), totalDocuments: 0, analyzedDocuments: tfidfResults.length }; } return null; } /** * Check if a term is essential and shouldn't be removed * @param term The term to check * @param originalQuery The original query string * @returns true if term is essential */ private isEssentialTerm(term: string, originalQuery: string): boolean { // Keep terms that are in quotes (phrase searches) if (originalQuery.includes(`"${term}"`)) { return true; } // Keep very common programming terms const essentialTerms = ['function', 'class', 'method', 'variable', 'import', 'export', 'async', 'await', 'return', 'const', 'let', 'var']; return essentialTerms.includes(term.toLowerCase()); } /** * Check if a term can be safely removed without making the query meaningless * @param termToRemove The term being considered for removal * @param allTerms All query terms * @returns true if safe to remove */ private canSafelyRemoveTerm(termToRemove: string, allTerms: string[]): boolean { // Keep at least 2 terms to maintain query specificity return allTerms.length > 2; } /** * Generate a refined version of a problematic term * @param problematicTerm The term causing issues * @param tfidfResults All TF-IDF results for context * @returns Refined term suggestion or null */ private generateRefinedTerm(problematicTerm: string, tfidfResults: TfidfAnalysisResult[]): string | null { // Look for better-performing related terms in the results const betterTerms = tfidfResults .filter(r => !r.shouldRemove && r.tfidf > 2.0) // Good TF-IDF scores .filter(r => this.areTermsRelated(r.term, problematicTerm)) .sort((a, b) => b.tfidf - a.tfidf); return betterTerms.length > 0 ? betterTerms[0].term : null; } /** * Suggest contextual terms to add for clarification * @param problematicTerm The term causing ambiguity * @param tfidfResults All TF-IDF results * @returns Array of contextual terms to add */ private suggestContextualTerms(problematicTerm: string, tfidfResults: TfidfAnalysisResult[]): string[] { // Find terms that appear in multiple highly-ranked results const highScoringTerms = tfidfResults .filter(r => r.tfidf > 3.0 && r.df >= 2) // Good TF-IDF and appears in multiple docs .filter(r => r.term !== problematicTerm) .slice(0, 2); // Limit to 2 contextual terms return highScoringTerms.map(r => r.term); } /** * Check if two terms are semantically related * @param term1 First term * @param term2 Second term * @returns true if terms are related */ private areTermsRelated(term1: string, term2: string): boolean { // Simple heuristic: check for common prefixes/suffixes or substring relationships const lower1 = term1.toLowerCase(); const lower2 = term2.toLowerCase(); // Direct substring relationship if (lower1.includes(lower2) || lower2.includes(lower1)) { return lower1 !== lower2; // Not identical } // Common programming term variations const commonMappings: Record<string, string[]> = { 'function': ['method', 'func'], 'class': ['object', 'instance'], 'variable': ['var', 'const', 'let'], 'import': ['require', 'module'], 'async': ['await', 'promise'], }; for (const [key, variants] of Object.entries(commonMappings)) { if ((lower1 === key && variants.includes(lower2)) || (lower2 === key && variants.includes(lower1))) { return true; } } return false; } /** * Escape special regex characters * @param string String to escape * @returns Escaped string */ private escapeRegex(string: string): string { return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PatrickRuddiman/local-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server