Skip to main content
Glama
search-engine.ts (22 kB)
import { FileIndexer } from './file-indexer.js';
import { ContentExtractor, ContentAnalyzer } from './content-extractor.js';
import { WebDAVClient } from '../client/webdav.js';
import { getClient } from './client-manager.js';
import {
  SearchOptions,
  SearchResult,
  FileMetadata,
  FileIndex,
  ParsedQuery,
  QueryFilter,
  QueryOperator,
  SearchScope,
  SearchConfig,
  DEFAULT_SEARCH_CONFIG
} from '../models/webdav-search.js';

/**
 * Main search engine for WebDAV files.
 *
 * Combines filename, metadata and content matching over a file index,
 * merges the per-file hits, ranks them and caches the final result list
 * for a short time.
 */
export class SearchEngine {
  /** Words dropped from queries because they carry no search signal. */
  private static readonly STOP_WORDS: ReadonlySet<string> = new Set([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did',
    'will', 'would', 'could', 'should', 'this', 'that', 'these', 'those'
  ]);

  /** Tie-breaker order between match types when relevance scores are equal. */
  private static readonly MATCH_TYPE_PRIORITY = { filename: 3, content: 2, metadata: 1 };

  /** How long a cached result list stays valid, in milliseconds. */
  private static readonly RESULT_CACHE_TTL_MS = 60000;

  /** Maximum number of cached result lists kept at once. */
  private static readonly RESULT_CACHE_MAX_ENTRIES = 100;

  /** Result limit used when the caller does not supply one. */
  private static readonly DEFAULT_LIMIT = 50;

  /** Result limit used by the fallback search when the caller does not supply one. */
  private static readonly FALLBACK_LIMIT = 20;

  /** Maximum number of files whose content is searched in quick mode. */
  private static readonly QUICK_MODE_CONTENT_FILE_CAP = 100;

  /** Number of files whose content is extracted concurrently. */
  private static readonly CONTENT_BATCH_SIZE = 10;

  private indexer: FileIndexer;
  private extractor: ContentExtractor;
  private config: SearchConfig;
  private resultCache = new Map<string, { results: SearchResult[]; timestamp: Date }>();

  /**
   * @param indexer   File indexer to use; a new one is built from `config` when omitted.
   * @param extractor Content extractor to use; a new one is built from `config` when omitted.
   * @param config    Search configuration (defaults to `DEFAULT_SEARCH_CONFIG`).
   */
  constructor(
    indexer?: FileIndexer,
    extractor?: ContentExtractor,
    config: SearchConfig = DEFAULT_SEARCH_CONFIG
  ) {
    this.indexer = indexer ?? new FileIndexer(config);
    this.extractor = extractor ?? new ContentExtractor(config);
    this.config = config;
  }

  /**
   * Main search method.
   *
   * Returns an empty list when the query contains no usable terms. Otherwise
   * serves a still-valid cached result if one exists, or runs the full search
   * pipeline and caches its output.
   *
   * When the pipeline fails with an error whose message mentions "timeout" or
   * "index", a reduced fallback search over the immediate directory is tried
   * instead (its results are not cached).
   *
   * @param options Search options supplied by the caller; not mutated.
   * @returns Ranked results, at most `options.limit` (default 50) entries.
   * @throws Error with message `Search failed: …` when the pipeline fails and
   *         no fallback result could be produced.
   */
  async search(options: SearchOptions): Promise<SearchResult[]> {
    console.log('Starting search with options:', options);

    // Parsed outside the try block so the catch handler can reuse it for the fallback.
    const parsedQuery = this.parseQuery(options.query);
    if (parsedQuery.terms.length === 0) {
      return [];
    }

    try {
      const cacheKey = this.getCacheKey(options);
      const cached = this.resultCache.get(cacheKey);
      if (cached && this.isCacheValid(cached)) {
        console.log('Returning cached search results');
        return cached.results;
      }

      // Single implementation of the pipeline; previously duplicated inline here.
      const finalResults = await this.performSearch(options, parsedQuery);

      this.cacheResults(cacheKey, finalResults);
      console.log(`Search completed: ${finalResults.length} results`);
      return finalResults;
    } catch (error) {
      console.error('Search failed:', error);

      // Try fallback search for critical failures
      if (error instanceof Error && (error.message.includes('timeout') || error.message.includes('index'))) {
        console.log('Attempting fallback search due to indexing issues');
        try {
          return await this.performFallbackSearch(options, parsedQuery);
        } catch (fallbackError) {
          console.error('Fallback search also failed:', fallbackError);
        }
      }

      throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Perform the main search operation: index lookup, pre-filtering, scoped
   * matching, merging, post-filtering, ranking, limiting and optional previews.
   *
   * Quick mode is used when the base path is the root (or unset). In quick
   * mode only the first 100 pre-filtered files are content-searched.
   */
  private async performSearch(options: SearchOptions, parsedQuery: ParsedQuery): Promise<SearchResult[]> {
    // Get file index with smart mode selection
    const useQuickMode = options.basePath === '/' || !options.basePath;
    const index = await this.indexer.getIndex(options.basePath || '/', useQuickMode);

    console.log(`Searching in index with ${index.fileCount} files (quick mode: ${useQuickMode})`);

    // Apply file type, size and date filters first to reduce search scope
    const filteredFiles = this.applyPreFilters(index.files, options);
    console.log(`After pre-filtering: ${filteredFiles.length} files`);

    const allResults: SearchResult[] = [];

    if (options.searchIn.includes('filename')) {
      allResults.push(...this.searchFilenames(parsedQuery, filteredFiles));
    }

    if (options.searchIn.includes('metadata')) {
      allResults.push(...this.searchMetadata(parsedQuery, filteredFiles));
    }

    if (options.searchIn.includes('content')) {
      // Limit content search for large datasets
      const cap = SearchEngine.QUICK_MODE_CONTENT_FILE_CAP;
      const contentSearchFiles = useQuickMode ? filteredFiles.slice(0, cap) : filteredFiles;
      if (contentSearchFiles.length < filteredFiles.length) {
        console.log(`Quick mode: content search limited to first ${cap} of ${filteredFiles.length} files`);
      }
      allResults.push(...(await this.searchContent(parsedQuery, contentSearchFiles)));
    }

    const mergedResults = this.mergeResults(allResults);
    const filteredResults = this.applyPostFilters(mergedResults, options);
    const rankedResults = this.rankResults(filteredResults, parsedQuery);
    const finalResults = rankedResults.slice(0, options.limit || SearchEngine.DEFAULT_LIMIT);

    if (options.includeContent) {
      await this.addContentPreviews(finalResults);
    }

    return finalResults;
  }

  /**
   * Fallback search when full indexing fails or times out.
   *
   * Lists only the immediate directory at the base path and searches
   * filenames and metadata (never content). Any failure is logged and
   * turned into an empty result list.
   */
  private async performFallbackSearch(options: SearchOptions, parsedQuery: ParsedQuery): Promise<SearchResult[]> {
    console.log('Performing fallback search with limited scope');

    try {
      // Try to get just the immediate directory listing
      const webdavClient = getClient(WebDAVClient);
      const basePath = options.basePath || '/';
      const directoryContents = await webdavClient.listDirectory(basePath);

      const files = this.parseDirectoryContents(directoryContents, basePath);
      console.log(`Fallback search: found ${files.length} files in immediate directory`);

      const filteredFiles = this.applyPreFilters(files, options);

      // Search only in filenames and metadata (skip content search for fallback)
      const results: SearchResult[] = [];

      if (options.searchIn.includes('filename')) {
        results.push(...this.searchFilenames(parsedQuery, filteredFiles));
      }

      if (options.searchIn.includes('metadata')) {
        results.push(...this.searchMetadata(parsedQuery, filteredFiles));
      }

      const mergedResults = this.mergeResults(results);
      const filteredResults = this.applyPostFilters(mergedResults, options);
      const rankedResults = this.rankResults(filteredResults, parsedQuery);
      // Smaller limit for fallback
      const finalResults = rankedResults.slice(0, options.limit || SearchEngine.FALLBACK_LIMIT);

      console.log(`Fallback search completed: ${finalResults.length} results`);
      return finalResults;
    } catch (error) {
      console.error('Fallback search failed:', error);
      return []; // Return empty results rather than failing completely
    }
  }

  /**
   * Parse directory contents into FileMetadata array (simplified).
   *
   * Accepts an array of items, an object with an `items` array, or a single
   * item object. Items that carry neither `path`, `href` nor `name` are
   * skipped, since no usable path can be derived for them.
   *
   * @param contents Raw listing as returned by the WebDAV client (shape not validated).
   * @param basePath Directory the listing belongs to; used to build paths from bare names.
   */
  private parseDirectoryContents(contents: any, basePath: string): FileMetadata[] {
    const files: FileMetadata[] = [];

    try {
      // Avoid "//name" when basePath is "/" or ends with a slash.
      const baseWithoutTrailingSlash = basePath.replace(/\/+$/, '');

      let items: any[] = [];
      if (Array.isArray(contents)) {
        items = contents;
      } else if (contents && typeof contents === 'object') {
        items = Array.isArray(contents.items) ? contents.items : [contents];
      }

      for (const item of items) {
        try {
          if (!item.path && !item.href && !item.name) {
            console.warn('Skipping directory item without path, href or name:', item);
            continue;
          }

          const path = item.path || item.href || `${baseWithoutTrailingSlash}/${item.name}`;
          // Strip trailing slashes so a directory href still yields its last segment.
          const name = item.name || path.replace(/\/+$/, '').split('/').pop() || '';
          const extension = name.includes('.') ? name.split('.').pop()?.toLowerCase() || '' : '';

          files.push({
            path,
            name,
            size: item.size || 0,
            lastModified: item.lastModified ? new Date(item.lastModified) : new Date(),
            mimeType: item.mimeType || item.contentType || 'application/octet-stream',
            extension,
            isDirectory: item.isDirectory || false,
            depth: 0
          });
        } catch (error) {
          console.warn('Failed to parse directory item:', error, item);
        }
      }
    } catch (error) {
      console.warn('Failed to parse directory contents:', error);
    }

    return files;
  }

  /**
   * Search in filenames. Emits one result per file whose name scores above zero.
   */
  private searchFilenames(query: ParsedQuery, files: FileMetadata[]): SearchResult[] {
    const results: SearchResult[] = [];

    for (const file of files) {
      const relevance = this.calculateFilenameRelevance(file.name, query.terms);
      if (relevance > 0) {
        results.push({
          file,
          matchType: 'filename',
          relevanceScore: relevance,
          highlights: this.findHighlights(file.name, query.terms),
          context: file.name
        });
      }
    }

    return results;
  }

  /**
   * Search in file metadata, using the extractor's text rendering of the
   * metadata. Metadata hits are weighted at 70% of their content-style score.
   */
  private searchMetadata(query: ParsedQuery, files: FileMetadata[]): SearchResult[] {
    const results: SearchResult[] = [];

    for (const file of files) {
      const metadataText = this.extractor.extractMetadataAsText(file);
      const relevance = this.calculateContentRelevance(metadataText, query.terms);

      if (relevance > 0) {
        results.push({
          file,
          matchType: 'metadata',
          relevanceScore: relevance * 0.7, // Lower weight for metadata matches
          highlights: this.findHighlights(metadataText, query.terms),
          context: metadataText.substring(0, 200)
        });
      }
    }

    return results;
  }

  /**
   * Search in file content.
   *
   * Only files the extractor reports as searchable and that do not exceed
   * `config.maxFileSize` are read. Files are processed in batches of 10;
   * a failure on one file is logged and does not abort the search.
   */
  private async searchContent(query: ParsedQuery, files: FileMetadata[]): Promise<SearchResult[]> {
    const results: SearchResult[] = [];

    const searchableFiles = files.filter(
      file => this.extractor.isSearchableContent(file) && file.size <= this.config.maxFileSize
    );

    console.log(`Searching content in ${searchableFiles.length} files`);

    // Process files in batches to avoid memory issues
    const batchSize = SearchEngine.CONTENT_BATCH_SIZE;
    for (let i = 0; i < searchableFiles.length; i += batchSize) {
      const batch = searchableFiles.slice(i, i + batchSize);

      const batchResults = await Promise.all(
        batch.map(async (file): Promise<SearchResult | null> => {
          try {
            const content = await this.extractor.extractContent(file);
            const relevance = this.calculateContentRelevance(content, query.terms);

            if (relevance > 0) {
              const contexts = ContentAnalyzer.findContext(content, query.terms.join(' '));
              return {
                file,
                matchType: 'content' as const,
                relevanceScore: relevance,
                highlights: this.findHighlights(content, query.terms),
                context: contexts[0] || content.substring(0, 200)
              };
            }
          } catch (error) {
            console.warn(`Failed to search content in ${file.path}:`, error);
          }
          return null;
        })
      );

      results.push(...batchResults.filter((r): r is SearchResult => r !== null));
    }

    return results;
  }

  /**
   * Split a query string into search terms, preserving the given letter case.
   *
   * Everything that is not a letter, combining mark, digit, underscore or
   * whitespace is treated as a separator (Unicode-aware, so non-ASCII words
   * such as "Müller" stay intact). Empty terms and stop words are removed.
   */
  private tokenize(query: string): string[] {
    return query
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter(term => term.length > 0)
      .filter(term => !this.isStopWord(term));
  }

  /**
   * Parse search query into components. Terms are lower-cased; operators and
   * filters are not parsed yet and are always empty.
   */
  private parseQuery(query: string): ParsedQuery {
    // Simple query parsing - can be enhanced with more sophisticated parsing
    const terms = this.tokenize(query.toLowerCase());

    return {
      terms,
      operators: [], // TODO: Implement operator parsing
      filters: [], // TODO: Implement filter parsing
      originalQuery: query
    };
  }

  /**
   * Calculate filename relevance score (0–100).
   *
   * Per term: 100 when the whole filename equals the term, 80 when the term
   * appears as a space-delimited word, otherwise 60 plus up to 20 for how
   * early the term occurs in the name. The summed score is capped at 100.
   */
  private calculateFilenameRelevance(filename: string, searchTerms: string[]): number {
    const lowerFilename = filename.toLowerCase();
    let score = 0;

    for (const term of searchTerms) {
      const lowerTerm = term.toLowerCase();

      if (lowerFilename === lowerTerm) {
        // Exact match gets highest score
        score += 100;
      } else if (
        lowerFilename.includes(` ${lowerTerm} `) ||
        lowerFilename.startsWith(`${lowerTerm} `) ||
        lowerFilename.endsWith(` ${lowerTerm}`)
      ) {
        // Exact word match
        score += 80;
      } else if (lowerFilename.includes(lowerTerm)) {
        // Contains term: higher score if the term is nearer the beginning
        const index = lowerFilename.indexOf(lowerTerm);
        const baseScore = 60;
        const positionBonus = Math.max(0, 20 - (index / lowerFilename.length) * 20);
        score += baseScore + positionBonus;
      }
    }

    return Math.min(100, score);
  }

  /**
   * Calculate content relevance score (0–100).
   *
   * Each term contributes 10 points per case-insensitive occurrence, up to 50.
   * With more than one term, up to 30 extra points are added in proportion to
   * the fraction of terms that occur at all. Terms are matched literally, so
   * characters that are special in regular expressions are safe. Empty terms
   * are ignored.
   */
  private calculateContentRelevance(content: string, searchTerms: string[]): number {
    const lowerContent = content.toLowerCase();
    let score = 0;
    let matchedTerms = 0;

    for (const term of searchTerms) {
      const lowerTerm = term.toLowerCase();
      if (lowerTerm.length === 0) {
        continue;
      }

      // Literal, non-overlapping occurrence count.
      const matches = lowerContent.split(lowerTerm).length - 1;
      if (matches > 0) {
        matchedTerms += 1;
        // Score based on frequency, but with diminishing returns
        score += Math.min(50, matches * 10);
      }
    }

    // Bonus for multiple term matches
    if (searchTerms.length > 1) {
      score += (matchedTerms / searchTerms.length) * 30;
    }

    return Math.min(100, score);
  }

  /**
   * Find highlighted terms in text: for every term that occurs
   * (case-insensitively), return the first occurrence as it is spelled in
   * `text`. Duplicates are removed.
   */
  private findHighlights(text: string, searchTerms: string[]): string[] {
    const highlights: string[] = [];
    const lowerText = text.toLowerCase();

    for (const term of searchTerms) {
      const index = lowerText.indexOf(term.toLowerCase());
      if (index !== -1) {
        // Take the case-preserved spelling from the original text
        highlights.push(text.substring(index, index + term.length));
      }
    }

    return [...new Set(highlights)];
  }

  /**
   * Apply pre-filters (file type, size, date).
   *
   * Size bounds are applied whenever they are set, including a bound of 0.
   */
  private applyPreFilters(files: FileMetadata[], options: SearchOptions): FileMetadata[] {
    return files.filter(file => {
      // File type filter
      if (options.fileTypes && options.fileTypes.length > 0) {
        if (!options.fileTypes.includes(file.extension)) {
          return false;
        }
      }

      // Size range filter
      if (options.sizeRange) {
        if (options.sizeRange.min != null && file.size < options.sizeRange.min) {
          return false;
        }
        if (options.sizeRange.max != null && file.size > options.sizeRange.max) {
          return false;
        }
      }

      // Date range filter
      if (options.dateRange) {
        if (options.dateRange.from && file.lastModified < options.dateRange.from) {
          return false;
        }
        if (options.dateRange.to && file.lastModified > options.dateRange.to) {
          return false;
        }
      }

      return true;
    });
  }

  /**
   * Apply post-filters and additional processing.
   *
   * With `options.caseSensitive`, keeps only results that have at least one
   * highlight containing a query term in its original letter case. The query
   * is tokenised the same way as for matching (minus lower-casing), so stray
   * whitespace or punctuation in the query cannot bypass or break the filter.
   */
  private applyPostFilters(results: SearchResult[], options: SearchOptions): SearchResult[] {
    if (!options.caseSensitive) {
      return results.filter(() => true);
    }

    const caseSensitiveTerms = this.tokenize(options.query);

    return results.filter(result =>
      result.highlights.some(highlight => caseSensitiveTerms.some(term => highlight.includes(term)))
    );
  }

  /**
   * Merge duplicate results from different search types.
   *
   * Results are keyed by file path. The higher-scoring result for a file
   * wins (the earlier one on a tie); highlights from all results for that
   * file are combined.
   */
  private mergeResults(results: SearchResult[]): SearchResult[] {
    const merged = new Map<string, SearchResult>();

    for (const result of results) {
      const key = result.file.path;
      const existing = merged.get(key);

      if (!existing) {
        merged.set(key, result);
        continue;
      }

      const winner = result.relevanceScore > existing.relevanceScore ? result : existing;
      merged.set(key, {
        ...winner,
        highlights: [...new Set([...existing.highlights, ...result.highlights])]
      });
    }

    return Array.from(merged.values());
  }

  /**
   * Rank and sort results: by final score (descending), then match type
   * (filename, content, metadata), then file name alphabetically.
   */
  private rankResults(results: SearchResult[], query: ParsedQuery): SearchResult[] {
    const matchTypePriority = SearchEngine.MATCH_TYPE_PRIORITY;

    return results
      .map(result => ({
        ...result,
        relevanceScore: this.calculateFinalScore(result, query)
      }))
      .sort((a, b) => {
        // Primary sort: relevance score
        if (b.relevanceScore !== a.relevanceScore) {
          return b.relevanceScore - a.relevanceScore;
        }

        // Secondary sort: match type priority
        const aPriority = matchTypePriority[a.matchType];
        const bPriority = matchTypePriority[b.matchType];
        if (bPriority !== aPriority) {
          return bPriority - aPriority;
        }

        // Tertiary sort: file name alphabetically
        return a.file.name.localeCompare(b.file.name);
      });
  }

  /**
   * Calculate final relevance score with bonuses, capped at 100:
   * up to +10 for files modified within the last 30 days, +5 for files under
   * 100 KB, +3 for a small set of preferred text/code extensions.
   */
  private calculateFinalScore(result: SearchResult, query: ParsedQuery): number {
    let score = result.relevanceScore;

    // Bonus for recent files (within last 30 days)
    const daysSinceModified = (Date.now() - result.file.lastModified.getTime()) / (1000 * 60 * 60 * 24);
    if (daysSinceModified <= 30) {
      score += Math.max(0, 10 - (daysSinceModified / 30) * 10);
    }

    // Bonus for smaller files (easier to work with)
    if (result.file.size < 100 * 1024) {
      score += 5;
    }

    // Bonus for certain file types
    const preferredExtensions = ['txt', 'md', 'json', 'js', 'ts', 'py'];
    if (preferredExtensions.includes(result.file.extension)) {
      score += 3;
    }

    return Math.min(100, score);
  }

  /**
   * Add content previews to results, in place. Only content and filename
   * matches get a preview; a failure for one file is logged and skipped.
   */
  private async addContentPreviews(results: SearchResult[]): Promise<void> {
    for (const result of results) {
      try {
        if (result.matchType === 'content' || result.matchType === 'filename') {
          result.contentPreview = await this.extractor.getContentPreview(result.file, 3);
        }
      } catch (error) {
        console.warn(`Failed to get content preview for ${result.file.path}:`, error);
      }
    }
  }

  /**
   * Check if word is a stop word (case-insensitive).
   */
  private isStopWord(word: string): boolean {
    return SearchEngine.STOP_WORDS.has(word.toLowerCase());
  }

  /**
   * Generate cache key for search results.
   *
   * Every option that influences the result list is part of the key, so a
   * cached list is never served for a different limit, filter or preview
   * setting. The caller's arrays are copied before sorting and are not
   * mutated. The query is lower-cased only for case-insensitive searches.
   */
  private getCacheKey(options: SearchOptions): string {
    return JSON.stringify({
      query: options.caseSensitive ? options.query : options.query.toLowerCase(),
      searchIn: [...options.searchIn].sort(),
      fileTypes: options.fileTypes ? [...options.fileTypes].sort() : undefined,
      basePath: options.basePath || '/',
      caseSensitive: options.caseSensitive,
      sizeRange: options.sizeRange,
      dateRange: options.dateRange,
      limit: options.limit,
      includeContent: options.includeContent
    });
  }

  /**
   * Check if cached results are still valid (younger than one minute).
   */
  private isCacheValid(cached: { results: SearchResult[]; timestamp: Date }): boolean {
    const age = Date.now() - cached.timestamp.getTime();
    return age < SearchEngine.RESULT_CACHE_TTL_MS;
  }

  /**
   * Cache search results, evicting the oldest entries once more than 100
   * are stored.
   */
  private cacheResults(key: string, results: SearchResult[]): void {
    // Re-insert so a refreshed key moves to the end of the Map's insertion
    // order and is not the next one evicted.
    this.resultCache.delete(key);
    this.resultCache.set(key, { results, timestamp: new Date() });

    // Clean up old cache entries
    while (this.resultCache.size > SearchEngine.RESULT_CACHE_MAX_ENTRIES) {
      const oldestKey = this.resultCache.keys().next().value;
      if (oldestKey === undefined) {
        break;
      }
      this.resultCache.delete(oldestKey);
    }
  }

  /**
   * Clear all caches: the result cache plus the indexer and extractor caches.
   */
  clearAllCaches(): void {
    this.resultCache.clear();
    this.indexer.clearCache();
    this.extractor.clearCache();
    console.log('All search caches cleared');
  }

  /**
   * Get search statistics: result cache size plus the cache statistics
   * reported by the indexer and the extractor.
   */
  getStats(): {
    resultCacheSize: number;
    indexCacheStats: any;
    contentCacheStats: any;
  } {
    return {
      resultCacheSize: this.resultCache.size,
      indexCacheStats: this.indexer.getCacheStats(),
      contentCacheStats: this.extractor.getCacheStats()
    };
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/hithereiamaliff/mcp-nextcloud'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.