Skip to main content
Glama
content-extractor.ts (11.9 kB)
import { WebDAVClient } from '../client/webdav.js';
import { getClient } from './client-manager.js';
import {
  FileMetadata,
  SupportedFileTypes,
  FILE_TYPE_MAPPINGS,
  MIME_TYPE_MAPPINGS,
  SearchConfig,
  DEFAULT_SEARCH_CONFIG
} from '../models/webdav-search.js';

/**
 * Content cache entry
 */
interface ContentCacheEntry {
  /** Extracted text (cleaned file content or metadata text). */
  content: string;
  /** Moment the entry was stored; used for TTL expiry and eviction order. */
  timestamp: Date;
  /** UTF-8 byte length of `content`, as counted towards the cache budget. */
  size: number;
}

/**
 * Service for extracting searchable content from files
 */
export class ContentExtractor {
  /** Upper bound (in UTF-16 code units) on cleaned text kept per file. */
  private static readonly MAX_CONTENT_LENGTH = 100000;

  /** Human-readable descriptions for well-known MIME types. */
  private static readonly MIME_DESCRIPTIONS: ReadonlyMap<string, string> = new Map([
    ['text/plain', 'text file'],
    ['text/markdown', 'markdown document'],
    ['text/csv', 'spreadsheet data'],
    ['application/json', 'json data'],
    ['text/xml', 'xml document'],
    ['application/xml', 'xml document'],
    ['text/html', 'web page'],
    ['text/css', 'stylesheet'],
    ['application/javascript', 'javascript code'],
    ['text/javascript', 'javascript code'],
    ['application/pdf', 'pdf document'],
    ['image/jpeg', 'jpeg image'],
    ['image/png', 'png image'],
    ['video/mp4', 'mp4 video'],
    ['audio/mpeg', 'mp3 audio']
  ]);

  private contentCache = new Map<string, ContentCacheEntry>();
  private config: SearchConfig;
  /** Sum of `size` over all entries currently in `contentCache`. */
  private totalCacheSize = 0;

  constructor(config: SearchConfig = DEFAULT_SEARCH_CONFIG) {
    this.config = config;
  }

  /**
   * Extract searchable content from file based on type.
   *
   * Text, code and config files are downloaded and cleaned; every other type,
   * files larger than `config.maxFileSize`, and files whose download fails are
   * represented by their metadata text instead.
   *
   * @param file - Metadata of the file to extract from.
   * @returns Searchable text for the file.
   */
  async extractContent(file: FileMetadata): Promise<string> {
    // Check cache first
    const cacheKey = this.getCacheKey(file.path);
    const cached = this.contentCache.get(cacheKey);
    if (cached) {
      if (this.isCacheValid(cached, file)) {
        return cached.content;
      }
      // Drop the stale entry right away so it cannot linger (and keep counting
      // towards the budget) when the fresh content turns out not to be cacheable.
      this.removeCacheEntry(cacheKey);
    }

    // Skip if file is too large
    if (file.size > this.config.maxFileSize) {
      console.log(`Skipping content extraction for large file: ${file.path} (${file.size} bytes)`);
      return this.extractMetadataAsText(file);
    }

    try {
      const fileType = this.getFileType(file.mimeType, file.extension);
      let content: string;

      switch (fileType) {
        case SupportedFileTypes.TEXT:
        case SupportedFileTypes.CODE:
        case SupportedFileTypes.CONFIG:
          content = await this.extractTextContent(file.path);
          break;
        case SupportedFileTypes.DOCUMENT:
        case SupportedFileTypes.MEDIA:
        default:
          // No body extraction for these types; index the metadata only.
          content = this.extractMetadataAsText(file);
      }

      // Cache the content
      this.cacheContent(cacheKey, content, file.size);
      return content;
    } catch (error) {
      console.warn(`Failed to extract content from ${file.path}:`, error);
      return this.extractMetadataAsText(file);
    }
  }

  /**
   * Detect file type and determine extraction strategy.
   *
   * Lookup order: exact MIME type, MIME type with parameters stripped and
   * lower-cased (so `text/plain; charset=utf-8` matches `text/plain`),
   * extension, any `text/*` MIME type, and finally DOCUMENT.
   *
   * @param mimeType - MIME type reported for the file; may be empty.
   * @param extension - File extension used as key into FILE_TYPE_MAPPINGS; may be empty.
   */
  getFileType(mimeType: string, extension: string): SupportedFileTypes {
    // First try MIME type mapping (exact value, as supplied)
    if (mimeType && MIME_TYPE_MAPPINGS[mimeType]) {
      return MIME_TYPE_MAPPINGS[mimeType];
    }

    // Then the same mapping with parameters removed and case normalised
    const bareMimeType = this.normalizeMimeType(mimeType);
    if (bareMimeType && MIME_TYPE_MAPPINGS[bareMimeType]) {
      return MIME_TYPE_MAPPINGS[bareMimeType];
    }

    // Then try extension mapping
    if (extension && FILE_TYPE_MAPPINGS[extension]) {
      return FILE_TYPE_MAPPINGS[extension];
    }

    // Default to text if it looks like a text MIME type
    if (bareMimeType.startsWith('text/')) {
      return SupportedFileTypes.TEXT;
    }

    // Default fallback
    return SupportedFileTypes.DOCUMENT;
  }

  /**
   * Check if file is searchable (has extractable content)
   *
   * @returns `true` when the file is classified as TEXT, CODE or CONFIG.
   */
  isSearchableContent(file: FileMetadata): boolean {
    const fileType = this.getFileType(file.mimeType, file.extension);
    return [
      SupportedFileTypes.TEXT,
      SupportedFileTypes.CODE,
      SupportedFileTypes.CONFIG
    ].includes(fileType);
  }

  /**
   * Get content preview (first few lines)
   *
   * @param file - File to preview.
   * @param maxLines - Maximum number of lines to return (default 3).
   * @returns The leading lines joined with `\n`, or `''` if extraction throws.
   */
  async getContentPreview(file: FileMetadata, maxLines: number = 3): Promise<string> {
    try {
      const content = await this.extractContent(file);
      const lineLimit = Math.max(0, maxLines);
      return content.split('\n').slice(0, lineLimit).join('\n');
    } catch (error) {
      console.warn(`Failed to get content preview for ${file.path}:`, error);
      return '';
    }
  }

  /**
   * Clear content cache
   */
  clearCache(): void {
    this.contentCache.clear();
    this.totalCacheSize = 0;
    console.log('Content cache cleared');
  }

  /**
   * Get cache statistics
   *
   * @returns Entry count (`size`), total cached bytes (`totalSize`) and the cache keys.
   */
  getCacheStats(): { size: number; totalSize: number; keys: string[] } {
    return {
      size: this.contentCache.size,
      totalSize: this.totalCacheSize,
      keys: Array.from(this.contentCache.keys())
    };
  }

  /**
   * Extract content from text files
   *
   * @param path - Path passed to the WebDAV client's `readFile`.
   * @throws Re-throws whatever the client throws, after logging a warning.
   */
  private async extractTextContent(path: string): Promise<string> {
    try {
      const webdavClient = getClient(WebDAVClient);
      const content = await webdavClient.readFile(path);

      // Basic content cleaning
      return this.cleanTextContent(content);
    } catch (error) {
      console.warn(`Failed to read text content from ${path}:`, error);
      throw error;
    }
  }

  /**
   * Extract metadata as searchable text
   *
   * Produces a space-separated string of: file name without extension,
   * extension, MIME type description, size category, modification date
   * (`YYYY-MM-DD` and year, both in UTC) and every path component.
   * The date tokens are omitted when `lastModified` is not a valid Date.
   */
  public extractMetadataAsText(file: FileMetadata): string {
    const metadata: string[] = [];

    // Add filename (without extension)
    metadata.push(this.stripExtension(file.name, file.extension));

    // Add extension
    if (file.extension) {
      metadata.push(file.extension);
    }

    // Add MIME type description
    metadata.push(this.getMimeTypeDescription(file.mimeType));

    // Add size category
    metadata.push(this.getSizeCategory(file.size));

    // Add date information. toISOString() throws on an invalid Date, and this
    // method is the fallback of extractContent, so it must not throw here.
    const modified = file.lastModified;
    if (modified instanceof Date && !Number.isNaN(modified.getTime())) {
      const [datePart = ''] = modified.toISOString().split('T');
      metadata.push(datePart);
      // UTC year, so it always agrees with the UTC date string above.
      metadata.push(modified.getUTCFullYear().toString());
    }

    // Add path components (directories)
    const pathParts = file.path.split('/').filter(part => part.length > 0);
    metadata.push(...pathParts);

    return metadata.filter(token => token.length > 0).join(' ');
  }

  /**
   * Remove a trailing `.extension` from a file name.
   *
   * Uses a plain suffix comparison (case-insensitive) rather than a regular
   * expression, so extensions containing regex metacharacters such as `c++`
   * are handled. A name that consists only of the suffix is left untouched.
   */
  private stripExtension(name: string, extension: string): string {
    if (!extension) {
      return name;
    }
    const suffix = `.${extension}`;
    const hasSuffix =
      name.length > suffix.length &&
      name.toLowerCase().endsWith(suffix.toLowerCase());
    return hasSuffix ? name.slice(0, name.length - suffix.length) : name;
  }

  /**
   * Clean and normalize text content
   *
   * Line breaks are kept (normalised to `\n`) so that line-based consumers such
   * as getContentPreview still see lines; all other whitespace runs collapse to
   * a single space and blank lines are dropped.
   */
  private cleanTextContent(content: string): string {
    return content
      // Normalise Windows / old-Mac line endings
      .replace(/\r\n?/g, '\n')
      // Remove control characters except whitespace ones (handled below)
      .replace(/[\x00-\x08\x0E-\x1F\x7F]/g, '')
      // Collapse horizontal whitespace (everything in \s except \n)
      .replace(/[^\S\n]+/g, ' ')
      // Trim spaces around line breaks and collapse blank lines
      .replace(/ ?\n[ \n]*/g, '\n')
      // Limit length to prevent memory issues
      .substring(0, ContentExtractor.MAX_CONTENT_LENGTH)
      .trim();
  }

  /**
   * Reduce a MIME type to its lower-cased `type/subtype` part, dropping any
   * `;`-separated parameters. Returns `''` for a missing or empty value.
   */
  private normalizeMimeType(mimeType: string): string {
    if (!mimeType) {
      return '';
    }
    const [essence = ''] = mimeType.split(';');
    return essence.trim().toLowerCase();
  }

  /**
   * Get human-readable MIME type description
   *
   * @returns The known description, else the top-level type (the part before
   * `/`), else `'file'` when the MIME type is missing or empty.
   */
  private getMimeTypeDescription(mimeType: string): string {
    const normalized = this.normalizeMimeType(mimeType);
    if (!normalized) {
      return 'file';
    }

    const known = ContentExtractor.MIME_DESCRIPTIONS.get(normalized);
    if (known) {
      return known;
    }

    const [topLevelType = ''] = normalized.split('/');
    return topLevelType || 'file';
  }

  /**
   * Get size category description
   *
   * @param size - File size in bytes.
   */
  private getSizeCategory(size: number): string {
    if (size === 0) return 'empty';
    if (size < 1024) return 'tiny';
    if (size < 10 * 1024) return 'small';
    if (size < 100 * 1024) return 'medium';
    if (size < 1024 * 1024) return 'large';
    if (size < 10 * 1024 * 1024) return 'very large';
    return 'huge';
  }

  /**
   * Generate cache key for content
   */
  private getCacheKey(path: string): string {
    return path;
  }

  /**
   * Check if cached content is still valid
   *
   * An entry is invalid when it is older than `config.contentTTL` or when the
   * file reports a modification time later than the moment it was cached.
   * NOTE(review): the second check compares the file's timestamp with the local
   * clock; with clock skew it can only cause extra re-extraction, never more
   * staleness than the TTL check alone.
   */
  private isCacheValid(cached: ContentCacheEntry, file: FileMetadata): boolean {
    const cachedAt = cached.timestamp.getTime();

    // Check if cache has expired
    if (Date.now() - cachedAt > this.config.contentTTL) {
      return false;
    }

    // Check whether the file changed after it was cached
    const modified = file.lastModified;
    if (modified instanceof Date) {
      const modifiedAt = modified.getTime();
      if (!Number.isNaN(modifiedAt) && modifiedAt > cachedAt) {
        return false;
      }
    }

    return true;
  }

  /**
   * Cache extracted content
   *
   * @param key - Cache key (see getCacheKey).
   * @param content - Text to store.
   * @param fileSize - Size of the source file; currently not used by the cache.
   */
  private cacheContent(key: string, content: string, fileSize: number): void {
    const contentSize = Buffer.byteLength(content, 'utf8');

    // Replacing an entry: release the old one first so its bytes are not
    // counted twice in totalCacheSize.
    this.removeCacheEntry(key);

    // Don't cache if the content is too large. This is decided before any
    // eviction so that uncacheable content never pushes other entries out.
    if (contentSize > this.config.maxContentSize / 10) {
      console.log(`Not caching large content: ${key} (${contentSize} bytes)`);
      return;
    }

    // Make space if needed
    while (
      this.totalCacheSize + contentSize > this.config.maxContentSize &&
      this.contentCache.size > 0
    ) {
      this.evictOldestCacheEntry();
    }

    // Cache the content
    this.contentCache.set(key, {
      content,
      timestamp: new Date(),
      size: contentSize
    });
    this.totalCacheSize += contentSize;
  }

  /**
   * Remove one entry (if present) and give its bytes back to the budget.
   */
  private removeCacheEntry(key: string): void {
    const existing = this.contentCache.get(key);
    if (!existing) {
      return;
    }
    this.totalCacheSize -= existing.size;
    this.contentCache.delete(key);
  }

  /**
   * Evict the oldest cache entry
   *
   * Always removes exactly one entry when the cache is non-empty, which is what
   * guarantees that the eviction loop in cacheContent terminates.
   */
  private evictOldestCacheEntry(): void {
    let oldestKey: string | null = null;
    // Infinity (not Date.now()) so entries stamped in the current millisecond
    // are still eligible.
    let oldestTime = Infinity;

    for (const [key, entry] of this.contentCache.entries()) {
      const storedAt = entry.timestamp.getTime();
      // The `oldestKey === null` arm picks the first entry as a fallback, so a
      // candidate exists even if a timestamp is not comparable (NaN).
      if (oldestKey === null || storedAt < oldestTime) {
        oldestTime = storedAt;
        oldestKey = key;
      }
    }

    // Explicit null check: an empty-string key is a legitimate key.
    if (oldestKey !== null) {
      this.removeCacheEntry(oldestKey);
    }
  }
}

/**
 * Utility functions for content analysis
 */
export class ContentAnalyzer {
  /** Maximum number of excerpts returned by findContext. */
  private static readonly MAX_CONTEXTS = 5;

  /** Common words ignored by extractKeywords; built once, not per word. */
  private static readonly STOP_WORDS: ReadonlySet<string> = new Set([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    'as', 'up', 'it', 'is', 'be', 'are', 'was', 'were', 'been', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
    'can', 'shall', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'we',
    'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'our', 'their',
    'mine', 'yours', 'hers', 'ours', 'theirs', 'from', 'into', 'about', 'through',
    'during', 'before', 'after', 'above', 'below', 'down', 'out', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
    'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'just', 'don', 'now', 'd', 'll', 'm', 're', 've', 'y', 'ain',
    'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn',
    'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'
  ]);

  /**
   * Extract keywords from content
   *
   * Lower-cases the text, replaces every character outside `\w` and whitespace
   * with a space, keeps words of 3 to 49 characters that are not stop words,
   * and returns the most frequent ones (ties keep first-seen order).
   *
   * @param content - Text to analyse.
   * @param maxKeywords - Maximum number of keywords (default 20); values that
   * are not positive yield an empty array.
   */
  static extractKeywords(content: string, maxKeywords: number = 20): string[] {
    if (!content || !(maxKeywords > 0)) {
      return [];
    }

    const words = content
      .toLowerCase()
      .replace(/[^\w\s]/g, ' ')
      .split(/\s+/)
      .filter(word => word.length > 2 && word.length < 50)
      .filter(word => !ContentAnalyzer.isStopWord(word));

    // Count word frequency
    const wordCount = new Map<string, number>();
    for (const word of words) {
      wordCount.set(word, (wordCount.get(word) ?? 0) + 1);
    }

    // Sort by frequency and return top keywords
    return Array.from(wordCount.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, maxKeywords)
      .map(([word]) => word);
  }

  /**
   * Check if word is a stop word (common words to ignore)
   */
  private static isStopWord(word: string): boolean {
    return ContentAnalyzer.STOP_WORDS.has(word.toLowerCase());
  }

  /**
   * Find text around search matches (context)
   *
   * Matching is case-insensitive and non-overlapping; at most five excerpts are
   * returned. An empty `content` or `searchTerm` yields an empty array.
   * NOTE: match positions are found in a lower-cased copy of `content`; for the
   * rare characters whose lower-case form has a different length the excerpt
   * window can be shifted by that difference.
   *
   * @param content - Text to search in.
   * @param searchTerm - Term to look for.
   * @param contextSize - Characters of context kept on each side (default 50).
   * @returns Trimmed excerpts, in order of appearance.
   */
  static findContext(content: string, searchTerm: string, contextSize: number = 50): string[] {
    // An empty term "matches" at every index without advancing; bail out.
    if (!content || !searchTerm) {
      return [];
    }

    const contexts: string[] = [];
    const lowerContent = content.toLowerCase();
    const lowerTerm = searchTerm.toLowerCase();
    const radius = Math.max(0, contextSize);

    let index = lowerContent.indexOf(lowerTerm);
    // Limit contexts to avoid too many results
    while (index !== -1 && contexts.length < ContentAnalyzer.MAX_CONTEXTS) {
      const start = Math.max(0, index - radius);
      const end = Math.min(content.length, index + lowerTerm.length + radius);
      contexts.push(content.substring(start, end).trim());

      index = lowerContent.indexOf(lowerTerm, index + lowerTerm.length);
    }

    return contexts;
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/hithereiamaliff/mcp-nextcloud'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.