Nextcloud MCP Server

content-extractor.ts•11.6 KiB

import { WebDAVClient } from '../client/webdav.js';
import { getClient } from './client-manager.js';
import {
  FileMetadata,
  SupportedFileTypes,
  FILE_TYPE_MAPPINGS,
  MIME_TYPE_MAPPINGS,
  SearchConfig,
  DEFAULT_SEARCH_CONFIG
} from '../models/webdav-search.js';

/**
 * Content cache entry
 */
interface ContentCacheEntry {
  /** Extracted (already cleaned) text stored for the file. */
  content: string;
  /** Moment the entry was written; drives TTL expiry and eviction order. */
  timestamp: Date;
  /** UTF-8 byte length of `content`, as counted in the cache size total. */
  size: number;
}

/**
 * Service for extracting searchable content from files
 */
export class ContentExtractor {
  /** Maximum number of characters of cleaned text kept per file. */
  private static readonly MAX_CONTENT_CHARS = 100000;

  /** Human-readable descriptions for well-known MIME types. */
  private static readonly MIME_DESCRIPTIONS: Record<string, string> = {
    'text/plain': 'text file',
    'text/markdown': 'markdown document',
    'text/csv': 'spreadsheet data',
    'application/json': 'json data',
    'text/xml': 'xml document',
    'application/xml': 'xml document',
    'text/html': 'web page',
    'text/css': 'stylesheet',
    'application/javascript': 'javascript code',
    'text/javascript': 'javascript code',
    'application/pdf': 'pdf document',
    'image/jpeg': 'jpeg image',
    'image/png': 'png image',
    'video/mp4': 'mp4 video',
    'audio/mpeg': 'mp3 audio'
  };

  private contentCache = new Map<string, ContentCacheEntry>();
  private config: SearchConfig;
  /** Sum of `size` over all entries currently held in `contentCache`. */
  private totalCacheSize = 0;

  /**
   * @param config Limits and TTLs used for extraction and caching.
   */
  constructor(config: SearchConfig = DEFAULT_SEARCH_CONFIG) {
    this.config = config;
  }

  /**
   * Extract searchable content from file based on type.
   *
   * Text, code and config files are read and cleaned; every other type, and
   * any file larger than `config.maxFileSize`, is represented by its metadata
   * text. Read failures are logged and also fall back to metadata text.
   *
   * @param file Metadata of the file to extract content for.
   * @returns The searchable text for the file.
   */
  async extractContent(file: FileMetadata): Promise<string> {
    // Check cache first
    const cacheKey = this.getCacheKey(file.path);
    const cached = this.contentCache.get(cacheKey);
    if (cached && this.isCacheValid(cached, file)) {
      return cached.content;
    }

    // Skip if file is too large
    if (file.size > this.config.maxFileSize) {
      console.log(`Skipping content extraction for large file: ${file.path} (${file.size} bytes)`);
      return this.extractMetadataAsText(file);
    }

    try {
      const fileType = this.getFileType(file.mimeType, file.extension);
      let content: string;

      switch (fileType) {
        case SupportedFileTypes.TEXT:
        case SupportedFileTypes.CODE:
        case SupportedFileTypes.CONFIG:
          content = await this.extractTextContent(file.path);
          break;
        case SupportedFileTypes.DOCUMENT:
        case SupportedFileTypes.MEDIA:
          content = this.extractMetadataAsText(file);
          break;
        default:
          content = this.extractMetadataAsText(file);
      }

      // Cache the content
      this.cacheContent(cacheKey, content, file.size);
      return content;
    } catch (error) {
      console.warn(`Failed to extract content from ${file.path}:`, error);
      return this.extractMetadataAsText(file);
    }
  }

  /**
   * Detect file type and determine extraction strategy.
   *
   * Lookup order: exact MIME type mapping, then extension mapping, then any
   * `text/*` MIME type as TEXT, and finally DOCUMENT.
   *
   * @param mimeType MIME type reported for the file (may be empty).
   * @param extension File extension used as key into the extension mapping (may be empty).
   * @returns The file type category used to choose an extraction strategy.
   */
  getFileType(mimeType: string, extension: string): SupportedFileTypes {
    // First try MIME type mapping
    if (mimeType && MIME_TYPE_MAPPINGS[mimeType]) {
      return MIME_TYPE_MAPPINGS[mimeType];
    }

    // Then try extension mapping
    if (extension && FILE_TYPE_MAPPINGS[extension]) {
      return FILE_TYPE_MAPPINGS[extension];
    }

    // Default to text if it looks like a text MIME type
    if (mimeType && mimeType.startsWith('text/')) {
      return SupportedFileTypes.TEXT;
    }

    // Default fallback
    return SupportedFileTypes.DOCUMENT;
  }

  /**
   * Check if file is searchable (has extractable content).
   *
   * @param file Metadata of the file to classify.
   * @returns `true` when the file type is TEXT, CODE or CONFIG.
   */
  isSearchableContent(file: FileMetadata): boolean {
    const fileType = this.getFileType(file.mimeType, file.extension);
    return [
      SupportedFileTypes.TEXT,
      SupportedFileTypes.CODE,
      SupportedFileTypes.CONFIG
    ].includes(fileType);
  }

  /**
   * Get content preview (first few lines).
   *
   * @param file Metadata of the file to preview.
   * @param maxLines Maximum number of lines to include.
   * @returns The leading lines joined with `\n`, or `''` if extraction throws.
   */
  async getContentPreview(file: FileMetadata, maxLines: number = 3): Promise<string> {
    try {
      const content = await this.extractContent(file);
      const lines = content.split('\n').slice(0, maxLines);
      return lines.join('\n');
    } catch (error) {
      console.warn(`Failed to get content preview for ${file.path}:`, error);
      return '';
    }
  }

  /**
   * Clear content cache
   */
  clearCache(): void {
    this.contentCache.clear();
    this.totalCacheSize = 0;
    console.log('Content cache cleared');
  }

  /**
   * Get cache statistics.
   *
   * @returns Entry count (`size`), accumulated byte size (`totalSize`) and the cached keys.
   */
  getCacheStats(): { size: number; totalSize: number; keys: string[] } {
    return {
      size: this.contentCache.size,
      totalSize: this.totalCacheSize,
      keys: Array.from(this.contentCache.keys())
    };
  }

  /**
   * Extract content from text files.
   *
   * @param path Path of the file to read through the WebDAV client.
   * @returns The cleaned file content.
   * @throws Rethrows whatever the WebDAV read throws, after logging it.
   */
  private async extractTextContent(path: string): Promise<string> {
    try {
      const webdavClient = getClient(WebDAVClient);
      const content = await webdavClient.readFile(path);

      // Basic content cleaning
      return this.cleanTextContent(content);
    } catch (error) {
      console.warn(`Failed to read text content from ${path}:`, error);
      throw error;
    }
  }

  /**
   * Extract metadata as searchable text.
   *
   * Produces a space-separated string of: file name without extension,
   * extension, MIME description, size category, modification date
   * (`YYYY-MM-DD` and year) and every path segment.
   *
   * @param file Metadata of the file to describe.
   * @returns The metadata tokens joined by single spaces.
   */
  public extractMetadataAsText(file: FileMetadata): string {
    const metadata: string[] = [];

    // Add filename (without extension). A plain suffix comparison is used
    // instead of a RegExp built from the extension: extensions such as "c++"
    // are not valid patterns and would make this fallback path throw.
    const suffix = file.extension ? `.${file.extension}` : '';
    const nameWithoutExt =
      suffix && file.name.endsWith(suffix)
        ? file.name.slice(0, -suffix.length)
        : file.name;
    metadata.push(nameWithoutExt);

    // Add extension
    if (file.extension) {
      metadata.push(file.extension);
    }

    // Add MIME type description
    metadata.push(this.getMimeTypeDescription(file.mimeType));

    // Add size category
    metadata.push(this.getSizeCategory(file.size));

    // Add date information. toISOString() throws on an invalid Date, so the
    // date tokens are skipped in that case rather than failing the whole call.
    if (!Number.isNaN(file.lastModified.getTime())) {
      const dateStr = file.lastModified.toISOString().split('T')[0];
      metadata.push(dateStr);
      metadata.push(file.lastModified.getFullYear().toString());
    }

    // Add path components (directories)
    const pathParts = file.path.split('/').filter(part => part.length > 0);
    metadata.push(...pathParts);

    return metadata.join(' ');
  }

  /**
   * Clean and normalize text content.
   *
   * Line breaks are kept (normalized to `\n`, blank lines squeezed) so that
   * line-based consumers such as `getContentPreview` can split the result;
   * all other whitespace runs collapse to a single space.
   *
   * @param content Raw text as read from the file.
   * @returns Normalized text, at most `MAX_CONTENT_CHARS` characters long.
   */
  private cleanTextContent(content: string): string {
    return content
      // Normalize CRLF / lone CR line endings to LF
      .replace(/\r\n?/g, '\n')
      // Collapse runs of non-newline whitespace into a single space
      .replace(/[^\S\n]+/g, ' ')
      // Remove control characters except newlines and tabs
      .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, '')
      // Drop spaces hugging a line break, then squeeze blank lines
      .replace(/ *\n */g, '\n')
      .replace(/\n{2,}/g, '\n')
      // Limit length to prevent memory issues
      .substring(0, ContentExtractor.MAX_CONTENT_CHARS)
      .trim();
  }

  /**
   * Get human-readable MIME type description.
   *
   * @param mimeType MIME type to describe.
   * @returns The mapped description, else the top-level MIME type (text before `/`), else `'file'`.
   */
  private getMimeTypeDescription(mimeType: string): string {
    return ContentExtractor.MIME_DESCRIPTIONS[mimeType] || mimeType.split('/')[0] || 'file';
  }

  /**
   * Get size category description.
   *
   * @param size File size in bytes.
   * @returns One of `empty`, `tiny`, `small`, `medium`, `large`, `very large`, `huge`.
   */
  private getSizeCategory(size: number): string {
    if (size === 0) return 'empty';
    if (size < 1024) return 'tiny';
    if (size < 10 * 1024) return 'small';
    if (size < 100 * 1024) return 'medium';
    if (size < 1024 * 1024) return 'large';
    if (size < 10 * 1024 * 1024) return 'very large';
    return 'huge';
  }

  /**
   * Generate cache key for content.
   *
   * @param path File path.
   * @returns The cache key; currently the path itself.
   */
  private getCacheKey(path: string): string {
    return path;
  }

  /**
   * Check if cached content is still valid.
   *
   * @param cached Entry found in the cache.
   * @param file Metadata of the requested file (currently not consulted).
   * @returns `false` once the entry is older than `config.contentTTL`, else `true`.
   */
  private isCacheValid(cached: ContentCacheEntry, file: FileMetadata): boolean {
    // Check if cache has expired
    const age = Date.now() - cached.timestamp.getTime();
    if (age > this.config.contentTTL) {
      return false;
    }

    // For now, we'll assume content is valid if not expired
    // In a more sophisticated implementation, we could check file modification time
    return true;
  }

  /**
   * Cache extracted content.
   *
   * @param key Cache key for the file.
   * @param content Extracted text to store.
   * @param fileSize Size of the source file (currently not used).
   */
  private cacheContent(key: string, content: string, fileSize: number): void {
    const contentSize = Buffer.byteLength(content, 'utf8');

    // Drop any previous entry for this key first so its bytes are not
    // counted twice when the key is written again.
    this.removeCacheEntry(key);

    // Don't cache if the content is too large. Checked before eviction so
    // an uncacheable item never pushes other entries out.
    if (contentSize > this.config.maxContentSize / 10) {
      console.log(`Not caching large content: ${key} (${contentSize} bytes)`);
      return;
    }

    // Make space if needed
    while (
      this.totalCacheSize + contentSize > this.config.maxContentSize &&
      this.contentCache.size > 0
    ) {
      this.evictOldestCacheEntry();
    }

    // Cache the content
    this.contentCache.set(key, {
      content,
      timestamp: new Date(),
      size: contentSize
    });
    this.totalCacheSize += contentSize;
  }

  /**
   * Evict the oldest cache entry
   */
  private evictOldestCacheEntry(): void {
    let oldestKey: string | null = null;
    // Start from +Infinity so that an entry is always selected, including
    // one stamped in the current millisecond.
    let oldestTime = Number.POSITIVE_INFINITY;

    for (const [key, entry] of this.contentCache.entries()) {
      const entryTime = entry.timestamp.getTime();
      if (entryTime < oldestTime) {
        oldestTime = entryTime;
        oldestKey = key;
      }
    }

    // Compare against null explicitly: an empty-string key is falsy but valid.
    if (oldestKey !== null) {
      this.removeCacheEntry(oldestKey);
    }
  }

  /**
   * Remove one entry and keep the running size total in sync.
   *
   * @param key Cache key to remove; a missing key is a no-op.
   */
  private removeCacheEntry(key: string): void {
    const entry = this.contentCache.get(key);
    if (entry) {
      this.totalCacheSize -= entry.size;
      this.contentCache.delete(key);
    }
  }
}

/**
 * Utility functions for content analysis
 */
export class ContentAnalyzer {
  /** Upper bound on the number of context snippets returned by `findContext`. */
  private static readonly MAX_CONTEXTS = 5;

  /** Common words ignored by keyword extraction; built once and reused. */
  private static readonly STOP_WORDS: ReadonlySet<string> = new Set([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
    'by', 'as', 'up', 'it', 'is', 'be', 'are', 'was', 'were', 'been', 'have', 'has',
    'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
    'must', 'can', 'shall', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she',
    'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'her', 'our',
    'their', 'mine', 'yours', 'hers', 'ours', 'theirs', 'from', 'into', 'about',
    'through', 'during', 'before', 'after', 'above', 'below', 'up', 'down', 'out',
    'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
    'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'just', 'don', 'now', 'd', 'll', 'm', 're', 've', 'y',
    'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma',
    'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'
  ]);

  /**
   * Extract keywords from content.
   *
   * Words are lower-cased, stripped of non-word characters, limited to
   * 3–49 characters and filtered against the stop-word list, then ranked by
   * frequency (ties keep first-occurrence order).
   *
   * @param content Text to analyse.
   * @param maxKeywords Maximum number of keywords to return; values below 0 are treated as 0.
   * @returns The most frequent keywords, most frequent first.
   */
  static extractKeywords(content: string, maxKeywords: number = 20): string[] {
    const words = content
      .toLowerCase()
      .replace(/[^\w\s]/g, ' ')
      .split(/\s+/)
      .filter(word => word.length > 2 && word.length < 50)
      .filter(word => !ContentAnalyzer.isStopWord(word));

    // Count word frequency
    const wordCount = new Map<string, number>();
    for (const word of words) {
      wordCount.set(word, (wordCount.get(word) || 0) + 1);
    }

    // Sort by frequency and return top keywords. The limit is clamped because
    // a negative slice end would drop entries from the tail instead.
    return Array.from(wordCount.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, Math.max(0, maxKeywords))
      .map(([word]) => word);
  }

  /**
   * Check if word is a stop word (common words to ignore).
   *
   * @param word Word to test; compared case-insensitively.
   * @returns `true` when the word is in the stop-word list.
   */
  private static isStopWord(word: string): boolean {
    return ContentAnalyzer.STOP_WORDS.has(word.toLowerCase());
  }

  /**
   * Find text around search matches (context).
   *
   * Matching is case-insensitive and non-overlapping.
   *
   * @param content Text to search in.
   * @param searchTerm Term to look for; an empty term yields no contexts.
   * @param contextSize Characters of surrounding text kept on each side; values below 0 are treated as 0.
   * @returns Trimmed snippets around the first matches, at most five.
   */
  static findContext(content: string, searchTerm: string, contextSize: number = 50): string[] {
    const contexts: string[] = [];

    // An empty term matches at every index without advancing, which would
    // only produce meaningless snippets.
    if (!searchTerm) {
      return contexts;
    }

    const padding = Math.max(0, contextSize);
    const lowerContent = content.toLowerCase();
    const lowerTerm = searchTerm.toLowerCase();

    let index = 0;
    while ((index = lowerContent.indexOf(lowerTerm, index)) !== -1) {
      const start = Math.max(0, index - padding);
      const end = Math.min(content.length, index + lowerTerm.length + padding);

      contexts.push(content.substring(start, end).trim());

      index += lowerTerm.length;

      // Limit contexts to avoid too many results
      if (contexts.length >= ContentAnalyzer.MAX_CONTEXTS) {
        break;
      }
    }

    return contexts;
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/hithereiamaliff/mcp-nextcloud'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

content-extractor.ts•11.6 KiB