folder-mcp

Overview Schema Related Servers Score Discussions

extraction-service.ts•19.9 KiB

/**
 * Semantic Extraction Service
 *
 * Implements research-validated semantic extraction using KeyBERT and other
 * advanced techniques. Replaces the broken ContentProcessingService.
 */

import { ISemanticExtractionService, SemanticData, SemanticExtractionOptions, IPythonSemanticService } from './interfaces.js';
import { SemanticScore } from '../../types/index.js';
import { ILoggingService } from '../../di/interfaces.js';
import { NGramCosineExtractor, createNGramCosineExtractor, IEmbeddingModel, NGramExtractionOptions } from './algorithms/ngram-cosine-extractor.js';
import { IReadabilityCalculator, createReadabilityCalculator } from './algorithms/readability-calculator.js';
import { TextPreprocessor, createTextPreprocessor } from './text-preprocessor.js';

/**
 * Default extraction options
 */
const DEFAULT_OPTIONS: SemanticExtractionOptions = {
  maxKeyPhrases: 10,
  ngramRange: [1, 3],
  useMmr: true,
  diversity: 0.5,
  calculateMetrics: true
};

/**
 * Common English stop words skipped by the fallback key-phrase generator.
 * Built once at module load: the lookup runs once per word of the input text,
 * so rebuilding the set on every call would be wasted work.
 */
const STOP_WORDS: ReadonlySet<string> = new Set([
  'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i',
  'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
  'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she',
  'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what',
  'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me',
  'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take',
  'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other',
  'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
  'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way',
  'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us',
  'is', 'are', 'was', 'were', 'been', 'being', 'has', 'had', 'does', 'did'
]);

/**
 * Mean character length of a list of strings; 0 for an empty list so that
 * diagnostic logging never reports NaN.
 */
function averageLength(items: string[]): number {
  if (items.length === 0) return 0;
  return items.reduce((sum, item) => sum + item.length, 0) / items.length;
}

/**
 * Semantic Extraction Service Implementation
 *
 * Uses KeyBERT for multiword phrase extraction and proper readability formulas.
 * Designed to achieve >80% multiword phrases vs current 11%.
 */
export class SemanticExtractionService implements ISemanticExtractionService {
  private readonly options: SemanticExtractionOptions;
  private ngramExtractor: NGramCosineExtractor | null = null;
  private readonly readabilityCalculator: IReadabilityCalculator;

  /**
   * @param pythonService Python-backed KeyBERT service, or null when extraction
   *   should run through the in-process N-gram extractor instead.
   * @param logger Logging sink used for diagnostics and failure reports.
   * @param embeddingModel Embedding model handed to the N-gram extractor; only
   *   used when no Python service is supplied.
   */
  constructor(
    private pythonService: IPythonSemanticService | null,
    private logger: ILoggingService,
    private embeddingModel?: IEmbeddingModel | null
  ) {
    // Copy the defaults so this instance never aliases the shared module constant.
    this.options = { ...DEFAULT_OPTIONS };

    // Initialize Coleman-Liau readability calculator (always available)
    this.readabilityCalculator = createReadabilityCalculator();

    // Initialize N-gram extractor for ONNX models (which don't use Python service)
    // ONNX models run entirely in Node.js and use N-gram extraction
    if (!pythonService && embeddingModel) {
      this.ngramExtractor = createNGramCosineExtractor(embeddingModel, logger);
      this.logger.info('Initialized N-gram extractor for ONNX model semantic extraction');
    }
  }

  /**
   * Extract semantic data from text with enhanced options (Sprint 13)
   *
   * @param text Raw text to analyse.
   * @param embeddings Embedding of the text; needed by the N-gram path.
   * @param options Per-call overrides merged over the service defaults.
   * @returns Key phrases, readability score, extraction method, timing and
   *   (unless disabled via `calculateMetrics`) quality metrics.
   * @throws Re-throws any extraction error after logging it.
   */
  async extractFromText(text: string, embeddings?: Float32Array, options?: SemanticExtractionOptions): Promise<SemanticData> {
    const startTime = Date.now();

    try {
      // Apply text preprocessing if file type is specified (Sprint 13)
      let processedText = text;
      if (options?.fileType) {
        const preprocessor = createTextPreprocessor(options.fileType);
        processedText = preprocessor.preprocess(text);

        this.logger.debug('Applied text preprocessing', {
          fileType: options.fileType,
          originalLength: text.length,
          processedLength: processedText.length
        });
      }

      // Extract key phrases using KeyBERT if available, fallback to n-gram with enhanced options
      const keyPhrases = await this.extractKeyPhrases(processedText, embeddings, options);

      // Calculate readability using Coleman-Liau (Sprint 2 implementation)
      // NOTE(review): this scores the ORIGINAL text, whereas the batch paths
      // score the preprocessed text — confirm which one is intended.
      const readabilityScore = this.readabilityCalculator.calculate(text);

      // Calculate quality metrics (a per-call `calculateMetrics` overrides the default)
      const qualityMetrics = this.shouldCalculateMetrics(options)
        ? this.calculateQualityMetrics(keyPhrases)
        : undefined;

      const processingTimeMs = Date.now() - startTime;
      const extractionMethod = this.pythonService ? 'keybert' : 'ngram';

      this.logger.debug('Semantic extraction complete', {
        method: extractionMethod,
        phraseCount: keyPhrases.length,
        multiwordRatio: qualityMetrics?.multiwordRatio,
        timeMs: processingTimeMs
      });

      return {
        keyPhrases,
        readabilityScore,
        extractionMethod,
        processingTimeMs,
        qualityMetrics
      };
    } catch (error) {
      // FAIL LOUDLY - propagate the error!
      this.logFailure('CRITICAL: Semantic extraction failed completely!', error);
      throw error; // Re-throw to fail loudly
    }
  }

  /**
   * Extract semantic data from multiple texts (batch processing)
   * Optimized for performance when processing multiple chunks
   *
   * Dispatches to the KeyBERT batch path when a Python service is configured,
   * otherwise to the N-gram path when an N-gram extractor exists.
   *
   * @param texts Texts to analyse.
   * @param embeddings One embedding per text; required by the N-gram path.
   * @param options Per-call overrides merged over the service defaults.
   * @returns One result per input text, in input order.
   * @throws When neither extraction method is available, or when the selected
   *   method fails; every failure is logged before being re-thrown.
   */
  async extractFromTextBatch(texts: string[], embeddings?: Float32Array[], options?: SemanticExtractionOptions): Promise<SemanticData[]> {
    try {
      // Apply text preprocessing to all texts if needed (Sprint 13)
      let processedTexts = texts;
      if (options?.fileType) {
        const preprocessor = createTextPreprocessor(options.fileType);
        processedTexts = texts.map(text => preprocessor.preprocess(text));

        this.logger.debug('Applied batch text preprocessing', {
          fileType: options.fileType,
          textCount: texts.length,
          avgOriginalLength: averageLength(texts),
          avgProcessedLength: averageLength(processedTexts)
        });
      }

      // For GPU models with Python service, use batch KeyBERT processing
      if (this.pythonService) {
        return await this.extractBatchWithKeyBERT(processedTexts, options);
      }

      // For ONNX models with N-gram extraction, process each text individually
      // because N-gram requires individual embeddings and is already optimized
      if (this.ngramExtractor) {
        return await this.extractBatchWithNGram(processedTexts, embeddings, options);
      }

      // No extraction method available - FAIL LOUDLY
      throw new Error(
        'CRITICAL: No semantic extraction method available for batch processing! ' +
        'GPU models require Python with KeyBERT, ONNX models require N-gram extractor.'
      );
    } catch (error) {
      this.logFailure('CRITICAL: Batch semantic extraction failed completely!', error);
      throw error; // Re-throw to fail loudly
    }
  }

  /**
   * Batch extraction using KeyBERT (for GPU models)
   *
   * @throws When the Python service is missing or reports KeyBERT unavailable.
   */
  private async extractBatchWithKeyBERT(texts: string[], options?: SemanticExtractionOptions): Promise<SemanticData[]> {
    if (!this.pythonService) {
      throw new Error('Python service not available for KeyBERT batch processing');
    }

    // Check if KeyBERT is available
    const available = await this.pythonService.isKeyBERTAvailable();
    if (!available) {
      throw new Error('KeyBERT not available in Python environment');
    }

    // Extract key phrases using batch KeyBERT
    // GPU models (KeyBERT) don't need structured candidates - they extract quality phrases naturally
    const batchStartTime = Date.now();
    const mergedOptions = { ...this.options, ...options };

    // Remove structured candidates for GPU/KeyBERT models - they don't need format-based hints
    const keyBertOptions = { ...mergedOptions };
    delete keyBertOptions.structuredCandidates;
    delete keyBertOptions.contentZones;

    const keyPhrasesArray = await this.pythonService.extractKeyPhrasesKeyBERTBatch(texts, keyBertOptions);
    const batchTime = Date.now() - batchStartTime;

    const computeMetrics = this.shouldCalculateMetrics(options);

    // Process each text result
    const results: SemanticData[] = [];
    for (const [i, text] of texts.entries()) {
      let keyPhrases = keyPhrasesArray[i] ?? [];

      // FALLBACK: If KeyBERT returns empty, generate basic key phrases from text
      // This prevents "No key phrases provided" errors during storage
      if (keyPhrases.length === 0) {
        keyPhrases = this.generateFallbackKeyPhrases(text);
        this.logger.warn('KeyBERT returned empty key phrases, using fallback extraction', {
          textIndex: i,
          textLength: text.length,
          fallbackPhraseCount: keyPhrases.length
        });
      }

      // Calculate readability
      const readabilityScore = this.readabilityCalculator.calculate(text);

      // Calculate quality metrics
      const qualityMetrics = computeMetrics
        ? this.calculateQualityMetrics(keyPhrases)
        : undefined;

      results.push({
        keyPhrases,
        readabilityScore,
        extractionMethod: 'keybert',
        processingTimeMs: batchTime / texts.length, // Approximate per-text time
        qualityMetrics
      });
    }

    return results;
  }

  /**
   * Batch extraction using N-gram (for ONNX models)
   *
   * @throws When the N-gram extractor is missing, or when `embeddings` is
   *   absent or does not contain exactly one entry per text.
   */
  private async extractBatchWithNGram(texts: string[], embeddings?: Float32Array[], options?: SemanticExtractionOptions): Promise<SemanticData[]> {
    if (!this.ngramExtractor) {
      throw new Error('N-gram extractor not available for batch processing');
    }

    if (!embeddings || embeddings.length !== texts.length) {
      throw new Error('N-gram batch processing requires embeddings for each text');
    }

    const computeMetrics = this.shouldCalculateMetrics(options);
    const results: SemanticData[] = [];

    // Process each text individually (N-gram is already optimized with batching internally)
    for (const [i, text] of texts.entries()) {
      // Safe: the length check above guarantees one embedding per text.
      const embedding = embeddings[i]!;
      const textStartTime = Date.now();

      // Extract key phrases using N-gram with merged options
      const extractOptions = this.buildNGramOptions(options);
      const scoredPhrases = await this.ngramExtractor.extractKeyPhrasesWithScores(text, embedding, extractOptions);
      const keyPhrases: SemanticScore[] = scoredPhrases.map(item => ({
        text: item.phrase,
        score: item.score
      }));

      // Calculate readability
      const readabilityScore = this.readabilityCalculator.calculate(text);

      // Calculate quality metrics
      const qualityMetrics = computeMetrics
        ? this.calculateQualityMetrics(keyPhrases)
        : undefined;

      const processingTimeMs = Date.now() - textStartTime;

      results.push({
        keyPhrases,
        readabilityScore,
        extractionMethod: 'ngram',
        processingTimeMs,
        qualityMetrics
      });
    }

    return results;
  }

  /**
   * Extract key phrases using KeyBERT for GPU models or N-gram for ONNX models
   *
   * @param text Text to extract phrases from.
   * @param embeddings Embedding of the text; required for the N-gram path.
   * @param options Per-call overrides merged over the service defaults.
   * @throws When the configured method fails, or when no method is usable
   *   (no Python service, and no N-gram extractor or no embeddings).
   */
  async extractKeyPhrases(text: string, embeddings?: Float32Array, options?: SemanticExtractionOptions): Promise<SemanticScore[]> {
    // Try KeyBERT first (for GPU models)
    if (this.pythonService) {
      try {
        // Check if KeyBERT is available
        const available = await this.pythonService.isKeyBERTAvailable();
        if (!available) {
          throw new Error('KeyBERT not available in Python environment');
        }

        // Extract key phrases using KeyBERT with merged options
        const mergedOptions = { ...this.options, ...options };
        const phrases = await this.pythonService.extractKeyPhrasesKeyBERT(text, mergedOptions);

        this.logger.debug('KeyBERT extraction successful', {
          phraseCount: phrases.length,
          sample: phrases.slice(0, 3).map(p => p.text),
          multiwordRatio: this.calculateMultiwordRatio(phrases.map(p => p.text))
        });

        return phrases;
      } catch (error) {
        this.logger.error('KeyBERT extraction failed for GPU model', error instanceof Error ? error : new Error(String(error)));
        // FAIL LOUDLY - GPU models require KeyBERT
        throw new Error(
          `CRITICAL: KeyBERT extraction failed for GPU model! ${error instanceof Error ? error.message : String(error)}`
        );
      }
    }

    // Use N-gram extractor for ONNX models
    if (this.ngramExtractor && embeddings) {
      this.logger.info('Using N-gram + Cosine similarity for ONNX model');

      try {
        const extractOptions = this.buildNGramOptions(options);
        const scoredPhrases = await this.ngramExtractor.extractKeyPhrasesWithScores(
          text,
          embeddings,
          extractOptions
        );

        this.logger.debug('N-gram extraction successful', {
          phraseCount: scoredPhrases.length,
          multiwordRatio: this.calculateMultiwordRatio(scoredPhrases.map(p => p.phrase))
        });

        // Convert to SemanticScore format
        const semanticScores: SemanticScore[] = scoredPhrases.map(item => ({
          text: item.phrase,
          score: item.score
        }));
        return semanticScores;
      } catch (error) {
        this.logger.error('N-gram extraction failed', error instanceof Error ? error : new Error(String(error)));
        throw new Error(`Semantic extraction failed: ${error instanceof Error ? error.message : String(error)}`);
      }
    }

    // No extraction method available - FAIL LOUDLY
    throw new Error(
      'CRITICAL: No semantic extraction method available! ' +
      'GPU models require Python with KeyBERT, ONNX models require embeddings for N-gram extraction.'
    );
  }

  /**
   * Translate service-level options (defaults merged with per-call overrides)
   * into the option object passed to the N-gram extractor. Only values that
   * are actually set are copied across.
   */
  private buildNGramOptions(options?: SemanticExtractionOptions): Partial<NGramExtractionOptions> {
    const mergedOptions = { ...this.options, ...options };
    const extractOptions: Partial<NGramExtractionOptions> = {};

    if (mergedOptions.ngramRange) extractOptions.ngramRange = mergedOptions.ngramRange;
    if (mergedOptions.maxKeyPhrases !== undefined) extractOptions.topK = mergedOptions.maxKeyPhrases;
    if (mergedOptions.diversity !== undefined) extractOptions.diversityFactor = mergedOptions.diversity;

    // Pass structured candidates to N-gram extractor (Sprint 13)
    if (mergedOptions.structuredCandidates) {
      extractOptions.structuredCandidates = mergedOptions.structuredCandidates;
    }
    if (mergedOptions.contentZones) {
      extractOptions.contentZones = mergedOptions.contentZones;
    }

    return extractOptions;
  }

  /**
   * Decide whether quality metrics should be computed for a call: an explicit
   * per-call `calculateMetrics` wins, otherwise the service default applies.
   */
  private shouldCalculateMetrics(options?: SemanticExtractionOptions): boolean {
    return Boolean(options?.calculateMetrics ?? this.options.calculateMetrics);
  }

  /**
   * Log an extraction failure: headline first, then the error message, then
   * the stack trace when the thrown value is an Error that carries one.
   */
  private logFailure(headline: string, error: unknown): void {
    this.logger.error(headline);
    this.logger.error('Error details: ' + (error instanceof Error ? error.message : String(error)));
    if (error instanceof Error && error.stack) {
      this.logger.error('Stack trace: ' + error.stack);
    }
  }

  /**
   * Calculate multiword ratio for quality metrics
   *
   * @returns Fraction of phrases containing a space; 0 for an empty list.
   */
  private calculateMultiwordRatio(phrases: string[]): number {
    if (phrases.length === 0) return 0;

    const multiwordCount = phrases.filter(p => p.includes(' ')).length;
    return multiwordCount / phrases.length;
  }

  /**
   * Validate extraction quality
   *
   * @returns false when there are no key phrases, or when quality metrics are
   *   present and the multiword ratio is below 0.5; true otherwise.
   */
  validateExtraction(data: SemanticData): boolean {
    // Check for minimum quality standards
    if (!data.keyPhrases || data.keyPhrases.length === 0) {
      return false;
    }

    // Check multiword ratio if metrics available
    if (data.qualityMetrics) {
      if (data.qualityMetrics.multiwordRatio < 0.5) {
        this.logger.warn('Low multiword ratio in extraction', {
          ratio: data.qualityMetrics.multiwordRatio
        });
        return false;
      }
    }

    return true;
  }

  /**
   * Calculate quality metrics for extracted data
   *
   * @returns The multiword ratio and the mean number of space-separated words
   *   per phrase (1 when there are no phrases).
   */
  private calculateQualityMetrics(keyPhrases: SemanticScore[]): SemanticData['qualityMetrics'] {
    const phraseTexts = keyPhrases.map(p => p.text);
    const multiwordRatio = this.calculateMultiwordRatio(phraseTexts);

    const wordCounts = phraseTexts.map(p => p.split(' ').length);
    const averageWordsPerPhrase = wordCounts.length > 0
      ? wordCounts.reduce((a, b) => a + b, 0) / wordCounts.length
      : 1;

    return {
      multiwordRatio,
      averageWordsPerPhrase
    };
  }

  /**
   * Generate fallback key phrases when KeyBERT extraction fails or returns empty.
   * Uses simple word frequency and n-gram extraction to ensure at least minimal
   * semantic data is available for storage.
   *
   * This prevents "No key phrases provided" errors during chunk storage.
   *
   * @returns At most 5 phrases: up to 3 bigrams (scores 0.3-0.5) followed by
   *   single words (scores 0.1-0.3). Never empty.
   */
  private generateFallbackKeyPhrases(text: string): SemanticScore[] {
    // Clean and normalize text. Unicode property escapes keep letters, marks
    // and digits of every script; an ASCII-only \w would cut non-ASCII
    // letters out of words (e.g. "café" -> "caf").
    const cleanText = text
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')  // Remove punctuation
      .replace(/\s+/g, ' ')                      // Normalize whitespace
      .trim();

    // Split into words and filter
    const words = cleanText.split(' ')
      .filter(w => w.length >= 3)          // Minimum 3 chars
      .filter(w => !this.isStopWord(w));   // Remove stop words

    if (words.length === 0) {
      // Ultra-fallback: return the first meaningful chunk of text
      const firstWords = text.trim().split(/\s+/).slice(0, 3).join(' ');
      return [{
        text: firstWords || 'content',
        score: 0.1  // Low confidence score
      }];
    }

    // Calculate word frequencies
    const wordFreq = new Map<string, number>();
    for (const word of words) {
      wordFreq.set(word, (wordFreq.get(word) ?? 0) + 1);
    }

    // Get top words by frequency
    const sortedWords = Array.from(wordFreq.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 5);

    // Calculate frequencies of bigrams formed from consecutive kept words
    const bigramFreq = new Map<string, number>();
    for (let i = 0; i < words.length - 1; i++) {
      const bigram = `${words[i]} ${words[i + 1]}`;
      bigramFreq.set(bigram, (bigramFreq.get(bigram) ?? 0) + 1);
    }

    // Get top bigrams
    const sortedBigrams = Array.from(bigramFreq.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3);

    // Combine results with scores based on frequency
    const maxFreq = Math.max(...sortedWords.map(w => w[1]), 1);
    const results: SemanticScore[] = [];

    // Add bigrams first (higher value for multiword phrases)
    for (const [bigram, freq] of sortedBigrams) {
      results.push({
        text: bigram,
        score: 0.3 + (freq / maxFreq) * 0.2  // Score 0.3-0.5 for bigrams
      });
    }

    // Add single words
    for (const [word, freq] of sortedWords) {
      if (results.length >= 5) break;  // Max 5 phrases
      results.push({
        text: word,
        score: 0.1 + (freq / maxFreq) * 0.2  // Score 0.1-0.3 for single words
      });
    }

    return results.slice(0, 5);  // Ensure max 5 phrases
  }

  /**
   * Check if a word is a common stop word (should be skipped)
   * The comparison is case-insensitive.
   */
  private isStopWord(word: string): boolean {
    return STOP_WORDS.has(word.toLowerCase());
  }
}

/**
 * Factory function for creating semantic extraction service
 */
export function createSemanticExtractionService(
  pythonService: IPythonSemanticService | null,
  logger: ILoggingService,
  embeddingModel?: IEmbeddingModel | null
): ISemanticExtractionService {
  return new SemanticExtractionService(pythonService, logger, embeddingModel);
}

// Export the interface for import elsewhere
export type { ISemanticExtractionService } from './interfaces.js';

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/okets/folder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

extraction-service.ts•19.9 KiB