/**
* N-gram + Cosine Similarity Key Phrase Extractor
*
* Implements the research-backed approach for extracting key phrases
* when KeyBERT is not available (ONNX models in Node.js).
* Achieves 60-70% multiword phrase extraction vs 11% with word frequency.
*/
import { cosineSimilarity, maximalMarginalRelevance } from './similarity-utils.js';
import {
extractNGrams,
filterCandidates,
scoreTechnicalRelevance,
extractKeyPhraseCandidates
} from './ngram-utils.js';
import { ILoggingService } from '../../../di/interfaces.js';
/**
 * Interface for embedding models that can generate embeddings.
 */
export interface IEmbeddingModel {
  /** Embed a single text into a dense vector. */
  generateEmbedding(text: string): Promise<Float32Array>;
  /**
   * Optional batch method. Besides performance, its presence is used as a
   * capability probe: extractKeyPhrasesWithScores treats models WITHOUT it
   * as CPU-bound and limits the candidate count accordingly.
   */
  generateEmbeddings?(texts: string[]): Promise<Float32Array[]>;
  /** Dimensionality of the produced vectors — presumably fixed per model. */
  getModelDimensions(): number;
}
/**
 * Configuration options for N-gram extraction (Enhanced for Sprint 13)
 */
export interface NGramExtractionOptions {
  ngramRange: [number, number]; // e.g., [2, 4] for bigrams to 4-grams
  topK: number; // Number of phrases to extract
  diversityFactor: number; // MMR diversity factor (0-1)
  minSimilarity: number; // Minimum cosine similarity threshold
  /**
   * Format-specific keyword candidates extracted during parsing (Sprint 13).
   * In extractKeyPhrasesWithScores these receive descending structural
   * weights: metadata 1.0, headers 0.9, entities 0.8, emphasized 0.7.
   */
  structuredCandidates?: {
    metadata?: string[];
    headers?: string[];
    entities?: string[];
    emphasized?: string[];
    captions?: string[];
  };
  /** Content zones with importance weights for keyword scoring (Sprint 13) */
  // NOTE(review): contentZones is not read anywhere in this file — verify
  // that a consumer exists or whether this option is still planned work.
  contentZones?: Array<{
    text: string;
    type: 'title' | 'header1' | 'header2' | 'header3' | 'body' | 'caption' | 'footer';
    weight: number;
  }>;
}
/**
 * N-gram + Cosine Similarity Extractor for ONNX models
 *
 * This extractor works entirely in TypeScript without Python dependencies.
 * It extracts n-grams from text and ranks them by cosine similarity to
 * the document embedding.
 */
export class NGramCosineExtractor {
  private readonly defaultOptions: NGramExtractionOptions = {
    ngramRange: [2, 4],
    topK: 10,
    diversityFactor: 0.5,
    minSimilarity: 0.3
  };

  // Embedding cache to avoid redundant computations.
  // NOTE: eviction is insertion-order (FIFO), not true LRU — reads do not
  // refresh an entry's position. Sufficient to bound memory for this workload.
  private embeddingCache: Map<string, Float32Array> = new Map();
  private readonly MAX_CACHE_SIZE = 500; // Limit cache size to prevent memory issues
  private cacheHits = 0;
  private cacheMisses = 0;

  constructor(
    private embeddingModel: IEmbeddingModel | null,
    private logger: ILoggingService
  ) {}

  /**
   * Clear the embedding cache.
   * Should be called between documents to prevent cross-document cache pollution.
   */
  clearCache(): void {
    const previousSize = this.embeddingCache.size;
    this.embeddingCache.clear();
    this.cacheHits = 0;
    this.cacheMisses = 0;
    this.logger.debug(`[NGRAM-EXTRACT] Cache cleared (was ${previousSize} entries)`);
  }

  /**
   * Insert a phrase embedding into the cache, evicting the oldest entry
   * (Map insertion order) when the cache is at capacity. Centralizes the
   * eviction logic that was previously duplicated in both embedding paths.
   */
  private cacheEmbedding(phrase: string, embedding: Float32Array): void {
    if (this.embeddingCache.size >= this.MAX_CACHE_SIZE) {
      const oldestKey = this.embeddingCache.keys().next().value;
      if (oldestKey !== undefined) {
        this.embeddingCache.delete(oldestKey);
      }
    }
    this.embeddingCache.set(phrase, embedding);
  }

  /** Percentage (0-100) of phrases containing a space, i.e. multiword phrases. */
  private static multiwordRatio(phrases: string[]): number {
    if (phrases.length === 0) {
      return 0;
    }
    const multiwordCount = phrases.filter(p => p.includes(' ')).length;
    return (multiwordCount / phrases.length) * 100;
  }

  /**
   * Extract key phrases using n-grams and cosine similarity
   *
   * @param text The document text
   * @param docEmbedding Pre-computed document embedding (optional)
   * @param options Extraction options
   * @returns Array of extracted key phrases
   */
  async extractKeyPhrases(
    text: string,
    docEmbedding?: Float32Array,
    options: Partial<NGramExtractionOptions> = {}
  ): Promise<string[]> {
    const opts = { ...this.defaultOptions, ...options };
    this.logger.debug('[NGRAM-EXTRACT] Starting n-gram extraction', {
      textLength: text.length,
      hasEmbedding: !!docEmbedding,
      hasModel: !!this.embeddingModel
    });

    // If no embedding model available, fall back to frequency-based extraction
    if (!this.embeddingModel || !docEmbedding) {
      this.logger.warn('[NGRAM-EXTRACT] No embedding model available, using frequency-based fallback');
      return this.extractByFrequency(text, opts.topK);
    }
    // Capture as const so the narrowed (non-undefined) type flows into closures
    // below without non-null assertions.
    const documentEmbedding = docEmbedding;

    try {
      // Step 1: Extract n-grams
      const ngrams = extractNGrams(text, opts.ngramRange[0], opts.ngramRange[1]);
      this.logger.debug(`[NGRAM-EXTRACT] Extracted ${ngrams.length} n-grams`);

      // Step 2: Filter candidates for quality
      let candidates = filterCandidates(ngrams);
      this.logger.debug(`[NGRAM-EXTRACT] Filtered to ${candidates.length} quality candidates`);
      if (candidates.length === 0) {
        this.logger.warn('[NGRAM-EXTRACT] No quality candidates found, returning empty array');
        return [];
      }

      // Step 2.5: Limit candidates to reduce embedding overhead.
      // Use frequency-based pre-filtering to keep the most frequent candidates.
      if (candidates.length > 50) {
        this.logger.debug(`[NGRAM-EXTRACT] Too many candidates (${candidates.length}), reducing to top 50 by frequency`);
        // Build frequency map from ORIGINAL ngrams (before deduplication) to
        // get meaningful counts. A Set gives O(1) membership tests — the
        // previous candidates.includes() scan made this loop O(ngrams * candidates).
        const candidateSet = new Set(candidates);
        const frequencyMap = new Map<string, number>();
        for (const ngram of ngrams) {
          // Only count frequencies for candidates that passed the filter
          if (candidateSet.has(ngram)) {
            frequencyMap.set(ngram, (frequencyMap.get(ngram) || 0) + 1);
          }
        }
        // Sort candidates by their actual frequency in the original text and take top 50
        candidates = candidates
          .map(candidate => ({
            phrase: candidate,
            frequency: frequencyMap.get(candidate) || 1
          }))
          .sort((a, b) => b.frequency - a.frequency)
          .slice(0, 50)
          .map(item => item.phrase);
        this.logger.debug(`[NGRAM-EXTRACT] Reduced to ${candidates.length} most frequent candidates`);
      }

      // Step 3: Generate embeddings for each candidate
      this.logger.debug('[NGRAM-EXTRACT] Generating embeddings for candidates...');
      const candidateEmbeddings = await this.generateCandidateEmbeddings(candidates);

      // Step 4: Calculate cosine similarity with document
      const similarities = candidateEmbeddings.map(candEmb =>
        cosineSimilarity(candEmb, documentEmbedding)
      );

      // Step 5: Filter by minimum similarity threshold
      const validIndices: number[] = [];
      const validEmbeddings: Float32Array[] = [];
      const validScores: number[] = [];
      for (let i = 0; i < similarities.length; i++) {
        const similarity = similarities[i];
        if (similarity !== undefined && similarity >= opts.minSimilarity) {
          validIndices.push(i);
          validEmbeddings.push(candidateEmbeddings[i]!);
          validScores.push(similarity);
        }
      }
      this.logger.debug(`[NGRAM-EXTRACT] ${validIndices.length} candidates above similarity threshold`);
      if (validIndices.length === 0) {
        this.logger.warn('[NGRAM-EXTRACT] No candidates above similarity threshold');
        return this.extractByFrequency(text, opts.topK);
      }

      // Step 6: Apply MMR for diversity
      const selectedIndices = maximalMarginalRelevance(
        validScores,
        validEmbeddings,
        opts.diversityFactor,
        Math.min(opts.topK, validIndices.length)
      );

      // Map back to original candidates
      const selectedPhrases = selectedIndices.map(idx => {
        const originalIdx = validIndices[idx];
        return originalIdx !== undefined ? candidates[originalIdx]! : '';
      }).filter(phrase => phrase !== '');

      this.logger.info('[NGRAM-EXTRACT] Extraction complete', {
        totalPhrases: selectedPhrases.length,
        multiwordRatio: `${NGramCosineExtractor.multiwordRatio(selectedPhrases).toFixed(1)}%`,
        sample: selectedPhrases.slice(0, 3)
      });
      return selectedPhrases;
    } catch (error) {
      this.logger.error('[NGRAM-EXTRACT] Failed to extract key phrases', error instanceof Error ? error : new Error(String(error)));
      // Fall back to frequency-based extraction on error
      return this.extractByFrequency(text, opts.topK);
    }
  }

  /**
   * Generate embeddings for candidate phrases.
   * Optimized to use batch processing and caching when available.
   *
   * @throws Error when no embedding model is configured.
   */
  private async generateCandidateEmbeddings(
    candidates: string[]
  ): Promise<Float32Array[]> {
    if (!this.embeddingModel) {
      throw new Error('No embedding model available');
    }
    this.logger.debug(`[NGRAM-EXTRACT] Generating embeddings for ${candidates.length} candidates`);
    const startTime = Date.now();

    // Split candidates into cached and uncached, remembering original positions
    const cachedEmbeddings: Map<number, Float32Array> = new Map();
    const uncachedCandidates: string[] = [];
    const uncachedIndices: number[] = [];
    for (let i = 0; i < candidates.length; i++) {
      const candidate = candidates[i]!;
      const cached = this.embeddingCache.get(candidate);
      if (cached !== undefined) {
        cachedEmbeddings.set(i, cached);
        this.cacheHits++;
      } else {
        uncachedCandidates.push(candidate);
        uncachedIndices.push(i);
        this.cacheMisses++;
      }
    }
    this.logger.debug(`[NGRAM-EXTRACT] Cache stats: ${cachedEmbeddings.size} hits, ${uncachedCandidates.length} misses`);

    // Generate embeddings only for uncached candidates
    let newEmbeddings: Float32Array[] = [];
    if (uncachedCandidates.length > 0) {
      // Check if batch method is available for optimal performance
      if (this.embeddingModel.generateEmbeddings) {
        this.logger.debug('[NGRAM-EXTRACT] Using optimized batch embedding generation for uncached candidates');
        try {
          // Process all uncached candidates in a single batch
          newEmbeddings = await this.embeddingModel.generateEmbeddings(uncachedCandidates);
          if (newEmbeddings.length !== uncachedCandidates.length) {
            // Defensive: a mismatched batch would silently misalign phrases and
            // embeddings when results are reassembled below — treat as failure.
            this.logger.warn(`[NGRAM-EXTRACT] Batch returned ${newEmbeddings.length} embeddings for ${uncachedCandidates.length} candidates, falling back to individual generation`);
            newEmbeddings = [];
          } else {
            for (let i = 0; i < uncachedCandidates.length; i++) {
              this.cacheEmbedding(uncachedCandidates[i]!, newEmbeddings[i]!);
            }
          }
        } catch (error) {
          this.logger.warn('[NGRAM-EXTRACT] Batch embedding failed, falling back to individual generation', error);
          // Fall through to individual generation
          newEmbeddings = [];
        }
      }

      // Fallback: Individual generation if batch failed or not available
      if (newEmbeddings.length === 0) {
        this.logger.debug('[NGRAM-EXTRACT] Using individual embedding generation for uncached candidates');
        const batchSize = 10; // bounds concurrency: at most 10 in-flight requests
        for (let i = 0; i < uncachedCandidates.length; i += batchSize) {
          const batch = uncachedCandidates.slice(i, Math.min(i + batchSize, uncachedCandidates.length));
          const batchEmbeddings = await Promise.all(
            batch.map(async (candidate) => {
              const embedding = await this.embeddingModel!.generateEmbedding(candidate);
              this.cacheEmbedding(candidate, embedding);
              return embedding;
            })
          );
          newEmbeddings.push(...batchEmbeddings);
        }
      }
    }

    // Combine cached and newly generated embeddings in original order
    const allEmbeddings: Float32Array[] = new Array(candidates.length);
    // Place cached embeddings
    for (const [index, embedding] of cachedEmbeddings) {
      allEmbeddings[index] = embedding;
    }
    // Place newly generated embeddings
    for (let i = 0; i < uncachedIndices.length; i++) {
      const originalIndex = uncachedIndices[i]!;
      allEmbeddings[originalIndex] = newEmbeddings[i]!;
    }

    const elapsed = Date.now() - startTime;
    const cacheHitRate = candidates.length > 0
      ? ((cachedEmbeddings.size / candidates.length) * 100).toFixed(1)
      : '0';
    this.logger.info(
      `[NGRAM-EXTRACT] Generated ${candidates.length} embeddings in ${elapsed}ms ` +
      `(cache hit rate: ${cacheHitRate}%, cache size: ${this.embeddingCache.size})`
    );
    return allEmbeddings;
  }

  /**
   * Fallback: Extract key phrases by frequency when embeddings unavailable
   */
  private extractByFrequency(text: string, topK: number): string[] {
    this.logger.debug('[NGRAM-EXTRACT] Using frequency-based extraction');
    const phrases = extractKeyPhraseCandidates(text, topK);
    this.logger.debug('[NGRAM-EXTRACT] Frequency extraction complete', {
      totalPhrases: phrases.length,
      multiwordRatio: `${NGramCosineExtractor.multiwordRatio(phrases).toFixed(1)}%`
    });
    return phrases;
  }

  /**
   * Extract and score key phrases (returns phrases with scores).
   *
   * Scores blend semantic similarity (70%) with a structural weight (30%) so
   * format-derived candidates are boosted without dominating the results.
   */
  async extractKeyPhrasesWithScores(
    text: string,
    docEmbedding?: Float32Array,
    options: Partial<NGramExtractionOptions> = {}
  ): Promise<Array<{ phrase: string; score: number }>> {
    const opts = { ...this.defaultOptions, ...options };
    if (!this.embeddingModel || !docEmbedding) {
      // Frequency-based with technical relevance scoring
      const phrases = extractKeyPhraseCandidates(text, opts.topK * 2);
      return phrases.map(phrase => ({
        phrase,
        score: scoreTechnicalRelevance(phrase, text.length)
      })).sort((a, b) => b.score - a.score).slice(0, opts.topK);
    }
    // Const capture so the narrowed type flows into closures without assertions.
    const documentEmbedding = docEmbedding;

    // Start with structured candidates with high weights (Sprint 13).
    // Weights descend with decreasing trust in the source.
    const priorityCandidates: Array<{ phrase: string; weight: number }> = [];
    if (opts.structuredCandidates) {
      const weightedSources: Array<[string[] | undefined, number]> = [
        [opts.structuredCandidates.metadata, 1.0],
        [opts.structuredCandidates.headers, 0.9],
        [opts.structuredCandidates.entities, 0.8],
        [opts.structuredCandidates.emphasized, 0.7],
        // Captions are declared in NGramExtractionOptions but were previously
        // silently ignored; include them at the bottom of the weight scale.
        [opts.structuredCandidates.captions, 0.6]
      ];
      for (const [phrases, weight] of weightedSources) {
        for (const phrase of phrases ?? []) {
          priorityCandidates.push({ phrase, weight });
        }
      }
    }

    // Extract n-grams and calculate similarities
    const ngrams = extractNGrams(text, opts.ngramRange[0], opts.ngramRange[1]);
    const ngramCandidates = filterCandidates(ngrams);

    // Combine structured candidates with n-gram candidates, dropping n-grams
    // already contained in a structured phrase. Lowercasing is hoisted out of
    // the filter loop so it runs once per phrase, not once per pair.
    const structuredTexts = priorityCandidates.map(c => c.phrase);
    const structuredLower = structuredTexts.map(s => s.toLowerCase());
    const filteredNgrams = ngramCandidates.filter(ngram => {
      const lower = ngram.toLowerCase();
      return !structuredLower.some(structured => structured.includes(lower));
    });
    const candidates = [...structuredTexts, ...filteredNgrams];
    if (candidates.length === 0) {
      return [];
    }

    // PERFORMANCE OPTIMIZATION: Limit candidates for CPU models.
    // For CPU models, generating embeddings is expensive, so we limit candidates.
    // CPU models are detected by checking if batch embedding is NOT available.
    const isCpuModel = !this.embeddingModel.generateEmbeddings;
    const maxCandidates = isCpuModel ? 15 : 50; // CPU: 15 candidates, GPU: 50 candidates
    const limitedCandidates = candidates.length > maxCandidates
      ? candidates.slice(0, maxCandidates)
      : candidates;
    if (isCpuModel && candidates.length > maxCandidates) {
      this.logger.debug(`[NGRAM-CPU-OPT] Limited candidates from ${candidates.length} to ${maxCandidates} for CPU performance`);
    }

    const candidateEmbeddings = await this.generateCandidateEmbeddings(limitedCandidates);
    const similarities = candidateEmbeddings.map(candEmb =>
      cosineSimilarity(candEmb, documentEmbedding)
    );

    // Structural weight lookup. First occurrence wins — this matches the
    // previous priorityCandidates.find() semantics while replacing the
    // per-candidate linear scan (O(n^2)) with an O(1) Map lookup.
    const weightByPhrase = new Map<string, number>();
    for (const { phrase, weight } of priorityCandidates) {
      if (!weightByPhrase.has(phrase)) {
        weightByPhrase.set(phrase, weight);
      }
    }

    // Combine with scores and apply structural weights (Sprint 13)
    const phrasesWithScores: Array<{ phrase: string; score: number }> = [];
    for (let i = 0; i < limitedCandidates.length; i++) {
      const phrase = limitedCandidates[i]!;
      const cosineSim = similarities[i];
      if (cosineSim !== undefined) {
        const structuralWeight = weightByPhrase.get(phrase) ?? 0.4; // Default weight for plain n-grams
        // Balanced scoring: avoid 100% formatting-based extraction.
        // 30% structural weight, 70% semantic similarity.
        const finalScore = structuralWeight * 0.3 + cosineSim * 0.7;
        phrasesWithScores.push({
          phrase,
          score: finalScore
        });
      }
    }

    return phrasesWithScores
      .filter(item => item.score >= opts.minSimilarity)
      .sort((a, b) => b.score - a.score)
      .slice(0, opts.topK);
  }
}
/**
 * Factory helper for building an NGramCosineExtractor.
 *
 * @param embeddingModel Embedding backend, or null to force the frequency-based fallback
 * @param logger Logging service used for diagnostics
 * @returns A newly constructed extractor
 */
export function createNGramCosineExtractor(
  embeddingModel: IEmbeddingModel | null,
  logger: ILoggingService
): NGramCosineExtractor {
  const extractor = new NGramCosineExtractor(embeddingModel, logger);
  return extractor;
}