folder-mcp

Overview Schema Related Servers Score Discussions

ngram-utils.ts•6.98 KiB

/** * N-gram Extraction Utilities * * Provides functions to extract n-grams from text for semantic analysis. * Focuses on extracting meaningful multi-word phrases (2-4 words). */ /** * Common English stop words to filter out */ const STOP_WORDS = new Set([ 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'by', 'for', 'from', 'has', 'had', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'will', 'with', 'the', 'this', 'these', 'those', 'have', 'has', 'had', 'do', 'does', 'did', 'can', 'could', 'should', 'would', 'may', 'might', 'must', 'shall', 'will', 'would', 'am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'very', 'also', 'just', 'only' ]); /** * Token representing a word with its properties */ interface Token { word: string; // Lowercase version for matching originalWord: string; // Original casing preserved isStopWord: boolean; isNumber: boolean; isPunctuation: boolean; } /** * Tokenize text into words */ function tokenize(text: string): Token[] { // Split on whitespace and punctuation, keeping some punctuation as separate tokens const originalWords = text .replace(/([.!?,;:])/g, ' $1 ') // Add spaces around punctuation .split(/\s+/) .filter(w => w.length > 0); return originalWords.map(originalWord => { const word = originalWord.toLowerCase(); return { word, originalWord, // Preserve original casing isStopWord: STOP_WORDS.has(word), isNumber: /^\d+$/.test(word), isPunctuation: /^[.!?,;:]+$/.test(word) }; }); } /** * Extract n-grams from text * * @param text Input text * @param minN Minimum n-gram size (default: 2 for bigrams) * @param maxN Maximum n-gram size (default: 4) * @returns Array of n-gram strings */ export function extractNGrams( text: string, minN: number = 2, maxN: number = 4 ): string[] { const tokens = tokenize(text); const ngrams = new Set<string>(); const ngramsLower = new Set<string>(); // Track lowercase versions for O(1) deduplication // For each n from minN to maxN for (let n = minN; n <= maxN; n++) { // Slide window of size n over tokens for (let i = 0; i <= tokens.length - n; i++) { const ngramTokens = tokens.slice(i, i + n); // Skip if any token is punctuation if (ngramTokens.some(t => t.isPunctuation)) { continue; } // Skip if all tokens are stop words or numbers if (ngramTokens.every(t => t.isStopWord || t.isNumber)) { continue; } // Skip if starts or ends with stop word (for better phrase quality) if (n > 1) { if (ngramTokens[0]?.isStopWord || ngramTokens[n - 1]?.isStopWord) { continue; } } // Create n-gram using original casing for better technical term detection const ngram = ngramTokens.map(t => t.originalWord).join(' '); // But also store the lowercase version for deduplication const ngramLower = ngramTokens.map(t => t.word).join(' '); // O(1) lookup instead of O(n) - massive performance improvement if (!ngramsLower.has(ngramLower)) { ngrams.add(ngram); ngramsLower.add(ngramLower); } } } return Array.from(ngrams); } /** * Filter n-gram candidates based on quality criteria * * @param ngrams List of n-gram strings * @returns Filtered list of high-quality n-grams */ export function filterCandidates(ngrams: string[]): string[] { return ngrams.filter(ngram => { const words = ngram.split(' '); // Filter out very short n-grams if (ngram.length < 5) return false; // Filter out n-grams that are just numbers if (/^\d+(\s+\d+)*$/.test(ngram)) return false; // Prefer n-grams with 2-3 words for technical content if (words.length < 2 || words.length > 4) return false; // Filter out n-grams with too many short words const shortWords = words.filter(w => w.length <= 2).length; if (shortWords > words.length / 2) return false; return true; }); } /** * Count n-gram frequency in text * * @param ngrams List of n-grams * @returns Map of n-gram to frequency count */ export function countNGramFrequency(ngrams: string[]): Map<string, number> { const frequency = new Map<string, number>(); for (const ngram of ngrams) { frequency.set(ngram, (frequency.get(ngram) || 0) + 1); } return frequency; } /** * Extract candidate key phrases using n-grams * This is a simpler alternative when embeddings aren't available * * @param text Input text * @param topK Number of top phrases to return * @returns Array of key phrase strings */ export function extractKeyPhraseCandidates( text: string, topK: number = 20 ): string[] { // Extract n-grams const ngrams = extractNGrams(text, 2, 4); // Filter for quality const candidates = filterCandidates(ngrams); // Count frequency const frequency = countNGramFrequency(candidates); // Sort by frequency and return top K return Array.from(frequency.entries()) .sort((a, b) => b[1] - a[1]) .slice(0, topK) .map(([ngram]) => ngram); } /** * Check if an n-gram is likely a technical term * * @param ngram The n-gram to check * @returns True if likely technical */ export function isTechnicalTerm(ngram: string): boolean { const technicalPatterns = [ /\b(api|sdk|cli|gui|cpu|gpu|ram|sql|css|html|xml|json|yaml)\b/i, /\b(server|client|database|cache|queue|stream|buffer|thread)\b/i, /\b(function|method|class|interface|module|package|library)\b/i, /\b(algorithm|model|neural|network|machine|learning|deep)\b/i, /\b(embed|vector|transform|encoder|decoder|attention)\b/i, /\b(docker|kubernetes|container|orchestrat|deploy|pipeline)\b/i, /\b(test|debug|profile|optimize|refactor|implement)\b/i ]; return technicalPatterns.some(pattern => pattern.test(ngram)); } /** * Score n-grams based on technical relevance * Higher scores indicate more likely technical key phrases * * @param ngram The n-gram to score * @param documentLength Total document length for normalization * @returns Score between 0 and 1 */ export function scoreTechnicalRelevance( ngram: string, documentLength: number ): number { let score = 0; const words = ngram.split(' '); const wordCount = words.length; // Prefer multi-word phrases (2-3 words ideal) if (wordCount === 2) score += 0.3; else if (wordCount === 3) score += 0.25; else if (wordCount === 4) score += 0.15; // Boost technical terms if (isTechnicalTerm(ngram)) score += 0.3; // Boost phrases with capital letters (likely proper nouns, acronyms) // Now this will actually work since we preserve original casing const hasCapital = words.some(w => /[A-Z]/.test(w)); if (hasCapital) score += 0.1; // Boost phrases with numbers (versions, metrics) const hasNumber = words.some(w => /\d/.test(w)); if (hasNumber) score += 0.05; // Penalize very common patterns const commonPatterns = /^(this is|that is|it is|there are|there is)/i; if (commonPatterns.test(ngram)) score -= 0.2; // Normalize score to 0-1 range return Math.max(0, Math.min(1, score)); }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/okets/folder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

ngram-utils.ts•6.98 KiB