DollhouseMCP

by DollhouseMCP
NLPScoringManager.ts (14.3 kB)
/**
 * NLP Scoring Manager - Jaccard similarity and Shannon entropy for semantic analysis
 *
 * Implements intelligent document similarity scoring using:
 * - Jaccard similarity for vocabulary overlap
 * - Shannon entropy for information density
 * - Combined scoring for meaningful semantic relationships
 *
 * Key insights from analysis:
 * - High Jaccard (>60%) + Moderate entropy (4.5-6.0) = Same technical domain
 * - High Jaccard + Low entropy (<3.0) = Stop word pollution, superficial
 * - Low Jaccard + Similar entropy = Different domains, equally complex
 *
 * Part of Enhanced Capability Index (#1085)
 */

import { logger } from '../utils/logger.js';
import { UnicodeValidator } from '../security/validators/unicodeValidator.js';
import { IndexConfigManager } from './config/IndexConfig.js';

/**
 * Scoring result with detailed metrics
 */
export interface ScoringResult {
  jaccard: number;          // 0.0 to 1.0
  entropy: number;          // Typically 0 to 8 bits for text
  combinedScore: number;    // 0.0 to 1.0
  interpretation: string;   // Human-readable explanation
  tokenCount: number;       // Number of unique tokens
  overlapCount: number;     // Number of overlapping tokens
}

/**
 * Pairwise similarity between two elements
 */
export interface PairwiseSimilarity {
  element1: string;
  element2: string;
  similarity: ScoringResult;
  timestamp: string;
}

/**
 * Configuration for scoring algorithm
 */
export interface ScoringConfig {
  minTokenLength: number;
  cacheExpiry: number;      // milliseconds
  maxCacheSize: number;     // Maximum cache entries for LRU eviction
  entropyBands: {
    low: number;            // < 3.0 typically (high repetition/common words)
    moderate: number;       // 3.0 - 6.0 typically (balanced vocabulary)
    high: number;           // > 6.0 typically (diverse specialized terms)
  };
  jaccardThresholds: {
    low: number;            // < 0.2
    moderate: number;       // 0.2 - 0.6
    high: number;           // > 0.6
  };
}

export class NLPScoringManager {
  private cache: Map<string, { result: ScoringResult; timestamp: number; lastAccessed: number }>;
  private cacheAccessOrder: string[]; // Track access order for LRU
  private config: ScoringConfig;
  private unicodeValidator: UnicodeValidator;

  constructor(config?: Partial<ScoringConfig>) {
    // Get config from central manager
    const indexConfig = IndexConfigManager.getInstance().getConfig();

    this.config = {
      minTokenLength: indexConfig.nlp.minTokenLength,
      cacheExpiry: indexConfig.nlp.cacheExpiryMinutes * 60 * 1000,
      maxCacheSize: indexConfig.memory.maxCacheSize,
      entropyBands: indexConfig.nlp.entropyBands,
      jaccardThresholds: indexConfig.nlp.jaccardThresholds,
      ...config
    };

    this.cache = new Map();
    this.cacheAccessOrder = [];
    this.unicodeValidator = new UnicodeValidator();

    // Periodic cleanup of expired entries
    setInterval(() => this.cleanExpiredCache(), 60000); // Every minute
  }

  /**
   * Clean and tokenize text for analysis
   * Works with any language - no hardcoded stop words
   */
  private cleanAndTokenize(text: string): Set<string> {
    // Normalize Unicode for security
    const validation = UnicodeValidator.normalize(text);
    if (validation.detectedIssues && validation.detectedIssues.length > 0) {
      logger.warn('Unicode issues in NLP text', { issues: validation.detectedIssues });
    }
    text = validation.normalizedContent;

    // Convert to lowercase and split on word boundaries
    // Keep Unicode letter characters for multilingual support
    const tokens = text.toLowerCase()
      .replaceAll(/[^\p{L}\p{N}\s_-]/gu, ' ') // Unicode-aware: keep letters, numbers, underscore, hyphen
      .split(/\s+/)
      .filter(token => token.length >= this.config.minTokenLength);

    return new Set(tokens);
  }

  /**
   * Calculate Jaccard similarity between two text strings
   *
   * Jaccard = |A ∩ B| / |A ∪ B|
   *
   * Returns value between 0 (no overlap) and 1 (identical)
   */
  public calculateJaccard(text1: string, text2: string): number {
    const tokens1 = this.cleanAndTokenize(text1);
    const tokens2 = this.cleanAndTokenize(text2);

    if (tokens1.size === 0 && tokens2.size === 0) {
      return 1.0; // Both empty = identical
    }
    if (tokens1.size === 0 || tokens2.size === 0) {
      return 0.0; // One empty = no similarity
    }

    // Calculate intersection
    const intersection = new Set([...tokens1].filter(token => tokens2.has(token)));

    // Calculate union
    const union = new Set([...tokens1, ...tokens2]);

    return intersection.size / union.size;
  }

  /**
   * Calculate Shannon entropy for text
   *
   * H(X) = -Σ p(x) * log2(p(x))
   *
   * Measures information density/vocabulary richness
   * Higher entropy = more diverse vocabulary
   */
  public calculateEntropy(text: string): number {
    const tokens = Array.from(this.cleanAndTokenize(text));

    if (tokens.length === 0) {
      return 0;
    }

    // Calculate token frequencies
    const frequencies = new Map<string, number>();
    for (const token of tokens) {
      frequencies.set(token, (frequencies.get(token) || 0) + 1);
    }

    // Calculate probabilities and entropy
    let entropy = 0;
    const totalTokens = tokens.length;

    for (const count of frequencies.values()) {
      const probability = count / totalTokens;
      if (probability > 0) {
        entropy -= probability * Math.log2(probability);
      }
    }

    return entropy;
  }

  /**
   * Calculate combined relevance score using Jaccard and entropy
   *
   * Interprets the relationship between similarity and complexity
   */
  public scoreRelevance(text1: string, text2: string): ScoringResult {
    // Check cache first
    const cacheKey = `${text1.substring(0, 50)}:::${text2.substring(0, 50)}`;
    const cached = this.cache.get(cacheKey);
    if (cached && Date.now() - cached.timestamp < this.config.cacheExpiry) {
      // Update access order for LRU
      cached.lastAccessed = Date.now();
      this.updateAccessOrder(cacheKey);
      return cached.result;
    }

    // Calculate metrics
    const jaccard = this.calculateJaccard(text1, text2);
    const entropy1 = this.calculateEntropy(text1);
    const entropy2 = this.calculateEntropy(text2);
    const avgEntropy = (entropy1 + entropy2) / 2;

    // Get token sets for additional metrics
    const tokens1 = this.cleanAndTokenize(text1);
    const tokens2 = this.cleanAndTokenize(text2);
    const intersection = new Set([...tokens1].filter(token => tokens2.has(token)));

    // Interpret the combination
    let combinedScore: number;
    let interpretation: string;

    // High Jaccard + Moderate-High entropy = Excellent match (same domain)
    if (jaccard >= this.config.jaccardThresholds.high && avgEntropy >= this.config.entropyBands.low) {
      // Scale score based on entropy quality
      const entropyQuality = Math.min(1.0, (avgEntropy - 2.0) / 4.0); // 0 at entropy=2, 1 at entropy=6+
      combinedScore = 0.7 + (jaccard - 0.6) * 0.5 + entropyQuality * 0.2;

      if (avgEntropy >= this.config.entropyBands.moderate) {
        interpretation = 'Excellent match - same technical domain with rich vocabulary';
      } else {
        interpretation = 'Good match - high overlap but simpler vocabulary';
      }
    }
    // High Jaccard + Very Low entropy = Common word pollution
    else if (jaccard >= this.config.jaccardThresholds.high && avgEntropy < 2.0) {
      combinedScore = 0.3 + jaccard * 0.2; // Penalize heavily
      interpretation = 'Superficial similarity - mostly common words';
    }
    // Moderate Jaccard + Good entropy = Related concepts
    else if (jaccard >= this.config.jaccardThresholds.moderate && avgEntropy >= this.config.entropyBands.low) {
      combinedScore = 0.4 + jaccard * 0.4 + (avgEntropy / 10) * 0.2;
      interpretation = 'Moderate match - related concepts with good complexity';
    }
    // Low Jaccard + Similar entropy = Different domains
    else if (jaccard < this.config.jaccardThresholds.low && Math.abs(entropy1 - entropy2) < 1.0) {
      combinedScore = jaccard * 0.5;
      interpretation = 'Different domains with similar complexity';
    }
    // Low Jaccard + Different entropy = Unrelated
    else {
      combinedScore = jaccard * 0.3;
      interpretation = 'Low relevance - different topics and complexity';
    }

    // Ensure score is between 0 and 1
    combinedScore = Math.max(0, Math.min(1, combinedScore));

    const result: ScoringResult = {
      jaccard,
      entropy: avgEntropy,
      combinedScore,
      interpretation,
      tokenCount: tokens1.size + tokens2.size,
      overlapCount: intersection.size
    };

    // Cache the result with LRU management
    this.addToCache(cacheKey, result);

    // Log scoring event for monitoring
    logger.debug('NLP scoring completed', {
      jaccard: result.jaccard.toFixed(3),
      entropy: result.entropy.toFixed(3),
      score: result.combinedScore.toFixed(3),
      interpretation: result.interpretation
    });

    return result;
  }

  /**
   * Build a pairwise similarity matrix for multiple texts
   *
   * Useful for clustering and relationship discovery
   */
  public buildSimilarityMatrix(
    elements: Map<string, string>
  ): Map<string, Map<string, ScoringResult>> {
    const matrix = new Map<string, Map<string, ScoringResult>>();
    const keys = Array.from(elements.keys());

    for (let i = 0; i < keys.length; i++) {
      const key1 = keys[i];
      const text1 = elements.get(key1)!;

      if (!matrix.has(key1)) {
        matrix.set(key1, new Map());
      }

      for (let j = i + 1; j < keys.length; j++) {
        const key2 = keys[j];
        const text2 = elements.get(key2)!;

        // Calculate similarity
        const similarity = this.scoreRelevance(text1, text2);

        // Store bidirectionally
        matrix.get(key1)!.set(key2, similarity);

        if (!matrix.has(key2)) {
          matrix.set(key2, new Map());
        }
        matrix.get(key2)!.set(key1, similarity);
      }
    }

    return matrix;
  }

  /**
   * Find most similar elements to a given text
   */
  public findSimilar(
    targetText: string,
    candidates: Map<string, string>,
    topK: number = 5
  ): Array<{ name: string; score: ScoringResult }> {
    const scores: Array<{ name: string; score: ScoringResult }> = [];

    for (const [name, text] of candidates.entries()) {
      const score = this.scoreRelevance(targetText, text);
      scores.push({ name, score });
    }

    // Sort by combined score descending
    scores.sort((a, b) => b.score.combinedScore - a.score.combinedScore);

    return scores.slice(0, topK);
  }

  /**
   * Extract key terms from text based on entropy contribution
   *
   * Terms that contribute most to entropy are likely important
   */
  public extractKeyTerms(text: string, topK: number = 10): string[] {
    const tokens = Array.from(this.cleanAndTokenize(text));

    if (tokens.length === 0) {
      return [];
    }

    // Calculate token frequencies
    const frequencies = new Map<string, number>();
    for (const token of tokens) {
      frequencies.set(token, (frequencies.get(token) || 0) + 1);
    }

    // Calculate entropy contribution for each unique token
    const totalTokens = tokens.length;
    const contributions: Array<{ token: string; contribution: number }> = [];

    for (const [token, count] of frequencies.entries()) {
      const probability = count / totalTokens;
      const contribution = -probability * Math.log2(probability);
      contributions.push({ token, contribution });
    }

    // Sort by contribution descending
    contributions.sort((a, b) => b.contribution - a.contribution);

    return contributions.slice(0, topK).map(c => c.token);
  }

  /**
   * Add result to cache with LRU eviction
   */
  private addToCache(key: string, result: ScoringResult): void {
    const now = Date.now();

    // Check if we need to evict
    if (this.cache.size >= this.config.maxCacheSize && !this.cache.has(key)) {
      // Find least recently used entry
      let lruKey: string | null = null;
      let oldestAccess = now;

      for (const [k, v] of this.cache.entries()) {
        if (v.lastAccessed < oldestAccess) {
          oldestAccess = v.lastAccessed;
          lruKey = k;
        }
      }

      if (lruKey) {
        this.cache.delete(lruKey);
        const idx = this.cacheAccessOrder.indexOf(lruKey);
        if (idx > -1) {
          this.cacheAccessOrder.splice(idx, 1);
        }
        logger.debug('Evicted LRU cache entry', { key: lruKey, cacheSize: this.cache.size });
      }
    }

    // Add new entry
    this.cache.set(key, { result, timestamp: now, lastAccessed: now });
    this.updateAccessOrder(key);
  }

  /**
   * Update access order for LRU tracking
   */
  private updateAccessOrder(key: string): void {
    const idx = this.cacheAccessOrder.indexOf(key);
    if (idx > -1) {
      this.cacheAccessOrder.splice(idx, 1);
    }
    this.cacheAccessOrder.push(key);
  }

  /**
   * Clean expired cache entries
   */
  private cleanExpiredCache(): void {
    const now = Date.now();
    let removed = 0;

    for (const [key, value] of this.cache.entries()) {
      if (now - value.timestamp > this.config.cacheExpiry) {
        this.cache.delete(key);
        const idx = this.cacheAccessOrder.indexOf(key);
        if (idx > -1) {
          this.cacheAccessOrder.splice(idx, 1);
        }
        removed++;
      }
    }

    if (removed > 0) {
      logger.debug('Cleaned expired cache entries', { removed, remaining: this.cache.size });
    }
  }

  /**
   * Clear the cache
   */
  public clearCache(): void {
    this.cache.clear();
    this.cacheAccessOrder = [];
    logger.debug('NLP scoring cache cleared');
  }

  /**
   * Get cache statistics
   */
  public getCacheStats(): { size: number; oldestEntry: number | null } {
    let oldest: number | null = null;

    for (const entry of this.cache.values()) {
      if (oldest === null || entry.timestamp < oldest) {
        oldest = entry.timestamp;
      }
    }

    return { size: this.cache.size, oldestEntry: oldest };
  }
}
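The following is a minimal usage sketch, not code from the repository. It assumes the class is used from inside the DollhouseMCP codebase (so IndexConfigManager and the logger resolve); the import path, element names, and sample strings are illustrative assumptions.

// Usage sketch (illustrative only; import path and sample texts are assumptions).
import { NLPScoringManager } from './NLPScoringManager.js';

const scorer = new NLPScoringManager();

// Score two descriptions against each other.
const result = scorer.scoreRelevance(
  'Automates git branching, merging, and pull request creation',
  'Helper for pull request and branch automation in git workflows'
);
console.log(result.combinedScore.toFixed(3), result.interpretation);

// Rank a small candidate set against a query text.
const candidates = new Map<string, string>([
  ['git-helper', 'Branching, merging, and pull request automation'],
  ['debug-detective', 'Systematic root cause analysis for software defects']
]);
for (const { name, score } of scorer.findSimilar('automate pull requests', candidates, 2)) {
  console.log(name, score.combinedScore.toFixed(3));
}

Two behaviors worth noting when using it this way: the constructor registers a setInterval for cache expiry, which keeps a short-lived script's event loop alive unless the process exits explicitly, and the cache key is built from the ordered pair of 50-character prefixes, so scoreRelevance(a, b) and scoreRelevance(b, a) are cached as separate entries.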
