Skip to main content
Glama
enum-detector.js19.5 kB
/**
 * Enum Detection Module - Port of DataFlood's EnumDetector.cs
 * Detects whether a set of values should be treated as an enum
 * based on statistical analysis and heuristics
 */
class EnumDetector {
  /**
   * @param {Object} [config] - Optional overrides for the detector settings.
   * @param {number} [config.largeDatasetSampleSize=10000] - Maximum number of values analysed when sampling is enabled.
   * @param {boolean} [config.enableSampling=true] - Sample inputs longer than `largeDatasetSampleSize`.
   * @param {boolean} [config.useStandardSetMatching=true] - Compare values against the built-in standard sets.
   * @param {boolean} [config.useSemanticAnalysis=true] - Compute the prefix/suffix coherence score.
   * @param {boolean} [config.usePowerLawAnalysis=true] - Compute the power-law exponent.
   * @param {string} [config.minimumConfidence='medium'] - Stored on `this.config`; not read by this class.
   */
  constructor(config = {}) {
    this.config = {
      largeDatasetSampleSize: config.largeDatasetSampleSize || 10000,
      enableSampling: config.enableSampling !== false,
      useStandardSetMatching: config.useStandardSetMatching !== false,
      useSemanticAnalysis: config.useSemanticAnalysis !== false,
      usePowerLawAnalysis: config.usePowerLawAnalysis !== false,
      minimumConfidence: config.minimumConfidence || 'medium'
    };

    this.standardSets = this.initializeStandardSets();
  }

  /**
   * Main detection method
   * @param {Array} values - Array of values to analyze
   * @param {string} fieldName - Name of the field (for context)
   * @returns {Object} EnumDetectionResult (see createResult for the shape)
   */
  detectEnum(values, fieldName = '') {
    if (!values || values.length === 0) {
      return this.createResult(false, [], 'veryLow', 'No values to analyze', 'notEnum');
    }

    // Sample large datasets if needed
    const sampledValues = this.sampleDataset(values);

    // Calculate metrics
    const metrics = this.calculateMetrics(sampledValues);

    // Apply detection rules based on dataset size
    return this.applyDetectionRules(metrics, fieldName);
  }

  /**
   * Sample dataset for performance with large datasets.
   * Returns the input array itself when sampling is disabled or the input
   * is not longer than the configured sample size; otherwise returns a new
   * array of exactly `largeDatasetSampleSize` elements.
   * @param {Array} values - Values to sample from
   * @returns {Array} The original array or a random sample of it
   */
  sampleDataset(values) {
    if (!this.config.enableSampling || values.length <= this.config.largeDatasetSampleSize) {
      return values;
    }

    // Reservoir sampling for uniform random sample: start with the first
    // `sampleSize` items, then let each later item replace a random slot
    // with probability sampleSize / (i + 1).
    const sampleSize = this.config.largeDatasetSampleSize;
    const sampled = values.slice(0, sampleSize);

    for (let i = sampleSize; i < values.length; i++) {
      const j = Math.floor(Math.random() * (i + 1));
      if (j < sampleSize) {
        sampled[j] = values[i];
      }
    }

    return sampled;
  }

  /**
   * Calculate statistical metrics for enum detection.
   * Every value is converted with String() before being counted or measured.
   * @param {Array} values - Values to analyze (may be empty)
   * @returns {Object} Metrics object; for an empty input all numeric fields are 0
   */
  calculateMetrics(values) {
    const metrics = {
      totalCount: values.length,
      uniqueCount: 0,
      uniquenessRatio: 0,
      frequencyDistribution: {},
      averageLength: 0,
      maxLength: 0,
      minLength: Number.MAX_VALUE,
      lengthVariance: 0,
      entropyScore: 0,
      powerLawExponent: 0,
      concentrationRatio: 0,
      semanticCoherence: 0,
      hasCommonPatterns: {
        isStandardSet: false,
        isCodePattern: false,
        isNaturalLanguage: false,
        hasStructuredFormat: false
      }
    };

    // Nothing to measure: return zeroed metrics instead of NaN ratios and a
    // MAX_VALUE minimum length.
    if (metrics.totalCount === 0) {
      metrics.minLength = 0;
      return metrics;
    }

    // Build frequency distribution. Counting happens in a Map so that values
    // such as "constructor", "toString" or "__proto__" cannot collide with
    // members inherited from Object.prototype.
    const counts = new Map();
    let totalLength = 0;
    const lengths = [];

    for (const value of values) {
      const strValue = String(value);
      counts.set(strValue, (counts.get(strValue) || 0) + 1);

      const length = strValue.length;
      totalLength += length;
      lengths.push(length);
      metrics.maxLength = Math.max(metrics.maxLength, length);
      metrics.minLength = Math.min(metrics.minLength, length);
    }

    // Object.fromEntries defines own data properties, so every key (even
    // "__proto__") ends up as a regular enumerable entry.
    metrics.frequencyDistribution = Object.fromEntries(counts);
    const uniqueValues = new Set(counts.keys());

    metrics.uniqueCount = uniqueValues.size;
    metrics.uniquenessRatio = metrics.uniqueCount / metrics.totalCount;
    metrics.averageLength = totalLength / metrics.totalCount;

    // Calculate length variance. Note: the stored value is the square root
    // of the variance (i.e. the standard deviation of the string lengths).
    const avgLength = metrics.averageLength;
    const variance = lengths.reduce((sum, len) => sum + Math.pow(len - avgLength, 2), 0) / lengths.length;
    metrics.lengthVariance = Math.sqrt(variance);

    // Calculate entropy (Shannon's entropy)
    metrics.entropyScore = this.calculateEntropy(metrics.frequencyDistribution, metrics.totalCount);

    // Calculate power law exponent (simplified)
    if (this.config.usePowerLawAnalysis) {
      metrics.powerLawExponent = this.calculatePowerLawExponent(metrics.frequencyDistribution);
    }

    // Calculate concentration ratio
    metrics.concentrationRatio = this.calculateConcentrationRatio(metrics.frequencyDistribution);

    // Check for common patterns
    metrics.hasCommonPatterns = this.detectPatterns(uniqueValues);

    // Calculate semantic coherence (simplified)
    if (this.config.useSemanticAnalysis) {
      metrics.semanticCoherence = this.calculateSemanticCoherence(uniqueValues);
    }

    return metrics;
  }

  /**
   * Calculate Shannon's entropy (in bits) of a frequency distribution.
   * @param {Object} distribution - Map of value -> occurrence count
   * @param {number} total - Sum of all occurrence counts
   * @returns {number} Entropy; 0 for an empty distribution
   */
  calculateEntropy(distribution, total) {
    let entropy = 0;

    for (const count of Object.values(distribution)) {
      if (count > 0) {
        const probability = count / total;
        entropy -= probability * Math.log2(probability);
      }
    }

    return entropy;
  }

  /**
   * Calculate power law exponent (simplified Zipf's law check).
   * Fits a least-squares line to log(rank) vs. log(frequency) over at most
   * the 100 most frequent values and returns the absolute slope.
   * @param {Object} distribution - Map of value -> occurrence count
   * @returns {number} Absolute slope, or 0 when fewer than 10 distinct values exist
   */
  calculatePowerLawExponent(distribution) {
    const frequencies = Object.values(distribution).sort((a, b) => b - a);

    if (frequencies.length < 10) {
      return 0;
    }

    // Simple linear regression in log-log space
    let sumX = 0;
    let sumY = 0;
    let sumXY = 0;
    let sumX2 = 0;
    const n = Math.min(frequencies.length, 100); // Limit for performance

    for (let i = 0; i < n; i++) {
      const x = Math.log(i + 1);
      const y = Math.log(frequencies[i]);
      sumX += x;
      sumY += y;
      sumXY += x * y;
      sumX2 += x * x;
    }

    const slope = (n * sumXY - sumX * sumY) / (n * sumX2 - sumX * sumX);
    return Math.abs(slope);
  }

  /**
   * Calculate concentration ratio (how concentrated the values are).
   * Returns the fraction of distinct values (taken from most to least
   * frequent) needed to cover 80% of all occurrences; lower means more
   * concentrated.
   * @param {Object} distribution - Map of value -> occurrence count
   * @returns {number} Ratio in (0, 1], or 0 for an empty distribution
   */
  calculateConcentrationRatio(distribution) {
    const frequencies = Object.values(distribution).sort((a, b) => b - a);
    const total = frequencies.reduce((sum, freq) => sum + freq, 0);

    if (frequencies.length === 0) return 0;

    // Calculate what percentage of unique values account for 80% of occurrences
    let cumulativeFreq = 0;
    let topCount = 0;

    for (const freq of frequencies) {
      cumulativeFreq += freq;
      topCount++;
      if (cumulativeFreq >= total * 0.8) {
        break;
      }
    }

    return topCount / frequencies.length;
  }

  /**
   * Detect common patterns in values
   * @param {Iterable<string>} uniqueValues - Distinct string values
   * @returns {{isStandardSet: boolean, isCodePattern: boolean, isNaturalLanguage: boolean, hasStructuredFormat: boolean}}
   */
  detectPatterns(uniqueValues) {
    const patterns = {
      isStandardSet: false,
      isCodePattern: false,
      isNaturalLanguage: false,
      hasStructuredFormat: false
    };

    const valuesArray = Array.from(uniqueValues);

    // Check for standard sets
    if (this.config.useStandardSetMatching) {
      patterns.isStandardSet = this.isStandardSet(valuesArray);
    }

    // Check for code patterns (e.g., STATUS_OK, ERROR_404)
    patterns.isCodePattern = this.hasCodePattern(valuesArray);

    // Check for natural language (simple check)
    patterns.isNaturalLanguage = this.hasNaturalLanguagePattern(valuesArray);

    // Check for structured format (e.g., all same format)
    patterns.hasStructuredFormat = this.hasStructuredFormat(valuesArray);

    return patterns;
  }

  /**
   * Check if values match a standard set (case-insensitive).
   * @param {string[]} values - Distinct string values
   * @returns {boolean} True when more than 50% of the values appear in any one standard set
   */
  isStandardSet(values) {
    // Lowercase the inputs once instead of once per standard-set entry.
    const loweredValues = values.map(v => v.toLowerCase());

    for (const standardValues of Object.values(this.standardSets)) {
      const standardLookup = new Set(standardValues.map(sv => sv.toLowerCase()));
      const overlapCount = loweredValues.filter(v => standardLookup.has(v)).length;

      // If > 50% overlap with a standard set, consider it a match
      if (overlapCount / values.length > 0.5) {
        return true;
      }
    }

    return false;
  }

  /**
   * Check for code-like patterns
   * @param {string[]} values - Distinct string values
   * @returns {boolean} True when more than 70% of the values match one of the patterns
   */
  hasCodePattern(values) {
    const codePatterns = [
      /^[A-Z_]+$/, // CONSTANT_CASE
      /^[A-Z][A-Z0-9_]*$/, // UPPER_SNAKE_CASE
      /^\w+_\w+$/, // snake_case
      /^[A-Z][a-z]+(?:[A-Z][a-z]+)*$/, // PascalCase
    ];

    let matchCount = 0;
    for (const value of values) {
      if (codePatterns.some(pattern => pattern.test(value))) {
        matchCount++;
      }
    }

    return matchCount / values.length > 0.7;
  }

  /**
   * Check for natural language patterns
   * @param {string[]} values - Distinct string values
   * @returns {boolean} True when more than 50% of the values look like free text
   */
  hasNaturalLanguagePattern(values) {
    let naturalCount = 0;

    for (const value of values) {
      // Simple check: contains spaces and mostly letters
      if (/^[A-Za-z\s,.'"-]+$/.test(value) && value.includes(' ')) {
        naturalCount++;
      }
    }

    return naturalCount / values.length > 0.5;
  }

  /**
   * Check if values follow a structured format.
   * Each value is reduced to a shape string (uppercase -> 'A', lowercase ->
   * 'a', digit -> '0', anything else -> 'X') and the distinct shapes are counted.
   * @param {Array} values - Distinct values
   * @returns {boolean} False for fewer than 3 values; otherwise true when there are
   *   at most 3 distinct shapes or fewer than 0.2 shapes per value
   */
  hasStructuredFormat(values) {
    if (values.length < 3) return false;

    // Check for consistent patterns like XXX-000, ABC123, etc.
    const formatPatterns = values.map(v =>
      String(v)
        .replace(/[A-Z]/g, 'A')
        .replace(/[a-z]/g, 'a')
        .replace(/[0-9]/g, '0')
        .replace(/[^Aa0]/g, 'X')
    );

    const uniquePatterns = new Set(formatPatterns);

    // If most values follow the same 1-3 patterns, it's structured
    // Fixed logic: check if we have few unique patterns relative to total values
    const patternRatio = uniquePatterns.size / values.length;
    return uniquePatterns.size <= 3 || patternRatio < 0.2;
  }

  /**
   * Calculate semantic coherence (simplified).
   * Measures the longer of the common prefix and common suffix shared by all
   * values, relative to the length of the shortest value.
   * @param {Iterable<string>} uniqueValues - Distinct string values
   * @returns {number} Score in [0, 1]; 0 for fewer than 2 values
   */
  calculateSemanticCoherence(uniqueValues) {
    const values = Array.from(uniqueValues);
    if (values.length < 2) return 0;

    // Simple coherence: check if values share common prefixes/suffixes
    let commonPrefixScore = 0;
    let commonSuffixScore = 0;

    // Shortest value length. A reduce is used rather than
    // Math.min(...lengths) because spreading a very large array into call
    // arguments can throw a RangeError.
    const minLength = values.reduce((shortest, v) => Math.min(shortest, v.length), Infinity);

    // Find common prefix length
    for (let i = 0; i < minLength; i++) {
      const char = values[0][i];
      if (values.every(v => v[i] === char)) {
        commonPrefixScore++;
      } else {
        break;
      }
    }

    // Find common suffix length
    for (let i = 0; i < minLength; i++) {
      const char = values[0][values[0].length - 1 - i];
      if (values.every(v => v[v.length - 1 - i] === char)) {
        commonSuffixScore++;
      } else {
        break;
      }
    }

    // Normalize scores
    const maxScore = Math.max(commonPrefixScore, commonSuffixScore);
    return minLength > 0 ? maxScore / minLength : 0;
  }

  /**
   * Apply detection rules based on dataset size
   * @param {Object} metrics - Result of calculateMetrics
   * @param {string} fieldName - Field name, forwarded to the rule set
   * @returns {Object} EnumDetectionResult
   */
  applyDetectionRules(metrics, fieldName) {
    // NOTE(review): totalCount is the length of the array handed to
    // calculateMetrics, i.e. the sampled size when detectEnum sampled the
    // input. With the default sample size of 10000 the large-dataset branch
    // is only reached when sampling is disabled or largeDatasetSampleSize
    // exceeds 100000 - confirm this is intended.
    const datasetSize = metrics.totalCount;

    if (datasetSize <= 1000) {
      return this.detectSmallDatasetEnum(metrics, fieldName);
    } else if (datasetSize <= 100000) {
      return this.detectMediumDatasetEnum(metrics, fieldName);
    } else {
      return this.detectLargeDatasetEnum(metrics, fieldName);
    }
  }

  /**
   * Detection rules for small datasets (≤1000 items).
   * Rules are evaluated in order; the first match decides the result.
   * @param {Object} metrics - Result of calculateMetrics
   * @param {string} fieldName - Field name (currently not used by the rules)
   * @returns {Object} EnumDetectionResult
   */
  detectSmallDatasetEnum(metrics, fieldName) {
    // Very high cardinality suggests not an enum
    if (metrics.uniquenessRatio > 0.8) {
      return this.createResult(false, [], 'veryLow', 'Too many unique values for dataset size', 'notEnum');
    }

    // Standard sets are always enums
    if (metrics.hasCommonPatterns.isStandardSet) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'high', 'Matches standard set', 'standardSet');
    }

    // Code patterns with reasonable cardinality
    if (metrics.hasCommonPatterns.isCodePattern && metrics.uniqueCount <= 50) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'high', 'Code pattern with reasonable cardinality', 'applicationEnum');
    }

    // Low cardinality with high concentration
    if (metrics.uniqueCount <= 20 && metrics.concentrationRatio < 0.3) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'medium', 'Low cardinality with high concentration', 'applicationEnum');
    }

    // Natural language patterns usually aren't enums
    if (metrics.hasCommonPatterns.isNaturalLanguage) {
      return this.createResult(false, [], 'low', 'Natural language pattern detected', 'notEnum');
    }

    // Borderline cases
    if (metrics.uniqueCount <= 30 && metrics.entropyScore < 3) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'low', 'Borderline case - low entropy', 'applicationEnum');
    }

    return this.createResult(false, [], 'veryLow', 'Does not meet enum criteria', 'notEnum');
  }

  /**
   * Detection rules for medium datasets (≤100,000 items).
   * Rules are evaluated in order; the first match decides the result.
   * @param {Object} metrics - Result of calculateMetrics
   * @param {string} fieldName - Field name (currently not used by the rules)
   * @returns {Object} EnumDetectionResult
   */
  detectMediumDatasetEnum(metrics, fieldName) {
    // Very high cardinality definitely not an enum
    if (metrics.uniquenessRatio > 0.5 || metrics.uniqueCount > 1000) {
      return this.createResult(false, [], 'veryLow', 'Cardinality too high for enum', 'notEnum');
    }

    // Standard sets
    if (metrics.hasCommonPatterns.isStandardSet) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'high', 'Matches standard set', 'standardSet');
    }

    // Strong power law suggests enum-like behavior
    if (metrics.powerLawExponent > 1.5 && metrics.uniqueCount <= 100) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'medium', 'Strong power law distribution', 'applicationEnum');
    }

    // Code patterns with reasonable cardinality
    if (metrics.hasCommonPatterns.isCodePattern && metrics.uniqueCount <= 200) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'medium', 'Code pattern detected', 'applicationEnum');
    }

    // Very low entropy with reasonable cardinality
    if (metrics.entropyScore < 4 && metrics.uniqueCount <= 50) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'medium', 'Low entropy categorical data', 'applicationEnum');
    }

    return this.createResult(false, [], 'low', 'Does not meet enum criteria for medium dataset', 'notEnum');
  }

  /**
   * Detection rules for large datasets (>100,000 items).
   * Rules are evaluated in order; the first match decides the result.
   * @param {Object} metrics - Result of calculateMetrics
   * @param {string} fieldName - Field name (currently not used by the rules)
   * @returns {Object} EnumDetectionResult
   */
  detectLargeDatasetEnum(metrics, fieldName) {
    // For large datasets, be very conservative
    if (metrics.uniqueCount > 500) {
      return this.createResult(false, [], 'veryLow', 'Too many unique values for large dataset', 'notEnum');
    }

    // Standard sets with high confidence
    if (metrics.hasCommonPatterns.isStandardSet && metrics.semanticCoherence > 0.3) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'high', 'Standard set with high coherence', 'standardSet');
    }

    // Very strong power law with low cardinality
    if (metrics.powerLawExponent > 2.0 && metrics.uniqueCount <= 50) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'medium', 'Strong categorical behavior in large dataset', 'applicationEnum');
    }

    // Structured format with very low cardinality
    if (metrics.hasCommonPatterns.hasStructuredFormat && metrics.uniqueCount <= 20) {
      return this.createResult(true, Object.keys(metrics.frequencyDistribution), 'low', 'Structured categorical data', 'naturalConstraint');
    }

    return this.createResult(false, [], 'veryLow', 'Large dataset does not exhibit enum characteristics', 'notEnum');
  }

  /**
   * Create result object
   * @param {boolean} shouldCreate - Whether the values should become an enum
   * @param {string[]} enumValues - Candidate enum members (truncated to the first 1000)
   * @param {string} confidence - Confidence label (e.g. 'veryLow', 'low', 'medium', 'high')
   * @param {string} reasoning - Human-readable explanation of the decision
   * @param {string} category - Classification label (e.g. 'notEnum', 'standardSet')
   * @returns {{shouldCreateEnum: boolean, enumValues: string[], confidence: string, reasoning: string, category: string}}
   */
  createResult(shouldCreate, enumValues, confidence, reasoning, category) {
    return {
      shouldCreateEnum: shouldCreate,
      enumValues: enumValues.slice(0, 1000), // Limit for performance
      confidence: confidence,
      reasoning: reasoning,
      category: category
    };
  }

  /**
   * Initialize standard sets for matching
   * @returns {Object<string, string[]>} Map of set name -> known member values
   */
  initializeStandardSets() {
    return {
      booleans: ['true', 'false', 'yes', 'no', '1', '0', 'on', 'off'],
      months: ['January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
      weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
        'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
      countries: ['United States', 'Canada', 'Mexico', 'United Kingdom', 'France', 'Germany',
        'Spain', 'Italy', 'China', 'Japan', 'India', 'Brazil', 'Australia',
        'USA', 'US', 'UK', 'GB', 'FR', 'DE', 'ES', 'IT', 'CN', 'JP', 'IN', 'BR', 'AU'],
      states: ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA'],
      currencies: ['USD', 'EUR', 'GBP', 'JPY', 'CNY', 'AUD', 'CAD', 'CHF', 'HKD', 'SGD'],
      httpMethods: ['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'OPTIONS', 'HEAD'],
      httpStatus: ['200', '201', '204', '301', '302', '400', '401', '403', '404', '500', '502', '503'],
      priorities: ['low', 'medium', 'high', 'critical', 'urgent', '1', '2', '3', '4', '5'],
      sizes: ['small', 'medium', 'large', 'extra-large', 'XS', 'S', 'M', 'L', 'XL', 'XXL'],
      genders: ['male', 'female', 'other', 'M', 'F', 'O'],
      directions: ['north', 'south', 'east', 'west', 'N', 'S', 'E', 'W', 'NE', 'NW', 'SE', 'SW']
    };
  }
}

export { EnumDetector };

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/smallmindsco/MongTap'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.