inferrer.js
/**
 * Schema Inferrer Module
 * Port of DataFlood C# SchemaInferrer.cs
 * Infers DataFlood schemas from JSON documents
 */
import { DataFloodModel, DataFloodStringModel, DataFloodHistogram } from '../models/DataFloodModel.js';
import {
  detectType,
  detectArrayItemType,
  detectConsistentFormat,
  shouldBeEnum,
  analyzeNumericConstraints,
  analyzeStringConstraints,
  detectPattern,
} from './type-detector.js';
import logger from '../../utils/logger.js';

const log = logger.child('SchemaInferrer');

export class SchemaInferrer {
  constructor(options = {}) {
    this.enumThreshold = options.enumThreshold || 0.5;
    this.detectFormats = options.detectFormats !== false;
    this.detectPatterns = options.detectPatterns !== false;
    this.inferStringModels = options.inferStringModels !== false;
    this.inferHistograms = options.inferHistograms !== false;
    this.detectRelationships = options.detectRelationships || false;
  }

  /**
   * Infer schema from array of documents
   * Matches C# InferSchema method
   */
  inferSchema(documents) {
    if (!documents || documents.length === 0) {
      throw new Error('No documents provided for schema inference');
    }

    log.debug(`Inferring schema from ${documents.length} documents`);

    // Determine root types
    const types = [...new Set(documents.map(detectType))];

    let rootSchema;
    if (types.length === 1) {
      rootSchema = this.inferSchemaForType(documents, types[0]);
    } else {
      // Multiple types - create union schema with anyOf
      rootSchema = new DataFloodModel({
        anyOf: types.map(type => {
          const docsOfType = documents.filter(d => detectType(d) === type);
          return this.inferSchemaForType(docsOfType, type);
        }),
      });
    }

    // Set schema URL only at root level (matching C#)
    rootSchema.$schema = 'https://smallminds.co/DataFlood/schema#';

    return rootSchema;
  }
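
  // Illustrative usage of inferSchema (a sketch only; the sample documents
  // below are assumptions for demonstration, not data from the project):
  //
  //   const inferrer = new SchemaInferrer();
  //   const schema = inferrer.inferSchema([
  //     { name: 'Ada', age: 36 },
  //     { name: 'Alan', age: 41 },
  //   ]);
  //   // schema.type === 'object'
  //   // schema.required -> ['age', 'name'] (present in every document, sorted)
  //   // schema.$schema is set only on this root schema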

  /**
   * Infer schema for specific type
   * Matches C# InferSchemaForType method
   */
  inferSchemaForType(documents, type, isRoot = false) {
    // Create schema WITHOUT $schema property (only root should have it)
    const schema = new DataFloodModel({ type });
    if (!isRoot) {
      delete schema.$schema; // Remove $schema for non-root schemas
    }

    switch (type) {
      case 'object':
        this.inferObjectSchema(schema, documents);
        break;
      case 'array':
        this.inferArraySchema(schema, documents);
        break;
      case 'string':
        this.inferStringSchema(schema, documents);
        break;
      case 'number':
      case 'integer':
        this.inferNumberSchema(schema, documents, type);
        break;
      case 'boolean':
        // Boolean has no additional constraints
        break;
      case 'null':
        schema.type = 'null';
        break;
    }

    return schema;
  }

  /**
   * Infer object schema
   * Matches C# InferObjectSchema method
   */
  inferObjectSchema(schema, objects) {
    const allProperties = {};
    const requiredProperties = new Set();

    // Collect all properties across all objects
    for (const obj of objects) {
      for (const [key, value] of Object.entries(obj)) {
        if (!allProperties[key]) {
          allProperties[key] = [];
        }
        if (value !== null && value !== undefined) {
          allProperties[key].push(value);
        }
      }
    }

    // Determine required properties (present in all objects)
    for (const propName of Object.keys(allProperties)) {
      const presentCount = objects.filter(obj => propName in obj).length;
      if (presentCount === objects.length) {
        requiredProperties.add(propName);
      }
    }

    // Generate schema for each property
    schema.properties = {};
    for (const [propName, values] of Object.entries(allProperties)) {
      if (values.length > 0) {
        // Infer type from all values
        const propTypes = [...new Set(values.map(detectType))];
        if (propTypes.length === 1) {
          schema.properties[propName] = this.inferSchemaForType(values, propTypes[0]);
        } else {
          // Multiple types for this property - use anyOf
          // Don't use DataFloodModel for nested schemas to avoid adding $schema
          schema.properties[propName] = {
            anyOf: propTypes.map(type => {
              const valuesOfType = values.filter(v => detectType(v) === type);
              const subSchema = this.inferSchemaForType(valuesOfType, type);
              // Remove $schema from nested schemas
              delete subSchema.$schema;
              return subSchema;
            })
          };
        }
      }
    }

    if (requiredProperties.size > 0) {
      schema.required = Array.from(requiredProperties).sort();
    }

    // Detect potential relationships (foreign keys)
    if (this.detectRelationships) {
      const relationships = this.detectPotentialRelationships(schema.properties, objects);
      if (relationships.length > 0) {
        schema.relationships = relationships;
      }
    }
  }

  /**
   * Detect potential foreign key relationships based on field naming and patterns
   * This is a heuristic approach for basic relationship detection
   */
  detectPotentialRelationships(properties, objects) {
    const relationships = [];
    const foreignKeyPatterns = [
      /_id$/i,        // user_id, account_id, etc.
      /_ref$/i,       // user_ref, account_ref
      /_key$/i,       // foreign_key
      /^parent_/i,    // parent_id, parent_ref
      /^child_/i,     // child_id
      /^reference_/i, // reference_number
      /^related_/i,   // related_entity
    ];

    for (const [fieldName, fieldSchema] of Object.entries(properties)) {
      // Check if field name matches foreign key patterns
      const isForeignKeyCandidate = foreignKeyPatterns.some(pattern => pattern.test(fieldName));

      if (isForeignKeyCandidate) {
        // Analyze the field values
        const fieldValues = objects.map(obj => obj[fieldName]).filter(v => v != null);

        // Check if values look like IDs (strings or numbers that are unique or semi-unique)
        if (fieldValues.length > 0) {
          const uniqueValues = new Set(fieldValues);
          const uniqueRatio = uniqueValues.size / fieldValues.length;

          // High unique ratio suggests this could be a foreign key
          if (uniqueRatio > 0.5) {
            const relationship = {
              field: fieldName,
              type: 'foreign_key',
              confidence: Math.round(uniqueRatio * 100) / 100,
              valueType: fieldSchema.type || 'unknown',
            };

            // Try to infer the referenced entity from field name
            const entityMatch = fieldName.match(/^(.+?)_(?:id|ref|key)$/i);
            if (entityMatch) {
              relationship.referencedEntity = entityMatch[1];
            }

            // Check if it's a parent/child relationship
            if (/^parent_/i.test(fieldName)) {
              relationship.relationshipType = 'parent';
            } else if (/^child_/i.test(fieldName)) {
              relationship.relationshipType = 'child';
            } else {
              relationship.relationshipType = 'reference';
            }

            relationships.push(relationship);
          }
        }
      }
    }

    return relationships;
  }
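
  // Sketch of relationship detection (field names and values below are
  // assumptions for demonstration, not captured output):
  //
  //   const inferrer = new SchemaInferrer({ detectRelationships: true });
  //   const schema = inferrer.inferSchema([
  //     { order_id: 'o1', user_id: 'u1' },
  //     { order_id: 'o2', user_id: 'u2' },
  //   ]);
  //   // schema.relationships would include entries such as:
  //   // { field: 'user_id', type: 'foreign_key', confidence: 1,
  //   //   valueType: 'string', referencedEntity: 'user',
  //   //   relationshipType: 'reference' }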

  /**
   * Infer array schema
   * Matches C# InferArraySchema method
   */
  inferArraySchema(schema, arrays) {
    const allItems = [];

    // Collect all items from all arrays
    for (const array of arrays) {
      for (const item of array) {
        if (item !== null && item !== undefined) {
          allItems.push(item);
        }
      }
    }

    if (allItems.length > 0) {
      // Determine item types
      const itemTypes = [...new Set(allItems.map(detectType))];
      if (itemTypes.length === 1) {
        schema.items = this.inferSchemaForType(allItems, itemTypes[0]);
      } else {
        // Multiple item types - use anyOf
        // Don't use DataFloodModel for nested schemas to avoid adding $schema
        schema.items = {
          anyOf: itemTypes.map(type => {
            const itemsOfType = allItems.filter(item => detectType(item) === type);
            const subSchema = this.inferSchemaForType(itemsOfType, type);
            // Remove $schema from nested schemas
            delete subSchema.$schema;
            return subSchema;
          })
        };
      }
    }

    // Set min/max items
    const lengths = arrays.map(a => a.length);
    if (lengths.length > 0) {
      const uniqueLengths = [...new Set(lengths)];
      if (uniqueLengths.length === 1) {
        // All arrays same length
        schema.minItems = uniqueLengths[0];
        schema.maxItems = uniqueLengths[0];
      } else {
        schema.minItems = Math.min(...lengths);
        schema.maxItems = Math.max(...lengths);
      }
    }

    // Mark uniqueItems if any sampled array contains only unique items
    for (const array of arrays) {
      const uniqueItems = [...new Set(array.map(item => JSON.stringify(item)))];
      if (uniqueItems.length === array.length) {
        schema.uniqueItems = true;
        break;
      }
    }
  }

  /**
   * Infer string schema
   * Matches C# InferStringSchema method
   */
  inferStringSchema(schema, strings) {
    if (strings.length === 0) {
      return;
    }

    // Check for enum
    if (shouldBeEnum(strings, this.enumThreshold)) {
      schema.enum = [...new Set(strings)].sort();
      // Don't return early - still need to set min/max length
    }

    // String length constraints
    const constraints = analyzeStringConstraints(strings);
    schema.minLength = constraints.minLength;
    schema.maxLength = constraints.maxLength;

    // Detect format
    if (this.detectFormats) {
      const format = detectConsistentFormat(strings);
      if (format) {
        schema.format = format;
      }
    }

    // Detect pattern
    if (this.detectPatterns && !schema.format) {
      const pattern = detectPattern(strings);
      if (pattern) {
        schema.pattern = pattern;
      }
    }

    // Build string model if requested
    if (this.inferStringModels) {
      schema.stringModel = this.buildStringModel(strings);
    }
  }

  /**
   * Infer number/integer schema
   * Matches C# InferNumberSchema method
   */
  inferNumberSchema(schema, numbers, type) {
    if (numbers.length === 0) {
      return;
    }

    const constraints = analyzeNumericConstraints(numbers);
    schema.minimum = constraints.minimum;
    schema.maximum = constraints.maximum;

    if (constraints.multipleOf) {
      schema.multipleOf = constraints.multipleOf;
    }

    // Build histogram if requested (requires at least 10 samples)
    if (this.inferHistograms && numbers.length >= 10) {
      schema.histogram = this.buildHistogram(numbers);
    }
  }
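
  // Sketch of number inference (sample values are assumptions): with fewer
  // than 10 samples only bounds are kept; at 10 or more a histogram is
  // attached as well.
  //
  //   const few = new SchemaInferrer().inferSchemaForType([1, 5, 9], 'integer');
  //   // few.minimum/few.maximum reflect the observed range; few.histogram is unset
  //
  //   const many = new SchemaInferrer().inferSchemaForType(
  //     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'integer');
  //   // many.histogram -> a DataFloodHistogram built over the ten samples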

  /**
   * Build string model from samples
   * Matches DataFlood C# StringModelGenerator.cs implementation
   */
  buildStringModel(strings, maxSamples = 20) {
    if (!strings || strings.length === 0) {
      return new DataFloodStringModel();
    }

    const modelData = {
      minLength: Math.min(...strings.map(s => s.length)),
      maxLength: Math.max(...strings.map(s => s.length)),
      averageLength: Math.round(strings.reduce((sum, s) => sum + s.length, 0) / strings.length * 100) / 100,
      uniqueCharacters: [],
      characterFrequency: {},
      characterProbability: {},
      patterns: {},
      nGrams: {},
      commonPrefixes: {},
      commonSuffixes: {},
      valueFrequency: {},
      sampleValues: [],
      uniqueValues: [],
      totalSamples: strings.length,
      uniqueValueCount: 0,
      entropyScore: 0,
      maxEntropy: 0,
      complexity: 0
    };

    // Character frequency analysis
    const charCounts = {};
    let totalChars = 0;
    for (const str of strings) {
      for (const char of str) {
        charCounts[char] = (charCounts[char] || 0) + 1;
        totalChars++;
      }
    }
    modelData.uniqueCharacters = Object.keys(charCounts).sort();
    modelData.characterFrequency = charCounts;

    // Character probability
    for (const [char, count] of Object.entries(charCounts)) {
      modelData.characterProbability[char] = Math.round((count / totalChars) * 10000) / 10000;
    }

    // Extract patterns
    const patterns = {};
    for (const str of strings) {
      const pattern = this.generatePattern(str);
      patterns[pattern] = (patterns[pattern] || 0) + 1;
    }

    // Keep top 10 patterns
    modelData.patterns = Object.entries(patterns)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10)
      .reduce((obj, [key, val]) => ({ ...obj, [key]: val }), {});

    // Analyze prefixes and suffixes
    const prefixes = {};
    const suffixes = {};
    const prefixLength = Math.min(3, modelData.minLength);
    const suffixLength = Math.min(3, modelData.minLength);

    if (prefixLength > 0) {
      for (const str of strings) {
        for (let len = 1; len <= Math.min(prefixLength, str.length); len++) {
          const prefix = str.substring(0, len);
          prefixes[prefix] = (prefixes[prefix] || 0) + 1;
        }
      }
    }

    if (suffixLength > 0) {
      for (const str of strings) {
        for (let len = 1; len <= Math.min(suffixLength, str.length); len++) {
          const suffix = str.substring(str.length - len);
          suffixes[suffix] = (suffixes[suffix] || 0) + 1;
        }
      }
    }

    // Keep common prefixes/suffixes (appearing more than once)
    modelData.commonPrefixes = Object.entries(prefixes)
      .filter(([_, count]) => count > 1)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10)
      .reduce((obj, [key, val]) => ({ ...obj, [key]: val }), {});

    modelData.commonSuffixes = Object.entries(suffixes)
      .filter(([_, count]) => count > 1)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10)
      .reduce((obj, [key, val]) => ({ ...obj, [key]: val }), {});

    // Generate n-grams (2-grams and 3-grams)
    const nGrams = {};
    for (const str of strings) {
      for (let n = 2; n <= Math.min(3, str.length); n++) {
        for (let i = 0; i <= str.length - n; i++) {
          const nGram = str.substring(i, i + n);
          nGrams[nGram] = (nGrams[nGram] || 0) + 1;
        }
      }
    }

    // Keep top 20 n-grams that appear more than once
    modelData.nGrams = Object.entries(nGrams)
      .filter(([_, count]) => count > 1)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 20)
      .reduce((obj, [key, val]) => ({ ...obj, [key]: val }), {});

    // Analyze value frequencies
    const valueFreq = {};
    for (const str of strings) {
      valueFreq[str] = (valueFreq[str] || 0) + 1;
    }
    modelData.valueFrequency = valueFreq;
    modelData.uniqueValues = Object.keys(valueFreq);
    modelData.uniqueValueCount = modelData.uniqueValues.length;

    // Store sample values (most frequent ones first)
    modelData.sampleValues = Object.entries(valueFreq)
      .sort((a, b) => b[1] - a[1])
      .slice(0, maxSamples)
      .map(([key]) => key);

    // Create string model instance and calculate entropy/complexity
    const stringModel = new DataFloodStringModel(modelData);
    stringModel.calculateEntropy();
    stringModel.calculateMaxEntropy();
    stringModel.calculateComplexity();

    return stringModel;
  }

  /**
   * Generate pattern from string (matching DataFlood C#)
   * d = digit, U = uppercase, L = lowercase, s = space, p = punctuation/special
   */
  generatePattern(input) {
    if (!input) return '';

    let pattern = '';
    for (const ch of input) {
      if (/\d/.test(ch)) {
        pattern += 'd';
      } else if (/[A-Z]/.test(ch)) {
        pattern += 'U';
      } else if (/[a-z]/.test(ch)) {
        pattern += 'L';
      } else if (/\s/.test(ch)) {
        pattern += 's';
      } else {
        pattern += 'p'; // punctuation/special
      }
    }

    // Compress consecutive patterns (e.g., "ddd" -> "d{3}")
    return pattern.replace(/(.)\1+/g, (match, char) => `${char}{${match.length}}`);
  }
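
  // Worked examples of the pattern alphabet above (inputs are assumptions
  // chosen for illustration):
  //
  //   new SchemaInferrer().generatePattern('AB-1234');  // -> 'U{2}pd{4}'
  //   new SchemaInferrer().generatePattern('hello 42'); // -> 'L{5}sd{2}'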

  /**
   * Build histogram from numeric samples
   * Matches DataFlood C# HistogramGenerator.cs implementation
   */
  buildHistogram(numbers) {
    if (!numbers || numbers.length === 0) {
      return new DataFloodHistogram();
    }

    const sortedNumbers = [...numbers].sort((a, b) => a - b);
    const minValue = sortedNumbers[0];
    const maxValue = sortedNumbers[sortedNumbers.length - 1];
    const range = maxValue - minValue;

    const histogramData = {
      bins: [],
      totalCount: numbers.length,
      minValue: minValue,
      maxValue: maxValue,
      complexity: 0,
      standardDeviation: 0,
      entropyScore: 0,
      maxEntropy: 0,
    };

    // Calculate standard deviation (sample form, n - 1 denominator)
    if (numbers.length > 1) {
      const mean = numbers.reduce((sum, n) => sum + n, 0) / numbers.length;
      const sumOfSquares = numbers.reduce((sum, n) => sum + Math.pow(n - mean, 2), 0);
      histogramData.standardDeviation = Math.sqrt(sumOfSquares / (numbers.length - 1));
    }

    // Handle case where all values are the same
    if (range === 0) {
      histogramData.bins.push({
        rangeStart: minValue,
        rangeEnd: minValue,
        count: numbers.length,
        freqStart: 0.0,
        freqEnd: 100.0
      });
      const histogram = new DataFloodHistogram(histogramData);
      histogram.calculateEntropy();
      histogram.calculateComplexity();
      return histogram;
    }

    // Create bins (default 10 bins)
    const binCount = 10;
    const binWidth = range / binCount;
    let cumulativeFrequency = 0.0;

    for (let i = 0; i < binCount; i++) {
      const rangeStart = minValue + (i * binWidth);
      const rangeEnd = i === binCount - 1 ? maxValue : minValue + ((i + 1) * binWidth);

      const binNumbers = numbers.filter(n => {
        if (i === binCount - 1) {
          return n >= rangeStart && n <= rangeEnd;
        }
        return n >= rangeStart && n < rangeEnd;
      });

      // Only add bins that contain data (frequency > 0)
      if (binNumbers.length > 0) {
        const binFrequencyPercent = Math.round((binNumbers.length / numbers.length) * 100 * 100) / 100;
        const freqStart = cumulativeFrequency;
        const freqEnd = cumulativeFrequency + binFrequencyPercent;

        histogramData.bins.push({
          rangeStart: Math.round(rangeStart * 10000) / 10000,
          rangeEnd: Math.round(rangeEnd * 10000) / 10000,
          count: binNumbers.length,
          freqStart: Math.round(freqStart * 100) / 100,
          freqEnd: Math.round(freqEnd * 100) / 100
        });

        cumulativeFrequency = freqEnd;
      }
    }

    // Create histogram instance and calculate entropy/complexity
    const histogram = new DataFloodHistogram(histogramData);
    histogram.calculateEntropy();
    histogram.calculateComplexity();

    return histogram;
  }
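
  // Worked example of the binning math (sample values assumed): for the ten
  // samples 1..10, range = 9 and binWidth = 0.9, so the first bin covers
  // [1, 1.9) with count 1, freqStart 0, freqEnd 10, and the last bin covers
  // [9.1, 10] (closed on the right, so maxValue is always included).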

  /**
   * Merge two schemas
   * Based on DataFlood C# MergeSchemas logic
   */
  mergeSchemas(schema1, schema2) {
    if (!schema1) return schema2;
    if (!schema2) return schema1;

    const merged = new DataFloodModel({
      $schema: schema1.$schema || schema2.$schema,
    });

    // Merge types
    if (schema1.type === schema2.type) {
      merged.type = schema1.type;
    } else if (schema1.type && schema2.type) {
      // Different types - use anyOf
      merged.anyOf = [
        { type: schema1.type },
        { type: schema2.type }
      ];
    }

    // Merge properties for objects
    if (schema1.properties || schema2.properties) {
      merged.properties = {};

      // Add all properties from both schemas
      const allProps = new Set([
        ...Object.keys(schema1.properties || {}),
        ...Object.keys(schema2.properties || {})
      ]);

      for (const prop of allProps) {
        const prop1 = schema1.properties?.[prop];
        const prop2 = schema2.properties?.[prop];

        if (prop1 && prop2) {
          // Property exists in both - merge them
          merged.properties[prop] = this.mergeSchemas(prop1, prop2);
        } else {
          // Property exists in only one schema
          merged.properties[prop] = prop1 || prop2;
        }
      }
    }

    // Merge required fields (intersection - only if required in both)
    if (schema1.required && schema2.required) {
      const required = schema1.required.filter(r => schema2.required.includes(r));
      if (required.length > 0) {
        merged.required = required;
      }
    }

    // Merge array items
    if (schema1.items || schema2.items) {
      merged.items = this.mergeSchemas(schema1.items, schema2.items);
    }

    // Preserve constraints (take most restrictive)
    if (schema1.minimum !== undefined || schema2.minimum !== undefined) {
      merged.minimum = Math.max(schema1.minimum ?? -Infinity, schema2.minimum ?? -Infinity);
    }
    if (schema1.maximum !== undefined || schema2.maximum !== undefined) {
      merged.maximum = Math.min(schema1.maximum ?? Infinity, schema2.maximum ?? Infinity);
    }
    if (schema1.minLength !== undefined || schema2.minLength !== undefined) {
      merged.minLength = Math.max(schema1.minLength ?? 0, schema2.minLength ?? 0);
    }
    if (schema1.maxLength !== undefined || schema2.maxLength !== undefined) {
      merged.maxLength = Math.min(schema1.maxLength ?? Infinity, schema2.maxLength ?? Infinity);
    }

    // Merge string models and histograms
    if (schema1.stringModel || schema2.stringModel) {
      // For now, prefer the first schema's model
      merged.stringModel = schema1.stringModel || schema2.stringModel;
    }
    if (schema1.histogram || schema2.histogram) {
      merged.histogram = schema1.histogram || schema2.histogram;
    }

    // Merge format and pattern
    merged.format = schema1.format || schema2.format;
    merged.pattern = schema1.pattern || schema2.pattern;

    return merged;
  }
}

// Export default instance and class
const defaultInferrer = new SchemaInferrer();
export default defaultInferrer;
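
// Illustrative end-to-end sketch. The documents below are assumptions for
// demonstration, not data from the original project:
//
//   import inferrer, { SchemaInferrer } from './inferrer.js';
//
//   // Infer a schema per batch of sampled documents...
//   const batch1 = inferrer.inferSchema([{ sku: 'A-100', qty: 2 }]);
//   const batch2 = inferrer.inferSchema([{ sku: 'B-200', qty: 7 }]);
//
//   // ...then fold the batches together. Fields required in both schemas
//   // survive the intersection, and numeric/string bounds are merged toward
//   // the most restrictive values.
//   const combined = new SchemaInferrer().mergeSchemas(batch1, batch2);
//   // combined.required -> ['qty', 'sku']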
