/**
* Schema Inferrer Module
* Port of DataFlood C# SchemaInferrer.cs
* Infers DataFlood schemas from JSON documents
*/
import { DataFloodModel, DataFloodStringModel, DataFloodHistogram } from '../models/DataFloodModel.js';
import {
detectType,
detectArrayItemType,
detectConsistentFormat,
shouldBeEnum,
analyzeNumericConstraints,
analyzeStringConstraints,
detectPattern,
} from './type-detector.js';
import logger from '../../utils/logger.js';
const log = logger.child('SchemaInferrer');
export class SchemaInferrer {
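/**
* @param {object} [options] - Inference options; defaults match the
* assignments below. enumThreshold is forwarded to shouldBeEnum in
* type-detector, which defines its exact semantics.
* @param {number} [options.enumThreshold=0.5]
* @param {boolean} [options.detectFormats=true]
* @param {boolean} [options.detectPatterns=true]
* @param {boolean} [options.inferStringModels=true]
* @param {boolean} [options.inferHistograms=true]
* @param {boolean} [options.detectRelationships=false]
*/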
constructor(options = {}) {
this.enumThreshold = options.enumThreshold ?? 0.5; // ?? so an explicit 0 is not overridden
this.detectFormats = options.detectFormats !== false;
this.detectPatterns = options.detectPatterns !== false;
this.inferStringModels = options.inferStringModels !== false;
this.inferHistograms = options.inferHistograms !== false;
this.detectRelationships = options.detectRelationships || false;
}
/**
* Infer schema from array of documents
* Matches C# InferSchema method
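*
* Illustrative usage; exact constraint values come from the type-detector
* helpers and the DataFloodModel shape:
* @example
* const inferrer = new SchemaInferrer();
* const schema = inferrer.inferSchema([{ id: 1 }, { id: 2, name: 'a' }]);
* // => { type: 'object', required: ['id'], properties: { id, name }, ... }
* // with $schema set only at this root level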
*/
inferSchema(documents) {
if (!documents || documents.length === 0) {
throw new Error('No documents provided for schema inference');
}
log.debug(`Inferring schema from ${documents.length} documents`);
// Determine root types
const types = [...new Set(documents.map(detectType))];
let rootSchema;
if (types.length === 1) {
rootSchema = this.inferSchemaForType(documents, types[0]);
} else {
// Multiple types - create union schema with anyOf
rootSchema = new DataFloodModel({
anyOf: types.map(type => {
const docsOfType = documents.filter(d => detectType(d) === type);
return this.inferSchemaForType(docsOfType, type);
}),
});
}
// Set schema URL only at root level (matching C#)
rootSchema.$schema = 'https://smallminds.co/DataFlood/schema#';
return rootSchema;
}
/**
* Infer schema for specific type
* Matches C# InferSchemaForType method
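*
* A sketch of the dispatch (bounds below come from analyzeNumericConstraints):
* @example
* new SchemaInferrer().inferSchemaForType([1, 2, 3], 'integer');
* // => roughly { type: 'integer', minimum: 1, maximum: 3 }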
*/
inferSchemaForType(documents, type, isRoot = false) {
// Create schema WITHOUT $schema property (only root should have it)
const schema = new DataFloodModel({ type });
if (!isRoot) {
delete schema.$schema; // Remove $schema for non-root schemas
}
switch (type) {
case 'object':
this.inferObjectSchema(schema, documents);
break;
case 'array':
this.inferArraySchema(schema, documents);
break;
case 'string':
this.inferStringSchema(schema, documents);
break;
case 'number':
case 'integer':
this.inferNumberSchema(schema, documents, type);
break;
case 'boolean':
// Boolean has no additional constraints
break;
case 'null':
schema.type = 'null';
break;
}
return schema;
}
/**
* Infer object schema
* Matches C# InferObjectSchema method
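*
* A property is required only when every sampled object contains it:
* @example
* const schema = new DataFloodModel({ type: 'object' });
* new SchemaInferrer().inferObjectSchema(schema, [{ a: 1, b: 'x' }, { a: 2 }]);
* // schema.properties covers 'a' and 'b'; schema.required => ['a']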
*/
inferObjectSchema(schema, objects) {
const allProperties = {};
const requiredProperties = new Set();
// Collect all properties across all objects
for (const obj of objects) {
for (const [key, value] of Object.entries(obj)) {
if (!allProperties[key]) {
allProperties[key] = [];
}
if (value !== null && value !== undefined) {
allProperties[key].push(value);
}
}
}
// Determine required properties (present in all objects)
for (const propName of Object.keys(allProperties)) {
const presentCount = objects.filter(obj => propName in obj).length;
if (presentCount === objects.length) {
requiredProperties.add(propName);
}
}
// Generate schema for each property
schema.properties = {};
for (const [propName, values] of Object.entries(allProperties)) {
if (values.length > 0) {
// Infer type from all values
const propTypes = [...new Set(values.map(detectType))];
if (propTypes.length === 1) {
schema.properties[propName] = this.inferSchemaForType(values, propTypes[0]);
} else {
// Multiple types for this property - use anyOf
// Wrap in a plain object (not DataFloodModel) so the anyOf container gains no $schema
schema.properties[propName] = {
anyOf: propTypes.map(type => {
const valuesOfType = values.filter(v => detectType(v) === type);
const subSchema = this.inferSchemaForType(valuesOfType, type);
// Remove $schema from nested schemas
delete subSchema.$schema;
return subSchema;
})
};
}
}
}
if (requiredProperties.size > 0) {
schema.required = Array.from(requiredProperties).sort();
}
// Detect potential relationships (foreign keys)
if (this.detectRelationships) {
const relationships = this.detectPotentialRelationships(schema.properties, objects);
if (relationships.length > 0) {
schema.relationships = relationships;
}
}
}
/**
* Detect potential foreign key relationships based on field naming and patterns
* This is a heuristic approach for basic relationship detection
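*
* Illustrative result for a *_id field whose values are mostly unique:
* @example
* new SchemaInferrer().detectPotentialRelationships(
* { user_id: { type: 'string' } },
* [{ user_id: 'u1' }, { user_id: 'u2' }]
* );
* // => [{ field: 'user_id', type: 'foreign_key', confidence: 1,
* //      valueType: 'string', referencedEntity: 'user',
* //      relationshipType: 'reference' }]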
*/
detectPotentialRelationships(properties, objects) {
const relationships = [];
const foreignKeyPatterns = [
/_id$/i, // user_id, account_id, etc.
/_ref$/i, // user_ref, account_ref
/_key$/i, // foreign_key
/^parent_/i, // parent_id, parent_ref
/^child_/i, // child_id
/^reference_/i, // reference_number
/^related_/i, // related_entity
];
for (const [fieldName, fieldSchema] of Object.entries(properties)) {
// Check if field name matches foreign key patterns
const isForeignKeyCandidate = foreignKeyPatterns.some(pattern => pattern.test(fieldName));
if (isForeignKeyCandidate) {
// Analyze the field values
const fieldValues = objects.map(obj => obj[fieldName]).filter(v => v != null);
// Check if values look like IDs (strings or numbers that are unique or semi-unique)
if (fieldValues.length > 0) {
const uniqueValues = new Set(fieldValues);
const uniqueRatio = uniqueValues.size / fieldValues.length;
// High unique ratio suggests this could be a foreign key
if (uniqueRatio > 0.5) {
const relationship = {
field: fieldName,
type: 'foreign_key',
confidence: Math.round(uniqueRatio * 100) / 100,
valueType: fieldSchema.type || 'unknown',
};
// Try to infer the referenced entity from field name
const entityMatch = fieldName.match(/^(.+?)_(?:id|ref|key)$/i);
if (entityMatch) {
relationship.referencedEntity = entityMatch[1];
}
// Check if it's a parent/child relationship
if (/^parent_/i.test(fieldName)) {
relationship.relationshipType = 'parent';
} else if (/^child_/i.test(fieldName)) {
relationship.relationshipType = 'child';
} else {
relationship.relationshipType = 'reference';
}
relationships.push(relationship);
}
}
}
}
return relationships;
}
/**
* Infer array schema
* Matches C# InferArraySchema method
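*
* Length bounds span the sampled arrays; uniqueItems is set as soon as any
* sampled array has no duplicates:
* @example
* const schema = new DataFloodModel({ type: 'array' });
* new SchemaInferrer().inferArraySchema(schema, [[1, 2], [3, 4, 5]]);
* // schema.minItems => 2, schema.maxItems => 3, schema.uniqueItems => true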
*/
inferArraySchema(schema, arrays) {
const allItems = [];
// Collect all items from all arrays
for (const array of arrays) {
for (const item of array) {
if (item !== null && item !== undefined) {
allItems.push(item);
}
}
}
if (allItems.length > 0) {
// Determine item types
const itemTypes = [...new Set(allItems.map(detectType))];
if (itemTypes.length === 1) {
schema.items = this.inferSchemaForType(allItems, itemTypes[0]);
} else {
// Multiple item types - use anyOf
// Wrap in a plain object (not DataFloodModel) so the anyOf container gains no $schema
schema.items = {
anyOf: itemTypes.map(type => {
const itemsOfType = allItems.filter(item => detectType(item) === type);
const subSchema = this.inferSchemaForType(itemsOfType, type);
// Remove $schema from nested schemas
delete subSchema.$schema;
return subSchema;
})
};
}
}
// Set min/max items
const lengths = arrays.map(a => a.length);
if (lengths.length > 0) {
const uniqueLengths = [...new Set(lengths)];
if (uniqueLengths.length === 1) {
// All arrays same length
schema.minItems = uniqueLengths[0];
schema.maxItems = uniqueLengths[0];
} else {
schema.minItems = Math.min(...lengths);
schema.maxItems = Math.max(...lengths);
}
}
// Check for unique items
for (const array of arrays) {
// Use an arrow fn so map's index/array args aren't forwarded to JSON.stringify
const uniqueItems = [...new Set(array.map(item => JSON.stringify(item)))];
if (uniqueItems.length === array.length) {
schema.uniqueItems = true;
break;
}
}
}
/**
* Infer string schema
* Matches C# InferStringSchema method
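*
* With few distinct values the field becomes an enum (subject to
* shouldBeEnum and the configured threshold); length bounds are still set:
* @example
* const schema = new DataFloodModel({ type: 'string' });
* new SchemaInferrer().inferStringSchema(schema, ['red', 'green', 'red', 'green', 'red']);
* // likely: schema.enum => ['green', 'red'], plus minLength/maxLength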
*/
inferStringSchema(schema, strings) {
if (strings.length === 0) {
return;
}
// Check for enum
if (shouldBeEnum(strings, this.enumThreshold)) {
schema.enum = [...new Set(strings)].sort();
// Don't return early - still need to set min/max length
}
// String length constraints
const constraints = analyzeStringConstraints(strings);
schema.minLength = constraints.minLength;
schema.maxLength = constraints.maxLength;
// Detect format
if (this.detectFormats) {
const format = detectConsistentFormat(strings);
if (format) {
schema.format = format;
}
}
// Detect pattern
if (this.detectPatterns && !schema.format) {
const pattern = detectPattern(strings);
if (pattern) {
schema.pattern = pattern;
}
}
// Build string model if requested
if (this.inferStringModels) {
schema.stringModel = this.buildStringModel(strings);
}
}
/**
* Infer number/integer schema
* Matches C# InferNumberSchema method
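*
* Bounds track the observed values; a histogram is only built once there
* are at least 10 samples:
* @example
* const schema = new DataFloodModel({ type: 'integer' });
* new SchemaInferrer().inferNumberSchema(schema, [3, 7, 5], 'integer');
* // roughly: schema.minimum => 3, schema.maximum => 7, no histogram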
*/
inferNumberSchema(schema, numbers, type) {
if (numbers.length === 0) {
return;
}
const constraints = analyzeNumericConstraints(numbers);
schema.minimum = constraints.minimum;
schema.maximum = constraints.maximum;
if (constraints.multipleOf) {
schema.multipleOf = constraints.multipleOf;
}
// Build histogram if requested
if (this.inferHistograms && numbers.length >= 10) {
schema.histogram = this.buildHistogram(numbers);
}
}
/**
* Build string model from samples
* Matches DataFlood C# StringModelGenerator.cs implementation
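*
* Small worked example (the other frequency maps behave the same way):
* @example
* const m = new SchemaInferrer().buildStringModel(['cat', 'car']);
* // m.patterns => { 'L{3}': 2 }, m.commonPrefixes => { c: 2, ca: 2 },
* // m.nGrams => { ca: 2 }, m.minLength => 3, m.maxLength => 3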
*/
buildStringModel(strings, maxSamples = 20) {
if (!strings || strings.length === 0) {
return new DataFloodStringModel();
}
const modelData = {
minLength: Math.min(...strings.map(s => s.length)),
maxLength: Math.max(...strings.map(s => s.length)),
averageLength: Math.round(strings.reduce((sum, s) => sum + s.length, 0) / strings.length * 100) / 100,
uniqueCharacters: [],
characterFrequency: {},
characterProbability: {},
patterns: {},
nGrams: {},
commonPrefixes: {},
commonSuffixes: {},
valueFrequency: {},
sampleValues: [],
uniqueValues: [],
totalSamples: strings.length,
uniqueValueCount: 0,
entropyScore: 0,
maxEntropy: 0,
complexity: 0
};
// Character frequency analysis
const charCounts = {};
let totalChars = 0;
for (const str of strings) {
for (const char of str) {
charCounts[char] = (charCounts[char] || 0) + 1;
totalChars++;
}
}
modelData.uniqueCharacters = Object.keys(charCounts).sort();
modelData.characterFrequency = charCounts;
// Character probability
for (const [char, count] of Object.entries(charCounts)) {
modelData.characterProbability[char] = Math.round((count / totalChars) * 10000) / 10000;
}
// Extract patterns
const patterns = {};
for (const str of strings) {
const pattern = this.generatePattern(str);
patterns[pattern] = (patterns[pattern] || 0) + 1;
}
// Keep top 10 patterns
modelData.patterns = Object.fromEntries(Object.entries(patterns)
.sort((a, b) => b[1] - a[1])
.slice(0, 10));
// Analyze prefixes and suffixes
const prefixes = {};
const suffixes = {};
const prefixLength = Math.min(3, modelData.minLength);
const suffixLength = Math.min(3, modelData.minLength);
if (prefixLength > 0) {
for (const str of strings) {
for (let len = 1; len <= Math.min(prefixLength, str.length); len++) {
const prefix = str.substring(0, len);
prefixes[prefix] = (prefixes[prefix] || 0) + 1;
}
}
}
if (suffixLength > 0) {
for (const str of strings) {
for (let len = 1; len <= Math.min(suffixLength, str.length); len++) {
const suffix = str.substring(str.length - len);
suffixes[suffix] = (suffixes[suffix] || 0) + 1;
}
}
}
// Keep common prefixes/suffixes (appearing more than once)
modelData.commonPrefixes = Object.fromEntries(Object.entries(prefixes)
.filter(([, count]) => count > 1)
.sort((a, b) => b[1] - a[1])
.slice(0, 10));
modelData.commonSuffixes = Object.fromEntries(Object.entries(suffixes)
.filter(([, count]) => count > 1)
.sort((a, b) => b[1] - a[1])
.slice(0, 10));
// Generate n-grams (2-grams and 3-grams)
const nGrams = {};
for (const str of strings) {
for (let n = 2; n <= Math.min(3, str.length); n++) {
for (let i = 0; i <= str.length - n; i++) {
const nGram = str.substring(i, i + n);
nGrams[nGram] = (nGrams[nGram] || 0) + 1;
}
}
}
// Keep top 20 n-grams that appear more than once
modelData.nGrams = Object.fromEntries(Object.entries(nGrams)
.filter(([, count]) => count > 1)
.sort((a, b) => b[1] - a[1])
.slice(0, 20));
// Analyze value frequencies
const valueFreq = {};
for (const str of strings) {
valueFreq[str] = (valueFreq[str] || 0) + 1;
}
modelData.valueFrequency = valueFreq;
modelData.uniqueValues = Object.keys(valueFreq);
modelData.uniqueValueCount = modelData.uniqueValues.length;
// Store sample values (most frequent ones first)
modelData.sampleValues = Object.entries(valueFreq)
.sort((a, b) => b[1] - a[1])
.slice(0, maxSamples)
.map(([key]) => key);
// Create string model instance and calculate entropy/complexity
const stringModel = new DataFloodStringModel(modelData);
stringModel.calculateEntropy();
stringModel.calculateMaxEntropy();
stringModel.calculateComplexity();
return stringModel;
}
/**
* Generate pattern from string (matching DataFlood C#)
* d = digit, U = uppercase, L = lowercase, s = space, p = punctuation/special
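* @example
* generatePattern('AB-12'); // => 'U{2}pd{2}'
* generatePattern('Abc123'); // => 'UL{2}d{3}'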
*/
generatePattern(input) {
if (!input) return '';
let pattern = '';
for (const ch of input) {
if (/\d/.test(ch)) {
pattern += 'd';
} else if (/[A-Z]/.test(ch)) {
pattern += 'U';
} else if (/[a-z]/.test(ch)) {
pattern += 'L';
} else if (/\s/.test(ch)) {
pattern += 's';
} else {
pattern += 'p'; // punctuation/special
}
}
// Compress consecutive patterns (e.g., "ddd" -> "d{3}")
return pattern.replace(/(.)\1+/g, (match, char) => `${char}{${match.length}}`);
}
/**
* Build histogram from numeric samples
* Matches DataFlood C# HistogramGenerator.cs implementation
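*
* Worked example on a skewed sample (empty bins are dropped; frequencies
* are cumulative percentages):
* @example
* const h = new SchemaInferrer().buildHistogram([1, 1, 2, 2, 10]);
* // range 9, bin width 0.9; three non-empty bins remain:
* // [1, 1.9) count 2 (freq 0-40), [1.9, 2.8) count 2 (freq 40-80),
* // [9.1, 10] count 1 (freq 80-100)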
*/
buildHistogram(numbers) {
if (!numbers || numbers.length === 0) {
return new DataFloodHistogram();
}
const sortedNumbers = [...numbers].sort((a, b) => a - b);
const minValue = sortedNumbers[0];
const maxValue = sortedNumbers[sortedNumbers.length - 1];
const range = maxValue - minValue;
const histogramData = {
bins: [],
totalCount: numbers.length,
minValue: minValue,
maxValue: maxValue,
complexity: 0,
standardDeviation: 0,
entropyScore: 0,
maxEntropy: 0,
};
// Calculate the sample standard deviation (n - 1 denominator)
if (numbers.length > 1) {
const mean = numbers.reduce((sum, n) => sum + n, 0) / numbers.length;
const sumOfSquares = numbers.reduce((sum, n) => sum + Math.pow(n - mean, 2), 0);
histogramData.standardDeviation = Math.sqrt(sumOfSquares / (numbers.length - 1));
}
// Handle case where all values are the same
if (range === 0) {
histogramData.bins.push({
rangeStart: minValue,
rangeEnd: minValue,
count: numbers.length,
freqStart: 0.0,
freqEnd: 100.0
});
const histogram = new DataFloodHistogram(histogramData);
histogram.calculateEntropy();
histogram.calculateComplexity();
return histogram;
}
// Create 10 equal-width bins over the observed range
const binCount = 10;
const binWidth = range / binCount;
let cumulativeFrequency = 0.0;
for (let i = 0; i < binCount; i++) {
const rangeStart = minValue + (i * binWidth);
const rangeEnd = i === binCount - 1 ? maxValue : minValue + ((i + 1) * binWidth);
const binNumbers = numbers.filter(n => {
if (i === binCount - 1) {
return n >= rangeStart && n <= rangeEnd;
}
return n >= rangeStart && n < rangeEnd;
});
// Only add bins that contain data (frequency > 0)
if (binNumbers.length > 0) {
const binFrequencyPercent = Math.round((binNumbers.length / numbers.length) * 100 * 100) / 100;
const freqStart = cumulativeFrequency;
const freqEnd = cumulativeFrequency + binFrequencyPercent;
histogramData.bins.push({
rangeStart: Math.round(rangeStart * 10000) / 10000,
rangeEnd: Math.round(rangeEnd * 10000) / 10000,
count: binNumbers.length,
freqStart: Math.round(freqStart * 100) / 100,
freqEnd: Math.round(freqEnd * 100) / 100
});
cumulativeFrequency = freqEnd;
}
}
// Create histogram instance and calculate entropy/complexity
const histogram = new DataFloodHistogram(histogramData);
histogram.calculateEntropy();
histogram.calculateComplexity();
return histogram;
}
/**
* Merge two schemas
* Based on DataFlood C# MergeSchemas logic
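*
* Required fields intersect; numeric and string bounds keep the most
* restrictive side:
* @example
* new SchemaInferrer().mergeSchemas(
* { type: 'integer', minimum: 0, maximum: 10 },
* { type: 'integer', minimum: 5 }
* );
* // => merged.type 'integer', merged.minimum 5, merged.maximum 10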
*/
mergeSchemas(schema1, schema2) {
if (!schema1) return schema2;
if (!schema2) return schema1;
const merged = new DataFloodModel({
$schema: schema1.$schema || schema2.$schema,
});
// Merge types
if (schema1.type === schema2.type) {
merged.type = schema1.type;
} else if (schema1.type && schema2.type) {
// Different types - use anyOf
merged.anyOf = [
{ type: schema1.type },
{ type: schema2.type }
];
}
// Merge properties for objects
if (schema1.properties || schema2.properties) {
merged.properties = {};
// Add all properties from both schemas
const allProps = new Set([
...Object.keys(schema1.properties || {}),
...Object.keys(schema2.properties || {})
]);
for (const prop of allProps) {
const prop1 = schema1.properties?.[prop];
const prop2 = schema2.properties?.[prop];
if (prop1 && prop2) {
// Property exists in both - merge them
merged.properties[prop] = this.mergeSchemas(prop1, prop2);
} else {
// Property exists in only one schema
merged.properties[prop] = prop1 || prop2;
}
}
}
// Merge required fields (intersection - only if required in both)
if (schema1.required && schema2.required) {
const required = schema1.required.filter(r => schema2.required.includes(r));
if (required.length > 0) {
merged.required = required;
}
}
// Merge array items
if (schema1.items || schema2.items) {
merged.items = this.mergeSchemas(schema1.items, schema2.items);
}
// Preserve constraints (take most restrictive)
if (schema1.minimum !== undefined || schema2.minimum !== undefined) {
merged.minimum = Math.max(schema1.minimum ?? -Infinity, schema2.minimum ?? -Infinity);
}
if (schema1.maximum !== undefined || schema2.maximum !== undefined) {
merged.maximum = Math.min(schema1.maximum ?? Infinity, schema2.maximum ?? Infinity);
}
if (schema1.minLength !== undefined || schema2.minLength !== undefined) {
merged.minLength = Math.max(schema1.minLength ?? 0, schema2.minLength ?? 0);
}
if (schema1.maxLength !== undefined || schema2.maxLength !== undefined) {
merged.maxLength = Math.min(schema1.maxLength ?? Infinity, schema2.maxLength ?? Infinity);
}
// Merge string models and histograms
if (schema1.stringModel || schema2.stringModel) {
// For now, prefer the first schema's model
merged.stringModel = schema1.stringModel || schema2.stringModel;
}
if (schema1.histogram || schema2.histogram) {
merged.histogram = schema1.histogram || schema2.histogram;
}
// Merge format and pattern (first schema wins when both define one)
if (schema1.format || schema2.format) {
merged.format = schema1.format || schema2.format;
}
if (schema1.pattern || schema2.pattern) {
merged.pattern = schema1.pattern || schema2.pattern;
}
return merged;
}
}
// Export default instance and class
const defaultInferrer = new SchemaInferrer();
export default defaultInferrer;
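
// Illustrative consumer (import path is hypothetical):
//   import inferrer from './inference/schema-inferrer.js';
//   const schema = inferrer.inferSchema(documents);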