export interface ChunkMetadata {
contentId: string;
totalChunks: number;
originalSize: number;
contentType: 'json' | 'text';
compressed: boolean;
encoding?: string;
}
export interface ChunkRecord {
id?: number;
content_id: string;
chunk_index: number;
chunk_data: string;
chunk_size: number;
}
export interface GraphQLFieldInfo {
name: string;
type: string;
isList: boolean;
isNullable: boolean;
description?: string;
}
export interface GraphQLTypeInfo {
name: string;
kind: 'OBJECT' | 'SCALAR' | 'ENUM' | 'INTERFACE';
fields: Record<string, GraphQLFieldInfo>;
description?: string;
}
export interface GraphQLSchemaInfo {
types: Record<string, GraphQLTypeInfo>;
relationships: Array<{
fromType: string;
toType: string;
fieldName: string;
cardinality: string;
}>;
}
export interface FieldChunkingRule {
fieldName: string;
typeName: string; // '*' for all types
chunkThreshold: number;
priority: 'always' | 'size-based' | 'never';
reason?: string;
}
/**
* ChunkingEngine handles storage and retrieval of large content by breaking it into chunks.
* This improves performance, avoids SQLite size limits, and enables better memory management.
*
* Key features:
* - Automatic chunking for content over a size threshold (32KB by default)
* - Optional compression for better storage efficiency
* - Transparent reassembly during retrieval
* - Schema-aware chunking strategies
* - Extensible to any MCP server handling large responses
*/
export class ChunkingEngine {
private readonly CHUNK_SIZE_THRESHOLD = 32 * 1024; // 32KB default; content above this is chunked
private readonly CHUNK_SIZE = 16 * 1024; // 16KB per chunk keeps individual rows well within SQLite limits
private readonly ENABLE_COMPRESSION = true; // Feature flag for compression
private schemaInfo?: GraphQLSchemaInfo;
private chunkingRules: FieldChunkingRule[] = [];
/**
* Configure schema-aware chunking
*/
configureSchemaAwareness(schemaInfo: GraphQLSchemaInfo): void {
this.schemaInfo = schemaInfo;
this.generateChunkingRules();
}
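// Illustrative sketch (hypothetical type and field names): a minimal
// GraphQLSchemaInfo is enough to enable the schema-derived rules generated below.
//
//   const engine = new ChunkingEngine();
//   engine.configureSchemaAwareness({
//     types: {
//       Gene: {
//         name: 'Gene',
//         kind: 'OBJECT',
//         fields: {
//           interactions: { name: 'interactions', type: 'Interaction', isList: true, isNullable: true },
//         },
//       },
//     },
//     relationships: [],
//   });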
/**
* Generate intelligent chunking rules based on GraphQL schema
*/
private generateChunkingRules(): void {
if (!this.schemaInfo) return;
this.chunkingRules = [
// Never chunk ID fields
{ fieldName: 'id', typeName: '*', chunkThreshold: Infinity, priority: 'never', reason: 'ID fields should never be chunked' },
// Always chunk known large text fields
{ fieldName: 'description', typeName: '*', chunkThreshold: 1024, priority: 'always', reason: 'Descriptions are typically large and compressible' },
{ fieldName: 'summary', typeName: '*', chunkThreshold: 1024, priority: 'always', reason: 'Summaries are typically large and compressible' },
{ fieldName: 'statement', typeName: '*', chunkThreshold: 2048, priority: 'always', reason: 'Evidence statements can be very long' },
// DGIdb-specific large fields
{ fieldName: 'interactions', typeName: '*', chunkThreshold: 4096, priority: 'size-based', reason: 'Interaction data can be extensive' },
{ fieldName: 'sources', typeName: '*', chunkThreshold: 2048, priority: 'size-based', reason: 'Source data can contain large nested objects' },
// Conservative chunking for names and short text
{ fieldName: 'name', typeName: '*', chunkThreshold: 512, priority: 'size-based', reason: 'Names are usually short but could be long in some cases' },
{ fieldName: 'displayName', typeName: '*', chunkThreshold: 512, priority: 'size-based', reason: 'Display names are usually short' },
];
// Generate type-specific rules based on schema analysis
for (const [typeName, typeInfo] of Object.entries(this.schemaInfo.types)) {
if (typeInfo.kind === 'OBJECT') {
for (const [fieldName, fieldInfo] of Object.entries(typeInfo.fields)) {
// Large list fields should be chunked aggressively
if (fieldInfo.isList && this.isLikelyLargeContent(fieldInfo)) {
this.chunkingRules.push({
fieldName,
typeName,
chunkThreshold: 8192,
priority: 'size-based',
reason: `List field ${fieldName} on ${typeName} likely contains large content`
});
}
}
}
}
}
/**
* Determine if a field is likely to contain large content based on schema info
*/
private isLikelyLargeContent(fieldInfo: GraphQLFieldInfo): boolean {
const largeContentIndicators = [
'description', 'summary', 'statement', 'content', 'text', 'body',
'metadata', 'details', 'comments', 'notes', 'evidence', 'interactions'
];
return largeContentIndicators.some(indicator =>
fieldInfo.name.toLowerCase().includes(indicator) ||
fieldInfo.description?.toLowerCase().includes(indicator)
);
}
/**
* Schema-aware JSON stringification with intelligent chunking decisions
*/
async schemaAwareJsonStringify(
obj: any,
typeName: string,
fieldName: string,
sql: any
): Promise<string> {
const jsonString = JSON.stringify(obj);
// Check schema-based chunking rules first
const applicableRule = this.getApplicableChunkingRule(fieldName, typeName);
if (applicableRule) {
if (applicableRule.priority === 'never') {
return jsonString;
}
// 'always' and 'size-based' rules both chunk once the content exceeds the
// rule's threshold; anything below it falls through to the default check
if (jsonString.length > applicableRule.chunkThreshold) {
const metadata = await this.storeChunkedContent(jsonString, 'json', sql);
return this.createContentReference(metadata);
}
}
// Fallback to default behavior
if (!this.shouldChunk(jsonString)) {
return jsonString;
}
const metadata = await this.storeChunkedContent(jsonString, 'json', sql);
return this.createContentReference(metadata);
}
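// Usage sketch (hypothetical type/field names; `sql` is the same SQLite-style
// handle with `exec(query, ...params)` used throughout this class):
//
//   const stored = await engine.schemaAwareJsonStringify(
//     geneRow.interactions, // value to serialize
//     'Gene',               // GraphQL type name
//     'interactions',       // field name, matched against the chunking rules
//     sql
//   );
//   // `stored` is either raw JSON or a '__CHUNKED__:<id>' reference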
/**
* Get the most specific chunking rule for a field
*/
private getApplicableChunkingRule(fieldName: string, typeName: string): FieldChunkingRule | null {
// Try exact type match first
let rule = this.chunkingRules.find(r => r.fieldName === fieldName && r.typeName === typeName);
if (rule) return rule;
// Try wildcard type match
rule = this.chunkingRules.find(r => r.fieldName === fieldName && r.typeName === '*');
if (rule) return rule;
return null;
}
/**
* Analyze chunking effectiveness and provide recommendations
*/
async analyzeChunkingEffectiveness(sql: any): Promise<any> {
const stats = await this.getChunkingStats(sql);
if (!this.schemaInfo) {
return {
...stats,
recommendation: "Enable schema-aware chunking for better optimization",
schema_awareness: false
};
}
// Analyze which fields are being chunked most
const fieldAnalysis = await this.analyzeChunkedFields(sql);
return {
...stats,
schema_awareness: true,
field_analysis: fieldAnalysis,
recommendations: this.generateChunkingRecommendations(fieldAnalysis)
};
}
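// Result shape when schema awareness is configured (values illustrative):
//
//   {
//     metadata: { total_items, total_original_size, avg_original_size, ... },
//     chunks: { total_chunk_records, total_stored_size, avg_chunk_size },
//     compression_ratio: '2.14' (or null),
//     schema_awareness: true,
//     field_analysis: [ /* one row per content_type/compressed grouping */ ],
//     recommendations: [ /* strings */ ]
//   }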
private async analyzeChunkedFields(sql: any): Promise<any> {
// Summarize chunk_metadata to show which kinds of content are chunked
// most often and how large they are, as input for threshold tuning
try {
const result = sql.exec(`
SELECT
content_type,
compressed,
COUNT(*) as item_count,
SUM(original_size) as total_original_size,
AVG(original_size) as avg_original_size
FROM chunk_metadata
GROUP BY content_type, compressed
ORDER BY item_count DESC
`).toArray();
return result;
} catch (error) {
return { error: "Could not analyze chunked fields" };
}
}
private generateChunkingRecommendations(fieldAnalysis: any): string[] {
const recommendations: string[] = [];
if (this.chunkingRules.length === 0) {
recommendations.push("Configure field-specific chunking rules based on your GraphQL schema");
}
if (fieldAnalysis && fieldAnalysis.length > 0) {
const uncompressedCount = fieldAnalysis.filter((f: any) => !f.compressed).length;
if (uncompressedCount > 0) {
recommendations.push("Enable compression for better storage efficiency");
}
}
recommendations.push("Monitor chunk size distribution and adjust thresholds based on query patterns");
return recommendations;
}
/**
* Determines if content should be chunked based on size threshold
*/
shouldChunk(content: string): boolean {
return content.length > this.CHUNK_SIZE_THRESHOLD;
}
/**
* Stores large content as chunks, returns metadata for retrieval
*/
async storeChunkedContent(
content: string,
contentType: 'json' | 'text',
sql: any
): Promise<ChunkMetadata> {
const contentId = this.generateContentId();
let processedContent = content;
let compressed = false;
// Optional compression (when available in environment)
if (this.ENABLE_COMPRESSION && this.shouldCompress(content)) {
try {
processedContent = await this.compress(content);
compressed = true;
} catch (error) {
console.warn('Compression failed, storing uncompressed:', error);
processedContent = content;
}
}
// Ensure chunks table exists
await this.ensureChunksTable(sql);
// Split into chunks
const chunks = this.splitIntoChunks(processedContent);
// Store each chunk
for (let i = 0; i < chunks.length; i++) {
const chunkRecord: ChunkRecord = {
content_id: contentId,
chunk_index: i,
chunk_data: chunks[i],
chunk_size: chunks[i].length
};
await this.insertChunk(chunkRecord, sql);
}
// Store metadata
const metadata: ChunkMetadata = {
contentId,
totalChunks: chunks.length,
originalSize: content.length,
contentType,
compressed,
encoding: compressed ? 'gzip' : undefined
};
await this.storeMetadata(metadata, sql);
return metadata;
}
/**
* Retrieves and reassembles chunked content
*/
async retrieveChunkedContent(contentId: string, sql: any): Promise<string | null> {
try {
// Get metadata
const metadata = await this.getMetadata(contentId, sql);
if (!metadata) return null;
// Retrieve all chunks in order
const chunks = await this.getChunks(contentId, metadata.totalChunks, sql);
if (chunks.length !== metadata.totalChunks) {
throw new Error(`Missing chunks: expected ${metadata.totalChunks}, found ${chunks.length}`);
}
// Reassemble content
const reassembled = chunks.join('');
// Decompress if needed
if (metadata.compressed) {
try {
return await this.decompress(reassembled);
} catch (error) {
console.error('Decompression failed:', error);
throw new Error('Failed to decompress content');
}
}
return reassembled;
} catch (error) {
console.error(`Failed to retrieve chunked content ${contentId}:`, error);
return null;
}
}
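// Round-trip sketch for raw text (same `sql` handle assumption as elsewhere):
//
//   const meta = await engine.storeChunkedContent(longText, 'text', sql);
//   const back = await engine.retrieveChunkedContent(meta.contentId, sql);
//   // back === longText, or null if metadata/chunks are missing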
/**
* Creates a content reference for schema columns instead of storing large content directly
*/
createContentReference(metadata: ChunkMetadata): string {
return `__CHUNKED__:${metadata.contentId}`;
}
/**
* Checks if a value is a chunked content reference
*/
isContentReference(value: any): boolean {
return typeof value === 'string' && value.startsWith('__CHUNKED__:');
}
/**
* Extracts content ID from a content reference
*/
extractContentId(reference: string): string {
return reference.replace('__CHUNKED__:', '');
}
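// Reference helpers in practice (the contentId shown is illustrative):
//
//   const ref = engine.createContentReference(meta); // '__CHUNKED__:chunk_ab12...'
//   engine.isContentReference(ref);                  // true
//   engine.extractContentId(ref);                    // 'chunk_ab12...'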
/**
* Enhanced JSON stringification with automatic chunking
*/
async smartJsonStringify(obj: any, sql: any): Promise<string> {
const jsonString = JSON.stringify(obj);
if (!this.shouldChunk(jsonString)) {
return jsonString;
}
// Store as chunks and return reference
const metadata = await this.storeChunkedContent(jsonString, 'json', sql);
return this.createContentReference(metadata);
}
/**
* Enhanced JSON parsing with automatic chunk retrieval
*/
async smartJsonParse(value: string, sql: any): Promise<any> {
if (!this.isContentReference(value)) {
return JSON.parse(value);
}
const contentId = this.extractContentId(value);
const retrievedContent = await this.retrieveChunkedContent(contentId, sql);
if (!retrievedContent) {
throw new Error(`Failed to retrieve chunked content: ${contentId}`);
}
return JSON.parse(retrievedContent);
}
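// Write and read sides are symmetric; a sketch with a hypothetical `data`
// TEXT column (same `sql` handle assumption):
//
//   const value = await engine.smartJsonStringify(bigObject, sql);
//   // ...store `value` in the data column...
//   const parsed = await engine.smartJsonParse(row.data, sql);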
/**
* Cleanup chunked content (for maintenance)
*/
async cleanupChunkedContent(contentId: string, sql: any): Promise<void> {
try {
// Delete chunks
sql.exec(
`DELETE FROM content_chunks WHERE content_id = ?`,
contentId
);
// Delete metadata
sql.exec(
`DELETE FROM chunk_metadata WHERE content_id = ?`,
contentId
);
} catch (error) {
console.error(`Failed to cleanup chunked content ${contentId}:`, error);
}
}
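// Maintenance sketch: when a row holding a chunk reference is deleted, the
// orphaned chunks can be removed as well (hypothetical `row.data` column):
//
//   if (engine.isContentReference(row.data)) {
//     await engine.cleanupChunkedContent(engine.extractContentId(row.data), sql);
//   }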
/**
* Get statistics about chunked content storage
*/
async getChunkingStats(sql: any): Promise<any> {
try {
const metadataQuery = sql.exec(`
SELECT
COUNT(*) as total_items,
SUM(original_size) as total_original_size,
AVG(original_size) as avg_original_size,
SUM(total_chunks) as total_chunks,
COUNT(CASE WHEN compressed = 1 THEN 1 END) as compressed_items
FROM chunk_metadata
`);
const metadataResults = metadataQuery.toArray();
let metadataResult: any = {};
if (metadataResults.length === 1) {
metadataResult = metadataResults[0];
}
const chunksQuery = sql.exec(`
SELECT
COUNT(*) as total_chunk_records,
SUM(chunk_size) as total_stored_size,
AVG(chunk_size) as avg_chunk_size
FROM content_chunks
`);
const chunksResults = chunksQuery.toArray();
let chunksResult: any = {};
if (chunksResults.length === 1) {
chunksResult = chunksResults[0];
}
return {
metadata: metadataResult || {},
chunks: chunksResult || {},
compression_ratio: metadataResult?.total_original_size && chunksResult?.total_stored_size
? (metadataResult.total_original_size / chunksResult.total_stored_size).toFixed(2)
: null
};
} catch (error) {
return { error: error instanceof Error ? error.message : 'Failed to get stats' };
}
}
// Private helper methods
private generateContentId(): string {
return 'chunk_' + crypto.randomUUID().replace(/-/g, '');
}
private shouldCompress(content: string): boolean {
// Compress content larger than 8KB (good compression threshold)
return content.length > 8192;
}
private async compress(content: string): Promise<string> {
// Uses the Web Streams CompressionStream (gzip) when available and
// base64-encodes the output for safe TEXT storage; otherwise falls back
// to plain base64 without real compression
try {
const uint8Array = new TextEncoder().encode(content);
// Check if CompressionStream is available (modern browsers/runtimes)
if (typeof CompressionStream !== 'undefined') {
const compressionStream = new CompressionStream('gzip');
const writer = compressionStream.writable.getWriter();
const reader = compressionStream.readable.getReader();
// Write and close are intentionally not awaited; the read loop below
// drains the output (awaiting writes before reading can stall under backpressure)
writer.write(uint8Array);
writer.close();
const chunks: Uint8Array[] = [];
let result = await reader.read();
while (!result.done) {
chunks.push(result.value);
result = await reader.read();
}
// Combine chunks and encode to base64
const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
const combined = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of chunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
// Base64-encode without spreading the whole array into a single call
// (avoids argument-count limits on large payloads)
let binary = '';
for (let i = 0; i < combined.length; i += 0x8000) {
binary += String.fromCharCode(...combined.subarray(i, i + 0x8000));
}
return btoa(binary);
}
// Fallback: plain base64 (no real compression); throws for non-Latin1
// content, in which case the caller stores the content uncompressed
return btoa(content);
} catch (error) {
throw new Error(`Compression failed: ${error}`);
}
}
private async decompress(compressedContent: string): Promise<string> {
try {
// Check if DecompressionStream is available
if (typeof DecompressionStream !== 'undefined') {
const compressedData = Uint8Array.from(atob(compressedContent), c => c.charCodeAt(0));
const decompressionStream = new DecompressionStream('gzip');
const writer = decompressionStream.writable.getWriter();
const reader = decompressionStream.readable.getReader();
writer.write(compressedData);
writer.close();
const chunks: Uint8Array[] = [];
let result = await reader.read();
while (!result.done) {
chunks.push(result.value);
result = await reader.read();
}
const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
const combined = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of chunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
return new TextDecoder().decode(combined);
}
// Fallback from base64
return atob(compressedContent);
} catch (error) {
throw new Error(`Decompression failed: ${error}`);
}
}
private splitIntoChunks(content: string): string[] {
const chunks: string[] = [];
for (let i = 0; i < content.length; i += this.CHUNK_SIZE) {
chunks.push(content.slice(i, i + this.CHUNK_SIZE));
}
return chunks;
}
private async ensureChunksTable(sql: any): Promise<void> {
// Create chunks table
sql.exec(`
CREATE TABLE IF NOT EXISTS content_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
content_id TEXT NOT NULL,
chunk_index INTEGER NOT NULL,
chunk_data TEXT NOT NULL,
chunk_size INTEGER NOT NULL,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(content_id, chunk_index)
)
`);
// Create metadata table
sql.exec(`
CREATE TABLE IF NOT EXISTS chunk_metadata (
content_id TEXT PRIMARY KEY,
total_chunks INTEGER NOT NULL,
original_size INTEGER NOT NULL,
content_type TEXT NOT NULL,
compressed INTEGER DEFAULT 0,
encoding TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP
)
`);
// Create indexes for performance
sql.exec(`CREATE INDEX IF NOT EXISTS idx_content_chunks_lookup ON content_chunks(content_id, chunk_index)`);
sql.exec(`CREATE INDEX IF NOT EXISTS idx_chunk_metadata_size ON chunk_metadata(original_size)`);
}
private async insertChunk(chunk: ChunkRecord, sql: any): Promise<void> {
sql.exec(
`INSERT INTO content_chunks (content_id, chunk_index, chunk_data, chunk_size)
VALUES (?, ?, ?, ?)`,
chunk.content_id,
chunk.chunk_index,
chunk.chunk_data,
chunk.chunk_size
);
}
private async storeMetadata(metadata: ChunkMetadata, sql: any): Promise<void> {
sql.exec(
`INSERT INTO chunk_metadata (content_id, total_chunks, original_size, content_type, compressed, encoding)
VALUES (?, ?, ?, ?, ?, ?)`,
metadata.contentId,
metadata.totalChunks,
metadata.originalSize,
metadata.contentType,
metadata.compressed ? 1 : 0,
metadata.encoding || null
);
}
private async getMetadata(contentId: string, sql: any): Promise<ChunkMetadata | null> {
const query = sql.exec(
`SELECT * FROM chunk_metadata WHERE content_id = ?`,
contentId
);
const results = query.toArray();
if (results.length !== 1) return null;
const result = results[0];
return {
contentId: result.content_id,
totalChunks: result.total_chunks,
originalSize: result.original_size,
contentType: result.content_type,
compressed: Boolean(result.compressed),
encoding: result.encoding
};
}
private async getChunks(contentId: string, expectedCount: number, sql: any): Promise<string[]> {
const results = sql.exec(
`SELECT chunk_data FROM content_chunks
WHERE content_id = ?
ORDER BY chunk_index ASC`,
contentId
).toArray();
return results.map((row: any) => row.chunk_data);
}
}
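// End-to-end usage sketch. The `sql` parameter throughout is assumed to be a
// SQLite-style handle whose `exec(query, ...params)` returns results exposing
// `.toArray()` (for example, a Cloudflare Durable Object's `ctx.storage.sql`);
// that assumption is inferred from the calls above, not guaranteed here.
// Table and variable names below are hypothetical.
//
//   const engine = new ChunkingEngine();
//   const payload = await engine.smartJsonStringify(largeGraphQLResponse, sql);
//   sql.exec(`UPDATE results SET data = ? WHERE id = ?`, payload, rowId);
//
//   const row = sql.exec(`SELECT data FROM results WHERE id = ?`, rowId).toArray()[0];
//   const restored = await engine.smartJsonParse(row.data, sql);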