Skip to main content
Glama
normalize.ts30.2 kB
/** * IndexFoundry-MCP: Normalize Tools (Phase 3) * * Chunking, enrichment, and deduplication tools. * All operations are deterministic and produce auditable outputs. * * Copyright (c) 2024 vario.automation * Proprietary and confidential. All rights reserved. */ import * as path from "path"; import * as fs from "fs/promises"; import type { DocumentChunk, ToolError } from "../types.js"; import type { NormalizeChunkInput, NormalizeEnrichInput, NormalizeDedupeInput } from "../schemas.js"; import { sha256, generateChunkId, pathExists, ensureDir, readJsonl, appendJsonl, writeJson, normalizeText, estimateTokens, hashConfig, createToolError, now, parseHeadingLevel, getParentContext, } from "../utils.js"; import { getRunManager } from "../run-manager.js"; // ============================================================================ // Normalize Chunk // ============================================================================ export interface NormalizeChunkResult { success: boolean; output_path: string; stats: { documents_processed: number; chunks_created: number; chunks_below_min: number; chunks_at_max: number; avg_chunk_chars: number; total_chars: number; }; chunker_config: { strategy: string; max_chars: number; overlap_chars: number; config_hash: string; }; } /** * Split text using recursive strategy */ function recursiveChunk( text: string, maxChars: number, minChars: number, overlap: number, separators: string[] ): string[] { // If text fits, return as single chunk if (text.length <= maxChars) { return [text]; } // Try each separator in order for (const sep of separators) { const parts = text.split(sep); // If we got multiple parts, process them if (parts.length > 1) { const chunks: string[] = []; let current = ""; for (const part of parts) { const candidate = current ? current + sep + part : part; if (candidate.length <= maxChars) { current = candidate; } else { // Push current chunk if it exists if (current.length >= minChars) { chunks.push(current); } else if (current && chunks.length > 0) { // Append small chunk to previous chunks[chunks.length - 1] += sep + current; } // Start new chunk if (part.length > maxChars) { // Part is too large, recurse with next separator const subChunks = recursiveChunk( part, maxChars, minChars, overlap, separators.slice(separators.indexOf(sep) + 1) ); chunks.push(...subChunks); current = ""; } else { current = part; } } } // Don't forget last chunk if (current.length >= minChars) { chunks.push(current); } else if (current && chunks.length > 0) { chunks[chunks.length - 1] += sep + current; } // Apply overlap if (overlap > 0 && chunks.length > 1) { const overlapped: string[] = [chunks[0]]; for (let i = 1; i < chunks.length; i++) { const prevChunk = chunks[i - 1]; const overlapText = prevChunk.slice(-overlap); overlapped.push(overlapText + chunks[i]); } return overlapped; } return chunks; } } // No separator worked, hard split const chunks: string[] = []; for (let i = 0; i < text.length; i += maxChars - overlap) { chunks.push(text.slice(i, i + maxChars)); } return chunks; } /** * Internal result type for hierarchical chunking with parent tracking. * Used to map temporary IDs to real SHA256-based chunk IDs. */ interface HierarchicalChunkResult { /** The text content of this chunk */ text: string; /** Hierarchy level: 1-6 for h1-h6, 0 for non-heading content */ level: number; /** Temporary ID of parent chunk (resolved to real SHA256 ID later) */ parentId?: string; /** Truncated context from parent chunk for semantic continuity */ parentContext?: string; /** Temporary chunk ID (replaced with deterministic SHA256 ID later) */ chunkId?: string; } /** * Performs hierarchical chunking on markdown content, creating parent-child * relationships based on heading structure (h1-h6). * * This strategy: * - Parses ATX-style headings (# through ######) * - Creates chunks for each heading + its content * - Tracks parent-child relationships between heading levels * - Includes configurable parent context in child chunks * - Produces deterministic output for the same input * * @param text - The markdown content to chunk * @param _maxChars - Maximum characters per chunk (reserved for future chunking within sections) * @param minChars - Minimum characters for a chunk to be emitted * @param _overlap - Character overlap between chunks (reserved for future use) * @param createParentChunks - Whether to register headings as parents for their level * @param parentContextChars - Characters of parent context to include in children (0 to disable) * @returns Array of HierarchicalChunkResult with hierarchy metadata * * @example * ```typescript * const markdown = "# Title\nContent\n## Section\nMore content"; * const chunks = hierarchicalChunk(markdown, 1500, 50, 0, true, 100); * // Returns: * // [ * // { text: "# Title\nContent", level: 1, chunkId: "chunk_0" }, * // { text: "## Section\nMore content", level: 2, * // parentId: "chunk_0", parentContext: "# Title\nContent", chunkId: "chunk_1" } * // ] * ``` */ function hierarchicalChunk( text: string, _maxChars: number, minChars: number, _overlap: number, createParentChunks: boolean, parentContextChars: number ): HierarchicalChunkResult[] { const results: HierarchicalChunkResult[] = []; const lines = text.split('\n'); // Track parent at each level (index 0 unused, 1-6 for heading levels) const parentStack: Array<{ id: string; context: string; level: number } | undefined> = new Array(7).fill(undefined); let currentContent = ''; let currentLevel = 0; let currentHeading = ''; let chunkIndex = 0; const generateTempId = () => `chunk_${chunkIndex++}`; const flushContent = () => { if (!currentContent.trim()) return; const contentText = currentHeading ? `${currentHeading}\n${currentContent.trim()}` : currentContent.trim(); if (contentText.length >= minChars || results.length === 0) { const chunkId = generateTempId(); // Find the closest parent at a higher level let parentId: string | undefined; let parentContext: string | undefined; for (let level = currentLevel - 1; level >= 1; level--) { if (parentStack[level]) { parentId = parentStack[level]!.id; if (parentContextChars > 0) { parentContext = parentStack[level]!.context.slice(0, parentContextChars); } break; } } results.push({ text: contentText, level: currentLevel, parentId, parentContext: parentContext || undefined, chunkId }); // If this is a heading, register as parent for its level if (currentLevel > 0 && createParentChunks) { parentStack[currentLevel] = { id: chunkId, context: contentText, level: currentLevel }; // Clear parents at deeper levels when we encounter a new heading for (let i = currentLevel + 1; i <= 6; i++) { parentStack[i] = undefined; } } } }; for (let i = 0; i < lines.length; i++) { const line = lines[i]; const headingLevel = parseHeadingLevel(line); if (headingLevel !== null) { // Flush previous content before starting new section flushContent(); currentHeading = line; currentLevel = headingLevel; currentContent = ''; // Look ahead to get content under this heading const contentLines: string[] = []; let j = i + 1; while (j < lines.length) { const nextLine = lines[j]; const nextHeadingLevel = parseHeadingLevel(nextLine); if (nextHeadingLevel !== null) { break; // Stop at next heading } contentLines.push(nextLine); j++; } currentContent = contentLines.join('\n'); i = j - 1; // Skip the lines we consumed } else { // Not a heading, accumulate content if (currentLevel === 0) { currentContent += (currentContent ? '\n' : '') + line; } } } // Flush any remaining content flushContent(); return results; } /** * Split text by paragraph (double newlines) */ function paragraphChunk(text: string, maxChars: number, minChars: number): string[] { const paragraphs = text.split(/\n\n+/); const chunks: string[] = []; let current = ""; for (const para of paragraphs) { const trimmed = para.trim(); if (!trimmed) continue; if (current.length + trimmed.length + 2 <= maxChars) { current = current ? current + "\n\n" + trimmed : trimmed; } else { if (current) chunks.push(current); current = trimmed; } } if (current) chunks.push(current); // Merge small chunks return mergeSmallChunks(chunks, minChars, maxChars); } /** * Split text by headings (markdown) */ function headingChunk(text: string, maxChars: number, minChars: number): string[] { const sections = text.split(/^(#{1,6}\s.+)$/m); const chunks: string[] = []; let current = ""; let currentHeading = ""; for (let i = 0; i < sections.length; i++) { const section = sections[i].trim(); if (!section) continue; if (section.startsWith("#")) { // It's a heading if (current) { chunks.push(currentHeading + "\n\n" + current); } currentHeading = section; current = ""; } else { // It's content if (current.length + section.length + 2 <= maxChars) { current = current ? current + "\n\n" + section : section; } else { if (current) chunks.push(currentHeading + "\n\n" + current); current = section; } } } if (current) { chunks.push(currentHeading + "\n\n" + current); } return mergeSmallChunks(chunks, minChars, maxChars); } /** * Merge chunks that are too small */ function mergeSmallChunks(chunks: string[], minChars: number, maxChars: number): string[] { const result: string[] = []; let buffer = ""; for (const chunk of chunks) { if (buffer.length + chunk.length + 2 <= maxChars) { buffer = buffer ? buffer + "\n\n" + chunk : chunk; } else { if (buffer) result.push(buffer); buffer = chunk; } } if (buffer) result.push(buffer); return result; } export async function normalizeChunk(input: NormalizeChunkInput): Promise<NormalizeChunkResult | ToolError> { const manager = getRunManager(); const runDir = manager.getRunDir(input.run_id); // Ensure run exists with full infrastructure await manager.ensureRun(input.run_id); const normalizedDir = manager.getNormalizedDir(input.run_id); try { // Check output path const outputPath = path.join(normalizedDir, "chunks.jsonl"); const configHash = hashConfig({ strategy: input.strategy, max_chars: input.max_chars, min_chars: input.min_chars, overlap_chars: input.overlap_chars, split_hierarchy: input.split_hierarchy, }); // Track stats let documentsProcessed = 0; let chunksCreated = 0; let chunksBelowMin = 0; let chunksAtMax = 0; let totalChars = 0; const allChunks: DocumentChunk[] = []; // Process each input file for (const inputPath of input.input_paths.sort()) { const fullPath = path.join(runDir, inputPath); if (!await pathExists(fullPath)) { continue; } // Read source content const content = await fs.readFile(fullPath, "utf-8"); const normalizedContent = normalizeText(content); const docHash = sha256(normalizedContent); documentsProcessed++; // Check if using hierarchical strategy if (input.strategy === "hierarchical") { // Use hierarchical chunking with parent tracking const hierarchicalResults = hierarchicalChunk( normalizedContent, input.max_chars, input.min_chars, input.overlap_chars, input.create_parent_chunks, input.parent_context_chars ); // Map temp chunk IDs to real chunk IDs const tempIdToRealId = new Map<string, string>(); let byteOffset = 0; for (let i = 0; i < hierarchicalResults.length; i++) { const result = hierarchicalResults[i]; const chunkText = result.text; const textHash = sha256(chunkText); const byteEnd = byteOffset + Buffer.byteLength(chunkText, "utf-8"); const realChunkId = generateChunkId(docHash, byteOffset, byteEnd); // Map temp ID to real ID if (result.chunkId) { tempIdToRealId.set(result.chunkId, realChunkId); } // Resolve parent ID from temp ID to real ID let resolvedParentId: string | undefined; if (result.parentId) { resolvedParentId = tempIdToRealId.get(result.parentId); } const chunk: DocumentChunk = { doc_id: docHash, chunk_id: realChunkId, chunk_index: i, hierarchy_level: result.level, parent_id: resolvedParentId, parent_context: result.parentContext, source: { type: detectSourceType(inputPath), uri: inputPath, retrieved_at: now(), content_hash: docHash, }, content: { text: chunkText, text_hash: textHash, char_count: chunkText.length, token_count_approx: estimateTokens(chunkText), }, position: { byte_start: byteOffset, byte_end: byteEnd, }, metadata: { content_type: detectContentType(inputPath), }, }; allChunks.push(chunk); chunksCreated++; totalChars += chunkText.length; if (chunkText.length < input.min_chars) { chunksBelowMin++; } if (chunkText.length >= input.max_chars - 10) { chunksAtMax++; } byteOffset = byteEnd; } } else { // Chunk based on strategy let chunks: string[]; switch (input.strategy) { case "recursive": chunks = recursiveChunk( normalizedContent, input.max_chars, input.min_chars, input.overlap_chars, input.split_hierarchy ); break; case "by_paragraph": chunks = paragraphChunk(normalizedContent, input.max_chars, input.min_chars); break; case "by_heading": chunks = headingChunk(normalizedContent, input.max_chars, input.min_chars); break; case "fixed_chars": chunks = []; for (let i = 0; i < normalizedContent.length; i += input.max_chars - input.overlap_chars) { chunks.push(normalizedContent.slice(i, i + input.max_chars)); } break; case "by_sentence": const sentences = normalizedContent.split(/(?<=[.!?])\s+/); chunks = []; let current = ""; for (const sentence of sentences) { if (current.length + sentence.length + 1 <= input.max_chars) { current = current ? current + " " + sentence : sentence; } else { if (current) chunks.push(current); current = sentence; } } if (current) chunks.push(current); break; case "by_page": // For page-based, assume page markers or use fixed splits chunks = normalizedContent.split(/\f|\n{4,}/).filter(Boolean); break; default: chunks = recursiveChunk( normalizedContent, input.max_chars, input.min_chars, input.overlap_chars, input.split_hierarchy ); } // Create DocumentChunk records let byteOffset = 0; for (let i = 0; i < chunks.length; i++) { const chunkText = chunks[i]; const textHash = sha256(chunkText); const byteEnd = byteOffset + Buffer.byteLength(chunkText, "utf-8"); const chunk: DocumentChunk = { doc_id: docHash, chunk_id: generateChunkId(docHash, byteOffset, byteEnd), chunk_index: i, hierarchy_level: 0, // Non-hierarchical chunks get level 0 source: { type: detectSourceType(inputPath), uri: inputPath, retrieved_at: now(), content_hash: docHash, }, content: { text: chunkText, text_hash: textHash, char_count: chunkText.length, token_count_approx: estimateTokens(chunkText), }, position: { byte_start: byteOffset, byte_end: byteEnd, }, metadata: { content_type: detectContentType(inputPath), }, }; allChunks.push(chunk); chunksCreated++; totalChars += chunkText.length; if (chunkText.length < input.min_chars) { chunksBelowMin++; } if (chunkText.length >= input.max_chars - 10) { chunksAtMax++; } byteOffset = byteEnd; } } } // Write all chunks (sorted for determinism) allChunks.sort((a, b) => { const docCompare = a.doc_id.localeCompare(b.doc_id); if (docCompare !== 0) return docCompare; return a.chunk_index - b.chunk_index; }); // Clear existing file and write fresh await fs.writeFile(outputPath, ""); await appendJsonl(outputPath, allChunks); return { success: true, output_path: "normalized/chunks.jsonl", stats: { documents_processed: documentsProcessed, chunks_created: chunksCreated, chunks_below_min: chunksBelowMin, chunks_at_max: chunksAtMax, avg_chunk_chars: chunksCreated > 0 ? Math.round(totalChars / chunksCreated) : 0, total_chars: totalChars, }, chunker_config: { strategy: input.strategy, max_chars: input.max_chars, overlap_chars: input.overlap_chars, config_hash: configHash, }, }; } catch (err) { return createToolError("CHUNK_ERROR", `Failed to chunk documents: ${err}`, { recoverable: false, }); } } // ============================================================================ // Normalize Enrich // ============================================================================ export interface NormalizeEnrichResult { success: boolean; output_path: string; stats: { chunks_enriched: number; languages_detected: Record<string, number>; tags_extracted: number; sections_identified: number; }; } export async function normalizeEnrich(input: NormalizeEnrichInput): Promise<NormalizeEnrichResult | ToolError> { const manager = getRunManager(); const runDir = manager.getRunDir(input.run_id); // Ensure run exists with full infrastructure await manager.ensureRun(input.run_id); const normalizedDir = manager.getNormalizedDir(input.run_id); try { const chunksPath = path.join(runDir, input.chunks_path); if (!await pathExists(chunksPath)) { return createToolError("CHUNK_ERROR", `Chunks file not found: ${input.chunks_path}`, { recoverable: false, }); } // Read chunks const chunks = await readJsonl<DocumentChunk>(chunksPath); const stats = { chunks_enriched: 0, languages_detected: {} as Record<string, number>, tags_extracted: 0, sections_identified: 0, }; // Import language detection let detectLanguage: ((text: string) => string) | null = null; if (input.rules.detect_language) { try { const franc = await import("franc-min"); detectLanguage = (text: string) => { const lang = franc.franc(text); return lang === "und" ? "unknown" : lang; }; } catch { // Language detection not available } } // Process each chunk for (const chunk of chunks) { let modified = false; // Language detection if (input.rules.detect_language && detectLanguage) { const lang = detectLanguage(chunk.content.text); chunk.metadata.language = lang; stats.languages_detected[lang] = (stats.languages_detected[lang] || 0) + 1; modified = true; } // Regex tag extraction if (input.rules.regex_tags?.length) { const tags: string[] = chunk.metadata.tags || []; for (const rule of input.rules.regex_tags) { const regex = new RegExp(rule.pattern, rule.flags); const matches = chunk.content.text.matchAll(regex); for (const match of matches) { const value = match[1] || match[0]; const tag = `${rule.tag_name}:${value}`; if (!tags.includes(tag)) { tags.push(tag); stats.tags_extracted++; } } } if (tags.length) { chunk.metadata.tags = tags.sort(); modified = true; } } // Section detection if (input.rules.section_patterns?.length) { for (const pattern of input.rules.section_patterns) { const regex = new RegExp(pattern.pattern); if (regex.test(chunk.content.text)) { chunk.position.section = pattern.section_name; stats.sections_identified++; modified = true; break; } } } // Taxonomy mapping if (input.rules.taxonomy) { const tags: string[] = chunk.metadata.tags || []; const textLower = chunk.content.text.toLowerCase(); for (const [category, terms] of Object.entries(input.rules.taxonomy)) { for (const term of terms) { if (textLower.includes(term.toLowerCase())) { const tag = `category:${category}`; if (!tags.includes(tag)) { tags.push(tag); } break; } } } if (tags.length) { chunk.metadata.tags = tags.sort(); modified = true; } } if (modified) { stats.chunks_enriched++; } } // Write enriched chunks back const outputPath = path.join(normalizedDir, "chunks.enriched.jsonl"); await fs.writeFile(outputPath, ""); await appendJsonl(outputPath, chunks); // Save enrichment report await writeJson(path.join(normalizedDir, "metadata_enrichment.json"), stats); return { success: true, output_path: "normalized/chunks.enriched.jsonl", stats, }; } catch (err) { return createToolError("CHUNK_ERROR", `Failed to enrich chunks: ${err}`, { recoverable: false, }); } } // ============================================================================ // Normalize Dedupe // ============================================================================ export interface NormalizeDedupeResult { success: boolean; output_path: string; stats: { input_chunks: number; output_chunks: number; duplicates_removed: number; duplicate_groups: number; }; dedupe_report_path: string; } export async function normalizeDedupe(input: NormalizeDedupeInput): Promise<NormalizeDedupeResult | ToolError> { const manager = getRunManager(); const runDir = manager.getRunDir(input.run_id); // Ensure run exists with full infrastructure await manager.ensureRun(input.run_id); const normalizedDir = manager.getNormalizedDir(input.run_id); try { const chunksPath = path.join(runDir, input.chunks_path); if (!await pathExists(chunksPath)) { return createToolError("CHUNK_ERROR", `Chunks file not found: ${input.chunks_path}`, { recoverable: false, }); } // Read chunks const chunks = await readJsonl<DocumentChunk>(chunksPath); const stats = { input_chunks: chunks.length, output_chunks: 0, duplicates_removed: 0, duplicate_groups: 0, }; const duplicateReport: Array<{ kept: string; removed: string[]; method: string; }> = []; // Track seen content const seen = new Map<string, DocumentChunk>(); const uniqueChunks: DocumentChunk[] = []; if (input.method === "exact") { // Exact hash deduplication for (const chunk of chunks) { const key = input.scope === "global" ? chunk.content.text_hash : `${chunk.doc_id}:${chunk.content.text_hash}`; if (seen.has(key)) { stats.duplicates_removed++; // Find or create duplicate group const existing = duplicateReport.find(r => r.kept === seen.get(key)!.chunk_id); if (existing) { existing.removed.push(chunk.chunk_id); } else { stats.duplicate_groups++; duplicateReport.push({ kept: seen.get(key)!.chunk_id, removed: [chunk.chunk_id], method: "exact", }); } } else { seen.set(key, chunk); uniqueChunks.push(chunk); } } } else if (input.method === "simhash") { // Simhash-based near-duplicate detection // Simplified implementation using text comparison const threshold = input.similarity_threshold; for (const chunk of chunks) { let isDuplicate = false; for (const existing of uniqueChunks) { // Simple similarity check (Jaccard-like) const similarity = computeTextSimilarity( chunk.content.text, existing.content.text ); if (similarity >= threshold) { isDuplicate = true; stats.duplicates_removed++; const report = duplicateReport.find(r => r.kept === existing.chunk_id); if (report) { report.removed.push(chunk.chunk_id); } else { stats.duplicate_groups++; duplicateReport.push({ kept: existing.chunk_id, removed: [chunk.chunk_id], method: "simhash", }); } break; } } if (!isDuplicate) { uniqueChunks.push(chunk); } } } stats.output_chunks = uniqueChunks.length; // Write deduplicated chunks const outputPath = path.join(normalizedDir, "chunks.deduped.jsonl"); await fs.writeFile(outputPath, ""); await appendJsonl(outputPath, uniqueChunks); // Write dedupe report const reportPath = path.join(normalizedDir, "dedupe_report.json"); await writeJson(reportPath, { method: input.method, scope: input.scope, threshold: input.similarity_threshold, stats, groups: duplicateReport, }); return { success: true, output_path: "normalized/chunks.deduped.jsonl", stats, dedupe_report_path: "normalized/dedupe_report.json", }; } catch (err) { return createToolError("CHUNK_ERROR", `Failed to deduplicate chunks: ${err}`, { recoverable: false, }); } } // ============================================================================ // Helper Functions // ============================================================================ function detectSourceType(filePath: string): DocumentChunk["source"]["type"] { const ext = path.extname(filePath).toLowerCase(); const mapping: Record<string, DocumentChunk["source"]["type"]> = { ".pdf": "pdf", ".html": "html", ".htm": "html", ".md": "markdown", ".txt": "txt", ".csv": "csv", ".json": "json", ".docx": "docx", }; return mapping[ext] || "txt"; } function detectContentType(filePath: string): string { const ext = path.extname(filePath).toLowerCase(); const mapping: Record<string, string> = { ".pdf": "application/pdf", ".html": "text/html", ".htm": "text/html", ".md": "text/markdown", ".txt": "text/plain", ".csv": "text/csv", ".json": "application/json", }; return mapping[ext] || "text/plain"; } function computeTextSimilarity(a: string, b: string): number { // Simple character-based Jaccard similarity const setA = new Set(a.toLowerCase().split(/\s+/)); const setB = new Set(b.toLowerCase().split(/\s+/)); let intersection = 0; for (const word of setA) { if (setB.has(word)) intersection++; } const union = setA.size + setB.size - intersection; return union > 0 ? intersection / union : 0; }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server