/**
* IndexFoundry-MCP: Normalize Tools (Phase 3)
*
* Chunking, enrichment, and deduplication tools.
* All operations are deterministic and produce auditable outputs.
*
* Copyright (c) 2024 vario.automation
* Proprietary and confidential. All rights reserved.
*/
import * as path from "path";
import * as fs from "fs/promises";
import type { DocumentChunk, ToolError } from "../types.js";
import type {
NormalizeChunkInput,
NormalizeEnrichInput,
NormalizeDedupeInput
} from "../schemas.js";
import {
sha256,
generateChunkId,
pathExists,
readJsonl,
appendJsonl,
writeJson,
normalizeText,
estimateTokens,
hashConfig,
createToolError,
now,
parseHeadingLevel,
} from "../utils.js";
import { getRunManager } from "../run-manager.js";
// ============================================================================
// Normalize Chunk
// ============================================================================
export interface NormalizeChunkResult {
success: boolean;
output_path: string;
stats: {
documents_processed: number;
chunks_created: number;
chunks_below_min: number;
chunks_at_max: number;
avg_chunk_chars: number;
total_chars: number;
};
chunker_config: {
strategy: string;
max_chars: number;
overlap_chars: number;
config_hash: string;
};
}
/**
* Split text using the recursive strategy: try each separator in order,
* recursing into oversized parts with the remaining separators.
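*
* @example
* A minimal sketch with illustrative values:
* ```typescript
* const parts = recursiveChunk("a\n\nb\n\nc", 4, 1, 0, ["\n\n", "\n", " "]);
* // => ["a", "b", "c"]: splitting on "\n\n" yields parts that each fit maxChars
* ```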
*/
function recursiveChunk(
text: string,
maxChars: number,
minChars: number,
overlap: number,
separators: string[]
): string[] {
// If text fits, return as single chunk
if (text.length <= maxChars) {
return [text];
}
// Try each separator in order
for (const sep of separators) {
const parts = text.split(sep);
// If we got multiple parts, process them
if (parts.length > 1) {
const chunks: string[] = [];
let current = "";
for (const part of parts) {
const candidate = current ? current + sep + part : part;
if (candidate.length <= maxChars) {
current = candidate;
} else {
// Push current chunk if it exists
if (current.length >= minChars) {
chunks.push(current);
} else if (current && chunks.length > 0) {
// Append small chunk to previous
chunks[chunks.length - 1] += sep + current;
} else if (current) {
// No previous chunk to merge into; keep the small chunk rather than drop it
chunks.push(current);
}
// Start new chunk
if (part.length > maxChars) {
// Part is too large, recurse with next separator
const subChunks = recursiveChunk(
part,
maxChars,
minChars,
overlap,
separators.slice(separators.indexOf(sep) + 1)
);
chunks.push(...subChunks);
current = "";
} else {
current = part;
}
}
}
// Flush the trailing chunk
if (current.length >= minChars) {
chunks.push(current);
} else if (current && chunks.length > 0) {
chunks[chunks.length - 1] += sep + current;
} else if (current) {
// Keep a small trailing chunk when there is nothing to merge it into
chunks.push(current);
}
// Apply overlap; note an overlapped chunk may exceed maxChars by up to overlap chars
if (overlap > 0 && chunks.length > 1) {
const overlapped: string[] = [chunks[0]];
for (let i = 1; i < chunks.length; i++) {
const prevChunk = chunks[i - 1];
const overlapText = prevChunk.slice(-overlap);
overlapped.push(overlapText + chunks[i]);
}
return overlapped;
}
return chunks;
}
}
// No separator worked, hard split.
// Guard the step so overlap >= maxChars cannot produce an infinite loop.
const chunks: string[] = [];
const step = Math.max(1, maxChars - overlap);
for (let i = 0; i < text.length; i += step) {
chunks.push(text.slice(i, i + maxChars));
}
return chunks;
}
/**
* Internal result type for hierarchical chunking with parent tracking.
* Used to map temporary IDs to real SHA256-based chunk IDs.
*/
interface HierarchicalChunkResult {
/** The text content of this chunk */
text: string;
/** Hierarchy level: 1-6 for h1-h6, 0 for non-heading content */
level: number;
/** Temporary ID of parent chunk (resolved to real SHA256 ID later) */
parentId?: string;
/** Truncated context from parent chunk for semantic continuity */
parentContext?: string;
/** Temporary chunk ID (replaced with deterministic SHA256 ID later) */
chunkId?: string;
}
/**
* Performs hierarchical chunking on markdown content, creating parent-child
* relationships based on heading structure (h1-h6).
*
* This strategy:
* - Parses ATX-style headings (# through ######)
* - Creates chunks for each heading + its content
* - Tracks parent-child relationships between heading levels
* - Includes configurable parent context in child chunks
* - Produces deterministic output for the same input
*
* @param text - The markdown content to chunk
* @param _maxChars - Maximum characters per chunk (reserved for future chunking within sections)
* @param minChars - Minimum characters for a chunk to be emitted
* @param _overlap - Character overlap between chunks (reserved for future use)
* @param createParentChunks - Whether to register headings as parents for their level
* @param parentContextChars - Characters of parent context to include in children (0 to disable)
* @returns Array of HierarchicalChunkResult with hierarchy metadata
*
* @example
* ```typescript
* const markdown = "# Title\nContent\n## Section\nMore content";
* const chunks = hierarchicalChunk(markdown, 1500, 10, 0, true, 100);
* // (minChars of 10, so both short example chunks clear the threshold)
* // Returns:
* // [
* // { text: "# Title\nContent", level: 1, chunkId: "chunk_0" },
* // { text: "## Section\nMore content", level: 2,
* // parentId: "chunk_0", parentContext: "# Title\nContent", chunkId: "chunk_1" }
* // ]
* ```
*/
function hierarchicalChunk(
text: string,
_maxChars: number,
minChars: number,
_overlap: number,
createParentChunks: boolean,
parentContextChars: number
): HierarchicalChunkResult[] {
const results: HierarchicalChunkResult[] = [];
const lines = text.split('\n');
// Track parent at each level (index 0 unused, 1-6 for heading levels)
const parentStack: Array<{ id: string; context: string; level: number } | undefined> =
new Array(7).fill(undefined);
let currentContent = '';
let currentLevel = 0;
let currentHeading = '';
let chunkIndex = 0;
const generateTempId = () => `chunk_${chunkIndex++}`;
const flushContent = () => {
if (!currentContent.trim()) return;
const contentText = currentHeading
? `${currentHeading}\n${currentContent.trim()}`
: currentContent.trim();
if (contentText.length >= minChars || results.length === 0) { // always emit the first chunk, even if short
const chunkId = generateTempId();
// Find the closest parent at a higher level
let parentId: string | undefined;
let parentContext: string | undefined;
for (let level = currentLevel - 1; level >= 1; level--) {
if (parentStack[level]) {
parentId = parentStack[level]!.id;
if (parentContextChars > 0) {
parentContext = parentStack[level]!.context.slice(0, parentContextChars);
}
break;
}
}
results.push({
text: contentText,
level: currentLevel,
parentId,
parentContext: parentContext || undefined,
chunkId
});
// If this is a heading, register as parent for its level
if (currentLevel > 0 && createParentChunks) {
parentStack[currentLevel] = {
id: chunkId,
context: contentText,
level: currentLevel
};
// Clear parents at deeper levels when we encounter a new heading
for (let i = currentLevel + 1; i <= 6; i++) {
parentStack[i] = undefined;
}
}
}
};
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const headingLevel = parseHeadingLevel(line);
if (headingLevel !== null) {
// Flush previous content before starting new section
flushContent();
currentHeading = line;
currentLevel = headingLevel;
currentContent = '';
// Look ahead to get content under this heading
const contentLines: string[] = [];
let j = i + 1;
while (j < lines.length) {
const nextLine = lines[j];
const nextHeadingLevel = parseHeadingLevel(nextLine);
if (nextHeadingLevel !== null) {
break; // Stop at next heading
}
contentLines.push(nextLine);
j++;
}
currentContent = contentLines.join('\n');
i = j - 1; // Skip the lines we consumed
} else {
// Only preamble before the first heading lands here; content under a
// heading is consumed by the lookahead in the branch above.
if (currentLevel === 0) {
currentContent += (currentContent ? '\n' : '') + line;
}
}
}
// Flush any remaining content
flushContent();
return results;
}
/**
* Split text by paragraph (double newlines)
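*
* @example
* A minimal sketch with illustrative values:
* ```typescript
* const chunks = paragraphChunk("One.\n\nTwo.\n\nThree.", 12, 1);
* // => ["One.\n\nTwo.", "Three."]: paragraphs are packed until maxChars is reached
* ```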
*/
function paragraphChunk(text: string, maxChars: number, minChars: number): string[] {
const paragraphs = text.split(/\n\n+/);
const chunks: string[] = [];
let current = "";
for (const para of paragraphs) {
const trimmed = para.trim();
if (!trimmed) continue;
if (current.length + trimmed.length + 2 <= maxChars) {
current = current ? current + "\n\n" + trimmed : trimmed;
} else {
if (current) chunks.push(current);
current = trimmed;
}
}
if (current) chunks.push(current);
// Merge small chunks
return mergeSmallChunks(chunks, minChars, maxChars);
}
/**
* Split text by headings (markdown)
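*
* @example
* A minimal sketch with illustrative values:
* ```typescript
* const md = "# A\n\nalpha body text\n\n## B\n\nbeta body text";
* const chunks = headingChunk(md, 200, 10);
* // => ["# A\n\nalpha body text", "## B\n\nbeta body text"]
* ```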
*/
function headingChunk(text: string, maxChars: number, minChars: number): string[] {
const sections = text.split(/^(#{1,6}\s.+)$/m);
const chunks: string[] = [];
let current = "";
let currentHeading = "";
for (let i = 0; i < sections.length; i++) {
const section = sections[i].trim();
if (!section) continue;
if (section.startsWith("#")) {
// It's a heading
if (current) {
chunks.push(currentHeading ? currentHeading + "\n\n" + current : current);
}
currentHeading = section;
current = "";
} else {
// It's content
if (current.length + section.length + 2 <= maxChars) {
current = current ? current + "\n\n" + section : section;
} else {
if (current) chunks.push(currentHeading ? currentHeading + "\n\n" + current : current);
current = section;
}
}
}
if (current) {
chunks.push(currentHeading ? currentHeading + "\n\n" + current : current);
}
return mergeSmallChunks(chunks, minChars, maxChars);
}
/**
* Merge chunks that are too small
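*
* @example
* A minimal sketch with illustrative values:
* ```typescript
* const merged = mergeSmallChunks(["tiny", "big enough chunk", "also big enough"], 5, 100);
* // => ["tiny\n\nbig enough chunk", "also big enough"]: only undersized chunks merge
* ```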
*/
function mergeSmallChunks(chunks: string[], minChars: number, maxChars: number): string[] {
const result: string[] = [];
let buffer = "";
for (const chunk of chunks) {
// Merge only when one side is below the minimum and the merged pair still fits
if (buffer && (buffer.length < minChars || chunk.length < minChars) &&
buffer.length + chunk.length + 2 <= maxChars) {
buffer = buffer + "\n\n" + chunk;
} else {
if (buffer) result.push(buffer);
buffer = chunk;
}
}
if (buffer) result.push(buffer);
return result;
}
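/**
 * Chunk the files listed in input_paths into normalized/chunks.jsonl.
 *
 * @example
 * A hedged sketch of a call; the values are illustrative, the field names are
 * inferred from how `input` is read below, and the authoritative shape is
 * NormalizeChunkInput in ../schemas.js:
 * ```typescript
 * const result = await normalizeChunk({
 * run_id: "run-001", // illustrative run ID
 * input_paths: ["parsed/doc1.md"], // relative to the run directory
 * strategy: "recursive",
 * max_chars: 1500,
 * min_chars: 100,
 * overlap_chars: 100,
 * split_hierarchy: ["\n\n", "\n", ". ", " "],
 * create_parent_chunks: false,
 * parent_context_chars: 0,
 * } as NormalizeChunkInput);
 * ```
 */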
export async function normalizeChunk(input: NormalizeChunkInput): Promise<NormalizeChunkResult | ToolError> {
const manager = getRunManager();
const runDir = manager.getRunDir(input.run_id);
// Ensure run exists with full infrastructure
await manager.ensureRun(input.run_id);
const normalizedDir = manager.getNormalizedDir(input.run_id);
try {
// Check output path
const outputPath = path.join(normalizedDir, "chunks.jsonl");
const configHash = hashConfig({
strategy: input.strategy,
max_chars: input.max_chars,
min_chars: input.min_chars,
overlap_chars: input.overlap_chars,
split_hierarchy: input.split_hierarchy,
});
// Track stats
let documentsProcessed = 0;
let chunksCreated = 0;
let chunksBelowMin = 0;
let chunksAtMax = 0;
let totalChars = 0;
const allChunks: DocumentChunk[] = [];
// Process each input file
for (const inputPath of [...input.input_paths].sort()) { // sorted copy keeps ordering deterministic without mutating input
const fullPath = path.join(runDir, inputPath);
if (!await pathExists(fullPath)) {
continue;
}
// Read source content
const content = await fs.readFile(fullPath, "utf-8");
const normalizedContent = normalizeText(content);
const docHash = sha256(normalizedContent);
documentsProcessed++;
// Check if using hierarchical strategy
if (input.strategy === "hierarchical") {
// Use hierarchical chunking with parent tracking
const hierarchicalResults = hierarchicalChunk(
normalizedContent,
input.max_chars,
input.min_chars,
input.overlap_chars,
input.create_parent_chunks,
input.parent_context_chars
);
// Map temp chunk IDs to real chunk IDs
const tempIdToRealId = new Map<string, string>();
let byteOffset = 0;
for (let i = 0; i < hierarchicalResults.length; i++) {
const result = hierarchicalResults[i];
const chunkText = result.text;
const textHash = sha256(chunkText);
const byteEnd = byteOffset + Buffer.byteLength(chunkText, "utf-8");
const realChunkId = generateChunkId(docHash, byteOffset, byteEnd);
// Map temp ID to real ID
if (result.chunkId) {
tempIdToRealId.set(result.chunkId, realChunkId);
}
// Resolve parent ID from temp ID to real ID
let resolvedParentId: string | undefined;
if (result.parentId) {
resolvedParentId = tempIdToRealId.get(result.parentId);
}
const chunk: DocumentChunk = {
doc_id: docHash,
chunk_id: realChunkId,
chunk_index: i,
hierarchy_level: result.level,
parent_id: resolvedParentId,
parent_context: result.parentContext,
source: {
type: detectSourceType(inputPath),
uri: inputPath,
retrieved_at: now(),
content_hash: docHash,
},
content: {
text: chunkText,
text_hash: textHash,
char_count: chunkText.length,
token_count_approx: estimateTokens(chunkText),
},
position: {
byte_start: byteOffset,
byte_end: byteEnd,
},
metadata: {
content_type: detectContentType(inputPath),
},
};
allChunks.push(chunk);
chunksCreated++;
totalChars += chunkText.length;
if (chunkText.length < input.min_chars) {
chunksBelowMin++;
}
// Within 10 chars of the cap counts as "at max"
if (chunkText.length >= input.max_chars - 10) {
chunksAtMax++;
}
byteOffset = byteEnd;
}
} else {
// Chunk based on strategy
let chunks: string[];
switch (input.strategy) {
case "recursive":
chunks = recursiveChunk(
normalizedContent,
input.max_chars,
input.min_chars,
input.overlap_chars,
input.split_hierarchy
);
break;
case "by_paragraph":
chunks = paragraphChunk(normalizedContent, input.max_chars, input.min_chars);
break;
case "by_heading":
chunks = headingChunk(normalizedContent, input.max_chars, input.min_chars);
break;
case "fixed_chars":
chunks = [];
for (let i = 0; i < normalizedContent.length; i += input.max_chars - input.overlap_chars) {
chunks.push(normalizedContent.slice(i, i + input.max_chars));
}
break;
case "by_sentence":
const sentences = normalizedContent.split(/(?<=[.!?])\s+/);
chunks = [];
let current = "";
for (const sentence of sentences) {
if (current.length + sentence.length + 1 <= input.max_chars) {
current = current ? current + " " + sentence : sentence;
} else {
if (current) chunks.push(current);
current = sentence;
}
}
if (current) chunks.push(current);
break;
case "by_page":
// Page-based: treat form feeds or runs of 4+ newlines as page boundaries (max_chars is not enforced here)
chunks = normalizedContent.split(/\f|\n{4,}/).filter(Boolean);
break;
default:
chunks = recursiveChunk(
normalizedContent,
input.max_chars,
input.min_chars,
input.overlap_chars,
input.split_hierarchy
);
}
// Create DocumentChunk records
let byteOffset = 0;
for (let i = 0; i < chunks.length; i++) {
const chunkText = chunks[i];
const textHash = sha256(chunkText);
const byteEnd = byteOffset + Buffer.byteLength(chunkText, "utf-8");
const chunk: DocumentChunk = {
doc_id: docHash,
chunk_id: generateChunkId(docHash, byteOffset, byteEnd),
chunk_index: i,
hierarchy_level: 0, // Non-hierarchical chunks get level 0
source: {
type: detectSourceType(inputPath),
uri: inputPath,
retrieved_at: now(),
content_hash: docHash,
},
content: {
text: chunkText,
text_hash: textHash,
char_count: chunkText.length,
token_count_approx: estimateTokens(chunkText),
},
position: {
byte_start: byteOffset,
byte_end: byteEnd,
},
metadata: {
content_type: detectContentType(inputPath),
},
};
allChunks.push(chunk);
chunksCreated++;
totalChars += chunkText.length;
if (chunkText.length < input.min_chars) {
chunksBelowMin++;
}
// Within 10 chars of the cap counts as "at max"
if (chunkText.length >= input.max_chars - 10) {
chunksAtMax++;
}
byteOffset = byteEnd;
}
}
}
// Write all chunks (sorted for determinism)
allChunks.sort((a, b) => {
const docCompare = a.doc_id.localeCompare(b.doc_id);
if (docCompare !== 0) return docCompare;
return a.chunk_index - b.chunk_index;
});
// Clear existing file and write fresh
await fs.writeFile(outputPath, "");
await appendJsonl(outputPath, allChunks);
return {
success: true,
output_path: "normalized/chunks.jsonl",
stats: {
documents_processed: documentsProcessed,
chunks_created: chunksCreated,
chunks_below_min: chunksBelowMin,
chunks_at_max: chunksAtMax,
avg_chunk_chars: chunksCreated > 0 ? Math.round(totalChars / chunksCreated) : 0,
total_chars: totalChars,
},
chunker_config: {
strategy: input.strategy,
max_chars: input.max_chars,
overlap_chars: input.overlap_chars,
config_hash: configHash,
},
};
} catch (err) {
return createToolError("CHUNK_ERROR", `Failed to chunk documents: ${err}`, {
recoverable: false,
});
}
}
// ============================================================================
// Normalize Enrich
// ============================================================================
export interface NormalizeEnrichResult {
success: boolean;
output_path: string;
stats: {
chunks_enriched: number;
languages_detected: Record<string, number>;
tags_extracted: number;
sections_identified: number;
};
}
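/**
 * Enrich chunks with detected language, regex tags, section labels, and taxonomy categories.
 *
 * @example
 * A hedged sketch of a call; the values are illustrative, the rule fields are
 * inferred from how `input.rules` is read below, and the authoritative shape is
 * NormalizeEnrichInput in ../schemas.js:
 * ```typescript
 * const result = await normalizeEnrich({
 * run_id: "run-001", // illustrative run ID
 * chunks_path: "normalized/chunks.jsonl",
 * rules: {
 * detect_language: true,
 * regex_tags: [{ pattern: "RFC\\s?(\\d+)", flags: "g", tag_name: "rfc" }],
 * section_patterns: [{ pattern: "^## Appendix", section_name: "appendix" }],
 * taxonomy: { networking: ["tcp", "udp"] },
 * },
 * } as NormalizeEnrichInput);
 * ```
 */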
export async function normalizeEnrich(input: NormalizeEnrichInput): Promise<NormalizeEnrichResult | ToolError> {
const manager = getRunManager();
const runDir = manager.getRunDir(input.run_id);
// Ensure run exists with full infrastructure
await manager.ensureRun(input.run_id);
const normalizedDir = manager.getNormalizedDir(input.run_id);
try {
const chunksPath = path.join(runDir, input.chunks_path);
if (!await pathExists(chunksPath)) {
return createToolError("CHUNK_ERROR", `Chunks file not found: ${input.chunks_path}`, {
recoverable: false,
});
}
// Read chunks
const chunks = await readJsonl<DocumentChunk>(chunksPath);
const stats = {
chunks_enriched: 0,
languages_detected: {} as Record<string, number>,
tags_extracted: 0,
sections_identified: 0,
};
// Import language detection
let detectLanguage: ((text: string) => string) | null = null;
if (input.rules.detect_language) {
try {
const franc = await import("franc-min");
detectLanguage = (text: string) => {
const lang = franc.franc(text);
return lang === "und" ? "unknown" : lang;
};
} catch {
// Language detection not available
}
}
// Process each chunk
for (const chunk of chunks) {
let modified = false;
// Language detection
if (input.rules.detect_language && detectLanguage) {
const lang = detectLanguage(chunk.content.text);
chunk.metadata.language = lang;
stats.languages_detected[lang] = (stats.languages_detected[lang] || 0) + 1;
modified = true;
}
// Regex tag extraction
if (input.rules.regex_tags?.length) {
const tags: string[] = chunk.metadata.tags || [];
for (const rule of input.rules.regex_tags) {
// matchAll throws unless the regex has the global flag; add it if missing
const flags = rule.flags?.includes("g") ? rule.flags : `${rule.flags ?? ""}g`;
const regex = new RegExp(rule.pattern, flags);
const matches = chunk.content.text.matchAll(regex);
for (const match of matches) {
const value = match[1] || match[0];
const tag = `${rule.tag_name}:${value}`;
if (!tags.includes(tag)) {
tags.push(tag);
stats.tags_extracted++;
}
}
}
if (tags.length) {
chunk.metadata.tags = tags.sort();
modified = true;
}
}
// Section detection
if (input.rules.section_patterns?.length) {
for (const pattern of input.rules.section_patterns) {
const regex = new RegExp(pattern.pattern);
if (regex.test(chunk.content.text)) {
chunk.position.section = pattern.section_name;
stats.sections_identified++;
modified = true;
break;
}
}
}
// Taxonomy mapping
if (input.rules.taxonomy) {
const tags: string[] = chunk.metadata.tags || [];
const textLower = chunk.content.text.toLowerCase();
for (const [category, terms] of Object.entries(input.rules.taxonomy)) {
for (const term of terms) {
if (textLower.includes(term.toLowerCase())) {
const tag = `category:${category}`;
if (!tags.includes(tag)) {
tags.push(tag);
}
break;
}
}
}
if (tags.length) {
chunk.metadata.tags = tags.sort();
modified = true;
}
}
if (modified) {
stats.chunks_enriched++;
}
}
// Write enriched chunks back
const outputPath = path.join(normalizedDir, "chunks.enriched.jsonl");
await fs.writeFile(outputPath, "");
await appendJsonl(outputPath, chunks);
// Save enrichment report
await writeJson(path.join(normalizedDir, "metadata_enrichment.json"), stats);
return {
success: true,
output_path: "normalized/chunks.enriched.jsonl",
stats,
};
} catch (err) {
return createToolError("CHUNK_ERROR", `Failed to enrich chunks: ${err}`, {
recoverable: false,
});
}
}
// ============================================================================
// Normalize Dedupe
// ============================================================================
export interface NormalizeDedupeResult {
success: boolean;
output_path: string;
stats: {
input_chunks: number;
output_chunks: number;
duplicates_removed: number;
duplicate_groups: number;
};
dedupe_report_path: string;
}
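/**
 * Remove exact or near-duplicate chunks and write a dedupe report.
 *
 * @example
 * A hedged sketch of a call; the values are illustrative and the authoritative
 * shape is NormalizeDedupeInput in ../schemas.js:
 * ```typescript
 * const result = await normalizeDedupe({
 * run_id: "run-001", // illustrative run ID
 * chunks_path: "normalized/chunks.enriched.jsonl",
 * method: "exact", // or "simhash" for near-duplicate detection
 * scope: "global", // dedupe across documents rather than per-document
 * similarity_threshold: 0.9, // consulted only by "simhash"
 * } as NormalizeDedupeInput);
 * ```
 */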
export async function normalizeDedupe(input: NormalizeDedupeInput): Promise<NormalizeDedupeResult | ToolError> {
const manager = getRunManager();
const runDir = manager.getRunDir(input.run_id);
// Ensure run exists with full infrastructure
await manager.ensureRun(input.run_id);
const normalizedDir = manager.getNormalizedDir(input.run_id);
try {
const chunksPath = path.join(runDir, input.chunks_path);
if (!await pathExists(chunksPath)) {
return createToolError("CHUNK_ERROR", `Chunks file not found: ${input.chunks_path}`, {
recoverable: false,
});
}
// Read chunks
const chunks = await readJsonl<DocumentChunk>(chunksPath);
const stats = {
input_chunks: chunks.length,
output_chunks: 0,
duplicates_removed: 0,
duplicate_groups: 0,
};
const duplicateReport: Array<{
kept: string;
removed: string[];
method: string;
}> = [];
// Track seen content
const seen = new Map<string, DocumentChunk>();
const uniqueChunks: DocumentChunk[] = [];
if (input.method === "exact") {
// Exact hash deduplication
for (const chunk of chunks) {
const key = input.scope === "global"
? chunk.content.text_hash
: `${chunk.doc_id}:${chunk.content.text_hash}`;
if (seen.has(key)) {
stats.duplicates_removed++;
// Find or create duplicate group
const existing = duplicateReport.find(r => r.kept === seen.get(key)!.chunk_id);
if (existing) {
existing.removed.push(chunk.chunk_id);
} else {
stats.duplicate_groups++;
duplicateReport.push({
kept: seen.get(key)!.chunk_id,
removed: [chunk.chunk_id],
method: "exact",
});
}
} else {
seen.set(key, chunk);
uniqueChunks.push(chunk);
}
}
} else if (input.method === "simhash") {
// Near-duplicate detection. This is a simplified stand-in for simhash:
// an O(n^2) pairwise word-level Jaccard comparison; `scope` is not applied here.
const threshold = input.similarity_threshold;
for (const chunk of chunks) {
let isDuplicate = false;
for (const existing of uniqueChunks) {
// Simple similarity check (Jaccard-like)
const similarity = computeTextSimilarity(
chunk.content.text,
existing.content.text
);
if (similarity >= threshold) {
isDuplicate = true;
stats.duplicates_removed++;
const report = duplicateReport.find(r => r.kept === existing.chunk_id);
if (report) {
report.removed.push(chunk.chunk_id);
} else {
stats.duplicate_groups++;
duplicateReport.push({
kept: existing.chunk_id,
removed: [chunk.chunk_id],
method: "simhash",
});
}
break;
}
}
if (!isDuplicate) {
uniqueChunks.push(chunk);
}
}
}
stats.output_chunks = uniqueChunks.length;
// Write deduplicated chunks
const outputPath = path.join(normalizedDir, "chunks.deduped.jsonl");
await fs.writeFile(outputPath, "");
await appendJsonl(outputPath, uniqueChunks);
// Write dedupe report
const reportPath = path.join(normalizedDir, "dedupe_report.json");
await writeJson(reportPath, {
method: input.method,
scope: input.scope,
threshold: input.similarity_threshold,
stats,
groups: duplicateReport,
});
return {
success: true,
output_path: "normalized/chunks.deduped.jsonl",
stats,
dedupe_report_path: "normalized/dedupe_report.json",
};
} catch (err) {
return createToolError("CHUNK_ERROR", `Failed to deduplicate chunks: ${err}`, {
recoverable: false,
});
}
}
// ============================================================================
// Helper Functions
// ============================================================================
function detectSourceType(filePath: string): DocumentChunk["source"]["type"] {
const ext = path.extname(filePath).toLowerCase();
const mapping: Record<string, DocumentChunk["source"]["type"]> = {
".pdf": "pdf",
".html": "html",
".htm": "html",
".md": "markdown",
".txt": "txt",
".csv": "csv",
".json": "json",
".docx": "docx",
};
return mapping[ext] || "txt";
}
function detectContentType(filePath: string): string {
const ext = path.extname(filePath).toLowerCase();
const mapping: Record<string, string> = {
".pdf": "application/pdf",
".html": "text/html",
".htm": "text/html",
".md": "text/markdown",
".txt": "text/plain",
".csv": "text/csv",
".json": "application/json",
};
return mapping[ext] || "text/plain";
}
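/**
 * Jaccard similarity over whitespace-separated, lowercased word tokens.
 *
 * @example
 * ```typescript
 * computeTextSimilarity("the quick fox", "the quick dog");
 * // => 0.5: intersection {the, quick} = 2, union {the, quick, fox, dog} = 4
 * ```
 */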
function computeTextSimilarity(a: string, b: string): number {
// Tokenize on whitespace; similarity = |A intersect B| / |A union B|
const setA = new Set(a.toLowerCase().split(/\s+/));
const setB = new Set(b.toLowerCase().split(/\s+/));
let intersection = 0;
for (const word of setA) {
if (setB.has(word)) intersection++;
}
const union = setA.size + setB.size - intersection;
return union > 0 ? intersection / union : 0;
}