IndexFoundry MCP

hydrate.ts•14.9 KiB

/**
 * Context Expansion / Chunk Hydration
 *
 * This module provides context expansion for search results by fetching
 * adjacent and parent chunks to provide fuller context for retrieved results.
 *
 * Features:
 * - 📖 Fetch adjacent chunks (before/after a retrieved chunk)
 * - 🔗 Fetch parent chunks using parent_id (from hierarchical chunking)
 * - 🌳 Build hierarchy path from chunk to root
 * - ⚙️ Configure expansion depth and strategy
 *
 * @module tools/hydrate
 *
 * Copyright (c) 2024 vario.automation
 * Proprietary and confidential. All rights reserved.
 */

import { z } from 'zod';
import * as path from 'path';
import type { DocumentChunk } from '../types.js';
import { readJsonl, createToolError } from '../utils.js';

// ============================================================================
// Types
// ============================================================================

/** Strategy for context expansion: adjacent chunks, parent hierarchy, or both */
export type ExpandContextStrategy = 'adjacent' | 'parent' | 'both';

/**
 * Options for hydrating chunks with context.
 * Controls which context chunks are fetched and limits.
 */
export interface HydrateOptions {
  /** Whether context expansion is enabled */
  enabled: boolean;
  /** Strategy: 'adjacent' (siblings), 'parent' (hierarchy), or 'both' */
  strategy: ExpandContextStrategy;
  /** Number of chunks to fetch before the target chunk */
  adjacent_before: number;
  /** Number of chunks to fetch after the target chunk */
  adjacent_after: number;
  /** Whether to include the immediate parent chunk */
  include_parent: boolean;
  /** Maximum total context chunks to return (for limiting response size) */
  max_total_chunks: number;
}

/**
 * Context information attached to a hydrated search result.
 * Contains adjacent siblings and parent hierarchy information.
 */
export interface HydratedContext {
  /** Immediate parent chunk (if include_parent=true and parent exists) */
  parent?: DocumentChunk;
  /** Chunks appearing before the target in document order */
  siblings_before: DocumentChunk[];
  /** Chunks appearing after the target in document order */
  siblings_after: DocumentChunk[];
  /** Path of chunk_ids from root to target: [root, ..., parent, target] */
  hierarchy_path?: string[];
}

/**
 * A search result enhanced with context information.
 * Combines the original chunk/score with hydrated context.
 */
export interface HydratedSearchResult {
  /** The original matched chunk */
  chunk: DocumentChunk;
  /** Relevance score from search */
  score: number;
  /** Hydrated context (adjacent chunks, parent, hierarchy) */
  context: HydratedContext;
}

/**
 * Input format for search results before hydration.
 */
export interface SearchResultInput {
  /** The matched chunk */
  chunk: DocumentChunk;
  /** Relevance score */
  score: number;
}

// ============================================================================
// Schema
// ============================================================================

/**
 * Zod schema for context expansion options.
 * Used in ServeQueryInputSchema and validated at tool boundaries.
 */
export const ExpandContextInputSchema = z.object({
  enabled: z.boolean().default(false)
    .describe("🔗 Enable context expansion to fetch related chunks"),
  strategy: z.enum(['adjacent', 'parent', 'both']).default('both')
    .describe("📚 Strategy: 'adjacent' (siblings), 'parent' (hierarchy), or 'both'"),
  adjacent_before: z.number().int().min(0).max(5).default(1)
    .describe("⬆️ Number of chunks to fetch before the target (0-5)"),
  adjacent_after: z.number().int().min(0).max(5).default(1)
    .describe("⬇️ Number of chunks to fetch after the target (0-5)"),
  include_parent: z.boolean().default(true)
    .describe("🌳 Include the immediate parent chunk in context"),
  max_total_chunks: z.number().int().min(1).max(20).default(10)
    .describe("📊 Maximum total context chunks to return (1-20)")
});

/** Inferred type from ExpandContextInputSchema */
export type ExpandContextInput = z.infer<typeof ExpandContextInputSchema>;

// ============================================================================
// Helper Functions
// ============================================================================

/**
 * Find adjacent chunks (before/after) for a given chunk within the same document.
 * Chunks are ordered by chunk_index to ensure deterministic results.
 *
 * @param chunk - The target chunk to find neighbors for
 * @param allChunks - All chunks available (typically from the same run)
 * @param before - Number of chunks to retrieve before the target
 * @param after - Number of chunks to retrieve after the target
 * @returns Object with before and after chunk arrays; both are empty when the
 *   target chunk_id is not present among the chunks sharing its doc_id
 *
 * @example
 * const { before, after } = findAdjacentChunks(targetChunk, allChunks, 2, 2);
 * // before: [chunk at index-2, chunk at index-1]
 * // after: [chunk at index+1, chunk at index+2]
 */
function findAdjacentChunks(
  chunk: DocumentChunk,
  allChunks: DocumentChunk[],
  before: number,
  after: number
): { before: DocumentChunk[]; after: DocumentChunk[] } {
  // filter() returns a fresh array, so the in-place sort below never
  // reorders the caller's allChunks.
  const sameDocChunks = allChunks.filter(c => c.doc_id === chunk.doc_id);

  // Sort by chunk_index for deterministic ordering
  sameDocChunks.sort((a, b) => a.chunk_index - b.chunk_index);

  // Find current chunk position
  const currentIdx = sameDocChunks.findIndex(c => c.chunk_id === chunk.chunk_id);
  if (currentIdx === -1) {
    return { before: [], after: [] };
  }

  // Clamp the window start at 0 so a chunk near the beginning of the document
  // simply yields fewer "before" chunks.
  const beforeChunks = sameDocChunks.slice(Math.max(0, currentIdx - before), currentIdx);

  // slice() clamps the end index itself, so no upper-bound check is needed.
  const afterChunks = sameDocChunks.slice(currentIdx + 1, currentIdx + 1 + after);

  return { before: beforeChunks, after: afterChunks };
}

/**
 * Find the parent chain from a chunk to the root.
 * Traverses parent_id references to build the ancestor chain.
 * Includes circular reference detection to prevent infinite loops; the
 * starting chunk itself counts as visited, so a chunk whose parent_id points
 * back at itself (directly or through a cycle) is never reported as its own
 * ancestor.
 *
 * @param chunk - The starting chunk
 * @param allChunks - All chunks available for parent lookup
 * @returns Array of parent chunks ordered [immediate parent, grandparent, ...];
 *   traversal stops at the first missing parent or at the first repeat
 *
 * @example
 * // For chunk c7 with hierarchy: c1 -> c5 -> c6 -> c7
 * const chain = findParentChain(c7, allChunks);
 * // Returns: [c6, c5, c1]
 */
function findParentChain(
  chunk: DocumentChunk,
  allChunks: DocumentChunk[]
): DocumentChunk[] {
  const chain: DocumentChunk[] = [];
  // Seed with the starting chunk so that a self-referencing parent_id is
  // detected on the very first step.
  const visited = new Set<string>([chunk.chunk_id]);

  let current = chunk;
  while (current.parent_id) {
    const parentId = current.parent_id;

    // Detect circular reference
    if (visited.has(parentId)) {
      break;
    }

    const parent = allChunks.find(c => c.chunk_id === parentId);
    if (!parent) break;

    visited.add(parent.chunk_id);
    chain.push(parent);
    current = parent;
  }

  return chain; // Ordered: [immediate parent, grandparent, ...]
}

/**
 * Build hierarchy path from root to current chunk.
 * Returns an array of chunk_ids representing the path.
 *
 * @param chunk - The target chunk
 * @param allChunks - All chunks available for parent lookup
 * @returns Array of chunk_ids from root to target, or undefined when the chunk
 *   has neither a parent_id nor a non-zero hierarchy_level
 *
 * @example
 * // For chunk c7 with hierarchy: c1 -> c5 -> c6 -> c7
 * const path = buildHierarchyPath(c7, allChunks);
 * // Returns: ['c1', 'c5', 'c6', 'c7']
 */
function buildHierarchyPath(
  chunk: DocumentChunk,
  allChunks: DocumentChunk[]
): string[] | undefined {
  // No parent_id and hierarchy_level of 0/undefined means the chunk is not
  // part of a hierarchy at all.
  if (!chunk.parent_id && !chunk.hierarchy_level) {
    return undefined;
  }

  const chain = findParentChain(chunk, allChunks);

  // If no parents could be resolved, the path is just the chunk itself
  if (chain.length === 0) {
    return [chunk.chunk_id];
  }

  // findParentChain returns nearest-first; reverse to get root-first order
  return [...chain.map(c => c.chunk_id).reverse(), chunk.chunk_id];
}

/**
 * Enforce max_total_chunks limit on context.
 * Prioritizes: parent > immediate siblings > distant siblings.
 *
 * The parent (when present) is always kept. The remaining budget is split
 * between the before and after siblings (the after side receives the odd
 * slot); budget that one side cannot use because it has too few siblings is
 * handed to the other side, so the result never exceeds the limit and never
 * wastes it.
 *
 * @param context - The context object to limit
 * @param maxTotal - Maximum total chunks allowed
 * @returns Limited context with chunks trimmed to fit limit
 *
 * @example
 * const limited = enforceMaxChunks({ parent: p, siblings_before: [a,b,c], siblings_after: [d,e] }, 3);
 * // Returns: { parent: p, siblings_before: [c], siblings_after: [d] } (parent + 2 closest siblings)
 */
function enforceMaxChunks(
  context: {
    parent?: DocumentChunk;
    siblings_before: DocumentChunk[];
    siblings_after: DocumentChunk[];
  },
  maxTotal: number
): typeof context {
  const { parent, siblings_before, siblings_after } = context;

  // Work with whole slots only; a fractional limit is rounded down.
  const limit = Math.floor(maxTotal);

  // Parent has highest priority
  const parentCount = parent ? 1 : 0;
  if (parentCount >= limit) {
    return {
      parent,
      siblings_before: [],
      siblings_after: []
    };
  }

  const remaining = limit - parentCount;

  // Distribute remaining evenly between before and after
  const halfRemaining = Math.floor(remaining / 2);
  const otherHalf = remaining - halfRemaining;

  let beforeCount = Math.min(siblings_before.length, halfRemaining);
  let afterCount = Math.min(siblings_after.length, otherHalf);

  // Hand slots one side could not fill to the other side.
  const spare = remaining - beforeCount - afterCount;
  if (spare > 0) {
    const extraAfter = Math.min(siblings_after.length - afterCount, spare);
    afterCount += extraAfter;
    beforeCount += Math.min(siblings_before.length - beforeCount, spare - extraAfter);
  }

  // Before chunks: take the last N (closest to current).
  // slice(-0) is slice(0) and would return the whole array, so a zero count
  // must be handled explicitly.
  const trimmedBefore = beforeCount > 0 ? siblings_before.slice(-beforeCount) : [];

  // After chunks: take the first N (closest to current)
  const trimmedAfter = siblings_after.slice(0, afterCount);

  return {
    parent,
    siblings_before: trimmedBefore,
    siblings_after: trimmedAfter
  };
}

/**
 * Safely read chunks JSONL file with error handling.
 * Returns empty array if file doesn't exist or can't be read.
 *
 * @param chunksPath - Path to the chunks.jsonl file
 * @returns Array of DocumentChunks, or empty array on error
 */
async function loadChunksFromFile(chunksPath: string): Promise<DocumentChunk[]> {
  try {
    return await readJsonl<DocumentChunk>(chunksPath);
  } catch {
    // Deliberate best-effort: any failure from readJsonl is treated as
    // "no chunks available" so hydration degrades to empty context instead
    // of failing the whole query.
    return [];
  }
}

// ============================================================================
// Main Functions
// ============================================================================

/**
 * Hydrate a single chunk with context (adjacent and/or parent chunks).
 *
 * This function enriches a search result with surrounding context,
 * making it easier to understand the chunk in its original document context.
 *
 * @param chunk - The source chunk to hydrate
 * @param allChunks - All chunks from the document/run for context lookup
 * @param options - Hydration options (strategy, limits, etc.)
 * @param score - Optional relevance score (defaults to 0)
 * @returns Hydrated result with context chunks attached; when
 *   options.enabled is false the context is empty
 *
 * @example
 * const hydrated = await hydrateChunk(chunk, allChunks, {
 *   enabled: true,
 *   strategy: 'both',
 *   adjacent_before: 2,
 *   adjacent_after: 2,
 *   include_parent: true,
 *   max_total_chunks: 10
 * });
 *
 * console.log(hydrated.context.siblings_before); // 2 chunks before
 * console.log(hydrated.context.parent?.content.text); // Parent heading
 */
export async function hydrateChunk(
  chunk: DocumentChunk,
  allChunks: DocumentChunk[],
  options: HydrateOptions,
  score?: number
): Promise<HydratedSearchResult> {
  // If hydration is disabled, return empty context
  if (!options.enabled) {
    return {
      chunk,
      score: score ?? 0,
      context: {
        siblings_before: [],
        siblings_after: [],
        parent: undefined,
        hierarchy_path: undefined
      }
    };
  }

  let siblings_before: DocumentChunk[] = [];
  let siblings_after: DocumentChunk[] = [];
  let parent: DocumentChunk | undefined;
  let hierarchy_path: string[] | undefined;

  // Handle adjacent strategy
  if (options.strategy === 'adjacent' || options.strategy === 'both') {
    const adjacent = findAdjacentChunks(
      chunk,
      allChunks,
      options.adjacent_before,
      options.adjacent_after
    );
    siblings_before = adjacent.before;
    siblings_after = adjacent.after;
  }

  // Handle parent strategy
  if (options.strategy === 'parent' || options.strategy === 'both') {
    const parentId = chunk.parent_id;
    // A chunk that names itself as parent is corrupt data; never return the
    // chunk as its own context.
    if (options.include_parent && parentId && parentId !== chunk.chunk_id) {
      parent = allChunks.find(c => c.chunk_id === parentId);
    }
    // Always build hierarchy path when using parent strategy
    hierarchy_path = buildHierarchyPath(chunk, allChunks);
  }

  // Apply max_total_chunks limit
  const limitedContext = enforceMaxChunks(
    { parent, siblings_before, siblings_after },
    options.max_total_chunks
  );

  return {
    chunk,
    score: score ?? 0,
    context: {
      ...limitedContext,
      hierarchy_path
    }
  };
}

/**
 * Hydrate multiple search results with context.
 *
 * This function processes an array of search results, adding context
 * to each one. Sibling chunks that are themselves part of the main results
 * are removed from each result's context so they are not returned twice
 * (the parent chunk is not filtered).
 *
 * @param results - Array of search results to hydrate
 * @param runId - Run ID for locating the chunks.jsonl file under
 *   `<cwd>/.indexfoundry/runs/<runId>/normalized/`
 * @param options - Hydration options (strategy, limits, etc.)
 * @returns Array of hydrated search results with context, in input order
 *
 * @example
 * const results = await searchVectors(query, { top_k: 5 });
 * const hydrated = await hydrateSearchResults(results, runId, {
 *   enabled: true,
 *   strategy: 'both',
 *   adjacent_before: 1,
 *   adjacent_after: 1,
 *   include_parent: true,
 *   max_total_chunks: 5
 * });
 *
 * // Each result now has context.siblings_before, context.siblings_after, etc.
 * for (const r of hydrated) {
 *   console.log(`Chunk: ${r.chunk.chunk_id}, Parent: ${r.context.parent?.chunk_id}`);
 * }
 */
export async function hydrateSearchResults(
  results: SearchResultInput[],
  runId: string,
  options: HydrateOptions
): Promise<HydratedSearchResult[]> {
  // Handle empty results
  if (results.length === 0) {
    return [];
  }

  // Load all chunks from the run's JSONL file. When hydration is disabled the
  // chunks are never consulted, so skip the file read entirely.
  const runsDir = path.join(process.cwd(), '.indexfoundry', 'runs', runId);
  const chunksPath = path.join(runsDir, 'normalized', 'chunks.jsonl');
  const allChunks = options.enabled ? await loadChunksFromFile(chunksPath) : [];

  // Collect all result chunk IDs for deduplication
  const resultChunkIds = new Set(results.map(r => r.chunk.chunk_id));

  // Hydrate each result
  const hydratedResults: HydratedSearchResult[] = [];

  for (const result of results) {
    const score = result.score ?? 0;
    const hydrated = await hydrateChunk(
      result.chunk,
      allChunks,
      options,
      score
    );

    // Filter out chunks that are already in the main results (deduplication)
    hydrated.context.siblings_before = hydrated.context.siblings_before.filter(
      c => !resultChunkIds.has(c.chunk_id)
    );
    hydrated.context.siblings_after = hydrated.context.siblings_after.filter(
      c => !resultChunkIds.has(c.chunk_id)
    );

    hydratedResults.push(hydrated);
  }

  return hydratedResults;
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

hydrate.ts•14.9 KiB