Skip to main content
Glama
MarkdownChunker.ts (7.5 kB)
/**
 * MarkdownChunker - Split markdown content into semantic chunks
 * Uses heading-based chunking with size limits for semantic search
 */

import { nanoid } from "nanoid";
import type {
  MarkdownChunk,
  MarkdownHeading,
  ChunkConfig,
} from "../markdown/types.js";

/**
 * Default chunking configuration
 */
export const DEFAULT_CHUNK_CONFIG: ChunkConfig = {
  maxChunkSize: 500, // Max words per chunk
  overlapSize: 50, // Overlap words
  splitOnHeadings: true, // Split at heading boundaries
  preserveContext: true, // Include parent headings
};

/**
 * A whitespace-delimited word together with the index (in UTF-16 code units)
 * at which it starts inside the text it was scanned from.
 */
interface WordSpan {
  text: string;
  start: number;
}

/**
 * A run of consecutive words joined by single spaces, plus the half-open
 * `[start, end)` range it covers inside the text it was scanned from.
 */
interface WordWindow {
  text: string;
  start: number;
  end: number;
}

/**
 * Splits markdown into chunks, either one chunk per heading section (large
 * sections are further divided into overlapping word windows) or purely by
 * word count when heading splitting is disabled or no headings are supplied.
 */
export class MarkdownChunker {
  private readonly config: ChunkConfig;

  /**
   * @param config - Overrides merged on top of {@link DEFAULT_CHUNK_CONFIG}.
   */
  constructor(config: Partial<ChunkConfig> = {}) {
    this.config = { ...DEFAULT_CHUNK_CONFIG, ...config };
  }

  /**
   * Chunk markdown content.
   *
   * @param fileId - Identifier copied into every chunk's `file_id`.
   * @param content - Full markdown text.
   * @param headings - Headings found in `content`; `line_number` is 1-based.
   *   The array is assumed to be ordered by ascending `line_number`, because
   *   each section ends where the next array entry begins.
   * @returns Chunks in document order with consecutive `chunk_index` values.
   */
  chunk(
    fileId: string,
    content: string,
    headings: MarkdownHeading[]
  ): MarkdownChunk[] {
    if (this.config.splitOnHeadings && headings.length > 0) {
      return this.chunkByHeadings(fileId, content, headings);
    }
    return this.chunkBySize(fileId, content);
  }

  /**
   * Chunk content by heading boundaries.
   *
   * Text before the first heading becomes a leading chunk with a `null`
   * heading. Each heading section becomes one chunk unless it exceeds the
   * configured word limit, in which case it is split into word windows.
   */
  private chunkByHeadings(
    fileId: string,
    content: string,
    headings: MarkdownHeading[]
  ): MarkdownChunk[] {
    const chunks: MarkdownChunk[] = [];
    const lines = content.split("\n");
    const lineOffsets = this.buildLineOffsets(lines, content.length);
    const { size: maxWords } = this.resolveWindow();

    // Keep line indices inside [0, lines.length] so malformed line numbers
    // cannot make slice() count from the end or index past the offset table.
    const clampLine = (lineIndex: number): number =>
      Math.min(Math.max(lineIndex, 0), lines.length);
    const offsetAt = (lineIndex: number): number =>
      lineOffsets[lineIndex] ?? content.length;

    // Build heading lookup used to resolve parent chains
    const headingMap = new Map<string, MarkdownHeading>();
    for (const heading of headings) {
      headingMap.set(heading.id, heading);
    }

    // Content before the first heading is emitted first so chunk indices are
    // assigned in document order without a later reindexing pass.
    const firstHeading = headings[0];
    const firstHeadingLine = firstHeading
      ? clampLine(firstHeading.line_number - 1)
      : 0;
    if (firstHeadingLine > 0) {
      const preambleRaw = lines.slice(0, firstHeadingLine).join("\n");
      const preambleContent = preambleRaw.trim();
      if (preambleContent) {
        if (this.countWords(preambleContent) > maxWords) {
          chunks.push(
            ...this.splitLargeSection(fileId, preambleRaw, null, 0, 0)
          );
        } else {
          chunks.push(
            this.makeChunk(
              fileId,
              0,
              null,
              preambleContent,
              0,
              offsetAt(firstHeadingLine)
            )
          );
        }
      }
    }

    // Process each heading section
    for (const [i, heading] of headings.entries()) {
      const nextHeading = headings[i + 1];

      // Section runs from this heading's line up to (not including) the next
      // heading's line, or to the end of the document.
      const startLine = clampLine(heading.line_number - 1); // 0-indexed
      const endLine = nextHeading
        ? clampLine(nextHeading.line_number - 1)
        : lines.length;

      const sectionContent = lines.slice(startLine, endLine).join("\n");

      // Get context (parent headings)
      const context = this.config.preserveContext
        ? this.getHeadingContext(heading, headingMap)
        : heading.text;

      if (this.countWords(sectionContent) > maxWords) {
        // Section is too large: split it further
        chunks.push(
          ...this.splitLargeSection(
            fileId,
            sectionContent,
            context,
            offsetAt(startLine),
            chunks.length
          )
        );
      } else {
        chunks.push(
          this.makeChunk(
            fileId,
            chunks.length,
            context,
            sectionContent,
            offsetAt(startLine),
            offsetAt(endLine)
          )
        );
      }
    }

    return chunks;
  }

  /**
   * Split a large section into overlapping word windows.
   *
   * @param content - Section text exactly as it appears in the document.
   * @param heading - Heading context stored on every produced chunk.
   * @param baseOffset - Character offset of `content` within the document;
   *   added to each window's local range to produce document offsets.
   * @param baseIndex - `chunk_index` assigned to the first produced chunk.
   */
  private splitLargeSection(
    fileId: string,
    content: string,
    heading: MarkdownChunk["heading"],
    baseOffset: number,
    baseIndex: number
  ): MarkdownChunk[] {
    return this.slidingWindows(this.tokenize(content)).map((window, i) =>
      this.makeChunk(
        fileId,
        baseIndex + i,
        heading,
        window.text,
        baseOffset + window.start,
        baseOffset + window.end
      )
    );
  }

  /**
   * Chunk content by size without respecting headings.
   * Returns an empty array for empty or whitespace-only content.
   */
  private chunkBySize(fileId: string, content: string): MarkdownChunk[] {
    // Offsets are intentionally not tracked in this mode (both stay 0).
    return this.slidingWindows(this.tokenize(content)).map((window, i) =>
      this.makeChunk(fileId, i, null, window.text, 0, 0)
    );
  }

  /**
   * Assemble a chunk record, deriving word and token counts from `content`.
   */
  private makeChunk(
    fileId: string,
    chunkIndex: number,
    heading: MarkdownChunk["heading"],
    content: string,
    startOffset: number,
    endOffset: number
  ): MarkdownChunk {
    return {
      id: nanoid(),
      file_id: fileId,
      chunk_index: chunkIndex,
      heading,
      content,
      start_offset: startOffset,
      end_offset: endOffset,
      token_count: this.estimateTokens(content),
      word_count: this.countWords(content),
    };
  }

  /**
   * Normalise the configured window into usable numbers.
   *
   * - `size` is at least 1 word.
   * - `step` is how far the window advances each iteration. An overlap that
   *   is negative is treated as 0; an overlap that is not smaller than the
   *   window is ignored, so the window always advances and no content is
   *   skipped or dropped.
   */
  private resolveWindow(): { size: number; step: number } {
    const size = Math.max(1, Math.floor(this.config.maxChunkSize) || 1);
    const overlap = Math.max(0, Math.floor(this.config.overlapSize) || 0);
    const step = overlap < size ? size - overlap : size;
    return { size, step };
  }

  /**
   * Scan text into non-whitespace runs, remembering where each one starts.
   * Leading/trailing whitespace never yields empty words.
   */
  private tokenize(text: string): WordSpan[] {
    const spans: WordSpan[] = [];
    const wordPattern = /\S+/g;
    let match: RegExpExecArray | null;
    while ((match = wordPattern.exec(text)) !== null) {
      spans.push({ text: match[0], start: match.index });
    }
    return spans;
  }

  /**
   * Group words into windows of the configured size, each starting `step`
   * words after the previous one so that neighbours share the overlap.
   */
  private slidingWindows(words: WordSpan[]): WordWindow[] {
    const { size, step } = this.resolveWindow();
    const windows: WordWindow[] = [];

    for (let from = 0; from < words.length; from += step) {
      const slice = words.slice(from, from + size);
      const first = slice[0];
      const last = slice[slice.length - 1];
      if (first === undefined || last === undefined) {
        break;
      }

      windows.push({
        text: slice.map((word) => word.text).join(" "),
        start: first.start,
        end: last.start + last.text.length,
      });

      // This window already reached the final word; another iteration would
      // only repeat overlap words that are contained in this chunk.
      if (from + size >= words.length) {
        break;
      }
    }

    return windows;
  }

  /**
   * Get full heading context: the heading's ancestors followed by the heading
   * itself, joined with " > ". The walk stops at a heading without a
   * `parent_id`, at a `parent_id` missing from the map, or when a heading is
   * reached a second time (guards against cyclic parent links).
   */
  private getHeadingContext(
    heading: MarkdownHeading,
    headingMap: Map<string, MarkdownHeading>
  ): string {
    const parts: string[] = [];
    const visited = new Set<string>();
    let current: MarkdownHeading | undefined = heading;

    // Walk up the hierarchy
    while (current && !visited.has(current.id)) {
      visited.add(current.id);
      parts.unshift(current.text);
      current = current.parent_id
        ? headingMap.get(current.parent_id)
        : undefined;
    }

    return parts.join(" > ");
  }

  /**
   * Build a table mapping each line index to the character offset (UTF-16
   * code units, as used by `String.prototype.length`) where that line starts.
   * One extra entry at index `lines.length` holds `totalLength`, so the end
   * of the last line does not count a newline that is not there.
   */
  private buildLineOffsets(lines: string[], totalLength: number): number[] {
    const offsets: number[] = [];
    let offset = 0;
    for (const line of lines) {
      offsets.push(offset);
      offset += line.length + 1; // +1 for the "\n" removed by split
    }
    offsets.push(totalLength);
    return offsets;
  }

  /**
   * Count whitespace-separated words in text (0 for blank text).
   */
  private countWords(text: string): number {
    const words = text.trim().split(/\s+/);
    return words[0] === "" ? 0 : words.length;
  }

  /**
   * Estimate token count (rough approximation)
   * Tokens ≈ words * 1.3 (conservative estimate for English)
   */
  private estimateTokens(text: string): number {
    return Math.ceil(this.countWords(text) * 1.3);
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/xiaolai/claude-writers-aid-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.