Skip to main content
Glama
by microsoft
mdchunk.ts4.6 kB
/** * Chunks markdown into sections based on headings while maintaining subtrees. * Handles WorkspaceFile objects and plain markdown strings. * Does not reliably handle code sections containing markdown. * @param markdown - The markdown content as a string or a WorkspaceFile object. If a WorkspaceFile, its content is used. Throws if encoding is base64. * @param approximateTokens - Function to estimate token count of text. Used to calculate chunk sizes. * @param options - Optional configuration including maxTokens (default 4096) and pageSeparator (default "======"). * @returns Array of TextChunk objects representing the chunks, including metadata such as filename and line range. */ export async function chunkMarkdown( markdown: string | WorkspaceFile, approximateTokens: (text: string) => number, options?: { maxTokens?: number pageSeparator?: string } ): Promise<TextChunk[]> { const { maxTokens = 4096, pageSeparator = "======" } = options || {} if (!markdown) return [] type Section = { heading: string; lines: string[]; level: number } const filename = typeof markdown === "object" ? markdown.filename : "" if (typeof markdown === "object") { if (markdown.encoding === "base64") throw new Error("base64 encoding not supported") markdown = markdown.content } const lines = markdown.split(/\r?\n/g) const sections: Section[] = [] let current: Section | null = null lines.forEach((line) => { if (line.startsWith(pageSeparator)) { if (current) sections.push(current) current = null return } const match = /^(\#{1,6})\s+(.*)/.exec(line) if (match) { if (current) sections.push(current) current = { heading: match[2], lines: [line], level: match[1].length, } return } if (!current) current = { heading: "", lines: [], level: 0 } current.lines.push(line) }) if (current) sections.push(current) const chunks: string[] = [] let tempChunk: Section[] = [] let tokenCount = 0 for (let i = 0; i < sections.length; i++) { const sectionTokens = sectionTokenCount(sections[i], approximateTokens) if (sectionTokens > maxTokens) { if (tempChunk.length) { chunks.push(buildChunk(tempChunk)) tempChunk = [] tokenCount = 0 } chunks.push(buildChunk([sections[i]])) continue } if (tokenCount + sectionTokens <= maxTokens) { tempChunk.push(sections[i]) tokenCount += sectionTokens } else { // Instead of discarding, gather removed sections and prepend them to the new chunk const removedSections: Section[] = [] let j = i while ( j > 0 && sections[j].level > sections[j - 1].level && tokenCount + sectionTokens > maxTokens && tempChunk.length ) { const removed = tempChunk.pop() if (removed) { removedSections.unshift(removed) tokenCount -= sectionTokenCount(removed, approximateTokens) } j-- } // Close off current chunk if (tempChunk.length) { chunks.push(buildChunk(tempChunk)) } // Start the new chunk with removed and current tempChunk = [...removedSections, sections[i]] tokenCount = tempChunk.reduce( (acc, sec) => acc + sectionTokenCount(sec, approximateTokens), 0 ) } } if (tempChunk.length) chunks.push(buildChunk(tempChunk)) // convert into text chunk let currentLine = 0 return chunks.map( (chunk, i) => ({ filename: filename + `#chunk${i}`, lineStart: currentLine, lineEnd: (currentLine += chunk.split(/\r?\n/g).length), content: chunk, }) satisfies TextChunk ) function sectionTokenCount( section: { lines: string[] }, tokenCount: (txt: string) => number ) { return section.lines.reduce((acc, line) => acc + tokenCount(line), 0) } function buildChunk(sections: { lines: string[] }[]) { return sections.map((s) => s.lines.join("\n")).join("\n") } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/microsoft/genaiscript'

If you have feedback or need assistance with the MCP directory API, please join our Discord server