Doclea MCP

Official

Overview Schema Related Servers Score Discussions

doclea-mcp
src
chunking

markdown.ts•15 KiB

/** * Semantic markdown chunking * * Intelligently splits markdown documents while respecting: * - Document structure (headers, code blocks) * - Token limits per chunk * - Source location tracking */ import { countTokens, splitIntoTokenChunks } from "../utils/tokens"; /** * Metadata for a markdown chunk */ export interface MarkdownChunkMetadata { /** Starting line number (1-based) */ startLine: number; /** Ending line number (1-based, inclusive) */ endLine: number; /** Header hierarchy leading to this chunk */ headers: string[]; /** Header level of this section (1-6, or 0 for content without header) */ level: number; /** Whether this chunk contains frontmatter */ hasFrontmatter: boolean; /** Whether this chunk contains code blocks */ hasCodeBlock: boolean; } /** * A chunk of markdown content with metadata */ export interface MarkdownChunk { /** The markdown content */ content: string; /** Metadata about the chunk's location and context */ metadata: MarkdownChunkMetadata; /** Approximate token count */ tokenCount: number; } /** * Options for markdown chunking */ export interface MarkdownChunkOptions { /** Maximum tokens per chunk (default: 256) */ maxTokens?: number; /** Overlap tokens between chunks for context continuity (default: 0) */ overlap?: number; /** Whether to include header hierarchy in each chunk (default: true) */ includeHeaderContext?: boolean; } // Regex patterns const FRONTMATTER_REGEX = /^---\n([\s\S]*?)\n---\n?/; const EMPTY_FRONTMATTER_REGEX = /^---\n---\n?/; const HEADER_REGEX = /^(#{1,6})\s+(.+)$/; const CODE_BLOCK_START = /^```(\w*)?$/; const CODE_BLOCK_END = /^```$/; /** * Parsed markdown section */ interface MarkdownSection { content: string; startLine: number; endLine: number; level: number; header: string | null; headers: string[]; hasFrontmatter: boolean; hasCodeBlock: boolean; } /** * Parse markdown into sections by headers */ function parseMarkdownSections(markdown: string): MarkdownSection[] { const lines = markdown.split("\n"); const sections: MarkdownSection[] = []; let currentSection: MarkdownSection | null = null; const headerStack: { level: number; text: string }[] = []; let inCodeBlock = false; let inFrontmatter = false; let frontmatterDone = false; for (let i = 0; i < lines.length; i++) { const line = lines[i]; const lineNum = i + 1; // 1-based // Track frontmatter (only at start) if (i === 0 && line === "---") { inFrontmatter = true; if (!currentSection) { currentSection = { content: line, startLine: lineNum, endLine: lineNum, level: 0, header: null, headers: [], hasFrontmatter: true, hasCodeBlock: false, }; } continue; } if (inFrontmatter) { if (currentSection) { currentSection.content += `\n${line}`; currentSection.endLine = lineNum; } if (line === "---") { inFrontmatter = false; frontmatterDone = true; } continue; } // Track code blocks (don't parse headers inside code blocks) if (CODE_BLOCK_START.test(line) && !inCodeBlock) { inCodeBlock = true; if (currentSection) { currentSection.hasCodeBlock = true; } } else if (CODE_BLOCK_END.test(line) && inCodeBlock) { inCodeBlock = false; } // Check for headers (only outside code blocks) const headerMatch = !inCodeBlock ? line.match(HEADER_REGEX) : null; if (headerMatch) { // Save previous section if (currentSection?.content.trim()) { sections.push(currentSection); } const level = headerMatch[1].length; const headerText = headerMatch[2].trim(); // Update header stack while ( headerStack.length > 0 && headerStack[headerStack.length - 1].level >= level ) { headerStack.pop(); } headerStack.push({ level, text: headerText }); // Start new section currentSection = { content: line, startLine: lineNum, endLine: lineNum, level, header: headerText, headers: headerStack.map((h) => h.text), hasFrontmatter: false, hasCodeBlock: false, }; } else { // Add content to current section if (!currentSection) { currentSection = { content: line, startLine: lineNum, endLine: lineNum, level: 0, header: null, headers: headerStack.map((h) => h.text), hasFrontmatter: frontmatterDone && lineNum <= 10, // Frontmatter was just before hasCodeBlock: inCodeBlock, }; } else { currentSection.content += `\n${line}`; currentSection.endLine = lineNum; if (inCodeBlock) { currentSection.hasCodeBlock = true; } } } } // Don't forget the last section if (currentSection?.content.trim()) { sections.push(currentSection); } return sections; } /** * Split a section that exceeds token limit while preserving code blocks */ async function splitLargeSection( section: MarkdownSection, maxTokens: number, ): Promise<MarkdownSection[]> { const lines = section.content.split("\n"); const result: MarkdownSection[] = []; let currentChunk: string[] = []; let currentStartLine = section.startLine; let currentTokens = 0; let inCodeBlock = false; let codeBlockBuffer: string[] = []; let codeBlockStartLine = 0; for (let i = 0; i < lines.length; i++) { const line = lines[i]; const lineNum = section.startLine + i; // Track code block boundaries if (CODE_BLOCK_START.test(line) && !inCodeBlock) { inCodeBlock = true; codeBlockBuffer = [line]; codeBlockStartLine = lineNum; continue; } if (inCodeBlock) { codeBlockBuffer.push(line); if (CODE_BLOCK_END.test(line)) { // Code block complete - add atomically inCodeBlock = false; const codeBlockContent = codeBlockBuffer.join("\n"); const codeBlockTokens = await countTokens(codeBlockContent); // If code block alone exceeds limit, it gets its own chunk if (codeBlockTokens > maxTokens) { // Flush current chunk first if (currentChunk.length > 0) { result.push({ content: currentChunk.join("\n"), startLine: currentStartLine, endLine: lineNum - codeBlockBuffer.length, level: section.level, header: result.length === 0 ? section.header : null, headers: section.headers, hasFrontmatter: result.length === 0 && section.hasFrontmatter, hasCodeBlock: false, }); currentChunk = []; } // Add code block as its own chunk result.push({ content: codeBlockContent, startLine: codeBlockStartLine, endLine: lineNum, level: section.level, header: null, headers: section.headers, hasFrontmatter: false, hasCodeBlock: true, }); currentStartLine = lineNum + 1; currentTokens = 0; } else if (currentTokens + codeBlockTokens > maxTokens) { // Flush current chunk, start new one with code block if (currentChunk.length > 0) { result.push({ content: currentChunk.join("\n"), startLine: currentStartLine, endLine: codeBlockStartLine - 1, level: section.level, header: result.length === 0 ? section.header : null, headers: section.headers, hasFrontmatter: result.length === 0 && section.hasFrontmatter, hasCodeBlock: false, }); } currentChunk = [codeBlockContent]; currentStartLine = codeBlockStartLine; currentTokens = codeBlockTokens; } else { // Add code block to current chunk currentChunk.push(codeBlockContent); currentTokens += codeBlockTokens; } codeBlockBuffer = []; } continue; } // Regular line handling const lineTokens = await countTokens(line); // Handle single line that exceeds token limit - split by tokens if (lineTokens > maxTokens) { // Flush current chunk first if (currentChunk.length > 0) { result.push({ content: currentChunk.join("\n"), startLine: currentStartLine, endLine: lineNum - 1, level: section.level, header: result.length === 0 ? section.header : null, headers: section.headers, hasFrontmatter: result.length === 0 && section.hasFrontmatter, hasCodeBlock: currentChunk.some((l) => l.startsWith("```")), }); currentChunk = []; currentTokens = 0; } // Split the long line into token-based chunks const lineChunks = await splitIntoTokenChunks(line, maxTokens, 0); for (const lineChunk of lineChunks) { result.push({ content: lineChunk, startLine: lineNum, endLine: lineNum, level: section.level, header: result.length === 0 ? section.header : null, headers: section.headers, hasFrontmatter: result.length === 0 && section.hasFrontmatter, hasCodeBlock: false, }); } currentStartLine = lineNum + 1; } else if ( currentTokens + lineTokens > maxTokens && currentChunk.length > 0 ) { // Flush current chunk result.push({ content: currentChunk.join("\n"), startLine: currentStartLine, endLine: lineNum - 1, level: section.level, header: result.length === 0 ? section.header : null, headers: section.headers, hasFrontmatter: result.length === 0 && section.hasFrontmatter, hasCodeBlock: currentChunk.some((l) => l.startsWith("```")), }); currentChunk = [line]; currentStartLine = lineNum; currentTokens = lineTokens; } else { currentChunk.push(line); currentTokens += lineTokens; } } // Flush remaining content if (currentChunk.length > 0) { result.push({ content: currentChunk.join("\n"), startLine: currentStartLine, endLine: section.endLine, level: section.level, header: result.length === 0 ? section.header : null, headers: section.headers, hasFrontmatter: result.length === 0 && section.hasFrontmatter, hasCodeBlock: currentChunk.some((l) => l.startsWith("```")), }); } return result.length > 0 ? result : [section]; } /** * Chunk a markdown document into semantic sections * * @param markdown - The markdown content to chunk * @param options - Chunking options * @returns Array of markdown chunks with metadata * * @example * const chunks = await chunkMarkdown(content, { maxTokens: 256 }); * for (const chunk of chunks) { * console.log(`Lines ${chunk.metadata.startLine}-${chunk.metadata.endLine}`); * console.log(`Headers: ${chunk.metadata.headers.join(' > ')}`); * console.log(chunk.content); * } */ export async function chunkMarkdown( markdown: string, options: MarkdownChunkOptions = {}, ): Promise<MarkdownChunk[]> { const { maxTokens = 256, overlap = 0, includeHeaderContext = true } = options; if (!markdown || !markdown.trim()) { return []; } // Parse into sections by headers const sections = parseMarkdownSections(markdown); if (sections.length === 0) { return []; } // Process sections, splitting large ones const processedSections: MarkdownSection[] = []; for (const section of sections) { const tokenCount = await countTokens(section.content); if (tokenCount <= maxTokens) { processedSections.push(section); } else { // Split large section const subSections = await splitLargeSection(section, maxTokens); processedSections.push(...subSections); } } // Convert to chunks with metadata const chunks: MarkdownChunk[] = []; for (const section of processedSections) { let content = section.content; // Optionally prepend header context if (includeHeaderContext && section.headers.length > 0 && !section.header) { const _contextHeader = section.headers .map((h, i) => `${"#".repeat(i + 1)} ${h}`) .join("\n"); // Only add if it doesn't duplicate existing headers in content if (!content.startsWith("#")) { content = `\n${content}`; } } const tokenCount = await countTokens(content); chunks.push({ content, tokenCount, metadata: { startLine: section.startLine, endLine: section.endLine, headers: section.headers, level: section.level, hasFrontmatter: section.hasFrontmatter, hasCodeBlock: section.hasCodeBlock, }, }); } return chunks; } /** * Extract frontmatter from markdown * * @param markdown - The markdown content * @returns Object with frontmatter (if present) and remaining content */ export function extractFrontmatter(markdown: string): { frontmatter: string | null; content: string; frontmatterLines: number; } { // Check for empty frontmatter first const emptyMatch = markdown.match(EMPTY_FRONTMATTER_REGEX); if (emptyMatch) { return { frontmatter: "", content: markdown.slice(emptyMatch[0].length), frontmatterLines: emptyMatch[0].split("\n").length, }; } const match = markdown.match(FRONTMATTER_REGEX); if (match) { const frontmatter = match[1]; const frontmatterLines = match[0].split("\n").length; return { frontmatter, content: markdown.slice(match[0].length), frontmatterLines, }; } return { frontmatter: null, content: markdown, frontmatterLines: 0, }; } /** * Get the header hierarchy at a specific line * * @param markdown - The markdown content * @param lineNumber - The line number (1-based) * @returns Array of headers leading to that line */ export function getHeadersAtLine( markdown: string, lineNumber: number, ): string[] { const lines = markdown.split("\n"); const headerStack: { level: number; text: string }[] = []; let inCodeBlock = false; for (let i = 0; i < Math.min(lineNumber, lines.length); i++) { const line = lines[i]; // Track code blocks if (CODE_BLOCK_START.test(line) && !inCodeBlock) { inCodeBlock = true; continue; } if (CODE_BLOCK_END.test(line) && inCodeBlock) { inCodeBlock = false; continue; } if (!inCodeBlock) { const headerMatch = line.match(HEADER_REGEX); if (headerMatch) { const level = headerMatch[1].length; const text = headerMatch[2].trim(); // Pop headers of same or higher level while ( headerStack.length > 0 && headerStack[headerStack.length - 1].level >= level ) { headerStack.pop(); } headerStack.push({ level, text }); } } } return headerStack.map((h) => h.text); }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/docleaai/doclea-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

markdown.ts•15 KiB