Skip to main content
Glama
chunker.ts (6.55 kB)
/**
 * Smart document chunking for better search results.
 * Splits documents into searchable chunks while preserving context.
 */
import type { DocumentPage, DocumentSection, ChunkOptions } from './types.js';

/** A single searchable piece produced from a document page. */
export interface DocumentChunk {
  /** Hash of the document URL plus a `-title`, `-full` or `-<order>` suffix. */
  id: string;
  documentUrl: string;
  title: string;
  category: string;
  loader: string;
  chunkType: 'title' | 'section' | 'code' | 'full';
  content: string;
  /** Only set on `code` chunks. */
  codeLanguage?: string;
  /** Only set on `code` and `section` chunks. */
  sectionHeading?: string;
  /** Only set on `code` and `section` chunks. */
  sectionLevel?: number;
  /** Position of this chunk within the document's chunk list. */
  order: number;
  metadata: {
    hasCode: boolean;
    wordCount: number;
    difficulty?: string;
    tags: string[];
  };
}

/**
 * Turns a `DocumentPage` into an ordered list of `DocumentChunk`s: one title
 * chunk, then per section its code-block chunks followed by its text chunks,
 * or a single full-document chunk when the page has no sections.
 */
export class DocumentChunker {
  private readonly options: ChunkOptions;

  /**
   * @param options Overrides for the defaults (`maxChunkSize` 1000,
   *   `overlapSize` 100, `preserveCodeBlocks` true, `minChunkSize` 50).
   *   Sizes are measured in characters.
   */
  constructor(options: Partial<ChunkOptions> = {}) {
    this.options = {
      maxChunkSize: 1000, // characters
      overlapSize: 100, // characters of overlap between chunks
      preserveCodeBlocks: true,
      minChunkSize: 50,
      ...options,
    };
  }

  /**
   * Chunk a document page into searchable pieces.
   *
   * @param doc The page to chunk.
   * @returns Chunks whose `order` values run consecutively from 0.
   */
  chunkDocument(doc: DocumentPage): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    let order = 0;

    // Always create a title chunk for context.
    chunks.push(this.createTitleChunk(doc, order++));

    // Chunk each section; every section continues the running order counter.
    for (const section of doc.sections) {
      const sectionChunks = this.chunkSection(doc, section, order);
      chunks.push(...sectionChunks);
      order += sectionChunks.length;
    }

    // If there are no sections, fall back to a single full-document chunk.
    if (doc.sections.length === 0 && doc.content.length > 0) {
      chunks.push(this.createFullDocChunk(doc, order++));
    }

    return chunks;
  }

  /**
   * Create the title/intro chunk: the title followed by the first
   * `maxChunkSize` characters of the document content.
   */
  private createTitleChunk(doc: DocumentPage, order: number): DocumentChunk {
    const intro = doc.content.substring(0, this.options.maxChunkSize);

    return {
      id: `${this.hashString(doc.url)}-title`,
      documentUrl: doc.url,
      title: doc.title,
      category: doc.category,
      loader: doc.loader,
      chunkType: 'title',
      content: `${doc.title}\n\n${intro}`,
      order,
      metadata: {
        hasCode: false,
        wordCount: this.countWords(intro),
        difficulty: doc.metadata.difficulty,
        tags: doc.metadata.tags,
      },
    };
  }

  /**
   * Create a chunk holding the entire document content (used when the page
   * has no sections).
   */
  private createFullDocChunk(doc: DocumentPage, order: number): DocumentChunk {
    return {
      id: `${this.hashString(doc.url)}-full`,
      documentUrl: doc.url,
      title: doc.title,
      category: doc.category,
      loader: doc.loader,
      chunkType: 'full',
      content: doc.content,
      order,
      metadata: {
        hasCode: false,
        wordCount: this.countWords(doc.content),
        difficulty: doc.metadata.difficulty,
        tags: doc.metadata.tags,
      },
    };
  }

  /**
   * Chunk one section: code blocks first (when `preserveCodeBlocks` is set),
   * then the section text split into overlapping pieces.
   *
   * @param doc The page the section belongs to.
   * @param section The section to chunk.
   * @param startOrder The `order` value assigned to the first chunk produced.
   * @returns The section's chunks, with consecutive `order` values.
   */
  private chunkSection(
    doc: DocumentPage,
    section: DocumentSection,
    startOrder: number
  ): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    // The URL hash is identical for every chunk of the page; compute it once.
    const docHash = this.hashString(doc.url);
    let order = startOrder;

    // Create code block chunks first (if preserving).
    if (this.options.preserveCodeBlocks && section.codeBlocks.length > 0) {
      for (const codeBlock of section.codeBlocks) {
        chunks.push({
          id: `${docHash}-${order}`,
          documentUrl: doc.url,
          title: doc.title,
          category: doc.category,
          loader: doc.loader,
          chunkType: 'code',
          content: `${section.heading}\n\n${codeBlock.caption || ''}\n\n${codeBlock.code}`,
          codeLanguage: codeBlock.language,
          sectionHeading: section.heading,
          sectionLevel: section.level,
          order: order++,
          metadata: {
            hasCode: true,
            wordCount: this.countWords(codeBlock.code),
            difficulty: doc.metadata.difficulty,
            tags: doc.metadata.tags,
          },
        });
      }
    }

    // Create section text chunks; sections shorter than minChunkSize are skipped.
    if (section.content.trim().length >= this.options.minChunkSize) {
      for (const textChunk of this.splitText(section.content)) {
        chunks.push({
          id: `${docHash}-${order}`,
          documentUrl: doc.url,
          title: doc.title,
          category: doc.category,
          loader: doc.loader,
          chunkType: 'section',
          content: `${section.heading}\n\n${textChunk}`,
          sectionHeading: section.heading,
          sectionLevel: section.level,
          order: order++,
          metadata: {
            hasCode: section.codeBlocks.length > 0,
            wordCount: this.countWords(textChunk),
            difficulty: doc.metadata.difficulty,
            tags: doc.metadata.tags,
          },
        });
      }
    }

    return chunks;
  }

  /**
   * Split long text into overlapping chunks of at most `maxChunkSize`
   * characters, preferring to end a chunk just after the last `.` or newline
   * inside the window.
   *
   * Text that already fits is returned as a single, untrimmed element. Every
   * other chunk is trimmed, and chunks that are empty after trimming are
   * dropped.
   *
   * @param text The text to split.
   * @returns The chunks in document order.
   */
  private splitText(text: string): string[] {
    if (text.length <= this.options.maxChunkSize) {
      return [text];
    }

    // Clamp the options so a misconfiguration can neither stall the loop
    // (non-positive window) nor skip text (negative overlap).
    const maxSize = Math.max(1, this.options.maxChunkSize);
    const minSize = Math.max(0, this.options.minChunkSize);
    const overlap = Math.max(0, this.options.overlapSize);

    const chunks: string[] = [];
    let start = 0;

    while (start < text.length) {
      let end = Math.min(start + maxSize, text.length);

      // Try to break at a sentence or line boundary. The search starts at the
      // last character inside the window (end - 1) so that the chunk never
      // grows beyond maxSize.
      if (end < text.length) {
        const lastInWindow = end - 1;
        const breakPoint = Math.max(
          text.lastIndexOf('.', lastInWindow),
          text.lastIndexOf('\n', lastInWindow)
        );
        if (breakPoint > start + minSize) {
          end = breakPoint + 1;
        }
      }

      const chunk = text.substring(start, end).trim();
      if (chunk.length > 0) {
        chunks.push(chunk);
      }

      // Once a chunk reaches the end of the text we are done. Stepping back by
      // the overlap here would only re-emit shrinking copies of the tail.
      if (end >= text.length) {
        break;
      }

      // Step back by the overlap, unless that would fail to advance past the
      // current start; in that case continue without overlap rather than
      // emitting near-duplicate chunks.
      const overlappedStart = end - overlap;
      start = overlappedStart > start ? overlappedStart : end;
    }

    return chunks;
  }

  /**
   * Count whitespace-separated words in text.
   *
   * @returns 0 for empty or whitespace-only text.
   */
  private countWords(text: string): number {
    const trimmed = text.trim();
    // ''.split(/\s+/) yields [''], which would otherwise count as one word.
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  }

  /**
   * Simple 32-bit string hash used to derive chunk IDs.
   *
   * @returns The absolute value of the hash, rendered in base 36.
   */
  private hashString(str: string): string {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      // hash * 31 + charCode, truncated to a signed 32-bit integer.
      hash = ((hash << 5) - hash + str.charCodeAt(i)) | 0;
    }
    return Math.abs(hash).toString(36);
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/OGMatrix/mcmodding-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.