Skip to main content
Glama
chunking.service.ts (5.61 kB)
import { Injectable } from '@nestjs/common';
import { Chunk, ChunkOptions, logger } from '@snakagent/core';

/**
 * Internal rolling token buffer shared by the buffered chunking strategies.
 * It is not exported; it only exists to type the helper returned by
 * `ChunkingService.createAccumulator`.
 */
interface ChunkAccumulator {
  /** Chunks emitted so far, in emission order. */
  readonly chunks: Chunk[];
  /** Number of tokens currently buffered (carried overlap + pending tokens). */
  size(): number;
  /** Appends tokens to the end of the buffer. */
  push(tokens: string[]): void;
  /** Emits at most one chunk; a no-op when nothing new is buffered. */
  flush(): void;
  /** Emits chunks until fewer than `chunkSize` tokens remain buffered. */
  flushFull(): void;
}

/**
 * Splits text into overlapping chunks of whitespace-delimited tokens.
 *
 * Three strategies are available via `ChunkOptions.strategy`:
 * - `'whitespace'`: a fixed-size sliding window over all tokens.
 * - `'structured'`: line-aware; a chunk is closed early when the next line
 *   would not fit.
 * - anything else (default `'adaptive'`): paragraph- and heading-aware.
 */
@Injectable()
export class ChunkingService {
  /**
   * Chunks `text` using the strategy selected in `options`.
   *
   * @param documentId Identifier used as prefix of every chunk id and stored
   *   in each chunk's metadata.
   * @param text Raw text to split. Empty or whitespace-only text yields `[]`.
   * @param options `chunkSize` and `overlap` are measured in tokens;
   *   `strategy` defaults to `'adaptive'`.
   * @returns The chunks in document order.
   * @throws Error when `chunkSize <= 0`, `overlap < 0` or
   *   `overlap >= chunkSize`. Errors are logged and rethrown.
   */
  async chunkText(
    documentId: string,
    text: string,
    options: ChunkOptions
  ): Promise<Chunk[]> {
    const { chunkSize, overlap, strategy = 'adaptive' } = options;

    try {
      if (!text || text.trim().length === 0) {
        return [];
      }

      if (chunkSize <= 0) {
        throw new Error('Chunk size must be positive');
      }
      if (overlap < 0) {
        throw new Error('Overlap cannot be negative');
      }
      // overlap < chunkSize guarantees every window advances by >= 1 token,
      // which is what makes all the chunking loops below terminate.
      if (overlap >= chunkSize) {
        throw new Error('Overlap must be less than chunk size');
      }

      if (strategy === 'whitespace') {
        return this.chunkByWhitespace(documentId, text, chunkSize, overlap);
      }
      if (strategy === 'structured') {
        return this.chunkStructured(documentId, text, chunkSize, overlap);
      }
      return this.chunkAdaptive(documentId, text, chunkSize, overlap);
    } catch (err) {
      logger.error(`Chunking failed:`, err);
      throw err;
    }
  }

  /** Splits on runs of whitespace and drops empty tokens. */
  private tokenize(text: string): string[] {
    return text.split(/\s+/).filter((token) => token.length > 0);
  }

  /**
   * Creates the rolling buffer used by the adaptive and structured
   * strategies. `startToken`/`endToken` in the emitted metadata are offsets
   * into the stream of tokens pushed into the accumulator.
   */
  private createAccumulator(
    documentId: string,
    chunkSize: number,
    overlap: number
  ): ChunkAccumulator {
    const chunks: Chunk[] = [];
    let buffer: string[] = [];
    // Stream offset of buffer[0].
    let startToken = 0;
    // Count of trailing buffer tokens that no emitted chunk contains yet.
    // Anything before them is overlap carried over from the previous chunk.
    let pending = 0;

    const flush = (): void => {
      // Skip when the buffer is empty or holds only carried overlap:
      // emitting it would duplicate the tail of the previous chunk.
      if (pending === 0) {
        return;
      }

      const end = Math.min(chunkSize, buffer.length);
      const chunkTokens = buffer.slice(0, end);
      const endToken = startToken + chunkTokens.length;
      const chunkIndex = chunks.length;

      chunks.push({
        id: `${documentId}-${chunkIndex}`,
        text: chunkTokens.join(' '),
        metadata: {
          documentId,
          chunkIndex,
          startToken,
          endToken,
        },
      });

      // A short chunk may carry fewer than `overlap` tokens, so the next
      // start offset must use the number actually carried.
      const carried = chunkTokens.slice(Math.max(0, end - overlap));
      const remaining = buffer.slice(end);
      buffer = carried.concat(remaining);
      startToken = endToken - carried.length;
      // Carried overlap is always shorter than chunkSize, so everything
      // past `end` is still unemitted.
      pending = remaining.length;
    };

    return {
      chunks,
      size: () => buffer.length,
      push: (tokens: string[]): void => {
        buffer.push(...tokens);
        pending += tokens.length;
      },
      flush,
      flushFull: (): void => {
        // Each flush shrinks the buffer by chunkSize - overlap >= 1 tokens.
        while (buffer.length >= chunkSize) {
          flush();
        }
      },
    };
  }

  /**
   * Fixed-size sliding window over all tokens, advancing by
   * `chunkSize - overlap` tokens per chunk.
   */
  private chunkByWhitespace(
    documentId: string,
    text: string,
    chunkSize: number,
    overlap: number
  ): Chunk[] {
    const tokens = this.tokenize(text);
    const chunks: Chunk[] = [];
    const step = chunkSize - overlap;

    for (let start = 0; start < tokens.length; start += step) {
      const end = Math.min(start + chunkSize, tokens.length);
      const chunkIndex = chunks.length;

      chunks.push({
        id: `${documentId}-${chunkIndex}`,
        text: tokens.slice(start, end).join(' '),
        metadata: {
          documentId,
          chunkIndex,
          startToken: start,
          endToken: end,
        },
      });

      // Stop once the window reaches the last token; a further window would
      // contain nothing but overlap.
      if (end === tokens.length) {
        break;
      }
    }

    return chunks;
  }

  /**
   * Paragraph- and heading-aware chunking. Consecutive non-blank lines are
   * joined into paragraphs. Lines matching `#+ ` are headings and always
   * start a new chunk.
   */
  private chunkAdaptive(
    documentId: string,
    text: string,
    chunkSize: number,
    overlap: number
  ): Chunk[] {
    const segments: { text: string; heading: boolean }[] = [];
    let paragraph = '';

    const closeParagraph = (): void => {
      if (paragraph) {
        segments.push({ text: paragraph, heading: false });
        paragraph = '';
      }
    };

    for (const rawLine of text.split(/\n/)) {
      const line = rawLine.trim();

      if (line === '') {
        closeParagraph();
        continue;
      }

      if (/^#+\s+/.test(line)) {
        closeParagraph();
        segments.push({ text: line, heading: true });
        continue;
      }

      paragraph = paragraph ? `${paragraph} ${line}` : line;
    }
    closeParagraph();

    const acc = this.createAccumulator(documentId, chunkSize, overlap);

    for (const seg of segments) {
      const segTokens = this.tokenize(seg.text);

      // Close the current chunk before a heading, or before a paragraph
      // that would not fit in what is left of the current chunk.
      if (seg.heading || acc.size() + segTokens.length > chunkSize) {
        acc.flush();
      }

      acc.push(segTokens);
      // Drain every full window. A single flush is not enough for a heading
      // or paragraph longer than one chunk and would lose the tail.
      acc.flushFull();
    }

    acc.flush();
    return acc.chunks;
  }

  /**
   * Line-aware chunking: tokens are accumulated line by line and the current
   * chunk is closed early when the next line would not fit.
   */
  private chunkStructured(
    documentId: string,
    text: string,
    chunkSize: number,
    overlap: number
  ): Chunk[] {
    const acc = this.createAccumulator(documentId, chunkSize, overlap);

    for (const line of text.split(/\n/)) {
      const lineTokens = this.tokenize(line);

      if (acc.size() + lineTokens.length > chunkSize) {
        acc.flush();
      }

      acc.push(lineTokens);
      acc.flushFull();
    }

    acc.flush();
    return acc.chunks;
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/KasarLabs/snak'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.