GitLab Docs MCP Server

chunker.ts•6.64 KiB

/**
 * Document chunking utilities for splitting large documents into manageable pieces.
 * Specifically handles the massive GraphQL reference documentation.
 */

import { logger } from '../utils/logger.js';

/**
 * Tunables that control how a document is split.
 */
export interface ChunkConfig {
  /**
   * Maximum size in characters before chunking is applied
   */
  maxChunkSize: number;

  /**
   * Section markers that indicate logical split points.
   * Each marker is interpolated verbatim (unescaped) into a multi-line regular
   * expression of the form `^<marker>\s*$`, so regex metacharacters in a
   * marker are interpreted as regex syntax.
   */
  sectionMarkers: string[];

  /**
   * Overlap between chunks to preserve context
   */
  overlapSize: number;
}

/**
 * One piece of a split document.
 */
export interface DocumentChunk {
  /**
   * Original document path with chunk suffix
   */
  path: string;

  /**
   * Title of the chunk
   */
  title: string;

  /**
   * Content of the chunk
   */
  content: string;

  /**
   * Which chunk this is (1-indexed)
   */
  chunkIndex: number;

  /**
   * Total number of chunks for this document
   */
  totalChunks: number;
}

/** A section marker occurrence located inside a document. */
interface SectionMatch {
  marker: string;
  position: number;
}

/** Path of the only document that is currently eligible for chunking. */
const GRAPHQL_REFERENCE_PATH = 'api/graphql/reference/_index.md';

/**
 * How far (in characters) on either side of the nominal cut point
 * `chunkBySize` looks for a blank line to break on.
 */
const PARAGRAPH_SEARCH_WINDOW = 100;

/**
 * A section is only split further by size once it exceeds
 * `maxChunkSize` multiplied by this factor.
 */
const OVERSIZED_SECTION_FACTOR = 1.5;

/**
 * Default configuration for GraphQL reference chunking
 * Note: These markers are based on GitLab's auto-generated GraphQL documentation structure.
 * If markers change, the system falls back to size-based chunking automatically.
 */
const GRAPHQL_CHUNK_CONFIG: ChunkConfig = {
  maxChunkSize: 500000, // 500,000 characters per chunk
  sectionMarkers: [
    'Query type',
    'Mutation type',
    'Object types',
    'Enumeration types',
    'Scalar types',
    'Abstract types',
    'Input types',
  ],
  overlapSize: 1000, // 1,000 characters of overlap to preserve context
};

/**
 * Checks if a document should be chunked based on size and path.
 *
 * @param path - Document path to test.
 * @param content - Full document text.
 * @returns `true` only for the GraphQL reference file when its length exceeds
 *   the default configuration's `maxChunkSize`.
 */
export function shouldChunkDocument(path: string, content: string): boolean {
  // Only chunk the massive GraphQL reference file
  return (
    path === GRAPHQL_REFERENCE_PATH &&
    content.length > GRAPHQL_CHUNK_CONFIG.maxChunkSize
  );
}

/**
 * Splits a large document into logical chunks based on section markers.
 *
 * Each chunk runs from one marker to the next (or to the end of the document).
 * Text that precedes the first marker is not included in any chunk. A section
 * longer than 1.5x `maxChunkSize` is split further by size. When no marker is
 * found at all, the whole document is split by size instead.
 *
 * @param path - Original document path; chunk paths are derived from it.
 * @param title - Original document title; chunk titles are derived from it.
 * @param content - Full document text.
 * @param config - Chunking configuration; defaults to the GraphQL reference settings.
 * @returns The chunks in document order, with `chunkIndex` numbered 1..N and
 *   `totalChunks` set to N on every chunk.
 * @throws RangeError if size-based splitting is needed and `config.maxChunkSize`
 *   is not a positive number.
 */
export function chunkDocument(
  path: string,
  title: string,
  content: string,
  config: ChunkConfig = GRAPHQL_CHUNK_CONFIG
): DocumentChunk[] {
  const sections = findSections(content, config.sectionMarkers);

  if (sections.length === 0) {
    // No sections found, fall back to size-based chunking
    logger.warn(`No section markers found in ${path}, using size-based chunking`);
    return chunkBySize(path, title, content, config);
  }

  logger.info(`Found ${sections.length} sections in ${path}, creating semantic chunks`);

  const chunks: DocumentChunk[] = [];

  sections.forEach((section, i) => {
    const end = sections[i + 1]?.position ?? content.length;
    const sectionContent = content.slice(section.position, end);
    const sectionPath = `${path}#${sanitizeMarker(section.marker)}`;
    const sectionTitle = `${title} - ${section.marker}`;

    if (sectionContent.length > config.maxChunkSize * OVERSIZED_SECTION_FACTOR) {
      // Still too large: split it further by size.
      chunks.push(...chunkBySize(sectionPath, sectionTitle, sectionContent, config));
    } else {
      chunks.push({
        path: sectionPath,
        title: sectionTitle,
        content: sectionContent,
        chunkIndex: 0, // Will be set below
        totalChunks: 0, // Will be set below
      });
    }
  });

  // Number every chunk across the whole document. Sub-chunks produced by
  // chunkBySize arrive with section-local indices, which would otherwise
  // collide with the indices of other sections.
  const totalChunks = chunks.length;
  chunks.forEach((chunk, index) => {
    chunk.chunkIndex = index + 1;
    chunk.totalChunks = totalChunks;
  });

  return chunks;
}

/**
 * Locates every line that consists of a section marker (optionally followed by
 * whitespace) and returns the occurrences ordered by position.
 */
function findSections(content: string, markers: readonly string[]): SectionMatch[] {
  const found: SectionMatch[] = [];

  for (const marker of markers) {
    const pattern = new RegExp(`^${marker}\\s*$`, 'gm');
    // matchAll always advances, even past a zero-width match, so a degenerate
    // marker cannot stall the scan.
    for (const match of content.matchAll(pattern)) {
      found.push({ marker, position: match.index ?? 0 });
    }
  }

  return found.sort((a, b) => a.position - b.position);
}

/**
 * Splits content by size when no logical sections are found.
 *
 * Each cut is nominally `maxChunkSize` characters after the previous one, but
 * is moved to a blank line (`\n\n`) when one exists within 100 characters of
 * that point. Every chunk after the first is extended backwards by
 * `overlapSize` characters.
 *
 * @throws RangeError if `content` is non-empty and `config.maxChunkSize` is not
 *   a positive number, since the split could never make progress.
 */
function chunkBySize(
  path: string,
  title: string,
  content: string,
  config: ChunkConfig
): DocumentChunk[] {
  if (content.length > 0 && !(config.maxChunkSize > 0)) {
    throw new RangeError(
      `maxChunkSize must be a positive number, got ${config.maxChunkSize}`
    );
  }

  const chunks: DocumentChunk[] = [];
  let position = 0;
  let chunkIndex = 1;

  while (position < content.length) {
    const end = Math.min(position + config.maxChunkSize, content.length);
    let chunkEnd = end;

    // Try to break at a paragraph boundary
    if (end < content.length) {
      // Never search at or before `position`: a boundary found there would
      // yield an empty step and the loop would not terminate.
      const searchFrom = Math.max(position + 1, end - PARAGRAPH_SEARCH_WINDOW);
      const boundary = content.indexOf('\n\n', searchFrom);
      if (boundary !== -1 && boundary < end + PARAGRAPH_SEARCH_WINDOW) {
        chunkEnd = boundary;
      }
    }

    chunks.push({
      path: `${path}#chunk-${chunkIndex}`,
      title: `${title} (Part ${chunkIndex})`,
      content: content.slice(Math.max(0, position - config.overlapSize), chunkEnd),
      chunkIndex,
      totalChunks: 0, // Will be set later
    });

    position = chunkEnd;
    chunkIndex++;
  }

  // Update total chunks count
  const totalChunks = chunks.length;
  for (const chunk of chunks) {
    chunk.totalChunks = totalChunks;
  }

  return chunks;
}

/**
 * Sanitizes a section marker for use in a path fragment: lower-cases it,
 * turns whitespace runs into `-`, and strips everything outside `a-z0-9-`.
 */
function sanitizeMarker(marker: string): string {
  return marker
    .toLowerCase()
    .replace(/\s+/g, '-')
    .replace(/[^a-z0-9-]/g, '');
}

/**
 * Creates a summary chunk with links to all other chunks.
 *
 * @param originalPath - Path of the original document. Currently unused; kept
 *   so the signature stays stable for callers.
 * @param originalTitle - Title used as the top-level heading.
 * @param chunks - Chunks to list; each becomes a Markdown link of its title to its path.
 * @returns The summary as Markdown text.
 */
export function createSummaryChunk(
  originalPath: string,
  originalTitle: string,
  chunks: DocumentChunk[]
): string {
  const summaryLines: string[] = [
    `# ${originalTitle}`,
    '',
    '> **Note:** This document has been split into multiple sections for better performance.',
    '',
    '## Available Sections',
    '',
    ...chunks.map((chunk) => `- [${chunk.title}](${chunk.path})`),
    '',
    '## About This Reference',
    '',
    'This is the auto-generated GraphQL API reference for GitLab. Each section contains detailed information about:',
    '',
    '- **Query types**: Top-level entry points for read operations',
    '- **Mutation types**: Entry points for write operations',
    '- **Object types**: Resource representations in the API',
    '- **Enumeration types**: Predefined value sets',
    '- **Scalar types**: Basic data types',
    '- **Abstract types**: Unions and interfaces',
    '- **Input types**: Arguments for mutations and queries',
    '',
    'Use the interactive GraphQL explorer to test queries, or generate a machine-readable schema in IDL or JSON formats.',
  ];

  return summaryLines.join('\n');
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ozanmutlu/Gitlab-Docs-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

chunker.ts•6.64 KiB