folder-mcp

Overview Schema Related Servers Score Discussions

excel-chunking.ts•12.8 KiB

/** * Sprint 11: Excel Document Format-Aware Chunking * * Implements chunking that respects Excel sheet structure using xlsx's * natural cell address coordinate system (A1, B2, etc.) for precise extraction. */ import XLSX from 'xlsx'; import { ParsedContent, TextChunk, ChunkedContent, ExcelMetadata, createDefaultSemanticMetadata } from '../../types/index.js'; /** * Represents a worksheet structure for chunking */ interface WorksheetStructure { name: string; index: number; range: XLSX.Range | string | null; // Can be either Range object or string like "A1:Z100" rowCount: number; colCount: number; csvContent: string; jsonData: any[][]; hasFormulas: boolean; formulas: string[]; } /** * Service for Excel document format-aware chunking */ export class ExcelChunkingService { private readonly DEFAULT_MAX_TOKENS = 1000; private readonly DEFAULT_MIN_TOKENS = 100; private readonly DEFAULT_MAX_ROWS_PER_CHUNK = 50; /** * Chunk an Excel document respecting sheet boundaries */ public chunkExcelDocument( content: ParsedContent, maxTokens: number = this.DEFAULT_MAX_TOKENS, minTokens: number = this.DEFAULT_MIN_TOKENS ): ChunkedContent { if (content.type !== 'excel') { throw new Error('Content must be an Excel document'); } const metadata = content.metadata as ExcelMetadata; // Get worksheet structures from metadata const worksheets = metadata.worksheets as WorksheetStructure[]; if (!worksheets || worksheets.length === 0) { throw new Error('Excel document has no worksheets'); } // Create chunks respecting sheet boundaries const chunks = this.createSheetAwareChunks( worksheets, content.content, maxTokens, minTokens ); return { originalContent: content, chunks, totalChunks: chunks.length }; } /** * Create chunks respecting sheet boundaries */ private createSheetAwareChunks( worksheets: WorksheetStructure[], fullText: string, maxTokens: number, minTokens: number ): TextChunk[] { const chunks: TextChunk[] = []; let currentOffset = 0; for (const worksheet of worksheets) { if (!worksheet.csvContent || !worksheet.csvContent.trim()) { continue; } // Find the sheet's position in fullText const sheetMarker = `=== Sheet: ${worksheet.name} ===`; const sheetStart = fullText.indexOf(sheetMarker, currentOffset); if (sheetStart >= 0) { currentOffset = sheetStart + sheetMarker.length + 1; // +1 for newline } const sheetChunks = this.chunkSingleSheet( worksheet, fullText, chunks.length, maxTokens, minTokens, currentOffset ); chunks.push(...sheetChunks); // Update offset to after this sheet's content if (sheetChunks.length > 0) { const lastChunk = sheetChunks[sheetChunks.length - 1]; if (lastChunk) { currentOffset = lastChunk.endPosition; } } } return chunks; } /** * Chunk a single sheet into text chunks */ private chunkSingleSheet( worksheet: WorksheetStructure, fullText: string, startChunkIndex: number, maxTokens: number, minTokens: number, searchFromOffset: number = 0 ): TextChunk[] { const chunks: TextChunk[] = []; // Parse CSV content into rows const rows = worksheet.csvContent.split('\n'); if (rows.length === 0) { return chunks; } // Keep header row with each chunk const headerRow = rows[0] || ''; const headerTokens = Math.ceil(headerRow.length / 4); // Find where this sheet's actual content starts in fullText const sheetMarker = `=== Sheet: ${worksheet.name} ===`; const sheetMarkerPos = fullText.indexOf(sheetMarker, searchFromOffset); const contentStartPos = sheetMarkerPos >= 0 ? fullText.indexOf('\n', sheetMarkerPos) + 1 // After the marker line : searchFromOffset; let currentChunk: { rows: string[]; text: string; tokenCount: number; startRow: number; endRow: number; startCol: string; endCol: string; dataStartPos?: number; // Track actual position of data (without header) } | null = null; // Start from row 1 (skip header, we'll add it to each chunk) for (let i = 1; i < rows.length; i++) { const row = rows[i] || ''; if (!row.trim()) continue; const rowTokens = Math.ceil(row.length / 4); if (!currentChunk) { // Start new chunk with header currentChunk = { rows: [headerRow, row], text: headerRow + '\n' + row, tokenCount: headerTokens + rowTokens, startRow: 1, // Excel uses 1-based rows endRow: i + 1, // Convert to 1-based startCol: 'A', endCol: this.getLastColumn(worksheet) }; } else { // Check if adding this row would exceed max tokens const potentialTokens = currentChunk.tokenCount + rowTokens; if (potentialTokens > maxTokens && currentChunk.tokenCount >= minTokens) { // Save current chunk chunks.push(this.createChunk( currentChunk.text, startChunkIndex + chunks.length, fullText, worksheet.name, currentChunk, searchFromOffset )); // Start new chunk with header currentChunk = { rows: [headerRow, row], text: headerRow + '\n' + row, tokenCount: headerTokens + rowTokens, startRow: i + 1, // Convert to 1-based endRow: i + 1, startCol: 'A', endCol: this.getLastColumn(worksheet) }; } else { // Add row to current chunk currentChunk.rows.push(row); currentChunk.text += '\n' + row; currentChunk.tokenCount += rowTokens; currentChunk.endRow = i + 1; // Convert to 1-based } } } // Save final chunk if (currentChunk && currentChunk.text.trim()) { chunks.push(this.createChunk( currentChunk.text, startChunkIndex + chunks.length, fullText, worksheet.name, currentChunk, searchFromOffset )); } return chunks; } /** * Get the last column letter from worksheet range */ private getLastColumn(worksheet: WorksheetStructure): string { if (worksheet.range && typeof worksheet.range === 'object' && 'e' in worksheet.range) { return XLSX.utils.encode_col((worksheet.range as any).e.c); } // If range is a string like "A1:Z100", extract the last column if (typeof worksheet.range === 'string' && worksheet.range.includes(':')) { const parts = worksheet.range.split(':'); if (parts.length === 2 && parts[1]) { const endCell = parts[1]; // Extract column letters from cell address (e.g., "Z100" -> "Z") const match = endCell.match(/^([A-Z]+)/); if (match && match[1]) { return match[1]; } } } // Default to column Z if no range return 'Z'; } /** * Create a single chunk with extraction parameters */ private createChunk( text: string, index: number, fullText: string, sheetName: string, chunkData: { startRow: number; endRow: number; startCol: string; endCol: string; }, searchFromOffset: number = 0 ): TextChunk { // For Excel chunks, the text includes headers that aren't at the same position // in the original content. Since Sprint 11's goal is bidirectional translation // using native coordinates (sheet/row/column), we rely on extraction params // for precise extraction rather than text offsets. // We'll still provide approximate offsets for reference, but the extraction // params are the authoritative source for Excel content extraction. const sheetMarker = `=== Sheet: ${sheetName} ===`; const sheetStart = fullText.indexOf(sheetMarker); // Use the sheet position as base offset (approximate) const startOffset = sheetStart >= 0 ? sheetStart : 0; const endOffset = startOffset + text.length; return { content: text, startPosition: startOffset, endPosition: endOffset, tokenCount: Math.ceil(text.length / 4), chunkIndex: index, metadata: { sourceFile: '', sourceType: 'excel', totalChunks: 0, hasOverlap: false }, semanticMetadata: createDefaultSemanticMetadata() }; } /** * Extract content by sheet and row range */ public async extractByRange( filePath: string, sheetName: string, startRow: number, endRow: number ): Promise<string> { // Read the Excel file const workbook = XLSX.readFile(filePath); // Validate sheet exists if (!workbook.SheetNames.includes(sheetName)) { throw new Error(`Sheet '${sheetName}' not found. Available sheets: ${workbook.SheetNames.join(', ')}`); } // Get the worksheet const worksheet = workbook.Sheets[sheetName]; if (!worksheet) { throw new Error(`Failed to load sheet '${sheetName}'`); } // Get sheet range const sheetRange = worksheet['!ref'] ? XLSX.utils.decode_range(worksheet['!ref']) : null; if (!sheetRange) { throw new Error(`Sheet '${sheetName}' has no data`); } // Validate row ranges // Convert 1-based rows to 0-based for internal use const startRow0 = startRow - 1; const endRow0 = endRow - 1; if (startRow0 < sheetRange.s.r || startRow0 > sheetRange.e.r) { throw new Error(`Start row ${startRow} is out of range. Sheet has rows ${sheetRange.s.r + 1} to ${sheetRange.e.r + 1}`); } if (endRow0 < startRow0 || endRow0 > sheetRange.e.r) { throw new Error(`End row ${endRow} is out of range. Sheet has rows ${sheetRange.s.r + 1} to ${sheetRange.e.r + 1}`); } // Extract the specified range const extractedRows: string[][] = []; for (let r = startRow0; r <= endRow0; r++) { const row: string[] = []; // Extract all columns in the sheet for simplicity for (let c = sheetRange.s.c; c <= sheetRange.e.c; c++) { const cellAddress = XLSX.utils.encode_cell({ r, c }); const cell = worksheet[cellAddress]; // Get cell value or formula if (cell) { if (cell.f) { // If cell has formula, include it row.push(`=${cell.f}`); } else if (cell.v !== undefined) { // Otherwise use the value row.push(String(cell.v)); } else { row.push(''); } } else { row.push(''); } } extractedRows.push(row); } // Convert to CSV format (matching how we chunk) const extractedText = extractedRows .map(row => row.join(',')) .join('\n'); return extractedText; } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/okets/folder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

excel-chunking.ts•12.8 KiB