/**
* Sprint 11: Excel Document Format-Aware Chunking
*
* Implements chunking that respects Excel sheet structure using xlsx's
* natural cell address coordinate system (A1, B2, etc.) for precise extraction.
*/
import XLSX from 'xlsx';
import { ParsedContent, TextChunk, ChunkedContent, ExcelMetadata, createDefaultSemanticMetadata } from '../../types/index.js';
/**
* Represents a worksheet structure for chunking
*/
interface WorksheetStructure {
name: string;
index: number;
range: XLSX.Range | string | null; // Can be either Range object or string like "A1:Z100"
rowCount: number;
colCount: number;
csvContent: string;
jsonData: any[][];
hasFormulas: boolean;
formulas: string[];
}
/**
* Service for Excel document format-aware chunking
*/
export class ExcelChunkingService {
private readonly DEFAULT_MAX_TOKENS = 1000;
private readonly DEFAULT_MIN_TOKENS = 100;
private readonly DEFAULT_MAX_ROWS_PER_CHUNK = 50;
/**
* Chunk an Excel document respecting sheet boundaries
*/
public chunkExcelDocument(
content: ParsedContent,
maxTokens: number = this.DEFAULT_MAX_TOKENS,
minTokens: number = this.DEFAULT_MIN_TOKENS
): ChunkedContent {
if (content.type !== 'excel') {
throw new Error('Content must be an Excel document');
}
const metadata = content.metadata as ExcelMetadata;
// Get worksheet structures from metadata
const worksheets = metadata.worksheets as WorksheetStructure[];
if (!worksheets || worksheets.length === 0) {
throw new Error('Excel document has no worksheets');
}
// Create chunks respecting sheet boundaries
const chunks = this.createSheetAwareChunks(
worksheets,
content.content,
maxTokens,
minTokens
);
return {
originalContent: content,
chunks,
totalChunks: chunks.length
};
}
/**
* Create chunks respecting sheet boundaries
*/
private createSheetAwareChunks(
worksheets: WorksheetStructure[],
fullText: string,
maxTokens: number,
minTokens: number
): TextChunk[] {
const chunks: TextChunk[] = [];
let currentOffset = 0;
for (const worksheet of worksheets) {
if (!worksheet.csvContent || !worksheet.csvContent.trim()) {
continue;
}
// Find the sheet's position in fullText
const sheetMarker = `=== Sheet: ${worksheet.name} ===`;
const sheetStart = fullText.indexOf(sheetMarker, currentOffset);
if (sheetStart >= 0) {
currentOffset = sheetStart + sheetMarker.length + 1; // +1 for newline
}
const sheetChunks = this.chunkSingleSheet(
worksheet,
fullText,
chunks.length,
maxTokens,
minTokens,
currentOffset
);
chunks.push(...sheetChunks);
// Update offset to after this sheet's content
if (sheetChunks.length > 0) {
const lastChunk = sheetChunks[sheetChunks.length - 1];
if (lastChunk) {
currentOffset = lastChunk.endPosition;
}
}
}
return chunks;
}
/**
* Chunk a single sheet into text chunks
*/
private chunkSingleSheet(
worksheet: WorksheetStructure,
fullText: string,
startChunkIndex: number,
maxTokens: number,
minTokens: number,
searchFromOffset: number = 0
): TextChunk[] {
const chunks: TextChunk[] = [];
// Parse CSV content into rows
const rows = worksheet.csvContent.split('\n');
if (rows.length === 0) {
return chunks;
}
// Keep header row with each chunk
const headerRow = rows[0] || '';
const headerTokens = Math.ceil(headerRow.length / 4);
// Find where this sheet's actual content starts in fullText
const sheetMarker = `=== Sheet: ${worksheet.name} ===`;
const sheetMarkerPos = fullText.indexOf(sheetMarker, searchFromOffset);
const contentStartPos = sheetMarkerPos >= 0
? fullText.indexOf('\n', sheetMarkerPos) + 1 // After the marker line
: searchFromOffset;
let currentChunk: {
rows: string[];
text: string;
tokenCount: number;
startRow: number;
endRow: number;
startCol: string;
endCol: string;
dataStartPos?: number; // Track actual position of data (without header)
} | null = null;
// Start from row 1 (skip header, we'll add it to each chunk)
for (let i = 1; i < rows.length; i++) {
const row = rows[i] || '';
if (!row.trim()) continue;
const rowTokens = Math.ceil(row.length / 4);
if (!currentChunk) {
// Start new chunk with header
currentChunk = {
rows: [headerRow, row],
text: headerRow + '\n' + row,
tokenCount: headerTokens + rowTokens,
startRow: 1, // Excel uses 1-based rows
endRow: i + 1, // Convert to 1-based
startCol: 'A',
endCol: this.getLastColumn(worksheet)
};
} else {
// Check if adding this row would exceed max tokens
const potentialTokens = currentChunk.tokenCount + rowTokens;
if (potentialTokens > maxTokens && currentChunk.tokenCount >= minTokens) {
// Save current chunk
chunks.push(this.createChunk(
currentChunk.text,
startChunkIndex + chunks.length,
fullText,
worksheet.name,
currentChunk,
searchFromOffset
));
// Start new chunk with header
currentChunk = {
rows: [headerRow, row],
text: headerRow + '\n' + row,
tokenCount: headerTokens + rowTokens,
startRow: i + 1, // Convert to 1-based
endRow: i + 1,
startCol: 'A',
endCol: this.getLastColumn(worksheet)
};
} else {
// Add row to current chunk
currentChunk.rows.push(row);
currentChunk.text += '\n' + row;
currentChunk.tokenCount += rowTokens;
currentChunk.endRow = i + 1; // Convert to 1-based
}
}
}
// Save final chunk
if (currentChunk && currentChunk.text.trim()) {
chunks.push(this.createChunk(
currentChunk.text,
startChunkIndex + chunks.length,
fullText,
worksheet.name,
currentChunk,
searchFromOffset
));
}
return chunks;
}
/**
* Get the last column letter from worksheet range
*/
private getLastColumn(worksheet: WorksheetStructure): string {
if (worksheet.range && typeof worksheet.range === 'object' && 'e' in worksheet.range) {
return XLSX.utils.encode_col((worksheet.range as any).e.c);
}
// If range is a string like "A1:Z100", extract the last column
if (typeof worksheet.range === 'string' && worksheet.range.includes(':')) {
const parts = worksheet.range.split(':');
if (parts.length === 2 && parts[1]) {
const endCell = parts[1];
// Extract column letters from cell address (e.g., "Z100" -> "Z")
const match = endCell.match(/^([A-Z]+)/);
if (match && match[1]) {
return match[1];
}
}
}
// Default to column Z if no range
return 'Z';
}
/**
* Create a single chunk with extraction parameters
*/
private createChunk(
text: string,
index: number,
fullText: string,
sheetName: string,
chunkData: {
startRow: number;
endRow: number;
startCol: string;
endCol: string;
},
searchFromOffset: number = 0
): TextChunk {
// For Excel chunks, the text includes headers that aren't at the same position
// in the original content. Since Sprint 11's goal is bidirectional translation
// using native coordinates (sheet/row/column), we rely on extraction params
// for precise extraction rather than text offsets.
// We'll still provide approximate offsets for reference, but the extraction
// params are the authoritative source for Excel content extraction.
const sheetMarker = `=== Sheet: ${sheetName} ===`;
const sheetStart = fullText.indexOf(sheetMarker);
// Use the sheet position as base offset (approximate)
const startOffset = sheetStart >= 0 ? sheetStart : 0;
const endOffset = startOffset + text.length;
return {
content: text,
startPosition: startOffset,
endPosition: endOffset,
tokenCount: Math.ceil(text.length / 4),
chunkIndex: index,
metadata: {
sourceFile: '',
sourceType: 'excel',
totalChunks: 0,
hasOverlap: false
},
semanticMetadata: createDefaultSemanticMetadata()
};
}
/**
* Extract content by sheet and row range
*/
public async extractByRange(
filePath: string,
sheetName: string,
startRow: number,
endRow: number
): Promise<string> {
// Read the Excel file
const workbook = XLSX.readFile(filePath);
// Validate sheet exists
if (!workbook.SheetNames.includes(sheetName)) {
throw new Error(`Sheet '${sheetName}' not found. Available sheets: ${workbook.SheetNames.join(', ')}`);
}
// Get the worksheet
const worksheet = workbook.Sheets[sheetName];
if (!worksheet) {
throw new Error(`Failed to load sheet '${sheetName}'`);
}
// Get sheet range
const sheetRange = worksheet['!ref'] ? XLSX.utils.decode_range(worksheet['!ref']) : null;
if (!sheetRange) {
throw new Error(`Sheet '${sheetName}' has no data`);
}
// Validate row ranges
// Convert 1-based rows to 0-based for internal use
const startRow0 = startRow - 1;
const endRow0 = endRow - 1;
if (startRow0 < sheetRange.s.r || startRow0 > sheetRange.e.r) {
throw new Error(`Start row ${startRow} is out of range. Sheet has rows ${sheetRange.s.r + 1} to ${sheetRange.e.r + 1}`);
}
if (endRow0 < startRow0 || endRow0 > sheetRange.e.r) {
throw new Error(`End row ${endRow} is out of range. Sheet has rows ${sheetRange.s.r + 1} to ${sheetRange.e.r + 1}`);
}
// Extract the specified range
const extractedRows: string[][] = [];
for (let r = startRow0; r <= endRow0; r++) {
const row: string[] = [];
// Extract all columns in the sheet for simplicity
for (let c = sheetRange.s.c; c <= sheetRange.e.c; c++) {
const cellAddress = XLSX.utils.encode_cell({ r, c });
const cell = worksheet[cellAddress];
// Get cell value or formula
if (cell) {
if (cell.f) {
// If cell has formula, include it
row.push(`=${cell.f}`);
} else if (cell.v !== undefined) {
// Otherwise use the value
row.push(String(cell.v));
} else {
row.push('');
}
} else {
row.push('');
}
}
extractedRows.push(row);
}
// Convert to CSV format (matching how we chunk)
const extractedText = extractedRows
.map(row => row.join(','))
.join('\n');
return extractedText;
}
}