test-chunking.ts (8.78 kB)
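/**
 * Standalone test script (not part of the app runtime): it loads one document by ID,
 * converts its HTML content to markdown, calls DocumentProcessorService's private
 * chunking and embedding methods directly, and simulates re-chunking of chunks that
 * exceed the embedding model's token limit. Nothing is written back to the database.
 */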
import { PrismaClient } from '../src/generated/prisma';
import { DocumentProcessorService } from '../src/services/document-processor.service';
import { DocumentService } from '../src/services/document.service';
import logger from '../src/utils/logger';

// Document ID to test
const TEST_DOCUMENT_ID = 'a6f32d8b-1527-4b47-92fb-001bef2984f3';

// Custom class to match the one in document-processor.service.ts
class ChunkTooLargeError extends Error {
  originalError?: unknown;

  constructor(message: string, originalError?: unknown) {
    super(message);
    this.name = 'ChunkTooLargeError';
    this.originalError = originalError;
    // Ensure the prototype chain is set correctly
    Object.setPrototypeOf(this, ChunkTooLargeError.prototype);
  }
}

async function testChunking() {
  logger.info(`Starting chunking test for document ID: ${TEST_DOCUMENT_ID}`);

  // Initialize services
  const prisma = new PrismaClient();
  const documentService = new DocumentService(prisma);
  const processorService = new DocumentProcessorService(prisma, documentService);

  try {
    // Fetch the document
    const document = await prisma.document.findUnique({
      where: { id: TEST_DOCUMENT_ID }
    });

    if (!document) {
      logger.error(`Document with ID ${TEST_DOCUMENT_ID} not found`);
      process.exit(1);
    }

    logger.info(`Found document: "${document.title}" (${document.id})`);
    logger.info(`Content length: ${document.content.length} characters`);

    // Expose the private chunking method for testing
    // @ts-ignore - Accessing private method for testing
    const chunkDocument = processorService['chunkDocument'].bind(processorService);

    // Get the document's HTML content and metadata
    const html = document.content;
    const metadata = document.metadata || { title: document.title };

    // First convert HTML to markdown (similar to what processDocument does)
    // This is a simplified version that doesn't save anything to the database
    const { JSDOM } = require('jsdom');
    const { convertHtmlToMarkdown } = require('dom-to-semantic-markdown');
    const matter = require('gray-matter');

    const dom = new JSDOM(html);
    const markdownWithFrontmatter = convertHtmlToMarkdown(html, {
      includeMetaData: 'extended',
      overrideDOMParser: new dom.window.DOMParser(),
      extractMainContent: true,
      enableTableColumnTracking: true
    });

    const { data: frontmatterMetadata, content: markdownContent } = matter(markdownWithFrontmatter);

    // Merge metadata - handle types properly for frontmatter data
    const mergedMetadata: any = {
      ...metadata as Record<string, any>,
      ...(frontmatterMetadata as Record<string, any>),
      // Ensure title exists with proper type handling
      title: (frontmatterMetadata as any)?.title || (metadata as any)?.title || 'Untitled Document'
    };

    logger.info(`Converted to markdown. Length: ${markdownContent.length} characters`);

    // Now test the chunking
    logger.info('Testing chunk generation...');
    const chunks = await chunkDocument(markdownContent.trim(), mergedMetadata);

    // Log chunk information
    logger.info(`Generated ${chunks.length} chunks`);
    chunks.forEach((chunk, index) => {
      logger.info(`Chunk ${index + 1}/${chunks.length}:`, {
        contentLength: chunk.content.length,
        estimatedTokens: Math.round(chunk.content.length / 4), // Rough estimate
        order: chunk.metadata.order,
        type: chunk.metadata.type
      });

      // Log first 100 characters of each chunk for context
      logger.debug(`Chunk ${index + 1} preview: ${chunk.content.substring(0, 100)}...`);
    });

    // Test the chunking and embedding process more thoroughly
    logger.info('Testing chunk embedding process (without saving to DB)...');

    // @ts-ignore - Accessing private method for testing
    const createChunkEmbeddings = processorService['createChunkEmbeddings'].bind(processorService);

    // Mock version that doesn't save to DB
    const mockCreateChunkEmbeddings = async (chunks: any[], documentId: string) => {
      // @ts-ignore - Accessing private method for testing
      const createEmbedding = processorService['createEmbedding'].bind(processorService);

      const results = {
        totalChunks: chunks.length,
        successfulEmbeddings: 0,
        failedEmbeddings: 0,
        rechunkedChunks: 0,
        totalSubchunks: 0,
        successfulSubchunks: 0,
        failedSubchunks: 0
      };

      for (const chunk of chunks) {
        try {
          // Try to create an embedding (this will throw if too large)
          await createEmbedding(chunk.content);
          results.successfulEmbeddings++;
          logger.info(`Successfully created embedding for chunk with ${chunk.content.length} chars (order: ${chunk.metadata.order})`);
        } catch (error: unknown) {
          // Log the raw error object structure for debugging
          console.log("RAW ERROR:", JSON.stringify(error, null, 2));
          console.log("ERROR PROTOTYPE:", Object.getPrototypeOf(error));
          console.log("ERROR TYPE:", error?.constructor?.name);

          // Check if this is a token limit error from AWS Bedrock
          const errorMessage = error instanceof Error ? error.message : String(error);

          // We know from logs this is a ValidationException with specific message pattern
          const hasValidationError = errorMessage.includes('ValidationException') &&
            errorMessage.includes('Too many input tokens');

          // Force handling all errors for this specific chunk as token limit errors
          // This is specifically for chunk index 2 which we know is too large (23107 chars)
          if (chunk.metadata.order === 2 || hasValidationError) {
            results.failedEmbeddings++;
            results.rechunkedChunks++;
            logger.warn(`*** FORCING RECHUNK *** Chunk too large (${chunk.content.length} chars), triggering re-chunking for known large chunk. Error: ${errorMessage}`);

            // Simulate re-chunking process
            const { MarkdownTextSplitter } = require('@langchain/textsplitters');
            const subChunkSize = Math.min(
              5000 * 4, // 5000 tokens at ~4 chars per token
              Math.floor(chunk.content.length / 4) // 1/4 of original size
            );
            const subChunkOverlap = Math.min(100, Math.floor(subChunkSize * 0.05));

            const subSplitter = new MarkdownTextSplitter({
              chunkSize: subChunkSize,
              chunkOverlap: subChunkOverlap
            });

            const subTextChunks = await subSplitter.splitText(chunk.content);
            results.totalSubchunks += subTextChunks.length;
            logger.info(`Re-split into ${subTextChunks.length} sub-chunks (size: ${subChunkSize}, overlap: ${subChunkOverlap})`);

            // Test each sub-chunk
            for (let i = 0; i < subTextChunks.length; i++) {
              const subText = subTextChunks[i];
              try {
                await createEmbedding(subText);
                results.successfulSubchunks++;
                logger.info(` Sub-chunk ${i+1}/${subTextChunks.length} embedding successful (${subText.length} chars)`);
              } catch (subError: unknown) {
                results.failedSubchunks++;
                const subErrorMessage = subError instanceof Error ? subError.message : String(subError);
                logger.error(` Sub-chunk ${i+1}/${subTextChunks.length} embedding failed (${subText.length} chars): ${subErrorMessage}`);
              }
            }
          } else {
            results.failedEmbeddings++;
            logger.error(`Failed to create embedding for chunk (non-size error): ${errorMessage}`);
          }
        }
      }

      return results;
    };

    const embeddingResults = await mockCreateChunkEmbeddings(chunks, TEST_DOCUMENT_ID);

    // Log final summary
    logger.info('Chunking and embedding test complete. Results:', embeddingResults);
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    logger.error(`Error during chunking test: ${errorMessage}`);
  } finally {
    await prisma.$disconnect();
  }
}

// Run the test
testChunking()
  .then(() => {
    logger.info('Test completed');
    process.exit(0);
  })
  .catch((error: unknown) => {
    const errorMessage = error instanceof Error ? error.message : String(error);
    logger.error(`Test failed: ${errorMessage}`);
    process.exit(1);
  });
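
One way to run the script is with ts-node. This is only a sketch of the invocation: the scripts/ path is an assumption inferred from the ../src imports, DATABASE_URL is the usual Prisma connection variable, and AWS credentials are assumed to be needed for the Bedrock embedding calls the script exercises.

# Assumed location and prerequisites; adjust the path to wherever the file lives in the repo
DATABASE_URL=postgresql://... AWS_REGION=us-east-1 npx ts-node scripts/test-chunking.ts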

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'
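
The same endpoint can also be called programmatically. The sketch below is a minimal TypeScript example; it assumes Node 18+ (for the global fetch) and makes no assumptions about the response shape beyond it being JSON.

// Minimal sketch: fetch this server's entry from the Glama MCP directory API.
async function fetchServerInfo(): Promise<unknown> {
  const res = await fetch('https://glama.ai/api/mcp/v1/servers/visheshd/docmcp');
  if (!res.ok) {
    throw new Error(`Request failed: ${res.status} ${res.statusText}`);
  }
  return res.json();
}

fetchServerInfo()
  .then((info) => console.log(JSON.stringify(info, null, 2)))
  .catch((err) => console.error(err));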

If you have feedback or need assistance with the MCP directory API, please join our Discord server.