folder-mcp

Overview Schema Related Servers Score Discussions

folder-mcp
tests
real-integration

document-data-real.test.ts•16 KiB

/**
 * Integration tests for the document data endpoints.
 *
 * Each test parses fixture files located under `env.knowledgeBasePath`
 * through `env.services.fileParsing.parseFile` and asserts on the returned
 * `content` / `metadata`. One test additionally exercises the on-disk cache
 * through `CacheTestHelper`.
 */
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { setupRealTestEnvironment } from '../helpers/real-test-environment';
import { CacheTestHelper } from '../helpers/cache-test-helper';
import { existsSync } from 'fs';
import { promises as fs } from 'fs';
import path from 'path';
import type { RealTestEnvironment } from '../helpers/real-test-environment';

/** The same policy document in the three formats the parsing tests cover. */
const POLICY_DOCS = [
  'Policies/Remote_Work_Policy.pdf',
  'Policies/Remote_Work_Policy.docx',
  'Policies/Remote_Work_Policy.txt',
];

/** Maximum chunk length (in characters) used by the simulated chunking tests. */
const CHUNK_SIZE = 500;

/**
 * Simulates chunking by cutting `content` into pieces of at most `size`
 * characters.
 *
 * `.` does not match line terminators, so a piece never spans a line break
 * and the line terminators themselves are not part of any piece. Content
 * with nothing to match (e.g. an empty string) yields an empty array.
 */
function splitIntoChunks(content: string, size: number): string[] {
  return content.match(new RegExp(`.{1,${size}}`, 'g')) || [];
}

describe('Document Data Endpoints - Real Integration Tests', () => {
  let env: RealTestEnvironment;

  beforeAll(async () => {
    env = await setupRealTestEnvironment('document-data-real');
  });

  afterAll(async () => {
    await env.cleanup();
  });

  it('should extract raw text content from PDF, Word, and plain text policy documents', async () => {
    for (const relPath of POLICY_DOCS) {
      const absPath = path.join(env.knowledgeBasePath, relPath);
      // The file type passed to the parser is the extension without the dot.
      const fileType = absPath.split('.').pop() || '';
      const result = await env.services.fileParsing.parseFile(absPath, fileType);

      expect(typeof result.content).toBe('string');
      expect(result.content.length).toBeGreaterThan(50);
      expect(result.content).toMatch(/remote work/i);
    }
  });

  it('should extract metadata (author, creation date, keywords) from real documents', async () => {
    const richMetadataKeys = ['author', 'created', 'keywords'];

    for (const relPath of POLICY_DOCS) {
      const absPath = path.join(env.knowledgeBasePath, relPath);
      const fileType = absPath.split('.').pop() || '';
      const result = await env.services.fileParsing.parseFile(absPath, fileType);

      const expectsRichMetadata = fileType === 'pdf' || fileType === 'docx';
      for (const key of richMetadataKeys) {
        if (expectsRichMetadata) {
          expect(result.metadata).toHaveProperty(key);
        } else {
          // TXT and others: should not have these properties
          expect(result.metadata).not.toHaveProperty(key);
        }
      }
    }
  });

  it('should support chunk-based access for large documents with real content boundaries', async () => {
    const relPath = 'Policies/Remote_Work_Policy.pdf';
    const absPath = path.join(env.knowledgeBasePath, relPath);
    const result = await env.services.fileParsing.parseFile(absPath, 'pdf');

    // Simulate chunking by splitting into 500-char chunks
    const chunks = splitIntoChunks(result.content, CHUNK_SIZE);

    expect(Array.isArray(chunks)).toBe(true);
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(typeof chunk).toBe('string');
      expect(chunk.length).toBeLessThanOrEqual(CHUNK_SIZE);
    }
  });

  it('should preserve rich text, tables, and lists when extracting to plain text', async () => {
    const relPath = 'Policies/Remote_Work_Policy.docx';
    const absPath = path.join(env.knowledgeBasePath, relPath);
    const result = await env.services.fileParsing.parseFile(absPath, 'docx');

    expect(result.content).toMatch(/Table:/i);
    // Accept either '- Flexible hours' or 'Flexible hours' (with or without dash)
    expect(result.content).toMatch(/(- Flexible hours|Flexible hours)/i);
  });

  it('should handle UTF-8, special characters, and international content', async () => {
    const relPath = 'Policies/Remote_Work_Policy.txt';
    const absPath = path.join(env.knowledgeBasePath, relPath);
    const result = await env.services.fileParsing.parseFile(absPath, 'txt');

    expect(result.content).toMatch(/[üöäß]/i);
    expect(result.content).toMatch(/远程办公/i);
  });

  it('should validate cache directory creation and population for document data processing', async () => {
    // Enhanced cache validation using CacheTestHelper
    const cacheHelper = new CacheTestHelper(env.knowledgeBasePath);

    // Create document data cache structure
    const cacheStructure = await cacheHelper.createDocumentDataCacheStructure();

    // Verify cache structure was created properly
    expect(cacheHelper.cacheExists()).toBe(true);
    expect(existsSync(cacheStructure.baseDir)).toBe(true);
    expect(existsSync(cacheStructure.sqliteDatabase)).toBe(true);
    expect(existsSync(cacheStructure.documents!)).toBe(true);

    // Test cache population with actual document data
    const testDoc = 'Policies/Remote_Work_Policy.pdf';
    const testDocPath = path.join(env.knowledgeBasePath, testDoc);
    const parsedContent = await env.services.fileParsing.parseFile(testDocPath, 'pdf');

    // Create structured test data that matches expected format
    const documentData = {
      documentId: testDoc,
      content: parsedContent.content,
      metadata: parsedContent.metadata,
      processedAt: new Date().toISOString(),
      contentLength: parsedContent.content.length,
      fileType: 'pdf',
      contentPreview: parsedContent.content.substring(0, 200),
      hasRemoteWorkContent: parsedContent.content.toLowerCase().includes('remote work'),
    };

    // Save document data using CacheTestHelper
    const savedPath = await cacheHelper.saveToCache('documents', 'test-remote-work-policy', documentData);
    expect(existsSync(savedPath)).toBe(true);

    // Note: Metadata is now stored in SQLite database, not separate JSON files
    // Save additional validation data to documents directory instead
    const validationPath = await cacheHelper.saveToCache('documents', 'test-remote-work-validation', {
      originalPath: testDocPath,
      fileName: path.basename(testDocPath),
      fileType: 'pdf',
      extractedAt: new Date().toISOString(),
      metadata: parsedContent.metadata,
    });
    expect(existsSync(validationPath)).toBe(true);

    // Validate cache structure and contents (metadata is now in SQLite)
    const validation = await cacheHelper.validateCacheStructure(['documents']);
    expect(validation.isValid).toBe(true);
    expect(validation.exists).toBe(true);
    expect(validation.subdirectories).toContain('documents');
    // Note: metadata subdirectory is no longer created - metadata is in SQLite
    expect(validation.files.length).toBeGreaterThanOrEqual(2);
    expect(validation.errors.length).toBe(0);

    // Test data content validation - ensure cached data matches expected structure
    const cachedDocument = await cacheHelper.loadFromCache('documents', 'test-remote-work-policy');
    expect(cachedDocument).toBeTruthy();
    expect(cachedDocument).toHaveProperty('documentId');
    expect(cachedDocument).toHaveProperty('content');
    expect(cachedDocument).toHaveProperty('metadata');
    expect(cachedDocument).toHaveProperty('contentLength');
    expect(cachedDocument.documentId).toBe(testDoc);
    expect(cachedDocument.content).toBe(parsedContent.content);
    expect(cachedDocument.contentLength).toBe(parsedContent.content.length);
    expect(cachedDocument.hasRemoteWorkContent).toBe(true);

    // Read the validation record back from where it was actually written
    // above: the 'documents' category under 'test-remote-work-validation'.
    // Nothing in this test writes to a 'metadata' category.
    const cachedValidation = await cacheHelper.loadFromCache('documents', 'test-remote-work-validation');
    expect(cachedValidation).toBeTruthy();
    expect(cachedValidation).toHaveProperty('originalPath');
    expect(cachedValidation).toHaveProperty('fileName');
    expect(cachedValidation).toHaveProperty('fileType');
    expect(cachedValidation.originalPath).toBe(testDocPath);
    expect(cachedValidation.fileName).toBe('Remote_Work_Policy.pdf');
    expect(cachedValidation.fileType).toBe('pdf');

    // Test cache persistence
    const persistenceTest = await cacheHelper.testCachePersistence({
      testKey: 'persistence-validation',
      testData: documentData,
      timestamp: new Date().toISOString(),
    });
    expect(persistenceTest).toBe(true);

    // Test cache index integrity
    const integrityCheck = await cacheHelper.validateCacheIndexIntegrity();
    expect(integrityCheck.isValid).toBe(true);
    expect(integrityCheck.corruptedFiles.length).toBe(0);
    expect(integrityCheck.errors.length).toBe(0);

    // Get cache statistics for reporting
    const stats = await cacheHelper.getCacheStatistics();
    expect(stats.totalFiles).toBeGreaterThanOrEqual(2);
    // Only the 'documents' subdirectory is written by this test; a separate
    // 'metadata' subdirectory is not expected because metadata lives in SQLite.
    // NOTE(review): tighten these if CacheTestHelper guarantees more
    // subdirectories than this test can see.
    expect(stats.subdirectories).toBeGreaterThanOrEqual(1);
    expect(stats.totalSize).toBeGreaterThan(0);
    expect(stats.byType).toHaveProperty('documents');

    console.log(`✅ Cache directory created and validated at: ${cacheHelper.getCacheBaseDir()}`);
    console.log(`✅ Cache populated with document data for: ${testDoc}`);
    console.log(`✅ Cache structure validation: ${validation.isValid ? 'PASS' : 'FAIL'}`);
    console.log(`✅ Cache content validation: ${cachedDocument ? 'PASS' : 'FAIL'}`);
    console.log(`✅ Cache persistence test: ${persistenceTest ? 'PASS' : 'FAIL'}`);
    console.log(`✅ Cache index integrity: ${integrityCheck.isValid ? 'PASS' : 'FAIL'}`);
    console.log(`✅ Cache statistics: ${stats.totalFiles} files, ${Math.round(stats.totalSize / 1024)}KB`);
    console.log('✅ Document data processing cache infrastructure is fully validated');
  });

  describe('Edge Case Handling for Document Data', () => {
    it('should handle corrupted PDF files gracefully', async () => {
      // Test corrupted PDF file handling
      const corruptedPdf = path.join(env.knowledgeBasePath, 'test-edge-cases', 'corrupted_test.pdf');
      expect(existsSync(corruptedPdf)).toBe(true);

      // Should throw an error, not crash
      await expect(env.services.fileParsing.parseFile(corruptedPdf, 'pdf')).rejects.toThrow();

      console.log('✅ Corrupted PDF handled gracefully with proper error');
    });

    it('should handle empty files without crashing', async () => {
      // Test empty file handling
      const emptyFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'empty.txt');
      expect(existsSync(emptyFile)).toBe(true);

      const result = await env.services.fileParsing.parseFile(emptyFile, 'txt');
      expect(result.content || '').toBe('');
      expect(result.metadata).toBeDefined();

      console.log('✅ Empty file handled without crashing');
    });

    it('should handle huge files with memory management', async () => {
      // Test huge file handling
      const hugeFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'huge_test.txt');
      expect(existsSync(hugeFile)).toBe(true);

      // Verify file is actually large
      const stats = await fs.stat(hugeFile);
      expect(stats.size).toBeGreaterThan(1000000); // > 1MB

      const result = await env.services.fileParsing.parseFile(hugeFile, 'txt');
      expect(result.content).toBeDefined();
      expect(typeof result.content).toBe('string');
      expect(result.content.length).toBeGreaterThan(1000000);

      console.log(`✅ Huge file (${stats.size} bytes) handled without memory issues`);
    });

    it('should handle unicode filenames and content properly', async () => {
      // Test unicode file handling
      const unicodeFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'test_файл_测试.txt');
      expect(existsSync(unicodeFile)).toBe(true);

      const result = await env.services.fileParsing.parseFile(unicodeFile, 'txt');
      expect(result.content).toBeDefined();
      expect(typeof result.content).toBe('string');
      expect(result.content.length).toBeGreaterThan(0);

      console.log('✅ Unicode filename and content handled correctly');
    });

    it('should handle binary files masquerading as text gracefully', async () => {
      // Test binary file rejection
      const binaryFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'binary_cache_test.bin');
      expect(existsSync(binaryFile)).toBe(true);

      // Should throw an error for unsupported file type
      await expect(env.services.fileParsing.parseFile(binaryFile, 'txt')).rejects.toThrow();

      console.log('✅ Binary file masquerading as text rejected gracefully');
    });

    it('should handle missing files with appropriate errors', async () => {
      // Test missing file handling
      const missingFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'does_not_exist.txt');
      expect(existsSync(missingFile)).toBe(false);

      // Should throw appropriate error for missing file
      await expect(env.services.fileParsing.parseFile(missingFile, 'txt')).rejects.toThrow();

      console.log('✅ Missing file handled with appropriate error');
    });

    it('should handle malformed document structures gracefully', async () => {
      // Test with corrupted Excel file
      const corruptedXlsx = path.join(env.knowledgeBasePath, 'test-edge-cases', 'corrupted.xlsx');
      expect(existsSync(corruptedXlsx)).toBe(true);

      // Should handle corrupted Excel gracefully. Only the parse call is
      // guarded: keeping the assertions outside the try block ensures a failed
      // expectation is reported instead of being mistaken for a parser error.
      let result;
      try {
        result = await env.services.fileParsing.parseFile(corruptedXlsx, 'xlsx');
      } catch (error) {
        // If it throws, it should be a controlled error
        expect(error).toBeInstanceOf(Error);
        console.log('✅ Corrupted Excel threw controlled error');
        return;
      }

      // If it doesn't throw, verify it returns proper structure
      expect(result).toBeDefined();
      expect(result.content !== undefined).toBe(true);
      console.log('✅ Corrupted Excel handled gracefully');
    });

    it('should handle edge case metadata extraction safely', async () => {
      // Test metadata extraction from edge case files
      const testFiles = [
        { path: 'test-edge-cases/empty.txt', type: 'txt', expectMetadata: false },
        { path: 'test-edge-cases/huge_test.txt', type: 'txt', expectMetadata: false },
        { path: 'test-edge-cases/test_файл_测试.txt', type: 'txt', expectMetadata: false },
      ];

      for (const testCase of testFiles) {
        const filePath = path.join(env.knowledgeBasePath, testCase.path);
        // Fixtures that are not present are skipped rather than failed.
        if (!existsSync(filePath)) {
          continue;
        }

        // A parser failure is tolerated and logged; assertion failures below
        // are not, so they sit outside the try block.
        let result;
        try {
          result = await env.services.fileParsing.parseFile(filePath, testCase.type);
        } catch (error) {
          console.log(`⚠️ Expected error for ${path.basename(testCase.path)}: ${(error as Error).message}`);
          continue;
        }

        expect(result.metadata).toBeDefined();
        if (testCase.expectMetadata) {
          expect(Object.keys(result.metadata || {}).length).toBeGreaterThan(0);
        } else {
          // TXT files typically don't have rich metadata
          expect(result.metadata).toBeDefined(); // Should be empty object
        }

        console.log(`✅ Metadata extraction for ${path.basename(testCase.path)} handled safely`);
      }
    });

    it('should handle chunking edge cases properly', async () => {
      // Test chunking with edge case files
      const testFiles = [
        { path: 'test-edge-cases/empty.txt', type: 'txt' },
        { path: 'test-edge-cases/huge_test.txt', type: 'txt' },
      ];

      for (const testCase of testFiles) {
        const filePath = path.join(env.knowledgeBasePath, testCase.path);
        // Fixtures that are not present are skipped rather than failed.
        if (!existsSync(filePath)) {
          continue;
        }

        // A parser failure is tolerated and logged; assertion failures below
        // are not, so they sit outside the try block.
        let result;
        try {
          result = await env.services.fileParsing.parseFile(filePath, testCase.type);
        } catch (error) {
          console.log(`⚠️ Expected error for ${path.basename(testCase.path)}: ${(error as Error).message}`);
          continue;
        }

        // Test chunking logic
        const chunks = splitIntoChunks(result.content, CHUNK_SIZE);

        if (result.content.length === 0) {
          expect(chunks.length).toBe(0);
        } else {
          expect(chunks.length).toBeGreaterThan(0);
          for (const chunk of chunks) {
            expect(chunk.length).toBeLessThanOrEqual(CHUNK_SIZE);
          }
        }

        console.log(`✅ Chunking for ${path.basename(testCase.path)} handled properly (${chunks.length} chunks)`);
      }
    });
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/okets/folder-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

document-data-real.test.ts•16 KiB