import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { setupRealTestEnvironment } from '../helpers/real-test-environment';
import { CacheTestHelper } from '../helpers/cache-test-helper';
import { existsSync } from 'fs';
import { promises as fs } from 'fs';
import path from 'path';
import type { RealTestEnvironment } from '../helpers/real-test-environment';
const POLICY_DOCS = [
'Policies/Remote_Work_Policy.pdf',
'Policies/Remote_Work_Policy.docx',
'Policies/Remote_Work_Policy.txt',
];
describe('Document Data Endpoints - Real Integration Tests', () => {
let env: RealTestEnvironment;
beforeAll(async () => {
env = await setupRealTestEnvironment('document-data-real');
});
afterAll(async () => {
await env.cleanup();
});
it('should extract raw text content from PDF, Word, and plain text policy documents', async () => {
for (const relPath of POLICY_DOCS) {
const absPath = path.join(env.knowledgeBasePath, relPath);
const fileType = absPath.split('.').pop() || '';
const result = await env.services.fileParsing.parseFile(absPath, fileType);
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(50);
expect(result.content).toMatch(/remote work/i);
}
});
it('should extract metadata (author, creation date, keywords) from real documents', async () => {
for (const relPath of POLICY_DOCS) {
const absPath = path.join(env.knowledgeBasePath, relPath);
const fileType = absPath.split('.').pop() || '';
const result = await env.services.fileParsing.parseFile(absPath, fileType);
if (fileType === 'pdf' || fileType === 'docx') {
expect(result.metadata).toHaveProperty('author');
expect(result.metadata).toHaveProperty('created');
expect(result.metadata).toHaveProperty('keywords');
} else {
// TXT and others: should not have these properties
expect(result.metadata).not.toHaveProperty('author');
expect(result.metadata).not.toHaveProperty('created');
expect(result.metadata).not.toHaveProperty('keywords');
}
}
});
it('should support chunk-based access for large documents with real content boundaries', async () => {
const relPath = 'Policies/Remote_Work_Policy.pdf';
const absPath = path.join(env.knowledgeBasePath, relPath);
const fileType = 'pdf';
const result = await env.services.fileParsing.parseFile(absPath, fileType);
// Simulate chunking by splitting into 500-char chunks
const chunks = result.content.match(/.{1,500}/g) || [];
expect(Array.isArray(chunks)).toBe(true);
expect(chunks.length).toBeGreaterThan(1);
for (const chunk of chunks) {
expect(typeof chunk).toBe('string');
expect(chunk.length).toBeLessThanOrEqual(500);
}
});
it('should preserve rich text, tables, and lists when extracting to plain text', async () => {
const relPath = 'Policies/Remote_Work_Policy.docx';
const absPath = path.join(env.knowledgeBasePath, relPath);
const fileType = 'docx';
const result = await env.services.fileParsing.parseFile(absPath, fileType);
expect(result.content).toMatch(/Table:/i);
// Accept either '- Flexible hours' or 'Flexible hours' (with or without dash)
expect(result.content).toMatch(/(- Flexible hours|Flexible hours)/i);
});
it('should handle UTF-8, special characters, and international content', async () => {
const relPath = 'Policies/Remote_Work_Policy.txt';
const absPath = path.join(env.knowledgeBasePath, relPath);
const fileType = 'txt';
const result = await env.services.fileParsing.parseFile(absPath, fileType);
expect(result.content).toMatch(/[üöäß]/i);
expect(result.content).toMatch(/远程办公/i);
});
it('should validate cache directory creation and population for document data processing', async () => {
// Enhanced cache validation using CacheTestHelper
const cacheHelper = new CacheTestHelper(env.knowledgeBasePath);
// Create document data cache structure
const cacheStructure = await cacheHelper.createDocumentDataCacheStructure();
// Verify cache structure was created properly
expect(cacheHelper.cacheExists()).toBe(true);
expect(existsSync(cacheStructure.baseDir)).toBe(true);
expect(existsSync(cacheStructure.sqliteDatabase)).toBe(true);
expect(existsSync(cacheStructure.documents!)).toBe(true);
// Test cache population with actual document data
const testDoc = 'Policies/Remote_Work_Policy.pdf';
const testDocPath = path.join(env.knowledgeBasePath, testDoc);
const parsedContent = await env.services.fileParsing.parseFile(testDocPath, 'pdf');
// Create structured test data that matches expected format
const documentData = {
documentId: testDoc,
content: parsedContent.content,
metadata: parsedContent.metadata,
processedAt: new Date().toISOString(),
contentLength: parsedContent.content.length,
fileType: 'pdf',
contentPreview: parsedContent.content.substring(0, 200),
hasRemoteWorkContent: parsedContent.content.toLowerCase().includes('remote work')
};
// Save document data using CacheTestHelper
const savedPath = await cacheHelper.saveToCache('documents', 'test-remote-work-policy', documentData);
expect(existsSync(savedPath)).toBe(true);
// Note: Metadata is now stored in SQLite database, not separate JSON files
// Save additional validation data to documents directory instead
const validationPath = await cacheHelper.saveToCache('documents', 'test-remote-work-validation', {
originalPath: testDocPath,
fileName: path.basename(testDocPath),
fileType: 'pdf',
extractedAt: new Date().toISOString(),
metadata: parsedContent.metadata
});
expect(existsSync(validationPath)).toBe(true);
// Validate cache structure and contents (metadata is now in SQLite)
const validation = await cacheHelper.validateCacheStructure(['documents']);
expect(validation.isValid).toBe(true);
expect(validation.exists).toBe(true);
expect(validation.subdirectories).toContain('documents');
// Note: metadata subdirectory is no longer created - metadata is in SQLite
expect(validation.files.length).toBeGreaterThanOrEqual(2);
expect(validation.errors.length).toBe(0);
// Test data content validation - ensure cached data matches expected structure
const cachedDocument = await cacheHelper.loadFromCache('documents', 'test-remote-work-policy');
expect(cachedDocument).toBeTruthy();
expect(cachedDocument).toHaveProperty('documentId');
expect(cachedDocument).toHaveProperty('content');
expect(cachedDocument).toHaveProperty('metadata');
expect(cachedDocument).toHaveProperty('contentLength');
expect(cachedDocument.documentId).toBe(testDoc);
expect(cachedDocument.content).toBe(parsedContent.content);
expect(cachedDocument.contentLength).toBe(parsedContent.content.length);
expect(cachedDocument.hasRemoteWorkContent).toBe(true);
const cachedMetadata = await cacheHelper.loadFromCache('metadata', 'test-remote-work-metadata');
expect(cachedMetadata).toBeTruthy();
expect(cachedMetadata).toHaveProperty('originalPath');
expect(cachedMetadata).toHaveProperty('fileName');
expect(cachedMetadata).toHaveProperty('fileType');
expect(cachedMetadata.originalPath).toBe(testDocPath);
expect(cachedMetadata.fileName).toBe('Remote_Work_Policy.pdf');
expect(cachedMetadata.fileType).toBe('pdf');
// Test cache persistence
const persistenceTest = await cacheHelper.testCachePersistence({
testKey: 'persistence-validation',
testData: documentData,
timestamp: new Date().toISOString()
});
expect(persistenceTest).toBe(true);
// Test cache index integrity
const integrityCheck = await cacheHelper.validateCacheIndexIntegrity();
expect(integrityCheck.isValid).toBe(true);
expect(integrityCheck.corruptedFiles.length).toBe(0);
expect(integrityCheck.errors.length).toBe(0);
// Get cache statistics for reporting
const stats = await cacheHelper.getCacheStatistics();
expect(stats.totalFiles).toBeGreaterThanOrEqual(2);
expect(stats.subdirectories).toBeGreaterThanOrEqual(2);
expect(stats.totalSize).toBeGreaterThan(0);
expect(stats.byType).toHaveProperty('documents');
expect(stats.byType).toHaveProperty('metadata');
console.log(`✅ Cache directory created and validated at: ${cacheHelper.getCacheBaseDir()}`);
console.log(`✅ Cache populated with document data for: ${testDoc}`);
console.log(`✅ Cache structure validation: ${validation.isValid ? 'PASS' : 'FAIL'}`);
console.log(`✅ Cache content validation: ${cachedDocument ? 'PASS' : 'FAIL'}`);
console.log(`✅ Cache persistence test: ${persistenceTest ? 'PASS' : 'FAIL'}`);
console.log(`✅ Cache index integrity: ${integrityCheck.isValid ? 'PASS' : 'FAIL'}`);
console.log(`✅ Cache statistics: ${stats.totalFiles} files, ${Math.round(stats.totalSize / 1024)}KB`);
console.log('✅ Document data processing cache infrastructure is fully validated');
});
describe('Edge Case Handling for Document Data', () => {
it('should handle corrupted PDF files gracefully', async () => {
// Test corrupted PDF file handling
const corruptedPdf = path.join(env.knowledgeBasePath, 'test-edge-cases', 'corrupted_test.pdf');
expect(existsSync(corruptedPdf)).toBe(true);
// Should throw an error, not crash
await expect(env.services.fileParsing.parseFile(corruptedPdf, 'pdf')).rejects.toThrow();
console.log('✅ Corrupted PDF handled gracefully with proper error');
});
it('should handle empty files without crashing', async () => {
// Test empty file handling
const emptyFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'empty.txt');
expect(existsSync(emptyFile)).toBe(true);
const result = await env.services.fileParsing.parseFile(emptyFile, 'txt');
expect(result.content || '').toBe('');
expect(result.metadata).toBeDefined();
console.log('✅ Empty file handled without crashing');
});
it('should handle huge files with memory management', async () => {
// Test huge file handling
const hugeFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'huge_test.txt');
expect(existsSync(hugeFile)).toBe(true);
// Verify file is actually large
const stats = await fs.stat(hugeFile);
expect(stats.size).toBeGreaterThan(1000000); // > 1MB
const result = await env.services.fileParsing.parseFile(hugeFile, 'txt');
expect(result.content).toBeDefined();
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(1000000);
console.log(`✅ Huge file (${stats.size} bytes) handled without memory issues`);
});
it('should handle unicode filenames and content properly', async () => {
// Test unicode file handling
const unicodeFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'test_файл_测试.txt');
expect(existsSync(unicodeFile)).toBe(true);
const result = await env.services.fileParsing.parseFile(unicodeFile, 'txt');
expect(result.content).toBeDefined();
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(0);
console.log('✅ Unicode filename and content handled correctly');
});
it('should handle binary files masquerading as text gracefully', async () => {
// Test binary file rejection
const binaryFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'binary_cache_test.bin');
expect(existsSync(binaryFile)).toBe(true);
// Should throw an error for unsupported file type
await expect(env.services.fileParsing.parseFile(binaryFile, 'txt')).rejects.toThrow();
console.log('✅ Binary file masquerading as text rejected gracefully');
});
it('should handle missing files with appropriate errors', async () => {
// Test missing file handling
const missingFile = path.join(env.knowledgeBasePath, 'test-edge-cases', 'does_not_exist.txt');
expect(existsSync(missingFile)).toBe(false);
// Should throw appropriate error for missing file
await expect(env.services.fileParsing.parseFile(missingFile, 'txt')).rejects.toThrow();
console.log('✅ Missing file handled with appropriate error');
});
it('should handle malformed document structures gracefully', async () => {
// Test with corrupted Excel file
const corruptedXlsx = path.join(env.knowledgeBasePath, 'test-edge-cases', 'corrupted.xlsx');
expect(existsSync(corruptedXlsx)).toBe(true);
// Should handle corrupted Excel gracefully
try {
const result = await env.services.fileParsing.parseFile(corruptedXlsx, 'xlsx');
// If it doesn't throw, verify it returns proper structure
expect(result).toBeDefined();
expect(result.content !== undefined).toBe(true);
console.log('✅ Corrupted Excel handled gracefully');
} catch (error) {
// If it throws, it should be a controlled error
expect(error).toBeInstanceOf(Error);
console.log('✅ Corrupted Excel threw controlled error');
}
});
it('should handle edge case metadata extraction safely', async () => {
// Test metadata extraction from edge case files
const testFiles = [
{ path: 'test-edge-cases/empty.txt', type: 'txt', expectMetadata: false },
{ path: 'test-edge-cases/huge_test.txt', type: 'txt', expectMetadata: false },
{ path: 'test-edge-cases/test_файл_测试.txt', type: 'txt', expectMetadata: false }
];
for (const testCase of testFiles) {
const filePath = path.join(env.knowledgeBasePath, testCase.path);
if (existsSync(filePath)) {
try {
const result = await env.services.fileParsing.parseFile(filePath, testCase.type);
expect(result.metadata).toBeDefined();
if (testCase.expectMetadata) {
expect(Object.keys(result.metadata || {}).length).toBeGreaterThan(0);
} else {
// TXT files typically don't have rich metadata
expect(result.metadata).toBeDefined(); // Should be empty object
}
console.log(`✅ Metadata extraction for ${path.basename(testCase.path)} handled safely`);
} catch (error) {
console.log(`⚠️ Expected error for ${path.basename(testCase.path)}: ${(error as Error).message}`);
}
}
}
});
it('should handle chunking edge cases properly', async () => {
// Test chunking with edge case files
const testFiles = [
{ path: 'test-edge-cases/empty.txt', type: 'txt' },
{ path: 'test-edge-cases/huge_test.txt', type: 'txt' }
];
for (const testCase of testFiles) {
const filePath = path.join(env.knowledgeBasePath, testCase.path);
if (existsSync(filePath)) {
try {
const result = await env.services.fileParsing.parseFile(filePath, testCase.type);
// Test chunking logic
const chunkSize = 500;
const chunks = result.content.match(new RegExp(`.{1,${chunkSize}}`, 'g')) || [];
if (result.content.length === 0) {
expect(chunks.length).toBe(0);
} else {
expect(chunks.length).toBeGreaterThan(0);
chunks.forEach(chunk => {
expect(chunk.length).toBeLessThanOrEqual(chunkSize);
});
}
console.log(`✅ Chunking for ${path.basename(testCase.path)} handled properly (${chunks.length} chunks)`);
} catch (error) {
console.log(`⚠️ Expected error for ${path.basename(testCase.path)}: ${(error as Error).message}`);
}
}
}
});
});
});