// RAG MCP Server Integration Test - Design Doc: rag-mcp-server-design.md (v1.1)
// Generated: 2025-10-31
// Test Type: Integration Test
// Implementation Timing: Alongside feature implementation
import { existsSync, mkdirSync, rmSync, writeFileSync } from 'node:fs'
import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'
import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it } from 'vitest'
import { RAGServer } from '../index.js'
import { generateMetaJsonPath, generateRawDataPath } from '../raw-data-utils.js'
// ============================================
// MVP Phase 1: Core Functionality Integration Test
// ============================================
describe('RAG MCP Server Integration Test - Phase 1', () => {
let ragServer: RAGServer
const testDbPath = resolve('./tmp/test-lancedb')
const testDataDir = resolve('./tmp/test-data')
// Shared fixture for Phase 1: a single RAGServer working against throwaway directories.
beforeAll(async () => {
  // Both working directories must exist before the server is constructed
  for (const workDir of [testDbPath, testDataDir]) {
    mkdirSync(workDir, { recursive: true })
  }
  ragServer = new RAGServer({
    dbPath: testDbPath,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: './tmp/models',
    baseDir: testDataDir,
    maxFileSize: 100 * 1024 * 1024, // 100MB
  })
  // Tests only start once initialization has resolved
  await ragServer.initialize()
})
afterAll(async () => {
  // Remove everything the fixture created on disk
  for (const workDir of [testDbPath, testDataDir]) {
    rmSync(workDir, { recursive: true, force: true })
  }
})
describe('AC-001: MCP Protocol Integration', () => {
// AC interpretation: [Functional requirement] Recognized as MCP server and 4 tools are properly registered
// Validation: 4 tools (query_documents, ingest_file, list_files, status) are callable from MCP client
it('MCP server starts via stdio transport and is recognized by MCP client', async () => {
  // The shared server instance must have been created by beforeAll
  expect(ragServer).toBeDefined()
  // Each of the four tool handlers must be exposed as a callable method
  const handlerNames = [
    'handleQueryDocuments',
    'handleIngestFile',
    'handleListFiles',
    'handleStatus',
  ] as const
  for (const handlerName of handlerNames) {
    expect(typeof ragServer[handlerName]).toBe('function')
  }
})
// AC interpretation: [Technical requirement] JSON Schema-compliant tool definitions are recognized by MCP client
// Validation: Each tool's JSON Schema is correctly defined and returned to MCP client
it('JSON Schema definitions for 4 tools (query_documents, ingest_file, list_files, status) are recognized by MCP client', async () => {
  // Listing tools is the MCP client's job; here we only check that the two
  // argument-less handlers answer with a single text content item
  expect(ragServer).toBeDefined()
  const noArgHandlers = [() => ragServer.handleStatus(), () => ragServer.handleListFiles()]
  // Handlers are invoked one after another: status first, then list_files
  for (const invokeHandler of noArgHandlers) {
    const response = await invokeHandler()
    expect(response).toBeDefined()
    expect(response.content).toBeDefined()
    expect(response.content.length).toBe(1)
    expect(response.content[0].type).toBe('text')
  }
})
// AC interpretation: [Error handling] Appropriate MCP error response returned when error occurs
// Validation: MCP error response (error code, message) returned for invalid input
it('Appropriate MCP error response (JSON-RPC 2.0 format) returned for invalid tool invocation', async () => {
  // Ingesting a path that does not exist must reject
  const ingestAttempt = ragServer.handleIngestFile({ filePath: '/nonexistent/file.pdf' })
  await expect(ingestAttempt).rejects.toThrow()
})
// Edge Case: Parallel request processing
// Validation: Multiple MCP tool invocations are processed in parallel
it('3 parallel MCP tool invocations are processed normally (P-003)', async () => {
  // Start all three calls before awaiting any of them
  const pendingCalls = [
    ragServer.handleStatus(),
    ragServer.handleListFiles(),
    ragServer.handleStatus(),
  ]
  const responses = await Promise.all(pendingCalls)
  // Every call must come back with a single text content item
  expect(responses).toHaveLength(3)
  responses.forEach((response) => {
    expect(response).toBeDefined()
    expect(response.content).toBeDefined()
    expect(response.content.length).toBe(1)
    expect(response.content[0].type).toBe('text')
  })
})
})
// AC-002: Document Ingestion - SemanticChunker tests are in src/chunker/__tests__/semantic-chunker.test.ts
describe('AC-003: Vector Embedding Generation', () => {
// AC interpretation: [Technical requirement] Text chunks are converted to 384-dimensional vectors
// Validation: Generate embedding from text, 384-dimensional vector is returned
it('Text chunk properly converted to 384-dimensional vector', async () => {
  const embedderModule = await import('../../embedder/index')
  const textEmbedder = new embedderModule.Embedder({
    modelPath: 'Xenova/all-MiniLM-L6-v2',
    batchSize: 8,
    cacheDir: './tmp/models',
  })
  await textEmbedder.initialize()
  const vector = await textEmbedder.embed('This is a test text for embedding generation.')
  // The embedding must be a plain array of 384 numbers
  expect(vector).toBeDefined()
  expect(Array.isArray(vector)).toBe(true)
  expect(vector.length).toBe(384)
  expect(vector.every((component) => typeof component === 'number')).toBe(true)
})
// AC interpretation: [Technical requirement] all-MiniLM-L6-v2 model is automatically downloaded on first startup
// Validation: all-MiniLM-L6-v2 model is downloaded from Hugging Face on first startup
it('all-MiniLM-L6-v2 model automatically downloaded on first startup and cached in models/ directory', async () => {
  const embedderModule = await import('../../embedder/index')
  const textEmbedder = new embedderModule.Embedder({
    modelPath: 'Xenova/all-MiniLM-L6-v2',
    batchSize: 8,
    cacheDir: './tmp/models',
  })
  // initialize() is expected to make the model available (download on first run)
  await textEmbedder.initialize()
  // A successful embed call is used as evidence that initialization worked
  const vector = await textEmbedder.embed('Test model initialization.')
  expect(vector).toBeDefined()
  expect(Array.isArray(vector)).toBe(true)
  expect(vector.length).toBe(384)
})
// AC interpretation: [Technical requirement] Embedding generation executed with batch size 8
// Validation: Generate embeddings for multiple text chunks with batch size 8
it('Generate embeddings for multiple text chunks (e.g., 16) with batch size 8', async () => {
  const embedderModule = await import('../../embedder/index')
  const textEmbedder = new embedderModule.Embedder({
    modelPath: 'Xenova/all-MiniLM-L6-v2',
    batchSize: 8,
    cacheDir: './tmp/models',
  })
  await textEmbedder.initialize()
  // 16 inputs with batchSize 8 means two full batches
  const chunkTexts: string[] = []
  for (let chunkNumber = 1; chunkNumber <= 16; chunkNumber++) {
    chunkTexts.push(`This is test text chunk ${chunkNumber}.`)
  }
  const vectors = await textEmbedder.embedBatch(chunkTexts)
  // One vector per input text
  expect(vectors).toBeDefined()
  expect(Array.isArray(vectors)).toBe(true)
  expect(vectors.length).toBe(16)
  // Every vector must be an array of 384 numbers
  for (const vector of vectors) {
    expect(Array.isArray(vector)).toBe(true)
    expect(vector.length).toBe(384)
    expect(vector.every((component) => typeof component === 'number')).toBe(true)
  }
})
// Edge Case: Empty string
// Validation: Empty string embedding generation fails fast with error
it('Empty string embedding generation throws EmbeddingError (fail-fast)', async () => {
  const embedderModule = await import('../../embedder/index')
  const textEmbedder = new embedderModule.Embedder({
    modelPath: 'Xenova/all-MiniLM-L6-v2',
    batchSize: 8,
    cacheDir: './tmp/models',
  })
  await textEmbedder.initialize()
  // First call checks the error class, second call checks the message
  const emptyInput = ''
  await expect(textEmbedder.embed(emptyInput)).rejects.toThrow(embedderModule.EmbeddingError)
  await expect(textEmbedder.embed(emptyInput)).rejects.toThrow(
    'Cannot generate embedding for empty text'
  )
})
// Edge Case: Very long text
// Validation: Embedding generation for text over 1000 characters completes normally
it('Embedding generation for text over 1000 characters completes normally', async () => {
  const embedderModule = await import('../../embedder/index')
  const textEmbedder = new embedderModule.Embedder({
    modelPath: 'Xenova/all-MiniLM-L6-v2',
    batchSize: 8,
    cacheDir: './tmp/models',
  })
  await textEmbedder.initialize()
  // 26 characters repeated 50 times = 1300 characters
  const sentence = 'This is a very long text. '
  const vector = await textEmbedder.embed(sentence.repeat(50))
  expect(vector).toBeDefined()
  expect(Array.isArray(vector)).toBe(true)
  expect(vector.length).toBe(384)
  expect(vector.every((component) => typeof component === 'number')).toBe(true)
})
})
describe('AC-004: Vector Search', () => {
let localRagServer: RAGServer
const localTestDbPath = resolve('./tmp/test-lancedb-ac004')
const localTestDataDir = resolve('./tmp/test-data-ac004')
// Dedicated fixture for AC-004: its own server plus one ingested text document.
beforeAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    mkdirSync(workDir, { recursive: true })
  }
  localRagServer = new RAGServer({
    dbPath: localTestDbPath,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: './tmp/models',
    baseDir: localTestDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  await localRagServer.initialize()
  // The document the search tests below query against
  const sentences = [
    'TypeScript is a strongly typed programming language that builds on JavaScript.',
    'TypeScript adds optional static typing to JavaScript.',
    'TypeScript provides type safety and helps catch errors at compile time.',
    'TypeScript is widely used in modern web development.',
    'TypeScript supports interfaces, generics, and other advanced features.',
  ]
  const testFile = resolve(localTestDataDir, 'test-typescript.txt')
  writeFileSync(testFile, sentences.join(' '))
  await localRagServer.handleIngestFile({ filePath: testFile })
})
afterAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    rmSync(workDir, { recursive: true, force: true })
  }
})
// AC interpretation: [Functional requirement] Related documents returned for natural language query
// Validation: Call query_documents with natural language query, related documents are returned
it('Related documents returned for natural language query (e.g., "TypeScript type safety")', async () => {
  const response = await localRagServer.handleQueryDocuments({
    query: 'TypeScript type safety',
    limit: 5,
  })
  // The handler answers with exactly one text content item
  expect(response).toBeDefined()
  expect(response.content).toBeDefined()
  expect(response.content.length).toBe(1)
  expect(response.content[0].type).toBe('text')
  // The text payload is a JSON array with at least one hit
  const matches = JSON.parse(response.content[0].text)
  expect(Array.isArray(matches)).toBe(true)
  expect(matches.length).toBeGreaterThan(0)
  // Every hit must carry the four fields callers rely on
  const requiredFields = ['filePath', 'chunkIndex', 'text', 'score']
  for (const match of matches) {
    for (const field of requiredFields) {
      expect(match[field]).toBeDefined()
    }
  }
})
// AC interpretation: [Technical requirement] Search results sorted by relevance (most similar first)
// Validation: Scores are distances (smaller means more similar), so results must be in ascending score order
it('Search results sorted by score (descending)', async () => {
  const response = await localRagServer.handleQueryDocuments({
    query: 'TypeScript',
    limit: 5,
  })
  const hits = JSON.parse(response.content[0].text)
  expect(Array.isArray(hits)).toBe(true)
  // Scores are treated as distances (smaller means more similar), so
  // "best first" means each score is <= the one that follows it
  for (let next = 1; next < hits.length; next++) {
    expect(hits[next - 1].score).toBeLessThanOrEqual(hits[next].score)
  }
})
// AC interpretation: [Technical requirement] Default top-5 results returned
// Validation: When limit not specified, 5 search results are returned
it('When limit not specified, default top-5 results returned', async () => {
  // No limit is passed, so the handler's default applies
  const response = await localRagServer.handleQueryDocuments({ query: 'TypeScript' })
  const hits = JSON.parse(response.content[0].text)
  expect(Array.isArray(hits)).toBe(true)
  // Fewer than 5 chunks may exist, so only the upper bound is checked
  expect(hits.length).toBeLessThanOrEqual(5)
})
// Edge Case: No matches
// Validation: When no matching documents, empty array is returned
it('Empty array returned for query with no matching documents (e.g., random string)', async () => {
  // A dedicated database with nothing ingested, so the query has nothing to match
  const emptyDbPath = resolve('./tmp/test-lancedb-empty')
  mkdirSync(emptyDbPath, { recursive: true })
  try {
    const emptyServer = new RAGServer({
      dbPath: emptyDbPath,
      modelName: 'Xenova/all-MiniLM-L6-v2',
      cacheDir: './tmp/models',
      baseDir: testDataDir,
      maxFileSize: 100 * 1024 * 1024,
    })
    await emptyServer.initialize()
    const result = await emptyServer.handleQueryDocuments({
      query: 'xyzabc123randomstring',
    })
    const results = JSON.parse(result.content[0].text)
    expect(Array.isArray(results)).toBe(true)
    expect(results.length).toBe(0)
  } finally {
    // Clean up even when initialization or an assertion above throws,
    // so a failing run does not leave the temporary database behind
    rmSync(emptyDbPath, { recursive: true, force: true })
  }
})
// Edge Case: limit boundary values
// Validation: Operates normally with boundary values limit=1, limit=20
it('Operates normally with boundary values limit=1, limit=20', async () => {
  // Query once per boundary value, smallest first
  for (const limit of [1, 20]) {
    const response = await localRagServer.handleQueryDocuments({
      query: 'TypeScript',
      limit,
    })
    const hits = JSON.parse(response.content[0].text)
    expect(Array.isArray(hits)).toBe(true)
    // The handler may return fewer hits than requested, never more
    expect(hits.length).toBeLessThanOrEqual(limit)
  }
})
})
describe('AC-005: Error Handling (Basic)', () => {
// AC interpretation: [Error handling] Error message returned for non-existent file path
// Validation: Call ingest_file with non-existent file path, FileOperationError is returned
it('FileOperationError returned for non-existent file path (e.g., /nonexistent/file.pdf)', async () => {
  // The path is inside baseDir but nothing is ever written there
  const missingPath = resolve(testDataDir, 'nonexistent-file.pdf')
  const ingestAttempt = ragServer.handleIngestFile({ filePath: missingPath })
  // Only rejection is asserted here, not the specific error class
  await expect(ingestAttempt).rejects.toThrow()
})
// AC interpretation: [Error handling] Error message returned for corrupted PDF file
// Validation: Call ingest_file with corrupted PDF file, FileOperationError is returned
it('FileOperationError returned for corrupted PDF file (e.g., invalid header)', async () => {
  // A .pdf file whose content is plain text stands in for a corrupted PDF
  const brokenPdfPath = resolve(testDataDir, 'corrupted.pdf')
  writeFileSync(brokenPdfPath, 'This is not a valid PDF file')
  const ingestAttempt = ragServer.handleIngestFile({ filePath: brokenPdfPath })
  // Only rejection is asserted here, not the specific error class
  await expect(ingestAttempt).rejects.toThrow()
})
// AC interpretation: [Error handling] Error message returned when LanceDB connection fails
// Validation: When LanceDB connection fails, DatabaseError is returned
it('DatabaseError returned when LanceDB connection fails (e.g., invalid dbPath)', async () => {
  const invalidDbPath = '/invalid/path/that/does/not/exist'
  const invalidServer = new RAGServer({
    dbPath: invalidDbPath,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: './tmp/models',
    baseDir: testDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  // The failure may surface either during initialization or on the first query.
  // Only initialize() is wrapped in try/catch: if the query assertion lived inside
  // the same try block, its own assertion failure would be caught and swallowed,
  // and the test could never fail.
  let initFailure: { error: unknown } | undefined
  try {
    await invalidServer.initialize()
  } catch (error) {
    initFailure = { error }
  }
  if (initFailure !== undefined) {
    // Error during initialization is an accepted outcome
    expect(initFailure.error).toBeDefined()
    return
  }
  // Initialization succeeded, so the invalid path must make the query reject
  await expect(invalidServer.handleQueryDocuments({ query: 'test' })).rejects.toThrow()
})
})
})
// ============================================
// MVP Phase 2: Complete Functionality Integration Test
// ============================================
describe('RAG MCP Server Integration Test - Phase 2', () => {
beforeAll(async () => {
// Intentionally empty: no shared setup happens at this level.
// The nested suites visible below (AC-006, AC-007, AC-008) each create their own RAGServer and directories.
})
afterAll(async () => {
// Intentionally empty: the nested suites visible below remove their own directories in their afterAll hooks.
})
describe('AC-006: Additional Format Support (Phase 2)', () => {
let localRagServer: RAGServer
const localTestDbPath = resolve('./tmp/test-lancedb-ac006')
const localTestDataDir = resolve('./tmp/test-data-ac006')
// Dedicated fixture for AC-006: its own server and working directories.
beforeAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    mkdirSync(workDir, { recursive: true })
  }
  localRagServer = new RAGServer({
    dbPath: localTestDbPath,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: './tmp/models',
    baseDir: localTestDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  await localRagServer.initialize()
})
afterAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    rmSync(workDir, { recursive: true, force: true })
  }
})
// AC interpretation: [Functional requirement] DOCX files ingested via ingest_file tool and text extracted
// Validation: Call ingest_file with DOCX file path, text extraction and chunk storage succeed
it('DOCX file ingested via ingest_file tool, text properly extracted and saved to LanceDB', async () => {
  // No real DOCX fixture is built here (a valid DOCX is a binary format), so this
  // test covers only the failure path: a .docx file whose content is not DOCX must
  // be routed to the DOCX parser and rejected. Successful DOCX extraction is left
  // to manual or E2E testing.
  const { DocumentParser } = await import('../../parser/index')
  const parser = new DocumentParser({
    baseDir: localTestDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  const fakeDocxFile = resolve(localTestDataDir, 'fake.docx')
  writeFileSync(fakeDocxFile, 'Not a real DOCX file')
  // rejects.* fails with a clear message when parseFile unexpectedly resolves,
  // unlike a try/catch that has to force a failure with expect(false).toBe(true)
  await expect(parser.parseFile(fakeDocxFile)).rejects.toMatchObject({
    name: 'FileOperationError',
    message: expect.stringContaining('Failed to parse DOCX'),
  })
})
// AC interpretation: [Functional requirement] All formats (PDF/DOCX/TXT/MD) ingested successfully
// Validation: All 4 formats (PDF, DOCX, TXT, MD) ingested successfully
it('Sample files for all formats (PDF, DOCX, TXT, MD) ingested successfully', async () => {
  // Exercises DocumentParser.parseFile directly for each extension:
  // TXT and MD must parse; an invalid DOCX and any PDF must be rejected
  const { DocumentParser } = await import('../../parser/index')
  const parser = new DocumentParser({
    baseDir: localTestDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  // TXT: content is returned unchanged
  const testTxtFile = resolve(localTestDataDir, 'test-all-formats.txt')
  writeFileSync(testTxtFile, 'Test content for TXT format')
  const txtResult = await parser.parseFile(testTxtFile)
  expect(txtResult.content).toBe('Test content for TXT format')
  // MD: content is returned unchanged, markup included
  const testMdFile = resolve(localTestDataDir, 'test-all-formats.md')
  writeFileSync(testMdFile, '# Test Markdown\n\nTest content for MD format')
  const mdResult = await parser.parseFile(testMdFile)
  expect(mdResult.content).toBe('# Test Markdown\n\nTest content for MD format')
  // DOCX: the .docx branch is taken, and invalid content surfaces as a parse failure.
  // rejects.* fails with a clear message if parseFile unexpectedly resolves.
  const fakeDocxFile = resolve(localTestDataDir, 'test-all-formats.docx')
  writeFileSync(fakeDocxFile, 'Not a real DOCX file')
  await expect(parser.parseFile(fakeDocxFile)).rejects.toMatchObject({
    name: 'FileOperationError',
    message: expect.stringContaining('Failed to parse DOCX'),
  })
  // PDF: parseFile is expected to refuse .pdf as an unsupported format
  // (per the original notes, PDFs go through parsePdf instead)
  const fakePdfFile = resolve(localTestDataDir, 'test-all-formats.pdf')
  writeFileSync(fakePdfFile, 'Not a real PDF file')
  await expect(parser.parseFile(fakePdfFile)).rejects.toMatchObject({
    name: 'ValidationError',
    message: expect.stringContaining('Unsupported file format'),
  })
})
})
describe('AC-007: File Management', () => {
let localRagServer: RAGServer
const localTestDbPath = resolve('./tmp/test-lancedb-ac007')
const localTestDataDir = resolve('./tmp/test-data-ac007')
// Dedicated fixture for AC-007: its own server with three ingested text files.
beforeAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    mkdirSync(workDir, { recursive: true })
  }
  localRagServer = new RAGServer({
    dbPath: localTestDbPath,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: './tmp/models',
    baseDir: localTestDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  await localRagServer.initialize()
  // Files are written and ingested one at a time, in this order
  const seedFiles = [
    { name: 'test-file-1.txt', sentence: 'This is test file 1. ', repeats: 50 },
    { name: 'test-file-2.txt', sentence: 'This is test file 2. ', repeats: 30 },
    { name: 'test-file-3.txt', sentence: 'This is test file 3. ', repeats: 20 },
  ]
  for (const seed of seedFiles) {
    const seedPath = resolve(localTestDataDir, seed.name)
    writeFileSync(seedPath, seed.sentence.repeat(seed.repeats))
    await localRagServer.handleIngestFile({ filePath: seedPath })
  }
})
afterAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    rmSync(workDir, { recursive: true, force: true })
  }
})
// AC interpretation: [Functional requirement] List of ingested files displayed via list_files tool
// Validation: Call list_files, list of ingested files is returned
it('List of ingested files (filename, path, chunk count, ingestion time) displayed via list_files tool', async () => {
  const response = await localRagServer.handleListFiles()
  // The handler answers with exactly one text content item
  expect(response).toBeDefined()
  expect(response.content).toBeDefined()
  expect(response.content.length).toBe(1)
  expect(response.content[0].type).toBe('text')
  // The fixture ingested three files, and nothing else is in baseDir at this point
  const listing = JSON.parse(response.content[0].text)
  expect(listing.files).toBeDefined()
  expect(listing.files.length).toBe(3)
  // Entries flagged as ingested must expose path, chunk count and timestamp
  const ingestedEntries = listing.files.filter(
    (entry: { ingested: boolean }) => entry.ingested
  )
  for (const entry of ingestedEntries) {
    expect(entry.filePath).toBeDefined()
    expect(entry.chunkCount).toBeDefined()
    expect(entry.timestamp).toBeDefined()
  }
})
// AC interpretation: [Functional requirement] Filename, path, chunk count, ingestion time accurately displayed
// Validation: list_files result contains detailed information for each file
it('list_files result accurately contains detailed information (filePath, chunkCount, timestamp) for each file', async () => {
  const response = await localRagServer.handleListFiles()
  const { files: filesInBaseDir } = JSON.parse(response.content[0].text)
  // Timestamps must start with an ISO-8601 date-time (YYYY-MM-DDTHH:MM:SS)
  const isoDateTimePrefix = /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/
  // Each seeded file must be listed with a positive chunk count and a timestamp
  for (const fileName of ['test-file-1.txt', 'test-file-2.txt', 'test-file-3.txt']) {
    const expectedPath = resolve(localTestDataDir, fileName)
    const entry = filesInBaseDir.find(
      (candidate: { filePath: string }) => candidate.filePath === expectedPath
    )
    expect(entry).toBeDefined()
    expect(entry.chunkCount).toBeGreaterThan(0)
    expect(entry.timestamp).toMatch(isoDateTimePrefix)
  }
})
// AC interpretation: [Functional requirement] Supported file in BASE_DIR not yet ingested appears as ingested: false
// Validation: Place a file in BASE_DIR without ingesting it, list_files shows { filePath, ingested: false }
it('File in BASE_DIR not yet ingested appears with ingested: false in list_files', async () => {
  // Drop a file into baseDir without ever passing it to handleIngestFile
  const pendingPath = resolve(localTestDataDir, 'not-yet-ingested.txt')
  writeFileSync(pendingPath, 'This file has not been ingested.')
  try {
    const response = await localRagServer.handleListFiles()
    const listing = JSON.parse(response.content[0].text)
    const pendingEntry = listing.files.find(
      (candidate: { filePath: string }) => candidate.filePath === pendingPath
    )
    // Listed, flagged as not ingested, and without ingestion metadata
    expect(pendingEntry).toBeDefined()
    expect(pendingEntry.ingested).toBe(false)
    expect(pendingEntry.chunkCount).toBeUndefined()
    expect(pendingEntry.timestamp).toBeUndefined()
  } finally {
    // Remove the extra file so other tests in this suite still see three files
    rmSync(pendingPath, { force: true })
  }
})
// AC interpretation: [Functional requirement] System status displayed via status tool
// Validation: Call status, document count, chunk count, memory usage, uptime are returned
it('System status (documentCount, chunkCount, memoryUsage, uptime) displayed via status tool', async () => {
  const response = await localRagServer.handleStatus()
  // The handler answers with exactly one text content item
  expect(response).toBeDefined()
  expect(response.content).toBeDefined()
  expect(response.content.length).toBe(1)
  expect(response.content[0].type).toBe('text')
  const { documentCount, chunkCount, memoryUsage, uptime } = JSON.parse(response.content[0].text)
  // Three documents were ingested by the fixture; the other metrics only need to be positive
  expect(documentCount).toBe(3)
  expect(chunkCount).toBeGreaterThan(0)
  expect(memoryUsage).toBeGreaterThan(0)
  expect(uptime).toBeGreaterThan(0)
})
describe('System-managed path exclusion from list_files', () => {
let excludeServer: RAGServer
const excludeTestBase = resolve('./tmp/test-exclude-base')
const excludeTestDb = resolve(excludeTestBase, 'lancedb')
const excludeTestCache = resolve(excludeTestBase, 'models')
// Fixture where dbPath and cacheDir live INSIDE baseDir, next to user documents.
beforeAll(async () => {
  // baseDir first, then the two system-managed subdirectories
  for (const workDir of [excludeTestBase, excludeTestDb, excludeTestCache]) {
    mkdirSync(workDir, { recursive: true })
  }
  // Files inside the system-managed directories, created before the server starts
  writeFileSync(resolve(excludeTestDb, 'db-internal.txt'), 'Database internal file')
  writeFileSync(resolve(excludeTestCache, 'model-cache.txt'), 'Model cache file')
  // User documents: one at the baseDir root, one under docs/
  writeFileSync(resolve(excludeTestBase, 'user-document.txt'), 'User document content')
  const docsDir = resolve(excludeTestBase, 'docs')
  mkdirSync(docsDir, { recursive: true })
  writeFileSync(resolve(docsDir, 'notes.txt'), 'Notes in docs subdirectory')
  excludeServer = new RAGServer({
    dbPath: excludeTestDb,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: excludeTestCache,
    baseDir: excludeTestBase,
    maxFileSize: 100 * 1024 * 1024,
  })
  await excludeServer.initialize()
})
afterAll(async () => {
  // Removing baseDir also removes the nested db and cache directories
  rmSync(excludeTestBase, { recursive: true, force: true })
})
it('System-managed paths excluded from list_files scan', async () => {
  const response = await excludeServer.handleListFiles()
  const listing = JSON.parse(response.content[0].text)
  const listedPaths: string[] = listing.files.map(
    (entry: { filePath: string }) => entry.filePath
  )
  // User documents at the root and in a subdirectory must be listed
  const userDocuments = [
    resolve(excludeTestBase, 'user-document.txt'),
    resolve(excludeTestBase, 'docs', 'notes.txt'),
  ]
  for (const userDocument of userDocuments) {
    expect(listedPaths).toContain(userDocument)
  }
  // Files living under dbPath or cacheDir must not be listed
  const systemFiles = [
    resolve(excludeTestDb, 'db-internal.txt'),
    resolve(excludeTestCache, 'model-cache.txt'),
  ]
  for (const systemFile of systemFiles) {
    expect(listedPaths).not.toContain(systemFile)
  }
})
it('raw-data .md files inside dbPath excluded from files array', async () => {
  // Ingest inline content; this is expected to create a raw-data .md under dbPath/raw-data/
  const sourceUrl = 'https://example.com/exclude-test'
  await excludeServer.handleIngestData({
    content:
      'Integration test content for raw-data exclusion verification. ' +
      'This content is long enough to produce at least one chunk in the system.',
    metadata: {
      source: sourceUrl,
      format: 'text',
    },
  })
  const response = await excludeServer.handleListFiles()
  const listing = JSON.parse(response.content[0].text)
  const listedPaths: string[] = listing.files.map(
    (entry: { filePath: string }) => entry.filePath
  )
  // Nothing under raw-data may show up in the files array
  const rawDataPaths = listedPaths.filter((listedPath) => listedPath.includes('raw-data'))
  expect(rawDataPaths).toHaveLength(0)
  // The ingested content must instead be reported under sources
  expect(listing.sources.length).toBeGreaterThan(0)
  const matchingSource = listing.sources.find(
    (entry: { source?: string }) => entry.source === sourceUrl
  )
  expect(matchingSource).toBeDefined()
})
it('dbPath/cacheDir outside baseDir causes no errors', async () => {
  // Layout: data/, db/ and cache/ are siblings, so dbPath and cacheDir are NOT inside baseDir
  const siblingBase = resolve('./tmp/test-exclude-sibling')
  const siblingData = resolve(siblingBase, 'data')
  const siblingDb = resolve(siblingBase, 'db')
  const siblingCache = resolve(siblingBase, 'cache')
  for (const workDir of [siblingData, siblingDb, siblingCache]) {
    mkdirSync(workDir, { recursive: true })
  }
  // The single user file that should be listed
  const userFile = resolve(siblingData, 'sibling-file.txt')
  writeFileSync(userFile, 'File in sibling baseDir')
  try {
    const siblingServer = new RAGServer({
      dbPath: siblingDb,
      modelName: 'Xenova/all-MiniLM-L6-v2',
      cacheDir: siblingCache,
      baseDir: siblingData,
      maxFileSize: 100 * 1024 * 1024,
    })
    await siblingServer.initialize()
    const response = await siblingServer.handleListFiles()
    const listing = JSON.parse(response.content[0].text)
    const listedPaths: string[] = listing.files.map(
      (entry: { filePath: string }) => entry.filePath
    )
    // Exactly one entry: the user file
    expect(listedPaths).toContain(userFile)
    expect(listing.files.length).toBe(1)
  } finally {
    rmSync(siblingBase, { recursive: true, force: true })
  }
})
})
})
describe('AC-008: File Re-ingestion', () => {
let localRagServer: RAGServer
const localTestDbPath = resolve('./tmp/test-lancedb-ac008')
const localTestDataDir = resolve('./tmp/test-data-ac008')
// Dedicated fixture for AC-008: its own server and working directories.
beforeAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    mkdirSync(workDir, { recursive: true })
  }
  localRagServer = new RAGServer({
    dbPath: localTestDbPath,
    modelName: 'Xenova/all-MiniLM-L6-v2',
    cacheDir: './tmp/models',
    baseDir: localTestDataDir,
    maxFileSize: 100 * 1024 * 1024,
  })
  await localRagServer.initialize()
})
afterAll(async () => {
  for (const workDir of [localTestDbPath, localTestDataDir]) {
    rmSync(workDir, { recursive: true, force: true })
  }
})
// AC interpretation: [Functional requirement] When existing file is re-ingested, old data is completely deleted
// Validation: Re-ingest with same file path, old chunks are deleted
it('When existing file is re-ingested, old data is completely deleted', async () => {
  const testFile = resolve(localTestDataDir, 'test-reingest.txt')
  // First ingestion with the original content
  writeFileSync(testFile, 'This is the original content. '.repeat(50))
  await localRagServer.handleIngestFile({ filePath: testFile })
  // Second ingestion of the same path with different content
  writeFileSync(testFile, 'This is the updated content. '.repeat(30))
  const reingestResponse = await localRagServer.handleIngestFile({ filePath: testFile })
  const { chunkCount: updatedChunkCount } = JSON.parse(reingestResponse.content[0].text)
  // The path must be listed exactly once
  const listResponse = await localRagServer.handleListFiles()
  const listing = JSON.parse(listResponse.content[0].text)
  const entriesForFile = listing.files.filter(
    (entry: { filePath: string }) => entry.filePath === testFile
  )
  expect(entriesForFile.length).toBe(1)
  // The listed chunk count must be the one reported by the second ingestion
  expect(entriesForFile[0].chunkCount).toBe(updatedChunkCount)
})
// AC interpretation: [Technical requirement] After re-ingestion, only new data exists (0 duplicate data)
// Validation: After re-ingestion, chunks with same filePath contain only new data
it('After re-ingestion, only new data exists (0 duplicate data, R-003)', async () => {
// Initial ingestion
const testFile = resolve(localTestDataDir, 'test-no-duplicate.txt')
writeFileSync(testFile, 'Original data. '.repeat(50))
const result1 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest1 = JSON.parse(result1.content[0].text)
const originalChunkCount = ingest1.chunkCount
// Re-ingestion
writeFileSync(testFile, 'Updated data. '.repeat(40))
const result2 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest2 = JSON.parse(result2.content[0].text)
const updatedChunkCount = ingest2.chunkCount
// Validation: Only one file exists in file list (no duplicates)
const listResult = await localRagServer.handleListFiles()
const files = JSON.parse(listResult.content[0].text)
const targetFiles = files.files.filter((f: { filePath: string }) => f.filePath === testFile)
expect(targetFiles.length).toBe(1)
// Validation: Chunk count matches new data only (not old + new)
expect(targetFiles[0].chunkCount).toBe(updatedChunkCount)
expect(targetFiles[0].chunkCount).not.toBe(originalChunkCount + updatedChunkCount)
// Validation: Timestamp is updated
expect(targetFiles[0].timestamp).toBeDefined()
})
// AC interpretation: [Technical requirement] Atomicity of delete→insert guaranteed (transaction processing)
// Validation: Delete and insert executed atomically, no intermediate state exists
it('Atomicity of delete→insert guaranteed (transaction processing)', async () => {
// Verify transaction processing by confirming implementation executes backup→delete→insert in order
// Here, verify that in normal case, old data is completely deleted and only new data exists
const testFile = resolve(localTestDataDir, 'test-atomicity.txt')
writeFileSync(testFile, 'Atomicity test data. '.repeat(50))
const result1 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest1 = JSON.parse(result1.content[0].text)
const originalChunkCount = ingest1.chunkCount
// Re-ingestion
writeFileSync(testFile, 'Atomicity test updated. '.repeat(40))
const result2 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest2 = JSON.parse(result2.content[0].text)
const updatedChunkCount = ingest2.chunkCount
// Validation: Only one file exists in file list (atomicity guaranteed)
const listResult = await localRagServer.handleListFiles()
const files = JSON.parse(listResult.content[0].text)
const targetFiles = files.files.filter((f: { filePath: string }) => f.filePath === testFile)
expect(targetFiles.length).toBe(1)
// Validation: Chunk count proves atomicity - only new data exists (not old + new)
expect(targetFiles[0].chunkCount).toBe(updatedChunkCount)
expect(targetFiles[0].chunkCount).not.toBe(originalChunkCount + updatedChunkCount)
})
// AC interpretation: [Error handling] On error, automatic rollback from backup
// Validation: When error occurs during insertion, old data is restored
it('On error (e.g., insertion failure), automatic rollback from backup', async () => {
// Verify rollback functionality by confirming implementation catches error with try-catch and restores from backup
// Here, verify that in normal case without error, old data is completely deleted and only new data exists
// Rollback on error requires implementation-level test (using mocks)
const testFile = resolve(localTestDataDir, 'test-rollback.txt')
writeFileSync(testFile, 'Rollback test data. '.repeat(50))
const result1 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest1 = JSON.parse(result1.content[0].text)
const originalChunkCount = ingest1.chunkCount
// Re-ingest normally (no error)
writeFileSync(testFile, 'Rollback test updated. '.repeat(40))
const result2 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest2 = JSON.parse(result2.content[0].text)
const updatedChunkCount = ingest2.chunkCount
// Validation: In normal case, no rollback occurs and new data exists
const listResult = await localRagServer.handleListFiles()
const files = JSON.parse(listResult.content[0].text)
const targetFiles = files.files.filter((f: { filePath: string }) => f.filePath === testFile)
expect(targetFiles.length).toBe(1)
// Validation: Chunk count confirms successful re-ingestion (not old + new)
expect(targetFiles[0].chunkCount).toBe(updatedChunkCount)
expect(targetFiles[0].chunkCount).not.toBe(originalChunkCount + updatedChunkCount)
// Note: Rollback behavior on error needs to be verified in unit test
// by mocking VectorStore.insertChunks to cause error
})
// AC interpretation: [Data protection] Prevent data loss when re-ingest results in 0 chunks
// Validation: When chunking produces 0 chunks, error is thrown before delete (preserves existing data)
it('Throws error when chunking produces 0 chunks (prevents data loss on re-ingest)', async () => {
// Initial ingestion with valid content
const testFile = resolve(localTestDataDir, 'test-empty-chunks.txt')
writeFileSync(testFile, 'This is valid content for initial ingestion. '.repeat(50))
const result1 = await localRagServer.handleIngestFile({ filePath: testFile })
const ingest1 = JSON.parse(result1.content[0].text)
expect(ingest1.chunkCount).toBeGreaterThan(0)
// Re-ingest with empty content (should fail, preserving original data)
writeFileSync(testFile, '')
await expect(localRagServer.handleIngestFile({ filePath: testFile })).rejects.toThrow(
/No.*chunks/i
)
// Validation: Original data is preserved (not deleted)
const listResult = await localRagServer.handleListFiles()
const files = JSON.parse(listResult.content[0].text)
const targetFiles = files.files.filter((f: { filePath: string }) => f.filePath === testFile)
expect(targetFiles.length).toBe(1)
expect(targetFiles[0].chunkCount).toBe(ingest1.chunkCount)
})
})
describe('AC-009: Error Handling (Complete)', () => {
  let localRagServer: RAGServer
  const localTestDbPath = resolve('./tmp/test-lancedb-ac009')
  const localTestDataDir = resolve('./tmp/test-data-ac009')

  // Sets NODE_ENV to `value`, or removes the variable when `value` is undefined.
  // Assigning `undefined` to a process.env key does NOT unset it: Node coerces
  // the value to the string 'undefined'. Reflect.deleteProperty is used rather
  // than the `delete` operator so that lint auto-fixes which rewrite `delete x`
  // into `x = undefined` cannot reintroduce that bug.
  const setNodeEnv = (value: string | undefined): void => {
    if (value === undefined) {
      Reflect.deleteProperty(process.env, 'NODE_ENV')
    } else {
      process.env['NODE_ENV'] = value
    }
  }

  // Calls ingest_file for a file name that does not exist in the suite's data
  // directory while NODE_ENV is temporarily set to `nodeEnv` (undefined = unset),
  // restores NODE_ENV afterwards, asserts that the call threw, and returns the
  // message of the thrown error.
  const ingestMissingFileMessage = async (
    fileName: string,
    nodeEnv: string | undefined
  ): Promise<string> => {
    const originalEnv = process.env['NODE_ENV']
    setNodeEnv(nodeEnv)

    let threw = false
    let message = ''
    try {
      await localRagServer.handleIngestFile({ filePath: resolve(localTestDataDir, fileName) })
    } catch (error) {
      threw = true
      message = error instanceof Error ? error.message : String(error)
    } finally {
      setNodeEnv(originalEnv)
    }

    // Without this check the callers would pass with zero assertions if the
    // ingest call ever stopped throwing.
    expect(threw).toBe(true)
    return message
  }

  beforeAll(async () => {
    // Setup dedicated RAGServer for AC-009
    mkdirSync(localTestDbPath, { recursive: true })
    mkdirSync(localTestDataDir, { recursive: true })
    localRagServer = new RAGServer({
      dbPath: localTestDbPath,
      modelName: 'Xenova/all-MiniLM-L6-v2',
      cacheDir: './tmp/models',
      baseDir: localTestDataDir,
      maxFileSize: 100 * 1024 * 1024, // 100MB
    })
    await localRagServer.initialize()
  })

  afterAll(async () => {
    rmSync(localTestDbPath, { recursive: true, force: true })
    rmSync(localTestDataDir, { recursive: true, force: true })
  })

  // AC interpretation: [Error handling] Error message returned for file without access permission
  // Validation: a non-existent file is used as a stand-in because chmod 000 does
  // not work on Windows; the test only requires that ingest_file rejects.
  it('FileOperationError returned for file without access permission (e.g., chmod 000)', async () => {
    const nonExistentFile = resolve(localTestDataDir, 'nonexistent-file.txt')
    await expect(localRagServer.handleIngestFile({ filePath: nonExistentFile })).rejects.toThrow()
  })

  // AC interpretation: [Error handling] Size overflow error returned for files over 100MB
  // Validation: creating a real 101MB file would make the test slow, so this test
  // only confirms that a small file (long enough to be chunked) is accepted.
  // Note: the over-limit case is covered in the DocumentParser unit test.
  it('ValidationError (size overflow) returned for files over 100MB (e.g., 101MB)', async () => {
    const testFile = resolve(localTestDataDir, 'large-file.txt')
    writeFileSync(
      testFile,
      'Small file content for validation test of file size limits. ' +
        'This content needs to be long enough to generate at least one chunk. ' +
        'The semantic chunker requires sufficient text content to process properly.'
    )
    // Verify normal operation (under 100MB)
    await expect(localRagServer.handleIngestFile({ filePath: testFile })).resolves.toBeDefined()
  })

  // AC interpretation: [Security] Path traversal attacks are rejected (S-002)
  // Validation: a relative traversal path must be rejected with a message
  // containing 'absolute path'.
  it('Path traversal attack (e.g., ../../etc/passwd) rejected with ValidationError (S-002)', async () => {
    await expect(
      localRagServer.handleIngestFile({ filePath: '../../etc/passwd' })
    ).rejects.toThrow('absolute path')
  })

  // AC interpretation: [Error handling] Appropriate error message returned when out of memory
  // Validation: out-of-memory conditions are not simulated here; the test only
  // confirms that a normal ingest succeeds.
  // Note: actual out of memory error testing is done in mocks or E2E tests.
  it('Appropriate error message returned when out of memory (simulated)', async () => {
    const testFile = resolve(localTestDataDir, 'memory-test.txt')
    writeFileSync(
      testFile,
      'Memory test content for verifying error handling implementation. ' +
        'This content needs to be long enough to generate chunks properly. ' +
        'The semantic chunker processes text into meaningful segments.'
    )
    // Verify normal operation
    await expect(localRagServer.handleIngestFile({ filePath: testFile })).resolves.toBeDefined()
  })

  // AC interpretation: [Security] Error messages do not contain stack traces by default (S-004)
  // MCP servers should be secure by default - only show stack traces when explicitly in development mode
  it('Stack traces not included by default when NODE_ENV is not set (S-004)', async () => {
    const errorMessage = await ingestMissingFileMessage('nonexistent-default.txt', undefined)
    expect(errorMessage).not.toContain('at ')
    expect(errorMessage).not.toContain('.ts:')
  })

  // Development mode should include stack traces for debugging
  it('Stack traces included when NODE_ENV=development (S-004)', async () => {
    const errorMessage = await ingestMissingFileMessage('nonexistent-dev.txt', 'development')
    // In development mode, stack trace should be included
    expect(errorMessage).toContain('at ')
  })
})
describe('AC-010: File Deletion', () => {
  // Suite-specific database and data directories plus the server built on them.
  const localTestDbPath = resolve('./tmp/test-lancedb-ac010')
  const localTestDataDir = resolve('./tmp/test-data-ac010')
  let localRagServer: RAGServer

  // Returns the `files` array from the parsed list_files response.
  const listFiles = async () => {
    const response = await localRagServer.handleListFiles()
    return JSON.parse(response.content[0].text).files
  }

  // Runs query_documents with a limit of 5 and returns the parsed result array.
  const searchFor = async (query: string) => {
    const response = await localRagServer.handleQueryDocuments({ query, limit: 5 })
    return JSON.parse(response.content[0].text)
  }

  beforeAll(async () => {
    // Create both directories, then build and initialize the server for this suite.
    for (const dir of [localTestDbPath, localTestDataDir]) {
      mkdirSync(dir, { recursive: true })
    }
    localRagServer = new RAGServer({
      dbPath: localTestDbPath,
      modelName: 'Xenova/all-MiniLM-L6-v2',
      cacheDir: './tmp/models',
      baseDir: localTestDataDir,
      maxFileSize: 100 * 1024 * 1024,
    })
    await localRagServer.initialize()
  })

  afterAll(async () => {
    // Remove everything the suite wrote to disk.
    for (const dir of [localTestDbPath, localTestDataDir]) {
      rmSync(dir, { recursive: true, force: true })
    }
  })

  // [Functional requirement] A deleted file is no longer reported as ingested.
  // Checked by comparing list_files output before and after delete_file.
  it('Deleted file no longer appears in list_files', async () => {
    const target = resolve(localTestDataDir, 'test-delete.txt')
    writeFileSync(target, 'This file will be deleted. '.repeat(50))
    await localRagServer.handleIngestFile({ filePath: target })

    // Before deletion an entry with this path must be listed.
    const listedBefore = await listFiles()
    const presentBefore = listedBefore.some(
      (entry: { filePath: string }) => entry.filePath === target
    )
    expect(presentBefore).toBe(true)

    await localRagServer.handleDeleteFile({ filePath: target })

    // After deletion the test only requires that no entry for this path is
    // still flagged `ingested`; the test itself never removes the file from disk.
    const listedAfter = await listFiles()
    const stillIngested = listedAfter.some(
      (entry: { filePath: string; ingested: boolean }) =>
        entry.filePath === target && entry.ingested
    )
    expect(stillIngested).toBe(false)
  })

  // [Functional requirement] Content of a deleted file is not returned by search.
  // Checked by running the same query before and after delete_file.
  it('Deleted file content does not appear in search results', async () => {
    const target = resolve(localTestDataDir, 'test-search-delete.txt')
    writeFileSync(target, 'Unique keyword XYZABC123 for deletion test. '.repeat(30))
    await localRagServer.handleIngestFile({ filePath: target })

    const hitsBefore = await searchFor('XYZABC123')
    expect(hitsBefore.length).toBeGreaterThan(0)

    await localRagServer.handleDeleteFile({ filePath: target })

    const hitsAfter = await searchFor('XYZABC123')
    expect(hitsAfter.length).toBe(0)
  })

  // [Functional requirement] Deleting a path that was never ingested is idempotent.
  // Checked by requiring the call to resolve with a defined value.
  it('Deleting non-existent file completes without error (idempotent)', async () => {
    const missingPath = resolve(localTestDataDir, 'non-existent.txt')
    const deletion = localRagServer.handleDeleteFile({ filePath: missingPath })
    await expect(deletion).resolves.toBeDefined()
  })

  // [Security] Relative paths are rejected by delete_file (S-002).
  // Checked by requiring a rejection whose message contains 'absolute path'.
  it('Relative path deletion rejected with error (S-002 security)', async () => {
    const deletion = localRagServer.handleDeleteFile({ filePath: '../../../etc/passwd' })
    await expect(deletion).rejects.toThrow('absolute path')
  })
})
describe('File Title Extraction Pipeline', () => {
  // Suite-specific database and data directories plus the server built on them.
  const localTestDbPath = resolve('./tmp/test-lancedb-title')
  const localTestDataDir = resolve('./tmp/test-data-title')
  let localRagServer: RAGServer

  beforeAll(async () => {
    // Create both directories, then build and initialize the server for this suite.
    for (const dir of [localTestDbPath, localTestDataDir]) {
      mkdirSync(dir, { recursive: true })
    }
    localRagServer = new RAGServer({
      dbPath: localTestDbPath,
      modelName: 'Xenova/all-MiniLM-L6-v2',
      cacheDir: './tmp/models',
      baseDir: localTestDataDir,
      maxFileSize: 100 * 1024 * 1024,
    })
    await localRagServer.initialize()
  })

  afterAll(async () => {
    // Remove everything the suite wrote to disk.
    for (const dir of [localTestDbPath, localTestDataDir]) {
      rmSync(dir, { recursive: true, force: true })
    }
  })

  // T-3: end-to-end check that an HTML document passed to ingest_data reports
  // its title in the ingest response and again in query_documents results.
  it('ingest_data with HTML content preserves fileTitle in query results', async () => {
    const expectedTitle = 'RAG Architecture Guide'
    const html = `
<!DOCTYPE html>
<html>
<head><title>RAG Architecture Guide</title></head>
<body>
<article>
<h1>RAG Architecture Guide</h1>
<p>Retrieval-Augmented Generation combines information retrieval with language model generation to produce accurate and grounded responses.</p>
<p>This approach helps reduce hallucinations by providing relevant context from a knowledge base.</p>
</article>
</body>
</html>
`

    // Step 1: ingest and inspect the ingest response.
    const ingestResponse = await localRagServer.handleIngestData({
      content: html,
      metadata: {
        source: 'https://example.com/rag-guide',
        format: 'html',
      },
    })
    const ingestPayload = JSON.parse(ingestResponse.content[0].text)
    expect(ingestPayload.chunkCount).toBeGreaterThan(0)
    expect(ingestPayload.fileTitle).toBe(expectedTitle)

    // Step 2: query and require at least one hit carrying exactly that title.
    const queryResponse = await localRagServer.handleQueryDocuments({
      query: 'RAG retrieval augmented generation',
      limit: 5,
    })
    const hits = JSON.parse(queryResponse.content[0].text)
    expect(hits.length).toBeGreaterThan(0)

    const titledHit = hits.find(
      (hit: { fileTitle?: string | null }) => hit.fileTitle === expectedTitle
    )
    expect(titledHit).toBeDefined()
  })
})
describe('Meta JSON Sidecar Pipeline', () => {
  // Suite-specific database and data directories plus the server built on them.
  const localTestDbPath = resolve('./tmp/test-lancedb-meta-json')
  const localTestDataDir = resolve('./tmp/test-data-meta-json')
  let localRagServer: RAGServer

  // Derives the raw-data path for `source` under this suite's database
  // directory, always requesting the 'markdown' format as every test here does.
  const rawPathFor = (source: string) => generateRawDataPath(localTestDbPath, source, 'markdown')

  // Reads a .meta.json file straight from disk (not via loadMetaJson) and parses it.
  const readSidecar = async (metaJsonPath: string) =>
    JSON.parse(await readFile(metaJsonPath, 'utf-8'))

  beforeAll(async () => {
    // Create both directories, then build and initialize the server for this suite.
    for (const dir of [localTestDbPath, localTestDataDir]) {
      mkdirSync(dir, { recursive: true })
    }
    localRagServer = new RAGServer({
      dbPath: localTestDbPath,
      modelName: 'Xenova/all-MiniLM-L6-v2',
      cacheDir: './tmp/models',
      baseDir: localTestDataDir,
      maxFileSize: 100 * 1024 * 1024,
    })
    await localRagServer.initialize()
  })

  afterAll(async () => {
    // Remove everything the suite wrote to disk.
    for (const dir of [localTestDbPath, localTestDataDir]) {
      rmSync(dir, { recursive: true, force: true })
    }
  })

  // Test 1: HTML passed to ingest_data must leave a .meta.json sidecar whose
  // title, format and source fields match the ingested document.
  it('ingest_data HTML creates .meta.json with correct title and format', async () => {
    const source = 'https://example.com/meta-json-test'
    const html = `
<!DOCTYPE html>
<html>
<head><title>Meta JSON Test Page</title></head>
<body>
<article>
<h1>Meta JSON Test Page</h1>
<p>This article describes how meta JSON sidecar files work in the RAG pipeline for preserving document metadata.</p>
<p>The sidecar approach decouples metadata storage from the raw content, enabling clean separation of concerns.</p>
</article>
</body>
</html>
`

    const ingestResponse = await localRagServer.handleIngestData({
      content: html,
      metadata: { source, format: 'html' },
    })
    const ingestPayload = JSON.parse(ingestResponse.content[0].text)
    expect(ingestPayload.chunkCount).toBeGreaterThan(0)

    // Locate the sidecar next to the derived raw-data file and inspect it.
    const metaJsonPath = generateMetaJsonPath(rawPathFor(source))
    expect(existsSync(metaJsonPath)).toBe(true)

    const sidecar = await readSidecar(metaJsonPath)
    expect(sidecar.title).toBe('Meta JSON Test Page')
    expect(sidecar.format).toBe('html')
    expect(sidecar.source).toBe('https://example.com/meta-json-test')
  })

  // Test 2: for markdown input the sidecar title must equal the H1 text.
  it('ingest_data markdown creates .meta.json with title from H1', async () => {
    const source = 'https://example.com/markdown-meta-test'
    const markdownLines = [
      '# My Markdown Title',
      '',
      'This is a detailed markdown document that explains the concept of semantic chunking.',
      'Semantic chunking splits text at natural boundaries like paragraphs and sentences.',
      'It produces higher quality chunks than fixed-size approaches.',
    ]

    await localRagServer.handleIngestData({
      content: markdownLines.join('\n'),
      metadata: { source, format: 'markdown' },
    })

    const sidecar = await readSidecar(generateMetaJsonPath(rawPathFor(source)))
    expect(sidecar.title).toBe('My Markdown Title')
    expect(sidecar.format).toBe('markdown')
  })

  // Test 3: for plain text input the sidecar title must equal the first line.
  it('ingest_data text creates .meta.json with title from first line', async () => {
    const source = 'https://example.com/text-meta-test'
    const textLines = [
      'My Text Document Title',
      '',
      'This is the body of the text document that contains useful information.',
      'The first line followed by a blank line serves as the document title.',
      'This pattern is commonly used in plain text documents.',
    ]

    await localRagServer.handleIngestData({
      content: textLines.join('\n'),
      metadata: { source, format: 'text' },
    })

    const sidecar = await readSidecar(generateMetaJsonPath(rawPathFor(source)))
    expect(sidecar.title).toBe('My Text Document Title')
    expect(sidecar.format).toBe('text')
  })

  // Test 4: query results for an ingested HTML page must carry the page title
  // as fileTitle, and neither the chunk text nor the stored .md file may begin
  // with that title rendered as an H1.
  it('ingest_data HTML -> query returns correct fileTitle without H1 duplication', async () => {
    const source = 'https://example.com/duplication-check'
    const html = `
<!DOCTYPE html>
<html>
<head><title>Duplication Check Title</title></head>
<body>
<article>
<h1>Duplication Check Title</h1>
<p>Vector embeddings are numerical representations of text that capture semantic meaning in high-dimensional space.</p>
<p>These embeddings enable efficient similarity search across large document collections.</p>
</article>
</body>
</html>
`

    await localRagServer.handleIngestData({
      content: html,
      metadata: { source, format: 'html' },
    })

    const queryResponse = await localRagServer.handleQueryDocuments({
      query: 'vector embeddings semantic meaning',
      limit: 5,
    })
    const hits = JSON.parse(queryResponse.content[0].text)
    expect(hits.length).toBeGreaterThan(0)

    // Keep only the hits that point at this document's raw-data file.
    const rawDataPath = rawPathFor(source)
    const ownHits = hits.filter((hit: { filePath: string }) => hit.filePath === rawDataPath)
    expect(ownHits.length).toBeGreaterThan(0)

    for (const hit of ownHits) {
      expect(hit.fileTitle).toBe('Duplication Check Title')
      expect(hit.text.startsWith('# Duplication Check Title')).toBe(false)
    }

    // The stored markdown must not start with "# title\n\n" either.
    const storedMarkdown = await readFile(rawDataPath, 'utf-8')
    expect(storedMarkdown.startsWith('# Duplication Check Title\n\n')).toBe(false)
  })

  // Test 5: delete_file called with a source must remove both the raw .md file
  // and its .meta.json sidecar from disk.
  it('delete_file with source removes both .md and .meta.json', async () => {
    const source = 'https://example.com/delete-meta-test'
    const html = `
<!DOCTYPE html>
<html>
<head><title>Delete Test Page</title></head>
<body>
<article>
<h1>Delete Test Page</h1>
<p>This document will be ingested and then deleted to verify cleanup of sidecar files.</p>
<p>Both the raw markdown file and the meta JSON sidecar should be removed.</p>
</article>
</body>
</html>
`

    await localRagServer.handleIngestData({
      content: html,
      metadata: { source, format: 'html' },
    })

    // Both files must be present before the delete call.
    const rawDataPath = rawPathFor(source)
    const metaJsonPath = generateMetaJsonPath(rawDataPath)
    expect(existsSync(rawDataPath)).toBe(true)
    expect(existsSync(metaJsonPath)).toBe(true)

    await localRagServer.handleDeleteFile({ source })

    // And both must be gone afterwards.
    expect(existsSync(rawDataPath)).toBe(false)
    expect(existsSync(metaJsonPath)).toBe(false)
  })
})
})