Local RAG

Overview Schema Related Servers Score Discussions

ingest-data.test.ts•14.1 KiB

// ingest_data Tool Test // Test Type: Integration Test // Tests handleIngestData functionality including HTML parsing and raw-data storage import { mkdir, readFile, rm } from 'node:fs/promises' import { afterAll, beforeAll, describe, expect, it } from 'vitest' import { RAGServer } from '../../server/index.js' // ============================================ // Test Configuration // ============================================ const testDbPath = './tmp/test-ingest-data-db' const testConfig = { dbPath: testDbPath, modelName: 'Xenova/all-MiniLM-L6-v2', cacheDir: './tmp/test-model-cache', baseDir: '.', maxFileSize: 10 * 1024 * 1024, } // ============================================ // Tests // ============================================ describe('ingest_data Tool', () => { let server: RAGServer beforeAll(async () => { await mkdir(testDbPath, { recursive: true }) await mkdir(testConfig.cacheDir, { recursive: true }) server = new RAGServer(testConfig) await server.initialize() }, 120000) // 2 minutes for model download afterAll(async () => { await rm(testDbPath, { recursive: true, force: true }) }) // -------------------------------------------- // Text Format Ingestion // -------------------------------------------- describe('Text Format Ingestion', () => { it('ingests plain text content', async () => { const content = 'This is plain text content for testing the ingest_data tool.' const source = 'test://plain-text-test' const result = await server.handleIngestData({ content, metadata: { source, format: 'text', }, }) const parsed = JSON.parse(result.content[0].text) expect(parsed.chunkCount).toBeGreaterThan(0) expect(parsed.filePath).toContain('raw-data') expect(parsed.filePath).toMatch(/\.md$/) // All formats use .md extension }) it('saves raw text to raw-data directory', async () => { const content = 'Content to verify file saving functionality.' const source = 'test://file-save-test' const result = await server.handleIngestData({ content, metadata: { source, format: 'text', }, }) const parsed = JSON.parse(result.content[0].text) const savedContent = await readFile(parsed.filePath, 'utf-8') expect(savedContent).toBe(content) }) }) // -------------------------------------------- // Markdown Format Ingestion // -------------------------------------------- describe('Markdown Format Ingestion', () => { it('ingests markdown content', async () => { const content = `# Heading This is markdown content with **bold** and _italic_ text. ## Subheading - List item 1 - List item 2 ` const source = 'test://markdown-test' const result = await server.handleIngestData({ content, metadata: { source, format: 'markdown', }, }) const parsed = JSON.parse(result.content[0].text) expect(parsed.chunkCount).toBeGreaterThan(0) expect(parsed.filePath).toMatch(/\.md$/) }) }) // -------------------------------------------- // HTML Format Ingestion // -------------------------------------------- describe('HTML Format Ingestion', () => { it('ingests HTML content and converts to markdown', async () => { const html = ` <!DOCTYPE html> <html> <body> <article> <h1>Test Article</h1> <p>This is the main content of the test article. It contains enough text for Readability to extract properly.</p> </article> </body> </html> ` const source = 'https://example.com/test-article' const result = await server.handleIngestData({ content: html, metadata: { source, format: 'html', }, }) const parsed = JSON.parse(result.content[0].text) expect(parsed.chunkCount).toBeGreaterThan(0) // HTML is converted to markdown, so saved as .md expect(parsed.filePath).toMatch(/\.md$/) }) it('extracts main content from HTML and removes noise', async () => { const html = ` <!DOCTYPE html> <html> <body> <nav><a href="/">Home</a><a href="/about">About</a></nav> <article> <h1>Main Article Title</h1> <p>This is the main content that should be extracted. It needs to be long enough for Readability to identify as primary content.</p> </article> <footer><p>Copyright 2024</p></footer> </body> </html> ` const source = 'https://example.com/noise-test' const result = await server.handleIngestData({ content: html, metadata: { source, format: 'html', }, }) const parsed = JSON.parse(result.content[0].text) const savedContent = await readFile(parsed.filePath, 'utf-8') // Main content should be present expect(savedContent).toContain('Main Article Title') expect(savedContent).toContain('main content that should be extracted') }) it('throws error for HTML with no extractable content', async () => { const html = '<html><body></body></html>' const source = 'https://example.com/empty-html' await expect( server.handleIngestData({ content: html, metadata: { source, format: 'html', }, }) ).rejects.toThrow('Failed to extract content from HTML') }) }) // -------------------------------------------- // Source Normalization // -------------------------------------------- describe('Source Normalization', () => { it('normalizes URL sources (removes query string)', async () => { const content = 'Test content for URL normalization' const source1 = 'https://example.com/page?utm_source=google' const source2 = 'https://example.com/page?tracking=xyz' const result1 = await server.handleIngestData({ content, metadata: { source: source1, format: 'text' }, }) const result2 = await server.handleIngestData({ content: 'Updated content', metadata: { source: source2, format: 'text' }, }) const parsed1 = JSON.parse(result1.content[0].text) const parsed2 = JSON.parse(result2.content[0].text) // Same normalized source should result in same file path expect(parsed1.filePath).toBe(parsed2.filePath) }) }) // -------------------------------------------- // Re-ingestion (Update) // -------------------------------------------- describe('Re-ingestion', () => { it('updates existing content when re-ingesting same source', async () => { const source = 'test://update-test' // Initial ingestion await server.handleIngestData({ content: 'Original content', metadata: { source, format: 'text' }, }) // Re-ingestion with updated content const result = await server.handleIngestData({ content: 'Updated content after re-ingestion', metadata: { source, format: 'text' }, }) const parsed = JSON.parse(result.content[0].text) const savedContent = await readFile(parsed.filePath, 'utf-8') expect(savedContent).toBe('Updated content after re-ingestion') expect(savedContent).not.toContain('Original') }) }) // -------------------------------------------- // List Files with Source Info // -------------------------------------------- describe('List Files with Source Info', () => { it('list_files includes source for raw-data files', async () => { const source = 'https://example.com/list-files-test' // Content needs to be long enough to generate at least one chunk const content = 'This is a longer content for testing the list files functionality. ' + 'It needs to be substantial enough to create at least one chunk. ' + 'The semantic chunker requires sufficient content to process properly.' await server.handleIngestData({ content, metadata: { source, format: 'text' }, }) const listResult = await server.handleListFiles() const files = JSON.parse(listResult.content[0].text) // Find the specific file we just ingested by source const targetFile = files.find( (f: { filePath: string; source?: string }) => f.source === source ) expect(targetFile).toBeDefined() expect(targetFile.filePath).toContain('raw-data') }) it('list_files does not include source for regular files', async () => { // Regular files (not raw-data) should not have source field // This is tested implicitly - if filePath doesn't contain 'raw-data', // the source field should be undefined const listResult = await server.handleListFiles() const files = JSON.parse(listResult.content[0].text) for (const file of files) { if (!file.filePath.includes('raw-data')) { expect(file.source).toBeUndefined() } } }) }) // -------------------------------------------- // Delete File with Physical File Removal // -------------------------------------------- describe('Delete File with Physical File Removal', () => { it('delete_file removes physical raw-data file', async () => { const source = 'https://example.com/delete-physical-test' const content = 'Content for testing physical file deletion. ' + 'This needs to be long enough to create chunks for the test to work properly.' // Ingest the data const ingestResult = await server.handleIngestData({ content, metadata: { source, format: 'text' }, }) const parsed = JSON.parse(ingestResult.content[0].text) const filePath = parsed.filePath // Verify file exists const { stat } = await import('node:fs/promises') await expect(stat(filePath)).resolves.toBeDefined() // Delete via handleDeleteFile await server.handleDeleteFile({ filePath }) // Verify physical file is deleted await expect(stat(filePath)).rejects.toThrow('ENOENT') }) it('delete_file handles missing raw-data file gracefully', async () => { const source = 'https://example.com/delete-missing-test' const content = 'Content for testing missing file handling. ' + 'This needs to be long enough to create chunks for the test to work properly.' // Ingest the data const ingestResult = await server.handleIngestData({ content, metadata: { source, format: 'text' }, }) const parsed = JSON.parse(ingestResult.content[0].text) const filePath = parsed.filePath // Manually delete the file first const { unlink } = await import('node:fs/promises') await unlink(filePath) // Delete via handleDeleteFile should not throw await expect(server.handleDeleteFile({ filePath })).resolves.toBeDefined() }) it('delete_file accepts source parameter to delete raw-data', async () => { const source = 'https://example.com/delete-by-source-test' const content = 'Content for testing deletion by source parameter. ' + 'This needs to be long enough to create chunks for the test to work properly.' // Ingest the data await server.handleIngestData({ content, metadata: { source, format: 'text' }, }) // Verify it's in list_files const listBefore = await server.handleListFiles() const filesBefore = JSON.parse(listBefore.content[0].text) const targetBefore = filesBefore.find((f: { source?: string }) => f.source === source) expect(targetBefore).toBeDefined() // Delete by source const deleteResult = await server.handleDeleteFile({ source }) const deleteData = JSON.parse(deleteResult.content[0].text) expect(deleteData.deleted).toBe(true) // Verify it's removed from list_files const listAfter = await server.handleListFiles() const filesAfter = JSON.parse(listAfter.content[0].text) const targetAfter = filesAfter.find((f: { source?: string }) => f.source === source) expect(targetAfter).toBeUndefined() }) it('delete_file throws error when neither filePath nor source provided', async () => { await expect(server.handleDeleteFile({})).rejects.toThrow( 'Either filePath or source must be provided' ) }) }) // -------------------------------------------- // Query Integration // -------------------------------------------- describe('Query Integration', () => { it('ingested data is searchable via query_documents', async () => { const uniqueContent = 'UniqueSearchableContent12345 for integration testing' const source = 'test://query-integration-test' await server.handleIngestData({ content: uniqueContent, metadata: { source, format: 'text' }, }) const queryResult = await server.handleQueryDocuments({ query: 'UniqueSearchableContent12345', limit: 5, }) const results = JSON.parse(queryResult.content[0].text) expect(results.length).toBeGreaterThan(0) expect(results[0].text).toContain('UniqueSearchableContent12345') }) it('query results include source for raw-data files', async () => { const source = 'https://example.com/source-restoration-test' const content = 'SourceRestorationTestContent98765 unique marker for this test' await server.handleIngestData({ content, metadata: { source, format: 'text' }, }) const queryResult = await server.handleQueryDocuments({ query: 'SourceRestorationTestContent98765', limit: 10, }) const results = JSON.parse(queryResult.content[0].text) expect(results.length).toBeGreaterThan(0) // Find the result that contains our specific content const targetResult = results.find((r: { text: string; source?: string }) => r.text.includes('SourceRestorationTestContent98765') ) expect(targetResult).toBeDefined() // Source should be restored from file path expect(targetResult.source).toBe(source) }) }) })

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shinpr/mcp-local-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

ingest-data.test.ts•14.1 KiB