Local RAG

Overview Schema Related Servers Score Discussions

html-workflow.e2e.test.ts•13.3 KiB

/**
 * HTML Workflow E2E Test
 *
 * Test type: end-to-end.
 * Every scenario feeds HTML through `handleIngestData` and reads it back through
 * `handleQueryDocuments` (ingest_data -> query_documents) on one `RAGServer`
 * instance that is shared by the whole suite.
 */

import { mkdir, rm } from 'node:fs/promises'
import { afterAll, beforeAll, describe, expect, it } from 'vitest'
import { RAGServer } from '../../server/index.js'

// --------------------------------------------
// Test configuration
// --------------------------------------------

const testDbPath = './tmp/test-html-workflow-db'

const testConfig = {
  dbPath: testDbPath,
  modelName: 'Xenova/all-MiniLM-L6-v2',
  cacheDir: './tmp/test-model-cache',
  baseDir: '.',
  maxFileSize: 10 * 1024 * 1024,
}

// --------------------------------------------
// Suite
// --------------------------------------------

describe('HTML Workflow E2E', () => {
  let server: RAGServer

  /** Sends `content` to the server as an HTML document attributed to `source`. */
  const ingestHtml = (content: string, source: string) =>
    server.handleIngestData({
      content,
      metadata: { source, format: 'html' },
    })

  /** Runs a query and returns the JSON-decoded `text` of the first content entry. */
  const search = async (query: string, limit: number) => {
    const response = await server.handleQueryDocuments({ query, limit })
    return JSON.parse(response.content[0].text)
  }

  /** True when at least one hit's text includes at least one of the given markers. */
  const mentionsAny = (hits: { text: string }[], ...markers: string[]) =>
    hits.some((hit) => markers.some((marker) => hit.text.includes(marker)))

  beforeAll(async () => {
    for (const dir of [testDbPath, testConfig.cacheDir]) {
      await mkdir(dir, { recursive: true })
    }

    server = new RAGServer(testConfig)
    await server.initialize()
  }, 120000) // generous hook timeout: 2 minutes, to allow for the model download

  afterAll(async () => {
    // Only the database directory is removed; the model cache directory is left in place.
    await rm(testDbPath, { recursive: true, force: true })
  })

  // --------------------------------------------
  // Single page: ingest, then search
  // --------------------------------------------
  describe('Complete HTML Ingestion and Search', () => {
    it('ingests HTML page and finds content via semantic search', async () => {
      const pageUrl = 'https://techblog.example.com/intro-vector-databases'

      // Arrange: a realistic page with header, nav, aside and footer around the article.
      // The fixture's whitespace is part of the ingested payload, so it is not reflowed.
      const page = ` <!DOCTYPE html> <html lang="en"> <head> <title>Introduction to Vector Databases</title> <meta name="description" content="Learn about vector databases and their applications"> </head> <body> <header> <nav> <a href="/">Home</a> <a href="/docs">Documentation</a> <a href="/blog">Blog</a> </nav> </header> <main> <article> <h1>Introduction to Vector Databases</h1> <p class="meta">Published: December 30, 2024 | Author: Tech Writer</p> <h2>What is a Vector Database?</h2> <p>A vector database is a specialized database designed to store and query high-dimensional vectors efficiently. 
These vectors are typically generated by machine learning models and represent complex data like text, images, or audio as numerical arrays.</p> <h2>Key Features</h2> <ul> <li>Similarity search using distance metrics like cosine similarity or Euclidean distance</li> <li>Approximate nearest neighbor (ANN) algorithms for fast retrieval</li> <li>Support for hybrid search combining vector and keyword matching</li> </ul> <h2>Use Cases</h2> <p>Vector databases are essential for building RAG (Retrieval Augmented Generation) systems, recommendation engines, and semantic search applications. They enable finding similar items based on meaning rather than exact keyword matches.</p> </article> </main> <aside> <h3>Related Articles</h3> <ul> <li><a href="/embeddings">Understanding Embeddings</a></li> <li><a href="/rag">Building RAG Systems</a></li> </ul> </aside> <footer> <p>© 2024 Tech Blog. All rights reserved.</p> <nav> <a href="/privacy">Privacy</a> <a href="/terms">Terms</a> </nav> </footer> </body> </html> `

      // Act: ingest and confirm at least one chunk was reported
      const ingestResponse = await ingestHtml(page, pageUrl)
      const ingestSummary = JSON.parse(ingestResponse.content[0].text)
      expect(ingestSummary.chunkCount).toBeGreaterThan(0)

      // Act: query for the article's topic
      const hits = await search('What is a vector database and how does it work?', 5)

      // Assert: relevant text comes back and the original source URL is reported
      expect(hits.length).toBeGreaterThan(0)
      expect(hits[0].text).toContain('vector')
      expect(hits[0].source).toBe(pageUrl)
    })

    it('does not return navigation or footer content in search results', async () => {
      // Nav and footer carry unique marker tokens so their presence in results is detectable.
      // The fixture's whitespace is part of the ingested payload, so it is not reflowed.
      const page = ` <!DOCTYPE html> <html> <body> <nav> <a href="/">NavigationLinkHome</a> <a 
href="/contact">NavigationLinkContact</a> </nav> <article> <h1>Machine Learning Fundamentals</h1> <p>Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience. This comprehensive guide covers the basic concepts and practical applications of machine learning algorithms.</p> <p>Deep learning, a specialized form of machine learning, uses neural networks with multiple layers to process complex patterns in data.</p> </article> <footer> <p>FooterCopyrightNotice2024</p> </footer> </body> </html> `

      await ingestHtml(page, 'https://example.com/ml-fundamentals')

      // Navigation markers must not appear in any returned chunk
      const navHits = await search('NavigationLinkHome NavigationLinkContact', 10)
      expect(mentionsAny(navHits, 'NavigationLinkHome', 'NavigationLinkContact')).toBe(false)

      // Footer marker must not appear in any returned chunk
      const footerHits = await search('FooterCopyrightNotice2024', 10)
      expect(mentionsAny(footerHits, 'FooterCopyrightNotice2024')).toBe(false)

      // The article body itself must still be findable
      const articleHits = await search('machine learning neural networks', 5)
      expect(articleHits.length).toBeGreaterThan(0)
      expect(articleHits[0].text.toLowerCase()).toContain('machine learning')
    })
  })

  // --------------------------------------------
  // Several pages in the same database
  // --------------------------------------------
  describe('Multiple HTML Pages', () => {
    it('ingests multiple HTML pages and searches across all', async () => {
      const typescriptUrl = 'https://docs.example.com/typescript-best-practices'
      const pythonUrl = 'https://docs.example.com/python-data-science'

      // First page: TypeScript
      const typescriptPage = ` <!DOCTYPE html> <html> <body> <article> <h1>TypeScript Best Practices</h1> <p>TypeScript is a strongly typed programming language that builds on JavaScript. It provides optional static typing and class-based object-oriented programming.</p> <p>Key benefits include better tooling support, catch errors at compile time, and improved code documentation through types.</p> </article> </body> </html> `
      await ingestHtml(typescriptPage, typescriptUrl)

      // Second page: Python
      const pythonPage = ` <!DOCTYPE html> <html> <body> <article> <h1>Python Data Science Guide</h1> <p>Python is widely used in data science and machine learning. Popular libraries include NumPy for numerical computing, Pandas for data manipulation, and Scikit-learn for machine learning.</p> <p>Python's simple syntax and extensive ecosystem make it ideal for data analysis and scientific computing.</p> </article> </body> </html> `
      await ingestHtml(pythonPage, pythonUrl)

      // A TypeScript-flavoured query should rank the TypeScript page first
      const typescriptHits = await search('TypeScript static typing', 5)
      expect(typescriptHits.length).toBeGreaterThan(0)
      expect(typescriptHits[0].source).toBe(typescriptUrl)

      // A Python-flavoured query should rank the Python page first
      const pythonHits = await search('Python NumPy Pandas data science', 5)
      expect(pythonHits.length).toBeGreaterThan(0)
      expect(pythonHits[0].source).toBe(pythonUrl)
    })
  })

  // --------------------------------------------
  // Re-ingesting the same source
  // --------------------------------------------
  describe('HTML Update Workflow', () => {
    it('updates HTML content when re-ingesting same URL', async () => {
      const source = 'https://example.com/update-test-page'
      const initialMarker = 'InitialUniqueMarker12345'
      const updatedMarker = 'UpdatedUniqueMarker67890'

      // First version of the page
      const initialPage = ` <!DOCTYPE html> <html> <body> <article> <h1>Initial Version</h1> <p>This is the original content with InitialUniqueMarker12345.</p> </article> </body> </html> `
      await ingestHtml(initialPage, source)

      // The first version is searchable
      const hitsBeforeUpdate = await search(initialMarker, 5)
      expect(hitsBeforeUpdate.length).toBeGreaterThan(0)

      // Second version, ingested under the very same source
      const updatedPage = ` <!DOCTYPE html> <html> <body> <article> <h1>Updated Version</h1> <p>This is the updated content with UpdatedUniqueMarker67890.</p> </article> </body> </html> `
      await ingestHtml(updatedPage, source)

      // The new version is searchable
      const hitsAfterUpdate = await search(updatedMarker, 5)
      expect(hitsAfterUpdate.length).toBeGreaterThan(0)
      expect(hitsAfterUpdate[0].text).toContain(updatedMarker)

      // The old version must have been replaced, not kept alongside the new one
      const staleHits = await search(initialMarker, 5)
      expect(mentionsAny(staleHits, initialMarker)).toBe(false)
    })
  })

  // --------------------------------------------
  // Non-ASCII (Japanese) content
  // --------------------------------------------
  describe('Japanese HTML Content', () => {
    it('handles Japanese HTML content correctly', async () => {
      const japanesePage = ` <!DOCTYPE html> <html lang="ja"> <body> <article> <h1>ベクトルデータベース入門</h1> <p>ベクトルデータベースは、高次元ベクトルを効率的に保存・検索するために設計された特殊なデータベースです。機械学習モデルによって生成されたベクトルを使用して、テキストや画像などの複雑なデータを数値配列として表現します。</p> <p>類似度検索やセマンティック検索において重要な役割を果たしています。</p> </article> </body> </html> `
      await ingestHtml(japanesePage, 'https://example.jp/vector-db-intro')

      const hits = await search('ベクトルデータベース類似度検索', 5)
      expect(hits.length).toBeGreaterThan(0)
      expect(hits[0].text).toContain('ベクトル')
    })
  })
})

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shinpr/mcp-local-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

html-workflow.e2e.test.ts•13.3 KiB