// HTML Workflow E2E Test
// Test Type: End-to-End Test
// Tests complete HTML ingestion workflow: ingest_data -> query_documents
import { mkdir, rm } from 'node:fs/promises'
import { afterAll, beforeAll, describe, expect, it } from 'vitest'
import { RAGServer } from '../../server/index.js'
// ============================================
// Test Configuration
// ============================================
// Directory that holds the database files created by this suite.
const testDbPath = './tmp/test-html-workflow-db'

// Options object passed to the RAGServer constructor.
const testConfig = {
  // Database location (same directory as testDbPath).
  dbPath: testDbPath,
  // Embedding model identifier.
  modelName: 'Xenova/all-MiniLM-L6-v2',
  // Directory used for the model cache.
  cacheDir: './tmp/test-model-cache',
  baseDir: '.',
  // 10 MiB expressed in bytes (10 * 2^20 = 10485760).
  maxFileSize: 10 * 2 ** 20,
}
// ============================================
// Tests
// ============================================
describe('HTML Workflow E2E', () => {
// Server instance shared by every test in this suite; assigned in beforeAll.
let server: RAGServer

// One-time setup: start from an empty database directory, make sure the model
// cache directory exists, then construct and initialize the server.
beforeAll(async () => {
  // Remove anything left behind by an earlier run that ended before afterAll
  // could clean up, so the tests never query stale indexed documents.
  // `force: true` makes this a no-op when the directory does not exist.
  await rm(testDbPath, { recursive: true, force: true })
  await mkdir(testDbPath, { recursive: true })
  // The cache directory is only created, never cleared here
  // (presumably so a downloaded model can be reused between runs).
  await mkdir(testConfig.cacheDir, { recursive: true })
  server = new RAGServer(testConfig)
  await server.initialize()
}, 120000) // 2 minutes for model download
// Teardown: delete the suite's database directory once all tests have run.
afterAll(async () => {
  // recursive: remove the directory and its contents;
  // force: do not fail if the directory is already gone.
  const removeOptions = { recursive: true, force: true }
  await rm(testDbPath, removeOptions)
})
// --------------------------------------------
// Complete HTML Workflow
// --------------------------------------------
describe('Complete HTML Ingestion and Search', () => {
it('ingests HTML page and finds content via semantic search', async () => {
  // Arrange: an article page surrounded by typical site chrome
  // (header navigation, sidebar, footer).
  const pageUrl = 'https://techblog.example.com/intro-vector-databases'
  const pageHtml = `
<!DOCTYPE html>
<html lang="en">
<head>
<title>Introduction to Vector Databases</title>
<meta name="description" content="Learn about vector databases and their applications">
</head>
<body>
<header>
<nav>
<a href="/">Home</a>
<a href="/docs">Documentation</a>
<a href="/blog">Blog</a>
</nav>
</header>
<main>
<article>
<h1>Introduction to Vector Databases</h1>
<p class="meta">Published: December 30, 2024 | Author: Tech Writer</p>
<h2>What is a Vector Database?</h2>
<p>A vector database is a specialized database designed to store and query high-dimensional vectors efficiently. These vectors are typically generated by machine learning models and represent complex data like text, images, or audio as numerical arrays.</p>
<h2>Key Features</h2>
<ul>
<li>Similarity search using distance metrics like cosine similarity or Euclidean distance</li>
<li>Approximate nearest neighbor (ANN) algorithms for fast retrieval</li>
<li>Support for hybrid search combining vector and keyword matching</li>
</ul>
<h2>Use Cases</h2>
<p>Vector databases are essential for building RAG (Retrieval Augmented Generation) systems, recommendation engines, and semantic search applications. They enable finding similar items based on meaning rather than exact keyword matches.</p>
</article>
</main>
<aside>
<h3>Related Articles</h3>
<ul>
<li><a href="/embeddings">Understanding Embeddings</a></li>
<li><a href="/rag">Building RAG Systems</a></li>
</ul>
</aside>
<footer>
<p>© 2024 Tech Blog. All rights reserved.</p>
<nav>
<a href="/privacy">Privacy</a>
<a href="/terms">Terms</a>
</nav>
</footer>
</body>
</html>
`

  // Act: ingest the page under its URL and read back the JSON summary.
  const { content: ingestContent } = await server.handleIngestData({
    content: pageHtml,
    metadata: { source: pageUrl, format: 'html' },
  })
  const ingestSummary = JSON.parse(ingestContent[0].text)
  expect(ingestSummary.chunkCount).toBeGreaterThan(0)

  // Act: run a natural-language query against the ingested content.
  const { content: queryContent } = await server.handleQueryDocuments({
    query: 'What is a vector database and how does it work?',
    limit: 5,
  })
  const hits = JSON.parse(queryContent[0].text)

  // Assert: at least one hit, and the top hit mentions the topic.
  expect(hits.length).toBeGreaterThan(0)
  const [topHit] = hits
  expect(topHit.text).toContain('vector')

  // Assert: the top hit reports the URL that was supplied at ingest time.
  expect(topHit.source).toBe(pageUrl)
})
it('does not return navigation or footer content in search results', async () => {
  // Unique tokens that appear only inside <nav> / <footer> of the page below.
  const navMarkers = ['NavigationLinkHome', 'NavigationLinkContact']
  const footerMarker = 'FooterCopyrightNotice2024'

  const pageHtml = `
<!DOCTYPE html>
<html>
<body>
<nav>
<a href="/">NavigationLinkHome</a>
<a href="/contact">NavigationLinkContact</a>
</nav>
<article>
<h1>Machine Learning Fundamentals</h1>
<p>Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience. This comprehensive guide covers the basic concepts and practical applications of machine learning algorithms.</p>
<p>Deep learning, a specialized form of machine learning, uses neural networks with multiple layers to process complex patterns in data.</p>
</article>
<footer>
<p>FooterCopyrightNotice2024</p>
</footer>
</body>
</html>
`
  await server.handleIngestData({
    content: pageHtml,
    metadata: {
      source: 'https://example.com/ml-fundamentals',
      format: 'html',
    },
  })

  // Query using the navigation tokens: no returned chunk may contain them.
  const navResponse = await server.handleQueryDocuments({
    query: navMarkers.join(' '),
    limit: 10,
  })
  const navHits = JSON.parse(navResponse.content[0].text)
  const navLeaked = navHits.some((hit: { text: string }) =>
    navMarkers.some((marker) => hit.text.includes(marker))
  )
  expect(navLeaked).toBe(false)

  // Query using the footer token: no returned chunk may contain it.
  const footerResponse = await server.handleQueryDocuments({
    query: footerMarker,
    limit: 10,
  })
  const footerHits = JSON.parse(footerResponse.content[0].text)
  const footerLeaked = footerHits.some((hit: { text: string }) =>
    hit.text.includes(footerMarker)
  )
  expect(footerLeaked).toBe(false)

  // The article body itself must still be retrievable.
  const articleResponse = await server.handleQueryDocuments({
    query: 'machine learning neural networks',
    limit: 5,
  })
  const articleHits = JSON.parse(articleResponse.content[0].text)
  expect(articleHits.length).toBeGreaterThan(0)
  const topText = articleHits[0].text.toLowerCase()
  expect(topText).toContain('machine learning')
})
})
// --------------------------------------------
// Multiple HTML Pages
// --------------------------------------------
describe('Multiple HTML Pages', () => {
it('ingests multiple HTML pages and searches across all', async () => {
  // Two unrelated pages, each stored under its own source URL.
  const typescriptSource = 'https://docs.example.com/typescript-best-practices'
  const pythonSource = 'https://docs.example.com/python-data-science'

  const typescriptHtml = `
<!DOCTYPE html>
<html>
<body>
<article>
<h1>TypeScript Best Practices</h1>
<p>TypeScript is a strongly typed programming language that builds on JavaScript. It provides optional static typing and class-based object-oriented programming.</p>
<p>Key benefits include better tooling support, catch errors at compile time, and improved code documentation through types.</p>
</article>
</body>
</html>
`
  const pythonHtml = `
<!DOCTYPE html>
<html>
<body>
<article>
<h1>Python Data Science Guide</h1>
<p>Python is widely used in data science and machine learning. Popular libraries include NumPy for numerical computing, Pandas for data manipulation, and Scikit-learn for machine learning.</p>
<p>Python's simple syntax and extensive ecosystem make it ideal for data analysis and scientific computing.</p>
</article>
</body>
</html>
`

  // Ingest sequentially: TypeScript page first, then the Python page.
  const pages = [
    { source: typescriptSource, html: typescriptHtml },
    { source: pythonSource, html: pythonHtml },
  ]
  for (const page of pages) {
    await server.handleIngestData({
      content: page.html,
      metadata: { source: page.source, format: 'html' },
    })
  }

  // Each topic-specific query must rank its own page first.
  const expectations = [
    { query: 'TypeScript static typing', topSource: typescriptSource },
    { query: 'Python NumPy Pandas data science', topSource: pythonSource },
  ]
  for (const { query, topSource } of expectations) {
    const response = await server.handleQueryDocuments({ query, limit: 5 })
    const hits = JSON.parse(response.content[0].text)
    expect(hits.length).toBeGreaterThan(0)
    expect(hits[0].source).toBe(topSource)
  }
})
})
// --------------------------------------------
// HTML Update Workflow
// --------------------------------------------
describe('HTML Update Workflow', () => {
it('updates HTML content when re-ingesting same URL', async () => {
  // Both versions of the page are ingested under this one source URL.
  const source = 'https://example.com/update-test-page'

  // Initial ingestion
  const initialHtml = `
<!DOCTYPE html>
<html>
<body>
<article>
<h1>Initial Version</h1>
<p>This is the original content with InitialUniqueMarker12345.</p>
</article>
</body>
</html>
`
  await server.handleIngestData({
    content: initialHtml,
    metadata: { source, format: 'html' },
  })

  // Verify initial content is searchable
  const initialQuery = await server.handleQueryDocuments({
    query: 'InitialUniqueMarker12345',
    limit: 5,
  })
  const initialResults = JSON.parse(initialQuery.content[0].text)
  expect(initialResults.length).toBeGreaterThan(0)
  // The server instance (and its database) is shared with the other tests in
  // this suite, so a non-empty result list alone does not show that THIS page
  // was indexed. Require the marker itself; otherwise the final
  // "old content is gone" assertion could pass without the page ever having
  // been searchable.
  const hasInitialContent = initialResults.some((r: { text: string }) =>
    r.text.includes('InitialUniqueMarker12345')
  )
  expect(hasInitialContent).toBe(true)

  // Update with new content
  const updatedHtml = `
<!DOCTYPE html>
<html>
<body>
<article>
<h1>Updated Version</h1>
<p>This is the updated content with UpdatedUniqueMarker67890.</p>
</article>
</body>
</html>
`
  await server.handleIngestData({
    content: updatedHtml,
    metadata: { source, format: 'html' },
  })

  // Verify updated content is searchable
  const updatedQuery = await server.handleQueryDocuments({
    query: 'UpdatedUniqueMarker67890',
    limit: 5,
  })
  const updatedResults = JSON.parse(updatedQuery.content[0].text)
  expect(updatedResults.length).toBeGreaterThan(0)
  expect(updatedResults[0].text).toContain('UpdatedUniqueMarker67890')

  // Verify old content is no longer searchable (replaced)
  const oldQuery = await server.handleQueryDocuments({
    query: 'InitialUniqueMarker12345',
    limit: 5,
  })
  const oldResults = JSON.parse(oldQuery.content[0].text)
  const hasOldContent = oldResults.some((r: { text: string }) =>
    r.text.includes('InitialUniqueMarker12345')
  )
  expect(hasOldContent).toBe(false)
})
})
// --------------------------------------------
// Japanese Content
// --------------------------------------------
describe('Japanese HTML Content', () => {
it('handles Japanese HTML content correctly', async () => {
  // A page whose article text is entirely Japanese.
  const pageUrl = 'https://example.jp/vector-db-intro'
  const pageHtml = `
<!DOCTYPE html>
<html lang="ja">
<body>
<article>
<h1>ベクトルデータベース入門</h1>
<p>ベクトルデータベースは、高次元ベクトルを効率的に保存・検索するために設計された特殊なデータベースです。機械学習モデルによって生成されたベクトルを使用して、テキストや画像などの複雑なデータを数値配列として表現します。</p>
<p>類似度検索やセマンティック検索において重要な役割を果たしています。</p>
</article>
</body>
</html>
`
  await server.handleIngestData({
    content: pageHtml,
    metadata: { source: pageUrl, format: 'html' },
  })

  // Query in Japanese; the top hit must contain the Japanese word for "vector".
  const { content: queryContent } = await server.handleQueryDocuments({
    query: 'ベクトルデータベース 類似度検索',
    limit: 5,
  })
  const hits = JSON.parse(queryContent[0].text)
  expect(hits.length).toBeGreaterThan(0)
  const [topHit] = hits
  expect(topHit.text).toContain('ベクトル')
})
})
})