DocumentStore.test.ts
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { ScrapeResult } from "../scraper/types";
import type { Chunk } from "../splitter/types";
import { DocumentStore } from "./DocumentStore";
import { EmbeddingConfig } from "./embeddings/EmbeddingConfig";
import { VersionStatus } from "./types";

// Mock only the embedding service to generate deterministic embeddings for testing
// This allows us to test ranking logic while using real SQLite database
vi.mock("./embeddings/EmbeddingFactory", async (importOriginal) => {
  const actual = await importOriginal<typeof import("./embeddings/EmbeddingFactory")>();
  return {
    ...actual,
    createEmbeddingModel: () => ({
      embedQuery: vi.fn(async (text: string) => {
        // Generate deterministic embeddings based on text content for consistent testing
        const words = text.toLowerCase().split(/\s+/);
        const embedding = new Array(1536).fill(0);

        // Create meaningful semantic relationships for testing
        words.forEach((word, wordIndex) => {
          const wordHash = Array.from(word).reduce(
            (acc, char) => acc + char.charCodeAt(0),
            0,
          );
          const baseIndex = (wordHash % 100) * 15;

          // Distribute across embedding dimensions
          for (let i = 0; i < 15; i++) {
            const index = (baseIndex + i) % 1536;
            embedding[index] += 1.0 / (wordIndex + 1); // Earlier words get higher weight
          }
        });

        // Normalize the embedding
        const magnitude = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
        return magnitude > 0 ? embedding.map((val) => val / magnitude) : embedding;
      }),
      embedDocuments: vi.fn(async (texts: string[]) => {
        // Generate embeddings for each text using the same logic as embedQuery
        return texts.map((text) => {
          const words = text.toLowerCase().split(/\s+/);
          const embedding = new Array(1536).fill(0);

          words.forEach((word, wordIndex) => {
            const wordHash = Array.from(word).reduce(
              (acc, char) => acc + char.charCodeAt(0),
              0,
            );
            const baseIndex = (wordHash % 100) * 15;

            for (let i = 0; i < 15; i++) {
              const index = (baseIndex + i) % 1536;
              embedding[index] += 1.0 / (wordIndex + 1);
            }
          });

          const magnitude = Math.sqrt(
            embedding.reduce((sum, val) => sum + val * val, 0),
          );
          return magnitude > 0 ? embedding.map((val) => val / magnitude) : embedding;
        });
      }),
    }),
  };
});

/**
 * Helper function to create minimal ScrapeResult for testing.
 * Converts simplified test data to the ScrapeResult format expected by addDocuments.
 */
function createScrapeResult(
  title: string,
  url: string,
  content: string,
  path: string[] = [],
  options?: {
    etag?: string | null;
    lastModified?: string | null;
  },
): ScrapeResult {
  const chunks: Chunk[] = [
    {
      types: ["text"],
      content,
      section: { level: 0, path },
    },
  ];
  return {
    url,
    title,
    contentType: "text/html",
    textContent: content,
    links: [],
    errors: [],
    chunks,
    etag: options?.etag,
    lastModified: options?.lastModified,
  } satisfies ScrapeResult;
}

/**
 * Tests for DocumentStore with embeddings enabled
 * Uses explicit embedding configuration and tests hybrid search functionality
 */
describe("DocumentStore - With Embeddings", () => {
  let store: DocumentStore;

  beforeEach(async () => {
    // Create explicit embedding configuration for tests
    const embeddingConfig = EmbeddingConfig.parseEmbeddingConfig(
      "openai:text-embedding-3-small",
    );
    // Create a fresh in-memory database for each test with explicit config
    store = new DocumentStore(":memory:", embeddingConfig);
    await store.initialize();
  });

  afterEach(async () => {
    if (store) {
      await store.shutdown();
    }
  });

  describe("Document Storage and Retrieval", () => {
    it("should store and retrieve documents with proper metadata", async () => {
      // Add two pages separately
      await store.addDocuments(
        "testlib",
        "1.0.0",
        1,
        createScrapeResult(
          "JS Tutorial",
          "https://example.com/js-tutorial",
          "JavaScript programming tutorial with examples",
          ["programming", "javascript"],
        ),
      );
      await store.addDocuments(
        "testlib",
        "1.0.0",
        1,
        createScrapeResult(
          "Python DS",
          "https://example.com/python-ds",
          "Python data science guide with pandas",
          ["programming", "python"],
        ),
      );

      // Verify documents were stored
      expect(await store.checkDocumentExists("testlib", "1.0.0")).toBe(true);

      // Verify library versions are tracked correctly
      const versions = await store.queryUniqueVersions("testlib");
      expect(versions).toContain("1.0.0");

      // Verify library version details
      const libraryVersions = await store.queryLibraryVersions();
      expect(libraryVersions.has("testlib")).toBe(true);
      const testlibVersions = libraryVersions.get("testlib")!;
      expect(testlibVersions).toHaveLength(1);
      expect(testlibVersions[0].version).toBe("1.0.0");
      expect(testlibVersions[0].documentCount).toBe(2);
      expect(testlibVersions[0].uniqueUrlCount).toBe(2);
    });

    it("treats library names case-insensitively and reuses same library id", async () => {
      const a = await store.resolveVersionId("React", "");
      const b = await store.resolveVersionId("react", "");
      const c = await store.resolveVersionId("REACT", "");
      expect(a).toBe(b);
      expect(b).toBe(c);
    });

    it("should handle document deletion correctly", async () => {
      await store.addDocuments(
        "templib",
        "1.0.0",
        1,
        createScrapeResult(
          "Temp Doc",
          "https://example.com/temp",
          "Temporary document for deletion test",
          ["temp"],
        ),
      );
      expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(true);

      const deletedCount = await store.deletePages("templib", "1.0.0");
      expect(deletedCount).toBe(1);
      expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(false);
    });

    it("should completely remove a version including pages and documents", async () => {
      // Add two pages
      await store.addDocuments(
        "removelib",
        "1.0.0",
        1,
        createScrapeResult(
          "Doc 1",
          "https://example.com/doc1",
          "First document for removal test",
          ["docs"],
        ),
      );
      await store.addDocuments(
        "removelib",
        "1.0.0",
        1,
        createScrapeResult(
          "Doc 2",
          "https://example.com/doc2",
          "Second document for removal test",
          ["docs"],
        ),
      );
      expect(await store.checkDocumentExists("removelib", "1.0.0")).toBe(true);

      // Remove the version
      const result = await store.removeVersion("removelib", "1.0.0", true);

      // Verify the results
      expect(result.documentsDeleted).toBe(2);
      expect(result.versionDeleted).toBe(true);
      expect(result.libraryDeleted).toBe(true);

      // Verify documents no longer exist
      expect(await store.checkDocumentExists("removelib", "1.0.0")).toBe(false);
    });

    it("should remove version but keep library when other versions exist", async () => {
      // Add two versions
      await store.addDocuments(
        "multilib",
        "1.0.0",
        1,
        createScrapeResult("V1 Doc", "https://example.com/v1", "Version 1 document", [
          "v1",
        ]),
      );
      await store.addDocuments(
        "multilib",
        "2.0.0",
        1,
        createScrapeResult("V2 Doc", "https://example.com/v2", "Version 2 document", [
          "v2",
        ]),
      );

      // Remove only version 1.0.0
      const result = await store.removeVersion("multilib", "1.0.0", true);

      // Verify version 1 was deleted but library remains
      expect(result.documentsDeleted).toBe(1);
      expect(result.versionDeleted).toBe(true);
      expect(result.libraryDeleted).toBe(false);

      // Verify version 1 no longer exists but version 2 does
      expect(await store.checkDocumentExists("multilib", "1.0.0")).toBe(false);
      expect(await store.checkDocumentExists("multilib", "2.0.0")).toBe(true);
    });

    it("should handle multiple versions of the same library", async () => {
      await store.addDocuments(
        "versionlib",
        "1.0.0",
        1,
        createScrapeResult(
          "V1 Features",
          "https://example.com/v1",
          "Version 1.0 feature documentation",
          ["features"],
        ),
      );
      await store.addDocuments(
        "versionlib",
        "2.0.0",
        1,
        createScrapeResult(
          "V2 Features",
          "https://example.com/v2",
          "Version 2.0 feature documentation with new capabilities",
          ["features"],
        ),
      );

      expect(await store.checkDocumentExists("versionlib", "1.0.0")).toBe(true);
      expect(await store.checkDocumentExists("versionlib", "2.0.0")).toBe(true);

      const versions = await store.queryUniqueVersions("versionlib");
      expect(versions).toContain("1.0.0");
      expect(versions).toContain("2.0.0");
    });

    it("should store and retrieve etag and lastModified metadata", async () => {
      const testEtag = '"abc123-def456"';
      const testLastModified = "2023-12-01T10:30:00Z";

      await store.addDocuments(
        "etagtest",
        "1.0.0",
        1,
        createScrapeResult(
          "ETag Test Doc",
          "https://example.com/etag-test",
          "Test document with etag and lastModified",
          ["test"],
          { etag: testEtag, lastModified: testLastModified },
        ),
      );

      // Query the database directly to verify the etag and last_modified are stored
      // @ts-expect-error Accessing private property for testing
      const db = store.db;
      const pageResult = db
        .prepare(`
          SELECT p.etag, p.last_modified
          FROM pages p
          JOIN versions v ON p.version_id = v.id
          JOIN libraries l ON v.library_id = l.id
          WHERE l.name = ? AND COALESCE(v.name, '') = ? AND p.url = ?
        `)
        .get("etagtest", "1.0.0", "https://example.com/etag-test") as
        | {
            etag: string | null;
            last_modified: string | null;
          }
        | undefined;

      expect(pageResult).toBeDefined();
      expect(pageResult?.etag).toBe(testEtag);
      expect(pageResult?.last_modified).toBe(testLastModified);

      // Also verify we can retrieve the document and it contains the metadata
      const results = await store.findByContent("etagtest", "1.0.0", "etag", 10);
      expect(results.length).toBeGreaterThan(0);
      const doc = results[0];
      expect(doc.url).toBe("https://example.com/etag-test");
    });
  });

  describe("Hybrid Search with Embeddings", () => {
    beforeEach(async () => {
      // Set up test documents with known semantic relationships for ranking tests
      await store.addDocuments(
        "searchtest",
        "1.0.0",
        1,
        createScrapeResult(
          "JavaScript Programming Guide",
          "https://example.com/js-guide",
          "JavaScript programming tutorial with code examples and functions",
          ["programming", "javascript"],
        ),
      );
      await store.addDocuments(
        "searchtest",
        "1.0.0",
        1,
        createScrapeResult(
          "JavaScript Frameworks",
          "https://example.com/js-frameworks",
          "Advanced JavaScript frameworks like React and Vue for building applications",
          ["programming", "javascript", "frameworks"],
        ),
      );
      await store.addDocuments(
        "searchtest",
        "1.0.0",
        1,
        createScrapeResult(
          "Python Programming",
          "https://example.com/python-guide",
          "Python programming language tutorial for data science and machine learning",
          ["programming", "python"],
        ),
      );
    });

    it("should perform hybrid search combining vector and FTS", async () => {
      const results = await store.findByContent(
        "searchtest",
        "1.0.0",
        "JavaScript programming",
        10,
      );

      expect(results.length).toBeGreaterThan(0);

      // JavaScript documents should rank higher than non-JavaScript documents
      const topResult = results[0];
      expect(topResult.content.toLowerCase()).toContain("javascript");

      // Results should have both vector and FTS ranking metadata
      const hybridResults = results.filter(
        (r) => r.vec_rank !== undefined && r.fts_rank !== undefined,
      );

      // At least some results should be hybrid matches
      if (hybridResults.length > 0) {
        for (const result of hybridResults) {
          expect(result.vec_rank).toBeGreaterThan(0);
          expect(result.fts_rank).toBeGreaterThan(0);
          expect(result.score).toBeGreaterThan(0);
        }
      }

      // All results should have valid scores
      for (const result of results) {
        expect(result.score).toBeGreaterThan(0);
        expect(typeof result.score).toBe("number");
        // Results should have either vec_rank, fts_rank, or both
        expect(result.vec_rank !== undefined || result.fts_rank !== undefined).toBe(
          true,
        );
      }
    });

    it("should demonstrate semantic similarity through vector search", async () => {
      const results = await store.findByContent(
        "searchtest",
        "1.0.0",
        "programming tutorial", // Should match both exact terms and semantically similar content
        10,
      );

      expect(results.length).toBeGreaterThan(0);

      // Should find programming documents
      const programmingResults = results.filter((r) =>
        r.content.toLowerCase().includes("programming"),
      );
      expect(programmingResults.length).toBeGreaterThan(0);

      // At least some results should have vector ranks (semantic/embedding matching)
      // If no vector results, it might be because embeddings were disabled in this test run
      const vectorResults = results.filter((r) => r.vec_rank !== undefined);
      const ftsResults = results.filter((r) => r.fts_rank !== undefined);

      // Either we have vector results (hybrid search) or FTS results (fallback)
      expect(vectorResults.length > 0 || ftsResults.length > 0).toBe(true);

      // All results should have valid scores
      for (const result of results) {
        expect(result.score).toBeGreaterThan(0);
      }
    });
  });

  describe("Embedding Batch Processing", () => {
    let mockEmbedDocuments: ReturnType<typeof vi.fn>;

    beforeEach(() => {
      // Get reference to the mocked embedDocuments function if embeddings are enabled
      // @ts-expect-error Accessing private property for testing
      if (store.embeddings?.embedDocuments) {
        // @ts-expect-error Accessing private property for testing
        mockEmbedDocuments = vi.mocked(store.embeddings.embedDocuments);
        mockEmbedDocuments.mockClear();
      }
    });

    it("should successfully embed and store large batches of documents", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      // Add multiple large documents to verify batching works correctly
      const docCount = 5;
      const contentSize = 15000; // 15KB each - ensures batching behavior
      for (let i = 0; i < docCount; i++) {
        await store.addDocuments(
          "batchtest",
          "1.0.0",
          1,
          createScrapeResult(
            `Batch Doc ${i + 1}`,
            `https://example.com/batch-doc${i + 1}`,
            "x".repeat(contentSize),
            ["section"],
          ),
        );
      }

      // Verify all documents were successfully embedded and stored
      expect(await store.checkDocumentExists("batchtest", "1.0.0")).toBe(true);

      // Verify embedDocuments was called (batching occurred)
      expect(mockEmbedDocuments).toHaveBeenCalled();

      // Verify all documents are searchable (embeddings were applied)
      const searchResults = await store.findByContent("batchtest", "1.0.0", "Batch", 10);
      expect(searchResults.length).toBe(docCount);
    });

    it("should include proper document headers in embedding text", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      await store.addDocuments(
        "testlib",
        "1.0.0",
        1,
        createScrapeResult("Test Title", "https://example.com/test", "Test content", [
          "path",
          "to",
          "doc",
        ]),
      );

      // Embedding text should include structured metadata
      expect(mockEmbedDocuments).toHaveBeenCalledTimes(1);
      const embeddedText = mockEmbedDocuments.mock.calls[0][0][0];
      expect(embeddedText).toContain("<title>Test Title</title>");
      expect(embeddedText).toContain("<url>https://example.com/test</url>");
      expect(embeddedText).toContain("<path>path / to / doc</path>");
      expect(embeddedText).toContain("Test content");
    });
  });

  describe("Status Tracking and Metadata", () => {
    it("should update version status correctly", async () => {
      await store.addDocuments(
        "statuslib",
        "1.0.0",
        1,
        createScrapeResult(
          "Status Test",
          "https://example.com/status-test",
          "Status tracking test content",
          ["test"],
        ),
      );

      const versionId = await store.resolveVersionId("statuslib", "1.0.0");
      await store.updateVersionStatus(versionId, VersionStatus.QUEUED);

      const queuedVersions = await store.getVersionsByStatus([VersionStatus.QUEUED]);
      expect(queuedVersions).toHaveLength(1);
      expect(queuedVersions[0].library_name).toBe("statuslib");
      expect(queuedVersions[0].name).toBe("1.0.0");
      expect(queuedVersions[0].status).toBe(VersionStatus.QUEUED);
    });

    it("should store and retrieve scraper options", async () => {
      const versionId = await store.resolveVersionId("optionslib", "1.0.0");
      const scraperOptions = {
        url: "https://example.com/docs",
        library: "optionslib",
        version: "1.0.0",
        maxDepth: 3,
        maxPages: 100,
        scope: "subpages" as const,
        followRedirects: true,
      };

      await store.storeScraperOptions(versionId, scraperOptions);

      const retrieved = await store.getScraperOptions(versionId);
      expect(retrieved).not.toBeNull();
      expect(retrieved?.options.maxDepth).toBe(3);
      expect(retrieved?.options.maxPages).toBe(100);
      expect(retrieved?.options.scope).toBe("subpages");
    });
  });

  describe("Embedding Retry Logic", () => {
    let mockEmbedDocuments: ReturnType<typeof vi.fn>;
    let callCount: number;

    beforeEach(async () => {
      callCount = 0;
      // Get reference to the mocked embedDocuments function
      // @ts-expect-error Accessing private property for testing
      if (store.embeddings?.embedDocuments) {
        // @ts-expect-error Accessing private property for testing
        mockEmbedDocuments = vi.mocked(store.embeddings.embedDocuments);
        mockEmbedDocuments.mockClear();
      }
    });

    it("should successfully handle normal embedding without errors", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      await store.addDocuments(
        "normaltest",
        "1.0.0",
        1,
        createScrapeResult(
          "Normal Doc",
          "https://example.com/normal",
          "This is a normal sized document that should embed without issues",
          ["test"],
        ),
      );

      expect(mockEmbedDocuments).toHaveBeenCalled();
      expect(await store.checkDocumentExists("normaltest", "1.0.0")).toBe(true);
    });

    it("should retry and split batch when size error occurs", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      // Mock embedDocuments to fail first time with size error, then succeed on splits
      mockEmbedDocuments.mockImplementation(async (texts: string[]) => {
        callCount++;
        // First call with multiple texts: simulate size error
        if (callCount === 1 && texts.length > 1) {
          throw new Error("maximum context length exceeded");
        }
        // Subsequent calls (after split): succeed with dummy embeddings
        return texts.map(() => new Array(1536).fill(0.1));
      });

      // Create a scrape result with multiple chunks to trigger batching
      const result = createScrapeResult(
        "Batch Doc",
        "https://example.com/batch",
        "Content chunk 1",
        ["section1"],
      );
      result.chunks = [
        {
          types: ["text"],
          content: "Content chunk 1",
          section: { level: 0, path: ["section1"] },
        },
        {
          types: ["text"],
          content: "Content chunk 2",
          section: { level: 0, path: ["section2"] },
        },
      ];

      await store.addDocuments("retrytest", "1.0.0", 1, result);

      // Should have been called multiple times (initial failure + successful retries)
      expect(callCount).toBeGreaterThan(1);
      expect(await store.checkDocumentExists("retrytest", "1.0.0")).toBe(true);
    });

    it("should truncate single oversized text when size error occurs", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      // Mock embedDocuments to fail first time with size error for single large text
      mockEmbedDocuments.mockImplementation(async (texts: string[]) => {
        callCount++;
        // First call with full text: simulate size error
        if (callCount === 1) {
          throw new Error("This model's maximum context length is 8191 tokens");
        }
        // Second call (after truncation): succeed
        return texts.map(() => new Array(1536).fill(0.1));
      });

      // Create a document with very large content
      const largeContent = "x".repeat(50000); // 50KB
      await store.addDocuments(
        "truncatetest",
        "1.0.0",
        1,
        createScrapeResult("Large Doc", "https://example.com/large", largeContent, [
          "section",
        ]),
      );

      // Should have been called twice (initial failure + successful retry with truncated text)
      expect(callCount).toBe(2);
      expect(await store.checkDocumentExists("truncatetest", "1.0.0")).toBe(true);
    });

    it("should detect various size error messages", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      const sizeErrorMessages = [
        "maximum context length exceeded",
        "input is too long",
        "token limit reached",
        "input is too large",
        "text exceeds the limit",
        "max token count exceeded",
      ];

      for (const errorMsg of sizeErrorMessages) {
        callCount = 0;
        mockEmbedDocuments.mockClear();

        // Mock to fail with specific error message, then succeed
        mockEmbedDocuments.mockImplementation(async (texts: string[]) => {
          callCount++;
          if (callCount === 1) {
            throw new Error(errorMsg);
          }
          return texts.map(() => new Array(1536).fill(0.1));
        });

        const testLib = `errortest-${sizeErrorMessages.indexOf(errorMsg)}`;
        await store.addDocuments(
          testLib,
          "1.0.0",
          1,
          createScrapeResult(
            "Error Test",
            `https://example.com/${testLib}`,
            "Test content",
            ["test"],
          ),
        );

        // Should have retried and succeeded
        expect(callCount).toBeGreaterThan(1);
        expect(await store.checkDocumentExists(testLib, "1.0.0")).toBe(true);
      }
    });

    it("should re-throw non-size errors without retry", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      // Mock embedDocuments to fail with non-size error
      mockEmbedDocuments.mockRejectedValue(
        new Error("Network error: connection refused"),
      );

      await expect(
        store.addDocuments(
          "networkerror",
          "1.0.0",
          1,
          createScrapeResult(
            "Network Error Test",
            "https://example.com/network-error",
            "Test content",
            ["test"],
          ),
        ),
      ).rejects.toThrow("Network error");

      // Should have been called only once (no retry for non-size errors)
      expect(mockEmbedDocuments).toHaveBeenCalledTimes(1);
    });

    it("should handle nested retry for multiple batch splits", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      // Mock to fail multiple times, requiring nested splits
      mockEmbedDocuments.mockImplementation(async (texts: string[]) => {
        callCount++;
        // Fail on first two calls (requiring splits), succeed on smaller batches
        if (callCount <= 2 && texts.length > 1) {
          throw new Error("maximum context length exceeded");
        }
        return texts.map(() => new Array(1536).fill(0.1));
      });

      // Create multiple chunks to trigger multiple splits
      const result = createScrapeResult(
        "Multi Split",
        "https://example.com/multi",
        "Chunk 1",
        ["s1"],
      );
      result.chunks = [
        { types: ["text"], content: "Chunk 1", section: { level: 0, path: ["s1"] } },
        { types: ["text"], content: "Chunk 2", section: { level: 0, path: ["s2"] } },
        { types: ["text"], content: "Chunk 3", section: { level: 0, path: ["s3"] } },
        { types: ["text"], content: "Chunk 4", section: { level: 0, path: ["s4"] } },
      ];

      await store.addDocuments("multisplit", "1.0.0", 1, result);

      // Should have been called multiple times due to splits
      expect(callCount).toBeGreaterThan(2);
      expect(await store.checkDocumentExists("multisplit", "1.0.0")).toBe(true);
    });

    it("should fail after retry if truncated text still too large", async () => {
      // Skip if embeddings are disabled
      // @ts-expect-error Accessing private property for testing
      if (!store.embeddings) {
        return;
      }

      // Mock embedDocuments to always fail with size error (even after truncation)
      mockEmbedDocuments.mockRejectedValue(
        new Error("maximum context length exceeded - even after truncation"),
      );

      await expect(
        store.addDocuments(
          "alwaysfail",
          "1.0.0",
          1,
          createScrapeResult(
            "Always Fail",
            "https://example.com/always-fail",
            "x".repeat(100000), // Very large content
            ["test"],
          ),
        ),
      ).rejects.toThrow("maximum context length exceeded");

      // Should have attempted multiple times (original + retry after truncation)
      expect(mockEmbedDocuments).toHaveBeenCalled();
    });
  });
});

/**
 * Tests for DocumentStore without embeddings (FTS-only mode)
 * Tests the fallback behavior when no embedding configuration is provided
 */
describe("DocumentStore - Without Embeddings (FTS-only)", () => {
  let store: DocumentStore;
  let originalEnv: NodeJS.ProcessEnv;

  beforeEach(() => {
    // Save and clear environment variables to disable embeddings
    originalEnv = { ...process.env };
    delete process.env.OPENAI_API_KEY;
    delete process.env.GOOGLE_API_KEY;
    delete process.env.GOOGLE_APPLICATION_CREDENTIALS;
    delete process.env.AWS_ACCESS_KEY_ID;
    delete process.env.AWS_SECRET_ACCESS_KEY;
    delete process.env.AZURE_OPENAI_API_KEY;
  });

  afterEach(async () => {
    // Restore original environment
    process.env = originalEnv;
    if (store) {
      await store.shutdown();
    }
  });

  describe("Initialization without embeddings", () => {
    it("should initialize successfully without embedding credentials", async () => {
      store = new DocumentStore(":memory:");
      await expect(store.initialize()).resolves.not.toThrow();
    });

    it("should store documents without vectorization", async () => {
      store = new DocumentStore(":memory:");
      await store.initialize();

      await expect(
        store.addDocuments(
          "react",
          "18.0.0",
          1,
          createScrapeResult(
            "React Hooks Guide",
            "https://example.com/react-hooks",
            "This is a test document about React hooks.",
            ["React", "Hooks"],
          ),
        ),
      ).resolves.not.toThrow();

      const exists = await store.checkDocumentExists("react", "18.0.0");
      expect(exists).toBe(true);
    });
  });

  describe("FTS-only Search", () => {
    beforeEach(async () => {
      store = new DocumentStore(":memory:");
      await store.initialize();

      await store.addDocuments(
        "testlib",
        "1.0.0",
        1,
        createScrapeResult(
          "React Hooks Guide",
          "https://example.com/react-hooks",
          "React hooks are a powerful feature for state management.",
          ["React", "Hooks"],
        ),
      );
      await store.addDocuments(
        "testlib",
        "1.0.0",
        1,
        createScrapeResult(
          "TypeScript Introduction",
          "https://example.com/typescript-intro",
          "TypeScript provides excellent type safety for JavaScript.",
          ["TypeScript", "Intro"],
        ),
      );
    });

    it("should perform FTS-only search", async () => {
      const results = await store.findByContent("testlib", "1.0.0", "React hooks", 5);
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].content).toContain("React hooks");
      expect(results[0]).toHaveProperty("score");
      expect(results[0]).toHaveProperty("fts_rank");
      // Should NOT have vector rank since vectorization is disabled
      expect((results[0] as any).vec_rank).toBeUndefined();
    });

    it("should handle various search queries correctly", async () => {
      const jsResults = await store.findByContent("testlib", "1.0.0", "TypeScript", 5);
      expect(jsResults.length).toBeGreaterThan(0);
      expect(jsResults[0].content).toContain("TypeScript");

      // Empty query should return empty results
      const emptyResults = await store.findByContent("testlib", "1.0.0", "", 5);
      expect(emptyResults).toHaveLength(0);
    });

    it("should escape FTS queries safely", async () => {
      const maliciousQueries = [
        "'; DROP TABLE documents; --",
        "programming & development",
        "function()",
        "test* wildcard",
      ];

      for (const query of maliciousQueries) {
        await expect(
          store.findByContent("testlib", "1.0.0", query, 10),
        ).resolves.not.toThrow();
      }
    });
  });
});

/**
 * Common tests that work in both embedding and non-embedding modes
 * These tests focus on core database functionality
 */
describe("DocumentStore - Common Functionality", () => {
  let store: DocumentStore;

  // Use embeddings for these tests
  beforeEach(async () => {
    const embeddingConfig = EmbeddingConfig.parseEmbeddingConfig(
      "openai:text-embedding-3-small",
    );
    store = new DocumentStore(":memory:", embeddingConfig);
    await store.initialize();
  });

  afterEach(async () => {
    if (store) {
      await store.shutdown();
    }
  });

  describe("getActiveEmbeddingConfig", () => {
    it("should return null when no embedding config is provided", async () => {
      // Create a store without embedding config (FTS-only mode)
      const ftsOnlyStore = new DocumentStore(":memory:");
      await ftsOnlyStore.initialize();

      const config = ftsOnlyStore.getActiveEmbeddingConfig();
      expect(config).toBeNull();

      await ftsOnlyStore.shutdown();
    });
  });

  describe("Case Sensitivity", () => {
    it("treats version names case-insensitively within a library", async () => {
      const v1 = await store.resolveVersionId("cslib", "1.0.0");
      const v2 = await store.resolveVersionId("cslib", "1.0.0");
      const v3 = await store.resolveVersionId("cslib", "1.0.0");
      expect(v1).toBe(v2);
      expect(v2).toBe(v3);
    });

    it("collapses mixed-case version names to a single version id", async () => {
      const v1 = await store.resolveVersionId("mixcase", "Alpha");
      const v2 = await store.resolveVersionId("mixcase", "alpha");
      const v3 = await store.resolveVersionId("mixcase", "ALPHA");
      expect(v1).toBe(v2);
      expect(v2).toBe(v3);
    });
  });

  describe("Version Isolation", () => {
    it("should search within specific versions only", async () => {
      await store.addDocuments(
        "featuretest",
        "1.0.0",
        1,
        createScrapeResult(
          "Old Feature",
          "https://example.com/old",
          "Old feature documentation",
          ["features"],
        ),
      );
      await store.addDocuments(
        "featuretest",
        "2.0.0",
        1,
        createScrapeResult(
          "New Feature",
          "https://example.com/new",
          "New feature documentation",
          ["features"],
        ),
      );

      const v1Results = await store.findByContent("featuretest", "1.0.0", "feature", 10);
      expect(v1Results.length).toBeGreaterThan(0);
      expect(v1Results[0].title).toBe("Old Feature");

      const v2Results = await store.findByContent("featuretest", "2.0.0", "feature", 10);
      expect(v2Results.length).toBeGreaterThan(0);
      expect(v2Results[0].title).toBe("New Feature");
    });
  });

  describe("Document Management", () => {
    it("should delete both documents and pages when removing all documents", async () => {
      const library = "delete-test";
      const version = "1.0.0";

      // Add multiple pages with documents
      await store.addDocuments(
        library,
        version,
        1,
        createScrapeResult("Page 1", "https://example.com/page1", "Content for page 1", [
          "section1",
        ]),
      );
      await store.addDocuments(
        library,
        version,
        1,
        createScrapeResult("Page 2", "https://example.com/page2", "Content for page 2", [
          "section2",
        ]),
      );

      // Verify both pages and documents exist
      const versionId = await store.resolveVersionId(library, version);
      const pagesBefore = await store.getPagesByVersionId(versionId);
      expect(pagesBefore.length).toBe(2);
      expect(await store.checkDocumentExists(library, version)).toBe(true);

      // Delete all documents for this version
      const deletedCount = await store.deletePages(library, version);
      expect(deletedCount).toBe(2); // Should delete 2 documents

      // Verify both documents AND pages are gone
      const pagesAfter = await store.getPagesByVersionId(versionId);
      expect(pagesAfter.length).toBe(0); // Pages should be deleted too
      expect(await store.checkDocumentExists(library, version)).toBe(false);
    });

    it("should retrieve documents by ID", async () => {
      await store.addDocuments(
        "idtest",
        "1.0.0",
        1,
        createScrapeResult(
          "ID Test Doc",
          "https://example.com/id-test",
          "Test document for ID retrieval",
          ["test"],
        ),
      );

      const results = await store.findByContent("idtest", "1.0.0", "test document", 10);
      expect(results.length).toBeGreaterThan(0);
      const doc = results[0];
      expect(doc.id).toBeDefined();

      const retrievedDoc = await store.getById(doc.id);
      expect(retrievedDoc).not.toBeNull();
      expect(retrievedDoc?.title).toBe("ID Test Doc");
    });

    it("should handle URL pre-deletion correctly", async () => {
      const library = "url-update-test";
      const version = "1.0.0";
      const url = "https://example.com/test-page";

      // Helper function to count documents
      async function countDocuments(targetUrl?: string): Promise<number> {
        let query = `
          SELECT COUNT(*) as count
          FROM documents d
          JOIN pages p ON d.page_id = p.id
          JOIN versions v ON p.version_id = v.id
          JOIN libraries l ON v.library_id = l.id
          WHERE l.name = ? AND COALESCE(v.name, '') = ?
        `;
        const params: any[] = [library.toLowerCase(), version.toLowerCase()];
        if (targetUrl) {
          query += " AND p.url = ?";
          params.push(targetUrl);
        }
        const result = (store as any).db.prepare(query).get(...params) as {
          count: number;
        };
        return result.count;
      }

      // Add initial page with 2 chunks
      await store.addDocuments(library, version, 1, {
        ...createScrapeResult("Initial Test Page", url, "Initial content chunk 1", [
          "section1",
        ]),
        chunks: [
          {
            types: ["text"],
            content: "Initial content chunk 1",
            section: { level: 0, path: ["section1"] },
          },
          {
            types: ["text"],
            content: "Initial content chunk 2",
            section: { level: 0, path: ["section2"] },
          },
        ],
      });

      expect(await countDocuments()).toBe(2);
      expect(await countDocuments(url)).toBe(2);

      // Update with new page (should trigger pre-deletion)
      await store.addDocuments(library, version, 1, {
        ...createScrapeResult("Updated Test Page", url, "Updated content chunk 1", [
          "updated-section1",
        ]),
        chunks: [
          {
            types: ["text"],
            content: "Updated content chunk 1",
            section: { level: 0, path: ["updated-section1"] },
          },
          {
            types: ["text"],
            content: "Updated content chunk 2",
            section: { level: 0, path: ["updated-section2"] },
          },
          {
            types: ["text"],
            content: "Updated content chunk 3",
            section: { level: 0, path: ["updated-section3"] },
          },
        ],
      });

      expect(await countDocuments()).toBe(3);
      expect(await countDocuments(url)).toBe(3);
    });
  });

  describe("Search Security", () => {
    beforeEach(async () => {
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Programming Guide",
          "https://example.com/programming",
          "Programming computers is fun and educational for developers",
          ["programming", "guide"],
        ),
      );
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "CLI Options",
          "https://example.com/cli-options",
          "Use the --error-on-warnings flag to fail the build on warnings.",
          ["cli", "options"],
        ),
      );
    });

    it("should safely handle malicious queries", async () => {
      const maliciousQuery = "'; DROP TABLE documents; --";
      await expect(
        store.findByContent("security-test", "1.0.0", maliciousQuery, 10),
      ).resolves.not.toThrow();

      // Verify database is still functional
      const normalResults = await store.findByContent(
        "security-test",
        "1.0.0",
        "programming",
        10,
      );
      expect(normalResults.length).toBeGreaterThan(0);
    });

    it("should handle special characters safely", async () => {
      const specialCharQueries = [
        "programming & development",
        "software (lifecycle)",
        "price: $99.99",
        "100% coverage",
      ];

      for (const query of specialCharQueries) {
        await expect(
          store.findByContent("security-test", "1.0.0", query, 10),
        ).resolves.not.toThrow();
      }
    });

    it("should handle quoted strings with hyphens (issue #262)", async () => {
      // Reproduction for: https://github.com/arabold/docs-mcp-server/issues/262
      // Query: "--error-on-warnings" (including quotes) should not throw syntax error
      const results = await store.findByContent(
        "security-test",
        "1.0.0",
        '"--error-on-warnings"',
        10,
      );

      // Should find the document containing --error-on-warnings
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].content).toContain("--error-on-warnings");
    });

    it("should handle other quoted strings with special characters", async () => {
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Special Chars",
          "https://example.com/special",
          "Use @decorator or $variable in your code.",
          ["syntax"],
        ),
      );

      // Test various quoted strings with special characters
      const testQueries = [
        '"@decorator"',
        '"$variable"',
        '"foo-bar-baz"',
        '"test.method()"',
      ];

      for (const query of testQueries) {
        await expect(
          store.findByContent("security-test", "1.0.0", query, 10),
        ).resolves.not.toThrow();
      }
    });

    it("should handle unbalanced quotes by auto-closing them", async () => {
      // User forgot closing quote
      const query = '"--error-on-warnings';

      // Should not throw syntax error
      await expect(
        store.findByContent("security-test", "1.0.0", query, 10),
      ).resolves.not.toThrow();
    });

    it("should preserve phrase search when user provides quotes", async () => {
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Phrase Test",
          "https://example.com/phrase",
          "The quick brown fox jumps over the lazy dog.",
          ["test"],
        ),
      );

      // Quoted phrase should find exact phrase
      const phraseResults = await store.findByContent(
        "security-test",
        "1.0.0",
        '"quick brown fox"',
        10,
      );
      expect(phraseResults.length).toBeGreaterThan(0);
      expect(phraseResults[0].content).toContain("quick brown fox");
    });

    it("should support mixed quoted phrases and unquoted terms", async () => {
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Mixed Search Test",
          "https://example.com/mixed",
          "Modern programming requires knowledge of design patterns and best practices.",
          ["programming"],
        ),
      );

      // Test mixed search: unquoted term + quoted phrase
      const results = await store.findByContent(
        "security-test",
        "1.0.0",
        'programming "design patterns"',
        10,
      );

      // Should find documents containing both "programming" AND the phrase "design patterns"
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].content).toContain("programming");
      expect(results[0].content).toContain("design patterns");
    });

    it("should treat FTS operators as literal keywords when in unquoted position", async () => {
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "OR Keyword Test",
          "https://example.com/or-test",
          "You can use OR conditions in your queries to match multiple terms.",
          ["queries"],
        ),
      );

      // Query: "queries" OR malicious
      // With OR semantics, each term is treated as optional
      // So this becomes: exact match "queries OR malicious" OR ("queries" OR "OR" OR "malicious")
      // This document contains "queries" and "OR", so it will match
      const results = await store.findByContent(
        "security-test",
        "1.0.0",
        '"queries" OR malicious',
        10,
      );

      // This document contains "queries" and "OR" which satisfies the OR condition
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].content).toContain("queries");
      expect(results[0].content).toContain("OR");
    });

    it("should handle NOT operator as a literal keyword", async () => {
      await store.addDocuments(
        "security-test",
        "1.0.0",
        1,
        createScrapeResult(
          "NOT Keyword Test",
          "https://example.com/not-test",
          "You should NOT use this approach in production code.",
          ["warnings"],
        ),
      );

      // Query with NOT - treated as literal keyword with OR semantics
      // Becomes: exact match "production NOT unsafe" OR ("production" OR "NOT" OR "unsafe")
      // Document has "production" and "NOT" which satisfies the OR condition
      const results = await store.findByContent(
        "security-test",
        "1.0.0",
        '"production" NOT unsafe',
        10,
      );

      // Document has "production" and "NOT" which matches via OR
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].content).toContain("production");
      expect(results[0].content).toContain("NOT");
    });
  });

  describe("Refresh Operations - getPagesByVersionId", () => {
    beforeEach(async () => {
      // Add pages with etags for building refresh queue
      await store.addDocuments(
        "refresh-queue-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Page 1",
          "https://example.com/page1",
          "Content 1",
          ["section1"],
          { etag: '"etag1"', lastModified: "2023-01-01T00:00:00Z" },
        ),
      );
      await store.addDocuments(
        "refresh-queue-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Page 2",
          "https://example.com/page2",
          "Content 2",
          ["section2"],
          { etag: '"etag2"', lastModified: "2023-01-02T00:00:00Z" },
        ),
      );
      await store.addDocuments(
        "refresh-queue-test",
        "1.0.0",
        1,
        createScrapeResult(
          "Page 3 No ETag",
          "https://example.com/page3",
          "Content 3",
          ["section3"],
          { etag: null, lastModified: null },
        ),
      );
    });

    it("should retrieve all pages with metadata for refresh queue building", async () => {
      const versionId = await store.resolveVersionId("refresh-queue-test", "1.0.0");
      const pages = await store.getPagesByVersionId(versionId);

      expect(pages.length).toBe(3);

      // Verify page1 metadata
      const page1 = pages.find((p) => p.url === "https://example.com/page1");
      expect(page1).toBeDefined();
      expect(page1!.id).toBeDefined();
      expect(page1!.etag).toBe('"etag1"');
      expect(page1!.depth).toBe(1);

      // Verify page2 metadata
      const page2 = pages.find((p) => p.url === "https://example.com/page2");
      expect(page2).toBeDefined();
      expect(page2!.etag).toBe('"etag2"');

      // Verify page3 (no etag)
      const page3 = pages.find((p) => p.url === "https://example.com/page3");
      expect(page3).toBeDefined();
      expect(page3!.etag).toBeNull();
    });

    it("should return empty array for version with no pages", async () => {
      const emptyVersionId = await store.resolveVersionId("empty-lib", "1.0.0");
      const pages = await store.getPagesByVersionId(emptyVersionId);
      expect(pages).toEqual([]);
    });

    it("should include all metadata fields needed for refresh", async () => {
      const versionId = await store.resolveVersionId("refresh-queue-test", "1.0.0");
      const pages = await store.getPagesByVersionId(versionId);

      // All pages should have the necessary fields for refresh operations
      for (const page of pages) {
        expect(page.id).toBeDefined();
        expect(page.url).toBeDefined();
        expect(page.depth).toBeDefined();
        // etag can be null, but the field should exist
        expect(page).toHaveProperty("etag");
      }
    });
  });
});
