LocalFileStrategy.test.ts
import { vol } from "memfs";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { ProgressCallback } from "../../types";
import type { ScrapeResult, ScraperOptions, ScraperProgressEvent } from "../types";
import { LocalFileStrategy } from "./LocalFileStrategy";

vi.mock("node:fs/promises", () => ({ default: vol.promises }));
vi.mock("node:fs");

describe("LocalFileStrategy", () => {
  beforeEach(() => {
    vol.reset();
  });

  it("should handle file:// URLs", () => {
    const strategy = new LocalFileStrategy();
    expect(strategy.canHandle("file:///path/to/file.txt")).toBe(true);
    expect(strategy.canHandle("https://example.com")).toBe(false);
  });

  it("should process a single file", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///test.md",
      library: "test",
      version: "1.0",
      maxPages: 1,
      maxDepth: 0,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/test.md": "# Test\n\nThis is a test file.",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    expect(progressCallback).toHaveBeenCalledTimes(1);
    const firstCall = progressCallback.mock.calls[0][0];
    expect(firstCall).toMatchObject({
      pagesScraped: 1,
      currentUrl: "file:///test.md",
      depth: 0,
      maxDepth: 0,
      totalPages: 1,
      totalDiscovered: 1,
      pageId: undefined,
      result: {
        textContent: "# Test\n\nThis is a test file.",
        contentType: "text/markdown",
        url: "file:///test.md",
        title: "Test",
        links: [],
        errors: [],
        chunks: [
          {
            content: "# Test\nThis is a test file.", // content is simplified
            section: {
              level: 1,
              path: ["Test"],
            },
            types: ["heading", "text"],
          },
        ],
      },
    } satisfies Partial<ScraperProgressEvent>);
    expect(firstCall.result?.etag).toBeDefined();
    expect(firstCall.result?.lastModified).toBeDefined();
  });

  it("should process a directory with files and a subdirectory", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///testdir",
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 2,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/testdir/file1.md": "# File 1",
        "/testdir/file2.html":
          "<html><head><title>File 2 Title</title></head><body><h1>File 2</h1></body></html>",
        "/testdir/subdir/file3.txt": "File 3",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    // Should process file1.md, file2.html, and file3.txt (in subdir, depth=2)
    expect(progressCallback).toHaveBeenCalledTimes(3);
  });

  it("should process different file types correctly", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///testdir",
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 1,
      maxConcurrency: 1,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/testdir/file1.md": "# File 1",
        "/testdir/file2.html":
          "<html><head><title>File 2 Title</title></head><body><h1>File 2</h1></body></html>",
        "/testdir/file3.txt": "File 3",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    // All 3 files are pages: file1.md, file2.html, and file3.txt (as markdown)
    expect(progressCallback).toHaveBeenCalledTimes(3);

    // Validate .md
    expect(progressCallback).toHaveBeenNthCalledWith(
      1,
      expect.objectContaining({
        pagesScraped: 1,
        currentUrl: "file:///testdir/file1.md",
        depth: 1,
        maxDepth: 1,
        totalPages: 4,
        totalDiscovered: 4,
        result: expect.objectContaining({
          textContent: "# File 1",
          contentType: "text/markdown",
          url: "file:///testdir/file1.md",
          title: "File 1",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Validate .html
    expect(progressCallback).toHaveBeenNthCalledWith(
      2,
      expect.objectContaining({
        pagesScraped: 2,
        currentUrl: "file:///testdir/file2.html",
        depth: 1,
        maxDepth: 1,
        totalPages: 4,
        totalDiscovered: 4,
        result: expect.objectContaining({
          textContent: expect.stringContaining("# File 2"),
          contentType: "text/html",
          url: "file:///testdir/file2.html",
          title: "File 2 Title",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Validate .txt
    expect(progressCallback).toHaveBeenNthCalledWith(
      3,
      expect.objectContaining({
        pagesScraped: 3,
        currentUrl: "file:///testdir/file3.txt",
        depth: 1,
        maxDepth: 1,
        totalPages: 4,
        totalDiscovered: 4,
        result: expect.objectContaining({
          textContent: "File 3",
          contentType: "text/plain",
          url: "file:///testdir/file3.txt",
          title: "file3.txt",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );
  });

  it("should detect source code file types with correct MIME types", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///codebase",
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 1,
      maxConcurrency: 1,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/codebase/app.ts": "interface User {\n name: string;\n}",
        "/codebase/component.tsx": "export const App = () => <div>Hello</div>;",
        "/codebase/script.py": "def hello():\n print('world')",
        "/codebase/main.go": 'package main\n\nfunc main() {\n fmt.Println("Hello")\n}',
        "/codebase/lib.rs": 'fn main() {\n println!("Hello, world!");\n}',
        "/codebase/App.kt": 'fun main() {\n println("Hello, world!")\n}',
        "/codebase/script.rb": "puts 'Hello, world!'",
        "/codebase/run.sh": "#!/bin/bash\necho 'Hello, world!'",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);

    // Expect 8 files to be processed
    expect(progressCallback).toHaveBeenCalledTimes(8);

    // Check TypeScript file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "app.ts",
          textContent: expect.stringContaining("interface User"),
          contentType: "text/x-typescript",
          url: "file:///codebase/app.ts",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check TSX file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "component.tsx",
          textContent: expect.stringContaining("export const App"),
          contentType: "text/x-tsx",
          url: "file:///codebase/component.tsx",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check Python file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "script.py",
          textContent: expect.stringContaining("def hello"),
          contentType: "text/x-python",
          url: "file:///codebase/script.py",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check Go file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "main.go",
          textContent: expect.stringContaining("package main"),
          contentType: "text/x-go",
          url: "file:///codebase/main.go",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check Rust file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "lib.rs",
          textContent: expect.stringContaining("fn main"),
          contentType: "text/x-rust",
          url: "file:///codebase/lib.rs",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check Kotlin file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "App.kt",
          textContent: expect.stringContaining("fun main"),
          contentType: "text/x-kotlin",
          url: "file:///codebase/App.kt",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check Ruby file
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          title: "script.rb",
          textContent: expect.stringContaining("puts"),
          contentType: "text/x-ruby",
          url: "file:///codebase/script.rb",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );

    // Check Shell script
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        result: expect.objectContaining({
          textContent: expect.stringContaining("#!/bin/bash"),
          contentType: "text/x-shellscript",
          url: "file:///codebase/run.sh",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );
  });

  it("should handle empty files", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///testdir",
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 1,
      maxConcurrency: 1,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/testdir/empty.md": "",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    expect(progressCallback).toHaveBeenCalledTimes(1);
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        pagesScraped: 1,
        currentUrl: "file:///testdir/empty.md",
        result: expect.objectContaining({
          textContent: "",
          contentType: "text/markdown",
          title: "Untitled",
          url: "file:///testdir/empty.md",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );
  });

  it("should skip binary/unsupported files and only process supported text files", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///testdir",
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 1,
      maxConcurrency: 1,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    // Simulate a binary file (with null bytes) and an image file
    vol.fromJSON(
      {
        "/testdir/file1.md": "# File 1", // supported
        "/testdir/file2.png": Buffer.from([
          0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x00,
        ]).toString("binary"), // PNG signature + null bytes
        "/testdir/file3.txt": "File 3", // supported
        "/testdir/file4.bin": Buffer.from([0x00, 0x01, 0x02, 0x03, 0x00]).toString(
          "binary",
        ), // binary with null bytes
        "/testdir/file5.html": "<html><body>File 5</body></html>", // supported
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);

    // Only .md, .txt, and .html should be processed
    expect(progressCallback).toHaveBeenCalledTimes(3);
    const calledUrls = progressCallback.mock.calls.map((call) => call[0].currentUrl);
    expect(calledUrls).toContain("file:///testdir/file1.md");
    expect(calledUrls).toContain("file:///testdir/file3.txt");
    expect(calledUrls).toContain("file:///testdir/file5.html");
    // Should NOT process binary/image files
    expect(calledUrls).not.toContain("file:///testdir/file2.png");
    expect(calledUrls).not.toContain("file:///testdir/file4.bin");
  });

  it("should respect include and exclude patterns for local crawling", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///testdir",
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 1,
      includePatterns: ["/file1.md", "/file3.txt"],
      excludePatterns: ["/file3.txt"], // exclude takes precedence
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/testdir/file1.md": "# File 1", // should be included
        "/testdir/file2.html": "<html><body>File 2</body></html>", // should be excluded (not in include)
        "/testdir/file3.txt": "File 3", // should be excluded (in exclude)
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);

    // Only file1.md should be processed
    expect(progressCallback).toHaveBeenCalledTimes(1);
    const calledUrls = progressCallback.mock.calls.map((call) => call[0].currentUrl);
    expect(calledUrls).toContain("file:///testdir/file1.md");
    expect(calledUrls).not.toContain("file:///testdir/file2.html");
    expect(calledUrls).not.toContain("file:///testdir/file3.txt");
  });

  it("should process files and folders with spaces in their names (percent-encoded in file:// URL)", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///test%20dir/space%20file.md",
      library: "test",
      version: "1.0",
      maxPages: 1,
      maxDepth: 0,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/test dir/space file.md": "# Space File\n\nThis file has spaces in its name.",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    expect(progressCallback).toHaveBeenCalledTimes(1);
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        pagesScraped: 1,
        currentUrl: "file:///test%20dir/space%20file.md",
        result: expect.objectContaining({
          textContent: "# Space File\n\nThis file has spaces in its name.",
          contentType: "text/markdown",
          url: "file:///test%20dir/space%20file.md",
          title: "Space File",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );
  });

  it("should decode percent-encoded file paths (spaces as %20) for local crawling", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///test%20dir", // percent-encoded space
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 1,
      maxConcurrency: 1,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    vol.fromJSON(
      {
        "/test dir/file with space.md": "# File With Space",
        "/test dir/normal.md": "# Normal File",
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);

    // Both files should be processed
    expect(progressCallback).toHaveBeenCalledTimes(2);
    const calledUrls = progressCallback.mock.calls.map((call) => call[0].currentUrl);
    expect(calledUrls).toContain("file:///test%20dir/file%20with%20space.md");
    expect(calledUrls).toContain("file:///test%20dir/normal.md");
  });

  it("should process JSON files through JsonPipeline", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file:///api-docs.json",
      library: "test-api",
      version: "1.0.0",
      maxPages: 1,
      maxDepth: 0,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();

    // Create a JSON file with API documentation structure
    const jsonContent = JSON.stringify(
      {
        title: "Test API Documentation",
        version: "1.0.0",
        endpoints: {
          users: {
            get: {
              description: "Get all users",
              method: "GET",
              path: "/users",
            },
            post: {
              description: "Create a new user",
              method: "POST",
              path: "/users",
              body: {
                name: "string",
                email: "string",
              },
            },
          },
        },
        schemas: {
          User: {
            id: "integer",
            name: "string",
            email: "string",
          },
        },
      },
      null,
      2,
    );

    vol.fromJSON(
      {
        "/api-docs.json": jsonContent,
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    expect(progressCallback).toHaveBeenCalledTimes(1);
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        pagesScraped: 1,
        currentUrl: "file:///api-docs.json",
        depth: 0,
        maxDepth: 0,
        totalPages: 1,
        totalDiscovered: 1,
        result: expect.objectContaining({
          textContent: jsonContent,
          contentType: "application/json",
          title: "Test API Documentation",
          url: "file:///api-docs.json",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );
  });

  it("should handle malformed file URLs with only two slashes", async () => {
    const strategy = new LocalFileStrategy();
    const options: ScraperOptions = {
      url: "file://testdir/test.md", // Note: only two slashes (malformed)
      library: "test",
      version: "1.0",
      maxPages: 10,
      maxDepth: 0,
      maxConcurrency: 1,
    };
    const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
    const testContent = "# Test Content\nThis is a test file.";
    vol.fromJSON(
      {
        "/testdir/test.md": testContent,
      },
      "/",
    );

    await strategy.scrape(options, progressCallback);
    expect(progressCallback).toHaveBeenCalledTimes(1);
    expect(progressCallback).toHaveBeenCalledWith(
      expect.objectContaining({
        pagesScraped: 1,
        currentUrl: "file://testdir/test.md", // Original malformed URL preserved
        result: expect.objectContaining({
          textContent: testContent,
          contentType: "text/markdown",
          title: "Test Content",
          url: "file://testdir/test.md",
        } satisfies Partial<ScrapeResult>),
      } satisfies Partial<ScraperProgressEvent>),
    );
  });

  describe("refresh workflow", () => {
    it("should skip processing when file returns NOT_MODIFIED (unchanged)", async () => {
      const strategy = new LocalFileStrategy();
      const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
      const testContent = "# Test File\nOriginal content";

      // Create initial file with a specific mtime
      vol.fromJSON({ "/test.md": testContent }, "/");

      // Get the file stats to capture the exact mtime
      const stats = await vol.promises.stat("/test.md");
      const initialMtime = stats.mtime;

      // First scrape to get the initial etag
      const initialOptions: ScraperOptions = {
        url: "file:///test.md",
        library: "test",
        version: "1.0",
        maxPages: 1,
        maxDepth: 0,
      };
      await strategy.scrape(initialOptions, progressCallback);
      expect(progressCallback).toHaveBeenCalledTimes(1);

      // Get the etag from the first scrape
      const firstCall = progressCallback.mock.calls[0][0];
      const etag = firstCall.result?.etag;

      // Verify the mtime hasn't changed
      const statsAfterScrape = await vol.promises.stat("/test.md");
      expect(statsAfterScrape.mtime.getTime()).toBe(initialMtime.getTime());

      // Reset the callback but DON'T reset the filesystem
      // This preserves the file's mtime, so the etag stays the same
      progressCallback.mockClear();

      // Now do a refresh with the same etag (file unchanged)
      const refreshOptions: ScraperOptions = {
        url: "file:///test.md",
        library: "test",
        version: "1.0",
        maxPages: 1,
        maxDepth: 0,
        initialQueue: [
          {
            url: "file:///test.md",
            depth: 0,
            pageId: 123,
            etag: etag,
          },
        ],
      };
      await strategy.scrape(refreshOptions, progressCallback);

      // Verify file was checked but returned NOT_MODIFIED (no result with content)
      // The root URL at depth 0 is always processed to check for changes
      expect(progressCallback).toHaveBeenCalledTimes(1);
      expect(progressCallback).toHaveBeenCalledWith(
        expect.objectContaining({
          pagesScraped: 1,
          currentUrl: "file:///test.md",
          depth: 0,
          result: null, // NOT_MODIFIED returns null result
          pageId: 123,
        }),
      );
    });

    it("should re-process file when it has been modified", async () => {
      const strategy = new LocalFileStrategy();
      const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
      const originalContent = "# Original\nOriginal content";
      const updatedContent = "# Updated\nNew updated content";

      // Create initial file
      vol.fromJSON({ "/test.md": originalContent }, "/");

      // First scrape
      const initialOptions: ScraperOptions = {
        url: "file:///test.md",
        library: "test",
        version: "1.0",
        maxPages: 1,
        maxDepth: 0,
      };
      await strategy.scrape(initialOptions, progressCallback);
      const firstCall = progressCallback.mock.calls[0][0];
      const oldEtag = firstCall.result?.etag;

      // Modify the file (update content and mtime)
      // Using a new date for fromJSON will create a new mtime
      vol.reset();
      vol.fromJSON({ "/test.md": updatedContent }, "/");

      // Wait a bit to ensure different mtime
      await new Promise((resolve) => setTimeout(resolve, 10));

      progressCallback.mockClear();

      // Refresh with old etag
      const refreshOptions: ScraperOptions = {
        url: "file:///test.md",
        library: "test",
        version: "1.0",
        maxPages: 1,
        maxDepth: 0,
        initialQueue: [
          {
            url: "file:///test.md",
            depth: 0,
            pageId: 456,
            etag: oldEtag,
          },
        ],
      };
      await strategy.scrape(refreshOptions, progressCallback);

      // Verify file was re-processed
      const docCalls = progressCallback.mock.calls.filter((call) => call[0].result);
      expect(docCalls).toHaveLength(1);
      expect(docCalls[0][0].result?.textContent).toContain("# Updated");
      expect(docCalls[0][0].result?.textContent).toContain("New updated content");
      expect(docCalls[0][0].result?.title).toBe("Updated");

      // Verify new etag is different
      expect(docCalls[0][0].result?.etag).not.toBe(oldEtag);
    });

    it("should handle deleted files during refresh", async () => {
      const strategy = new LocalFileStrategy();
      const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();
      const testContent = "# Test File\nContent";

      // Create initial file
      vol.fromJSON({ "/test.md": testContent }, "/");

      // First scrape
      const initialOptions: ScraperOptions = {
        url: "file:///test.md",
        library: "test",
        version: "1.0",
        maxPages: 1,
        maxDepth: 0,
      };
      await strategy.scrape(initialOptions, progressCallback);
      const firstCall = progressCallback.mock.calls[0][0];
      const etag = firstCall.result?.etag;

      // Delete the file
      vol.reset();
      progressCallback.mockClear();

      // Refresh with deleted file
      const refreshOptions: ScraperOptions = {
        url: "file:///test.md",
        library: "test",
        version: "1.0",
        maxPages: 1,
        maxDepth: 0,
        initialQueue: [
          {
            url: "file:///test.md",
            depth: 0,
            pageId: 789,
            etag: etag,
          },
        ],
      };
      await strategy.scrape(refreshOptions, progressCallback);

      // Verify no processed documents were returned
      const docCalls = progressCallback.mock.calls.filter((call) => call[0].result);
      expect(docCalls).toHaveLength(0);
    });

    it("should discover and process new files in a directory during refresh", async () => {
      const strategy = new LocalFileStrategy();
      const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();

      // Create initial directory with one file
      vol.fromJSON(
        {
          "/testdir/file1.md": "# File 1",
        },
        "/",
      );

      // First scrape
      const initialOptions: ScraperOptions = {
        url: "file:///testdir",
        library: "test",
        version: "1.0",
        maxPages: 10,
        maxDepth: 1,
      };
      await strategy.scrape(initialOptions, progressCallback);
      expect(progressCallback).toHaveBeenCalledTimes(1);

      // Add a new file to the directory
      vol.fromJSON(
        {
          "/testdir/file1.md": "# File 1",
          "/testdir/file2.md": "# File 2\nNew file added",
        },
        "/",
      );

      progressCallback.mockClear();

      // Refresh the directory (directories don't use etag, they just re-scan)
      const refreshOptions: ScraperOptions = {
        url: "file:///testdir",
        library: "test",
        version: "1.0",
        maxPages: 10,
        maxDepth: 1,
      };
      await strategy.scrape(refreshOptions, progressCallback);

      // Should process both files
      expect(progressCallback).toHaveBeenCalledTimes(2);
      const calledUrls = progressCallback.mock.calls.map((call) => call[0].currentUrl);
      expect(calledUrls).toContain("file:///testdir/file1.md");
      expect(calledUrls).toContain("file:///testdir/file2.md");
    });

    it("should preserve depth from original scrape during refresh for nested files", async () => {
      const strategy = new LocalFileStrategy();
      const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>();

      vol.fromJSON(
        {
          "/testdir/subdir/deep/file.md": "# Deep File\nOriginal content",
        },
        "/",
      );

      // First scrape starting from directory - file will be discovered at depth 3
      const initialOptions: ScraperOptions = {
        url: "file:///testdir",
        library: "test",
        version: "1.0",
        maxPages: 10,
        maxDepth: 3,
      };
      await strategy.scrape(initialOptions, progressCallback);
      expect(progressCallback).toHaveBeenCalledTimes(1);
      const firstCall = progressCallback.mock.calls[0][0];
      expect(firstCall.depth).toBe(3); // File discovered at depth 3
      const etag = firstCall.result?.etag;

      // Update the file with new content
      vol.reset();
      vol.fromJSON(
        {
          "/testdir/subdir/deep/file.md": "# Deep File\nUpdated content",
        },
        "/",
      );
      await new Promise((resolve) => setTimeout(resolve, 10));

      progressCallback.mockClear();

      // Refresh starting from same directory with file in initialQueue at depth 3
      const refreshOptions: ScraperOptions = {
        url: "file:///testdir",
        library: "test",
        version: "1.0",
        maxPages: 10,
        maxDepth: 3,
        initialQueue: [
          {
            url: "file:///testdir/subdir/deep/file.md",
            depth: 3, // Original depth from discovery
            pageId: 555,
            etag: etag,
          },
        ],
      };
      await strategy.scrape(refreshOptions, progressCallback);

      // Verify file was re-processed and depth from initialQueue is preserved
      const docCalls = progressCallback.mock.calls.filter((call) => call[0].result);
      expect(docCalls).toHaveLength(1);
      expect(docCalls[0][0].depth).toBe(3);
      expect(docCalls[0][0].pageId).toBe(555);
      expect(docCalls[0][0].result?.textContent).toContain("Updated content");
    });
  });
});
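
For orientation, a minimal usage sketch inferred from the tests above: LocalFileStrategy exposes canHandle() and scrape(), which walks a file:// URL and reports each processed file through the progress callback, while an initialQueue entry carrying a stored etag and pageId turns a scrape into a refresh that reports unchanged files with a null result. The import paths mirror this test file's layout; the folder path, limits, etag, and pageId values are hypothetical, and the callback is assumed to be async.

import type { ProgressCallback } from "../../types";
import type { ScraperOptions, ScraperProgressEvent } from "../types";
import { LocalFileStrategy } from "./LocalFileStrategy";

async function crawlLocalDocs(): Promise<void> {
  const strategy = new LocalFileStrategy();

  // One progress event arrives per processed file (assumed async callback).
  const onProgress: ProgressCallback<ScraperProgressEvent> = async (event) => {
    // result is null when a refresh finds the file unchanged (NOT_MODIFIED).
    if (event.result) {
      console.log(
        `${event.pagesScraped}/${event.totalDiscovered}: ${event.currentUrl} (${event.result.contentType})`,
      );
    }
  };

  // Initial scrape of a local folder (path and limits are hypothetical).
  const options: ScraperOptions = {
    url: "file:///path/to/docs",
    library: "my-library",
    version: "1.0",
    maxPages: 100,
    maxDepth: 3,
  };
  if (strategy.canHandle(options.url)) {
    await strategy.scrape(options, onProgress);
  }

  // Later refresh: re-queue a previously seen file with its stored etag and
  // pageId (hypothetical values); unchanged files come back with result: null.
  await strategy.scrape(
    {
      ...options,
      initialQueue: [
        { url: "file:///path/to/docs/guide.md", depth: 1, pageId: 1, etag: "stored-etag" },
      ],
    },
    onProgress,
  );
}

crawlLocalDocs().catch(console.error);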
