LLM Researcher

extractor.test.ts • 3.94 kB

import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { ContentExtractor } from '@/extractor.js';
import { config } from '@/config.js';

/**
 * The non-public ContentExtractor members these tests reach into.
 *
 * NOTE(review): this shape is inferred from how the tests use the members
 * (`browser` is compared against null; `processContent` is called
 * synchronously and its result exposes `title`, `url` and `content`).
 * Confirm against the ContentExtractor implementation.
 */
interface ExtractorInternals {
  browser: unknown;
  processContent(
    html: string,
    title: string,
    url: string,
  ): { title: string; url: string; content: string };
}

/**
 * Gives a test access to ContentExtractor internals.
 *
 * Keeps the single unavoidable `any` cast in one place instead of repeating
 * it at every call site.
 */
function internalsOf(extractor: ContentExtractor): ExtractorInternals {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- deliberate test-only access to non-public members
  return extractor as any;
}

/**
 * Test suite for ContentExtractor.
 *
 * The 'Content extraction' and 'Resource management' groups call
 * `extractor.extract()` against real URLs, so they carry explicit timeouts
 * (in milliseconds) as the third argument to `it`.
 */
describe('ContentExtractor', () => {
  let extractor: ContentExtractor;
  let previousVerbose: typeof config.verbose;

  beforeEach(() => {
    extractor = new ContentExtractor();
    // `config` is a shared module-level object: remember the current value so
    // afterEach can put it back and the change does not leak to other suites.
    previousVerbose = config.verbose;
    config.verbose = true;
  });

  afterEach(async () => {
    try {
      await extractor.close();
    } finally {
      // Restore even when close() rejects.
      config.verbose = previousVerbose;
    }
  });

  describe('Content extraction', () => {
    it('should extract content from a simple HTML page', async () => {
      // Use a reliable, simple page for testing
      const testUrl = 'https://example.com';

      const content = await extractor.extract(testUrl);

      expect(typeof content).toBe('object');
      expect(typeof content.title).toBe('string');
      expect(typeof content.url).toBe('string');
      expect(typeof content.content).toBe('string');
      expect(typeof content.extractedAt).toBe('string');

      expect(content.url).toBe(testUrl);

      // Smoke check only: the body must be more than a few characters long.
      // This does not validate that the output is well-formed markdown.
      expect(content.content.length).toBeGreaterThan(10);
    }, 30000);

    it('should handle extraction errors gracefully', async () => {
      const invalidUrl = 'https://nonexistent-domain-12345.com';

      await expect(extractor.extract(invalidUrl)).rejects.toThrow();
    }, 15000);
  });

  describe('Markdown conversion', () => {
    it('should convert HTML to clean markdown with only allowed tags', () => {
      // Fragments are joined with single spaces; the empty first and last
      // entries produce the leading and trailing space of the fixture.
      const testHtml = [
        '',
        '<h1>Main Title</h1>',
        '<h2>Subtitle</h2>',
        '<p>This is a <strong>bold</strong> and <em>italic</em> text with a <a href="https://example.com">link</a>.</p>',
        "<script>alert('evil');</script>",
        '<iframe src="bad.html"></iframe>',
        '<div class="sidebar">Sidebar content</div>',
        '<h3>Section</h3>',
        '<p>More content here.</p>',
        '',
      ].join(' ');

      // processContent is not part of the public API, so call it through the
      // internals helper rather than going through extract().
      const content = internalsOf(extractor).processContent(
        testHtml,
        'Test Title',
        'https://example.com',
      );

      expect(content.title).toBe('Test Title');
      expect(content.url).toBe('https://example.com');

      // Allowed tags are rendered as markdown.
      expect(content.content).toContain('# Main Title');
      expect(content.content).toContain('## Subtitle');
      expect(content.content).toContain('### Section');
      expect(content.content).toContain('**bold**');
      expect(content.content).toContain('_italic_');
      expect(content.content).toContain('https://example.com');

      // Disallowed tags and their payload must not appear in the output.
      expect(content.content).not.toContain('alert');
      expect(content.content).not.toContain('iframe');
      expect(content.content).not.toContain('script');
    });

    it('should handle empty or invalid HTML', () => {
      const emptyHtml = '';

      // Two outcomes are accepted: a result carrying the supplied metadata, or
      // the "no content" error. Only the call itself sits inside the try so a
      // failed assertion on the result is never mistaken for that error.
      let result: ReturnType<ExtractorInternals['processContent']>;
      try {
        result = internalsOf(extractor).processContent(
          emptyHtml,
          'Empty',
          'https://example.com',
        );
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        expect(message).toContain('No content could be extracted');
        return;
      }

      expect(result).toBeDefined();
      expect(result.title).toBe('Empty');
      expect(result.url).toBe('https://example.com');
    });
  });

  describe('Resource management', () => {
    it('should properly initialize and close browser', async () => {
      expect(extractor).toBeDefined();

      // Browser should be null initially
      expect(internalsOf(extractor).browser).toBeNull();

      // After extraction, browser should exist. toBeDefined() alone would
      // also pass for null, so the null check is what proves it was created.
      await extractor.extract('https://example.com');
      expect(internalsOf(extractor).browser).toBeDefined();
      expect(internalsOf(extractor).browser).not.toBeNull();

      // After close, browser should be null again
      await extractor.close();
      expect(internalsOf(extractor).browser).toBeNull();
    }, 30000);
  });
});

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Code-Hex/light-research-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.