// extractor.test.ts (3.94 kB)
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { ContentExtractor } from '@/extractor.js';
import { config } from '@/config.js';
describe('ContentExtractor', () => {
let extractor: ContentExtractor;
// Value `config.verbose` held before the current test started, so it can be put back.
let savedVerbose: typeof config.verbose;

// Before each test: remember the current `config.verbose`, build a fresh
// extractor, and set `config.verbose` to true for the duration of the test.
beforeEach(() => {
  savedVerbose = config.verbose;
  extractor = new ContentExtractor();
  config.verbose = true;
});

// After each test: close the extractor, then restore `config.verbose`.
// The restore sits in `finally` so that a rejecting `close()` cannot leave the
// imported `config` object modified for whatever runs next.
afterEach(async () => {
  try {
    await extractor.close();
  } finally {
    config.verbose = savedVerbose;
  }
});
describe('Content extraction', () => {
it('should extract content from a simple HTML page', async () => {
  // example.com is chosen as a reliable, simple page to extract from.
  const testUrl = 'https://example.com';
  const extracted = await extractor.extract(testUrl);

  // The result is an object and each of these four fields is a string.
  expect(typeof extracted).toBe('object');
  for (const field of ['title', 'url', 'content', 'extractedAt'] as const) {
    expect(typeof extracted[field]).toBe('string');
  }

  // The reported URL is exactly the one that was requested.
  expect(extracted.url).toBe(testUrl);

  // The body must be non-empty and longer than ten characters. These are
  // length checks only; Markdown syntax itself is not validated here.
  const bodyLength = extracted.content.length;
  expect(bodyLength).toBeGreaterThan(0);
  expect(bodyLength).toBeGreaterThan(10);
}, 30000);
it('should handle extraction errors gracefully', async () => {
  // `.invalid` is a reserved top-level domain that is guaranteed never to
  // resolve, so this lookup fails regardless of what gets registered under
  // `.com` in the future.
  const invalidUrl = 'https://nonexistent-domain-12345.invalid';

  // Extraction of an unreachable host must reject rather than resolve.
  await expect(extractor.extract(invalidUrl)).rejects.toThrow();
}, 15000);
});
describe('Markdown conversion', () => {
it('should convert HTML to clean markdown with only allowed tags', () => {
  // Fixture mixing headings, inline formatting and a link with elements
  // (script, iframe) whose text must not appear in the output.
  const testHtml = `
<h1>Main Title</h1>
<h2>Subtitle</h2>
<p>This is a <strong>bold</strong> and <em>italic</em> text with a <a href="https://example.com">link</a>.</p>
<script>alert('evil');</script>
<iframe src="bad.html"></iframe>
<div class="sidebar">Sidebar content</div>
<h3>Section</h3>
<p>More content here.</p>
`;

  // processContent is called directly on the instance through an `any` cast
  // (presumably because it is not part of the public API — confirm in extractor).
  const {
    title,
    url,
    content: markdown,
  } = (extractor as any).processContent(testHtml, 'Test Title', 'https://example.com');

  // Title and URL are passed through untouched.
  expect(title).toBe('Test Title');
  expect(url).toBe('https://example.com');

  // Fragments that must be present in the converted Markdown.
  const expectedFragments = [
    '# Main Title',
    '## Subtitle',
    '### Section',
    '**bold**',
    '_italic_',
    'https://example.com',
  ];
  for (const fragment of expectedFragments) {
    expect(markdown).toContain(fragment);
  }

  // Fragments that must not appear in the converted Markdown.
  const forbiddenFragments = ['alert', 'iframe', 'script'];
  for (const fragment of forbiddenFragments) {
    expect(markdown).not.toContain(fragment);
  }
});
it('should handle empty or invalid HTML', () => {
  const emptyHtml = '';

  // Two outcomes are accepted for empty input: processContent either throws
  // the "No content could be extracted" error or returns a result carrying
  // the given title and URL.
  //
  // Only the call itself is inside `try`. If the success-path assertions were
  // inside it too, a failing assertion would be caught below and re-reported
  // as a confusing error-message mismatch.
  let result: { title?: unknown; url?: unknown } | undefined;
  try {
    result = (extractor as any).processContent(emptyHtml, 'Empty', 'https://example.com');
  } catch (error) {
    // Throwing is acceptable, but only with the expected message.
    expect((error as Error).message).toContain('No content could be extracted');
    return;
  }

  // No error: the result must exist and echo back the title and URL.
  expect(result).toBeDefined();
  expect(result?.title).toBe('Empty');
  expect(result?.url).toBe('https://example.com');
});
});
describe('Resource management', () => {
it('should properly initialize and close browser', async () => {
  expect(extractor).toBeDefined();

  // A freshly constructed extractor holds no browser yet.
  expect((extractor as any).browser).toBeNull();

  // After an extraction a browser must be held. `toBeDefined()` alone passes
  // for `null` (the initial state asserted above), so the null case is ruled
  // out explicitly as well.
  await extractor.extract('https://example.com');
  expect((extractor as any).browser).toBeDefined();
  expect((extractor as any).browser).not.toBeNull();

  // Closing resets the field to null.
  // NOTE(review): the suite's afterEach calls close() a second time; this
  // relies on close() being safe to call on an already-closed extractor — confirm.
  await extractor.close();
  expect((extractor as any).browser).toBeNull();
}, 30000);
});
});