import { describe, it, expect } from 'bun:test';
import { HtmlParser } from '@/spider/parser.js';
/**
 * Test suite for HtmlParser.
 *
 * Each case feeds a small HTML fixture to a parser constructed with
 * `baseUrl` and checks one field of the parse result: `title`, `content`,
 * `metadata` (description, keywords, language, wordCount) or `links`.
 */
describe('html parser', () => {
  const baseUrl = 'https://example.com';

  /**
   * Parses a fixture with a brand-new HtmlParser so that no parser
   * instance is ever shared between test cases.
   *
   * @param {string} markup - HTML fixture to parse.
   * @returns {object} Whatever `HtmlParser#parse` returns for the fixture.
   */
  const parseHtml = (markup) => new HtmlParser(baseUrl).parse(markup);

  describe('content extraction', () => {
    it('should extract title from h1', () => {
      const { title } = parseHtml(
        '<html><body><h1>Test Title</h1><p>Content</p></body></html>',
      );

      expect(title).toBe('Test Title');
    });

    it('should extract title from title tag', () => {
      const { title } = parseHtml(
        '<html><head><title>Page Title</title></head><body><p>Content</p></body></html>',
      );

      expect(title).toBe('Page Title');
    });

    it('should extract content from body', () => {
      const { content } = parseHtml('<html><body><p>This is content</p></body></html>');

      expect(content).toContain('This is content');
    });

    it('should default to "Untitled" when no title found', () => {
      // Fixture deliberately has neither an <h1> nor a <title> element.
      const { title } = parseHtml('<html><body><p>Content only</p></body></html>');

      expect(title).toBe('Untitled');
    });
  });

  describe('metadata extraction', () => {
    it('should extract meta description', () => {
      // Template-literal fixtures stay flush-left: their whitespace is part of the input.
      const page = `
<html>
<head>
<meta name="description" content="Test description">
</head>
<body><p>Content</p></body>
</html>
`;

      const { metadata } = parseHtml(page);

      expect(metadata.description).toBe('Test description');
    });

    it('should extract meta keywords', () => {
      const page = `
<html>
<head>
<meta name="keywords" content="test, keywords, html">
</head>
<body><p>Content</p></body>
</html>
`;

      const { metadata } = parseHtml(page);

      // The comma-separated attribute is expected back as an array of trimmed entries.
      expect(metadata.keywords).toEqual(['test', 'keywords', 'html']);
    });

    it('should extract language', () => {
      const { metadata } = parseHtml('<html lang="en"><body><p>Content</p></body></html>');

      expect(metadata.language).toBe('en');
    });

    it('should count words', () => {
      const { metadata } = parseHtml(
        '<html><body><p>one two three four five</p></body></html>',
      );

      expect(metadata.wordCount).toBe(5);
    });
  });

  describe('link extraction', () => {
    it('should extract absolute links', () => {
      const page = `
<html>
<body>
<a href="https://other.com/page">External</a>
<a href="/internal">Internal</a>
<p>Content</p>
</body>
</html>
`;

      const { links } = parseHtml(page);

      expect(links).toContain('https://other.com/page');
      // The root-relative href is expected to come back resolved against baseUrl.
      expect(links).toContain('https://example.com/internal');
    });

    it('should deduplicate links', () => {
      const page = `
<html>
<body>
<a href="/page">Link 1</a>
<a href="/page">Link 2</a>
<p>Content</p>
</body>
</html>
`;

      const { links } = parseHtml(page);

      expect(links.length).toBe(1);
      expect(links[0]).toBe('https://example.com/page');
    });

    it('should ignore invalid links', () => {
      const page = `
<html>
<body>
<a href="">Empty</a>
<a href="javascript:void(0)">JS</a>
<a href="/valid">Valid</a>
<p>Content</p>
</body>
</html>
`;

      const { links } = parseHtml(page);

      // Only the "/valid" anchor should survive; empty and javascript: hrefs are dropped.
      expect(links).toEqual(['https://example.com/valid']);
    });
  });
});