import { describe, it, expect } from 'bun:test';
import { MarkdownExtractor } from '@/extractors/markdown.js';
describe('markdown extractor', () => {
const url = 'https://example.com/page';
/**
 * Checks that common HTML constructs (headings, paragraphs, code, lists) show
 * up in the `content` field produced by `MarkdownExtractor#extract`.
 */
describe('html to markdown conversion', () => {
  /**
   * Runs a fresh, default-constructed extractor over `markup` with the
   * suite-wide `url` and resolves to the `content` field of its result.
   */
  const toMarkdown = async (markup) => {
    const { content } = await new MarkdownExtractor().extract(markup, url);
    return content;
  };

  it('should convert headings', async () => {
    const markup = `
<html>
<body>
<h1>Main Title</h1>
<h2>Subtitle</h2>
<p>Content</p>
</body>
</html>
`;
    const markdown = await toMarkdown(markup);

    expect(markdown).toContain('# Main Title');
    expect(markdown).toContain('## Subtitle');
  });

  it('should convert paragraphs', async () => {
    const markup = `
<html>
<body>
<p>First paragraph</p>
<p>Second paragraph</p>
</body>
</html>
`;
    const markdown = await toMarkdown(markup);

    expect(markdown).toContain('First paragraph');
    expect(markdown).toContain('Second paragraph');
  });

  it('should convert code blocks', async () => {
    const markup = `
<html>
<body>
<pre><code class="language-javascript">console.log('hello');</code></pre>
</body>
</html>
`;
    const markdown = await toMarkdown(markup);

    expect(markdown).toContain("console.log('hello');");
    // A fenced block needs an opening AND a closing fence. Asserting
    // toContain('```') twice only ever proves one exists, so count them.
    const fenceCount = markdown.split('```').length - 1;
    expect(fenceCount).toBeGreaterThanOrEqual(2);
  });

  it('should convert inline code', async () => {
    const markup = `
<html>
<body>
<p>Use <code>console.log()</code> to debug</p>
</body>
</html>
`;
    const markdown = await toMarkdown(markup);

    expect(markdown).toContain('`console.log()`');
  });

  it('should convert lists', async () => {
    const markup = `
<html>
<body>
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
</body>
</html>
`;
    const markdown = await toMarkdown(markup);

    // Only the item text is checked; the bullet marker is not asserted.
    expect(markdown).toContain('First item');
    expect(markdown).toContain('Second item');
  });
});
/**
 * Checks that non-content markup (scripts, navigation) is absent from the
 * extracted `content` while the surrounding paragraph text is kept.
 */
describe('content cleaning', () => {
  /** Extracts `markup` with a fresh default extractor and the suite-wide `url`. */
  const extractFrom = (markup) => new MarkdownExtractor().extract(markup, url);

  it('should remove script tags', async () => {
    const markup = `
<html>
<body>
<script>alert('bad');</script>
<p>Good content</p>
</body>
</html>
`;
    const { content } = await extractFrom(markup);

    expect(content).not.toContain('alert');
    expect(content).toContain('Good content');
  });

  it('should remove navigation elements', async () => {
    const markup = `
<html>
<body>
<nav>
<a href="/home">Home</a>
<a href="/about">About</a>
</nav>
<p>Main content</p>
</body>
</html>
`;
    const { content } = await extractFrom(markup);

    // Neither link label from the <nav> may leak into the output.
    expect(content).not.toContain('Home');
    expect(content).not.toContain('About');
    expect(content).toContain('Main content');
  });
});
/**
 * Checks how the `title` field of the extraction result is chosen: from the
 * <h1> when one is present, otherwise from the document's <title> tag.
 */
describe('title extraction', () => {
  /** Extracts `markup` with a fresh default extractor and resolves to its `title`. */
  const titleOf = async (markup) => {
    const { title } = await new MarkdownExtractor().extract(markup, url);
    return title;
  };

  it('should extract title from first heading', async () => {
    const markup = `
<html>
<body>
<h1>Page Title</h1>
<p>Content</p>
</body>
</html>
`;
    const title = await titleOf(markup);

    expect(title).toBe('Page Title');
  });

  it('should fallback to title tag', async () => {
    // No heading in the body, so only the <title> element can supply a title.
    const markup = `
<html>
<head><title>HTML Title</title></head>
<body><p>Content without heading</p></body>
</html>
`;
    const title = await titleOf(markup);

    expect(title).toBe('HTML Title');
  });
});
/** Checks constructor options passed to `MarkdownExtractor`. */
describe('options', () => {
  it('should respect max length option', async () => {
    // The same limit is handed to the extractor and used as the assertion bound.
    const maxLength = 50;
    // The paragraph text alone is longer than the limit.
    const markup = `
<html>
<body>
<p>This is a very long piece of content that should be truncated when the maxLength option is used</p>
</body>
</html>
`;
    const limitedExtractor = new MarkdownExtractor({ maxLength });
    const { content } = await limitedExtractor.extract(markup, url);

    expect(content.length).toBeLessThanOrEqual(maxLength);
  });
});
});