import TurndownService from 'turndown';
import { BaseExtractor, ExtractorResult, ExtractorOptions } from './base.js';
import { logger } from '@/utils/logger.js';
/**
 * Options accepted by `MarkdownExtractor`.
 *
 * Extends the shared `ExtractorOptions` with formatting choices that the
 * extractor's constructor forwards to `TurndownService`. Every field is
 * optional; the defaults noted below are the ones the constructor applies
 * when a field is omitted.
 */
export interface MarkdownOptions extends ExtractorOptions {
/** Heading syntax passed to Turndown. Defaults to `'atx'`. */
headingStyle?: 'setext' | 'atx';
/** Marker character used for bullet lists. Defaults to `'-'`. */
bulletListMarker?: '-' | '*' | '+';
/**
 * Code block syntax passed to Turndown. Defaults to `'fenced'`.
 * NOTE(review): the extractor registers its own `pre` rule that always emits
 * fenced blocks, so `'indented'` may have no effect — confirm.
 */
codeBlockStyle?: 'indented' | 'fenced';
/** Delimiter used for emphasis. Defaults to `'_'`. */
emDelimiter?: '_' | '*';
/** Delimiter used for strong emphasis. Defaults to `'**'`. */
strongDelimiter?: '**' | '__';
}
/**
 * Extractor that converts an HTML document to Markdown with Turndown and
 * derives a title, a plain-text excerpt and the document language from it.
 */
export class MarkdownExtractor extends BaseExtractor {
  private readonly turndown: TurndownService;

  /**
   * @param options Formatting options forwarded to Turndown; omitted fields
   *   fall back to atx headings, `-` bullets, fenced code, `_` emphasis and
   *   `**` strong emphasis.
   */
  constructor(options: MarkdownOptions = {}) {
    super(options);
    this.turndown = new TurndownService({
      headingStyle: options.headingStyle || 'atx',
      bulletListMarker: options.bulletListMarker || '-',
      codeBlockStyle: options.codeBlockStyle || 'fenced',
      emDelimiter: options.emDelimiter || '_',
      strongDelimiter: options.strongDelimiter || '**',
    });
    this.configureTurndown();
  }

  /**
   * Converts `html` to Markdown and builds the extraction result.
   *
   * @param html Raw HTML of the page.
   * @param url Source URL; used only in log messages.
   * @returns Title, Markdown content, plain-text excerpt and language.
   * @throws Re-throws any error raised during conversion after logging it.
   */
  async extract(html: string, url: string): Promise<ExtractorResult> {
    try {
      // Clean HTML before conversion
      const cleanedHtml = this.cleanHtml(html);

      // Convert to markdown
      let markdown = this.turndown.turndown(cleanedHtml);

      // Post-process markdown
      markdown = this.postProcessMarkdown(markdown);

      if (this.options.maxLength) {
        markdown = this.truncateContent(markdown, this.options.maxLength);
      }

      // Extract title from markdown (first heading or from HTML)
      const title =
        this.extractTitleFromMarkdown(markdown) || this.extractTitleFromHtml(html);

      const result: ExtractorResult = {
        title,
        content: markdown,
        excerpt: this.extractExcerpt(this.markdownToPlainText(markdown)),
        language: this.extractLanguageFromHtml(html),
      };

      logger.debug(`Markdown extracted ${markdown.length} characters from ${url}`);
      return result;
    } catch (error) {
      logger.error(`Markdown extraction failed for ${url}:`, error);
      throw error;
    }
  }

  /** Registers the custom Turndown rules and the remove/keep element lists. */
  private configureTurndown(): void {
    // Fenced code blocks with a language hint.
    this.turndown.addRule('codeBlock', {
      filter: ['pre'],
      replacement: (content, node) => {
        const language = this.getCodeLanguage(node as Element);
        // Use the raw text of the <pre>, not the converted children: rules
        // added with addRule run before Turndown's built-ins, so a nested
        // <code> is already wrapped in backticks by the inline rule below and
        // embedding `content` would put stray backticks inside the fence.
        const code = (node.textContent ?? content).replace(/\r?\n$/, '');
        if (code.trim() === '') {
          // Never emit an empty fence.
          return '';
        }
        return `\n\n\`\`\`${language}\n${code}\n\`\`\`\n\n`;
      },
    });

    // Handle inline code
    this.turndown.addRule('inlineCode', {
      filter: ['code'],
      replacement: (content) => `\`${content}\``,
    });

    // Remove unwanted elements
    this.turndown.remove([
      'script', 'style', 'noscript', 'iframe', 'object', 'embed',
      'nav', 'header', 'footer', 'aside',
    ]);

    // Keep these elements as raw HTML in the output
    this.turndown.keep(['sub', 'sup']);
  }

  /**
   * Strips script/style blocks and page-chrome elements (nav, header, footer,
   * aside) from the raw HTML before it is handed to Turndown.
   */
  private cleanHtml(html: string): string {
    return html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
      .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
      .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
      .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '')
      // NOTE(review): the three replacements below delete only the matching
      // `class="..."` attribute text; the element and its content are kept.
      // Confirm whether removing the elements themselves was intended.
      .replace(/class="[^"]*nav[^"]*"/gi, '')
      .replace(/class="[^"]*menu[^"]*"/gi, '')
      .replace(/class="[^"]*sidebar[^"]*"/gi, '');
  }

  /**
   * Normalizes whitespace in the generated Markdown and drops empty `-` list
   * items.
   *
   * Empty code blocks are suppressed where they are generated (the `codeBlock`
   * rule) rather than here: a textual match for "fence, blank, fence" cannot
   * tell an empty block from the closing fence of one block followed by the
   * opening fence of the next, and removing the latter merges the two blocks.
   */
  private postProcessMarkdown(markdown: string): string {
    return markdown
      // Collapse runs of blank lines to a single blank line
      .replace(/\n\s*\n\s*\n/g, '\n\n')
      // Empty out whitespace-only lines
      .replace(/^[ \t]+$/gm, '')
      // Drop list items that contain nothing but the marker
      .replace(/^(\s*)-\s*$/gm, '')
      .trim();
  }

  /**
   * Returns the text of the first Markdown heading (atx, then setext), or
   * `undefined` when none is found. Fenced code blocks are ignored so that a
   * `# comment` line inside code is not mistaken for a heading.
   */
  private extractTitleFromMarkdown(markdown: string): string | undefined {
    const prose = markdown.replace(/```[\s\S]*?```/g, '');

    // Look for first atx heading
    const atxMatch = prose.match(/^#{1,6} +(.+)$/m);
    if (atxMatch?.[1]) {
      return atxMatch[1].trim();
    }

    // Look for setext-style heading
    const setextMatch = prose.match(/^(.+)\n[=-]+$/m);
    if (setextMatch?.[1]) {
      return setextMatch[1].trim();
    }

    return undefined;
  }

  /**
   * Falls back to the HTML for a title: the `<title>` element, then the first
   * `<h1>` (with any nested tags stripped), then the literal `'Untitled'`.
   * Candidates that are empty after trimming are skipped.
   */
  private extractTitleFromHtml(html: string): string {
    const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
    const pageTitle = titleMatch?.[1]?.trim();
    if (pageTitle) {
      return pageTitle;
    }

    const h1Match = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
    const heading = h1Match?.[1]
      ?.replace(/<[^>]+>/g, '') // drop nested markup such as <span> or <a>
      .replace(/\s+/g, ' ')
      .trim();
    if (heading) {
      return heading;
    }

    return 'Untitled';
  }

  /**
   * Reads the `lang` attribute of the `<html>` element, if present. The
   * attribute name must be preceded by whitespace so that `xml:lang` or
   * `data-lang` are not picked up instead.
   */
  private extractLanguageFromHtml(html: string): string | undefined {
    const langMatch = html.match(/<html\b[^>]*\slang=["']([^"']+)["']/i);
    return langMatch?.[1];
  }

  /**
   * Detects the code language for a `<pre>` element.
   *
   * Checks the element itself first and then a nested `<code>` element; for
   * each, a `lang-*` / `language-*` class wins over the `data-lang` /
   * `data-language` attributes.
   *
   * @returns The language identifier, or `''` when none is found.
   */
  private getCodeLanguage(element: Element): string {
    const nestedCode = element.querySelector('code');
    const candidates = nestedCode ? [element, nestedCode] : [element];

    for (const candidate of candidates) {
      // Try to detect language from class names; allow `+`, `#` and `-` so
      // names like `c++`, `c#` and `objective-c` are not cut short.
      const className = candidate.className || '';
      const langMatch = className.match(/(?:lang|language)-([\w+#-]+)/);
      if (langMatch?.[1]) {
        return langMatch[1];
      }

      // Check for data attributes
      const dataLang =
        candidate.getAttribute('data-lang') ||
        candidate.getAttribute('data-language');
      if (dataLang) {
        return dataLang;
      }
    }

    return '';
  }

  /**
   * Reduces Markdown to a single line of plain text for excerpt generation.
   *
   * Order matters: fenced blocks are removed before inline code (otherwise the
   * inline pattern eats the fence backticks and the code leaks into the
   * result), and list markers are removed before `*` emphasis (otherwise two
   * `*` bullets are paired up as emphasis delimiters).
   */
  private markdownToPlainText(markdown: string): string {
    return markdown
      .replace(/```[\s\S]*?```/g, '') // Remove code blocks
      .replace(/^#{1,6}\s+/gm, '') // Remove headings
      .replace(/^\s*[-*+]\s+/gm, '') // Remove list markers
      .replace(/^\s*\d+\.\s+/gm, '') // Remove numbered list markers
      .replace(/\*\*([^*]+)\*\*/g, '$1') // Remove bold (**)
      .replace(/(^|[^\\\w])__([^_]+)__(?!\w)/g, '$1$2') // Remove bold (__)
      .replace(/\*([^*]+)\*/g, '$1') // Remove italic (*)
      .replace(/(^|[^\\\w])_([^_]+)_(?!\w)/g, '$1$2') // Remove italic (_), not snake_case
      .replace(/`([^`]+)`/g, '$1') // Remove inline code
      .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1') // Replace images with alt text
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Remove links
      .replace(/\n+/g, ' ') // Normalize whitespace
      .trim();
  }
}