/**
 * Shape of the value that `BaseExtractor.extract` resolves to.
 *
 * Only `title` and `content` are required; every other field is optional.
 * NOTE(review): the per-field descriptions below are inferred from the field
 * names — nothing in this file populates them; confirm against the concrete
 * extractors.
 */
export interface ExtractorResult {
/** Presumably the title of the extracted document. */
title: string;
/** Presumably the main extracted body content. */
content: string;
/** Optional; presumably a short summary of `content`. */
excerpt?: string;
/** Optional; presumably the author/byline. */
author?: string;
/** Optional; presumably a publication timestamp — string format not specified here. */
publishedTime?: string;
/** Optional; presumably a language identifier — format not specified here. */
language?: string;
}
/**
 * Options accepted by the `BaseExtractor` constructor; all fields are optional.
 *
 * NOTE(review): none of these fields is read by the code visible in this file,
 * so the descriptions below are inferred from the names only — confirm against
 * the concrete extractors that consume them.
 */
export interface ExtractorOptions {
/** Presumably: keep the source formatting in the extracted content. */
preserveFormatting?: boolean;
/** Presumably: include images in the extracted content. */
includeImages?: boolean;
/** Presumably: include links in the extracted content. */
includeLinks?: boolean;
/** Presumably: maximum length of the extracted content — unit not specified here. */
maxLength?: number;
}
/**
 * Base class for content extractors.
 *
 * Subclasses implement {@link BaseExtractor.extract}. The protected helpers
 * provide whitespace clean-up, length-limited truncation and excerpt building
 * for use by those implementations.
 */
export abstract class BaseExtractor {
  /**
   * @param options - Extractor configuration, stored as `this.options` for
   *   subclasses. Defaults to an empty object.
   */
  constructor(protected options: ExtractorOptions = {}) {}

  /**
   * Extracts structured content from a document.
   *
   * @param html - Markup to extract from.
   * @param url - URL associated with `html`.
   * @returns A promise resolving to the extraction result.
   */
  abstract extract(html: string, url: string): Promise<ExtractorResult>;

  /**
   * Normalizes whitespace in `text`.
   *
   * Runs of horizontal whitespace (spaces, tabs, `\r`, non-breaking spaces, ...)
   * collapse to a single space; runs of line breaks, together with any spaces
   * around them, collapse to a single `\n`; leading and trailing whitespace is
   * removed.
   *
   * @param text - Raw text.
   * @returns The normalized text.
   */
  protected sanitizeText(text: string): string {
    return (
      text
        // `[^\S\n]` is "whitespace except newline". Collapsing `\s+` here
        // would turn every newline into a space and make the next step a no-op.
        .replace(/[^\S\n]+/g, ' ')
        // After the step above, the only whitespace around a line break is a
        // single space or further line breaks; fold the whole run into one `\n`.
        .replace(/ ?\n[ \n]*/g, '\n')
        .trim()
    );
  }

  /**
   * Shortens `content` to at most `maxLength` UTF-16 code units, appending
   * `...` when text was removed.
   *
   * The cut is moved back to the last space when that space lies beyond 80% of
   * `maxLength`, so a word is not chopped when little text is lost by doing so.
   *
   * @param content - Text to shorten.
   * @param maxLength - Maximum length of the result. When omitted, zero,
   *   negative or `NaN`, no limit is applied.
   * @returns `content` unchanged if it already fits, otherwise the shortened
   *   text. When `maxLength` is smaller than the ellipsis itself, the result is
   *   a plain hard cut without ellipsis so the limit is still honoured.
   */
  protected truncateContent(content: string, maxLength?: number): string {
    // `!(maxLength > 0)` also rejects NaN, which `maxLength <= 0` would not.
    if (maxLength === undefined || !(maxLength > 0) || content.length <= maxLength) {
      return content;
    }

    const ellipsis = '...';
    if (maxLength < ellipsis.length) {
      // A negative end index would make `slice` count from the end of the
      // string and return more than `maxLength` characters.
      return content.slice(0, maxLength);
    }

    let cut = maxLength - ellipsis.length;
    // Do not leave a dangling high surrogate (half of an astral character).
    const lastUnit = content.charCodeAt(cut - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      cut -= 1;
    }

    const truncated = content.slice(0, cut);
    const lastSpace = truncated.lastIndexOf(' ');
    if (lastSpace > maxLength * 0.8) {
      return truncated.slice(0, lastSpace) + ellipsis;
    }
    return truncated + ellipsis;
  }

  /**
   * Builds an excerpt from the leading whole sentences of `content`.
   *
   * Sentences are delimited by `.`, `!` or `?` followed by whitespace. As many
   * leading sentences as fit within `maxLength` are joined with single spaces,
   * keeping their terminating punctuation.
   *
   * @param content - Text to summarize.
   * @param maxLength - Maximum excerpt length in UTF-16 code units (default 200).
   * @returns The excerpt. If not even the first sentence fits, the first
   *   `maxLength` code units of `content`, trimmed.
   */
  protected extractExcerpt(content: string, maxLength = 200): string {
    // The lookbehind splits on the whitespace *after* the terminator, so the
    // punctuation stays attached to its sentence instead of being consumed.
    const sentences = content.split(/(?<=[.!?])\s+/);

    let excerpt = '';
    for (const raw of sentences) {
      const sentence = raw.trim();
      if (!sentence) {
        continue;
      }
      const candidate = excerpt ? `${excerpt} ${sentence}` : sentence;
      if (candidate.length > maxLength) {
        break;
      }
      excerpt = candidate;
    }

    return excerpt || content.slice(0, maxLength).trim();
  }
}