content-extractor.ts•2.5 kB
import { load } from 'cheerio';
import { MAIN_CONTENT_SELECTORS, NOISE_SELECTORS, JS_REQUIRED_PATTERNS } from './selectors.js';
/**
 * Stateless helpers that pull readable text, links, and "needs a real
 * browser" hints out of a document already parsed with cheerio.
 */
export class ContentExtractor {
  /**
   * Extracts the page's readable text.
   *
   * Lookup order:
   *   1. `selector`, when given and it yields non-empty text;
   *   2. the first entry of `MAIN_CONTENT_SELECTORS` that yields non-empty
   *      text once `NOISE_SELECTORS` elements are stripped;
   *   3. the whole `<body>` with `NOISE_SELECTORS` elements stripped.
   *
   * Noise stripping always happens on a clone, so the document behind `$`
   * is left untouched and can safely be reused (e.g. by `extractLinks`).
   *
   * @param $ - Cheerio document to read from.
   * @param selector - Optional caller-supplied selector tried first.
   * @returns The text with runs of spaces/tabs collapsed, each line trimmed,
   *          and blank lines removed; `''` when nothing was found.
   */
  static extractMainContent($: ReturnType<typeof load>, selector?: string): string {
    const noiseSelector = NOISE_SELECTORS.join(',');
    let content = '';

    if (selector) {
      content = $(selector).text().trim();
    }

    if (!content) {
      // Probe the well-known "main content" containers in priority order.
      for (const mainSelector of MAIN_CONTENT_SELECTORS) {
        const candidate = $(mainSelector);
        if (candidate.length === 0) continue;

        // Strip noise from a copy: removing it from the live document would
        // silently drop those nodes for every later consumer of `$`.
        const cleaned = candidate.clone();
        cleaned.find(noiseSelector).remove();
        content = cleaned.text().trim();
        if (content) break;
      }
    }

    if (!content) {
      // Last resort: everything in <body> minus the noise elements.
      const body = $('body').clone();
      body.find(noiseSelector).remove();
      content = body.text().trim();
    }

    // Normalise whitespace: one entry per non-empty line, single spaces inside.
    return content
      .split(/[\n\r]+/)
      .map(line => line.replace(/[ \t]+/g, ' ').trim())
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Collects the document's hyperlinks as unique `[text] absoluteUrl` strings,
   * in document order.
   *
   * Anchors without visible text, in-page fragments (`#...`) and
   * `javascript:` pseudo-links are skipped. Hrefs that cannot be resolved to
   * an absolute URL are skipped as well.
   *
   * Relative hrefs are resolved against the document's `<base href>` when it
   * is usable (itself resolved against `baseUrl`), otherwise against
   * `baseUrl`. Without either, only absolute hrefs can be returned.
   *
   * @param $ - Cheerio document to read from.
   * @param baseUrl - Optional URL the document was fetched from.
   * @returns De-duplicated link descriptions.
   */
  static extractLinks($: ReturnType<typeof load>, baseUrl?: string): string[] {
    // Work out the resolution base once instead of once per anchor.
    const documentUrl = baseUrl ? ContentExtractor.tryResolveUrl(baseUrl) : undefined;
    const baseHref = $('base').attr('href');
    const base =
      (baseHref ? ContentExtractor.tryResolveUrl(baseHref, documentUrl) : undefined) ??
      documentUrl;

    const links: string[] = [];
    $('a[href]').each((_, element) => {
      const $link = $(element);
      const href = $link.attr('href')?.trim();
      // Collapse inner whitespace so every entry stays on a single line.
      const text = $link.text().replace(/\s+/g, ' ').trim();

      if (!href || !text || href.startsWith('#') || /^javascript:/i.test(href)) {
        return;
      }

      const fullUrl = ContentExtractor.tryResolveUrl(href, base);
      if (fullUrl) {
        links.push(`[${text}] ${fullUrl}`);
      }
    });

    return [...new Set(links)];
  }

  /**
   * Heuristically decides whether the page probably needs JavaScript
   * rendering to show its real content.
   *
   * Returns `true` when any of the following holds:
   *   - `url` matches one of `JS_REQUIRED_PATTERNS`;
   *   - the trimmed `<body>` text is shorter than 100 characters;
   *   - the `<body>` text contains "loading" (case-insensitive);
   *   - an element with id `root` or `app` exists.
   *
   * @param url - URL of the page.
   * @param html - Raw HTML of the page. Not inspected by the current
   *               heuristics; kept so existing call sites keep working.
   * @param $ - Cheerio document parsed from the page.
   */
  static mayNeedJavaScript(url: string, html: string, $: ReturnType<typeof load>): boolean {
    if (JS_REQUIRED_PATTERNS.some(pattern => pattern.test(url))) {
      return true;
    }

    // Read the body text once; both text-based heuristics share it.
    const bodyText = $('body').text();
    const hasNoContent = bodyText.trim().length < 100;
    const hasLoadingIndicator = bodyText.toLowerCase().includes('loading');
    const hasAppRoot = $('#root').length > 0 || $('#app').length > 0;

    return hasNoContent || hasLoadingIndicator || hasAppRoot;
  }

  /** Resolves `href` against `base`; returns `undefined` instead of throwing. */
  private static tryResolveUrl(href: string, base?: string): string | undefined {
    try {
      return new URL(href, base).toString();
    } catch {
      return undefined;
    }
  }
}