/**
* Creeper 爬虫服务
* 通过子进程调用 Python Creeper 进行网页爬取
*/
import { spawn, ChildProcess } from 'child_process';
import { join } from 'path';

import { logger } from '../utils/logger.js';
import { FileSaver } from '../utils/file-saver.js';
export interface CreeperConfig {
  pythonPath?: string; // Python interpreter path (default: "python")
  scriptPath: string; // Directory containing creeper.py
  concurrency?: number; // Number of concurrent crawls (default: 5)
  timeout?: number; // Overall subprocess timeout in ms (default: 60 s)
}
export interface CreeperResult {
  url: string;
  title: string;
  summary: string; // Page description
  content: string; // Page content (Markdown)
  success: boolean;
  error?: string; // Failure detail; only set when success is false
}
export class CreeperService {
  // Config with every default resolved up front, so the rest of the class
  // never needs non-null assertions (`!`).
  private config: Required<CreeperConfig>;
  // Full path to creeper.py inside config.scriptPath.
  private scriptPath: string;
  // Optional batch file saver; only present when fileSaveConfig was given.
  private fileSaver?: FileSaver;

  /**
   * @param config         Creeper configuration (scriptPath is required).
   * @param fileSaveConfig Optional FileSaver configuration; when omitted,
   *                       crawled content is never persisted to disk.
   *                       NOTE(review): typed `any` because FileSaver's config
   *                       shape is declared elsewhere — confirm against it.
   */
  constructor(config: CreeperConfig, fileSaveConfig?: any) {
    // Merge defaults with `??` per field rather than object spread: a caller
    // passing an explicitly-undefined field (e.g. { pythonPath: undefined })
    // would otherwise clobber the default with undefined.
    this.config = {
      pythonPath: config.pythonPath ?? 'python',
      scriptPath: config.scriptPath,
      concurrency: config.concurrency ?? 5,
      timeout: config.timeout ?? 60000, // 60 s overall timeout
    };
    // scriptPath is a directory; the Python entry point lives inside it.
    this.scriptPath = join(this.config.scriptPath, 'creeper.py');
    if (fileSaveConfig) {
      this.fileSaver = new FileSaver(fileSaveConfig);
    }
    logger.info('Creeper service initialized', {
      scriptPath: this.scriptPath,
      pythonPath: this.config.pythonPath,
      concurrency: this.config.concurrency,
      timeout: this.config.timeout,
      fileSaveEnabled: !!this.fileSaver
    });
  }

  /**
   * Crawl a batch of URLs through the Python creeper subprocess.
   *
   * Never rejects: on any subprocess failure, every URL is reported back as a
   * failed CreeperResult instead of throwing.
   *
   * @param urls         URLs to crawl; an empty list short-circuits to [].
   * @param saveToFile   When true (and a FileSaver plus queryKeyword exist),
   *                     successful results are persisted asynchronously.
   * @param queryKeyword Query keyword used to name the saved file.
   * @returns One result per crawled URL.
   */
  async crawl(
    urls: string[],
    saveToFile: boolean = false,
    queryKeyword?: string
  ): Promise<CreeperResult[]> {
    const startTime = Date.now();
    if (urls.length === 0) {
      return [];
    }
    try {
      logger.info('Starting Creeper batch crawl', {
        urlCount: urls.length,
        concurrency: this.config.concurrency,
        saveToFile,
        queryKeyword
      });
      // NOTE(review): URLs are joined with ','. A URL containing a literal
      // comma (common in query strings) would be split incorrectly by the
      // Python side — confirm creeper.py's --urls contract.
      const args: string[] = [
        this.scriptPath,
        '--urls',
        urls.join(','),
        '-c',
        this.config.concurrency.toString()
      ];
      const results = await this.executeCreeper(args);
      // Persist successful results without blocking the caller.
      if (saveToFile && this.fileSaver && queryKeyword) {
        this.saveResults(results, queryKeyword);
      }
      const successCount = results.filter(r => r.success).length;
      const duration = Date.now() - startTime;
      logger.info('Creeper batch crawl completed', {
        totalUrls: urls.length,
        successCount,
        failureCount: urls.length - successCount,
        duration: `${duration}ms`,
        contentSaved: saveToFile && !!this.fileSaver && !!queryKeyword
      });
      return results;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      logger.error('Creeper batch crawl failed', {
        urlCount: urls.length,
        error: errorMessage
      });
      // Degrade gracefully: report every URL as failed rather than throwing.
      return urls.map(url => ({
        url,
        title: '爬取失败',
        summary: '',
        content: `爬取失败: ${errorMessage}`,
        success: false,
        error: errorMessage
      }));
    }
  }

  /**
   * Fire-and-forget persistence of the successful results into a single file
   * named after the query keyword. Errors are logged, never thrown.
   */
  private saveResults(results: CreeperResult[], queryKeyword: string): void {
    if (!this.fileSaver) {
      return;
    }
    const successResults = results.filter(r => r.success);
    if (successResults.length === 0) {
      return;
    }
    logger.info('Saving crawled content to single file', {
      successCount: successResults.length,
      queryKeyword
    });
    const batchItems = successResults.map(result => ({
      url: result.url,
      title: result.title,
      content: result.content
    }));
    // `void` signals this promise is intentionally not awaited.
    void this.fileSaver.saveBatch(queryKeyword, batchItems)
      .then(saveResult => {
        if (saveResult.success) {
          logger.info('Content saved successfully to single file', {
            filePath: saveResult.filePath,
            itemCount: batchItems.length
          });
        } else {
          logger.error('Failed to save content', {
            error: saveResult.error
          });
        }
      })
      .catch(error => {
        logger.error('Failed to save content', {
          error: error instanceof Error ? error.message : 'Unknown error'
        });
      });
  }

  /**
   * Spawn creeper.py and parse its JSON output.
   *
   * @param args Command-line arguments (script path first).
   * @returns Parsed crawl results.
   * @throws Error on spawn failure, non-zero exit, overall timeout, or
   *         unparseable output.
   */
  private async executeCreeper(args: string[]): Promise<CreeperResult[]> {
    return new Promise((resolve, reject) => {
      let stdout = '';
      let stderr = '';
      const child: ChildProcess = spawn(this.config.pythonPath, args, {
        cwd: this.config.scriptPath, // run inside the script's directory
        stdio: ['ignore', 'pipe', 'pipe'],
        env: {
          ...process.env,
          // Optional knobs read by the Python side.
          CONCURRENCY: this.config.concurrency.toString(),
          REQUEST_TIMEOUT: '30', // per-request timeout: 30 seconds
        }
      });
      // Wall-clock timeout for the whole batch. Rejecting first means a later
      // 'close' settlement is a no-op (a promise settles only once).
      const timeoutId = setTimeout(() => {
        // child.kill() is safe even if the process already exited, unlike
        // process.kill(pid, ...) which could signal a recycled PID.
        child.kill('SIGTERM');
        reject(new Error(`Creeper 执行超时 (${this.config.timeout}ms)`));
      }, this.config.timeout);
      child.stdout?.on('data', (data) => {
        stdout += data.toString();
      });
      child.stderr?.on('data', (data) => {
        stderr += data.toString();
      });
      child.on('close', (code) => {
        clearTimeout(timeoutId);
        if (code !== 0 && code !== null) {
          // Abnormal exit: surface whatever output we captured.
          const errorMsg = stderr || stdout || `Process exited with code ${code}`;
          reject(new Error(`Creeper 执行失败: ${errorMsg}`));
          return;
        }
        try {
          // stdout may contain log noise around the JSON array — extract it.
          const jsonMatch = stdout.match(/\[[\s\S]*\]/);
          const rawData = JSON.parse(jsonMatch ? jsonMatch[0] : '[]');
          const results: CreeperResult[] = [];
          if (Array.isArray(rawData)) {
            for (const item of rawData) {
              // The Python side signals a per-URL failure by embedding
              // '获取失败' in the content field.
              const failed = !!item.content?.includes('获取失败');
              results.push({
                url: item.url || '',
                title: item.title || '',
                summary: item.summary || '',
                content: item.content || '',
                success: !!item.title && !failed,
                error: failed ? item.content : undefined
              });
            }
          }
          resolve(results);
        } catch (parseError) {
          logger.error('Failed to parse Creeper output', {
            stdout: stdout.substring(0, 500),
            stderr: stderr.substring(0, 500),
            error: parseError instanceof Error ? parseError.message : 'Unknown'
          });
          reject(new Error('解析 Creeper 输出失败'));
        }
      });
      // Spawn-level failure (e.g. python binary not found).
      child.on('error', (error) => {
        clearTimeout(timeoutId);
        reject(new Error(`启动 Creeper 失败: ${error.message}`));
      });
    });
  }

  /**
   * Smoke-test the Python environment by crawling a single known URL.
   * @returns true when the crawl produced at least one successful result.
   */
  async testEnvironment(): Promise<boolean> {
    try {
      const results = await this.executeCreeper([
        this.scriptPath,
        '--urls',
        'https://httpbin.org/json',
        '-c',
        '1'
      ]);
      const success = results.length > 0 && results[0].success;
      if (success) {
        logger.info('Creeper environment test passed');
      } else {
        logger.error('Creeper environment test failed', { results });
      }
      return success;
    } catch (error) {
      logger.error('Creeper environment test error', {
        error: error instanceof Error ? error.message : 'Unknown'
      });
      return false;
    }
  }
}