/**
* web_search 工具实现
* 使用 SearXNG + Creeper + LLM 进行智能搜索和总结
*/
import { z } from 'zod';
import type { SearXNGService } from '../services/searxng.js';
import type { CreeperService } from '../services/creeper.js';
import type { SummarizerService } from '../services/summarizer.js';
import type { TopicCategory } from '../types/index.js';
import { ResultFilter } from '../services/filter.js';
import { FilterLlmService } from '../services/filter-llm.js';
import { mapReduceSummarize } from '../utils/map-reduce.js';
import { logger } from '../utils/logger.js';
/**
 * Zod schema describing the arguments accepted by the web_search tool.
 *
 * Only `query` is required; every other field is either optional or has a default.
 */
export const webSearchInputSchema = z.object({
// Search text. The checks run in chain order: reject the empty string, trim
// surrounding whitespace, then reject values that are empty after trimming.
query: z.string()
.min(1, '搜索查询不能为空')
.transform(val => val.trim())
.refine(val => val.length > 0, '搜索查询不能只包含空格字符')
.describe('搜索查询'),
// Number of results requested by the caller: 1 to 20, defaulting to 10.
max_results: z.number().min(1).max(20).default(10).describe('最大结果数量'),
// Search language; falls back to 'zh' when omitted.
language: z.string().optional().default('zh').describe('搜索语言'),
// Optional time window; when given it must be one of the four listed values.
time_range: z.enum(['day', 'week', 'month', 'year']).optional().describe('时间范围'),
// Optional domain lists; executeWebSearch hands them to ResultFilter as
// domainWhitelist and domainBlacklist respectively.
include_domains: z.array(z.string()).optional().describe('只搜索这些域名'),
exclude_domains: z.array(z.string()).optional().describe('排除这些域名'),
// Forwarded to creeperService.crawl as its second argument; defaults to false.
save_to_file: z.boolean().default(false).describe('是否将爬取的内容保存到本地文件'),
});
/** Type inferred from webSearchInputSchema via z.infer. */
export type WebSearchInput = z.infer<typeof webSearchInputSchema>;
import { getSearchSummaryPromptByTopic, getSingleSummaryPromptByTopic } from '../prompts/summary/index.js';
/**
 * Runs the web_search pipeline: SearXNG search, rule-based filtering, optional
 * LLM filtering, crawling of the kept URLs, summarization, and output formatting.
 *
 * Errors thrown inside the pipeline are caught, logged, and reported through
 * the returned string rather than propagated.
 *
 * @param input - Tool arguments (see `webSearchInputSchema`).
 * @param searxngService - Service used to run the search.
 * @param creeperService - Service used to crawl the selected URLs.
 * @param summarizerService - Service used for single-pass and Map-Reduce summaries.
 * @param config - Settings for the rule filter (`filter`), the LLM filter
 *   (`filterLlm`) and Map-Reduce (`mapReduce`). `maxContentLength` is not read
 *   by this function.
 * @returns The formatted summary with source lists, or a user-facing message
 *   when there is nothing to summarize or the pipeline failed.
 */
export async function executeWebSearch(
  input: WebSearchInput,
  searxngService: SearXNGService,
  creeperService: CreeperService,
  summarizerService: SummarizerService,
  config: {
    filter: any;
    filterLlm: any;
    mapReduce: any;
    maxContentLength: number;
  }
): Promise<string> {
  const startedAt = Date.now();
  const { query, max_results: requested, language, time_range: timeRange } = input;

  logger.info('Executing web_search', {
    query,
    max_results: requested,
    language,
    time_range: timeRange
  });

  try {
    // Step 1: search, asking for twice the requested amount so that the
    // filtering stages have spare candidates to discard.
    const rawHits = await searxngService.search(query, {
      language,
      timeRange,
      maxResults: requested * 2,
    });
    if (rawHits.length === 0) {
      return `未找到与 "${query}" 相关的搜索结果。`;
    }
    logger.info('SearXNG search completed', {
      initialResultCount: rawHits.length,
      requestedResults: requested
    });

    // Step 2: rule-based filtering. config.filter is spread last, so any key it
    // defines takes precedence over the per-request values listed before it.
    const ruleFilter = new ResultFilter({
      maxResults: requested,
      domainWhitelist: input.include_domains,
      domainBlacklist: input.exclude_domains,
      ...config.filter
    });
    const ruleFiltered = ruleFilter.filter(rawHits, query);
    logger.info('Rule filtering completed', {
      filteredResultCount: ruleFiltered.length,
      whitelistCount: input.include_domains?.length ?? 0,
      blacklistCount: input.exclude_domains?.length ?? 0
    });

    // Step 2.5: optional LLM filtering, which also reports the detected topic.
    // When it is skipped the topic stays 'other' and the rule-filtered list is used.
    let topic: TopicCategory = 'other';
    let finalResults = ruleFiltered;
    if (config.filterLlm.enabled && ruleFiltered.length > 0) {
      const llmFilter = new FilterLlmService(config.filterLlm);
      const llmOutcome = await ruleFilter.filterWithLlm(
        ruleFiltered,
        query,
        (q, candidates) => llmFilter.filter(q, candidates)
      );
      finalResults = llmOutcome.filtered;
      topic = llmOutcome.topic;
      logger.info('LLM filtering completed', {
        topic,
        keptCount: finalResults.length,
        filteredCount: ruleFiltered.length - finalResults.length
      });
    }
    if (finalResults.length === 0) {
      return '搜索结果已被过滤器全部排除。请尝试调整域名过滤设置。';
    }

    // Step 3: crawl the surviving URLs and split the pages by outcome.
    const urls = finalResults.map(hit => hit.url);
    const crawledPages = await creeperService.crawl(urls, input.save_to_file, query);
    const successPages = crawledPages.filter(page => page.success);
    const failedPages = crawledPages.filter(page => !page.success);
    logger.info('Crawling completed', {
      totalUrls: urls.length,
      successCount: successPages.length,
      failureCount: failedPages.length
    });
    if (successPages.length === 0) {
      return '所有网页爬取失败。请检查 Creeper 配置或网页可访问性。';
    }

    // Step 4: measure the total content size; it selects the summarization strategy.
    const totalLength = successPages.reduce((acc, page) => acc + page.content.length, 0);
    logger.info('Content size analysis', {
      totalLength,
      avgLength: Math.round(totalLength / successPages.length),
      threshold: config.mapReduce.threshold
    });

    // Step 5: content below the threshold is summarized in one call with a
    // topic-specific prompt; anything else goes through Map-Reduce.
    logger.info('Using topic-driven summarization', { topic });
    const fitsSinglePass = totalLength < config.mapReduce.threshold;
    let summary: string;
    if (fitsSinglePass) {
      logger.info('Using single summarization');
      const singlePass = await summarizerService.summarize({
        content: formatForSingleSummary(successPages),
        prompt: getSearchSummaryPromptByTopic(topic)
      });
      summary = singlePass.summary;
    } else {
      logger.info('Using Map-Reduce summarization');
      const { chunkSize, maxConcurrency } = config.mapReduce;
      summary = await mapReduceSummarize(successPages, summarizerService, {
        chunkSize,
        maxConcurrency
      });
    }

    // Step 6: render the final text and log the run statistics.
    const output = formatOutput(summary, successPages, failedPages);
    const elapsedMs = Date.now() - startedAt;
    logger.info('Web search completed', {
      query,
      duration: `${elapsedMs}ms`,
      topic,
      initialResults: rawHits.length,
      filteredResults: ruleFiltered.length,
      finalResults: finalResults.length,
      pagesCrawled: successPages.length
    });
    return output;
  } catch (err) {
    // Failures are reported to the caller as text instead of being rethrown.
    let reason = 'Unknown error';
    if (err instanceof Error) {
      reason = err.message;
    }
    logger.error('Web search failed', {
      query,
      error: reason
    });
    return `搜索执行失败: ${reason}`;
  }
}
/**
 * Merges crawled pages into one text block for a single summarization call.
 *
 * Each page becomes a section made of a numbered source label with the title,
 * a `URL:` line, a blank line, and the page content. Sections are separated
 * by a `---` line surrounded by blank lines.
 *
 * @param pages - Pages to merge; numbering follows array order, starting at 1.
 * @returns The combined text, or an empty string when `pages` is empty.
 */
function formatForSingleSummary(pages: Array<{
  url: string;
  title: string;
  content: string;
}>): string {
  const sectionSeparator = '\n\n---\n\n';
  const sections: string[] = [];

  pages.forEach(({ url, title, content }, index) => {
    const heading = `【来源${index + 1}】${title}`;
    const body = `URL: ${url}\n\n${content}`;
    sections.push(`${heading}\n${body}`);
  });

  return sections.join(sectionSeparator);
}
/**
 * Renders the final tool output as Markdown-style text.
 *
 * The result always starts with the summary section. A numbered source list
 * follows when at least one page was crawled successfully, and a list of
 * failed URLs (with the error text when one is present) follows when at
 * least one page failed.
 *
 * @param summary - Summary text placed under the first heading.
 * @param successPages - Pages listed as sources, numbered from 1.
 * @param failedPages - Pages that could not be fetched; an absent or empty
 *   `error` is omitted from the entry.
 * @returns The assembled output string.
 */
function formatOutput(
  summary: string,
  successPages: Array<{url: string; title: string}>,
  failedPages: Array<{url: string; error?: string}>
): string {
  // Pieces are collected in order and joined once at the end.
  const pieces: string[] = [`## 搜索结果总结\n\n${summary}`];

  if (successPages.length > 0) {
    const sourceEntries = successPages.map(
      ({ title, url }, index) => `[${index + 1}] **${title}**\n ${url}\n\n`
    );
    pieces.push('\n\n## 信息来源\n\n');
    pieces.push(sourceEntries.join(''));
  }

  if (failedPages.length > 0) {
    const failureEntries = failedPages.map(({ url, error }, index) => {
      const detail = error ? ` - ${error}` : '';
      return `[失败${index + 1}] ${url}${detail}\n\n`;
    });
    pieces.push('\n## 获取失败的网页\n\n');
    pieces.push(failureEntries.join(''));
  }

  return pieces.join('');
}