MCP Toolkit

index.ts•10.8 kB

import { ToolDefinition, ToolResponse } from '../types.js';
import { Config } from '../../config/config.js';
import { HttpsProxyAgent } from 'https-proxy-agent';
import fetch from 'node-fetch';
import { load } from 'cheerio';
import type { Browser } from 'puppeteer';
import * as fs from 'fs';
import * as path from 'path';

/**
 * URL patterns that hint a page may need JavaScript rendering.
 *
 * These are deliberately loose substring matches against the whole URL
 * (e.g. `/spa/i` also matches "space"), so they are a heuristic only.
 */
const JS_REQUIRED_PATTERNS = [
  /react/i,
  /vue/i,
  /angular/i,
  /\.js$/i,
  /javascript/i,
  /dynamic/i,
  /spa/i,
];

/** Selectors tried, in order, to locate the main content area of a page. */
const MAIN_CONTENT_SELECTORS = [
  'article',
  'main',
  '.main',
  '.content',
  '.main-content',
  '.article',
  '.post',
  '.entry',
  '#content',
  '[role="main"]',
];

/** Selectors for elements that are stripped before text extraction. */
const NOISE_SELECTORS = [
  'header',
  'footer',
  'nav',
  '.nav',
  '.navigation',
  '.menu',
  '.sidebar',
  '.ad',
  '.advertisement',
  '.social',
  '.share',
  '.related',
  '.recommended',
  '.comments',
  'script',
  'style',
  'iframe',
  'form',
];

/** All noise selectors combined into a single selector list. */
const NOISE_SELECTOR = NOISE_SELECTORS.join(',');

/** User-Agent header sent with plain HTTP fetches. */
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Navigation timeout handed to Puppeteer's `page.goto`, in milliseconds. */
const PAGE_LOAD_TIMEOUT_MS = 30000;

/** A body with fewer text characters than this is treated as "no content". */
const MIN_STATIC_BODY_LENGTH = 100;

/** Upper bound on the URL-derived part of a saved file name. */
const MAX_BASE_FILE_NAME_LENGTH = 100;

/** Builds an error-flagged tool response carrying a single text message. */
function errorResponse(text: string): ToolResponse {
  return { content: [{ type: 'text', text }], isError: true };
}

/**
 * Fetches web pages (plain HTTP or via Puppeteer), extracts their main text
 * and links, and optionally saves the result under `<cwd>/downloads`.
 */
class WebpageManager {
  private readonly downloadDir: string;

  /**
   * @param config Project configuration; only `config.network.proxy` is read here.
   */
  constructor(private readonly config: Config) {
    // Downloaded files are stored in a "downloads" directory under the current working directory.
    this.downloadDir = path.join(process.cwd(), 'downloads');
    if (!fs.existsSync(this.downloadDir)) {
      fs.mkdirSync(this.downloadDir, { recursive: true });
    }
  }

  /**
   * Fetches a URL with node-fetch, routing through the configured proxy if any.
   *
   * @returns The response body as text.
   * @throws Error when the response status is not OK.
   */
  private async fetchWithProxy(url: string): Promise<string> {
    // Kept loosely typed: the proxy agent's declared type is not guaranteed to
    // line up with node-fetch's `agent` option across library versions.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const options: any = {
      headers: { 'User-Agent': USER_AGENT },
    };
    if (this.config.network.proxy) {
      options.agent = new HttpsProxyAgent(this.config.network.proxy);
    }

    const response = await fetch(url, options);
    if (!response.ok) {
      throw new Error(`获取网页失败: ${response.status} ${response.statusText}`);
    }
    return response.text();
  }

  /**
   * Loads a URL in headless Chromium via Puppeteer and returns the rendered HTML.
   *
   * Puppeteer is imported lazily so it is only needed when rendering is requested.
   * The browser is closed exactly once, in `finally`, on both success and failure.
   *
   * @throws Error wrapping any failure (including a missing Puppeteer install).
   */
  private async fetchWithPuppeteer(url: string): Promise<string> {
    let browser: Browser | undefined;
    try {
      const { default: puppeteer } = await import('puppeteer');
      const launchArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
      if (this.config.network.proxy) {
        launchArgs.push(`--proxy-server=${this.config.network.proxy}`);
      }

      browser = await puppeteer.launch({ headless: true, args: launchArgs });
      const page = await browser.newPage();
      await page.goto(url, { waitUntil: 'networkidle0', timeout: PAGE_LOAD_TIMEOUT_MS });
      return await page.content();
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      throw new Error(`使用 Puppeteer 获取页面失败。如果需要 JavaScript 渲染，请先安装 Puppeteer: npm install puppeteer。错误: ${errorMessage}`);
    } finally {
      if (browser) {
        try {
          await browser.close();
        } catch (closeError) {
          // A failed close must not mask the page content or the original error.
          console.error('Failed to close Puppeteer browser:', closeError);
        }
      }
    }
  }

  /**
   * Extracts readable text from the document.
   *
   * Order of attempts: the caller's `selector` (if given), then each entry of
   * `MAIN_CONTENT_SELECTORS`, then the whole `<body>`. Noise elements are removed
   * from a clone, so the document passed in is never mutated.
   *
   * @returns Text with blank lines dropped and runs of spaces/tabs collapsed.
   */
  private extractMainContent($: ReturnType<typeof load>, selector?: string): string {
    let content = selector ? $(selector).text().trim() : '';

    if (!content) {
      for (const mainSelector of MAIN_CONTENT_SELECTORS) {
        const element = $(mainSelector);
        if (element.length === 0) {
          continue;
        }
        // Work on a clone so later link extraction still sees the full document.
        const cleaned = element.clone();
        cleaned.find(NOISE_SELECTOR).remove();
        content = cleaned.text().trim();
        if (content) {
          break;
        }
      }
    }

    // Last resort: the whole body with noise removed.
    if (!content) {
      const $body = $('body').clone();
      $body.find(NOISE_SELECTOR).remove();
      content = $body.text().trim();
    }

    return content
      .replace(/[\n\r]+/g, '\n') // collapse consecutive line breaks
      .replace(/[ \t]+/g, ' ') // collapse consecutive spaces/tabs
      .split('\n')
      .map(line => line.trim())
      .filter(Boolean) // drop empty lines
      .join('\n');
  }

  /**
   * Collects the page's links as formatted, de-duplicated entries.
   *
   * Relative hrefs are resolved against `<base href>` when present (itself
   * resolved against the page URL), otherwise against the page URL. Fragment-only
   * and `javascript:` links, and links without text, are skipped.
   *
   * @param pageUrl Absolute URL the document was fetched from.
   */
  private extractLinks($: ReturnType<typeof load>, pageUrl: string): string[] {
    let baseUrl = pageUrl;
    const baseHref = $('base').attr('href');
    if (baseHref) {
      try {
        baseUrl = new URL(baseHref, pageUrl).toString();
      } catch {
        // Unusable <base href>: fall back to the page URL.
      }
    }

    const links = new Set<string>();
    $('a[href]').each((_, element) => {
      const $link = $(element);
      const href = $link.attr('href')?.trim();
      const text = $link.text().trim();
      if (!href || !text || href.startsWith('#') || /^javascript:/i.test(href)) {
        return;
      }
      try {
        const fullUrl = new URL(href, baseUrl).toString();
        links.add(`• ${text}\n → ${fullUrl}`);
      } catch {
        // Ignore hrefs that cannot be parsed as URLs.
      }
    });
    return [...links];
  }

  /**
   * Heuristically decides whether a statically fetched page needs JS rendering.
   *
   * True when the URL matches `JS_REQUIRED_PATTERNS`, the body text is shorter
   * than `MIN_STATIC_BODY_LENGTH`, the body text contains "loading", or the page
   * has a `#root` / `#app` element.
   */
  private mayNeedJavaScript(url: string, $: ReturnType<typeof load>): boolean {
    if (JS_REQUIRED_PATTERNS.some(pattern => pattern.test(url))) {
      return true;
    }

    const bodyText = $('body').text();
    const hasNoContent = bodyText.trim().length < MIN_STATIC_BODY_LENGTH;
    const hasLoadingIndicator = bodyText.toLowerCase().includes('loading');
    const hasAppRoot = $('#root').length > 0 || $('#app').length > 0;
    return hasNoContent || hasLoadingIndicator || hasAppRoot;
  }

  /**
   * Writes content to a timestamped file in the download directory.
   *
   * The file name is derived from the last path segment of the URL ("webpage"
   * when there is none), with characters outside `[A-Za-z0-9_.%-]` replaced by
   * `_` and the length capped, so it is a valid name on common filesystems.
   *
   * @returns Absolute or cwd-relative path of the written file, as built by `path.join`.
   */
  private async saveContent(content: string, url: string, format: 'text' | 'html' = 'text'): Promise<string> {
    // `recursive: true` makes this a no-op when the directory already exists.
    await fs.promises.mkdir(this.downloadDir, { recursive: true });

    const urlObj = new URL(url);
    const rawName = path.basename(urlObj.pathname) || 'webpage';
    const baseFileName =
      rawName.replace(/[^\w.%-]+/g, '_').slice(0, MAX_BASE_FILE_NAME_LENGTH) || 'webpage';
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const extension = format === 'html' ? '.html' : '.txt';
    const filePath = path.join(this.downloadDir, `${baseFileName}-${timestamp}${extension}`);

    await fs.promises.writeFile(filePath, content, 'utf8');
    return filePath;
  }

  /**
   * Tool handler: reads a web page and returns its main content as text.
   *
   * - Only `http:` and `https:` URLs are accepted.
   * - `requiresJavaScript: true` renders with Puppeteer; `false` uses a plain fetch;
   *   when omitted, a plain fetch is tried first and Puppeteer is used if the page
   *   looks JS-driven (falling back to the plain result if Puppeteer fails).
   * - `save` writes either the formatted text or the raw HTML to disk.
   *
   * Never throws: every failure is returned as a response with `isError: true`.
   */
  async readWebpage(args: {
    url: string;
    requiresJavaScript?: boolean;
    extractLinks?: boolean;
    selector?: string;
    save?: boolean;
    saveFormat?: 'text' | 'html';
  }): Promise<ToolResponse> {
    try {
      const url = args.url?.trim();
      if (!url) {
        return errorResponse('URL不能为空');
      }

      // Reject non-web schemes (file:, chrome:, ...) before handing the URL to a browser.
      let parsedUrl: URL;
      try {
        parsedUrl = new URL(url);
      } catch {
        return errorResponse(`无效的 URL: ${url}`);
      }
      if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
        return errorResponse(`仅支持 http 和 https 协议的 URL: ${url}`);
      }

      let content = await (args.requiresJavaScript
        ? this.fetchWithPuppeteer(url)
        : this.fetchWithProxy(url));
      if (!content) {
        return errorResponse('无法获取页面内容');
      }

      let $ = load(content);

      // Auto-detect only when the caller did not state a preference.
      if (args.requiresJavaScript === undefined && this.mayNeedJavaScript(url, $)) {
        try {
          content = await this.fetchWithPuppeteer(url);
          $ = load(content);
        } catch (error) {
          // Rendering is best-effort here: keep the statically fetched content.
          console.error('Puppeteer 渲染失败，使用原始内容:', error);
        }
      }

      const mainContent = this.extractMainContent($, args.selector);
      const links = args.extractLinks ? this.extractLinks($, url) : [];

      const separator = '='.repeat(80);
      let resultText = `当前页面: ${url}\n${separator}\n\n`;
      if (links.length > 0) {
        resultText += '可用链接:\n';
        resultText += links.join('\n');
        resultText += '\n\n' + separator + '\n\n';
      }
      resultText += '页面内容:\n>>>\n\n';
      resultText += mainContent;
      resultText += '\n\n<<<\n' + separator;

      if (args.save) {
        const saveFormat = args.saveFormat ?? 'text';
        const contentToSave = saveFormat === 'html' ? content : resultText;
        const savedPath = await this.saveContent(contentToSave, url, saveFormat);
        resultText = `内容已保存到: ${savedPath}\n\n${resultText}`;
      }

      return { content: [{ type: 'text', text: resultText }] };
    } catch (error) {
      return errorResponse(error instanceof Error ? error.message : String(error));
    }
  }

  /** Releases resources. Nothing is held between calls, so this is a no-op. */
  dispose(): void {
    // Nothing to clean up.
  }
}

/**
 * Creates the webpage tool definitions.
 *
 * @param config Project configuration passed to the underlying `WebpageManager`.
 * @returns A single-element list containing the `read_webpage` tool.
 */
export function createWebpageTools(config: Config): ToolDefinition[] {
  const manager = new WebpageManager(config);

  return [
    {
      name: 'read_webpage',
      description: '读取网页的主要内容。当需要阅读特定网页的文本内容时使用此工具（不包括 GitHub，GitHub 相关内容请使用专门的 GitHub 工具）。此工具会自动提取主要内容，忽略广告、导航栏等干扰元素。支持普通 HTML 页面和需要 JS 渲染的页面。',
      inputSchema: {
        type: 'object',
        properties: {
          url: {
            type: 'string',
            description: '网页 URL。注意：对于 GitHub 相关内容，请使用专门的 GitHub 工具而不是此工具。'
          },
          requiresJavaScript: {
            type: 'boolean',
            // Corrected: omitting the flag triggers auto-detection, it does not default to false.
            description: '是否需要 JavaScript 渲染。对于大多数静态网页不需要启用，但对于一些现代 Web 应用（如 React、Vue 等构建的网站）可能需要启用。未指定时会自动检测；显式设为 false 可禁用自动检测。'
          },
          extractLinks: {
            type: 'boolean',
            description: '是否提取页面中的链接。如果为 true，将返回页面中找到的所有链接及其文本。默认为 false。'
          },
          selector: {
            type: 'string',
            description: '可选的 CSS 选择器，用于只提取页面特定部分的内容。例如 "article" 或 ".main-content"。'
          },
          save: {
            type: 'boolean',
            description: '是否保存内容到文件。如果为 true，将在当前工作目录的 downloads 目录下保存内容。'
          },
          saveFormat: {
            type: 'string',
            enum: ['text', 'html'],
            description: '保存格式：text (只保存提取的文本) 或 html (保存完整的HTML)。默认为 text。'
          }
        },
        required: ['url']
      },
      handler: args => manager.readWebpage(args)
    }
  ];
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/zxfgds/mcp-toolkit'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.