// fetchUrls.ts (5.27 kB)
import { chromium } from "playwright";
import { WebContentProcessor } from "../services/webContentProcessor.js";
import { FetchOptions, FetchResult } from "../types/index.js";
// Debug mode is requested by passing the --debug flag on the command line
const isDebugMode = process.argv.some((cliArg) => cliArg === "--debug");
/**
 * Builds the schema descriptor for a primitive-typed input property.
 * Keys are emitted in `type`, `description` order.
 */
const describeInput = (type: string, description: string) => ({
  type,
  description,
});

/**
 * Tool definition for fetch_urls: the tool name, a human-readable
 * description, and the schema describing the accepted arguments.
 * Only `urls` is required; every other property is optional.
 */
export const fetchUrlsTool = {
  name: "fetch_urls",
  description: "Retrieve web page content from multiple specified URLs",
  inputSchema: {
    type: "object",
    properties: {
      // The only non-primitive property: a list of URL strings.
      urls: {
        type: "array",
        items: { type: "string" },
        description: "Array of URLs to fetch",
      },
      timeout: describeInput(
        "number",
        "Page loading timeout in milliseconds, default is 30000 (30 seconds)"
      ),
      waitUntil: describeInput(
        "string",
        "Specifies when navigation is considered complete, options: 'load', 'domcontentloaded', 'networkidle', 'commit', default is 'load'"
      ),
      extractContent: describeInput(
        "boolean",
        "Whether to intelligently extract the main content, default is true"
      ),
      maxLength: describeInput(
        "number",
        "Maximum length of returned content (in characters), default is no limit"
      ),
      returnHtml: describeInput(
        "boolean",
        "Whether to return HTML content instead of Markdown, default is false"
      ),
      waitForNavigation: describeInput(
        "boolean",
        "Whether to wait for additional navigation after initial page load (useful for sites with anti-bot verification), default is false"
      ),
      navigationTimeout: describeInput(
        "number",
        "Maximum time to wait for additional navigation in milliseconds, default is 10000 (10 seconds)"
      ),
      disableMedia: describeInput(
        "boolean",
        "Whether to disable media resources (images, stylesheets, fonts, media), default is true"
      ),
      debug: describeInput(
        "boolean",
        "Whether to enable debug mode (showing browser window), overrides the --debug command line flag if specified"
      ),
    },
    required: ["urls"],
  },
};
/** Navigation lifecycle events accepted for the `waitUntil` option. */
const WAIT_UNTIL_VALUES = ["load", "domcontentloaded", "networkidle", "commit"] as const;
type WaitUntilValue = (typeof WAIT_UNTIL_VALUES)[number];

/** Resource types that are aborted when `disableMedia` is enabled. */
const BLOCKED_RESOURCE_TYPES: readonly string[] = ["image", "stylesheet", "font", "media"];

/** Runtime check backing the `waitUntil` union type. */
function isWaitUntilValue(value: string): value is WaitUntilValue {
  return (WAIT_UNTIL_VALUES as readonly string[]).includes(value);
}

/** Extracts a printable message from an arbitrary rejection reason. */
function describeError(reason: unknown): string {
  return reason instanceof Error ? reason.message : String(reason);
}

/**
 * Implementation of the fetch_urls tool.
 *
 * Validates the arguments, launches a Chromium browser, opens one page per
 * URL in a shared browser context, and hands each page to
 * `WebContentProcessor.processPageContent`. The per-page `content` values are
 * concatenated, in input order, into a single text item wrapped in
 * `[webpage N begin]` / `[webpage N end]` markers.
 *
 * @param args Raw tool arguments. `urls` is required; the remaining fields
 *   are optional and fall back to these defaults: timeout 30000,
 *   waitUntil "load", extractContent true, maxLength 0, returnHtml false,
 *   waitForNavigation false, navigationTimeout 10000, disableMedia true.
 *   `debug`, when given, takes precedence over the `--debug` CLI flag.
 * @returns An object whose `content` array holds one `{ type: "text" }` item
 *   with the combined page contents.
 * @throws Error if `urls` is missing, not an array, empty, or contains an
 *   entry that is not a non-empty string; or if `waitUntil` is not one of the
 *   supported values. Errors raised while launching the browser or processing
 *   a page propagate to the caller.
 */
export async function fetchUrls(
  args: any
): Promise<{ content: { type: string; text: string }[] }> {
  const rawUrls: unknown = args?.urls;
  if (!Array.isArray(rawUrls) || rawUrls.length === 0) {
    throw new Error("URLs parameter is required and must be a non-empty array");
  }
  // Reject malformed entries up front instead of handing them to the browser.
  const invalidIndex = rawUrls.findIndex(
    (entry) => typeof entry !== "string" || entry.trim() === ""
  );
  if (invalidIndex !== -1) {
    throw new Error(
      `URLs parameter must contain only non-empty strings (invalid entry at index ${invalidIndex})`
    );
  }
  const urls: string[] = rawUrls;

  // Check waitUntil at runtime so the union type below is actually true.
  const waitUntil = String(args?.waitUntil || "load");
  if (!isWaitUntilValue(waitUntil)) {
    throw new Error(
      `Invalid waitUntil value "${waitUntil}"; expected one of: ${WAIT_UNTIL_VALUES.join(", ")}`
    );
  }

  const options: FetchOptions = {
    // `||` is deliberate: 0, NaN and missing values all fall back to the default.
    timeout: Number(args?.timeout) || 30000,
    waitUntil,
    extractContent: args?.extractContent !== false,
    maxLength: Number(args?.maxLength) || 0,
    returnHtml: args?.returnHtml === true,
    waitForNavigation: args?.waitForNavigation === true,
    navigationTimeout: Number(args?.navigationTimeout) || 10000,
    disableMedia: args?.disableMedia !== false,
    // null is treated as "not specified" so it cannot silently override the CLI flag.
    debug: args?.debug == null ? undefined : Boolean(args.debug),
  };

  // Decide whether debug mode is on: an explicit argument wins, otherwise
  // fall back to the --debug command line flag.
  const useDebugMode = options.debug !== undefined ? options.debug : isDebugMode;
  if (useDebugMode) {
    // NOTE(review): this writes to stdout; if the server speaks a protocol
    // over stdout, confirm these debug lines do not interfere with it.
    console.log(`[Debug] Debug mode enabled for URLs: ${urls.join(', ')}`);
  }

  let browser: Awaited<ReturnType<typeof chromium.launch>> | null = null;
  try {
    // Debug mode shows the browser window; otherwise run headless.
    browser = await chromium.launch({ headless: !useDebugMode });
    const context = await browser.newContext({
      javaScriptEnabled: true,
      ignoreHTTPSErrors: true,
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    });

    // Only intercept requests when something will actually be blocked:
    // Playwright disables the HTTP cache while routing is enabled.
    if (options.disableMedia) {
      await context.route('**/*', async (route) => {
        if (BLOCKED_RESOURCE_TYPES.includes(route.request().resourceType())) {
          await route.abort();
        } else {
          await route.continue();
        }
      });
    }

    const processor = new WebContentProcessor(options, '[FetchURLs]');

    // Process all URLs concurrently, one page per URL in the shared context.
    const results = await Promise.all(
      urls.map(async (url, index) => {
        const page = await context.newPage();
        try {
          const result = await processor.processPageContent(page, url);
          return { index, ...result } as FetchResult;
        } finally {
          if (!useDebugMode) {
            await page
              .close()
              .catch((e: unknown) => console.error(`[Error] Failed to close page: ${describeError(e)}`));
          } else {
            console.log(`[Debug] Page kept open for debugging. URL: ${url}`);
          }
        }
      })
    );

    // Promise.all already yields results in input order; the sort is kept as
    // a defensive guarantee based on the recorded index.
    results.sort((a, b) => (a.index || 0) - (b.index || 0));

    const combinedResults = results
      .map((result, i) => `[webpage ${i + 1} begin]\n${result.content}\n[webpage ${i + 1} end]`)
      .join('\n\n');

    return {
      content: [{ type: "text", text: combinedResults }]
    };
  } finally {
    // In debug mode the browser is intentionally left open for inspection.
    if (!useDebugMode) {
      if (browser) {
        await browser
          .close()
          .catch((e: unknown) => console.error(`[Error] Failed to close browser: ${describeError(e)}`));
      }
    } else {
      console.log(`[Debug] Browser kept open for debugging. URLs: ${urls.join(', ')}`);
    }
  }
}