Fetcher MCP
by jae-jae
Verified
- fetcher-mcp
- src
- services
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import TurndownService from "turndown";
import { FetchOptions, FetchResult } from "../types/index.js";
export class WebContentProcessor {
private options: FetchOptions;
private logPrefix: string;
constructor(options: FetchOptions, logPrefix: string = '') {
this.options = options;
this.logPrefix = logPrefix;
}
async processPageContent(page: any, url: string): Promise<FetchResult> {
try {
// Set timeout
page.setDefaultTimeout(this.options.timeout);
// Navigate to URL
console.error(`${this.logPrefix} Navigating to URL: ${url}`);
await page.goto(url, {
timeout: this.options.timeout,
waitUntil: this.options.waitUntil,
});
// Handle possible anti-bot verification and redirection
if (this.options.waitForNavigation) {
console.error(`${this.logPrefix} Waiting for possible navigation/redirection...`);
try {
// Create a promise to wait for page navigation events
const navigationPromise = page.waitForNavigation({
timeout: this.options.navigationTimeout,
waitUntil: this.options.waitUntil
});
// Set a timeout
const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => {
reject(new Error('Navigation timeout'));
}, this.options.navigationTimeout);
});
// Wait for navigation event or timeout, whichever occurs first
await Promise.race([navigationPromise, timeoutPromise])
.then(() => {
console.error(`${this.logPrefix} Page navigated/redirected successfully`);
})
.catch(e => {
// If timeout occurs but page may have already loaded, we can continue
console.error(`${this.logPrefix} No navigation occurred or navigation timeout: ${e.message}`);
});
} catch (navError: any) {
console.error(`${this.logPrefix} Error waiting for navigation: ${navError.message}`);
// Continue processing the page even if there are navigation issues
}
}
// Get page title
const pageTitle = await page.title();
console.error(`${this.logPrefix} Page title: ${pageTitle}`);
// Get HTML content
const html = await page.content();
if (!html) {
console.error(`${this.logPrefix} Browser returned empty content`);
return {
success: false,
content: `Title: Error\nURL: ${url}\nContent:\n\n<error>Failed to retrieve web page content: Browser returned empty content</error>`,
error: "Browser returned empty content"
};
}
console.error(`${this.logPrefix} Successfully retrieved web page content, length: ${html.length}`);
const processedContent = await this.processContent(html, url);
// Format the response
const formattedContent = `Title: ${pageTitle}\nURL: ${url}\nContent:\n\n${processedContent}`;
return {
success: true,
content: formattedContent
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : "Unknown error";
console.error(`${this.logPrefix} Error: ${errorMessage}`);
return {
success: false,
content: `Title: Error\nURL: ${url}\nContent:\n\n<error>Failed to retrieve web page content: ${errorMessage}</error>`,
error: errorMessage
};
}
}
private async processContent(html: string, url: string): Promise<string> {
let contentToProcess = html;
// Extract main content if needed
if (this.options.extractContent) {
console.error(`${this.logPrefix} Extracting main content`);
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
console.error(`${this.logPrefix} Could not extract main content, will use full HTML`);
} else {
contentToProcess = article.content;
console.error(`${this.logPrefix} Successfully extracted main content, length: ${contentToProcess.length}`);
}
}
// Convert to markdown if needed
let processedContent = contentToProcess;
if (!this.options.returnHtml) {
console.error(`${this.logPrefix} Converting to Markdown`);
const turndownService = new TurndownService();
processedContent = turndownService.turndown(contentToProcess);
console.error(`${this.logPrefix} Successfully converted to Markdown, length: ${processedContent.length}`);
}
// Truncate if needed
if (this.options.maxLength > 0 && processedContent.length > this.options.maxLength) {
console.error(`${this.logPrefix} Content exceeds maximum length, will truncate to ${this.options.maxLength} characters`);
processedContent = processedContent.substring(0, this.options.maxLength);
}
return processedContent;
}
}