de en es ja ko ru zh

docs-mcp-server

by arabold

TypeScript

MIT License

542

676

Overview InspectNew Endpoints Schema Related Servers Reviews Score

Need Help? View Source Code Report Issue

FetchUrlTool.ts•5.81 kB

import type { AutoDetectFetcher, RawContent } from "../scraper/fetcher";
import { HtmlPipeline } from "../scraper/pipelines/HtmlPipeline";
import { MarkdownPipeline } from "../scraper/pipelines/MarkdownPipeline";
import { TextPipeline } from "../scraper/pipelines/TextPipeline";
import type { ContentPipeline, ProcessedContent } from "../scraper/pipelines/types";
import { ScrapeMode } from "../scraper/types";
import { convertToString } from "../scraper/utils/buffer";
import { resolveCharset } from "../scraper/utils/charset";
import { logger } from "../utils/logger";
import { ToolError, ValidationError } from "./errors";

export interface FetchUrlToolOptions {
  /**
   * The URL to fetch and convert to markdown.
   * Must be a valid HTTP/HTTPS URL or file:// URL.
   */
  url: string;

  /**
   * Whether to follow HTTP redirects.
   * @default true
   */
  followRedirects?: boolean;

  /**
   * Determines the HTML processing strategy.
   * - 'fetch': Use a simple DOM parser (faster, less JS support).
   * - 'playwright': Use a headless browser (slower, full JS support).
   * - 'auto': Automatically select the best strategy (currently defaults to 'playwright').
   * @default ScrapeMode.Auto
   */
  scrapeMode?: ScrapeMode;

  /**
   * Custom HTTP headers to send with the request (e.g., for authentication).
   * Keys are header names, values are header values.
   */
  headers?: Record<string, string>;
}

/**
 * Tool for fetching a single URL and converting its content to Markdown.
 * Unlike scrape_docs, this tool only processes one page without crawling
 * or storing the content.
 *
 * Supports both HTTP/HTTPS URLs and local file URLs (file://).
 */
export class FetchUrlTool {
  /**
   * AutoDetectFetcher handles all URL types and fallback logic automatically.
   */
  private readonly fetcher: AutoDetectFetcher;

  /**
   * Collection of pipelines that will be tried in order for processing content.
   * The first pipeline that can process the content type will be used.
   * Currently includes HtmlPipeline, MarkdownPipeline, and TextPipeline (as fallback).
   */
  private readonly pipelines: ContentPipeline[];

  /**
   * @param fetcher Fetcher used to retrieve the raw content; it is also handed
   *   to the selected pipeline during processing.
   */
  constructor(fetcher: AutoDetectFetcher) {
    this.fetcher = fetcher;

    // Order matters: more specific pipelines first, fallback (text) pipeline last
    this.pipelines = [new HtmlPipeline(), new MarkdownPipeline(), new TextPipeline()];
  }

  /**
   * Fetches content from a URL and converts it to Markdown.
   * Supports both HTTP/HTTPS URLs and local file URLs (file://).
   *
   * If no pipeline accepts the fetched content type, the raw content is decoded
   * with the resolved charset and returned as-is.
   *
   * @param options URL to fetch plus optional redirect, scrape-mode and header settings
   * @returns The processed Markdown content (or the decoded raw content for
   *   unsupported content types)
   * @throws {ValidationError} If the fetcher reports it cannot fetch the URL
   * @throws {ToolError} If fetching or processing fails, or yields empty content
   */
  async execute(options: FetchUrlToolOptions): Promise<string> {
    const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
    // Resolve once so the fetch call and the pipeline options cannot drift apart.
    const followRedirects = options.followRedirects ?? true;

    // Validation happens before the try block, so no cleanup runs for rejected URLs.
    if (!this.fetcher.canFetch(url)) {
      throw new ValidationError(
        `Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
        this.constructor.name,
      );
    }

    try {
      logger.info(`📡 Fetching ${url}...`);

      const fetchOptions = {
        followRedirects,
        maxRetries: 3,
        headers, // propagate custom headers
      };

      // AutoDetectFetcher handles all fallback logic automatically
      const rawContent: RawContent = await this.fetcher.fetch(url, fetchOptions);

      logger.info("🔄 Processing content...");

      // Use the first pipeline that accepts this content; later ones are not consulted.
      let processed: Awaited<ProcessedContent> | undefined;
      for (const pipeline of this.pipelines) {
        if (pipeline.canProcess(rawContent)) {
          processed = await pipeline.process(
            rawContent,
            {
              url,
              library: "",
              version: "",
              maxDepth: 0,
              maxPages: 1,
              maxConcurrency: 1,
              scope: "subpages",
              followRedirects,
              excludeSelectors: undefined,
              ignoreErrors: false,
              scrapeMode,
              headers, // propagate custom headers
            },
            this.fetcher,
          );
          break;
        }
      }

      if (!processed) {
        logger.warn(
          `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`,
        );
        // Use proper charset detection for unsupported content types
        const resolvedCharset = resolveCharset(
          rawContent.charset,
          rawContent.content,
          rawContent.mimeType,
        );
        return convertToString(rawContent.content, resolvedCharset);
      }

      // Processing errors reported by the pipeline are logged but do not abort the call.
      for (const err of processed.errors) {
        logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
      }

      if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
        throw new ToolError(
          `Processing resulted in empty content for ${url}`,
          this.constructor.name,
        );
      }

      logger.info(`✅ Successfully processed ${url}`);
      return processed.textContent;
    } catch (error: unknown) {
      // Preserve ToolError as it already has user-friendly messages, wrap others
      if (error instanceof ToolError) {
        throw error;
      }

      // Log the underlying cause before replacing it with the generic message;
      // otherwise the root cause is lost entirely.
      const reason = error instanceof Error ? error.message : String(error);
      logger.warn(`⚠️ Failed to fetch or process ${url}: ${reason}`);

      throw new ToolError(
        `Unable to fetch or process the URL "${url}". Please verify the URL is correct and accessible.`,
        this.constructor.name,
      );
    } finally {
      // Cleanup all pipelines and fetcher to prevent resource leaks (e.g., browser instances).
      // allSettled ensures one failing close() does not prevent the others from running.
      // NOTE(review): the fetcher and pipelines are closed after every call — confirm they
      // can be used again if the same tool instance is executed more than once.
      await Promise.allSettled([
        ...this.pipelines.map((pipeline) => pipeline.close()),
        this.fetcher.close(),
      ]);
    }
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.