// ScrapeTool.ts
import * as semver from "semver";
import type { IPipeline } from "../pipeline/trpc/interfaces";
import { ScrapeMode } from "../scraper/types";
import {
DEFAULT_MAX_CONCURRENCY,
DEFAULT_MAX_DEPTH,
DEFAULT_MAX_PAGES,
} from "../utils/config";
import { logger } from "../utils/logger";
import { ValidationError } from "./errors";
export interface ScrapeToolOptions {
library: string;
  version?: string | null; // Optional; null or undefined means "unversioned"
url: string;
options?: {
maxPages?: number;
maxDepth?: number;
/**
* Defines the allowed crawling boundary relative to the starting URL
* - 'subpages': Only crawl URLs on the same hostname and within the same starting path (default)
* - 'hostname': Crawl any URL on the same hostname, regardless of path
* - 'domain': Crawl any URL on the same top-level domain, including subdomains
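     *
     * For example, starting from https://example.com/docs/: 'subpages' allows
     * https://example.com/docs/guide but not https://example.com/blog,
     * 'hostname' allows both, and 'domain' additionally allows pages on
     * https://api.example.com.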
*/
scope?: "subpages" | "hostname" | "domain";
/**
* Controls whether HTTP redirects (3xx responses) should be followed
* - When true: Redirects are followed automatically (default)
* - When false: A RedirectError is thrown when a 3xx response is received
*/
followRedirects?: boolean;
    maxConcurrency?: number; // Effective concurrency is set when the PipelineManager is created
ignoreErrors?: boolean;
/**
* Determines the HTML processing strategy.
* - 'fetch': Use a simple DOM parser (faster, less JS support).
* - 'playwright': Use a headless browser (slower, full JS support).
* - 'auto': Automatically select the best strategy (currently defaults to 'playwright').
* @default ScrapeMode.Auto
*/
scrapeMode?: ScrapeMode;
/**
* Patterns for including URLs during scraping. If not set, all are included by default.
* Regex patterns must be wrapped in slashes, e.g. /pattern/.
*/
includePatterns?: string[];
/**
* Patterns for excluding URLs during scraping. Exclude takes precedence over include.
* If not specified, default patterns exclude common files (CHANGELOG.md, LICENSE, etc.)
* and folders (archive, deprecated, i18n locales, etc.).
* Regex patterns must be wrapped in slashes, e.g. /pattern/.
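     *
     * For example, excludePatterns: ["/\\.md$/"] would exclude URLs ending in
     * ".md" (a sketch assuming patterns are matched against the full URL).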
*/
excludePatterns?: string[];
/**
* Custom HTTP headers to send with each request (e.g., for authentication).
* Keys are header names, values are header values.
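     * Example (illustrative): { Authorization: "Bearer <token>" }.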
*/
headers?: Record<string, string>;
};
/** If false, returns jobId immediately without waiting. Defaults to true. */
waitForCompletion?: boolean;
}
export interface ScrapeResult {
  /**
   * Number of pages scraped. Meaningful only when waitForCompletion was true
   * and the job succeeded; may be 0 or inaccurate if the job failed or
   * waitForCompletion was false.
   */
pagesScraped: number;
}
/** Return type for ScrapeTool.execute */
export type ScrapeExecuteResult = ScrapeResult | { jobId: string };
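// Callers can narrow the union with an `in` check, e.g.
// `if ("jobId" in result) { ... }` for the fire-and-forget case.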
/**
* Tool for enqueuing documentation scraping jobs via the pipeline.
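 *
 * @example
 * // A minimal sketch; the library name, version, and URL are illustrative.
 * const tool = new ScrapeTool(pipeline);
 * const result = await tool.execute({
 *   library: "react",
 *   version: "18.2", // partial versions are coerced (here to "18.2.0")
 *   url: "https://react.dev/reference",
 *   options: { scope: "subpages", maxPages: 50 },
 * });
 * // waitForCompletion defaults to true, so `result` is { pagesScraped }.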
*/
export class ScrapeTool {
private pipeline: IPipeline;
constructor(pipeline: IPipeline) {
this.pipeline = pipeline;
}
async execute(options: ScrapeToolOptions): Promise<ScrapeExecuteResult> {
const {
library,
version,
url,
options: scraperOptions,
waitForCompletion = true,
} = options;
// Store initialization and manager start should happen externally
let internalVersion: string;
const partialVersionRegex = /^\d+(\.\d+)?$/; // Matches '1' or '1.2'
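    // Examples of node-semver behavior relied on below: semver.valid("1.2.3")
    // returns "1.2.3", semver.valid("1.2") returns null, and
    // semver.coerce("1.2")?.version is "1.2.0".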
    if (version === null || version === undefined) {
      internalVersion = "";
    } else {
      const validFullVersion = semver.valid(version);
      const coercedVersion = partialVersionRegex.test(version)
        ? semver.coerce(version)
        : null;
      if (validFullVersion) {
        internalVersion = validFullVersion;
      } else if (coercedVersion) {
        internalVersion = coercedVersion.version;
      } else {
        throw new ValidationError(
          `Invalid version format for scraping: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
          "ScrapeTool",
        );
      }
    }
internalVersion = internalVersion.toLowerCase();
// Use the injected pipeline instance
const pipeline = this.pipeline;
// Normalize pipeline version argument: use null for unversioned to be explicit cross-platform
const enqueueVersion: string | null = internalVersion === "" ? null : internalVersion;
// Enqueue the job using the injected pipeline
    const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
      url,
      library,
      version: internalVersion,
      scope: scraperOptions?.scope ?? "subpages",
      followRedirects: scraperOptions?.followRedirects ?? true,
      maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
      maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
      maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
      ignoreErrors: scraperOptions?.ignoreErrors ?? true,
      scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
      includePatterns: scraperOptions?.includePatterns,
      excludePatterns: scraperOptions?.excludePatterns,
      headers: scraperOptions?.headers, // Custom headers (e.g., for authentication)
    });
// Conditionally wait for completion
if (waitForCompletion) {
try {
await pipeline.waitForJobCompletion(jobId);
// Fetch final job state to get status and potentially final page count
const finalJob = await pipeline.getJob(jobId);
const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0; // Get count from final job state
logger.debug(
`Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`,
);
return {
pagesScraped: finalPagesScraped,
};
} catch (error) {
logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
throw error; // Re-throw so the caller knows it failed
}
// No finally block needed to stop pipeline, as it's managed externally
}
// If not waiting, return the job ID immediately
return { jobId };
}
}
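
// Usage sketch for the non-blocking path (illustrative values; `pipeline` is
// any IPipeline implementation, as injected into the constructor):
//
//   const tool = new ScrapeTool(pipeline);
//   const enqueued = await tool.execute({
//     library: "express",
//     url: "https://expressjs.com/en/4x/api.html",
//     waitForCompletion: false,
//   });
//   if ("jobId" in enqueued) {
//     // Await or poll completion later via the same pipeline instance.
//     await pipeline.waitForJobCompletion(enqueued.jobId);
//   }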