/**
* Scrape command - Scrapes and indexes documentation from a URL or local folder.
*/
import type { Argv } from "yargs";
import { EventType } from "../../events";
import { PipelineFactory, PipelineJobStatus, type PipelineOptions } from "../../pipeline";
import type { IPipeline } from "../../pipeline/trpc/interfaces";
import { ScrapeMode } from "../../scraper/types";
import { createDocumentManagement } from "../../store";
import type { IDocumentManagement } from "../../store/trpc/interfaces";
import { TelemetryEvent, telemetry } from "../../telemetry";
import { ScrapeTool } from "../../tools";
import { loadConfig } from "../../utils/config";
import { type CliContext, getEventBus, parseHeaders } from "../utils";
export function createScrapeCommand(cli: Argv) {
cli.command(
"scrape <library> <url>",
"Download and index documentation from a URL or local directory",
(yargs) => {
return yargs
.version(false)
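// Disable yargs' built-in --version output so -v/--version can carry the library version instead.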
.positional("library", {
type: "string",
description: "Library name",
demandOption: true,
})
.positional("url", {
type: "string",
description: "URL or file:// path to scrape",
demandOption: true,
})
.option("version", {
type: "string",
description: "Version of the library (optional)",
alias: "v",
})
.option("max-pages", {
type: "number",
description: "Maximum pages to scrape",
alias: ["p", "maxPages"],
})
.option("max-depth", {
type: "number",
description: "Maximum navigation depth",
alias: ["d", "maxDepth"],
})
.option("max-concurrency", {
type: "number",
description: "Maximum concurrent page requests",
alias: ["c", "maxConcurrency"],
})
.option("ignore-errors", {
type: "boolean",
description: "Ignore errors during scraping",
default: true,
alias: "ignoreErrors",
})
.option("scope", {
choices: ["subpages", "hostname", "domain"],
description: "Crawling boundary",
default: "subpages",
})
.option("follow-redirects", {
type: "boolean",
description: "Follow HTTP redirects",
default: true,
alias: "followRedirects",
})
.option("no-follow-redirects", {
type: "boolean",
description: "Disable following HTTP redirects",
hidden: true,
})
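// Hidden companion of --follow-redirects so the negated form can be passed explicitly.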
.option("scrape-mode", {
choices: Object.values(ScrapeMode),
description: "HTML processing strategy",
default: ScrapeMode.Auto,
alias: "scrapeMode",
})
.option("include-pattern", {
type: "string",
array: true,
description:
"Glob or regex pattern for URLs to include (can be specified multiple times). Regex patterns must be wrapped in slashes, e.g. /pattern/.",
alias: "includePattern",
default: [] as string[],
})
.option("exclude-pattern", {
type: "string",
array: true,
description:
"Glob or regex pattern for URLs to exclude (can be specified multiple times, takes precedence over include). Regex patterns must be wrapped in slashes, e.g. /pattern/.",
alias: "excludePattern",
default: [] as string[],
})
.option("header", {
type: "string",
array: true,
description:
"Custom HTTP header to send with each request (can be specified multiple times)",
default: [] as string[],
})
.option("embedding-model", {
type: "string",
description:
"Embedding model configuration (e.g., 'openai:text-embedding-3-small')",
alias: "embeddingModel",
})
.option("server-url", {
type: "string",
description:
"URL of external pipeline worker RPC (e.g., http://localhost:8080/api)",
alias: "serverUrl",
})
.option("clean", {
type: "boolean",
description: "Clear existing documents before scraping",
default: true,
})
.usage(
"$0 scrape <library> <url> [options]\n\n" +
"Scrape and index documentation from a URL or local folder.\n\n" +
"To scrape local files or folders, use a file:// URL.\n" +
"Examples:\n" +
" scrape mylib https://react.dev/reference/react\n" +
" scrape mylib file:///Users/me/docs/index.html\n" +
" scrape mylib file:///Users/me/docs/my-library\n" +
"\nNote: For local files/folders, you must use the file:// prefix. If running in Docker, mount the folder and use the container path. See README for details.",
);
},
async (argv) => {
const library = argv.library as string;
const url = argv.url as string;
const serverUrl = argv.serverUrl as string | undefined;
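// Load configuration from disk; explicit CLI flags take precedence via the ?? fallbacks below.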
const appConfig = loadConfig(argv, {
configPath: argv.config as string,
searchDir: argv.storePath as string, // resolved globally
});
const maxPages = (argv.maxPages as number) ?? appConfig.scraper.maxPages;
const maxDepth = (argv.maxDepth as number) ?? appConfig.scraper.maxDepth;
const maxConcurrency =
(argv.maxConcurrency as number) ?? appConfig.scraper.maxConcurrency;
// Update appConfig with effective values
appConfig.scraper.maxPages = maxPages;
appConfig.scraper.maxDepth = maxDepth;
appConfig.scraper.maxConcurrency = maxConcurrency;
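// Track the invocation and the effective scrape options.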
await telemetry.track(TelemetryEvent.CLI_COMMAND, {
command: "scrape",
library,
version: argv.version,
url,
maxPages,
maxDepth,
maxConcurrency,
scope: argv.scope,
scrapeMode: argv.scrapeMode,
followRedirects: argv.followRedirects,
hasHeaders: (argv.header as string[]).length > 0,
hasIncludePatterns: (argv.includePattern as string[]).length > 0,
hasExcludePatterns: (argv.excludePattern as string[]).length > 0,
useServerUrl: !!serverUrl,
});
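// The event bus carries job progress events; document management is remote when --server-url is set, local otherwise.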
const eventBus = getEventBus(argv as CliContext);
const docService: IDocumentManagement = await createDocumentManagement({
serverUrl,
eventBus,
appConfig,
});
let pipeline: IPipeline | null = null;
// Display initial status
console.log("⏳ Initializing scraping job...");
// Subscribe to event bus for progress updates (only for local pipelines)
let unsubscribeProgress: (() => void) | null = null;
let unsubscribeStatus: (() => void) | null = null;
if (!serverUrl) {
unsubscribeProgress = eventBus.on(EventType.JOB_PROGRESS, (event) => {
const { job, progress } = event;
console.log(
`📄 Scraping ${job.library}${job.version ? ` v${job.version}` : ""}: ${progress.pagesScraped}/${progress.totalPages} pages`,
);
});
unsubscribeStatus = eventBus.on(EventType.JOB_STATUS_CHANGE, (event) => {
if (event.status === PipelineJobStatus.RUNNING) {
console.log(
`🚀 Scraping ${event.library}${event.version ? ` v${event.version}` : ""}...`,
);
}
});
}
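// Run the scrape inside try/finally so subscriptions and services are always released.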
try {
const pipelineOptions: PipelineOptions = {
recoverJobs: false,
serverUrl,
appConfig,
};
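// With --server-url the pipeline is a thin RPC client to the external worker; otherwise a local in-process pipeline wraps the document service.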
pipeline = serverUrl
? await PipelineFactory.createPipeline(undefined, eventBus, pipelineOptions)
: await PipelineFactory.createPipeline(
docService as unknown as never,
eventBus,
pipelineOptions,
);
await pipeline.start();
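// ScrapeTool drives the scrape through the pipeline; repeated --header values are parsed into a headers map.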
const scrapeTool = new ScrapeTool(pipeline, appConfig.scraper);
const headers = parseHeaders((argv.header as string[]) || []);
const result = await scrapeTool.execute({
url,
library,
version: argv.version as string | undefined,
options: {
maxPages,
maxDepth,
maxConcurrency,
ignoreErrors: argv.ignoreErrors as boolean,
scope: argv.scope as "subpages" | "hostname" | "domain",
followRedirects: argv.followRedirects as boolean,
scrapeMode: argv.scrapeMode as ScrapeMode,
includePatterns:
(argv.includePattern as string[])?.length > 0
? (argv.includePattern as string[])
: undefined,
excludePatterns:
(argv.excludePattern as string[])?.length > 0
? (argv.excludePattern as string[])
: undefined,
headers: Object.keys(headers).length > 0 ? headers : undefined,
clean: argv.clean as boolean,
},
});
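// A completed (waited-for) run reports pages scraped; otherwise only the queued job ID is returned.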
if ("pagesScraped" in result) {
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
} else {
console.log(`✅ Scraping job started with ID: ${result.jobId}`);
}
} catch (error) {
console.error(
`❌ Scraping failed: ${error instanceof Error ? error.message : String(error)}`,
);
throw error;
} finally {
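// Always detach listeners and stop services, whether the scrape succeeded or failed.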
if (unsubscribeProgress) unsubscribeProgress();
if (unsubscribeStatus) unsubscribeStatus();
if (pipeline) await pipeline.stop();
await docService.shutdown();
}
},
);
}