import { CheerioCrawler, log } from 'crawlee';
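/**
 * A single documentation page extracted by the crawler, normalized into a
 * searchable record with optional API reference details.
 */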
export interface DocPage {
    id: string;
    source: string;
    title: string;
    content: string;
    snippet: string;
    type: 'api' | 'guide' | 'example';
    url: string;
    searchableText: string;
    apiReference?: {
        signature?: string;
        parameters?: string[];
        returns?: string;
        example?: string;
    };
}
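/**
 * Options for `scrapeApifyDocs`. Both fields are optional; the defaults crawl
 * the Apify SDK for JavaScript docs and reference sections, capped at 100 pages.
 */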
export interface ScraperOptions {
    startUrls?: string[];
    maxPages?: number;
}
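/**
 * Crawls documentation pages starting from `startUrls` with a CheerioCrawler,
 * following links under the same path prefixes, and returns the extracted pages.
 */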
export async function scrapeApifyDocs(options: ScraperOptions = {}): Promise<DocPage[]> {
    const {
        startUrls = [
            'https://docs.apify.com/sdk/js/docs',
            'https://docs.apify.com/sdk/js/reference',
        ],
        maxPages = 100,
    } = options;
    // Auto-generate URL patterns from start URLs
    const urlPatterns = startUrls.map((url) => {
        try {
            const urlObj = new URL(url);
            // Get the origin (protocol + host) and the path without a trailing slash
            const base = urlObj.origin + urlObj.pathname.replace(/\/$/, '');
            // Add a wildcard pattern to crawl all pages under this path
            return `${base}/**`;
        } catch {
            log.warning(`Invalid URL: ${url}`);
            return url;
        }
    });
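    // e.g. 'https://docs.apify.com/sdk/js/docs' -> 'https://docs.apify.com/sdk/js/docs/**'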
    log.info('Auto-generated URL patterns', { urlPatterns });
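    // Pages collected by the request handler as the crawl progresses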
    const docs: DocPage[] = [];
    const crawler = new CheerioCrawler({
        maxRequestsPerCrawl: maxPages,
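        // Called once per successfully fetched page; `$` is a Cheerio handle over the response HTML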
        async requestHandler({ request, $, enqueueLinks }) {
            log.info(`Scraping: ${request.url}`);
            // Extract page data with multiple selector fallbacks
            const title = $('h1').first().text().trim()
                || $('title').first().text().trim()
                || $('h2').first().text().trim()
                || 'Untitled';
            const content = $('article').text().trim()
                || $('main').text().trim()
                || $('.content').text().trim()
                || $('body').text().trim();
            // Debug logging
            log.info(`Extracted - Title: "${title.substring(0, 50)}", Content length: ${content.length}`);
            // Skip pages with too little content to be useful
            if (!title || content.length < 20) {
                log.warning(`Skipping ${request.url} - insufficient content`);
                return;
            }
            // Determine page type from the URL
            let type: 'api' | 'guide' | 'example' = 'guide';
            if (request.url.includes('/api/') || request.url.includes('/reference/')) type = 'api';
            if (request.url.includes('/examples/') || request.url.includes('/example')) type = 'example';
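            // e.g. a URL under /reference/ is classified as 'api', one under /examples/ as 'example'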
            // Create doc page object
            const docPage: DocPage = {
                id: `docs-${docs.length + 1}`,
                source: 'custom-docs',
                title,
                content,
                snippet: content.substring(0, 200) + '...',
                type,
                url: request.url,
                searchableText: `${title} ${content}`.toLowerCase(),
            };
            // Extract API reference data if it's an API page
            if (type === 'api') {
                const codeExample = $('pre code').first().text();
                docPage.apiReference = {
                    signature: $('code.signature, .method-signature').first().text() || undefined,
                    example: codeExample || undefined,
                };
            }
            // Extract code examples from any page
            const codeBlocks = $('pre code').map((_, el) => $(el).text()).get();
            if (codeBlocks.length > 0 && !docPage.apiReference?.example) {
                docPage.apiReference = {
                    ...docPage.apiReference,
                    example: codeBlocks[0],
                };
            }
            docs.push(docPage);
            // Enqueue more links matching the auto-generated URL patterns
            await enqueueLinks({
                globs: urlPatterns,
                exclude: ['**/*.pdf', '**/*.zip', '**/*.png', '**/*.jpg'],
            });
        },
    });
    // Start crawling from the provided start URLs
    await crawler.run(startUrls);
    log.info(`Scraped ${docs.length} pages`);
    return docs;
}
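// Illustrative usage (call from any async context, e.g. an Apify Actor's main):
//   const pages = await scrapeApifyDocs({ maxPages: 50 });
//   log.info(`First page: ${pages[0]?.title ?? 'none'}`);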