// Copyright 2026 Richard Baxter
// Licensed under the Apache License, Version 2.0
/**
 * CrawlOrchestrator - Main coordinator for crawling
 *
 * Responsibilities:
 * - Initialize and manage the Crawlee crawler
 * - Coordinate UrlManager, ContentExtractor, LinkExtractor, and CrawlStorage
 * - Apply filters (extensions, patterns, robots.txt)
 * - Track progress and errors
 * - Handle the crawl lifecycle (start, pause, completion)
 */
import { HttpCrawler, RequestQueue, Configuration, type Request, type CrawlingContext, log } from 'crawlee';
import { MemoryStorage } from '@crawlee/memory-storage';
import { load } from 'cheerio';
import type { CrawlConfig, CrawlMetadata, CrawlError } from '../types/index.js';
import { UrlManager } from './UrlManager.js';
import { ContentExtractor } from './ContentExtractor.js';
import { LinkExtractor } from './LinkExtractor.js';
import { CrawlStorage } from './CrawlStorage.js';
import { debug } from '../utils/debug.js';

export class CrawlOrchestrator {
  private crawler!: HttpCrawler; // Assigned in initializeCrawler(), which run() awaits first
  private metadata: CrawlMetadata;
  private linkBuffer: any[] = []; // Pending link records from LinkExtractor, flushed in batches
  private readonly LINK_BUFFER_SIZE = 100;
  private memoryStorage?: MemoryStorage; // Tracked so run() can purge it during cleanup
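
  /**
   * @param config           Crawl configuration (start URL, limits, filters, timeouts).
   * @param urlManager       Tracks discovered/visited URLs, depths, and link sources.
   * @param contentExtractor Extracts page-level data from fetched HTML.
   * @param linkExtractor    Extracts outgoing links from fetched HTML.
   * @param storage          Persists page data, link data, and crawl metadata.
   */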
  constructor(
    private config: CrawlConfig,
    private urlManager: UrlManager,
    private contentExtractor: ContentExtractor,
    private linkExtractor: LinkExtractor,
    private storage: CrawlStorage
  ) {
    // Suppress ALL Crawlee logging for MCP stdout cleanliness
    log.setLevel(log.LEVELS.OFF);
    this.metadata = this.createInitialMetadata();
    // initializeCrawler() is async, so it is deferred until run()
  }
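
  /** Build the initial metadata record for a freshly queued crawl, with zeroed stats. */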
  private createInitialMetadata(): CrawlMetadata {
    return {
      crawlId: this.config.crawlId,
      status: 'queued',
      startedAt: null,
      completedAt: null,
      duration: null,
      stats: {
        discovered: 0,
        crawled: 0,
        failed: 0,
        skipped: 0,
        depth: 0,
        speed: 0
      },
      errors: []
    };
  }
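
  /**
   * Create the HttpCrawler backed by per-crawl isolated storage.
   *
   * Each crawl gets its own MemoryStorage and RequestQueue so queue state
   * never leaks between crawls; run()'s finally block purges the storage.
   */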
  private async initializeCrawler(): Promise<void> {
    // CRITICAL FIX: Create isolated storage with a unique directory per crawl.
    // This prevents RequestQueue state from persisting between crawls; keying
    // the storage directory on crawlId guarantees complete isolation.
    const storageDir = `./crawlee-storage-${this.config.crawlId}`;
    this.memoryStorage = new MemoryStorage({ localDataDirectory: storageDir });
    // Create an isolated configuration backed by the memory storage
    const configuration = new Configuration({
      storageClient: this.memoryStorage,
      persistStorage: false, // Don't persist between runs
    });
    // Create an isolated RequestQueue using that configuration
    const requestQueue = await RequestQueue.open(undefined, { config: configuration });
    // User agent strings
    const userAgents = {
      chrome: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
      googlebot: 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    };
    const userAgent = userAgents[this.config.userAgent as keyof typeof userAgents] || userAgents.chrome;
    const crawlerConfig = {
      maxRequestsPerCrawl: this.config.maxPages,
      maxConcurrency: 20,
      minConcurrency: 5,
      maxRequestRetries: 5,
      requestHandlerTimeoutSecs: this.config.timeout / 1000, // convert ms to seconds
      navigationTimeoutSecs: 30,
      additionalMimeTypes: ['text/html', 'application/xhtml+xml'],
      requestQueue, // Isolated request queue (already bound to the isolated config)
      preNavigationHooks: [
        async ({ request }: any) => {
          request.headers = {
            'User-Agent': userAgent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
          };
        }
      ],
      requestHandler: async (context: CrawlingContext) => {
        const { request, response, body, crawler } = context as any;
        if (!response || !body) return;
        const $ = load(body.toString());
        await this.processPage(request.url, body.toString(), $, response, crawler);
      },
      failedRequestHandler: async (context: CrawlingContext, error: Error) => {
        const { request } = context as any;
        await this.handleFailedRequest(request, error);
      }
    };
    this.crawler = new HttpCrawler(crawlerConfig as any);
  }
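
  /**
   * Execute the full crawl lifecycle: initialize storage and crawler, run the
   * crawl from the start URL, finalize stats, then always flush buffered links
   * and purge the per-crawl MemoryStorage before saving final metadata.
   */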
  async run(): Promise<CrawlMetadata> {
    try {
      debug('[ORCH] Starting orchestrator.run()');
      debug('[ORCH] Config:', JSON.stringify(this.config, null, 2));
      // Initialize crawler with isolated storage
      await this.initializeCrawler();
      debug('[ORCH] Crawler initialized with isolated MemoryStorage');
      await this.storage.initialize();
      debug('[ORCH] Storage initialized');
      this.metadata.status = 'running';
      this.metadata.startedAt = new Date().toISOString();
      await this.storage.saveMetadata(this.metadata, this.config);
      debug('[ORCH] Metadata saved, status=running');
      this.urlManager.addDiscovered(this.config.startUrl, 0);
      debug('[ORCH] Start URL added to UrlManager');
      debug('[ORCH] About to call crawler.run()...');
      await this.crawler.run([this.config.startUrl]);
      debug('[ORCH] Crawler.run() completed');
      const endTime = Date.now();
      const startTime = new Date(this.metadata.startedAt!).getTime();
      this.metadata.status = 'completed';
      this.metadata.completedAt = new Date().toISOString();
      this.metadata.duration = endTime - startTime;
      this.metadata.stats.depth = this.urlManager.getMaxDepth();
      this.metadata.stats.speed = this.metadata.duration > 0
        ? (this.metadata.stats.crawled / (this.metadata.duration / 1000)) // pages per second
        : 0;
    } catch (error: any) {
      console.error('[ORCH ERROR] Fatal error in run():', error);
      console.error('[ORCH ERROR] Stack:', error.stack);
      this.metadata.status = 'failed';
      this.metadata.errors.push({
        url: '',
        errorType: 'unknown',
        message: error.message || 'Unknown error',
        timestamp: new Date().toISOString()
      });
    } finally {
      // Flush any remaining buffered links so even a failed crawl persists partial link data
      if (this.linkBuffer.length > 0) {
        try {
          await this.storage.saveLinkData(this.linkBuffer);
          this.linkBuffer = [];
        } catch (flushError: any) {
          console.error('[ORCH WARN] Final link flush failed:', flushError.message);
        }
      }
      // CRITICAL: Purge the isolated storage so no RequestQueue state survives into
      // the next crawl; every crawl must start with a fresh queue.
      if (this.memoryStorage) {
        try {
          await this.memoryStorage.purge();
          debug('[ORCH] MemoryStorage cleaned up successfully');
        } catch (cleanupError: any) {
          console.error('[ORCH WARN] Storage cleanup failed:', cleanupError.message);
        }
      }
    }
    await this.storage.saveMetadata(this.metadata, this.config);
    return this.metadata;
  }
  private async processPage(
    url: string,
    html: string,
    $: any,
    response: any,
    crawler: any
  ): Promise<void> {
    this.urlManager.markVisited(url);
    const pageData = this.contentExtractor.extract(url, html, $, {
      crawlId: this.config.crawlId,
      depth: this.urlManager.getDepth(url),
      statusCode: response.status || 200,
      contentType: response.headers?.['content-type'] || 'text/html',
      responseTime: 0,
      size: html.length,
      isInternal: this.urlManager.isInternal(url),
      linkedFrom: this.urlManager.getSourcePages(url),
      redirects: []
    }, response);
    const links = this.linkExtractor.extract($, url, this.config.crawlId);
    const currentDepth = this.urlManager.getDepth(url);
    const linksToAdd: string[] = [];
    for (const link of links) {
      if (this.shouldCrawlUrl(link.targetUrl)) {
        // Only check depth here; Crawlee's RequestQueue handles de-duplication
        if (currentDepth < this.config.maxDepth) {
          // Still track in urlManager for metadata purposes
          if (!this.urlManager.isDiscovered(link.targetUrl)) {
            this.urlManager.addDiscovered(link.targetUrl, currentDepth + 1, url);
          }
          linksToAdd.push(link.targetUrl);
        } else {
          this.metadata.stats.skipped++;
        }
      } else {
        this.metadata.stats.skipped++;
      }
    }
    if (linksToAdd.length > 0) {
      await crawler.addRequests(linksToAdd);
    }
    await this.storage.savePageData(pageData);
    this.linkBuffer.push(...links);
    if (this.linkBuffer.length >= this.LINK_BUFFER_SIZE) {
      await this.storage.saveLinkData(this.linkBuffer);
      this.linkBuffer = [];
    }
    this.metadata.stats.crawled++;
    this.metadata.stats.discovered = this.urlManager.getTotalDiscovered();
    // Persist progress every 10 pages to limit metadata write frequency
    if (this.metadata.stats.crawled % 10 === 0) {
      await this.storage.updateMetadata(this.metadata);
    }
  }
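
  /** Record a permanently failed request: bump the failure count and append a categorized error. */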
  private async handleFailedRequest(request: Request, error: Error): Promise<void> {
    this.metadata.stats.failed++;
    const errorType = this.categorizeError(error);
    const errorMessage = error.message || 'Unknown error';
    const errorMessages = request.errorMessages || [];
    const fullErrorDetails = errorMessages.length > 0
      ? `${errorMessage} (Retry ${request.retryCount}: ${errorMessages.join(', ')})`
      : errorMessage;
    this.metadata.errors.push({
      url: request.url,
      errorType: errorType,
      message: fullErrorDetails,
      timestamp: new Date().toISOString()
    });
  }
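
  /** Map an error message onto a coarse CrawlError['errorType'] bucket via substring matching. */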
  private categorizeError(error: Error): CrawlError['errorType'] {
    const message = error.message.toLowerCase();
    if (message.includes('timeout')) return 'timeout';
    if (message.includes('dns') || message.includes('getaddrinfo')) return 'dns';
    if (message.includes('connect') || message.includes('econnrefused')) return 'connection';
    if (message.includes('ssl') || message.includes('certificate')) return 'ssl';
    if (message.includes('401') || message.includes('403')) return 'auth';
    if (message.includes('404')) return 'not_found';
    if (message.includes('429') || message.includes('rate limit')) return 'rate_limit';
    if (message.includes('500') || message.includes('502') || message.includes('503')) return 'server_error';
    return 'network';
  }
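
  /**
   * Decide whether a URL is eligible for crawling: extension allow/deny lists,
   * include/exclude regex patterns, and the internal/external host policy.
   * Invalid regex patterns are treated as non-matching rather than throwing.
   */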
  private shouldCrawlUrl(url: string): boolean {
    const ext = this.getFileExtension(url);
    // Always exclude blocked extensions
    if (ext && this.config.excludeExtensions.includes(ext)) {
      return false;
    }
    // A URL with no extension is treated as HTML and allowed; a URL with an
    // extension must appear in the include list (when one is configured)
    if (ext && this.config.includeExtensions.length > 0 &&
        !this.config.includeExtensions.includes(ext)) {
      return false;
    }
    if (this.config.excludePatterns.length > 0) {
      if (this.config.excludePatterns.some(pattern => {
        try {
          return new RegExp(pattern).test(url);
        } catch {
          return false; // Malformed pattern: ignore rather than abort the crawl
        }
      })) {
        return false;
      }
    }
    if (this.config.includePatterns.length > 0) {
      if (!this.config.includePatterns.some(pattern => {
        try {
          return new RegExp(pattern).test(url);
        } catch {
          return false;
        }
      })) {
        return false;
      }
    }
    if (!this.config.crawlExternal && !this.urlManager.isInternal(url)) {
      return false;
    }
    return true;
  }
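
  /**
   * Extract the lowercase file extension from a URL's final path segment,
   * or '' when there is none (or the URL fails to parse).
   */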
  private getFileExtension(url: string): string {
    try {
      const pathname = new URL(url).pathname;
      // Look only at the final path segment so directories containing dots
      // (e.g. /v1.2/page) are not mistaken for file extensions
      const lastSegment = pathname.split('/').pop() || '';
      const dotIndex = lastSegment.lastIndexOf('.');
      if (dotIndex > 0) {
        return lastSegment.slice(dotIndex + 1).toLowerCase();
      }
    } catch {}
    return '';
  }
}
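
// Example wiring (a hypothetical sketch: the collaborator constructor signatures
// and the CrawlConfig fields shown here are assumptions, not defined in this file):
//
//   const config: CrawlConfig = {
//     crawlId: 'crawl-001',
//     startUrl: 'https://example.com',
//     maxPages: 500,
//     maxDepth: 3,
//     // ...remaining filter/timeout fields as defined in ../types/index.js
//   } as CrawlConfig;
//   const orchestrator = new CrawlOrchestrator(
//     config,
//     new UrlManager(config.startUrl),
//     new ContentExtractor(),
//     new LinkExtractor(),
//     new CrawlStorage(config.crawlId)
//   );
//   const metadata = await orchestrator.run();
//   console.error(`Crawled ${metadata.stats.crawled} pages in ${metadata.duration} ms`);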