// BaseCrawler.ts
import { ICrawler } from '../interfaces/ICrawler';
import { IJobManager } from '../interfaces/IJobManager';
import { ILinkExtractor } from '../interfaces/ILinkExtractor';
import { IDocumentProcessor } from '../interfaces/IDocumentProcessor';
import { IRobotsTxtService } from '../interfaces/IRobotsTxtService';
import { IRateLimiter } from '../interfaces/IRateLimiter';
import { IUrlQueue } from '../interfaces/IUrlQueue';
import { CrawlOptions, CrawlProgress, CrawlerState } from '../interfaces/types';
import { LoggingUtils } from '../utils/LoggingUtils';
/**
* Abstract base class for crawler implementations
* Provides common functionality and state management for all crawlers
*/
export abstract class BaseCrawler implements ICrawler {
protected state: CrawlerState = CrawlerState.IDLE;
protected options: CrawlOptions;
protected logger = LoggingUtils.createTaggedLogger('crawler');
protected currentJobId: string | null = null;
/**
* Constructor for the base crawler
*
* @param urlQueue Queue for managing URLs to be crawled
* @param jobManager Service for managing crawl jobs
* @param linkExtractor Service for extracting links from pages
* @param documentProcessor Service for processing and storing documents
* @param robotsTxtService Service for handling robots.txt rules
* @param rateLimiter Service for rate limiting requests
* @param options Default crawl options
*/
constructor(
protected readonly urlQueue: IUrlQueue,
protected readonly jobManager: IJobManager,
protected readonly linkExtractor: ILinkExtractor,
protected readonly documentProcessor: IDocumentProcessor,
protected readonly robotsTxtService: IRobotsTxtService,
protected readonly rateLimiter: IRateLimiter,
options: Partial<CrawlOptions> = {}
) {
    // Merge caller-supplied options over the defaults
    this.options = {
      maxDepth: 3,
      baseUrl: '',
      rateLimit: 1000, // delay between requests, in milliseconds
      respectRobotsTxt: true,
      userAgent: 'DocMCP Crawler/1.0',
      timeout: 30000, // per-request timeout, in milliseconds
      maxRedirects: 5,
      reuseCachedContent: true,
      cacheExpiry: 7, // cache lifetime (unit not defined here; presumably days)
      concurrency: 1, // number of simultaneous requests
      ...options
    };
}
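  // Note: because of the spread above, a partial override such as
  //   { maxDepth: 5, userAgent: 'MyBot/1.0' }
  // keeps every other default (timeout, concurrency, ...) intact.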
/**
* Main method to start the crawling process
* Implementation to be provided by subclasses
*
* @param jobId The ID of the job in the database
* @param startUrl The URL to start crawling from
* @param options Configuration options for the crawl
*/
abstract crawl(jobId: string, startUrl: string, options: CrawlOptions): Promise<void>;
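  // A concrete implementation of `crawl` might follow the sketch below.
  // This is illustrative only: `enqueue`/`dequeue` on IUrlQueue, `isAllowed`
  // on IRobotsTxtService and `acquire` on IRateLimiter are assumed method
  // names, not confirmed by the interfaces imported in this file.
  //
  //   async crawl(jobId: string, startUrl: string, options: CrawlOptions): Promise<void> {
  //     this.currentJobId = jobId;
  //     this.state = CrawlerState.CRAWLING;
  //     await this.urlQueue.enqueue(startUrl, 0);
  //     while (this.urlQueue.size() > 0 && this.state === CrawlerState.CRAWLING) {
  //       const { url, depth } = await this.urlQueue.dequeue();
  //       if (options.respectRobotsTxt && !(await this.robotsTxtService.isAllowed(url))) continue;
  //       await this.rateLimiter.acquire();
  //       // ...fetch the page, process it with documentProcessor,
  //       // extract and enqueue links with linkExtractor up to options.maxDepth...
  //       await this.updateJobProgress(jobId);
  //     }
  //     await this.markJobCompleted(jobId);
  //   }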
/**
* Initialize the crawler
* Sets up required services and state
*/
async initialize(): Promise<void> {
this.logger.info('Initializing crawler');
this.state = CrawlerState.INITIALIZING;
try {
// Perform any necessary initialization
this.state = CrawlerState.IDLE;
this.logger.info('Crawler initialized');
} catch (error) {
this.state = CrawlerState.ERROR;
this.logger.error('Failed to initialize crawler', error as Error);
throw error;
}
}
/**
* Stop the current crawling process
* Cannot be resumed after stopping
*/
async stop(): Promise<void> {
if (this.state === CrawlerState.CRAWLING || this.state === CrawlerState.PAUSED) {
this.logger.info('Stopping crawler');
this.state = CrawlerState.STOPPING;
// If we have an active job, mark it as cancelled
if (this.currentJobId) {
await this.jobManager.cancelJob(this.currentJobId, 'Crawl process stopped manually');
}
this.state = CrawlerState.IDLE;
this.currentJobId = null;
this.logger.info('Crawler stopped');
} else {
this.logger.warn(`Cannot stop crawler in state: ${this.state}`);
}
}
/**
* Pause the current crawling process
* Can be resumed later
*/
async pause(): Promise<void> {
if (this.state === CrawlerState.CRAWLING) {
this.logger.info('Pausing crawler');
this.state = CrawlerState.PAUSED;
// If we have an active job, mark it as paused
if (this.currentJobId) {
await this.jobManager.pauseJob(this.currentJobId);
}
this.logger.info('Crawler paused');
} else {
this.logger.warn(`Cannot pause crawler in state: ${this.state}`);
}
}
/**
* Resume a previously paused crawling process
*/
async resume(): Promise<void> {
if (this.state === CrawlerState.PAUSED) {
this.logger.info('Resuming crawler');
this.state = CrawlerState.CRAWLING;
// If we have an active job, mark it as resumed
if (this.currentJobId) {
await this.jobManager.resumeJob(this.currentJobId);
}
this.logger.info('Crawler resumed');
// Actual resumption logic should be handled by the concrete implementation
} else {
this.logger.warn(`Cannot resume crawler in state: ${this.state}`);
}
}
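  // Taken together, stop/pause/resume enforce this state machine:
  //   CRAWLING -> PAUSED    (pause)
  //   PAUSED   -> CRAWLING  (resume)
  //   CRAWLING | PAUSED -> STOPPING -> IDLE  (stop; not resumable)
  // Calls made in any other state are rejected with a warning.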
/**
* Get the current progress of the crawling process
*
* @returns Progress information including completion percentage and stats
*/
async getProgress(): Promise<CrawlProgress> {
const totalUrls = this.urlQueue.size() + this.urlQueue.visitedCount();
const crawledUrls = this.urlQueue.visitedCount();
const pendingUrls = this.urlQueue.size();
    // Calculate completion as a percentage, clamped so it never exceeds 100%
    const percentage = totalUrls > 0 ? Math.min((crawledUrls / totalUrls) * 100, 100) : 0;
    return {
      totalUrls,
      crawledUrls,
      pendingUrls,
      skippedUrls: 0, // Not tracked at this level; subclasses need to count skips themselves
      progress: percentage,
      percentage // kept in sync with `progress`; both report the same clamped value
    };
}
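  // For example, with 30 visited URLs and 70 still queued, totalUrls is 100
  // and both progress and percentage report 30.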
/**
* Update job progress in the database
*
* @param jobId The ID of the job to update
*/
protected async updateJobProgress(jobId: string): Promise<void> {
if (!jobId) return;
const progress = await this.getProgress();
// Update the job progress
await this.jobManager.updateProgress(jobId, progress.progress, {
pagesProcessed: progress.crawledUrls,
pagesSkipped: progress.skippedUrls,
totalChunks: 0 // Unknown at this point
});
}
/**
* Handle completion of a job
*
* @param jobId The ID of the completed job
*/
protected async markJobCompleted(jobId: string): Promise<void> {
if (!jobId) return;
const progress = await this.getProgress();
// Mark the job as completed
await this.jobManager.markJobCompleted(jobId, {
pagesProcessed: progress.crawledUrls,
pagesSkipped: progress.skippedUrls,
totalChunks: 0 // Will be updated by document processor
});
this.currentJobId = null;
this.state = CrawlerState.IDLE;
this.logger.info(`Job ${jobId} completed`);
}
/**
* Handle failure of a job
*
* @param jobId The ID of the failed job
* @param error The error that caused the failure
*/
protected async markJobFailed(jobId: string, error: Error | string): Promise<void> {
if (!jobId) return;
const progress = await this.getProgress();
const errorMessage = error instanceof Error ? error.message : error;
// Mark the job as failed
await this.jobManager.markJobFailed(jobId, errorMessage, {
pagesProcessed: progress.crawledUrls,
pagesSkipped: progress.skippedUrls,
totalChunks: 0,
errors: [errorMessage]
});
this.currentJobId = null;
this.state = CrawlerState.ERROR;
this.logger.error(`Job ${jobId} failed: ${errorMessage}`);
}
}
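
// Typical usage (a minimal sketch; `MyDocsCrawler` and the injected service
// instances are hypothetical placeholders for a concrete setup):
//
//   const crawler = new MyDocsCrawler(
//     urlQueue, jobManager, linkExtractor,
//     documentProcessor, robotsTxtService, rateLimiter,
//     { maxDepth: 2, concurrency: 2 }
//   );
//   await crawler.initialize();
//   await crawler.crawl(jobId, 'https://example.com/docs', options);
//   console.log(await crawler.getProgress());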