BaseCrawler.ts
import { ICrawler } from '../interfaces/ICrawler';
import { IJobManager } from '../interfaces/IJobManager';
import { ILinkExtractor } from '../interfaces/ILinkExtractor';
import { IDocumentProcessor } from '../interfaces/IDocumentProcessor';
import { IRobotsTxtService } from '../interfaces/IRobotsTxtService';
import { IRateLimiter } from '../interfaces/IRateLimiter';
import { IUrlQueue } from '../interfaces/IUrlQueue';
import { CrawlOptions, CrawlProgress, CrawlerState } from '../interfaces/types';
import { LoggingUtils } from '../utils/LoggingUtils';

/**
 * Abstract base class for crawler implementations
 * Provides common functionality and state management for all crawlers
 */
export abstract class BaseCrawler implements ICrawler {
  protected state: CrawlerState = CrawlerState.IDLE;
  protected options: CrawlOptions;
  protected logger = LoggingUtils.createTaggedLogger('crawler');
  protected currentJobId: string | null = null;

  /**
   * Constructor for the base crawler
   *
   * @param urlQueue Queue for managing URLs to be crawled
   * @param jobManager Service for managing crawl jobs
   * @param linkExtractor Service for extracting links from pages
   * @param documentProcessor Service for processing and storing documents
   * @param robotsTxtService Service for handling robots.txt rules
   * @param rateLimiter Service for rate limiting requests
   * @param options Default crawl options
   */
  constructor(
    protected readonly urlQueue: IUrlQueue,
    protected readonly jobManager: IJobManager,
    protected readonly linkExtractor: ILinkExtractor,
    protected readonly documentProcessor: IDocumentProcessor,
    protected readonly robotsTxtService: IRobotsTxtService,
    protected readonly rateLimiter: IRateLimiter,
    options: Partial<CrawlOptions> = {}
  ) {
    // Set default options
    this.options = {
      maxDepth: 3,
      baseUrl: '',
      rateLimit: 1000,
      respectRobotsTxt: true,
      userAgent: 'DocMCP Crawler/1.0',
      timeout: 30000,
      maxRedirects: 5,
      reuseCachedContent: true,
      cacheExpiry: 7,
      concurrency: 1,
      ...options
    };
  }

  /**
   * Main method to start the crawling process
   * Implementation to be provided by subclasses
   *
   * @param jobId The ID of the job in the database
   * @param startUrl The URL to start crawling from
   * @param options Configuration options for the crawl
   */
  abstract crawl(jobId: string, startUrl: string, options: CrawlOptions): Promise<void>;

  /**
   * Initialize the crawler
   * Sets up required services and state
   */
  async initialize(): Promise<void> {
    this.logger.info('Initializing crawler');
    this.state = CrawlerState.INITIALIZING;

    try {
      // Perform any necessary initialization
      this.state = CrawlerState.IDLE;
      this.logger.info('Crawler initialized');
    } catch (error) {
      this.state = CrawlerState.ERROR;
      this.logger.error('Failed to initialize crawler', error as Error);
      throw error;
    }
  }

  /**
   * Stop the current crawling process
   * Cannot be resumed after stopping
   */
  async stop(): Promise<void> {
    if (this.state === CrawlerState.CRAWLING || this.state === CrawlerState.PAUSED) {
      this.logger.info('Stopping crawler');
      this.state = CrawlerState.STOPPING;

      // If we have an active job, mark it as cancelled
      if (this.currentJobId) {
        await this.jobManager.cancelJob(this.currentJobId, 'Crawl process stopped manually');
      }

      this.state = CrawlerState.IDLE;
      this.currentJobId = null;
      this.logger.info('Crawler stopped');
    } else {
      this.logger.warn(`Cannot stop crawler in state: ${this.state}`);
    }
  }

  /**
   * Pause the current crawling process
   * Can be resumed later
   */
  async pause(): Promise<void> {
    if (this.state === CrawlerState.CRAWLING) {
      this.logger.info('Pausing crawler');
      this.state = CrawlerState.PAUSED;

      // If we have an active job, mark it as paused
      if (this.currentJobId) {
        await this.jobManager.pauseJob(this.currentJobId);
      }

      this.logger.info('Crawler paused');
    } else {
      this.logger.warn(`Cannot pause crawler in state: ${this.state}`);
    }
  }

  /**
   * Resume a previously paused crawling process
   */
  async resume(): Promise<void> {
    if (this.state === CrawlerState.PAUSED) {
      this.logger.info('Resuming crawler');
      this.state = CrawlerState.CRAWLING;

      // If we have an active job, mark it as resumed
      if (this.currentJobId) {
        await this.jobManager.resumeJob(this.currentJobId);
      }

      this.logger.info('Crawler resumed');
      // Actual resumption logic should be handled by the concrete implementation
    } else {
      this.logger.warn(`Cannot resume crawler in state: ${this.state}`);
    }
  }

  /**
   * Get the current progress of the crawling process
   *
   * @returns Progress information including completion percentage and stats
   */
  async getProgress(): Promise<CrawlProgress> {
    const totalUrls = this.urlQueue.size() + this.urlQueue.visitedCount();
    const crawledUrls = this.urlQueue.visitedCount();
    const pendingUrls = this.urlQueue.size();

    // Calculate progress as a percentage
    const progress = totalUrls > 0 ? (crawledUrls / totalUrls) * 100 : 0;

    return {
      totalUrls,
      crawledUrls,
      pendingUrls,
      skippedUrls: 0, // This would need to be tracked separately
      progress: Math.min(progress, 100), // Ensure we don't exceed 100%
      percentage: progress
    };
  }

  /**
   * Update job progress in the database
   *
   * @param jobId The ID of the job to update
   */
  protected async updateJobProgress(jobId: string): Promise<void> {
    if (!jobId) return;

    const progress = await this.getProgress();

    // Update the job progress
    await this.jobManager.updateProgress(jobId, progress.progress, {
      pagesProcessed: progress.crawledUrls,
      pagesSkipped: progress.skippedUrls,
      totalChunks: 0 // Unknown at this point
    });
  }

  /**
   * Handle completion of a job
   *
   * @param jobId The ID of the completed job
   */
  protected async markJobCompleted(jobId: string): Promise<void> {
    if (!jobId) return;

    const progress = await this.getProgress();

    // Mark the job as completed
    await this.jobManager.markJobCompleted(jobId, {
      pagesProcessed: progress.crawledUrls,
      pagesSkipped: progress.skippedUrls,
      totalChunks: 0 // Will be updated by document processor
    });

    this.currentJobId = null;
    this.state = CrawlerState.IDLE;
    this.logger.info(`Job ${jobId} completed`);
  }

  /**
   * Handle failure of a job
   *
   * @param jobId The ID of the failed job
   * @param error The error that caused the failure
   */
  protected async markJobFailed(jobId: string, error: Error | string): Promise<void> {
    if (!jobId) return;

    const progress = await this.getProgress();
    const errorMessage = error instanceof Error ? error.message : error;

    // Mark the job as failed
    await this.jobManager.markJobFailed(jobId, errorMessage, {
      pagesProcessed: progress.crawledUrls,
      pagesSkipped: progress.skippedUrls,
      totalChunks: 0,
      errors: [errorMessage]
    });

    this.currentJobId = null;
    this.state = CrawlerState.ERROR;
    this.logger.error(`Job ${jobId} failed: ${errorMessage}`);
  }
}
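For context, here is a minimal sketch of how a concrete subclass might implement the abstract crawl() method using the injected services. This class is not part of the repository: SimpleCrawler, the fetchPage helper, and every interface method called below (urlQueue.add/next, robotsTxtService.isAllowed, rateLimiter.wait, documentProcessor.process, linkExtractor.extractLinks) are assumed names for illustration, since only the interface imports are visible in this file.

// Hypothetical subclass, for illustration only — not part of BaseCrawler.ts.
import { BaseCrawler } from './BaseCrawler';
import { CrawlOptions, CrawlerState } from '../interfaces/types';

export class SimpleCrawler extends BaseCrawler {
  async crawl(jobId: string, startUrl: string, options: CrawlOptions): Promise<void> {
    this.currentJobId = jobId;
    this.state = CrawlerState.CRAWLING;
    await this.urlQueue.add(startUrl, 0); // assumed IUrlQueue signature

    try {
      // Re-check this.state each iteration so pause()/stop() take effect
      // between page fetches
      while (this.urlQueue.size() > 0 && this.state === CrawlerState.CRAWLING) {
        const { url, depth } = await this.urlQueue.next(); // assumed

        // Skip URLs disallowed by robots.txt when the option is enabled
        if (options.respectRobotsTxt && !(await this.robotsTxtService.isAllowed(url))) {
          continue;
        }

        await this.rateLimiter.wait(); // assumed: throttles per options.rateLimit

        const html = await this.fetchPage(url);
        await this.documentProcessor.process(url, html); // assumed

        // Enqueue outgoing links until maxDepth is reached
        if (depth < options.maxDepth) {
          for (const link of await this.linkExtractor.extractLinks(html, url)) { // assumed
            await this.urlQueue.add(link, depth + 1);
          }
        }

        await this.updateJobProgress(jobId);
      }

      // Only mark the job completed if the queue drained while still crawling;
      // pause()/stop() transitions are handled by the base class
      if (this.state === CrawlerState.CRAWLING) {
        await this.markJobCompleted(jobId);
      }
    } catch (error) {
      await this.markJobFailed(jobId, error as Error);
    }
  }

  // Hypothetical helper; relies on the global fetch and AbortSignal.timeout
  // available in Node 18+
  private async fetchPage(url: string): Promise<string> {
    const response = await fetch(url, {
      headers: { 'User-Agent': this.options.userAgent },
      signal: AbortSignal.timeout(this.options.timeout)
    });
    return response.text();
  }
}

The protected fields and helpers of BaseCrawler (state, currentJobId, options, updateJobProgress, markJobCompleted, markJobFailed) do the bookkeeping, so a subclass only needs to supply the fetch-and-extract loop itself.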
