// crawler.service.ts
import * as cheerio from 'cheerio';
import axios from 'axios';
import { URL } from 'url';
import { Prisma, PrismaClient } from '../generated/prisma';
import logger from '../utils/logger';
import { DocumentService } from './document.service';
import { getPrismaClient as getMainPrismaClient } from '../config/database';
import robotsParser from 'robots-parser';
import { LinkExtractor } from '../utils/link-extractor';
import { PackageMetadata } from './document.service';
interface CrawlOptions {
maxDepth: number;
baseUrl: string;
rateLimit?: number; // milliseconds between requests
respectRobotsTxt?: boolean; // whether to respect robots.txt rules
randomDelay?: boolean; // whether to add random delay between requests
minDelay?: number; // minimum delay in milliseconds
maxDelay?: number; // maximum delay in milliseconds
userAgent?: string; // user agent to use for requests
}
// Common user agents for spoofing
const USER_AGENTS = {
GOOGLEBOT: 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
CHROME: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
SAFARI: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
FIREFOX: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
};
interface CrawlResult {
url: string;
title: string;
content: string;
metadata: {
package?: string;
version?: string;
type?: string;
tags?: string[];
};
level: number;
links: string[];
}
export class CrawlerService {
private visitedUrls: Set<string> = new Set();
private urlQueue: { url: string; depth: number }[] = [];
private documentService: DocumentService;
private prisma: PrismaClient;
  private robotsTxt: ReturnType<typeof robotsParser> | null = null;
// Track errors for reporting
private errorCount: number = 0;
private pagesSkipped: number = 0;
private pagesProcessed: number = 0;
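  // The Prisma client can be injected (e.g. a test double); otherwise the
  // shared client from config/database is used.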
constructor(prismaClient?: PrismaClient) {
this.prisma = prismaClient || getMainPrismaClient();
this.documentService = new DocumentService(this.prisma);
}
/**
* Initialize a crawl job for a documentation site
*/
async crawl(jobId: string, startUrl: string, options: Partial<CrawlOptions> = {}) {
const defaultOptions: CrawlOptions = {
maxDepth: 3,
baseUrl: new URL(startUrl).origin,
      rateLimit: 1000, // fixed 1 second delay; only used when randomDelay is disabled
respectRobotsTxt: true, // respect robots.txt by default
randomDelay: true, // use random delays between requests
minDelay: 1500, // minimum 1.5 seconds between requests
maxDelay: 5000, // maximum 5 seconds between requests
userAgent: USER_AGENTS.GOOGLEBOT, // default to Googlebot user agent
};
const crawlOptions = { ...defaultOptions, ...options };
this.urlQueue = [{ url: startUrl, depth: 0 }];
this.visitedUrls.clear();
// Reset counters
this.errorCount = 0;
this.pagesSkipped = 0;
this.pagesProcessed = 0;
// Log the crawl configuration
logger.info(`Starting crawl with options:`, {
maxDepth: crawlOptions.maxDepth,
baseUrl: crawlOptions.baseUrl,
rateLimit: crawlOptions.rateLimit,
respectRobotsTxt: crawlOptions.respectRobotsTxt,
randomDelay: crawlOptions.randomDelay,
userAgent: crawlOptions.userAgent
});
// Load robots.txt if enabled
if (crawlOptions.respectRobotsTxt) {
await this.loadRobotsTxt(crawlOptions.baseUrl);
}
try {
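      // Breadth-first traversal: the queue is seeded with the start URL at
      // depth 0, and links found on each page are enqueued at depth + 1
      // until maxDepth is exceeded.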
while (this.urlQueue.length > 0) {
// Check if job should be cancelled
const jobStatus = await this.prisma.job.findUnique({
where: { id: jobId },
select: { shouldCancel: true, shouldPause: true }
});
if (jobStatus?.shouldCancel) {
logger.info(`Job ${jobId} was cancelled. Stopping crawl.`);
await this.prisma.job.update({
where: { id: jobId },
data: {
status: 'cancelled',
endDate: new Date(),
progress: this.visitedUrls.size / (this.visitedUrls.size + this.urlQueue.length),
},
});
return; // Exit early
}
if (jobStatus?.shouldPause) {
logger.info(`Job ${jobId} was paused. Stopping crawl.`);
await this.prisma.job.update({
where: { id: jobId },
data: {
status: 'paused',
progress: this.visitedUrls.size / (this.visitedUrls.size + this.urlQueue.length),
},
});
return; // Exit early
}
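        // The non-null assertion is safe: the while condition guarantees the
        // queue is non-empty at this point.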
const { url, depth } = this.urlQueue.shift()!;
if (depth > crawlOptions.maxDepth) {
continue;
}
if (this.visitedUrls.has(url)) {
continue;
}
// Check for recent existing document
const copiedDocument = await this.findAndCopyRecentDocument(url, depth, jobId);
if (copiedDocument) {
logger.info(`Reused recent document data for ${url} (New ID: ${copiedDocument.id})`);
this.visitedUrls.add(url);
this.pagesProcessed++; // Count copied documents as processed
// Extract links from the copied content using the LinkExtractor utility
const links = LinkExtractor.extractAllLinks(copiedDocument.content, crawlOptions.baseUrl, url);
logger.debug(`Extracted ${links.length} links from copied content for ${url}`);
// Queue new URLs from copied content
for (const link of links) {
if (!this.visitedUrls.has(link)) {
this.urlQueue.push({ url: link, depth: depth + 1 });
}
}
await this.updateJobProgress(jobId); // Update progress
await this.applyDelay(crawlOptions); // Apply delay even after copying
continue; // Skip fetching and processing this URL
}
// Check robots.txt rules if enabled
        if (crawlOptions.respectRobotsTxt && this.robotsTxt && !this.isAllowedByRobotsTxt(url, crawlOptions.userAgent)) {
logger.info(`Skipping ${url} (disallowed by robots.txt)`);
this.pagesSkipped++;
continue;
}
try {
logger.info(`Crawling: ${url} (depth: ${depth})`);
const result = await this.crawlPage(url, depth, crawlOptions);
this.visitedUrls.add(url);
this.pagesProcessed++;
// Create document record, including the jobId
          // Prefer explicit package info from the job's metadata; any
          // page-level detection is carried along inside result.metadata.
          const documentPackageInfo = await this.extractPackageInfoFromJob(jobId);
await this.documentService.createDocument({
url: result.url,
title: result.title,
content: result.content,
metadata: result.metadata as Prisma.InputJsonValue,
crawlDate: new Date(),
level: result.level,
jobId: jobId,
packageInfo: documentPackageInfo,
});
// Queue new URLs
for (const link of result.links) {
if (!this.visitedUrls.has(link)) {
this.urlQueue.push({ url: link, depth: depth + 1 });
}
}
// Update job progress and stats
await this.updateJobProgress(jobId);
// Apply delay between requests
await this.applyDelay(crawlOptions);
} catch (error) {
// Handle specific error types
if (axios.isAxiosError(error)) {
// Mark URL as visited to prevent retries
this.visitedUrls.add(url);
this.errorCount++;
this.pagesSkipped++;
if (error.response) {
// Server responded with an error status code
const statusCode = error.response.status;
if (statusCode === 404) {
logger.warn(`Page not found (404): ${url} - Skipping and continuing.`);
} else if (statusCode === 403 || statusCode === 401) {
logger.warn(`Access denied (${statusCode}): ${url} - Skipping and continuing.`);
} else if (statusCode >= 500) {
logger.warn(`Server error (${statusCode}): ${url} - Skipping and continuing.`);
} else {
logger.warn(`HTTP error (${statusCode}): ${url} - Skipping and continuing.`);
}
// Update job with warning but continue crawling
await this.prisma.job.update({
where: { id: jobId },
data: {
error: `${this.errorCount} errors during crawling. Latest: HTTP ${statusCode} at ${url}`,
errorCount: this.errorCount,
lastError: new Date(),
itemsFailed: this.errorCount,
itemsSkipped: this.pagesSkipped,
}
});
} else if (error.request) {
// Request was made but no response received (network error)
logger.warn(`Network error for ${url} - Skipping and continuing.`);
// Update job with warning but continue crawling
await this.prisma.job.update({
where: { id: jobId },
data: {
error: `${this.errorCount} errors during crawling. Latest: Network error at ${url}`,
errorCount: this.errorCount,
lastError: new Date(),
itemsFailed: this.errorCount,
itemsSkipped: this.pagesSkipped,
}
});
} else {
// Something else went wrong
logger.warn(`Error crawling ${url}: ${error.message} - Skipping and continuing.`);
// Update job with warning but continue crawling
await this.prisma.job.update({
where: { id: jobId },
data: {
error: `${this.errorCount} errors during crawling. Latest: ${error.message} at ${url}`,
errorCount: this.errorCount,
lastError: new Date(),
itemsFailed: this.errorCount,
itemsSkipped: this.pagesSkipped,
}
});
}
} else {
// Non-Axios error - likely a parsing error or other issue
this.visitedUrls.add(url);
this.errorCount++;
this.pagesSkipped++;
logger.warn(`Error processing ${url}: ${error} - Skipping and continuing.`);
// Update job with warning but continue crawling
await this.prisma.job.update({
where: { id: jobId },
data: {
error: `${this.errorCount} errors during crawling. Latest: Processing error at ${url}: ${error}`,
errorCount: this.errorCount,
lastError: new Date(),
itemsFailed: this.errorCount,
itemsSkipped: this.pagesSkipped,
}
});
}
// Update job progress despite error
await this.updateJobProgress(jobId);
// Add delay after error before continuing
await this.applyDelay(crawlOptions);
// Continue with next URL - do NOT re-throw the error
}
}
// The crawl completed normally
logger.info(`Crawl finished successfully for job ${jobId}.`);
} catch (error) {
// This should only catch unexpected errors outside the URL processing loop
logger.error('Unexpected error during crawl:', error);
// We'll mark the job as failed in the finally block
} finally {
// Ensure the job is always marked with final status
logger.info(`Crawl finished for job ${jobId}. Updating status.`);
try {
// Check if there were too many errors
        const failureThreshold = 0.75; // mark the job failed if at least 75% of pages errored
const totalPages = this.pagesProcessed + this.pagesSkipped;
const errorRate = totalPages > 0 ? this.errorCount / totalPages : 0;
// Determine final status
let finalStatus: 'completed' | 'failed';
if (this.errorCount > 0 && errorRate >= failureThreshold) {
finalStatus = 'failed';
logger.warn(`Job ${jobId} marked as failed due to high error rate (${errorRate.toFixed(2)})`);
} else {
finalStatus = 'completed';
logger.info(`Job ${jobId} completed with ${this.errorCount} errors out of ${totalPages} pages`);
}
await this.prisma.job.update({
where: { id: jobId },
data: {
status: finalStatus,
endDate: new Date(),
// Set progress to 1 if completed
progress: 1,
// Update final statistics
stats: {
pagesProcessed: this.pagesProcessed,
pagesSkipped: this.pagesSkipped,
totalChunks: this.pagesProcessed,
errorCount: this.errorCount,
errorRate: errorRate
},
itemsProcessed: this.pagesProcessed,
itemsSkipped: this.pagesSkipped,
itemsFailed: this.errorCount,
itemsTotal: this.pagesProcessed + this.pagesSkipped
},
});
logger.info(`Job ${jobId} status updated to ${finalStatus}.`);
} catch (updateError) {
logger.error(`Failed to update final job status for job ${jobId}:`, updateError);
}
}
}
/**
* Update job progress and statistics
*/
private async updateJobProgress(jobId: string): Promise<void> {
try {
const totalUrls = this.visitedUrls.size + this.urlQueue.length;
const progress = totalUrls > 0 ? this.visitedUrls.size / totalUrls : 0;
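      // Progress approximates visited / (visited + still queued); the
      // denominator grows as new links are discovered, so the value can
      // temporarily dip while the frontier expands.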
// Calculate time estimates
const job = await this.prisma.job.findUnique({
where: { id: jobId },
select: { startDate: true }
});
let timeElapsed = 0;
let timeRemaining = 0;
if (job) {
timeElapsed = Math.floor((Date.now() - job.startDate.getTime()) / 1000); // in seconds
// Estimate remaining time based on progress
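        // Linear extrapolation: if reaching `progress` took `timeElapsed`
        // seconds, the projected total is timeElapsed / progress, so the
        // remainder is (timeElapsed / progress) - timeElapsed.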
if (progress > 0) {
timeRemaining = Math.floor((timeElapsed / progress) - timeElapsed);
}
}
// Calculate estimated completion time
const estimatedCompletion = timeRemaining > 0
? new Date(Date.now() + (timeRemaining * 1000))
: undefined;
await this.prisma.job.update({
where: { id: jobId },
data: {
progress,
stats: {
pagesProcessed: this.pagesProcessed,
pagesSkipped: this.pagesSkipped,
totalChunks: this.pagesProcessed,
errorCount: this.errorCount
},
itemsProcessed: this.pagesProcessed,
itemsSkipped: this.pagesSkipped,
itemsFailed: this.errorCount,
itemsTotal: this.pagesProcessed + this.pagesSkipped + this.urlQueue.length,
timeElapsed,
timeRemaining,
estimatedCompletion,
lastActivity: new Date()
},
});
} catch (error) {
logger.warn(`Failed to update job progress for job ${jobId}:`, error);
// Non-critical error, we can continue
}
}
/**
* Apply delay between requests based on the configured options
*/
private async applyDelay(options: CrawlOptions): Promise<void> {
let delay = options.rateLimit || 1000;
if (options.randomDelay && options.minDelay !== undefined && options.maxDelay !== undefined) {
// Calculate a random delay between minDelay and maxDelay
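      // Math.random() is scaled to [0, max - min + 1) and floored, giving an
      // integer offset in [0, max - min]; adding min yields [min, max].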
delay = Math.floor(Math.random() * (options.maxDelay - options.minDelay + 1)) + options.minDelay;
logger.debug(`Applying random delay of ${delay}ms before next request`);
} else {
logger.debug(`Applying fixed delay of ${delay}ms before next request`);
}
await new Promise(resolve => setTimeout(resolve, delay));
}
/**
* Load and parse robots.txt file from a domain
*/
private async loadRobotsTxt(baseUrl: string): Promise<void> {
try {
const robotsUrl = new URL('/robots.txt', baseUrl).toString();
logger.info(`Loading robots.txt from ${robotsUrl}`);
const response = await axios.get(robotsUrl, { timeout: 5000 });
if (response.status === 200) {
this.robotsTxt = robotsParser(robotsUrl, response.data);
logger.info(`Robots.txt successfully loaded from ${robotsUrl}`);
} else {
logger.warn(`Failed to load robots.txt from ${robotsUrl} - status: ${response.status}`);
this.robotsTxt = null;
}
} catch (error) {
logger.warn(`Error loading robots.txt: ${error}`);
this.robotsTxt = null;
}
}
/**
* Check if URL is allowed by robots.txt
*/
  private isAllowedByRobotsTxt(url: string, userAgent?: string): boolean {
    if (!this.robotsTxt) {
      return true; // If robots.txt could not be loaded, assume everything is allowed
    }
    // robots-parser returns undefined for URLs it cannot match against any
    // rule group; treat only an explicit disallow as blocking.
    return this.robotsTxt.isAllowed(url, userAgent ?? 'Googlebot') !== false;
  }
/**
* Crawl a single page and extract its content
*/
private async crawlPage(url: string, depth: number, options: CrawlOptions): Promise<CrawlResult> {
// Configure request headers with user agent
const headers = {
'User-Agent': options.userAgent || USER_AGENTS.GOOGLEBOT,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0',
};
const response = await axios.get(url, {
headers,
timeout: 15000, // 15 second timeout
maxRedirects: 5 // Maximum 5 redirects
});
const $ = cheerio.load(response.data);
// Remove script tags, style tags, and comments
$('script, style').remove();
$('*').contents().filter((_: number, el: any) => el.type === 'comment').remove();
// Extract title
const title = $('title').text().trim() || $('h1').first().text().trim() || url;
// Extract main content
// This is a basic implementation - you might need to adjust selectors based on the site structure
const mainContent = $('main, article, .content, .documentation, #content').first();
const content = mainContent.length ? mainContent.html() || '' : $('body').html() || '';
// Extract links using the LinkExtractor utility
const extractedLinks = LinkExtractor.extractAllLinks(response.data, options.baseUrl, url);
// Detect package information from URL and content
const detectedPackageInfo = this.detectPackageInfoFromContent(url, $);
// Extract metadata
const metadata = {
package: $('meta[name="package"]').attr('content') || detectedPackageInfo?.packageName,
version: $('meta[name="version"]').attr('content') || detectedPackageInfo?.packageVersion,
type: 'documentation',
tags: ['auto-generated'],
// Store the full package info for document creation
detectedPackageInfo: detectedPackageInfo,
};
return {
url,
title,
content,
metadata,
level: depth,
links: extractedLinks,
};
}
  /**
   * Check for a recent document in the DB and, if found, copy its data into
   * a new document record linked to the current job.
   */
private async findAndCopyRecentDocument(url: string, depth: number, jobId: string): Promise<Prisma.DocumentGetPayload<{}> | null> {
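    // Freshness window: a document for the same URL crawled within the last
    // 28 days is copied into a new record instead of being re-fetched.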
const fourWeeksAgo = new Date();
fourWeeksAgo.setDate(fourWeeksAgo.getDate() - 28);
try {
const existingDocument = await this.prisma.document.findFirst({
where: {
url: url,
crawlDate: {
gte: fourWeeksAgo // Check if crawlDate is within the last 4 weeks
}
},
orderBy: {
crawlDate: 'desc' // Get the most recent one if multiple exist
}
});
if (existingDocument) {
logger.debug(`Found recent document for ${url} (ID: ${existingDocument.id}, Crawled: ${existingDocument.crawlDate.toISOString()})`);
// Create a new document record, copying data from the existing one
const newDocumentData: Prisma.DocumentCreateInput = {
url: existingDocument.url,
title: existingDocument.title,
content: existingDocument.content,
metadata: existingDocument.metadata ?? ({} as Prisma.InputJsonValue),
crawlDate: new Date(), // Set crawlDate to now for the new record
level: depth, // Use current depth
job: { connect: { id: jobId } }, // Link to the current job
// We don't copy parentDocumentId or childDocuments relations here
// Processing step will handle content/chunks/embeddings for this new doc
};
const newDocument = await this.documentService.createDocument(newDocumentData);
logger.info(`Created new document ${newDocument.id} by copying data from ${existingDocument.id}`);
return newDocument;
} else {
logger.debug(`No recent existing document found for ${url}`);
return null;
}
} catch (error) {
logger.error(`Error checking for existing document for ${url}:`, error);
return null; // Proceed with normal crawl if DB check fails
}
}
/**
* Extract package information from job metadata
* @param jobId The ID of the job
* @returns Package metadata if available
*/
private async extractPackageInfoFromJob(jobId: string): Promise<PackageMetadata | undefined> {
try {
const job = await this.prisma.job.findUnique({
where: { id: jobId },
select: { metadata: true }
});
if (!job?.metadata || typeof job.metadata !== 'object') {
return undefined;
}
const metadata = job.metadata as Record<string, any>;
// Only return package info if packageName is provided
if (metadata.packageName) {
return {
packageName: metadata.packageName,
packageVersion: metadata.packageVersion || 'latest',
language: metadata.language || 'javascript',
sourceName: metadata.sourceName || 'Job Metadata',
sourceIsOfficial: metadata.sourceIsOfficial === true,
relevanceScore: 0.9 // High relevance for explicitly provided package info
};
}
return undefined;
} catch (error) {
logger.warn(`Error extracting package info from job ${jobId}:`, error);
return undefined;
}
}
/**
* Detect package information from URL patterns and HTML content
* @param url URL of the page
   * @param $ The cheerio instance with the loaded HTML
* @returns Detected package metadata if available
*/
  private detectPackageInfoFromContent(url: string, $: cheerio.CheerioAPI): PackageMetadata | undefined {
// First check for explicit metadata in HTML
const metaPackage = $('meta[name="package"]').attr('content');
const metaVersion = $('meta[name="version"]').attr('content');
// Initialize metadata object
const packageInfo: Partial<PackageMetadata> = {};
// If meta tags provide package info, use it
if (metaPackage) {
packageInfo.packageName = metaPackage;
if (metaVersion) {
packageInfo.packageVersion = metaVersion;
}
packageInfo.sourceName = 'HTML Metadata';
packageInfo.relevanceScore = 0.85; // High confidence for explicit metadata
return packageInfo as PackageMetadata;
}
// Otherwise, try to infer from URL
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname;
// Check common documentation sites
// NPM
if (hostname === 'www.npmjs.com' || hostname === 'npmjs.com') {
const pathParts = pathname.split('/').filter(Boolean);
if (pathParts[0] === 'package' && pathParts.length > 1) {
packageInfo.packageName = pathParts[1];
packageInfo.language = 'javascript';
packageInfo.sourceName = 'NPM';
packageInfo.sourceIsOfficial = true;
packageInfo.relevanceScore = 0.8;
return packageInfo as PackageMetadata;
}
}
// GitHub
if (hostname === 'github.com') {
const pathParts = pathname.split('/').filter(Boolean);
if (pathParts.length >= 2) {
// GitHub URLs are typically github.com/owner/repo
const repoName = pathParts[1];
packageInfo.packageName = repoName;
packageInfo.sourceName = 'GitHub';
packageInfo.relevanceScore = 0.6; // Lower confidence since GitHub repositories might not be packages
// Look for package.json references in the page content to confirm it's a package
if ($('a[href*="package.json"]').length || $('a[href*="setup.py"]').length) {
packageInfo.relevanceScore = 0.7; // Higher confidence if package config files are mentioned
}
return packageInfo as PackageMetadata;
}
}
// PyPI
if (hostname === 'pypi.org') {
const pathParts = pathname.split('/').filter(Boolean);
if (pathParts[0] === 'project' && pathParts.length > 1) {
packageInfo.packageName = pathParts[1];
packageInfo.language = 'python';
packageInfo.sourceName = 'PyPI';
packageInfo.sourceIsOfficial = true;
packageInfo.relevanceScore = 0.8;
return packageInfo as PackageMetadata;
}
}
// Package-specific documentation sites
const commonPackageSites: Record<string, { name: string, language: string }> = {
'reactjs.org': { name: 'react', language: 'javascript' },
'react.dev': { name: 'react', language: 'javascript' },
'angular.io': { name: 'angular', language: 'typescript' },
'vuejs.org': { name: 'vue', language: 'javascript' },
'nextjs.org': { name: 'next', language: 'javascript' },
'expressjs.com': { name: 'express', language: 'javascript' },
'djangoproject.com': { name: 'django', language: 'python' },
'flask.palletsprojects.com': { name: 'flask', language: 'python' },
};
if (commonPackageSites[hostname]) {
const pkgInfo = commonPackageSites[hostname];
packageInfo.packageName = pkgInfo.name;
packageInfo.language = pkgInfo.language;
packageInfo.sourceName = 'Official Website';
packageInfo.sourceIsOfficial = true;
packageInfo.relevanceScore = 0.9; // High confidence for known official sites
return packageInfo as PackageMetadata;
}
// Unable to detect package
return undefined;
} catch (error) {
logger.warn(`Error detecting package info from ${url}:`, error);
return undefined;
}
}
}
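
/*
 * Example usage (a minimal sketch; assumes a `job` row with this ID already
 * exists in the database and that the target site is reachable — the job ID
 * and URL below are placeholders):
 *
 *   const crawler = new CrawlerService();
 *   await crawler.crawl('job-123', 'https://react.dev/learn', {
 *     maxDepth: 2,
 *     randomDelay: true,
 *     userAgent: USER_AGENTS.CHROME,
 *   });
 */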