docmcp

docmcp
src
services
crawler
implementations

CheerioExtractor.ts•7.72 KiB

import axios from 'axios';
import * as cheerio from 'cheerio';
import { IContentExtractor } from '../interfaces/IContentExtractor';
import { ExtractedContent, ExtractionOptions, PageType } from '../interfaces/types';
import { LoggingUtils } from '../utils/LoggingUtils';
import { UrlUtils } from '../utils/UrlUtils';

/**
 * Content extractor implementation using Cheerio for static pages.
 *
 * Fetches the page with axios and parses the returned markup with Cheerio.
 * No JavaScript is executed, so only content present in the raw HTML response
 * is visible to this extractor.
 */
export class CheerioExtractor implements IContentExtractor {
  /**
   * Path suffixes (lowercase) identifying resources that are not crawlable pages.
   * Kept as a static constant so the list is built once rather than per link.
   */
  private static readonly EXCLUDED_EXTENSIONS: readonly string[] = [
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', // Images
    '.css', '.js', '.json', '.xml', '.csv', '.rss', // Data files
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', // Documents
    '.zip', '.tar', '.gz', '.rar', // Archives
    '.mp3', '.mp4', '.avi', '.mkv', '.mov', '.wav', '.ogg', // Media
    '.exe', '.bin', '.iso', '.dmg', // Executables
  ];

  /** Href scheme prefixes (lowercase) that never point at a fetchable page. */
  private static readonly SKIPPED_SCHEMES: readonly string[] = [
    'javascript:',
    'mailto:',
    'tel:',
    'data:',
  ];

  private readonly logger = LoggingUtils.createTaggedLogger('cheerio-extractor');

  /**
   * Check if a URL should be excluded from crawling.
   *
   * Only the path component is matched against the extension list, so a query
   * string or fragment (`/logo.png?v=2`) cannot hide an asset, and a host name
   * that happens to end in an extension-like TLD (`http://example.zip`) is not
   * mistaken for a file.
   *
   * @param url The URL to check
   * @returns True if the URL's path ends with one of the excluded extensions
   */
  private isExcludedUrl(url: string): boolean {
    let path: string;
    try {
      path = new URL(url).pathname;
    } catch {
      // Not an absolute URL: fall back to cutting off the query/fragment by hand.
      const cut = url.search(/[?#]/);
      path = cut === -1 ? url : url.slice(0, cut);
    }

    const lowercasePath = path.toLowerCase();
    return CheerioExtractor.EXCLUDED_EXTENSIONS.some(ext => lowercasePath.endsWith(ext));
  }

  /**
   * Extract content from a URL using Cheerio (HTML parser).
   *
   * @param url The URL to extract content from
   * @param options Configuration options for the extraction (`userAgent`,
   *   `timeout` and `extractLinks` are read here)
   * @returns The extracted content: raw HTML, visible text, metadata and,
   *   when `options.extractLinks` is set, the outgoing links
   * @throws Re-throws any fetch or parse error after logging it; also throws
   *   when the response status is not 200
   */
  async extract(url: string, options: ExtractionOptions = {}): Promise<ExtractedContent> {
    const startTime = Date.now();
    this.logger.debug(`Starting extraction for ${url}`);

    try {
      // Configure request
      const requestOptions = {
        headers: {
          'User-Agent': options.userAgent || 'DocMCP Crawler/1.0 (Cheerio)',
          'Accept': 'text/html,application/xhtml+xml',
          'Accept-Language': 'en-US,en;q=0.9',
        },
        // `||` maps an unset timeout and a timeout of 0 to the 10s default.
        timeout: options.timeout || 10000,
        maxRedirects: 5,
        // Ask for the body as text so a JSON-typed response is not handed to
        // Cheerio as an already-parsed object.
        responseType: 'text' as const,
      };

      // Fetch the page content
      const response = await axios.get<string>(url, requestOptions);
      if (response.status !== 200) {
        throw new Error(`Failed to fetch page content: HTTP ${response.status}`);
      }

      // Parse HTML with Cheerio
      const html = response.data;
      const $ = cheerio.load(html);

      // Use only the first <title>; inline SVGs may carry their own <title>
      // elements whose text would otherwise be concatenated into the page title.
      const title = $('title').first().text().trim() || null;

      // NOTE: extractText removes script/style/noscript/iframe/img nodes from
      // `$`, so the metadata and link passes below see the stripped document.
      const textContent = this.extractText($);
      const metadata = this.extractMetadata($, url);

      // Extract links if requested
      let links: string[] = [];
      if (options.extractLinks) {
        links = this.extractLinks($, url);
      }

      const extractedContent: ExtractedContent = {
        url: url,
        title: title,
        content: html,
        text: textContent,
        metadata: metadata,
        links: links
      };

      const elapsedTime = Date.now() - startTime;
      this.logger.debug(`Extraction completed for ${url} in ${elapsedTime}ms`);

      return extractedContent;
    } catch (error) {
      this.logger.error(`Error extracting content from ${url}: ${error instanceof Error ? error.message : String(error)}`);
      throw error;
    }
  }

  /**
   * Check if this extractor supports the given page type.
   *
   * @param pageType The type of page (static or SPA)
   * @returns True only for `PageType.STATIC`
   */
  supportsPageType(pageType: PageType): boolean {
    return pageType === PageType.STATIC;
  }

  /**
   * Clean up resources.
   * This class holds no persistent connections or handles, so this is a no-op.
   */
  async cleanup(): Promise<void> {
    // No resources to clean up for Cheerio
  }

  /**
   * Extract text content from the page.
   *
   * Side effect: removes `script`, `style`, `noscript`, `iframe` and `img`
   * elements from the document held by `$`.
   *
   * @param $ Cheerio instance
   * @returns Text of `<body>` (or of the whole document when there is no body)
   *   with every run of whitespace collapsed to a single space
   */
  private extractText($: any): string {
    // Remove elements that contribute no meaningful visible text
    $('script, style, noscript, iframe, img').remove();

    // Extract text from body or whole document if no body
    const textContent: string = $('body').length ? $('body').text() : $.text();

    // `\s` already matches newlines, so one pass normalizes all whitespace.
    return textContent.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract metadata from the page.
   *
   * @param $ Cheerio instance
   * @param url Source URL, used for the domain and to resolve the canonical link
   * @returns Metadata object with fixed keys (`domain`, `lastModified`,
   *   `author`, `description`, `keywords`, `language`, `canonicalUrl`) plus one
   *   `og_<name>` entry per Open Graph meta tag that has content
   */
  private extractMetadata($: any, url: string): Record<string, any> {
    const metadata: Record<string, any> = {
      domain: UrlUtils.extractDomain(url),
      lastModified: null,
      author: null,
      description: null,
      keywords: [],
      language: null,
      canonicalUrl: null
    };

    // Extract meta description, falling back to the Open Graph description
    const description = $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content');
    if (description) {
      metadata.description = description.trim();
    }

    // Extract meta keywords as a list of non-empty, trimmed entries
    const keywords = $('meta[name="keywords"]').attr('content');
    if (keywords) {
      metadata.keywords = keywords.split(',').map((k: string) => k.trim()).filter(Boolean);
    }

    // Extract language
    const language = $('html').attr('lang') ||
      $('meta[http-equiv="content-language"]').attr('content');
    if (language) {
      metadata.language = language.trim();
    }

    // Extract canonical URL, resolved against the page URL
    const canonical = $('link[rel="canonical"]').attr('href');
    if (canonical) {
      metadata.canonicalUrl = UrlUtils.resolveUrl(canonical, url);
    }

    // Extract author
    const author = $('meta[name="author"]').attr('content') ||
      $('meta[property="article:author"]').attr('content');
    if (author) {
      metadata.author = author.trim();
    }

    // Extract last modified
    const lastModified = $('meta[http-equiv="last-modified"]').attr('content');
    if (lastModified) {
      metadata.lastModified = lastModified;
    }

    // Extract Open Graph metadata: `og:foo` is stored under the key `og_foo`
    $('meta[property^="og:"]').each((_: number, element: any) => {
      const property = $(element).attr('property');
      const content = $(element).attr('content');
      if (property && content) {
        const key = property.replace('og:', '');
        metadata[`og_${key}`] = content.trim();
      }
    });

    return metadata;
  }

  /**
   * Extract links from the page.
   *
   * Skips in-page anchors (`#...` and `<baseUrl>#...`), hrefs using a scheme
   * listed in `SKIPPED_SCHEMES`, URLs rejected by `UrlUtils.isValid`, and URLs
   * whose path has an excluded file extension. Hrefs that fail to resolve are
   * skipped silently.
   *
   * @param $ Cheerio instance
   * @param baseUrl Base URL for resolving relative links
   * @returns De-duplicated array of absolute URLs
   */
  private extractLinks($: any, baseUrl: string): string[] {
    const links = new Set<string>();
    const samePageAnchorPrefix = baseUrl + '#';

    $('a[href]').each((_: number, element: any) => {
      const rawHref = $(element).attr('href');
      const href = typeof rawHref === 'string' ? rawHref.trim() : '';
      if (!href || href.startsWith('#') || href.startsWith(samePageAnchorPrefix)) {
        return;
      }

      // Schemes are matched case-insensitively.
      const lowerHref = href.toLowerCase();
      if (CheerioExtractor.SKIPPED_SCHEMES.some(scheme => lowerHref.startsWith(scheme))) {
        return;
      }

      try {
        const absoluteUrl = UrlUtils.resolveUrl(href, baseUrl);
        if (UrlUtils.isValid(absoluteUrl) && !this.isExcludedUrl(absoluteUrl)) {
          links.add(absoluteUrl);
        }
      } catch {
        // Invalid or malformed URL, skip it
      }
    });

    return Array.from(links);
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

CheerioExtractor.ts•7.72 KiB