Skip to main content
Glama
CheerioExtractor.ts (7.9 kB)
import axios from 'axios';
import * as cheerio from 'cheerio';
import { IContentExtractor } from '../interfaces/IContentExtractor';
import { ExtractedContent, ExtractionOptions, PageType } from '../interfaces/types';
import { LoggingUtils } from '../utils/LoggingUtils';
import { UrlUtils } from '../utils/UrlUtils';

/**
 * Content extractor implementation using Cheerio for static pages.
 *
 * Fetches the page with a single HTTP GET (axios) and parses the returned
 * markup with Cheerio; no JavaScript on the page is executed.
 */
export class CheerioExtractor implements IContentExtractor {
  /**
   * Lower-case file extensions whose URLs are never returned as links.
   * Hoisted to a static so the list is built once rather than on every call.
   */
  private static readonly excludedExtensions: readonly string[] = [
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', // Images
    '.css', '.js', '.json', '.xml', '.csv', '.rss', // Data files
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', // Documents
    '.zip', '.tar', '.gz', '.rar', // Archives
    '.mp3', '.mp4', '.avi', '.mkv', '.mov', '.wav', '.ogg', // Media
    '.exe', '.bin', '.iso', '.dmg', // Executables
  ];

  /** Lower-case href prefixes that never point at a fetchable page. */
  private static readonly nonCrawlableSchemes: readonly string[] = [
    'javascript:',
    'mailto:',
    'tel:',
    'data:',
  ];

  private readonly logger = LoggingUtils.createTaggedLogger('cheerio-extractor');

  /**
   * Check if a URL should be excluded from crawling because its path ends
   * with one of the excluded file extensions.
   *
   * Only the path component is inspected, so a query string or fragment
   * (`/logo.png?v=2`) does not hide the extension, and a host name that
   * happens to end in an extension-like TLD (`https://example.zip`) is not
   * excluded by mistake.
   *
   * @param url The URL to check
   * @returns True if the URL should be excluded
   */
  private isExcludedUrl(url: string): boolean {
    let path: string;
    try {
      path = new URL(url).pathname;
    } catch {
      // Not parseable as an absolute URL: fall back to cutting off the
      // query string / fragment by hand.
      const cut = url.search(/[?#]/);
      path = cut === -1 ? url : url.slice(0, cut);
    }

    const lowercasePath = path.toLowerCase();
    return CheerioExtractor.excludedExtensions.some(ext => lowercasePath.endsWith(ext));
  }

  /**
   * Extract content from a URL using Cheerio (HTML parser).
   *
   * @param url The URL to extract content from
   * @param options Configuration options for the extraction (`userAgent`,
   *   `timeout` and `extractLinks` are the ones read here)
   * @returns The extracted content; `content` is the raw response body and
   *   `links` is empty unless `options.extractLinks` is set
   * @throws Error when the response status is not 200; any error raised while
   *   fetching or parsing is logged and rethrown unchanged
   */
  async extract(url: string, options: ExtractionOptions = {}): Promise<ExtractedContent> {
    const startTime = Date.now();
    this.logger.debug(`Starting extraction for ${url}`);

    try {
      // Configure request
      const requestOptions = {
        headers: {
          'User-Agent': options.userAgent || 'DocMCP Crawler/1.0 (Cheerio)',
          'Accept': 'text/html,application/xhtml+xml',
          'Accept-Language': 'en-US,en;q=0.9',
        },
        timeout: options.timeout || 10000,
        maxRedirects: 5,
      };

      // Fetch the page content
      const response = await axios.get(url, requestOptions);
      if (response.status !== 200) {
        throw new Error(`Failed to fetch page content: HTTP ${response.status}`);
      }

      // Parse HTML with Cheerio
      const $ = cheerio.load(response.data);

      // Use only the first <title>: inline SVG can carry its own <title>
      // elements, and .text() on the whole match set would concatenate them.
      const title = $('title').first().text().trim() || null;

      // The raw response body is returned untouched as `content`
      const content = response.data;

      // NOTE: extractText() removes script/style/noscript/iframe/img nodes
      // from the shared document, so the calls below see the pruned tree.
      const textContent = this.extractText($);
      const metadata = this.extractMetadata($, url);
      const links = options.extractLinks ? this.extractLinks($, url) : [];

      const extractedContent: ExtractedContent = {
        url,
        title,
        content,
        text: textContent,
        metadata,
        links,
      };

      const elapsedTime = Date.now() - startTime;
      this.logger.debug(`Extraction completed for ${url} in ${elapsedTime}ms`);

      return extractedContent;
    } catch (error) {
      this.logger.error(
        `Error extracting content from ${url}: ${error instanceof Error ? error.message : String(error)}`
      );
      throw error;
    }
  }

  /**
   * Check if this extractor supports the given page type.
   *
   * @param pageType The type of page (static or SPA)
   * @returns True only for `PageType.STATIC`
   */
  supportsPageType(pageType: PageType): boolean {
    return pageType === PageType.STATIC;
  }

  /**
   * Clean up resources.
   * This class holds no persistent connections or resources, so this is a no-op.
   */
  async cleanup(): Promise<void> {
    // Nothing to release; the async function resolves on its own.
  }

  /**
   * Extract the visible text of the page as a single whitespace-normalized line.
   *
   * Side effect: removes `script`, `style`, `noscript`, `iframe` and `img`
   * elements from the document behind `$`.
   *
   * @param $ Cheerio instance
   * @returns Text of `<body>` (or of the whole document when there is no
   *   body) with every run of whitespace collapsed to one space and trimmed
   */
  private extractText($: any): string {
    // Drop elements that contribute no readable text
    $('script, style, noscript, iframe, img').remove();

    // Extract text from body or whole document if no body
    const rawText: string = $('body').length ? $('body').text() : $.text();

    // \s already matches newlines, so a single pass collapses everything.
    return rawText.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract metadata from the page's `<meta>`, `<link>` and `<html>` tags.
   *
   * @param $ Cheerio instance
   * @param url Source URL (used for the domain and to resolve the canonical link)
   * @returns Metadata object with `domain`, `lastModified`, `author`,
   *   `description`, `keywords`, `language`, `canonicalUrl`, plus one
   *   `og_<name>` entry per Open Graph tag that has content
   */
  private extractMetadata($: any, url: string): Record<string, any> {
    const metadata: Record<string, any> = {
      domain: UrlUtils.extractDomain(url),
      lastModified: null,
      author: null,
      description: null,
      keywords: [],
      language: null,
      canonicalUrl: null
    };

    // Description: standard meta tag first, Open Graph as fallback
    const description = $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content');
    if (description) {
      metadata.description = description.trim();
    }

    // Keywords: comma-separated list, empty entries dropped
    const keywords = $('meta[name="keywords"]').attr('content');
    if (keywords) {
      metadata.keywords = keywords.split(',').map((k: string) => k.trim()).filter(Boolean);
    }

    // Language: <html lang> first, content-language meta as fallback
    const language = $('html').attr('lang') ||
      $('meta[http-equiv="content-language"]').attr('content');
    if (language) {
      metadata.language = language.trim();
    }

    // Canonical URL, resolved against the page URL
    const canonical = $('link[rel="canonical"]').attr('href');
    if (canonical) {
      metadata.canonicalUrl = UrlUtils.resolveUrl(canonical, url);
    }

    // Author: standard meta tag first, article:author as fallback
    const author = $('meta[name="author"]').attr('content') ||
      $('meta[property="article:author"]').attr('content');
    if (author) {
      metadata.author = author.trim();
    }

    // Last modified, stored exactly as written in the tag
    const lastModified = $('meta[http-equiv="last-modified"]').attr('content');
    if (lastModified) {
      metadata.lastModified = lastModified;
    }

    // Open Graph tags: "og:image" is stored under the key "og_image"
    $('meta[property^="og:"]').each((_: number, element: any) => {
      const property = $(element).attr('property');
      const content = $(element).attr('content');
      if (property && content) {
        // The selector guarantees the "og:" prefix, so it can be sliced off.
        const key = property.slice('og:'.length);
        metadata[`og_${key}`] = content.trim();
      }
    });

    return metadata;
  }

  /**
   * Decide whether an href is worth resolving into a crawlable link.
   *
   * @param href Trimmed value of an anchor's `href` attribute
   * @param baseUrl URL of the page the anchor was found on
   * @returns False for same-page fragments and for non-page schemes
   */
  private isCrawlableHref(href: string, baseUrl: string): boolean {
    // Same-page anchors, written either relatively or against the page URL
    if (href.startsWith('#') || href.startsWith(baseUrl + '#')) {
      return false;
    }

    // URL schemes are case-insensitive, so compare in lower case
    const lowercaseHref = href.toLowerCase();
    return !CheerioExtractor.nonCrawlableSchemes.some(scheme => lowercaseHref.startsWith(scheme));
  }

  /**
   * Extract links from the page.
   *
   * Skips same-page fragments, `javascript:`/`mailto:`/`tel:`/`data:` hrefs,
   * anything `UrlUtils.isValid` rejects, and URLs whose path ends in an
   * excluded file extension. Duplicates are collapsed.
   *
   * @param $ Cheerio instance
   * @param baseUrl Base URL for resolving relative links
   * @returns Array of unique resolved URLs in document order
   */
  private extractLinks($: any, baseUrl: string): string[] {
    const links = new Set<string>();

    $('a[href]').each((_: number, element: any) => {
      const rawHref: string | undefined = $(element).attr('href');
      const href = rawHref ? rawHref.trim() : '';
      if (!href || !this.isCrawlableHref(href, baseUrl)) {
        return; // continue with the next anchor
      }

      try {
        const absoluteUrl = UrlUtils.resolveUrl(href, baseUrl);
        if (UrlUtils.isValid(absoluteUrl) && !this.isExcludedUrl(absoluteUrl)) {
          links.add(absoluteUrl);
        }
      } catch (error) {
        // Best effort: a malformed href must not abort the whole extraction.
        this.logger.debug(
          `Skipping unresolvable link "${href}" on ${baseUrl}: ${error instanceof Error ? error.message : String(error)}`
        );
      }
    });

    return Array.from(links);
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.