docmcp

CrawlingStrategyFactory.ts•3.6 KiB

import { IContentExtractor } from '../interfaces/IContentExtractor';
import { IPageDetector } from '../interfaces/IPageDetector';
import { StrategyFactoryOptions, PageType } from '../interfaces/types';
import { LoggingUtils } from '../utils/LoggingUtils';

/**
 * Chooses which content extractor should handle a page.
 *
 * Two extractors are injected: one intended for static pages (Cheerio) and one
 * intended for Single Page Applications (Puppeteer). The factory picks between
 * them using, in order of precedence:
 *   1. a strategy forced through `options.forceStrategy`,
 *   2. the result reported by the injected page detector,
 *   3. the Cheerio extractor as a fallback when detection throws.
 */
export class CrawlingStrategyFactory {
  private readonly logger = LoggingUtils.createTaggedLogger('strategy-factory');

  /**
   * Build a factory around a detector and the two candidate extractors.
   *
   * Construction never fails because of a capability mismatch: if an extractor
   * does not report support for the page type it is wired up for, a warning is
   * logged and the extractor is used anyway.
   *
   * @param pageDetector Detector consulted to classify a URL
   * @param cheerioExtractor Extractor returned for static pages
   * @param puppeteerExtractor Extractor returned for SPAs
   * @param options Factory configuration; defaults to an empty object
   */
  constructor(
    private readonly pageDetector: IPageDetector,
    private readonly cheerioExtractor: IContentExtractor,
    private readonly puppeteerExtractor: IContentExtractor,
    private readonly options: StrategyFactoryOptions = {}
  ) {
    // Sanity-check the wiring; a mismatch is only reported, never rejected.
    const cheerioHandlesStatic = cheerioExtractor.supportsPageType(PageType.STATIC);
    if (!cheerioHandlesStatic) {
      this.logger.warn('Cheerio extractor doesn\'t report supporting static pages');
    }

    const puppeteerHandlesSpa = puppeteerExtractor.supportsPageType(PageType.SPA);
    if (!puppeteerHandlesSpa) {
      this.logger.warn('Puppeteer extractor doesn\'t report supporting SPAs');
    }
  }

  /**
   * Resolve the extractor that should process a given URL.
   *
   * A forced strategy short-circuits detection entirely. Otherwise the page
   * detector is asked to classify the URL, and any error raised while doing so
   * is logged and answered with the Cheerio extractor.
   *
   * @param url The URL to get an extractor for
   * @param htmlContent Optional HTML content, forwarded to the page detector
   * @returns The extractor selected for the URL
   */
  async getExtractorForUrl(url: string, htmlContent?: string): Promise<IContentExtractor> {
    this.logger.debug(`Getting extractor for URL: ${url}`);

    // A forced strategy wins over detection.
    switch (this.options.forceStrategy) {
      case 'cheerio':
        this.logger.debug(`Forced strategy 'cheerio' for ${url}`);
        return this.cheerioExtractor;
      case 'puppeteer':
        this.logger.debug(`Forced strategy 'puppeteer' for ${url}`);
        return this.puppeteerExtractor;
      default:
        break;
    }

    try {
      const detection = await this.pageDetector.detectPageType(url, htmlContent);
      this.logger.debug(`Detected page type for ${url}: ${detection.pageType} (confidence: ${detection.confidence.toFixed(2)})`);

      if (detection.isSPA) {
        this.logger.debug(`Using Puppeteer extractor for SPA: ${url}`);
        return this.puppeteerExtractor;
      }

      this.logger.debug(`Using Cheerio extractor for static page: ${url}`);
      return this.cheerioExtractor;
    } catch (err) {
      // Detection failed: fall back to Cheerio, the more lightweight option.
      const reason = err instanceof Error ? err.message : String(err);
      this.logger.error(`Error detecting page type for ${url}: ${reason}`);
      this.logger.debug(`Defaulting to Cheerio extractor for ${url} due to detection failure`);
      return this.cheerioExtractor;
    }
  }

  /**
   * Map an already-known page type straight to its extractor.
   *
   * No detection is performed and `options.forceStrategy` is not consulted.
   * Every page type other than `PageType.SPA` resolves to the Cheerio extractor.
   *
   * @param pageType The known page type
   * @returns The Puppeteer extractor for SPAs, otherwise the Cheerio extractor
   */
  getExtractorByPageType(pageType: PageType): IContentExtractor {
    if (pageType === PageType.SPA) {
      return this.puppeteerExtractor;
    }
    return this.cheerioExtractor;
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

CrawlingStrategyFactory.ts•3.6 KiB