WebScraperStrategy.ts
import { logger } from "../../utils/logger";
import type { UrlNormalizerOptions } from "../../utils/url";
import { AutoDetectFetcher } from "../fetcher";
import { FetchStatus, type RawContent } from "../fetcher/types";
import { PipelineFactory } from "../pipelines/PipelineFactory";
import type { ContentPipeline, PipelineResult } from "../pipelines/types";
import type { QueueItem, ScraperOptions } from "../types";
import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy";

export interface WebScraperStrategyOptions {
  urlNormalizerOptions?: UrlNormalizerOptions;
  shouldFollowLink?: (baseUrl: URL, targetUrl: URL) => boolean;
}

export class WebScraperStrategy extends BaseScraperStrategy {
  private readonly fetcher = new AutoDetectFetcher();
  private readonly shouldFollowLinkFn?: (baseUrl: URL, targetUrl: URL) => boolean;
  private readonly pipelines: ContentPipeline[];

  constructor(options: WebScraperStrategyOptions = {}) {
    super({ urlNormalizerOptions: options.urlNormalizerOptions });
    this.shouldFollowLinkFn = options.shouldFollowLink;
    this.pipelines = PipelineFactory.createStandardPipelines();
  }

  canHandle(url: string): boolean {
    try {
      const parsedUrl = new URL(url);
      return parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
    } catch {
      return false;
    }
  }

  // Removed custom isInScope logic; using shared scope utility for consistent behavior

  /**
   * Processes a single queue item by fetching its content and processing it through pipelines.
   * @param item - The queue item to process.
   * @param options - Scraper options including headers for HTTP requests.
   * @param _progressCallback - Optional progress callback (not used here).
   * @param signal - Optional abort signal for request cancellation.
   * @returns An object containing the processed document and extracted links.
   */
  protected override async processItem(
    item: QueueItem,
    options: ScraperOptions,
    signal?: AbortSignal,
  ): Promise<ProcessItemResult> {
    const { url } = item;

    try {
      // Log when processing with ETag for conditional requests
      if (item.etag) {
        logger.debug(`Processing ${url} with stored ETag: ${item.etag}`);
      }

      // Define fetch options, passing signal, followRedirects, headers, and etag
      const fetchOptions = {
        signal,
        followRedirects: options.followRedirects,
        headers: options.headers, // Forward custom headers
        etag: item.etag, // Pass ETag for conditional requests
      };

      // Use AutoDetectFetcher which handles fallbacks automatically
      const rawContent: RawContent = await this.fetcher.fetch(url, fetchOptions);

      logger.debug(
        `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`,
      );

      // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND
      // Use the final URL from rawContent.source (which may differ due to redirects)
      if (rawContent.status !== FetchStatus.SUCCESS) {
        logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`);
        return { url: rawContent.source, links: [], status: rawContent.status };
      }

      // --- Start Pipeline Processing ---
      let processed: PipelineResult | undefined;

      for (const pipeline of this.pipelines) {
        const contentBuffer = Buffer.isBuffer(rawContent.content)
          ? rawContent.content
          : Buffer.from(rawContent.content);
        if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
          logger.debug(
            `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`,
          );
          processed = await pipeline.process(rawContent, options, this.fetcher);
          break;
        }
      }

      if (!processed) {
        logger.warn(
          `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`,
        );
        return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
      }

      // Log errors from pipeline
      for (const err of processed.errors ?? []) {
        logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
      }

      // Check if content processing resulted in usable content
      if (!processed.textContent || !processed.textContent.trim()) {
        logger.warn(
          `⚠️ No processable content found for ${url} after pipeline execution.`,
        );
        return {
          url: rawContent.source,
          links: processed.links,
          status: FetchStatus.SUCCESS,
        };
      }

      // Update canonical base URL from the first page's final URL (after redirects)
      if (item.depth === 0) {
        this.canonicalBaseUrl = new URL(rawContent.source);
      }

      const filteredLinks =
        processed.links?.filter((link) => {
          try {
            const targetUrl = new URL(link);
            // Use the base class's shouldProcessUrl which handles scope + include/exclude patterns
            if (!this.shouldProcessUrl(targetUrl.href, options)) {
              return false;
            }
            // Apply optional custom filter function if provided
            if (this.shouldFollowLinkFn) {
              const baseUrl = this.canonicalBaseUrl ?? new URL(options.url);
              return this.shouldFollowLinkFn(baseUrl, targetUrl);
            }
            return true;
          } catch {
            return false;
          }
        }) ?? [];

      return {
        url: rawContent.source,
        etag: rawContent.etag,
        lastModified: rawContent.lastModified,
        contentType: processed.contentType || rawContent.mimeType,
        content: processed,
        links: filteredLinks,
        status: FetchStatus.SUCCESS,
      };
    } catch (error) {
      // Log fetch errors or pipeline execution errors (if run throws)
      logger.error(`❌ Failed processing page ${url}: ${error}`);
      throw error;
    }
  }

  /**
   * Cleanup resources used by this strategy, specifically the pipeline browser instances and fetcher.
   */
  async cleanup(): Promise<void> {
    await Promise.allSettled([
      ...this.pipelines.map((pipeline) => pipeline.close()),
      this.fetcher.close(),
    ]);
  }
}
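
For orientation, below is a minimal usage sketch of the class above. It relies only on the surface visible in this file (the WebScraperStrategyOptions.shouldFollowLink hook, canHandle, and cleanup); the same-host predicate and the example URL are hypothetical, and the actual crawl entry point is inherited from BaseScraperStrategy, which is not shown here.

import { WebScraperStrategy } from "./WebScraperStrategy";

async function main(): Promise<void> {
  // Hypothetical same-host filter: only follow links that stay on the starting hostname.
  const strategy = new WebScraperStrategy({
    shouldFollowLink: (baseUrl, targetUrl) => targetUrl.hostname === baseUrl.hostname,
  });

  const startUrl = "https://example.com/docs";

  // canHandle() accepts only http(s) URLs, as implemented above.
  if (strategy.canHandle(startUrl)) {
    // ...hand the strategy and startUrl to the surrounding scraper machinery here...
  }

  // Release pipeline browser instances and the fetcher when done.
  await strategy.cleanup();
}

main().catch((error) => {
  console.error(error);
});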
