/**
* Scrape Links Tool Handler
* Implements robust error handling that NEVER crashes the MCP server
*/
import type { ScrapeLinksParams, ScrapeLinksOutput } from '../schemas/scrape-links.js';
import { ScraperClient } from '../clients/scraper.js';
import { MarkdownCleaner } from '../services/markdown-cleaner.js';
import { createLLMProcessor, processContentWithLLM } from '../services/llm-processor.js';
import { removeMetaTags } from '../utils/markdown-formatter.js';
import { SCRAPER } from '../config/index.js';
import { classifyError } from '../utils/errors.js';
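/** Optional per-call context: a session id plus an async logger; both may be absent. */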
interface ToolOptions {
sessionId?: string;
logger?: (level: 'info' | 'error' | 'debug', message: string, sessionId: string) => Promise<void>;
}
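/**
* Split the global token budget evenly across the URLs to scrape.
* Example (budget value assumed; the real one lives in SCRAPER.MAX_TOKENS_BUDGET):
* a 60_000-token budget across 3 URLs gives floor(60_000 / 3) = 20_000 tokens each.
* A non-positive count returns the full budget to avoid division by zero.
*/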
function calculateTokenAllocation(urlCount: number): number {
if (urlCount <= 0) return SCRAPER.MAX_TOKENS_BUDGET;
return Math.floor(SCRAPER.MAX_TOKENS_BUDGET / urlCount);
}
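/**
* Append the shared extraction suffix (SCRAPER.EXTRACTION_SUFFIX) to the caller's
* instruction, falling back to a generic "extract the main content" prompt when none is given.
*/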
function enhanceExtractionInstruction(instruction: string | undefined): string {
const base = instruction || 'Extract the main content and key information from this page.';
return `${base}\n\n${SCRAPER.EXTRACTION_SUFFIX}`;
}
/**
* Safe logger wrapper - NEVER throws
*/
async function safeLog(
logger: ToolOptions['logger'],
sessionId: string | undefined,
level: 'info' | 'error' | 'debug',
message: string
): Promise<void> {
if (!logger || !sessionId) return;
try {
await logger(level, message, sessionId);
} catch {
// Swallow logger failures so they never crash the tool; fall back to stderr so the message isn't lost
console.error(`[Scrape Tool] Logger failed: ${message}`);
}
}
/**
* Handle scrape links request
* NEVER throws - always returns a valid response with content and metadata
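*
* @example
* // Hypothetical call site (URLs and options are illustrative only):
* const { content, structuredContent } = await handleScrapeLinks(
*   { urls: ['https://example.com/docs'], use_llm: false },
*   { sessionId: 'session-1', logger: async (level, msg) => console.log(level, msg) }
* );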
*/
export async function handleScrapeLinks(
params: ScrapeLinksParams,
options: ToolOptions = {}
): Promise<{ content: string; structuredContent: ScrapeLinksOutput }> {
const { sessionId, logger } = options;
const startTime = Date.now();
// Helper to build a failure response that still matches the output schema
const createErrorResponse = (message: string, executionTime: number): { content: string; structuredContent: ScrapeLinksOutput } => {
const body = `# ❌ Scraping Failed\n\n${message}`;
return {
content: body,
structuredContent: {
content: body,
metadata: {
total_urls: params.urls?.length || 0,
successful: 0,
failed: params.urls?.length || 0,
total_credits: 0,
execution_time_ms: executionTime,
},
},
};
};
// Validate params
if (!params.urls || params.urls.length === 0) {
return createErrorResponse('No URLs provided', Date.now() - startTime);
}
// Filter out invalid URLs early
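// e.g. new URL('example.com') throws (no scheme), while new URL('https://example.com') parses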
const validUrls: string[] = [];
const invalidUrls: string[] = [];
for (const url of params.urls) {
try {
new URL(url);
validUrls.push(url);
} catch {
invalidUrls.push(url);
}
}
if (validUrls.length === 0) {
return createErrorResponse(`All ${params.urls.length} URLs are invalid`, Date.now() - startTime);
}
const tokensPerUrl = calculateTokenAllocation(validUrls.length);
const totalBatches = Math.ceil(validUrls.length / SCRAPER.BATCH_SIZE);
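// e.g. 10 valid URLs with a batch size of 4 (actual value in SCRAPER.BATCH_SIZE) → Math.ceil(10 / 4) = 3 batches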
await safeLog(logger, sessionId, 'info', `Starting scrape: ${validUrls.length} URL(s), ${tokensPerUrl} tokens/URL, ${totalBatches} batch(es)`);
// Initialize clients safely
let client: ScraperClient;
try {
client = new ScraperClient();
} catch (error) {
const err = classifyError(error);
return createErrorResponse(`Failed to initialize scraper: ${err.message}`, Date.now() - startTime);
}
const markdownCleaner = new MarkdownCleaner();
const llmProcessor = createLLMProcessor(); // Returns null if not configured
const enhancedInstruction = params.use_llm
? enhanceExtractionInstruction(params.what_to_extract)
: undefined;
// Scrape URLs - scrapeMultiple NEVER throws
const results = await client.scrapeMultiple(validUrls, { timeout: params.timeout });
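// Result shape assumed from the usage below: { url, statusCode, content, credits, error? }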
await safeLog(logger, sessionId, 'info', `Scraping complete. Processing ${results.length} results...`);
let successful = 0;
let failed = 0;
let totalCredits = 0;
let llmErrors = 0;
const contents: string[] = [];
// Record invalid URLs as failures so they still appear in the output
for (const invalidUrl of invalidUrls) {
failed++;
contents.push(`## ${invalidUrl}\n\n❌ Invalid URL format`);
}
// Process each result
for (let i = 0; i < results.length; i++) {
const result = results[i];
if (!result) {
failed++;
contents.push(`## ${validUrls[i] ?? 'Unknown URL'}\n\n❌ No result returned`);
continue;
}
await safeLog(logger, sessionId, 'info', `[${i + 1}/${results.length}] Processing ${result.url}`);
// Check for errors in result
if (result.error || result.statusCode < 200 || result.statusCode >= 300) {
failed++;
const errorMsg = result.error?.message || result.content || `HTTP ${result.statusCode}`;
contents.push(`## ${result.url}\n\n❌ Failed to scrape: ${errorMsg}`);
await safeLog(logger, sessionId, 'error', `[${i + 1}/${results.length}] Failed: ${errorMsg}`);
continue;
}
// Success case
successful++;
totalCredits += result.credits;
// Process content safely
let content: string;
try {
content = markdownCleaner.processContent(result.content);
} catch {
// If markdown cleaning fails, use raw content
content = result.content;
}
// Apply LLM extraction if enabled - processContentWithLLM NEVER throws
if (params.use_llm && llmProcessor) {
await safeLog(logger, sessionId, 'info', `[${i + 1}/${results.length}] Applying LLM extraction (${tokensPerUrl} tokens)...`);
const llmResult = await processContentWithLLM(
content,
{ use_llm: params.use_llm, what_to_extract: enhancedInstruction, max_tokens: tokensPerUrl },
llmProcessor
);
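// llmResult shape assumed from the usage below: { processed: boolean; content: string; error?: string }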
if (llmResult.processed) {
content = llmResult.content;
await safeLog(logger, sessionId, 'info', `[${i + 1}/${results.length}] LLM extraction complete`);
} else {
llmErrors++;
await safeLog(logger, sessionId, 'info', `[${i + 1}/${results.length}] LLM extraction skipped: ${llmResult.error || 'unknown reason'}`);
// Continue with original content - graceful degradation
}
}
// Remove meta tags safely
try {
content = removeMetaTags(content);
} catch {
// If this fails, just use the content as-is
}
contents.push(`## ${result.url}\n\n${content}`);
}
const executionTime = Date.now() - startTime;
await safeLog(logger, sessionId, 'info', `Completed: ${successful} successful, ${failed} failed, ${totalCredits} credits used`);
// Build response
const allocationHeader = `**Token Allocation:** ${tokensPerUrl.toLocaleString()} tokens/URL (${validUrls.length} valid URLs, ${SCRAPER.MAX_TOKENS_BUDGET.toLocaleString()} total budget)`;
const statusHeader = `**Status:** ✅ ${successful} successful | ❌ ${failed} failed | 📦 ${totalBatches} batch(es)${llmErrors > 0 ? ` | ⚠️ ${llmErrors} LLM extraction failures` : ''}`;
const formattedContent = `# Scraped Content (${params.urls.length} URLs)\n\n${allocationHeader}\n${statusHeader}\n\n---\n\n${contents.join('\n\n---\n\n')}`;
const metadata = {
total_urls: params.urls.length,
successful,
failed,
total_credits: totalCredits,
execution_time_ms: executionTime,
tokens_per_url: tokensPerUrl,
total_token_budget: SCRAPER.MAX_TOKENS_BUDGET,
batches_processed: totalBatches,
};
return { content: formattedContent, structuredContent: { content: formattedContent, metadata } };
}