Skip to main content
Glama
robustWebScraper.js12.5 kB
// src/utils/robustWebScraper.js // Robust web scraping with headless browser capabilities and proxy support const axios = require('axios'); const { JSDOM } = require('jsdom'); class RobustWebScraper { constructor() { this.userAgents = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' ]; this.defaultHeaders = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Cache-Control': 'max-age=0' }; } // Get random user agent for rotation getRandomUserAgent() { return this.userAgents[Math.floor(Math.random() * this.userAgents.length)]; } // Enhanced web search with multiple fallback strategies async searchWeb(query, maxResults = 5, options = {}) { const strategies = [ () => this.searchSearx(query, maxResults), () => this.searchDuckDuckGo(query, maxResults), () => this.searchDuckDuckGoHtml(query, maxResults), () => this.searchBing(query, maxResults), () => this.searchGoogle(query, maxResults) ]; for (const strategy of strategies) { try { const results = await strategy(); if (results && results.length > 0) { console.error(`[${new Date().toISOString()}] RobustWebScraper: Successfully retrieved ${results.length} results for "${query.substring(0, 50)}..."`); return results; } } catch (error) { console.warn(`[${new Date().toISOString()}] RobustWebScraper: Search strategy failed, trying next:`, error.message); continue; } } console.error(`[${new Date().toISOString()}] RobustWebScraper: All search strategies failed for query "${query.substring(0, 50)}..."`); return []; } async searchSearx(query, maxResults = 5) { const base = process.env.SEARXNG_URL; // e.g., https://searxng.example.org if (!base) return []; try { const url = `${base.replace(/\/$/, '')}/search`; const response = await axios.get(url, { params: { q: query, format: 'json', language: 'en', safesearch: 1, categories: 'general' }, timeout: 10000, headers: { 'User-Agent': this.getRandomUserAgent() } }); const items = Array.isArray(response.data.results) ? response.data.results : []; return items.slice(0, maxResults).map((r) => ({ title: r.title || 'SearxNG Result', text: r.content || r.snippet || '', url: r.url, source: 'searxng' })); } catch (e) { console.error(`[${new Date().toISOString()}] RobustWebScraper: SearxNG search failed:`, e.message); return []; } } // DuckDuckGo HTML fallback parser async searchDuckDuckGoHtml(query, maxResults = 5) { try { const response = await axios.get('https://duckduckgo.com/html/', { params: { q: query }, headers: { 'User-Agent': this.getRandomUserAgent(), ...this.defaultHeaders }, timeout: 10000 }); const dom = new JSDOM(response.data); const doc = dom.window.document; const links = Array.from(doc.querySelectorAll('.result__a')); const snippets = Array.from(doc.querySelectorAll('.result__snippet')); const out = []; for (let i = 0; i < Math.min(maxResults, links.length); i++) { const a = links[i]; out.push({ title: a.textContent?.trim() || 'DuckDuckGo Result', text: snippets[i]?.textContent?.trim() || '', url: a.href, source: 'ddg_html' }); } return out; } catch (e) { console.error(`[${new Date().toISOString()}] RobustWebScraper: DuckDuckGo HTML parse failed:`, e.message); return []; } } // DuckDuckGo search implementation async searchDuckDuckGo(query, maxResults = 5) { try { const response = await axios.get('https://api.duckduckgo.com/', { params: { q: query, format: 'json', no_html: '1', skip_disambig: '1' }, headers: { 'User-Agent': this.getRandomUserAgent(), ...this.defaultHeaders }, timeout: 10000 }); const results = []; if (response.data.RelatedTopics) { for (const topic of response.data.RelatedTopics.slice(0, maxResults)) { if (topic.FirstURL && topic.Text) { results.push({ title: topic.Text.split(' - ')[0] || 'DuckDuckGo Result', text: topic.Text, url: topic.FirstURL, source: 'ddg' }); } } } // Also check Abstract if available if (response.data.Abstract && response.data.AbstractURL) { results.unshift({ title: response.data.Heading || query, text: response.data.Abstract, url: response.data.AbstractURL, source: 'ddg' }); } return results.slice(0, maxResults); } catch (error) { console.error(`[${new Date().toISOString()}] RobustWebScraper: DuckDuckGo search failed:`, error.message); throw error; } } // Bing search implementation (requires API key) async searchBing(query, maxResults = 5) { const bingApiKey = process.env.BING_API_KEY; if (!bingApiKey) { throw new Error('Bing API key not configured'); } try { const response = await axios.get('https://api.bing.microsoft.com/v7.0/search', { params: { q: query, count: maxResults, offset: 0, mkt: 'en-US', safesearch: 'Moderate' }, headers: { 'Ocp-Apim-Subscription-Key': bingApiKey, 'User-Agent': this.getRandomUserAgent() }, timeout: 10000 }); return (response.data.webPages?.value || []).map(item => ({ title: item.name, text: item.snippet, url: item.url, source: 'bing' })); } catch (error) { console.error(`[${new Date().toISOString()}] RobustWebScraper: Bing search failed:`, error.message); throw error; } } // Google Custom Search implementation (requires API key and search engine ID) async searchGoogle(query, maxResults = 5) { const googleApiKey = process.env.GOOGLE_API_KEY; const searchEngineId = process.env.GOOGLE_SEARCH_ENGINE_ID; if (!googleApiKey || !searchEngineId) { throw new Error('Google search API credentials not configured'); } try { const response = await axios.get('https://www.googleapis.com/customsearch/v1', { params: { key: googleApiKey, cx: searchEngineId, q: query, num: Math.min(maxResults, 10) }, timeout: 10000 }); return (response.data.items || []).map(item => ({ title: item.title, text: item.snippet, url: item.link, source: 'google' })); } catch (error) { console.error(`[${new Date().toISOString()}] RobustWebScraper: Google search failed:`, error.message); throw error; } } // Enhanced URL fetching with robust error handling async fetchUrl(url, options = {}) { const { maxBytes = 200000, timeout = 15000 } = options; try { const response = await axios.get(url, { headers: { 'User-Agent': this.getRandomUserAgent(), ...this.defaultHeaders }, timeout, maxContentLength: maxBytes, responseType: 'text' }); // Parse HTML and extract meaningful content const dom = new JSDOM(response.data); const document = dom.window.document; // Remove script and style elements const scripts = document.querySelectorAll('script, style, nav, footer, aside'); scripts.forEach(el => el.remove()); // Extract main content const contentSelectors = [ 'main', 'article', '[role="main"]', '.content', '.main-content', '#content', '.post-content', '.entry-content' ]; let mainContent = ''; for (const selector of contentSelectors) { const element = document.querySelector(selector); if (element) { mainContent = element.textContent || ''; break; } } // Fallback to body content if no main content found if (!mainContent) { mainContent = document.body?.textContent || response.data; } // Clean up the content const cleanContent = mainContent .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n') .trim() .substring(0, maxBytes); return { url, content: cleanContent, title: document.title || '', contentLength: cleanContent.length, success: true }; } catch (error) { console.error(`[${new Date().toISOString()}] RobustWebScraper: Failed to fetch ${url}:`, error.message); return { url, content: '', title: '', contentLength: 0, success: false, error: error.message }; } } // Cost-aware model selection based on query complexity and caching selectOptimalModel(query, domain = 'general', complexity = 'moderate', costPreference = 'low') { const { optimization } = config.caching; // Check cache for previous successful model selections const selectionKey = `model_selection:${domain}:${complexity}:${costPreference}`; const cachedSelection = this.modelCache.get(selectionKey); if (cachedSelection) { console.error(`[${new Date().toISOString()}] RobustWebScraper: Using cached model selection: ${cachedSelection.model}`); return cachedSelection.model; } let selectedModel; // Intelligent model routing based on domain and complexity if (domain === 'coding' || query.includes('code') || query.includes('programming')) { selectedModel = complexity === 'simple' ? 'qwen/qwen3-coder' : 'morph/morph-v3-large'; } else if (domain === 'vision' || query.includes('image') || query.includes('visual')) { selectedModel = 'z-ai/glm-4.5v'; } else if (complexity === 'complex' && costPreference === 'high') { selectedModel = 'x-ai/grok-4'; } else { selectedModel = 'deepseek/deepseek-chat-v3.1'; // Default ultra-low cost } // Cache the selection for future use this.modelCache.set(selectionKey, { model: selectedModel, reasoning: `${domain}:${complexity}:${costPreference}`, timestamp: new Date().toISOString() }); console.error(`[${new Date().toISOString()}] RobustWebScraper: Selected optimal model: ${selectedModel} (${domain}:${complexity}:${costPreference})`); return selectedModel; } // Get comprehensive cache statistics getDetailedStats() { const resultStats = this.resultCache.getStats(); const modelStats = this.modelCache.getStats(); return { enabled: this.enabled, results: { ...resultStats, keys: this.resultCache.keys().length, hitRate: resultStats.hits / (resultStats.hits + resultStats.misses) || 0 }, models: { ...modelStats, keys: this.modelCache.keys().length, hitRate: modelStats.hits / (modelStats.hits + modelStats.misses) || 0 }, optimization: { similarityThreshold: this.similarityThreshold, estimatedCostSavings: this.calculateCostSavings() } }; } calculateCostSavings() { const resultStats = this.resultCache.getStats(); const modelStats = this.modelCache.getStats(); // Rough estimate: each cache hit saves ~$0.001 in API costs const totalHits = resultStats.hits + modelStats.hits; return { hits: totalHits, estimatedSavings: totalHits * 0.001, currency: 'USD' }; } } module.exports = RobustWebScraper;

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wheattoast11/openrouter-deep-research-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server