Skip to main content
Glama
searchService.js (18.6 kB)
import puppeteer from 'puppeteer';
import axios from 'axios';
import * as cheerio from 'cheerio';
import { logger } from '../utils/logger.js';

// Query-parameter names (lower-cased) that only carry tracking / search-engine
// bookkeeping and are dropped by SearchService#cleanUrl.
const PROMOTIONAL_PATTERNS = [
  /^utm_/, // Google Analytics
  /^fbclid$/, // Facebook click ID
  /^gclid$/, // Google Ads click ID
  /^msclkid$/, // Microsoft click ID
  /^ref_/, // Ref param
  /^source$/, // Source param
  /^medium$/, // Medium param
  /^campaign$/, // Campaign param
  /^term$/, // Term param
  /^content$/, // Content param
  /^hsh$/, // Bing hash param
  /^fclid$/, // Bing click ID
  /^ptn$/, // Bing position param
  /^ver$/, // Version param
  /^ntb$/, // Bing new tab param
  /^p$/, // Bing page param
  /^ck$/, // Bing click param
  /^rd$/, // Bing redirect param
  /^ei$/, // Bing event ID
  /^ved$/, // Google search param
  /^usg$/, // Google user param
  /^oq$/, // Google original query
  /^aqs$/, // Google query source
  /^sclient$/, // Google search client
  /^bih$/, // Google browser height
  /^biw$/, // Google browser width
  /^dpr$/, // Device pixel ratio
  /^ie$/, // Encoding param
  /^oe$/, // Output encoding
  /^rs$/ // Result source
];

// Parameter names that are always kept, even when a promotional pattern also
// matches them (only `source` is on both lists).
const ESSENTIAL_PARAMS = new Set([
  'q', 'query', 'search', 'keyword', // search
  'id', 'article', 'story', 'item', 'page', 'post', 'news', // content id
  'path', 'route', 'slug', 'url', 'link', // path
  'lang', 'language', 'locale', 'region', 'country', // localization
  'year', 'month', 'day', 'date', 'time', 'timestamp', // time
  'category', 'tag', 'type', 'format', 'mode', // category/format
  'author', 'writer', 'reporter', 'source', 'publisher' // author/source
]);

const HTTP_URL_PATTERN = /^https?:\/\//i;

/**
 * Extract the destination of a Bing redirect link from its `u` parameter.
 *
 * The `u` value is either a percent-encoded URL or, as in the sample link used
 * by `testUrlCleaning`, the literal prefix `a1` followed by the base64-encoded
 * URL.
 *
 * @param {string} url - A `bing.com/ck/` or `bing.com/rd/` link.
 * @returns {string|null} The absolute http(s) target, or null when it cannot
 *   be determined.
 */
function resolveBingRedirect(url) {
  const match = url.match(/[?&]u=([^&]+)/);
  if (!match) {
    return null;
  }

  let target;
  try {
    target = decodeURIComponent(match[1]);
  } catch (e) {
    return null;
  }

  if (target.startsWith('a1')) {
    // Node's 'base64' decoder also accepts the URL-safe alphabet and missing
    // padding, which is what Bing emits.
    const decoded = Buffer.from(target.slice(2), 'base64').toString('utf8');
    if (HTTP_URL_PATTERN.test(decoded)) {
      return decoded;
    }
  }

  return HTTP_URL_PATTERN.test(target) ? target : null;
}

/**
 * Close a Puppeteer page without letting a close failure mask the caller's
 * own result or error.
 *
 * @param {object|null} page - Page returned by `browser.newPage()`, or null.
 * @returns {Promise<void>}
 */
async function closePageSafely(page) {
  if (!page) {
    return;
  }
  try {
    await page.close();
  } catch (e) {
    logger.warn(`Failed to close page: ${e.message}`);
  }
}

/**
 * Download a page with browser-like request headers.
 *
 * @param {string} url - Address to fetch.
 * @param {string} userAgent - Value for the User-Agent header.
 * @returns {Promise<*>} The axios response body.
 */
async function fetchHtml(url, userAgent) {
  const response = await axios.get(url, {
    headers: {
      'User-Agent': userAgent,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Accept-Encoding': 'gzip, deflate',
      'Connection': 'keep-alive'
    },
    timeout: 15000
  });
  return response.data;
}

/**
 * Search and scraping helpers built on Puppeteer (Bing / Bing News result
 * pages) and axios + cheerio (plain page fetches). The module exports a single
 * shared instance.
 */
class SearchService {
  constructor() {
    // Lazily created by initBrowser() and reused by every search.
    this.browser = null;
    this.userAgents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    ];
  }

  /**
   * Launch the shared browser on first use and return it.
   *
   * @returns {Promise<object>} The Puppeteer browser instance.
   */
  async initBrowser() {
    if (!this.browser) {
      this.browser = await puppeteer.launch({
        headless: 'new',
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-accelerated-2d-canvas',
          '--no-first-run',
          '--no-zygote',
          '--disable-gpu',
          '--disable-blink-features=AutomationControlled',
          '--disable-extensions',
          '--disable-plugins',
          '--disable-images',
          '--disable-javascript',
          '--disable-web-security',
          '--disable-features=VizDisplayCompositor'
        ]
      });
      logger.info('Browser initialized');
    }
    return this.browser;
  }

  /**
   * Close the shared browser, if one is open, and forget the reference so the
   * next search launches a fresh one.
   *
   * @returns {Promise<void>}
   */
  async closeBrowser() {
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
      logger.info('Browser closed');
    }
  }

  /**
   * Pick one of the configured User-Agent strings at random.
   *
   * @returns {string}
   */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }

  /**
   * Normalise a result URL: unwrap Bing redirect links and strip tracking
   * query parameters. The fragment (`#...`) is not carried over.
   *
   * A parameter is kept when its lower-cased name is in ESSENTIAL_PARAMS or
   * when it matches none of PROMOTIONAL_PATTERNS.
   *
   * @param {string} url - URL to clean; falsy values are returned unchanged.
   * @param {boolean} [debug=false] - Log each step through the logger.
   * @returns {string} The cleaned URL, or the input when it cannot be parsed
   *   or is a Bing redirect whose target cannot be recovered.
   */
  cleanUrl(url, debug = false) {
    if (!url) return url;

    const originalUrl = url;
    let cleanUrl = url;

    if (debug) {
      logger.info(`Start cleaning URL: ${url}`);
    }

    // Handle Bing redirect links
    if (url.includes('bing.com/ck/') || url.includes('bing.com/rd/')) {
      const target = resolveBingRedirect(url);
      if (target === null) {
        // The redirect may depend on its opaque parameters, so hand it back
        // untouched rather than stripping it into a dead link.
        if (debug) {
          logger.warn('Failed to decode Bing redirect, keeping original URL');
        }
        return url;
      }
      cleanUrl = target;
      if (debug) {
        logger.info(`Extracted real URL from Bing redirect: ${cleanUrl}`);
      }
    }

    // Clean URL parameters - keep only essential ones
    try {
      const urlObj = new URL(cleanUrl);
      const cleanParams = new URLSearchParams();

      for (const [key, value] of urlObj.searchParams.entries()) {
        const keyLower = key.toLowerCase();
        const isPromotional = PROMOTIONAL_PATTERNS.some(pattern => pattern.test(keyLower));
        // Exact match only: substring matching would re-admit click IDs such
        // as `fbclid` (contains "id") or `utm_source` (contains "source").
        const isEssential = ESSENTIAL_PARAMS.has(keyLower);

        if (isEssential || !isPromotional) {
          // append (not set) so repeated parameters keep all their values
          cleanParams.append(key, value);
        }
      }

      // Rebuild cleaned URL
      const query = cleanParams.toString();
      cleanUrl = `${urlObj.origin}${urlObj.pathname}`;
      if (query) {
        cleanUrl += `?${query}`;
      }

      if (debug) {
        logger.info(`URL cleaned: ${originalUrl} -> ${cleanUrl}`);
        logger.info(`Kept params: ${query}`);
      }
    } catch (e) {
      // Not an absolute URL: return what we have without parameter cleanup.
      if (debug) {
        logger.warn(`URL parsing failed, skipping parameter cleanup: ${e.message}`);
      }
    }

    return cleanUrl;
  }

  /**
   * Run cleanUrl in debug mode over a few sample URLs and log the outcome.
   * Diagnostic aid only; returns nothing.
   */
  testUrlCleaning() {
    const testUrls = [
      'https://www.bing.com/ck/a?!&&p=4d522fd05b7048c5069ccb0f66e7a47ec5374f21a3ebb37ca2675a14fcaab27bJmltdHM9MTc1NTA0MzIwMA&ptn=3&ver=2&hsh=4&fclid=241164da-b243-6c9d-3308-7292b3e86d31&u=a1aHR0cHM6Ly9iYWlrZS5iYWlkdS5jb20vaXRlbS8lRTQlQjglOTYlRTclOTUlOEMlRTYlQTglQTElRTUlOUUlOEIvNDkzODM5Ng&ntb=1',
      'https://example.com/article?id=123&utm_source=google&utm_medium=cpc&utm_campaign=news&ref=homepage&source=bing',
      'https://news.example.com/story?article=456&category=politics&year=2024&month=01&day=15&lang=zh&region=cn&fbclid=abc123&gclid=def456',
      'https://bing.com/search?q=test&count=10&hsh=xyz789&ptn=1&ver=2&ntb=1'
    ];

    logger.info('=== URL Cleaning Test ===');
    testUrls.forEach((url, index) => {
      const cleaned = this.cleanUrl(url, true);
      logger.info(`Test ${index + 1}: ${url}`);
      logger.info(`Cleaned: ${cleaned}`);
      logger.info('---');
    });
  }

  /**
   * Run a Bing web search and return the organic results.
   *
   * @param {string} query - Search terms.
   * @param {number} [maxResults=10] - Upper bound on returned results; also
   *   sent to Bing as the `count` parameter.
   * @returns {Promise<{engine: string, query: string, results: Array<{title: string, url: string, snippet: string, rank: number}>, totalResults: number, timestamp: string}>}
   * @throws {Error} "Bing search failed: ..." wrapping any underlying failure.
   */
  async searchBing(query, maxResults = 10) {
    let page = null;
    try {
      const browser = await this.initBrowser();
      page = await browser.newPage();

      await page.setUserAgent(this.getRandomUserAgent());
      await page.setViewport({ width: 1920, height: 1080 });

      const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${maxResults}`;
      await page.goto(searchUrl, { waitUntil: 'networkidle2' });
      await page.waitForSelector('#b_results', { timeout: 10000 });

      // Runs inside the page, so the limit has to be passed in explicitly.
      const rawResults = await page.evaluate((limit) => {
        const searchResults = [];
        const resultElements = document.querySelectorAll('#b_results .b_algo');

        resultElements.forEach((element, index) => {
          if (index >= limit) return;

          const titleElement = element.querySelector('h2 a');
          const snippetElement = element.querySelector('.b_caption p');

          if (titleElement) {
            searchResults.push({
              title: titleElement.textContent.trim(),
              url: titleElement.href,
              snippet: snippetElement ? snippetElement.textContent.trim() : '',
              rank: index + 1
            });
          }
        });

        return searchResults;
      }, maxResults);

      // Clean URLs in Node.js
      const results = rawResults.map(item => ({
        ...item,
        url: this.cleanUrl(item.url)
      }));

      logger.info(`Bing search completed for query: "${query}", found ${results.length} results`);

      return {
        engine: 'Bing',
        query,
        results,
        totalResults: results.length,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error('Bing search error:', error);
      throw new Error(`Bing search failed: ${error.message}`, { cause: error });
    } finally {
      // Always release the tab, including when navigation or parsing failed.
      await closePageSafely(page);
    }
  }

  /**
   * Run a Bing News search and return whatever news-like items can be
   * recognised on the result page.
   *
   * @param {string} query - Search terms.
   * @param {number} [maxResults=10] - Upper bound on inspected elements.
   * @param {string} [timeFilter='past_24_hours'] - One of `past_hour`,
   *   `past_24_hours`, `past_7_days`, `past_30_days`; other values add no
   *   filter.
   * @returns {Promise<{engine: string, query: string, results: Array<{title: string, url: string, source: string, time: string, rank: number}>, totalResults: number, timeFilter: string, timestamp: string}>}
   * @throws {Error} "Bing News search failed: ..." wrapping any underlying failure.
   */
  async searchBingNews(query, maxResults = 10, timeFilter = 'past_24_hours') {
    let page = null;
    try {
      const browser = await this.initBrowser();
      page = await browser.newPage();

      await page.setUserAgent(this.getRandomUserAgent());
      await page.setViewport({ width: 1920, height: 1080 });

      // Build news search URL
      let searchUrl = `https://www.bing.com/news/search?q=${encodeURIComponent(query)}`;

      // Add time filter
      // NOTE(review): these interval codes are taken as-is; confirm against
      // the values Bing News itself emits for each time range.
      const timeParams = {
        'past_hour': '&qft=interval%3d"1"',
        'past_24_hours': '&qft=interval%3d"24"',
        'past_7_days': '&qft=interval%3d"7"',
        'past_30_days': '&qft=interval%3d"30"'
      };

      if (timeParams[timeFilter]) {
        searchUrl += timeParams[timeFilter];
      }

      logger.info(`Navigating to Bing News: ${searchUrl}`);
      await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 15000 });

      // Wait for news results to load
      await new Promise(resolve => setTimeout(resolve, 3000));

      // Try to wait for page content
      try {
        await page.waitForSelector('body', { timeout: 5000 });
      } catch (error) {
        logger.warn('Timeout waiting for page load, continuing');
      }

      const rawResults = await page.evaluate((maxResults) => {
        const newsResults = [];

        // Try multiple selectors for news items; the first one that matches
        // anything wins.
        const selectors = [
          '.news-card',
          '.news-item',
          '.news-card-container',
          '[data-news-card]',
          '.news-content',
          '.news-title',
          '.news-item-container',
          '.news-list-item'
        ];

        let newsElements = [];
        for (const selector of selectors) {
          newsElements = document.querySelectorAll(selector);
          if (newsElements.length > 0) {
            break;
          }
        }

        // Fallback: try links containing /news/
        if (newsElements.length === 0) {
          const allLinks = document.querySelectorAll('a[href*="/news/"]');
          newsElements = Array.from(allLinks).map(link => {
            const container = link.closest('div') || link.parentElement;
            return container || link;
          });
        }

        // Fallback: just take some links
        if (newsElements.length === 0) {
          const allLinks = document.querySelectorAll('a[href]');
          newsElements = Array.from(allLinks).slice(0, 20);
        }

        // Extract news info
        newsElements.forEach((element, index) => {
          if (index >= maxResults) return;

          let title = '';
          let url = '';
          let source = '';
          let time = '';

          // Title
          const titleSelectors = ['h2', 'h3', '.news-title', '.title', 'a'];
          for (const selector of titleSelectors) {
            const el = element.querySelector(selector);
            if (el && el.textContent.trim()) {
              title = el.textContent.trim();
              if (el.href) {
                url = el.href;
              }
              break;
            }
          }

          // Link
          if (!url) {
            const linkEl = element.querySelector('a[href]');
            if (linkEl && linkEl.href) {
              url = linkEl.href;
            }
          }

          // Source
          const sourceSelectors = ['.news-source', '.source', '.publisher', '.author'];
          for (const selector of sourceSelectors) {
            const el = element.querySelector(selector);
            if (el && el.textContent.trim()) {
              source = el.textContent.trim();
              break;
            }
          }

          // Time
          const timeSelectors = ['.news-time', '.time', '.date', '.timestamp'];
          for (const selector of timeSelectors) {
            const el = element.querySelector(selector);
            if (el && el.textContent.trim()) {
              time = el.textContent.trim();
              break;
            }
          }

          // Validate
          if (title && url) {
            const isValidNewsUrl = url.includes('/news/') ||
              url.includes('news') ||
              url.includes('bing.com') ||
              url.includes('msn.com') ||
              url.includes('baidu.com') ||
              url.includes('baike.baidu.com');

            if (isValidNewsUrl) {
              newsResults.push({
                title,
                url,
                source,
                time,
                rank: index + 1
              });
            }
          }
        });

        return newsResults;
      }, maxResults);

      // Clean URLs in Node.js
      const results = rawResults.map(item => ({
        ...item,
        url: this.cleanUrl(item.url)
      }));

      logger.info(`Bing News search completed for query: "${query}", found ${results.length} results`);

      return {
        engine: 'Bing News',
        query,
        results,
        totalResults: results.length,
        timeFilter,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error('Bing News search error:', error);
      throw new Error(`Bing News search failed: ${error.message}`, { cause: error });
    } finally {
      // Always release the tab, including when navigation or parsing failed.
      await closePageSafely(page);
    }
  }

  /**
   * Query several engines and collect their results keyed by engine name.
   * Currently only `bing` is supported; a failing engine contributes
   * `{ error: message }` instead of rejecting the whole call.
   *
   * @param {string} query - Search terms.
   * @param {string[]} [engines=['bing']] - Engines to query.
   * @param {number} [maxResults=10] - Passed through to each engine.
   * @returns {Promise<{query: string, engines: string[], results: object, timestamp: string}>}
   */
  async multiSearch(query, engines = ['bing'], maxResults = 10) {
    const searchPromises = [];
    const results = {};

    if (engines.includes('bing')) {
      searchPromises.push(
        this.searchBing(query, maxResults)
          .then(result => { results.bing = result; })
          .catch(error => { results.bing = { error: error.message }; })
      );
    }

    await Promise.allSettled(searchPromises);

    return {
      query,
      engines: Object.keys(results),
      results,
      timestamp: new Date().toISOString()
    };
  }

  /**
   * Fetch a page and extract its title, meta description/keywords, the first
   * 2000 characters of body text, and absolute links found among the first 50
   * anchors.
   *
   * @param {string} url - Page to fetch.
   * @returns {Promise<{url: string, title: string, description: string, keywords: string, content: string, links: Array<{url: string, text: string}>, timestamp: string}>}
   * @throws {Error} "Failed to scrape webpage: ..." wrapping any underlying failure.
   */
  async scrapeWebpage(url) {
    try {
      const html = await fetchHtml(url, this.getRandomUserAgent());
      const $ = cheerio.load(html);

      // Extract page info
      const title = $('title').text().trim();
      const description = $('meta[name="description"]').attr('content') || '';
      const keywords = $('meta[name="keywords"]').attr('content') || '';

      // Extract main content
      const content = $('body').text()
        .replace(/\s+/g, ' ')
        .trim()
        .substring(0, 2000); // limit content length

      // Extract links
      const links = [];
      $('a[href]').each((index, element) => {
        if (index < 50) { // limit number of links
          const href = $(element).attr('href');
          const text = $(element).text().trim();
          if (href && text && href.startsWith('http')) {
            links.push({ url: href, text });
          }
        }
      });

      logger.info(`Webpage scraped successfully: ${url}`);

      return {
        url,
        title,
        description,
        keywords,
        content,
        links,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error(`Webpage scraping error for ${url}:`, error);
      throw new Error(`Failed to scrape webpage: ${error.message}`, { cause: error });
    }
  }

  /**
   * Fetch a page, strip scripts/styles/navigation chrome, and convert its main
   * content area to Markdown with turndown (loaded on demand).
   *
   * @param {string} url - Page to fetch.
   * @returns {Promise<{url: string, title: string, description: string, markdown: string, timestamp: string}>}
   * @throws {Error} "Failed to convert webpage to Markdown: ..." wrapping any
   *   underlying failure.
   */
  async getWebpageMarkdown(url) {
    try {
      const html = await fetchHtml(url, this.getRandomUserAgent());
      const $ = cheerio.load(html);

      // Extract page info
      const title = $('title').text().trim();
      const description = $('meta[name="description"]').attr('content') || '';

      // Clean HTML, remove unwanted elements
      $('script, style, noscript, iframe, img').remove();
      $('nav, header, footer, aside').remove();

      // Get main content area
      let mainContent = $('main, article, .content, .main, #content, #main');
      if (mainContent.length === 0) {
        mainContent = $('body');
      }

      // Convert to Markdown
      const TurndownService = (await import('turndown')).default;
      const turndownService = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        bulletListMarker: '-'
      });

      // Custom rule for links
      turndownService.addRule('links', {
        filter: 'a',
        replacement: function(content, node) {
          const href = node.getAttribute('href');
          const text = content.trim();
          if (href && text) {
            return `[${text}](${href})`;
          }
          return content;
        }
      });

      // .html() yields null for an empty selection; turndown rejects non-strings.
      const markdown = turndownService.turndown(mainContent.html() || '');

      logger.info(`Webpage converted to Markdown successfully: ${url}`);

      return {
        url,
        title,
        description,
        markdown,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error(`Markdown conversion error for ${url}:`, error);
      throw new Error(`Failed to convert webpage to Markdown: ${error.message}`, { cause: error });
    }
  }
}

export default new SearchService();

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Bosegluon2/spider-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.