import puppeteer from 'puppeteer';
import axios from 'axios';
import * as cheerio from 'cheerio';
import { logger } from '../utils/logger.js';
/**
 * Search and scraping service.
 *
 * Drives a shared headless browser (puppeteer) to query Bing / Bing News and
 * uses axios + cheerio to fetch and post-process arbitrary web pages. A single
 * browser instance is lazily created by `initBrowser()` and reused until
 * `closeBrowser()` is called.
 */
class SearchService {
  /**
   * Query-parameter name patterns (tested against the lower-cased key) that
   * are treated as tracking/promotional noise and stripped by `cleanUrl`.
   */
  static PROMOTIONAL_PARAM_PATTERNS = [
    /^utm_/, // Google Analytics
    /^fbclid$/, // Facebook click ID
    /^gclid$/, // Google Ads click ID
    /^msclkid$/, // Microsoft click ID
    /^ref_/, // Ref param
    /^source$/, // Source param
    /^medium$/, // Medium param
    /^campaign$/, // Campaign param
    /^term$/, // Term param
    /^content$/, // Content param
    /^hsh$/, // Bing hash param
    /^fclid$/, // Bing click ID
    /^ptn$/, // Bing position param
    /^ver$/, // Version param
    /^ntb$/, // Bing new tab param
    /^p$/, // Bing page param
    /^ck$/, // Bing click param
    /^rd$/, // Bing redirect param
    /^ei$/, // Bing event ID
    /^ved$/, // Google search param
    /^usg$/, // Google user param
    /^oq$/, // Google original query
    /^aqs$/, // Google query source
    /^sclient$/, // Google search client
    /^bih$/, // Google browser height
    /^biw$/, // Google browser width
    /^dpr$/, // Device pixel ratio
    /^ie$/, // Encoding param
    /^oe$/, // Output encoding
    /^rs$/ // Result source
  ];

  /**
   * Parameter names that are always kept, even when they also match a
   * promotional pattern (currently only `source` is on both lists).
   * Matching is exact on the lower-cased key.
   */
  static ESSENTIAL_PARAMS = new Set([
    'q', 'query', 'search', 'keyword', // search
    'id', 'article', 'story', 'item', 'page', 'post', 'news', // content id
    'path', 'route', 'slug', 'url', 'link', // path
    'lang', 'language', 'locale', 'region', 'country', // localization
    'year', 'month', 'day', 'date', 'time', 'timestamp', // time
    'category', 'tag', 'type', 'format', 'mode', // category/format
    'author', 'writer', 'reporter', 'source', 'publisher' // author/source
  ]);

  /** Static request headers sent with every axios page fetch. */
  static REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive'
  };

  constructor() {
    // Shared puppeteer browser; created on first use by initBrowser().
    this.browser = null;
    // Pool of desktop User-Agent strings; one is picked at random per request.
    this.userAgents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    ];
  }

  /**
   * Lazily launch the shared headless browser.
   *
   * @returns {Promise<object>} The puppeteer browser instance.
   */
  async initBrowser() {
    if (!this.browser) {
      // NOTE(review): '--disable-web-security' turns off the same-origin
      // policy, and '--disable-javascript' / '--disable-images' may not be
      // honoured by current Chromium builds — confirm these are still wanted.
      this.browser = await puppeteer.launch({
        headless: 'new',
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-accelerated-2d-canvas',
          '--no-first-run',
          '--no-zygote',
          '--disable-gpu',
          '--disable-blink-features=AutomationControlled',
          '--disable-extensions',
          '--disable-plugins',
          '--disable-images',
          '--disable-javascript',
          '--disable-web-security',
          '--disable-features=VizDisplayCompositor'
        ]
      });
      logger.info('Browser initialized');
    }
    return this.browser;
  }

  /**
   * Close the shared browser if one is open.
   *
   * The reference is cleared before closing so a failing `close()` cannot
   * leave a stale handle behind; the error still propagates to the caller.
   *
   * @returns {Promise<void>}
   */
  async closeBrowser() {
    if (!this.browser) return;
    const browser = this.browser;
    this.browser = null;
    await browser.close();
    logger.info('Browser closed');
  }

  /**
   * Close a puppeteer page, logging (not throwing) on failure so that cleanup
   * never masks the result or error of the surrounding operation.
   *
   * @param {object|null} page - Page returned by `browser.newPage()`, or null.
   * @returns {Promise<void>}
   */
  async closePage(page) {
    if (!page) return;
    try {
      await page.close();
    } catch (error) {
      logger.warn(`Failed to close page: ${error.message}`);
    }
  }

  /**
   * Pick a random User-Agent from the configured pool.
   *
   * @returns {string} One of `this.userAgents`.
   */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }

  /**
   * Extract the destination URL from a Bing click-tracking link.
   *
   * The `u` parameter is either a percent-encoded http(s) URL or, as in the
   * sample link used by `testUrlCleaning`, the prefix `a1` followed by the
   * base64 (URL-safe alphabet, unpadded) encoding of the destination.
   *
   * @param {string} url - Bing redirect link.
   * @param {boolean} [debug=false] - Log decode failures.
   * @returns {string|null} The destination URL, or null if none can be recovered.
   */
  decodeBingRedirect(url, debug = false) {
    const match = url.match(/[?&]u=([^&#]+)/);
    if (!match) return null;

    const isHttpUrl = (candidate) => /^https?:\/\//i.test(candidate);
    try {
      const raw = decodeURIComponent(match[1]);
      if (isHttpUrl(raw)) return raw;

      if (/^a1[\w-]+={0,2}$/.test(raw)) {
        // Node's 'base64' decoder also accepts the URL-safe alphabet and
        // missing padding, so no manual normalisation is needed.
        const decoded = Buffer.from(raw.slice(2), 'base64').toString('utf8');
        if (isHttpUrl(decoded)) return decoded;
      }
    } catch (e) {
      if (debug) {
        logger.warn(`Failed to decode Bing redirect: ${e.message}`);
      }
    }
    return null;
  }

  /**
   * Normalise a result URL: unwrap Bing redirect links and drop tracking
   * query parameters.
   *
   * A parameter is removed only when its lower-cased name matches
   * `PROMOTIONAL_PARAM_PATTERNS` and is not listed in `ESSENTIAL_PARAMS`.
   * Repeated keys are preserved. The rebuilt URL consists of origin, path and
   * the surviving query string (any fragment is dropped).
   *
   * Returned unchanged: falsy or non-string input, strings that do not parse
   * as a URL, non-http(s) URLs, and Bing redirect links whose destination
   * cannot be recovered (stripping their parameters would break the redirect).
   *
   * @param {string} url - URL to clean.
   * @param {boolean} [debug=false] - Emit step-by-step log output.
   * @returns {string} The cleaned URL (or the input, see above).
   */
  cleanUrl(url, debug = false) {
    if (typeof url !== 'string' || url === '') return url;

    const originalUrl = url;
    let cleanUrl = url;
    if (debug) {
      logger.info(`Start cleaning URL: ${url}`);
    }

    // Handle Bing redirect links
    if (url.includes('bing.com/ck/') || url.includes('bing.com/rd/')) {
      const target = this.decodeBingRedirect(url, debug);
      if (!target) {
        return url;
      }
      cleanUrl = target;
      if (debug) {
        logger.info(`Extracted real URL from Bing redirect: ${cleanUrl}`);
      }
    }

    let urlObj;
    try {
      urlObj = new URL(cleanUrl);
    } catch (e) {
      if (debug) {
        logger.warn(`URL parsing failed, using original: ${e.message}`);
      }
      return cleanUrl;
    }

    // Non-http(s) schemes have an opaque origin ("null"); rebuilding them
    // from origin + pathname would corrupt the URL.
    if (urlObj.protocol !== 'http:' && urlObj.protocol !== 'https:') {
      return cleanUrl;
    }

    const cleanParams = new URLSearchParams();
    for (const [key, value] of urlObj.searchParams) {
      const keyLower = key.toLowerCase();
      const isPromotional = SearchService.PROMOTIONAL_PARAM_PATTERNS.some(
        (pattern) => pattern.test(keyLower)
      );
      const isEssential = SearchService.ESSENTIAL_PARAMS.has(keyLower);
      if (!isPromotional || isEssential) {
        // append (not set) so repeated keys such as tag=a&tag=b survive
        cleanParams.append(key, value);
      }
    }

    // Rebuild cleaned URL
    const queryString = cleanParams.toString();
    cleanUrl = `${urlObj.origin}${urlObj.pathname}`;
    if (queryString) {
      cleanUrl += `?${queryString}`;
    }
    if (debug) {
      logger.info(`URL cleaned: ${originalUrl} -> ${cleanUrl}`);
      logger.info(`Kept params: ${queryString}`);
    }
    return cleanUrl;
  }

  /**
   * Manual smoke test: run `cleanUrl` in debug mode over a few sample URLs
   * and log the before/after values.
   */
  testUrlCleaning() {
    const testUrls = [
      'https://www.bing.com/ck/a?!&&p=4d522fd05b7048c5069ccb0f66e7a47ec5374f21a3ebb37ca2675a14fcaab27bJmltdHM9MTc1NTA0MzIwMA&ptn=3&ver=2&hsh=4&fclid=241164da-b243-6c9d-3308-7292b3e86d31&u=a1aHR0cHM6Ly9iYWlrZS5iYWlkdS5jb20vaXRlbS8lRTQlQjglOTYlRTclOTUlOEMlRTYlQTglQTElRTUlOUUlOEIvNDkzODM5Ng&ntb=1',
      'https://example.com/article?id=123&utm_source=google&utm_medium=cpc&utm_campaign=news&ref=homepage&source=bing',
      'https://news.example.com/story?article=456&category=politics&year=2024&month=01&day=15&lang=zh&region=cn&fbclid=abc123&gclid=def456',
      'https://bing.com/search?q=test&count=10&hsh=xyz789&ptn=1&ver=2&ntb=1'
    ];
    logger.info('=== URL Cleaning Test ===');
    testUrls.forEach((url, index) => {
      const cleaned = this.cleanUrl(url, true);
      logger.info(`Test ${index + 1}: ${url}`);
      logger.info(`Cleaned: ${cleaned}`);
      logger.info('---');
    });
  }

  /**
   * Run a Bing web search and return the organic results.
   *
   * @param {string} query - Search terms.
   * @param {number} [maxResults=10] - Maximum number of results to return.
   * @returns {Promise<{engine: string, query: string, results: Array<{title: string, url: string, snippet: string, rank: number}>, totalResults: number, timestamp: string}>}
   * @throws {Error} "Bing search failed: …" wrapping the underlying error.
   */
  async searchBing(query, maxResults = 10) {
    let page = null;
    try {
      const browser = await this.initBrowser();
      page = await browser.newPage();
      await page.setUserAgent(this.getRandomUserAgent());
      await page.setViewport({ width: 1920, height: 1080 });
      const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${maxResults}`;
      await page.goto(searchUrl, { waitUntil: 'networkidle2' });
      await page.waitForSelector('#b_results', { timeout: 10000 });

      // Runs inside the page: must be self-contained, so the limit is passed
      // in as an argument rather than captured from the outer scope.
      const rawResults = await page.evaluate((limit) => {
        const searchResults = [];
        const resultElements = document.querySelectorAll('#b_results .b_algo');
        resultElements.forEach((element, index) => {
          if (searchResults.length >= limit) return;
          const titleElement = element.querySelector('h2 a');
          const snippetElement = element.querySelector('.b_caption p');
          if (titleElement) {
            searchResults.push({
              title: titleElement.textContent.trim(),
              url: titleElement.href,
              snippet: snippetElement ? snippetElement.textContent.trim() : '',
              rank: index + 1
            });
          }
        });
        return searchResults;
      }, maxResults);

      // Clean URLs in Node.js
      const results = rawResults.map((item) => ({
        ...item,
        url: this.cleanUrl(item.url)
      }));
      logger.info(`Bing search completed for query: "${query}", found ${results.length} results`);
      return {
        engine: 'Bing',
        query,
        results,
        totalResults: results.length,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error('Bing search error:', error);
      throw new Error(`Bing search failed: ${error.message}`, { cause: error });
    } finally {
      // Always release the tab, including when navigation or scraping failed.
      await this.closePage(page);
    }
  }

  /**
   * Run a Bing News search and return the extracted news items.
   *
   * @param {string} query - Search terms.
   * @param {number} [maxResults=10] - Maximum number of page elements inspected.
   * @param {string} [timeFilter='past_24_hours'] - One of 'past_hour',
   *   'past_24_hours', 'past_7_days', 'past_30_days'; other values add no filter.
   * @returns {Promise<{engine: string, query: string, results: Array<{title: string, url: string, source: string, time: string, rank: number}>, totalResults: number, timeFilter: string, timestamp: string}>}
   * @throws {Error} "Bing News search failed: …" wrapping the underlying error.
   */
  async searchBingNews(query, maxResults = 10, timeFilter = 'past_24_hours') {
    let page = null;
    try {
      const browser = await this.initBrowser();
      page = await browser.newPage();
      await page.setUserAgent(this.getRandomUserAgent());
      await page.setViewport({ width: 1920, height: 1080 });

      // Build news search URL
      let searchUrl = `https://www.bing.com/news/search?q=${encodeURIComponent(query)}`;
      // NOTE(review): these interval codes have not been verified against the
      // filter links Bing News itself generates — TODO confirm.
      const timeParams = {
        'past_hour': '&qft=interval%3d"1"',
        'past_24_hours': '&qft=interval%3d"24"',
        'past_7_days': '&qft=interval%3d"7"',
        'past_30_days': '&qft=interval%3d"30"'
      };
      // Own-property check so names like "constructor" do not match.
      if (Object.prototype.hasOwnProperty.call(timeParams, timeFilter)) {
        searchUrl += timeParams[timeFilter];
      }

      logger.info(`Navigating to Bing News: ${searchUrl}`);
      await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 15000 });
      // Give late-loading news results a fixed grace period
      await new Promise((resolve) => setTimeout(resolve, 3000));
      // Try to wait for page content
      try {
        await page.waitForSelector('body', { timeout: 5000 });
      } catch (error) {
        logger.warn('Timeout waiting for page load, continuing');
      }

      // Runs inside the page: must be self-contained.
      const rawResults = await page.evaluate((maxResults) => {
        const newsResults = [];
        // Try multiple selectors for news items; first one that matches wins
        const selectors = [
          '.news-card',
          '.news-item',
          '.news-card-container',
          '[data-news-card]',
          '.news-content',
          '.news-title',
          '.news-item-container',
          '.news-list-item'
        ];
        let newsElements = [];
        for (const selector of selectors) {
          newsElements = document.querySelectorAll(selector);
          if (newsElements.length > 0) {
            break;
          }
        }
        // Fallback: try links containing /news/
        if (newsElements.length === 0) {
          const allLinks = document.querySelectorAll('a[href*="/news/"]');
          newsElements = Array.from(allLinks).map((link) => {
            const container = link.closest('div') || link.parentElement;
            return container || link;
          });
        }
        // Fallback: just take some links
        if (newsElements.length === 0) {
          const allLinks = document.querySelectorAll('a[href]');
          newsElements = Array.from(allLinks).slice(0, 20);
        }

        // Return the trimmed text of the first matching descendant, or ''.
        const firstText = (element, candidates) => {
          for (const selector of candidates) {
            const el = element.querySelector(selector);
            if (el && el.textContent.trim()) {
              return el.textContent.trim();
            }
          }
          return '';
        };

        // Extract news info
        newsElements.forEach((element, index) => {
          if (index >= maxResults) return;
          let title = '';
          let url = '';
          // Title (and its link, when the title element is itself an anchor)
          const titleSelectors = ['h2', 'h3', '.news-title', '.title', 'a'];
          for (const selector of titleSelectors) {
            const el = element.querySelector(selector);
            if (el && el.textContent.trim()) {
              title = el.textContent.trim();
              if (el.href) {
                url = el.href;
              }
              break;
            }
          }
          // Link
          if (!url) {
            const linkEl = element.querySelector('a[href]');
            if (linkEl && linkEl.href) {
              url = linkEl.href;
            }
          }
          const source = firstText(element, ['.news-source', '.source', '.publisher', '.author']);
          const time = firstText(element, ['.news-time', '.time', '.date', '.timestamp']);
          // Validate
          if (title && url) {
            const isValidNewsUrl = url.includes('/news/') ||
              url.includes('news') ||
              url.includes('bing.com') ||
              url.includes('msn.com') ||
              url.includes('baidu.com') ||
              url.includes('baike.baidu.com');
            if (isValidNewsUrl) {
              newsResults.push({
                title,
                url,
                source,
                time,
                rank: index + 1
              });
            }
          }
        });
        return newsResults;
      }, maxResults);

      // Clean URLs in Node.js
      const results = rawResults.map((item) => ({
        ...item,
        url: this.cleanUrl(item.url)
      }));
      logger.info(`Bing News search completed for query: "${query}", found ${results.length} results`);
      return {
        engine: 'Bing News',
        query,
        results,
        totalResults: results.length,
        timeFilter,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error('Bing News search error:', error);
      throw new Error(`Bing News search failed: ${error.message}`, { cause: error });
    } finally {
      // Always release the tab, including when navigation or scraping failed.
      await this.closePage(page);
    }
  }

  /**
   * Run the query against each requested engine (currently only Bing).
   *
   * Per-engine failures are captured as `{ error: message }` instead of
   * rejecting, so the returned promise always fulfils.
   *
   * @param {string} query - Search terms.
   * @param {string[]} [engines=['bing']] - Engine names; unknown names are ignored.
   * @param {number} [maxResults=10] - Maximum results per engine.
   * @returns {Promise<{query: string, engines: string[], results: object, timestamp: string}>}
   */
  async multiSearch(query, engines = ['bing'], maxResults = 10) {
    const searchPromises = [];
    const results = {};
    if (engines.includes('bing')) {
      searchPromises.push(
        this.searchBing(query, maxResults)
          .then((result) => { results.bing = result; })
          .catch((error) => { results.bing = { error: error.message }; })
      );
    }
    await Promise.allSettled(searchPromises);
    return {
      query,
      engines: Object.keys(results),
      results,
      timestamp: new Date().toISOString()
    };
  }

  /**
   * Fetch a page body with the service's standard headers, a random
   * User-Agent and a 15 s timeout. Shared by the scraping methods.
   *
   * @param {string} url - Page to download.
   * @returns {Promise<*>} The axios response body (`response.data`).
   */
  async fetchHtml(url) {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': this.getRandomUserAgent(),
        ...SearchService.REQUEST_HEADERS
      },
      timeout: 15000
    });
    return response.data;
  }

  /**
   * Download a page and extract its title, meta description/keywords, a text
   * excerpt and outbound links.
   *
   * @param {string} url - Page to scrape.
   * @returns {Promise<{url: string, title: string, description: string, keywords: string, content: string, links: Array<{url: string, text: string}>, timestamp: string}>}
   * @throws {Error} "Failed to scrape webpage: …" wrapping the underlying error.
   */
  async scrapeWebpage(url) {
    try {
      const $ = cheerio.load(await this.fetchHtml(url));
      // Extract page info
      const title = $('title').text().trim();
      const description = $('meta[name="description"]').attr('content') || '';
      const keywords = $('meta[name="keywords"]').attr('content') || '';
      // Extract main content: collapse whitespace, cap at 2000 characters
      const content = $('body').text()
        .replace(/\s+/g, ' ')
        .trim()
        .substring(0, 2000);
      // Extract links: only the first 50 anchors are inspected, and only
      // absolute http(s) links with visible text are kept.
      const links = [];
      $('a[href]').each((index, element) => {
        if (index >= 50) return;
        const href = $(element).attr('href');
        const text = $(element).text().trim();
        if (href && text && href.startsWith('http')) {
          links.push({ url: href, text });
        }
      });
      logger.info(`Webpage scraped successfully: ${url}`);
      return {
        url,
        title,
        description,
        keywords,
        content,
        links,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error(`Webpage scraping error for ${url}:`, error);
      throw new Error(`Failed to scrape webpage: ${error.message}`, { cause: error });
    }
  }

  /**
   * Download a page and convert its main content area to Markdown.
   *
   * Scripts, styles, images, iframes and page chrome (nav/header/footer/aside)
   * are removed first. The content root is the first match of
   * `main, article, .content, .main, #content, #main`, falling back to `body`.
   *
   * @param {string} url - Page to convert.
   * @returns {Promise<{url: string, title: string, description: string, markdown: string, timestamp: string}>}
   * @throws {Error} "Failed to convert webpage to Markdown: …" wrapping the underlying error.
   */
  async getWebpageMarkdown(url) {
    try {
      const $ = cheerio.load(await this.fetchHtml(url));
      // Extract page info
      const title = $('title').text().trim();
      const description = $('meta[name="description"]').attr('content') || '';
      // Clean HTML, remove unwanted elements
      $('script, style, noscript, iframe, img').remove();
      $('nav, header, footer, aside').remove();
      // Get main content area
      let mainContent = $('main, article, .content, .main, #content, #main');
      if (mainContent.length === 0) {
        mainContent = $('body');
      }
      // Convert to Markdown (turndown is loaded on demand)
      const TurndownService = (await import('turndown')).default;
      const turndownService = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        bulletListMarker: '-'
      });
      // Custom rule for links: anchors without href or text collapse to
      // their inner content instead of producing empty link syntax.
      turndownService.addRule('links', {
        filter: 'a',
        replacement(content, node) {
          const href = node.getAttribute('href');
          const text = content.trim();
          if (href && text) {
            return `[${text}](${href})`;
          }
          return content;
        }
      });
      // html() yields null when nothing matched; turndown rejects non-strings.
      const markdown = turndownService.turndown(mainContent.html() || '');
      logger.info(`Webpage converted to Markdown successfully: ${url}`);
      return {
        url,
        title,
        description,
        markdown,
        timestamp: new Date().toISOString()
      };
    } catch (error) {
      logger.error(`Markdown conversion error for ${url}:`, error);
      throw new Error(`Failed to convert webpage to Markdown: ${error.message}`, { cause: error });
    }
  }
}
export default new SearchService();