search.js•12 kB
import axios from 'axios';
import * as cheerio from 'cheerio';
import https from 'https';
// Constants
const RESULTS_PER_PAGE = 10;
const MAX_CACHE_PAGES = 5;
// Rotating User Agents
const USER_AGENTS = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Edge/120.0.0.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
];
// Cache results to avoid repeated requests
const resultsCache = new Map();
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
// HTTPS agent configuration to handle certificate chain issues
const httpsAgent = new https.Agent({
  rejectUnauthorized: true, // Keep security enabled
  keepAlive: true,
  timeout: 10000,
  // Provide fallback for certificate issues while maintaining security
  secureProtocol: 'TLSv1_2_method'
});
/**
 * Get a random user agent from the list
 * @returns {string} A random user agent string
 */
function getRandomUserAgent() {
  return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
/**
 * Generate a cache key for a search query and page
 * @param {string} query - The search query
 * @param {number} page - The page number
 * @returns {string} The cache key
 */
function getCacheKey(query, page) {
  return `${query}-${page}`;
}
/**
 * Clear old entries from the cache
 */
function clearOldCache() {
  const now = Date.now();
  for (const [key, value] of resultsCache.entries()) {
    if (now - value.timestamp > CACHE_DURATION) {
      resultsCache.delete(key);
    }
  }
}
/**
 * Extract the direct URL from a DuckDuckGo redirect URL
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL
 */
function extractDirectUrl(duckduckgoUrl) {
  try {
    // Handle relative URLs from DuckDuckGo
    if (duckduckgoUrl.startsWith('//')) {
      duckduckgoUrl = 'https:' + duckduckgoUrl;
    } else if (duckduckgoUrl.startsWith('/')) {
      duckduckgoUrl = 'https://duckduckgo.com' + duckduckgoUrl;
    }
    const url = new URL(duckduckgoUrl);
    // Extract direct URL from DuckDuckGo redirect
    if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') {
      const uddg = url.searchParams.get('uddg');
      if (uddg) {
        return decodeURIComponent(uddg);
      }
    }
    // Handle ad redirects
    if (url.hostname === 'duckduckgo.com' && url.pathname === '/y.js') {
      const u3 = url.searchParams.get('u3');
      if (u3) {
        try {
          const decodedU3 = decodeURIComponent(u3);
          const u3Url = new URL(decodedU3);
          const clickUrl = u3Url.searchParams.get('ld');
          if (clickUrl) {
            return decodeURIComponent(clickUrl);
          }
          return decodedU3;
        } catch {
          return duckduckgoUrl;
        }
      }
    }
    return duckduckgoUrl;
  } catch {
    // If URL parsing fails, try to extract URL from a basic string match
    const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
    if (urlMatch) {
      return urlMatch[0];
    }
    return duckduckgoUrl;
  }
}
/**
 * Get a favicon URL for a given website URL
 * @param {string} url - The website URL
 * @returns {string} The favicon URL
 */
function getFaviconUrl(url) {
  try {
    const urlObj = new URL(url);
    return `https://www.google.com/s2/favicons?domain=${urlObj.hostname}&sz=32`;
  } catch {
    return ''; // Return empty string if URL is invalid
  }
}
/**
 * Scrapes search results from DuckDuckGo HTML
 * @param {string} query - The search query
 * @param {number} page - The page number (default: 1)
 * @param {number} numResults - Number of results to return (default: 10)
 * @returns {Promise<Array>} - Array of search results
 */
async function searchDuckDuckGo(query, page = 1, numResults = 10) {
  try {
    // Clear old cache entries
    clearOldCache();
    // Calculate start index for pagination
    const startIndex = (page - 1) * RESULTS_PER_PAGE;
    // Check cache first
    const cacheKey = getCacheKey(query, page);
    const cachedResults = resultsCache.get(cacheKey);
    if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
      return cachedResults.results.slice(0, numResults);
    }
    // Get a random user agent
    const userAgent = getRandomUserAgent();
    // Fetch results
    const response = await axios.get(
      `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}&s=${startIndex}`,
      {
        headers: {
          'User-Agent': userAgent
        },
        httpsAgent: httpsAgent
      }
    );
    if (response.status !== 200) {
      throw new Error('Failed to fetch search results');
    }
    const html = response.data;
    // Parse results using cheerio
    const $ = cheerio.load(html);
    const results = [];
    $('.result').each((i, result) => {
      const $result = $(result);
      const titleEl = $result.find('.result__title a');
      const linkEl = $result.find('.result__url');
      const snippetEl = $result.find('.result__snippet');
      const title = titleEl.text()?.trim();
      const rawLink = titleEl.attr('href');
      const description = snippetEl.text()?.trim();
      const displayUrl = linkEl.text()?.trim();
      const directLink = extractDirectUrl(rawLink || '');
      const favicon = getFaviconUrl(directLink);
      if (title && directLink) {
        results.push({
          title,
          url: directLink,
          snippet: description || '',
          favicon: favicon,
          displayUrl: displayUrl || ''
        });
      }
    });
    // Get paginated results
    const paginatedResults = results.slice(0, numResults);
    // Cache the results
    resultsCache.set(cacheKey, {
      results: paginatedResults,
      timestamp: Date.now()
    });
    // If cache is too big, remove oldest entries
    if (resultsCache.size > MAX_CACHE_PAGES) {
      const oldestKey = Array.from(resultsCache.keys())[0];
      resultsCache.delete(oldestKey);
    }
    return paginatedResults;
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);
    throw error;
  }
}
/**
 * Fetches the content of a URL and returns it as text
 * @param {string} url - The URL to fetch
 * @param {Object} options - Options for content extraction
 * @param {boolean} options.extractMainContent - Whether to attempt to extract main content (default: true)
 * @param {boolean} options.includeLinks - Whether to include link text (default: true)
 * @param {boolean} options.includeImages - Whether to include image alt text (default: true)
 * @param {string[]} options.excludeTags - Tags to exclude from extraction
 * @returns {Promise<string>} - The content of the URL
 */
async function fetchUrlContent(url, options = {}) {
  try {
    // Default options
    const {
      extractMainContent = true,
      includeLinks = true,
      includeImages = true,
      excludeTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'nav', 'footer', 'header', 'aside']
    } = options;
    // Get a random user agent
    const userAgent = getRandomUserAgent();
    const response = await axios.get(url, {
      headers: {
        'User-Agent': userAgent
      },
      timeout: 10000, // 10 second timeout
      httpsAgent: httpsAgent
    });
    if (response.status !== 200) {
      throw new Error(`Failed to fetch URL: ${url}`);
    }
    // If the content is HTML, extract the text content
    const contentType = response.headers['content-type'] || '';
    if (contentType.includes('text/html')) {
      const $ = cheerio.load(response.data);
      // Remove unwanted elements
      excludeTags.forEach(tag => {
        $(tag).remove();
      });
      // Remove ads and other common unwanted elements
      const unwantedSelectors = [
        '[id*="ad"]', '[class*="ad"]', '[id*="banner"]', '[class*="banner"]',
        '[id*="popup"]', '[class*="popup"]', '[class*="cookie"]',
        '[id*="cookie"]', '[class*="newsletter"]', '[id*="newsletter"]',
        '[class*="social"]', '[id*="social"]', '[class*="share"]', '[id*="share"]'
      ];
      unwantedSelectors.forEach(selector => {
        try {
          $(selector).remove();
        } catch (e) {
          // Ignore invalid selectors
        }
      });
      // Handle links and images
      if (!includeLinks) {
        $('a').each((i, link) => {
          $(link).replaceWith($(link).text());
        });
      }
      if (!includeImages) {
        $('img').remove();
      } else {
        // Replace images with their alt text
        $('img').each((i, img) => {
          const alt = $(img).attr('alt');
          if (alt) {
            $(img).replaceWith(`[Image: ${alt}]`);
          } else {
            $(img).remove();
          }
        });
      }
      // Try to extract main content if requested
      if (extractMainContent) {
        // Common content selectors in order of priority
        const contentSelectors = [
          'article', 'main', '[role="main"]', '.post-content', '.article-content',
          '.content', '#content', '.post', '.article', '.entry-content',
          '.page-content', '.post-body', '.post-text', '.story-body'
        ];
        for (const selector of contentSelectors) {
          const mainContent = $(selector).first();
          if (mainContent.length > 0) {
            // Clean up the content
            return cleanText(mainContent.text());
          }
        }
      }
      // If no main content found or not requested, use the body
      return cleanText($('body').text());
    }
    // For non-HTML content, return as is
    return response.data.toString();
  } catch (error) {
    console.error('Error fetching URL content:', error.message);
    throw error;
  }
}
/**
 * Cleans up text by removing excessive whitespace and normalizing line breaks
 * @param {string} text - The text to clean
 * @returns {string} - The cleaned text
 */
function cleanText(text) {
  return text
    .replace(/\s+/g, ' ')  // Replace multiple whitespace with single space
    .replace(/\n\s*\n/g, '\n\n')  // Normalize multiple line breaks
    .replace(/^\s+|\s+$/g, '')  // Trim start and end
    .trim();
}
/**
 * Extracts metadata from a URL (title, description, etc.)
 * @param {string} url - The URL to extract metadata from
 * @returns {Promise<Object>} - The metadata
 */
async function extractUrlMetadata(url) {
  try {
    // Get a random user agent
    const userAgent = getRandomUserAgent();
    const response = await axios.get(url, {
      headers: {
        'User-Agent': userAgent
      },
      httpsAgent: httpsAgent
    });
    if (response.status !== 200) {
      throw new Error(`Failed to fetch URL: ${url}`);
    }
    const $ = cheerio.load(response.data);
    // Extract metadata
    const title = $('title').text() || '';
    const description = $('meta[name="description"]').attr('content') ||
                       $('meta[property="og:description"]').attr('content') || '';
    const ogImage = $('meta[property="og:image"]').attr('content') || '';
    const favicon = $('link[rel="icon"]').attr('href') ||
                  $('link[rel="shortcut icon"]').attr('href') || '';
    // Resolve relative URLs
    const resolvedFavicon = favicon ? new URL(favicon, url).href : getFaviconUrl(url);
    const resolvedOgImage = ogImage ? new URL(ogImage, url).href : '';
    return {
      title,
      description,
      ogImage: resolvedOgImage,
      favicon: resolvedFavicon,
      url
    };
  } catch (error) {
    console.error('Error extracting URL metadata:', error.message);
    throw error;
  }
}
export {
  searchDuckDuckGo,
  fetchUrlContent,
  extractUrlMetadata,
  extractDirectUrl,
  getFaviconUrl
};