Skip to main content
Glama

LLM Researcher

by Code-Hex
search.ts (15.7 kB)
import { load } from "cheerio";
import { config } from "./config.js";
import type {
  SearchResult,
  SearchResultsPage,
  SearchStatistics,
  Searcher,
  SearchOptions,
} from "./types.js";
import { RatelimitException, DuckDuckGoSearchException } from "./types.js";

/**
 * Searcher backed by DuckDuckGo's HTML endpoint.
 *
 * Every request is a form POST to `https://html.duckduckgo.com/html/`. The
 * returned HTML is scraped with cheerio for result links and for the hidden
 * form fields behind the "Next" / "Previous" buttons, which are what drive
 * pagination.
 */
export class DuckDuckGoSearcher implements Searcher {
  /** Endpoint that receives every search and pagination POST. */
  private static readonly searchUrl = "https://html.duckduckgo.com/html/";

  /** HTTP statuses reported to callers as rate limiting. */
  private static readonly rateLimitStatuses: readonly number[] = [
    202, 301, 403, 400, 429, 418,
  ];

  /** Upper bound on the number of results collected from a single page. */
  private static readonly maxResults = 50;

  /**
   * Lower-cased fragments that cause a link to be skipped when they occur
   * anywhere in its title.
   * NOTE(review): this is substring matching, so genuine results whose title
   * merely contains e.g. "news" are dropped as well — confirm this is intended.
   */
  private static readonly blockedTitleFragments: readonly string[] = [
    "duckduckgo",
    "more results",
    "images",
    "videos",
    "news",
  ];

  /** Timestamp (ms since epoch) recorded at the end of the last `rateLimit()` call. */
  private lastRequestTime = 0;

  /**
   * Waits until at least `config.rateLimitDelay` ms have elapsed since the
   * previous call, then records the current time as the last request time.
   */
  async rateLimit(): Promise<void> {
    const elapsed = Date.now() - this.lastRequestTime;

    if (elapsed < config.rateLimitDelay) {
      const delay = config.rateLimitDelay - elapsed;
      config.log(`Rate limiting: waiting ${delay}ms`);
      await this.sleep(delay);
    }

    this.lastRequestTime = Date.now();
  }

  /**
   * Extracts the target URL from a DuckDuckGo redirect link.
   *
   * @param duckDuckGoUrl - An href taken from the results page; relative and
   *   protocol-relative values are resolved against `https://duckduckgo.com`.
   * @returns The value of the `uddg` query parameter when present, otherwise
   *   the input unchanged (also when the input cannot be parsed as a URL).
   */
  decodeUrl(duckDuckGoUrl: string): string {
    try {
      const url = new URL(duckDuckGoUrl, "https://duckduckgo.com");
      // `searchParams.get` already returns the percent-decoded value. Decoding
      // it a second time would corrupt targets that themselves contain encoded
      // characters (e.g. `%2520` -> ` `) and throw on a literal `%`.
      const uddg = url.searchParams.get("uddg");
      return uddg ? uddg : duckDuckGoUrl;
    } catch (error) {
      config.log("URL decode error:", this.describeError(error));
      return duckDuckGoUrl;
    }
  }

  /**
   * Maps an HTTP status to the matching exception.
   *
   * @throws RatelimitException for the statuses treated as rate limiting.
   * @throws DuckDuckGoSearchException for any other status that is not 200.
   */
  private checkResponseStatus(status: number): void {
    if (DuckDuckGoSearcher.rateLimitStatuses.includes(status)) {
      throw new RatelimitException(status);
    }

    if (status !== 200) {
      throw new DuckDuckGoSearchException(status);
    }
  }

  /**
   * Runs a search, or continues one from a pagination token.
   *
   * @param query - Search terms; used as the `q` form field when no token is given.
   * @param nextToken - JSON-encoded form parameters. When provided they replace
   *   the parameters that would otherwise be built from `query` and `options`.
   * @param options - Optional settings; `locale` becomes the `kl` form field
   *   (default `"wt-wt"`).
   * @param retryCount - Zero-based attempt counter used by the retry logic.
   * @returns The parsed page, including estimated page and result totals.
   * @throws Error with message `Invalid next token format` if `nextToken` is
   *   not a JSON object whose values are strings, numbers or booleans.
   * @throws RatelimitException immediately (never retried) on a rate-limit status.
   * @throws Error once `config.maxRetries` retries have been exhausted.
   */
  async search(
    query: string,
    nextToken?: string,
    options?: SearchOptions,
    retryCount = 0
  ): Promise<SearchResultsPage> {
    await this.rateLimit();

    // A token, when present, carries the complete set of form parameters.
    const params: Record<string, string> = nextToken
      ? this.parseNextToken(nextToken)
      : {
          q: query,
          b: "",
          kl: options?.locale || "wt-wt",
          df: "y",
        };

    config.log(`Searching with POST params:`, params);

    try {
      const html = await this.postSearchForm(params, {
        ...this.buildHeaders("https://html.duckduckgo.com/"),
        Priority: "u=0, i",
      });

      // Without a token this is by definition the first page.
      const currentPage = nextToken ? this.pageFromOffset(params.s) : 1;

      return this.buildResultsPage(html, currentPage, query);
    } catch (error) {
      await this.backoffBeforeRetry(error, retryCount);
      return this.search(query, nextToken, options, retryCount + 1);
    }
  }

  /**
   * Fetches the page after `currentPage`.
   *
   * @throws Error if the page reports no next page or carries no next-page parameters.
   */
  async getNextPage(
    currentPage: SearchResultsPage
  ): Promise<SearchResultsPage> {
    if (!this.hasNextPage(currentPage)) {
      throw new Error("No next page available");
    }

    if (!currentPage.nextPageParams) {
      throw new Error("No next page parameters available");
    }

    return this.searchWithParams(
      currentPage.nextPageParams,
      currentPage.currentPage + 1
    );
  }

  /**
   * Fetches the page before `currentPage`.
   *
   * @throws Error if the page reports no previous page or carries no
   *   previous-page parameters.
   */
  async getPreviousPage(
    currentPage: SearchResultsPage
  ): Promise<SearchResultsPage> {
    if (!this.hasPreviousPage(currentPage)) {
      throw new Error("No previous page available");
    }

    if (!currentPage.previousPageParams) {
      throw new Error("No previous page parameters available");
    }

    return this.searchWithParams(
      currentPage.previousPageParams,
      currentPage.currentPage - 1
    );
  }

  /**
   * Posts a ready-made set of form parameters (normally those scraped from a
   * pagination form) and parses the response as page `pageNumber`.
   *
   * @param params - Form fields to submit verbatim.
   * @param pageNumber - Page number to report on the returned page.
   * @param retryCount - Zero-based attempt counter used by the retry logic.
   * @throws RatelimitException immediately (never retried) on a rate-limit status.
   * @throws Error once `config.maxRetries` retries have been exhausted.
   */
  async searchWithParams(
    params: Record<string, string>,
    pageNumber: number,
    retryCount = 0
  ): Promise<SearchResultsPage> {
    await this.rateLimit();

    config.log(`Searching page ${pageNumber} with POST params:`, params);

    try {
      const html = await this.postSearchForm(
        params,
        this.buildHeaders(`https://html.duckduckgo.com/html/`)
      );

      return this.buildResultsPage(
        html,
        pageNumber,
        params.q || "",
        ` for page ${pageNumber}`
      );
    } catch (error) {
      await this.backoffBeforeRetry(error, retryCount);
      return this.searchWithParams(params, pageNumber, retryCount + 1);
    }
  }

  /**
   * Returns statistics for `query`.
   *
   * @param query - Search terms.
   * @param page - Currently ignored: a page number cannot be turned into a
   *   pagination token without a prior search, so the first page is always fetched.
   */
  async getSearchStatistics(
    query: string,
    page: number
  ): Promise<SearchStatistics> {
    const searchPage = await this.search(query, undefined, undefined);

    return {
      totalResults: searchPage.totalResults,
      currentPage: searchPage.currentPage,
      totalPages: searchPage.totalPages,
      resultsPerPage: searchPage.results.length,
      hasNextPage: searchPage.hasNextPage,
      hasPreviousPage: searchPage.hasPreviousPage,
    };
  }

  /** Whether `currentPage` reports a following page. */
  hasNextPage(currentPage: SearchResultsPage): boolean {
    return currentPage.hasNextPage;
  }

  /** Whether `currentPage` reports a preceding page. */
  hasPreviousPage(currentPage: SearchResultsPage): boolean {
    return currentPage.hasPreviousPage;
  }

  /**
   * Detects the "Next" / "Previous" buttons inside `#links` and collects the
   * hidden inputs of the form each button belongs to.
   *
   * @param html - Raw HTML of a results page.
   * @returns Presence flags plus, for each button found, its form parameters.
   */
  parsePaginationInfo(html: string): {
    hasNext: boolean;
    hasPrevious: boolean;
    nextPageParams?: Record<string, string>;
    previousPageParams?: Record<string, string>;
  } {
    const $ = load(html);

    const nextPageParams = this.collectHiddenInputs($, "Next");
    if (nextPageParams) {
      config.log(`Next page params:`, nextPageParams);
    }

    const previousPageParams = this.collectHiddenInputs($, "Previous");
    if (previousPageParams) {
      config.log(`Previous page params:`, previousPageParams);
    }

    const hasNext = nextPageParams !== undefined;
    const hasPrevious = previousPageParams !== undefined;

    config.log(
      `DuckDuckGo pagination buttons - Next: ${hasNext}, Previous: ${hasPrevious}`
    );

    const result: {
      hasNext: boolean;
      hasPrevious: boolean;
      nextPageParams?: Record<string, string>;
      previousPageParams?: Record<string, string>;
    } = { hasNext, hasPrevious };

    // Optional keys are only set when a value exists, never assigned `undefined`.
    if (nextPageParams) {
      result.nextPageParams = nextPageParams;
    }
    if (previousPageParams) {
      result.previousPageParams = previousPageParams;
    }

    return result;
  }

  /**
   * Extracts search results from a results page by scanning every anchor that
   * has an `href`.
   *
   * Links are de-duplicated by decoded URL, filtered through `isNoiseLink`,
   * and capped at 50 entries. The snippet is taken from the nearest enclosing
   * result container, falling back to the first 200 characters of that
   * container's whitespace-collapsed text.
   *
   * @param html - Raw HTML of a results page.
   * @returns Results in document order with 1-based `index` values.
   */
  parseResults(html: string): SearchResult[] {
    const $ = load(html);
    const results: SearchResult[] = [];

    // Debug: log how much recognisable result markup the page contains.
    config.log(`HTML length: ${html.length}`);
    config.log(`Found .result elements: ${$(".result").length}`);
    config.log(`Found .web-result elements: ${$(".web-result").length}`);
    config.log(`Found .result__a elements: ${$(".result__a").length}`);

    const seenUrls = new Set<string>();

    $("a[href]").each((_index, element) => {
      // Returning false stops the cheerio iteration once the cap is reached.
      if (results.length >= DuckDuckGoSearcher.maxResults) return false;

      const $link = $(element);
      const href = $link.attr("href");
      const title = $link.text().trim();
      if (!href || !title) return undefined;

      const url = this.decodeUrl(href);
      if (seenUrls.has(url) || this.isNoiseLink(url, title)) {
        return undefined;
      }
      seenUrls.add(url);

      let snippet = "";
      const $container = $link.closest(
        '.result, .web-result, div[class*="result"]'
      );
      if ($container.length) {
        snippet = $container
          .find('.result__snippet, .snippet, [class*="snippet"]')
          .first()
          .text()
          .trim();

        if (!snippet) {
          // Fallback to the container text, cleaned up.
          snippet = $container
            .text()
            .replace(/\s+/g, " ")
            .trim()
            .substring(0, 200);
        }
      }

      results.push({
        title,
        url,
        snippet,
        index: results.length + 1,
      });
      return undefined;
    });

    config.log(`Extracted ${results.length} unique results from all links`);
    config.log(`Final parsed results: ${results.length}`);

    return results;
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise<void>((resolve) => setTimeout(resolve, ms));
  }

  /** Returns a printable message for any thrown value, not only `Error` instances. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Decodes a pagination token into form parameters.
   *
   * Strings are kept as-is; numbers and booleans are stringified the same way
   * `URLSearchParams` would. Anything else is rejected so malformed tokens
   * fail up front instead of being posted as garbage form fields.
   *
   * @throws Error with message `Invalid next token format`.
   */
  private parseNextToken(nextToken: string): Record<string, string> {
    let decoded: unknown;
    try {
      decoded = JSON.parse(nextToken);
    } catch {
      throw new Error("Invalid next token format");
    }

    if (
      typeof decoded !== "object" ||
      decoded === null ||
      Array.isArray(decoded)
    ) {
      throw new Error("Invalid next token format");
    }

    const params: Record<string, string> = {};
    for (const [key, value] of Object.entries(
      decoded as Record<string, unknown>
    )) {
      if (typeof value === "string") {
        params[key] = value;
      } else if (typeof value === "number" || typeof value === "boolean") {
        params[key] = String(value);
      } else {
        throw new Error("Invalid next token format");
      }
    }
    return params;
  }

  /**
   * Estimates a 1-based page number from the `s` form parameter, treating it
   * as a result offset with ten results per page. Missing, non-numeric or
   * negative values yield page 1 rather than `NaN` or a non-positive page.
   */
  private pageFromOffset(offset: string | undefined): number {
    if (!offset) return 1;

    const parsed = Number.parseInt(offset, 10);
    if (Number.isNaN(parsed) || parsed < 0) return 1;

    return Math.floor(parsed / 10) + 1;
  }

  /**
   * Builds the browser-like header set sent with every request.
   * NOTE(review): the Cookie value is fixed at `kl=us-en` even when a
   * different locale is posted in the form — confirm this is intended.
   */
  private buildHeaders(referer: string): Record<string, string> {
    return {
      "User-Agent": config.userAgent,
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Accept-Encoding": "gzip, deflate",
      Connection: "keep-alive",
      "Content-Type": "application/x-www-form-urlencoded",
      "Cache-Control": "max-age=0",
      Cookie: "df=y; kl=us-en",
      Referer: referer,
      Origin: "https://html.duckduckgo.com",
      "Upgrade-Insecure-Requests": "1",
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "same-origin",
      "Sec-Fetch-User": "?1",
    };
  }

  /**
   * POSTs `params` as a URL-encoded form, logs status and latency, validates
   * the status code and returns the response body.
   *
   * @throws RatelimitException | DuckDuckGoSearchException on a non-200 status.
   */
  private async postSearchForm(
    params: Record<string, string>,
    headers: Record<string, string>
  ): Promise<string> {
    const startTime = Date.now();

    const response = await fetch(DuckDuckGoSearcher.searchUrl, {
      method: "POST",
      headers,
      body: new URLSearchParams(params),
    });

    const responseTime = Date.now() - startTime;
    config.log(`Response: ${response.status} in ${responseTime}ms`);

    this.checkResponseStatus(response.status);

    return response.text();
  }

  /**
   * Parses `html` into a `SearchResultsPage`.
   *
   * Totals are estimates: when a next page exists the page count is assumed to
   * be at least 10, and the result count is this page's size times that count.
   *
   * @param logSuffix - Text appended to the "Parsed N results" log line.
   */
  private buildResultsPage(
    html: string,
    pageNumber: number,
    query: string,
    logSuffix = ""
  ): SearchResultsPage {
    const results = this.parseResults(html);
    const paginationInfo = this.parsePaginationInfo(html);

    config.log(`Parsed ${results.length} results${logSuffix}`);
    config.log(
      `Pagination: hasNext=${paginationInfo.hasNext}, hasPrevious=${paginationInfo.hasPrevious}`
    );

    const estimatedTotalPages = paginationInfo.hasNext
      ? Math.max(pageNumber + 1, 10)
      : pageNumber;

    const searchPage: SearchResultsPage = {
      results,
      currentPage: pageNumber,
      totalPages: estimatedTotalPages,
      totalResults: results.length * estimatedTotalPages,
      hasNextPage: paginationInfo.hasNext,
      hasPreviousPage: paginationInfo.hasPrevious,
      query,
    };

    if (paginationInfo.nextPageParams) {
      searchPage.nextPageParams = paginationInfo.nextPageParams;
    }
    if (paginationInfo.previousPageParams) {
      searchPage.previousPageParams = paginationInfo.previousPageParams;
    }

    return searchPage;
  }

  /**
   * Shared failure handling for `search` and `searchWithParams`.
   *
   * Logs the failure, then either rethrows (rate limiting is never retried),
   * throws the final "failed after N attempts" error, or sleeps for an
   * exponentially growing delay (1s, 2s, 4s, ...) and returns so the caller
   * can retry.
   */
  private async backoffBeforeRetry(
    error: unknown,
    retryCount: number
  ): Promise<void> {
    const message = this.describeError(error);
    config.log(`Search error (attempt ${retryCount + 1}):`, message);

    if (error instanceof RatelimitException) {
      throw error;
    }

    if (retryCount >= config.maxRetries) {
      throw new Error(
        `Search failed after ${config.maxRetries + 1} attempts: ${message}`
      );
    }

    const delay = 2 ** retryCount * 1000;
    config.log(`Retrying in ${delay}ms...`);
    await this.sleep(delay);
  }

  /**
   * Collects the hidden inputs of the form that owns the given pagination
   * button.
   *
   * @returns The name/value pairs, or `undefined` when no such button exists.
   */
  private collectHiddenInputs(
    $: ReturnType<typeof load>,
    buttonLabel: "Next" | "Previous"
  ): Record<string, string> | undefined {
    const button = $(`#links form input[value="${buttonLabel}"]`);
    if (button.length === 0) return undefined;

    const fields: Record<string, string> = {};
    button
      .parent("form")
      .find('input[type="hidden"]')
      .each((_, el) => {
        const name = $(el).attr("name");
        const value = $(el).attr("value");
        // Empty values are kept; only inputs lacking a name or value are skipped.
        if (name && value !== undefined) {
          fields[name] = value;
        }
      });
    return fields;
  }

  /**
   * Decides whether a link should be excluded from the results: anything that
   * is not an absolute http(s) URL, DuckDuckGo-internal or ad links, very
   * short titles, and titles containing a blocked fragment.
   */
  private isNoiseLink(url: string, title: string): boolean {
    if (!url.startsWith("http://") && !url.startsWith("https://")) {
      return true;
    }

    if (
      url.includes("duckduckgo.com") ||
      url.includes("javascript:") ||
      url.startsWith("#") ||
      url.includes("/ads/")
    ) {
      return true;
    }

    if (title.length < 5) return true;

    const loweredTitle = title.toLowerCase();
    return DuckDuckGoSearcher.blockedTitleFragments.some((fragment) =>
      loweredTitle.includes(fragment)
    );
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Code-Hex/light-research-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.