// search.ts
import { load } from "cheerio";
import { config } from "./config.js";
import type {
SearchResult,
SearchResultsPage,
SearchStatistics,
Searcher,
SearchOptions,
} from "./types.js";
import { RatelimitException, DuckDuckGoSearchException } from "./types.js";
/**
 * Searcher that scrapes DuckDuckGo's HTML endpoint.
 *
 * Every request is a form POST to `https://html.duckduckgo.com/html/`.
 * Requests are spaced by `config.rateLimitDelay`, failures other than
 * rate limiting are retried with exponential backoff up to
 * `config.maxRetries`, and pagination is driven by the hidden form fields
 * found next to the "Next" / "Previous" buttons of the returned page.
 */
export class DuckDuckGoSearcher implements Searcher {
  /** Endpoint that receives every search as a form POST. */
  private static readonly SEARCH_URL = "https://html.duckduckgo.com/html/";

  /** Status codes reported as `RatelimitException` (never retried). */
  private static readonly RATE_LIMIT_STATUSES: ReadonlySet<number> = new Set([
    202, 301, 403, 400, 429, 418,
  ]);

  /** Upper bound on the number of results extracted from one page. */
  private static readonly MAX_RESULTS = 50;

  /**
   * Link texts (lower-cased) treated as page navigation rather than results.
   * Compared for equality so that real titles merely containing one of these
   * words (e.g. "BBC News") are kept.
   */
  private static readonly NAVIGATION_LABELS: ReadonlySet<string> = new Set([
    "duckduckgo",
    "more results",
    "images",
    "videos",
    "news",
  ]);

  /** Timestamp (ms since epoch) recorded by the last `rateLimit()` call. */
  private lastRequestTime = 0;

  /**
   * Waits until at least `config.rateLimitDelay` ms have elapsed since the
   * previous call, then records the current time.
   */
  async rateLimit(): Promise<void> {
    const elapsed = Date.now() - this.lastRequestTime;
    if (elapsed < config.rateLimitDelay) {
      const delay = config.rateLimitDelay - elapsed;
      config.log(`Rate limiting: waiting ${delay}ms`);
      await DuckDuckGoSearcher.sleep(delay);
    }
    this.lastRequestTime = Date.now();
  }

  /**
   * Extracts the target URL from a DuckDuckGo redirect link.
   *
   * @param duckDuckGoUrl - An href from the result page; relative and
   *   protocol-relative values are resolved against `https://duckduckgo.com`.
   * @returns The value of the `uddg` query parameter when present and
   *   non-empty, otherwise the input unchanged.
   */
  decodeUrl(duckDuckGoUrl: string): string {
    try {
      const url = new URL(duckDuckGoUrl, "https://duckduckgo.com");
      // `searchParams.get` already percent-decodes the value. Decoding a
      // second time would corrupt escapes that belong to the target URL
      // (`a%20b` -> `a b`) and throw on a literal `%`.
      const target = url.searchParams.get("uddg");
      return target ? target : duckDuckGoUrl;
    } catch (error) {
      config.log("URL decode error:", DuckDuckGoSearcher.describeError(error));
      return duckDuckGoUrl;
    }
  }

  /**
   * Validates an HTTP status code.
   *
   * @throws RatelimitException for the status codes this client treats as
   *   rate limiting.
   * @throws DuckDuckGoSearchException for any other status that is not 200.
   */
  private checkResponseStatus(status: number): void {
    if (DuckDuckGoSearcher.RATE_LIMIT_STATUSES.has(status)) {
      throw new RatelimitException(status);
    }
    if (status !== 200) {
      throw new DuckDuckGoSearchException(status);
    }
  }

  /**
   * Runs a search and returns one page of results.
   *
   * @param query - Search terms; used as the `q` form field when no
   *   `nextToken` is given, and always echoed in the returned page.
   * @param nextToken - JSON-encoded object of form fields for a follow-up
   *   page. When provided it replaces the default form fields entirely.
   * @param options - Optional settings; `locale` becomes the `kl` form field
   *   (default `"wt-wt"`).
   * @param retryCount - Number of attempts already made (used by the
   *   internal retry recursion).
   * @throws Error `"Invalid next token format"` when `nextToken` is not a
   *   JSON object of primitive values.
   * @throws RatelimitException immediately, without retrying.
   * @throws Error once `config.maxRetries` retries are exhausted.
   */
  async search(
    query: string,
    nextToken?: string,
    options?: SearchOptions,
    retryCount = 0
  ): Promise<SearchResultsPage> {
    await this.rateLimit();

    const params: Record<string, string> = nextToken
      ? DuckDuckGoSearcher.parseNextToken(nextToken)
      : {
          q: query,
          b: "",
          kl: options?.locale || "wt-wt",
          df: "y",
        };
    config.log(`Searching with POST params:`, params);

    try {
      const html = await this.postForm(params, "https://html.duckduckgo.com/", {
        Priority: "u=0, i",
      });
      const results = this.parseResults(html);
      const pagination = this.parsePaginationInfo(html);
      config.log(`Parsed ${results.length} results`);
      // Without a token this is the first page; with one, estimate the page
      // from the `s` form field carried by the token.
      const currentPage = nextToken
        ? DuckDuckGoSearcher.estimatePageNumber(params.s)
        : 1;
      return this.buildPage(query, currentPage, results, pagination);
    } catch (error) {
      await this.backOffOrThrow(error, retryCount);
      return this.search(query, nextToken, options, retryCount + 1);
    }
  }

  /**
   * Fetches the page after `currentPage` using its `nextPageParams`.
   *
   * @throws Error when the page reports no next page or carries no
   *   parameters for it.
   */
  async getNextPage(
    currentPage: SearchResultsPage
  ): Promise<SearchResultsPage> {
    if (!this.hasNextPage(currentPage)) {
      throw new Error("No next page available");
    }
    if (currentPage.nextPageParams) {
      return this.searchWithParams(
        currentPage.nextPageParams,
        currentPage.currentPage + 1
      );
    }
    throw new Error("No next page parameters available");
  }

  /**
   * Fetches the page before `currentPage` using its `previousPageParams`.
   *
   * @throws Error when the page reports no previous page or carries no
   *   parameters for it.
   */
  async getPreviousPage(
    currentPage: SearchResultsPage
  ): Promise<SearchResultsPage> {
    if (!this.hasPreviousPage(currentPage)) {
      throw new Error("No previous page available");
    }
    if (currentPage.previousPageParams) {
      return this.searchWithParams(
        currentPage.previousPageParams,
        currentPage.currentPage - 1
      );
    }
    throw new Error("No previous page parameters available");
  }

  /**
   * Posts an explicit set of form fields and returns the resulting page.
   *
   * @param params - Form fields to submit; `params.q` (or `""`) is reported
   *   as the page's query.
   * @param pageNumber - Page number to report in the returned page.
   * @param retryCount - Number of attempts already made (used by the
   *   internal retry recursion).
   * @throws RatelimitException immediately, without retrying.
   * @throws Error once `config.maxRetries` retries are exhausted.
   */
  async searchWithParams(
    params: Record<string, string>,
    pageNumber: number,
    retryCount = 0
  ): Promise<SearchResultsPage> {
    await this.rateLimit();
    config.log(`Searching page ${pageNumber} with POST params:`, params);

    try {
      const html = await this.postForm(
        params,
        `https://html.duckduckgo.com/html/`
      );
      const results = this.parseResults(html);
      const pagination = this.parsePaginationInfo(html);
      config.log(`Parsed ${results.length} results for page ${pageNumber}`);
      return this.buildPage(params.q || "", pageNumber, results, pagination);
    } catch (error) {
      await this.backOffOrThrow(error, retryCount);
      return this.searchWithParams(params, pageNumber, retryCount + 1);
    }
  }

  /**
   * Returns statistics for `query`.
   *
   * The `page` argument is accepted but not used: a page number cannot be
   * turned into form parameters without a previous search, so the first
   * page is always fetched.
   */
  async getSearchStatistics(
    query: string,
    page: number
  ): Promise<SearchStatistics> {
    const searchPage = await this.search(query, undefined, undefined);
    return {
      totalResults: searchPage.totalResults,
      currentPage: searchPage.currentPage,
      totalPages: searchPage.totalPages,
      resultsPerPage: searchPage.results.length,
      hasNextPage: searchPage.hasNextPage,
      hasPreviousPage: searchPage.hasPreviousPage,
    };
  }

  /** Whether `currentPage` reports a following page. */
  hasNextPage(currentPage: SearchResultsPage): boolean {
    return currentPage.hasNextPage;
  }

  /** Whether `currentPage` reports a preceding page. */
  hasPreviousPage(currentPage: SearchResultsPage): boolean {
    return currentPage.hasPreviousPage;
  }

  /**
   * Detects the "Next" / "Previous" buttons under `#links` and collects the
   * hidden fields of their forms.
   *
   * @returns Flags for both directions, plus the form fields for each
   *   direction whose button is present.
   */
  parsePaginationInfo(html: string): {
    hasNext: boolean;
    hasPrevious: boolean;
    nextPageParams?: Record<string, string>;
    previousPageParams?: Record<string, string>;
  } {
    const $ = load(html);

    // Collects the hidden inputs of the form owning the given submit button,
    // or returns undefined when no such button exists.
    const readFormParams = (
      label: "Next" | "Previous"
    ): Record<string, string> | undefined => {
      const button = $(`#links form input[value="${label}"]`);
      if (button.length === 0) return undefined;
      const fields: Record<string, string> = {};
      // `closest` (rather than `parent`) still finds the form if the button
      // is wrapped in another element.
      button
        .closest("form")
        .find('input[type="hidden"]')
        .each((_, el) => {
          const name = $(el).attr("name");
          const value = $(el).attr("value");
          if (name && value !== undefined) {
            fields[name] = value;
          }
        });
      return fields;
    };

    const nextPageParams = readFormParams("Next");
    if (nextPageParams) {
      config.log(`Next page params:`, nextPageParams);
    }
    const previousPageParams = readFormParams("Previous");
    if (previousPageParams) {
      config.log(`Previous page params:`, previousPageParams);
    }

    const hasNext = nextPageParams !== undefined;
    const hasPrevious = previousPageParams !== undefined;
    config.log(
      `DuckDuckGo pagination buttons - Next: ${hasNext}, Previous: ${hasPrevious}`
    );

    const result: {
      hasNext: boolean;
      hasPrevious: boolean;
      nextPageParams?: Record<string, string>;
      previousPageParams?: Record<string, string>;
    } = { hasNext, hasPrevious };
    // Optional keys are only set when there is a value to store.
    if (nextPageParams) {
      result.nextPageParams = nextPageParams;
    }
    if (previousPageParams) {
      result.previousPageParams = previousPageParams;
    }
    return result;
  }

  /**
   * Extracts search results from a result page.
   *
   * Every `a[href]` with non-empty text is considered in document order. A
   * link is kept when its decoded URL is an http(s) URL outside
   * duckduckgo.com, has not been seen before, and its text is not a
   * navigation label. At most 50 results are returned, indexed from 1.
   */
  parseResults(html: string): SearchResult[] {
    const $ = load(html);
    const results: SearchResult[] = [];

    config.log(`HTML length: ${html.length}`);
    config.log(`Found .result elements: ${$(".result").length}`);
    config.log(`Found .web-result elements: ${$(".web-result").length}`);
    config.log(`Found .result__a elements: ${$(".result__a").length}`);

    const seenUrls = new Set<string>();

    $("a[href]").each((_, element) => {
      // Returning false stops the cheerio iteration.
      if (results.length >= DuckDuckGoSearcher.MAX_RESULTS) return false;

      const $link = $(element);
      const href = $link.attr("href");
      const title = $link.text().trim();
      if (!href || !title) return undefined;

      const url = this.decodeUrl(href);
      if (
        seenUrls.has(url) ||
        !this.isExternalResultUrl(url) ||
        DuckDuckGoSearcher.NAVIGATION_LABELS.has(title.toLowerCase())
      ) {
        return undefined;
      }
      seenUrls.add(url);

      // Prefer a dedicated snippet element inside the enclosing result
      // container; otherwise fall back to the container's own text.
      let snippet = "";
      const $container = $link.closest(
        '.result, .web-result, div[class*="result"]'
      );
      if ($container.length) {
        snippet = $container
          .find('.result__snippet, .snippet, [class*="snippet"]')
          .first()
          .text()
          .trim();
        if (!snippet) {
          snippet = $container
            .text()
            .replace(/\s+/g, " ")
            .trim()
            .substring(0, 200);
        }
      }

      results.push({
        title,
        url,
        snippet,
        index: results.length + 1,
      });
      return undefined;
    });

    config.log(`Extracted ${results.length} unique results from all links`);
    config.log(`Final parsed results: ${results.length}`);
    return results;
  }

  /**
   * Whether `url` is an absolute http(s) URL that points outside
   * duckduckgo.com and does not contain an `/ads/` segment.
   * The host is compared instead of searching the whole string, so URLs
   * that merely mention "duckduckgo.com" in their path or query are kept.
   */
  private isExternalResultUrl(url: string): boolean {
    if (!url.startsWith("http://") && !url.startsWith("https://")) {
      return false;
    }
    if (url.includes("/ads/")) {
      return false;
    }
    let host: string;
    try {
      host = new URL(url).hostname.toLowerCase();
    } catch {
      // Not parseable as a URL, so it cannot be a usable result link.
      return false;
    }
    return host !== "duckduckgo.com" && !host.endsWith(".duckduckgo.com");
  }

  /**
   * POSTs `params` as a form to the search endpoint and returns the body.
   *
   * @param referer - Value of the `Referer` header.
   * @param extraHeaders - Headers appended after the common set.
   * @throws RatelimitException | DuckDuckGoSearchException on a non-200
   *   status (see `checkResponseStatus`).
   */
  private async postForm(
    params: Record<string, string>,
    referer: string,
    extraHeaders: Record<string, string> = {}
  ): Promise<string> {
    const startTime = Date.now();
    const response = await fetch(DuckDuckGoSearcher.SEARCH_URL, {
      method: "POST",
      headers: {
        "User-Agent": config.userAgent,
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        Connection: "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cache-Control": "max-age=0",
        Cookie: "df=y; kl=us-en",
        Referer: referer,
        Origin: "https://html.duckduckgo.com",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        ...extraHeaders,
      },
      body: new URLSearchParams(params),
    });
    config.log(`Response: ${response.status} in ${Date.now() - startTime}ms`);
    this.checkResponseStatus(response.status);
    return response.text();
  }

  /**
   * Assembles a `SearchResultsPage` from parsed results and pagination.
   *
   * Totals are estimates: when a next page exists the page count is assumed
   * to be at least 10, and the result count is the current page's result
   * count multiplied by that page count.
   */
  private buildPage(
    query: string,
    currentPage: number,
    results: SearchResult[],
    pagination: ReturnType<DuckDuckGoSearcher["parsePaginationInfo"]>
  ): SearchResultsPage {
    config.log(
      `Pagination: hasNext=${pagination.hasNext}, hasPrevious=${pagination.hasPrevious}`
    );
    const totalPages = pagination.hasNext
      ? Math.max(currentPage + 1, 10)
      : currentPage;
    const page: SearchResultsPage = {
      results,
      currentPage,
      totalPages,
      totalResults: results.length * totalPages,
      hasNextPage: pagination.hasNext,
      hasPreviousPage: pagination.hasPrevious,
      query,
    };
    if (pagination.nextPageParams) {
      page.nextPageParams = pagination.nextPageParams;
    }
    if (pagination.previousPageParams) {
      page.previousPageParams = pagination.previousPageParams;
    }
    return page;
  }

  /**
   * Shared failure handling for one search attempt: logs the error, rethrows
   * rate-limit errors untouched, throws a summary error once
   * `config.maxRetries` is reached, and otherwise sleeps `2^retryCount`
   * seconds so the caller can try again.
   */
  private async backOffOrThrow(
    error: unknown,
    retryCount: number
  ): Promise<void> {
    const message = DuckDuckGoSearcher.describeError(error);
    config.log(`Search error (attempt ${retryCount + 1}):`, message);
    if (error instanceof RatelimitException) {
      throw error;
    }
    if (retryCount >= config.maxRetries) {
      throw new Error(
        `Search failed after ${config.maxRetries + 1} attempts: ${message}`
      );
    }
    const delay = 2 ** retryCount * 1000;
    config.log(`Retrying in ${delay}ms...`);
    await DuckDuckGoSearcher.sleep(delay);
  }

  /**
   * Parses a pagination token into form fields.
   *
   * @throws Error `"Invalid next token format"` unless the token is JSON for
   *   a plain object whose values are strings, numbers or booleans.
   */
  private static parseNextToken(nextToken: string): Record<string, string> {
    let parsed: unknown;
    try {
      parsed = JSON.parse(nextToken);
    } catch {
      throw new Error('Invalid next token format');
    }
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error('Invalid next token format');
    }
    const fields: Record<string, string> = {};
    for (const [name, value] of Object.entries(parsed)) {
      if (
        typeof value !== "string" &&
        typeof value !== "number" &&
        typeof value !== "boolean"
      ) {
        throw new Error('Invalid next token format');
      }
      fields[name] = String(value);
    }
    return fields;
  }

  /**
   * Estimates a 1-based page number from the `s` form field, assuming ten
   * results per page. Falls back to 1 when the field is missing, not a
   * number, or negative.
   */
  private static estimatePageNumber(offset: string | undefined): number {
    if (!offset) return 1;
    const start = Number.parseInt(offset, 10);
    return Number.isFinite(start) && start >= 0
      ? Math.floor(start / 10) + 1
      : 1;
  }

  /** Message of an `Error`, or the string form of any other thrown value. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Resolves after `ms` milliseconds. */
  private static sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}