import { CrawlQueue } from './queue.js';
import { RobotsParser } from './robots.js';
import { HtmlParser } from './parser.js';
import { ContentAnalyzer } from '@/llm/analyzer.js';
import { createLLMClient } from '@/llm/client.js';
import { AnalysisType } from '@/llm/types.js';
import { logger } from '@/utils/logger.js';
import { withRetry } from '@/utils/retry.js';
import { normalizeUrl, isSameDomain, matchesPattern } from '@/utils/url.js';
import type { CrawlResult, CrawlOptions, CrawlStats, QueueItem, LLMAnalysisResult } from './types.js';
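
// Depth- and page-limited web crawler: stays on the start URL's domain,
// respects robots.txt (including crawl-delay), optionally runs LLM content
// analysis on each page, and tracks aggregate crawl statistics.
//
// Typical usage (options shape inferred from the fields read below):
//   const crawler = new WebCrawler({ concurrency: 4, maxDepth: 2, maxPages: 100, ... });
//   const results = await crawler.crawl('https://example.com');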
export class WebCrawler {
private queue = new CrawlQueue();
private results = new Map<string, CrawlResult>();
private robots?: RobotsParser;
private analyzer?: ContentAnalyzer;
private stats: CrawlStats;
private activeRequests = 0;
private crawlDelay = 0;
private lastRequestTime = 0;
constructor(private options: CrawlOptions) {
this.stats = {
totalPages: 0,
successfulPages: 0,
failedPages: 0,
averageResponseTime: 0,
startTime: Date.now(),
};
}
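  // Crawls outward from startUrl until the queue is exhausted or maxPages is
  // reached, then returns every successfully crawled page.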
async crawl(startUrl: string): Promise<CrawlResult[]> {
logger.info(`Starting crawl from ${startUrl}`);
this.stats.startTime = Date.now();
const normalizedUrl = normalizeUrl(startUrl);
// Initialize LLM analyzer if enabled
if (this.options.enableLLMAnalysis) {
const llmClient = createLLMClient();
if (llmClient.isAvailable()) {
this.analyzer = new ContentAnalyzer(llmClient);
logger.info(`LLM analysis enabled with provider: ${llmClient.getProviderName()}`);
} else {
logger.warn('LLM analysis requested but no provider available');
}
}
// Initialize robots.txt if needed
if (this.options.respectRobotsTxt) {
this.robots = await RobotsParser.fetch(normalizedUrl, this.options.userAgent);
this.crawlDelay = this.robots.getCrawlDelay(this.options.userAgent);
logger.info(`Crawl delay: ${this.crawlDelay}ms`);
}
// Add start URL to queue
this.queue.add({ url: normalizedUrl, depth: 0 });
// Process queue with concurrency limit
const workers: Promise<void>[] = [];
for (let i = 0; i < this.options.concurrency; i++) {
workers.push(this.worker());
}
await Promise.all(workers);
this.stats.endTime = Date.now();
this.logStats();
return Array.from(this.results.values());
}
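  // One of `options.concurrency` cooperative workers draining the queue.
  // Note: the maxPages check happens before each dequeue, so in-flight
  // requests can push the final count slightly past the limit.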
  private async worker(): Promise<void> {
    while (this.results.size < this.options.maxPages) {
      const item = this.queue.next();
      if (!item) {
        // The queue can be momentarily empty while other workers are still
        // fetching pages that may enqueue new links, so only exit once no
        // requests are in flight.
        if (this.activeRequests === 0) break;
        await new Promise(resolve => setTimeout(resolve, 50));
        continue;
      }
      try {
        await this.processItem(item);
      } catch (error) {
        logger.error(`Error processing ${item.url}:`, error);
        this.queue.markFailed(item.url);
        this.stats.failedPages++;
      }
    }
  }
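  // Handles one queue item: robots.txt check, politeness delay, fetch/parse,
  // stats bookkeeping, and enqueueing of newly discovered links.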
private async processItem(item: QueueItem): Promise<void> {
const { url, depth } = item;
// Check robots.txt
if (this.robots && !this.robots.canFetch(url, this.options.userAgent)) {
logger.debug(`Blocked by robots.txt: ${url}`);
this.queue.markFailed(url);
return;
}
    // Apply crawl delay. Re-check after waking: with concurrent workers,
    // another request may have gone out while this one was sleeping.
    while (this.crawlDelay > 0) {
      const timeSinceLastRequest = Date.now() - this.lastRequestTime;
      if (timeSinceLastRequest >= this.crawlDelay) break;
      await new Promise(resolve => setTimeout(resolve, this.crawlDelay - timeSinceLastRequest));
    }
    this.lastRequestTime = Date.now();
this.activeRequests++;
try {
const result = await this.fetchAndParse(url, depth);
if (result) {
this.results.set(url, result);
this.queue.markVisited(url);
this.stats.successfulPages++;
// Add discovered links to queue
if (depth < this.options.maxDepth) {
this.addLinksToQueue(result.metadata.links, depth + 1, url);
}
} else {
this.queue.markFailed(url);
this.stats.failedPages++;
}
    } finally {
      this.activeRequests--;
      // Count the page here so totals stay consistent even if the
      // bookkeeping above throws.
      this.stats.totalPages++;
    }
}
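  // Maps the string option from CrawlOptions onto the AnalysisType enum,
  // falling back to a full analysis for unrecognized values.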
private getAnalysisTypeEnum(type: string): AnalysisType {
switch (type) {
case 'full': return AnalysisType.FULL;
case 'summary': return AnalysisType.SUMMARY_ONLY;
case 'links': return AnalysisType.LINKS_ONLY;
case 'classification': return AnalysisType.CLASSIFICATION_ONLY;
case 'code_examples': return AnalysisType.CODE_EXAMPLES_ONLY;
default: return AnalysisType.FULL;
}
}
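  // Fetches a URL with retries, parses the HTML, and optionally attaches an
  // LLM analysis. Returns null for HTTP errors and non-HTML responses.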
private async fetchAndParse(url: string, depth: number): Promise<CrawlResult | null> {
const startTime = Date.now();
try {
const response = await withRetry(
() => this.fetchPage(url),
{ maxAttempts: 3 }
);
if (!response.ok) {
logger.warn(`HTTP ${response.status} for ${url}`);
return null;
}
      const contentType = response.headers.get('content-type') || '';
      // Accept XHTML as well, since the Accept header advertises it
      if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
        logger.debug(`Skipping non-HTML content: ${url} (${contentType})`);
        return null;
      }
const html = await response.text();
const parser = new HtmlParser(url);
const parsed = parser.parse(html);
const responseTime = Date.now() - startTime;
this.updateAverageResponseTime(responseTime);
// Perform LLM analysis if enabled
let llmAnalysis: LLMAnalysisResult | undefined;
if (this.analyzer) {
try {
const analysisType = this.getAnalysisTypeEnum(this.options.llmAnalysisType || 'full');
const analysis = await this.analyzer.analyzeContent({
content: parsed.content,
url,
title: parsed.title,
links: parsed.links,
analysisType,
});
if (analysis) {
llmAnalysis = {
summary: analysis.summary,
keyPoints: analysis.keyPoints,
contentType: analysis.contentType,
relevantLinks: analysis.relevantLinks.map(link => ({
url: link.url,
title: link.title,
relevance: link.relevance,
reason: link.reason,
category: link.category,
})),
codeExamples: analysis.codeExamples.map(example => ({
language: example.language,
code: example.code,
description: example.description,
category: example.category,
})),
confidence: analysis.confidence,
analyzedAt: Date.now(),
provider: this.analyzer.getProviderInfo(),
};
logger.debug(`LLM analysis completed for ${url} (confidence: ${analysis.confidence})`);
}
} catch (error) {
logger.warn(`LLM analysis failed for ${url}:`, error);
}
}
logger.info(`Crawled ${url} (${parsed.metadata.wordCount} words, ${parsed.links.length} links)${llmAnalysis ? ' + LLM analysis' : ''}`);
return {
url,
title: parsed.title,
content: parsed.content,
metadata: parsed.metadata,
llmAnalysis,
timestamp: Date.now(),
depth,
};
} catch (error) {
logger.error(`Failed to fetch ${url}:`, error);
return null;
}
}
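  // Issues the HTTP request with browser-like headers. AbortSignal.timeout
  // aborts the request once `options.timeout` milliseconds have elapsed.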
private async fetchPage(url: string): Promise<Response> {
return fetch(url, {
headers: {
'User-Agent': this.options.userAgent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
},
signal: AbortSignal.timeout(this.options.timeout),
});
}
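  // Filters discovered links (same domain only, include/exclude patterns)
  // and enqueues the survivors one level deeper.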
private addLinksToQueue(links: string[], depth: number, parentUrl: string): void {
let added = 0;
    for (const link of links) {
      // Normalize up front so domain checks, pattern matching, and queue
      // deduplication all see the same canonical URL
      const normalizedLink = normalizeUrl(link);
      // Only crawl same domain
      if (!isSameDomain(normalizedLink, parentUrl)) {
        continue;
      }
      // Apply include/exclude patterns
      if (this.options.includePatterns.length > 0 &&
          !matchesPattern(normalizedLink, this.options.includePatterns)) {
        continue;
      }
      if (this.options.excludePatterns.length > 0 &&
          matchesPattern(normalizedLink, this.options.excludePatterns)) {
        continue;
      }
      if (this.queue.add({ url: normalizedLink, depth, parentUrl })) {
added++;
}
}
if (added > 0) {
logger.debug(`Added ${added} links to queue from ${parentUrl}`);
}
}
  private updateAverageResponseTime(responseTime: number): void {
    // Incremental mean. This runs before successfulPages is incremented for
    // the current page, so the sample count here is successfulPages + 1.
    // (The previous formula divided by successfulPages directly, which
    // silently discarded the first sample.)
    const count = this.stats.successfulPages;
    this.stats.averageResponseTime =
      (this.stats.averageResponseTime * count + responseTime) / (count + 1);
  }
private logStats(): void {
const duration = (this.stats.endTime! - this.stats.startTime) / 1000;
logger.info(`Crawl completed in ${duration.toFixed(2)}s`);
logger.info(`Pages: ${this.stats.successfulPages} successful, ${this.stats.failedPages} failed`);
logger.info(`Average response time: ${this.stats.averageResponseTime.toFixed(0)}ms`);
}
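  // Public accessors: results collected so far and a copy of the stats.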
getResults(): CrawlResult[] {
return Array.from(this.results.values());
}
getStats(): CrawlStats {
return { ...this.stats };
}
}