import { WebCrawler } from './crawler.js';
import { FileCache } from './cache.js';
import { logger } from '@/utils/logger.js';
import { defaultConfig, type SpiderConfig } from '@/utils/config.js';
import { searchInText, highlightMatches } from '@/utils/text.js';
import { matchesPattern } from '@/utils/url.js';
import type { CrawlResult, CrawlOptions } from './types.js';
import type { SearchResult, PageListItem } from '@/mcp/types.js';
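/**
 * High-level service wiring the WebCrawler to a FileCache, exposing crawl,
 * lookup, search, listing, and cache-maintenance operations.
 *
 * @example
 * // A minimal usage sketch (the URL and query are placeholders):
 * const spider = new SpiderService('./cache');
 * await spider.crawlDocs('https://example.com/docs', 2);
 * const hits = await spider.searchDocs('authentication', 5);
 */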
export class SpiderService {
private cache: FileCache;
private config: SpiderConfig;
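/**
 * @param cacheDir Directory backing the FileCache (defaults to ./cache).
 * @param config Partial overrides merged over `defaultConfig`.
 */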
constructor(cacheDir: string = './cache', config: Partial<SpiderConfig> = {}) {
this.config = { ...defaultConfig, ...config };
this.cache = new FileCache(cacheDir, this.config.cacheTTL);
logger.info('SpiderService initialized');
}
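/**
 * Crawl a site starting from `url` and cache every fetched page.
 * Omitted options fall back to the service-level `SpiderConfig`.
 */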
async crawlDocs(
url: string,
maxDepth?: number,
includePatterns?: string[],
excludePatterns?: string[],
enableLLMAnalysis?: boolean,
llmAnalysisType?: 'full' | 'summary' | 'links' | 'classification' | 'code_examples'
): Promise<CrawlResult[]> {
const options: CrawlOptions = {
// Prefer ?? over || so explicit falsy values (e.g. maxDepth: 0) are not silently overridden
maxDepth: maxDepth ?? this.config.maxDepth,
maxPages: this.config.maxPages,
includePatterns: includePatterns ?? this.config.includePatterns,
excludePatterns: excludePatterns ?? this.config.excludePatterns,
respectRobotsTxt: this.config.respectRobotsTxt,
userAgent: this.config.userAgent,
timeout: this.config.timeout,
concurrency: this.config.concurrency,
enableLLMAnalysis: enableLLMAnalysis ?? false,
llmAnalysisType: llmAnalysisType ?? 'full',
};
logger.info(`Starting crawl of ${url} with options:`, options);
const crawler = new WebCrawler(options);
const results = await crawler.crawl(url);
// Cache all results
for (const result of results) {
await this.cache.set(result.url, result);
}
logger.info(`Crawl completed: ${results.length} pages crawled`);
return results;
}
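/** Return the cached result for `url`, or null on a cache miss. */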
async getPage(url: string): Promise<CrawlResult | null> {
logger.debug(`Getting page: ${url}`);
return await this.cache.get(url);
}
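/**
 * Linear scan over every cached page: a page matches when the query appears
 * in its title or content. Results are ranked by `calculateRelevance` and
 * truncated to `limit`.
 */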
async searchDocs(query: string, limit: number = 10): Promise<SearchResult[]> {
logger.debug(`Searching for: ${query} (limit: ${limit})`);
const results: SearchResult[] = [];
const stats = await this.cache.getStats();
if (stats.totalEntries === 0) {
logger.warn('No cached pages available for search');
return results;
}
// Get all cached pages and search through them
// Note: In a production system, you'd want a proper search index
const pages = await this.getAllCachedPages();
for (const page of pages) {
const titleMatch = searchInText(page.title, query);
const contentMatch = searchInText(page.content, query);
if (titleMatch || contentMatch) {
const snippet = this.createSnippet(page.content, query);
const relevance = this.calculateRelevance(page, query);
results.push({
url: page.url,
title: page.title,
snippet,
relevance,
});
}
}
// Sort by relevance and limit results
results.sort((a, b) => b.relevance - a.relevance);
return results.slice(0, limit);
}
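/**
 * List cached pages, optionally filtered by URL pattern or title text,
 * sorted by URL, title, or crawl timestamp in either order.
 */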
async listPages(
filter?: string,
sort: 'url' | 'title' | 'timestamp' = 'url',
order: 'asc' | 'desc' = 'asc'
): Promise<PageListItem[]> {
logger.debug(`Listing pages: filter=${filter}, sort=${sort}, order=${order}`);
const pages = await this.getAllCachedPages();
let filteredPages = pages;
// Apply filter if provided
if (filter) {
filteredPages = pages.filter(page =>
matchesPattern(page.url, [filter]) ||
searchInText(page.title, filter)
);
}
// Convert to PageListItem and sort
const items: PageListItem[] = filteredPages.map(page => ({
url: page.url,
title: page.title,
timestamp: page.timestamp,
wordCount: page.metadata.wordCount,
}));
items.sort((a, b) => {
let comparison = 0;
switch (sort) {
case 'url':
comparison = a.url.localeCompare(b.url);
break;
case 'title':
comparison = a.title.localeCompare(b.title);
break;
case 'timestamp':
comparison = a.timestamp - b.timestamp;
break;
}
return order === 'desc' ? -comparison : comparison;
});
return items;
}
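/**
 * Clear cached entries matching `urlPattern` (all entries when omitted);
 * resolves to the count the cache reports.
 */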
async clearCache(urlPattern?: string): Promise<number> {
logger.info(`Clearing cache: pattern=${urlPattern || 'all'}`);
return await this.cache.clear(urlPattern);
}
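/** Expose the underlying cache statistics (e.g. `totalEntries`). */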
async getCacheStats() {
return await this.cache.getStats();
}
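/**
 * Run the cache's cleanup pass (presumably evicting entries past `cacheTTL`);
 * resolves to the count the cache reports.
 */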
async cleanupCache(): Promise<number> {
return await this.cache.cleanup();
}
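/**
 * Load every cached entry, swallowing errors into an empty list so callers
 * degrade gracefully instead of throwing.
 */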
private async getAllCachedPages(): Promise<CrawlResult[]> {
// Get all cached entries from the cache
try {
return await this.cache.getAllEntries();
} catch (error) {
logger.error('Failed to get all cached pages:', error);
return [];
}
}
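/**
 * Build a snippet of about `maxLength` characters centered on the first
 * occurrence of `query` (or the start of the content when there is no
 * match), with matches highlighted.
 */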
private createSnippet(content: string, query: string, maxLength: number = 200): string {
const queryLower = query.toLowerCase();
const contentLower = content.toLowerCase();
const index = contentLower.indexOf(queryLower);
if (index === -1) {
// Query not found, return beginning of content
return content.slice(0, maxLength) + (content.length > maxLength ? '...' : '');
}
// Extract snippet around the match
const start = Math.max(0, Math.floor(index - maxLength / 2));
const end = Math.min(content.length, start + maxLength);
let snippet = content.slice(start, end);
// Add ellipsis if truncated
if (start > 0) snippet = '...' + snippet;
if (end < content.length) snippet = snippet + '...';
// Highlight matches
return highlightMatches(snippet, query);
}
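/**
 * Heuristic scoring: per-occurrence counts (title hits weighted 10x),
 * flat boosts for exact-phrase hits, then log-length normalization so
 * shorter, more focused pages rank higher.
 */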
private calculateRelevance(page: CrawlResult, query: string): number {
let score = 0;
const queryLower = query.toLowerCase();
const titleLower = page.title.toLowerCase();
const contentLower = page.content.toLowerCase();
// Escape regex metacharacters so queries like "c++" don't throw or miscount
const escapedQuery = queryLower.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
// Title matches are more relevant
const titleMatches = (titleLower.match(new RegExp(escapedQuery, 'g')) || []).length;
score += titleMatches * 10;
// Content matches
const contentMatches = (contentLower.match(new RegExp(escapedQuery, 'g')) || []).length;
score += contentMatches;
// Boost for exact phrase matches
if (titleLower.includes(queryLower)) score += 20;
if (contentLower.includes(queryLower)) score += 5;
// Normalize by content length to favor more focused content
// (+2 keeps the divisor positive even when the content is empty)
score = score / Math.log(page.content.length + 2);
return score;
}
}
export * from './types.js';
export { WebCrawler } from './crawler.js';
export { FileCache } from './cache.js';