import { WebCrawler } from './crawler.js';
import { FileCache } from './cache.js';
import { logger } from '@/utils/logger.js';
import { defaultConfig, type SpiderConfig } from '@/utils/config.js';
import { searchInText, highlightMatches } from '@/utils/text.js';
import { matchesPattern } from '@/utils/url.js';
import type { CrawlResult, CrawlOptions } from './types.js';
import type { SearchResult, PageListItem } from '@/mcp/types.js';
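/**
 * High-level service wiring the WebCrawler to a FileCache, exposing crawl,
 * lookup, search, listing, and cache-maintenance operations.
 *
 * @example
 * // A minimal usage sketch (the URL and query are placeholders):
 * const spider = new SpiderService('./cache');
 * await spider.crawlDocs('https://example.com/docs', 2);
 * const hits = await spider.searchDocs('authentication', 5);
 */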
export class SpiderService {
private cache: FileCache;
private config: SpiderConfig;
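/**
 * @param cacheDir Directory backing the FileCache (defaults to ./cache).
 * @param config Partial overrides merged over `defaultConfig`.
 */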
constructor(cacheDir: string = './cache', config: Partial<SpiderConfig> = {}) {
this.config = { ...defaultConfig, ...config };
this.cache = new FileCache(cacheDir, this.config.cacheTTL);
logger.info('SpiderService initialized');
}
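/**
 * Crawl a site starting from `url` and cache every fetched page.
 * Omitted options fall back to the service-level `SpiderConfig`.
 */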
async crawlDocs(
url: string,
maxDepth?: number,
includePatterns?: string[],
excludePatterns?: string[],
enableLLMAnalysis?: boolean,
llmAnalysisType?: 'full' | 'summary' | 'links' | 'classification' | 'code_examples'
): Promise<CrawlResult[]> {
const options: CrawlOptions = {
// Prefer ?? over || so explicit falsy values (e.g. maxDepth: 0) are not silently overridden
maxDepth: maxDepth ?? this.config.maxDepth,
maxPages: this.config.maxPages,
includePatterns: includePatterns ?? this.config.includePatterns,
excludePatterns: excludePatterns ?? this.config.excludePatterns,
respectRobotsTxt: this.config.respectRobotsTxt,
userAgent: this.config.userAgent,
timeout: this.config.timeout,
concurrency: this.config.concurrency,
enableLLMAnalysis: enableLLMAnalysis ?? false,
llmAnalysisType: llmAnalysisType ?? 'full',
};
logger.info(`Starting crawl of ${url} with options:`, options);
const crawler = new WebCrawler(options);
const results = await crawler.crawl(url);
// Cache all results
for (const result of results) {
await this.cache.set(result.url, result);
}
logger.info(`Crawl completed: ${results.length} pages crawled`);
return results;
}
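/** Return the cached result for `url`, or null on a cache miss. */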
async getPage(url: string): Promise<CrawlResult | null> {
logger.debug(`Getting page: ${url}`);
return await this.cache.get(url);
}
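/**
 * Linear scan over every cached page: a page matches when the query appears
 * in its title or content. Results are ranked by `calculateRelevance` and
 * truncated to `limit`.
 */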
async searchDocs(query: string, limit: number = 10): Promise<SearchResult[]> {
logger.debug(`Searching for: ${query} (limit: ${limit})`);
const results: SearchResult[] = [];
const stats = await this.cache.getStats();
if (stats.totalEntries === 0) {
logger.warn('No cached pages available for search');
return results;
}
// Get all cached pages and search through them
// Note: In a production system, you'd want a proper search index
const pages = await this.getAllCachedPages();
for (const page of pages) {
const titleMatch = searchInText(page.title, query);
const contentMatch = searchInText(page.content, query);
if (titleMatch || contentMatch) {
const snippet = this.createSnippet(page.content, query);
const relevance = this.calculateRelevance(page, query);
results.push({
url: page.url,
title: page.title,
snippet,
relevance,
});
}
}
// Sort by relevance and limit results
results.sort((a, b) => b.relevance - a.relevance);
return results.slice(0, limit);
}
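/**
 * List cached pages, optionally filtered by URL pattern or title text,
 * sorted by URL, title, or crawl timestamp in either order.
 */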
async listPages(
filter?: string,
sort: 'url' | 'title' | 'timestamp' = 'url',
order: 'asc' | 'desc' = 'asc'
): Promise<PageListItem[]> {
logger.debug(`Listing pages: filter=${filter}, sort=${sort}, order=${order}`);
const pages = await this.getAllCachedPages();
let filteredPages = pages;
// Apply filter if provided
if (filter) {
filteredPages = pages.filter(page =>
matchesPattern(page.url, [filter]) ||
searchInText(page.title, filter)
);
}
// Convert to PageListItem and sort
const items: PageListItem[] = filteredPages.map(page => ({
url: page.url,
title: page.title,
timestamp: page.timestamp,
wordCount: page.metadata.wordCount,
}));
items.sort((a, b) => {
let comparison = 0;
switch (sort) {
case 'url':
comparison = a.url.localeCompare(b.url);
break;
case 'title':
comparison = a.title.localeCompare(b.title);
break;
case 'timestamp':
comparison = a.timestamp - b.timestamp;
break;
}
return order === 'desc' ? -comparison : comparison;
});
return items;
}
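/**
 * Clear cached entries matching `urlPattern` (all entries when omitted);
 * resolves to the count the cache reports.
 */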
async clearCache(urlPattern?: string): Promise<number> {
logger.info(`Clearing cache: pattern=${urlPattern || 'all'}`);
return await this.cache.clear(urlPattern);
}
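/** Expose the underlying cache statistics (e.g. `totalEntries`). */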
async getCacheStats() {
return await this.cache.getStats();
}
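/**
 * Run the cache's cleanup pass (presumably evicting entries past `cacheTTL`);
 * resolves to the count the cache reports.
 */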
async cleanupCache(): Promise<number> {
return await this.cache.cleanup();
}
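/**
 * Load every cached entry, swallowing errors into an empty list so callers
 * degrade gracefully instead of throwing.
 */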
private async getAllCachedPages(): Promise<CrawlResult[]> {
// Get all cached entries from the cache
try {
return await this.cache.getAllEntries();
} catch (error) {
logger.error('Failed to get all cached pages:', error);
return [];
}
}
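/**
 * Build a snippet of about `maxLength` characters centered on the first
 * occurrence of `query` (or the start of the content when there is no
 * match), with matches highlighted.
 */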
private createSnippet(content: string, query: string, maxLength: number = 200): string {
const queryLower = query.toLowerCase();
const contentLower = content.toLowerCase();
const index = contentLower.indexOf(queryLower);
if (index === -1) {
// Query not found, return beginning of content
return content.slice(0, maxLength) + (content.length > maxLength ? '...' : '');
}
// Extract snippet around the match
const start = Math.max(0, Math.floor(index - maxLength / 2));
const end = Math.min(content.length, start + maxLength);
let snippet = content.slice(start, end);
// Add ellipsis if truncated
if (start > 0) snippet = '...' + snippet;
if (end < content.length) snippet = snippet + '...';
// Highlight matches
return highlightMatches(snippet, query);
}
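/**
 * Heuristic scoring: per-occurrence counts (title hits weighted 10x),
 * flat boosts for exact-phrase hits, then log-length normalization so
 * shorter, more focused pages rank higher.
 */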
private calculateRelevance(page: CrawlResult, query: string): number {
let score = 0;
const queryLower = query.toLowerCase();
const titleLower = page.title.toLowerCase();
const contentLower = page.content.toLowerCase();
// Escape regex metacharacters so queries like "c++" don't throw or miscount
const escapedQuery = queryLower.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
// Title matches are more relevant
const titleMatches = (titleLower.match(new RegExp(escapedQuery, 'g')) || []).length;
score += titleMatches * 10;
// Content matches
const contentMatches = (contentLower.match(new RegExp(escapedQuery, 'g')) || []).length;
score += contentMatches;
// Boost for exact phrase matches
if (titleLower.includes(queryLower)) score += 20;
if (contentLower.includes(queryLower)) score += 5;
// Normalize by content length to favor more focused content
// (+2 keeps the divisor positive even when the content is empty)
score = score / Math.log(page.content.length + 2);
return score;
}
}
export * from './types.js';
export { WebCrawler } from './crawler.js';
export { FileCache } from './cache.js';