RagDocs MCP Server

The `ContentFetcher` utility below downloads a page with retries, strips non-content elements, and returns the main text along with basic metadata (title, domain, word count, and whether code blocks are present):

````typescript
import axios, { AxiosError } from 'axios';
import * as cheerio from 'cheerio';

export class ContentFetchError extends Error {
  constructor(message: string, public readonly url: string) {
    super(message);
    this.name = 'ContentFetchError';
  }
}

export interface FetchedContent {
  url: string;
  title: string;
  content: string;
  timestamp: string;
  metadata: {
    domain: string;
    contentType: string;
    wordCount: number;
    hasCode: boolean;
  };
}

export class ContentFetcher {
  private static readonly TIMEOUT = 30000; // 30 seconds
  private static readonly MAX_RETRIES = 3;
  private static readonly RETRY_DELAY = 1000; // 1 second

  /**
   * Fetches and processes content from a URL
   * @param url URL to fetch content from
   * @returns Processed content with metadata
   */
  static async fetchContent(url: string): Promise<FetchedContent> {
    let retries = 0;
    let lastError: Error | null = null;

    while (retries < this.MAX_RETRIES) {
      try {
        const response = await axios.get(url, {
          timeout: this.TIMEOUT,
          maxRedirects: 5,
          headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; RagDocsBot/1.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
            'Accept-Language': 'en-US,en;q=0.5',
          },
        });

        const contentType = response.headers['content-type'] || '';
        if (!contentType.includes('html')) {
          throw new ContentFetchError('Unsupported content type: ' + contentType, url);
        }

        return this.processHtmlContent(url, response.data);
      } catch (error) {
        lastError = error as Error;

        // Non-retryable failures: an unsupported content type or a definitive 404
        // should be surfaced immediately rather than retried.
        if (error instanceof ContentFetchError) {
          throw error;
        }
        if (error instanceof AxiosError && error.response?.status === 404) {
          throw new ContentFetchError('Page not found', url);
        }

        retries++;
        if (retries < this.MAX_RETRIES) {
          await new Promise(resolve => setTimeout(resolve, this.RETRY_DELAY));
        }
      }
    }

    throw new ContentFetchError(
      `Failed to fetch content after ${this.MAX_RETRIES} attempts: ${lastError?.message}`,
      url
    );
  }

  /**
   * Processes HTML content to extract relevant text and metadata
   * @param url Original URL
   * @param html Raw HTML content
   * @returns Processed content with metadata
   */
  private static processHtmlContent(url: string, html: string): FetchedContent {
    const $ = cheerio.load(html);

    // Remove unwanted elements
    this.removeUnwantedElements($);

    // Extract title
    const title =
      $('title').text().trim() ||
      $('h1').first().text().trim() ||
      'Untitled Document';

    // Extract main content
    const mainContent = this.extractMainContent($);

    // Check for code blocks
    const hasCode =
      $('pre, code').length > 0 ||
      mainContent.includes('```') ||
      /`[^`]+`/.test(mainContent);

    // Count words
    const wordCount = mainContent.split(/\s+/).filter(Boolean).length;

    return {
      url,
      title,
      content: mainContent,
      timestamp: new Date().toISOString(),
      metadata: {
        domain: new URL(url).hostname,
        contentType: 'text/html',
        wordCount,
        hasCode,
      },
    };
  }

  /**
   * Removes unwanted elements from the HTML
   * @param $ Cheerio instance
   */
  private static removeUnwantedElements($: cheerio.CheerioAPI): void {
    // Remove common non-content elements
    const selectorsToRemove = [
      'script',
      'style',
      'nav',
      'header',
      'footer',
      'iframe',
      '.advertisement',
      '.ads',
      '#comments',
      '.comments',
      '.social-share',
      '.related-posts',
      'aside',
    ];

    $(selectorsToRemove.join(', ')).remove();
  }

  /**
   * Extracts main content from the HTML
   * @param $ Cheerio instance
   * @returns Extracted and cleaned content
   */
  private static extractMainContent($: cheerio.CheerioAPI): string {
    // Try to find a main content container
    const mainSelectors = [
      'article',
      'main',
      '.main-content',
      '#main-content',
      '.post-content',
      '.article-content',
      '.entry-content',
    ];

    let $content = $();
    for (const selector of mainSelectors) {
      $content = $(selector);
      if ($content.length > 0) break;
    }

    // Fall back to the body if no main content container was found
    if ($content.length === 0) {
      $content = $('body');
    }

    // Extract text content
    const text = $content
      .find('h1, h2, h3, h4, h5, h6, p, li, pre, code')
      .map((_, el) => {
        const $el = $(el);
        // Preserve code blocks
        if ($el.is('pre, code')) {
          return '\n```\n' + $el.text() + '\n```\n';
        }
        return $el.text();
      })
      .get()
      .join('\n')
      .trim();

    // Clean up the text
    return this.cleanText(text);
  }

  /**
   * Cleans extracted text content
   * @param text Raw text content
   * @returns Cleaned text
   */
  private static cleanText(text: string): string {
    return text
      .replace(/[\r\n]+/g, '\n')   // Normalize line endings
      .replace(/\n\s+\n/g, '\n\n') // Remove excess whitespace between paragraphs
      .replace(/[ \t]+/g, ' ')     // Normalize whitespace within lines (keep newlines)
      .split('\n')                 // Split into lines
      .map(line => line.trim())    // Trim each line
      .filter(Boolean)             // Remove empty lines
      .join('\n')                  // Rejoin with newlines
      .trim();                     // Final trim
  }
}
````
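
A minimal usage sketch follows; the import path and the standalone `main` wrapper are illustrative assumptions, not part of the server's actual wiring:

```typescript
// Illustrative usage only: the module path './content-fetcher.js' is an assumption.
import { ContentFetcher, ContentFetchError } from './content-fetcher.js';

async function main(): Promise<void> {
  try {
    const doc = await ContentFetcher.fetchContent('https://example.com/docs/intro');
    console.log(`${doc.title} (${doc.metadata.wordCount} words, has code: ${doc.metadata.hasCode})`);
  } catch (err) {
    if (err instanceof ContentFetchError) {
      // Non-retryable failures (404, unsupported content type) and exhausted retries land here.
      console.error(`Could not fetch ${err.url}: ${err.message}`);
    } else {
      throw err;
    }
  }
}

void main();
```

`fetchContent` retries transient network failures up to `MAX_RETRIES` times with a fixed `RETRY_DELAY`, but surfaces a `ContentFetchError` immediately for a 404 response or a non-HTML content type.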