// RagDocs MCP Server - src/tools
import axios, { AxiosError } from 'axios';
import * as cheerio from 'cheerio';
export class ContentFetchError extends Error {
constructor(message: string, public readonly url: string) {
super(message);
this.name = 'ContentFetchError';
}
}
export interface FetchedContent {
url: string;
title: string;
content: string;
timestamp: string;
metadata: {
domain: string;
contentType: string;
wordCount: number;
hasCode: boolean;
};
}
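// Illustrative only: a sketch of the value fetchContent might produce for a typical
// documentation page. The URL, text, and counts below are made-up placeholders.
//
// const example: FetchedContent = {
//   url: 'https://example.com/docs/getting-started',
//   title: 'Getting Started',
//   content: 'Getting Started\nInstall the package...\n```\nnpm install ...\n```',
//   timestamp: '2024-01-01T00:00:00.000Z',
//   metadata: {
//     domain: 'example.com',
//     contentType: 'text/html',
//     wordCount: 1200,
//     hasCode: true,
//   },
// };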
export class ContentFetcher {
private static readonly TIMEOUT = 30000; // 30 seconds
private static readonly MAX_RETRIES = 3;
private static readonly RETRY_DELAY = 1000; // 1 second
/**
* Fetches and processes content from a URL
* @param url URL to fetch content from
* @returns Processed content with metadata
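* @example
* // Illustrative usage; the URL is a placeholder.
* const doc = await ContentFetcher.fetchContent('https://example.com/docs/intro');
* console.log(doc.title, doc.metadata.wordCount, doc.metadata.hasCode);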
*/
static async fetchContent(url: string): Promise<FetchedContent> {
let retries = 0;
let lastError: Error | null = null;
while (retries < this.MAX_RETRIES) {
try {
const response = await axios.get(url, {
timeout: this.TIMEOUT,
maxRedirects: 5,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; RagDocsBot/1.0)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
'Accept-Language': 'en-US,en;q=0.5',
},
});
const contentType = response.headers['content-type'] || '';
if (!contentType.includes('html')) {
throw new ContentFetchError(`Unsupported content type: ${contentType}`, url);
}
return this.processHtmlContent(url, response.data);
} catch (error) {
lastError = error as Error;
// Errors we raised ourselves (e.g. unsupported content type) and hard 404s are
// permanent; rethrow them instead of burning the remaining retries.
if (error instanceof ContentFetchError) {
throw error;
}
if (error instanceof AxiosError && error.response?.status === 404) {
throw new ContentFetchError('Page not found', url);
}
retries++;
if (retries < this.MAX_RETRIES) {
await new Promise(resolve => setTimeout(resolve, this.RETRY_DELAY));
}
}
}
throw new ContentFetchError(
`Failed to fetch content after ${this.MAX_RETRIES} attempts: ${lastError?.message}`,
url
);
}
/**
* Processes HTML content to extract relevant text and metadata
* @param url Original URL
* @param html Raw HTML content
* @returns Processed content with metadata
*/
private static processHtmlContent(url: string, html: string): FetchedContent {
const $ = cheerio.load(html);
// Remove unwanted elements
this.removeUnwantedElements($);
// Extract title
const title = $('title').text().trim() ||
$('h1').first().text().trim() ||
'Untitled Document';
// Extract main content
const mainContent = this.extractMainContent($);
// Check for code blocks
const hasCode = $('pre, code').length > 0 ||
mainContent.includes('```') ||
/\`[^\`]+\`/.test(mainContent);
// Count words
const wordCount = mainContent.split(/\s+/).filter(Boolean).length;
return {
url,
title,
content: mainContent,
timestamp: new Date().toISOString(),
metadata: {
domain: new URL(url).hostname,
contentType: 'text/html',
wordCount,
hasCode,
},
};
}
/**
* Removes unwanted elements from the HTML
* @param $ Cheerio instance
*/
private static removeUnwantedElements($: cheerio.CheerioAPI): void {
// Remove common non-content elements
const selectorsToRemove = [
'script',
'style',
'nav',
'header',
'footer',
'iframe',
'.advertisement',
'.ads',
'#comments',
'.comments',
'.social-share',
'.related-posts',
'aside',
];
$(selectorsToRemove.join(', ')).remove();
}
/**
* Extracts main content from the HTML
* @param $ Cheerio instance
* @returns Extracted and cleaned content
*/
private static extractMainContent($: cheerio.CheerioAPI): string {
// Try to find main content container
const mainSelectors = [
'article',
'main',
'.main-content',
'#main-content',
'.post-content',
'.article-content',
'.entry-content',
];
let $content = $();
for (const selector of mainSelectors) {
$content = $(selector);
if ($content.length > 0) break;
}
// Fallback to body if no main content container found
if ($content.length === 0) {
$content = $('body');
}
// Extract text content
const text = $content
.find('h1, h2, h3, h4, h5, h6, p, li, pre, code')
.map((_, el) => {
const $el = $(el);
// Skip <code> nested inside <pre>: the surrounding <pre> is matched too,
// so emitting the inner <code> again would duplicate the block.
if ($el.is('code') && $el.parents('pre').length > 0) {
return '';
}
// Preserve code blocks as fenced text
if ($el.is('pre, code')) {
return '\n```\n' + $el.text() + '\n```\n';
}
return $el.text();
})
.get()
.join('\n')
.trim();
// Clean up the text
return this.cleanText(text);
}
/**
* Cleans extracted text content
* @param text Raw text content
* @returns Cleaned text
*/
private static cleanText(text: string): string {
return text
.replace(/[\r\n]+/g, '\n') // Normalize line endings
.replace(/\n\s+\n/g, '\n\n') // Remove excess whitespace between paragraphs
.replace(/[ \t]+/g, ' ') // Collapse spaces/tabs within lines without destroying newlines
.split('\n') // Split into lines
.map(line => line.trim()) // Trim each line
.filter(Boolean) // Remove empty lines
.join('\n') // Rejoin with newlines
.trim(); // Final trim
}
}
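// Usage sketch (illustrative, not part of this module's exports): how a caller such
// as an MCP tool handler might wrap the fetcher. The function name is hypothetical.
//
// async function addDocumentFromUrl(url: string) {
//   try {
//     const doc = await ContentFetcher.fetchContent(url);
//     return { title: doc.title, words: doc.metadata.wordCount };
//   } catch (error) {
//     if (error instanceof ContentFetchError) {
//       // error.url records which fetch failed (404, unsupported type, retries exhausted)
//       console.error(`Could not fetch ${error.url}: ${error.message}`);
//     }
//     throw error;
//   }
// }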