Web Content MCP Server

by amotivv
Verified
TypeScript
MIT License
Reddit Discord
Overview Inspect Schema Related Servers Reviews Score
Need Help? View Source Code Report Issue
experiments
content-extraction
experiments/content-extraction/index.ts
import axios from 'axios';

/**
 * Experiment: Content Extraction and Processing for LLM Context
 * 
 * This experiment demonstrates how to extract and process web content
 * specifically for use as context in LLMs using Cloudflare Browser Rendering.
 */

/**
 * Stand-in for a real page fetch: logs the requested URL and resolves with a
 * canned HTML document modelled on the Cloudflare Browser Rendering docs.
 * @param url The URL to extract content from (only logged; the returned HTML is fixed)
 */
async function extractCleanContent(url: string) {
  console.log(`Simulating content extraction from: ${url}`);

  // A production version backed by Cloudflare Browser Rendering would:
  //   1. call the /content endpoint to obtain the rendered HTML,
  //   2. call the /scrape endpoint to pull out specific elements,
  //   3. post-process the result so it is suitable for LLM context.
  //
  // This experiment skips all of that and hands back a fixed fixture instead.
  return `
    <!DOCTYPE html>
    <html>
      <head>
        <title>Browser Rendering API | Cloudflare Docs</title>
        <meta name="description" content="Learn how to use Cloudflare's Browser Rendering API">
      </head>
      <body>
        <header>
          <nav>
            <ul>
              <li><a href="/">Home</a></li>
              <li><a href="/products">Products</a></li>
              <li><a href="/developers">Developers</a></li>
            </ul>
          </nav>
        </header>
        <main>
          <article>
            <h1>Browser Rendering API</h1>
            <p class="description">Cloudflare Browser Rendering is a serverless headless browser service that allows execution of browser actions within Cloudflare Workers.</p>
            
            <section id="overview">
              <h2>Overview</h2>
              <p>Browser Rendering allows you to run a headless browser directly within Cloudflare's network, enabling you to:</p>
              <ul>
                <li>Render JavaScript-heavy websites</li>
                <li>Take screenshots of web pages</li>
                <li>Generate PDFs</li>
                <li>Extract structured data</li>
                <li>Automate browser interactions</li>
              </ul>
            </section>
            
            <section id="rest-api">
              <h2>REST API</h2>
              <p>The REST API provides simple endpoints for common browser tasks:</p>
              <div class="endpoint">
                <h3>/content</h3>
                <p>Fetches rendered HTML content from a URL after JavaScript execution.</p>
                <pre><code>
POST /content
{
  "url": "https://example.com",
  "rejectResourceTypes": ["image", "font"]
}
                </code></pre>
              </div>
              
              <div class="endpoint">
                <h3>/screenshot</h3>
                <p>Captures a screenshot of a web page.</p>
              </div>
            </section>
            
            <section id="workers-binding">
              <h2>Workers Binding API</h2>
              <p>For more advanced use cases, you can use the Workers Binding API with Puppeteer.</p>
              <pre><code>
import puppeteer from '@cloudflare/puppeteer';

export default {
  async fetch(request, env) {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com');
    const content = await page.content();
    await browser.disconnect();
    return new Response(content);
  }
};
              </code></pre>
            </section>
          </article>
        </main>
        <footer>
          <p>&copy; 2025 Cloudflare, Inc.</p>
          <nav>
            <ul>
              <li><a href="/terms">Terms</a></li>
              <li><a href="/privacy">Privacy</a></li>
            </ul>
          </nav>
        </footer>
      </body>
    </html>
  `;
}

/**
 * Reduces an HTML document to markdown-flavoured plain text for LLM context.
 *
 * The body of the first `<article>` element is used when one is present;
 * otherwise the whole input is processed. Headings (h1-h3), list items,
 * paragraphs and `<pre><code>` blocks are rewritten to markdown, every
 * remaining tag is dropped, runs of three or more newlines are collapsed to
 * a single blank line, and the result is trimmed.
 *
 * This is a regex-based approximation, not a real HTML parser: nested or
 * malformed markup is not handled and HTML entities are left untouched.
 *
 * @param html The HTML content
 * @returns Cleaned content suitable for LLM context
 */
function cleanContentForLLM(html: string): string {
  // `?? html` (rather than `||`) keeps an empty <article> as an empty string.
  const articleMatch = html.match(/<article(?:\s[^>]*)?>([\s\S]*?)<\/article>/i);
  const content = articleMatch?.[1] ?? html;

  // Every opening-tag pattern uses `(?:\s[^>]*)?>` so the tag name must be
  // followed by `>` or whitespace. A bare `<p[^>]*>` also matches `<pre>`,
  // which let the paragraph rule swallow code blocks before they were fenced.
  return content
    // Headings become markdown headings
    .replace(/<h1(?:\s[^>]*)?>([\s\S]*?)<\/h1>/gi, '# $1\n\n')
    .replace(/<h2(?:\s[^>]*)?>([\s\S]*?)<\/h2>/gi, '## $1\n\n')
    .replace(/<h3(?:\s[^>]*)?>([\s\S]*?)<\/h3>/gi, '### $1\n\n')
    // List items become markdown bullets
    .replace(/<li(?:\s[^>]*)?>([\s\S]*?)<\/li>/gi, '- $1\n')
    // Paragraphs become blank-line-separated text
    .replace(/<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/gi, '$1\n\n')
    // <pre><code> blocks become fenced code blocks
    .replace(
      /<pre(?:\s[^>]*)?><code(?:\s[^>]*)?>([\s\S]*?)<\/code><\/pre>/gi,
      '```\n$1\n```\n\n',
    )
    // Drop every remaining tag
    .replace(/<[^>]*>/g, '')
    // Collapse runs of blank lines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Extracts metadata from HTML.
 *
 * The title comes from the first `<title>` element and the description from
 * the first `<meta name="description" content="...">` tag. The meta tag may
 * use single or double quotes, carry further attributes after `content`, and
 * be self-closing; `name` must still come before `content`. Both values are
 * trimmed, and a placeholder is used when the element is missing.
 *
 * `source` is a fixed label and `extractedAt` is the current time in ISO 8601
 * form, so two calls on the same input differ in that field.
 *
 * @param html The HTML content
 * @param url URL recorded in the result; defaults to the simulated Cloudflare docs URL
 * @returns Metadata object with `title`, `description`, `url`, `source` and `extractedAt`
 */
function extractMetadata(
  html: string,
  url = 'https://developers.cloudflare.com/browser-rendering/',
): Record<string, string> {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);

  // Group 1 holds a double-quoted value, group 2 a single-quoted one, so an
  // apostrophe inside a double-quoted description does not end the value.
  const descriptionMatch = html.match(
    /<meta\s+name=(?:"description"|'description')\s+content=(?:"([^"]*)"|'([^']*)')[^>]*>/i,
  );
  const description = descriptionMatch?.[1] ?? descriptionMatch?.[2];

  return {
    // `??` keeps a present-but-empty title or description as an empty string.
    title: titleMatch?.[1]?.trim() ?? 'Unknown Title',
    description: description?.trim() ?? 'No description available',
    url,
    source: 'Cloudflare Documentation',
    extractedAt: new Date().toISOString(),
  };
}

/**
 * Prepends a plain-text metadata header to the cleaned content.
 *
 * The header starts with a blank line, lists title, source, URL, extraction
 * time and description one per line, and ends with a `---` rule followed by a
 * blank line; the content follows immediately after.
 *
 * @param content The cleaned content
 * @param metadata The metadata
 * @returns Formatted content for LLM context
 */
function formatForLLMContext(content: string, metadata: Record<string, string>): string {
  const { title, source, url, extractedAt, description } = metadata;

  // One entry per output line; the empty entries produce the leading newline
  // and the blank line after the separator.
  const headerLines = [
    '',
    `Title: ${title}`,
    `Source: ${source}`,
    `URL: ${url}`,
    `Extracted: ${extractedAt}`,
    `Description: ${description}`,
    '---',
    '',
    '',
  ];

  return headerLines.join('\n') + content;
}

/**
 * Placeholder for an LLM summarization call.
 *
 * Logs a progress message and returns a fixed, hand-written summary; the
 * supplied content is not inspected.
 *
 * @param content The content to summarize
 * @returns Summarized content
 */
function simulateLLMSummarization(content: string): string {
  // Canned response standing in for what a real LLM API would return.
  const mockSummary = `
# Browser Rendering API Summary

Cloudflare Browser Rendering is a serverless headless browser service for Cloudflare Workers that enables:

1. Rendering JavaScript-heavy websites
2. Taking screenshots and generating PDFs
3. Extracting structured data
4. Automating browser interactions

It offers two main interfaces:

- **REST API**: Simple endpoints for common tasks like fetching content (/content) and taking screenshots (/screenshot)
- **Workers Binding API**: Advanced integration with Puppeteer for complex automation

The service runs within Cloudflare's network, providing low-latency access to browser capabilities without managing infrastructure.
  `;

  console.log('Simulating LLM summarization...');
  return mockSummary;
}

/**
 * Runs the experiment end to end: obtains the simulated page, cleans it,
 * extracts metadata, builds the LLM context document, produces the mock
 * summary, and logs each intermediate result. Any error is caught and
 * reported through console.error rather than propagated.
 */
async function runExperiment() {
  console.log('Starting Content Extraction and Processing experiment...');

  // Shortens long text for console output, always appending an ellipsis.
  const preview = (text: string, limit: number) => `${text.substring(0, limit)}...`;

  try {
    // Step 1: fetch (simulated) HTML for the Cloudflare docs page
    const sourceUrl = 'https://developers.cloudflare.com/browser-rendering/';
    const rawHtml = await extractCleanContent(sourceUrl);

    // Step 2: reduce the HTML to LLM-friendly text
    const llmText = cleanContentForLLM(rawHtml);
    console.log('\nCleaned content for LLM:');
    console.log(preview(llmText, 500));

    // Step 3: pull title/description and friends out of the page
    const pageMetadata = extractMetadata(rawHtml);
    console.log('\nExtracted metadata:');
    console.log(pageMetadata);

    // Step 4: combine metadata header and text into one context document
    const contextDocument = formatForLLMContext(llmText, pageMetadata);
    console.log('\nFormatted content for LLM context:');
    console.log(preview(contextDocument, 300));

    // Step 5: summarize (mocked)
    const summary = simulateLLMSummarization(contextDocument);
    console.log('\nSimulated LLM summarization:');
    console.log(summary);

    // Closing notes on what a production version would do differently
    const closingNotes = [
      '\nIn a real implementation, you would:',
      '1. Use Cloudflare Browser Rendering to fetch the actual content',
      '2. Use a proper HTML parser for content extraction',
      '3. Call a real LLM API for summarization',
      '4. Store the processed content in Cloudflare R2 or another storage solution',
    ];
    for (const note of closingNotes) {
      console.log(note);
    }
  } catch (error) {
    console.error('Experiment failed:', error);
  }
}

// Kick off the experiment when the script is executed
runExperiment();