Skip to main content
Glama

Docs Fetch MCP Server

by wolfyy970
content-extractor.ts3.77 kB
import { Page } from 'puppeteer'; import { CONTENT_SELECTORS } from '../types/index.js'; import { ExtractedContent } from '../types/index.js'; /** * Extracts and processes content from web pages */ export class ContentExtractor { /** * Wait for content to be available on the page */ async waitForContent(page: Page): Promise<void> { await page.waitForFunction(() => { return document.body && document.body.textContent && document.body.textContent.length > 0; }); } /** * Check if page has meaningful content */ async hasContent(page: Page): Promise<boolean> { return await page.evaluate(() => { return Boolean(document.body && document.body.textContent && document.body.textContent.trim().length > 100); }); } /** * Extract the main content and links from a page */ async extractContent(page: Page): Promise<ExtractedContent> { const content = await page.evaluate((selectors) => { // Helper function to clean text inside the evaluate context function cleanText(text: string): string { return text .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') .trim(); } // Try all selectors until we find content let mainElement = null; for (const selector of selectors) { const elements = Array.from(document.querySelectorAll(selector)); if (elements.length > 0) { // If multiple elements match, find the one with the most text mainElement = elements.reduce((longest, current) => { return (current.textContent?.length || 0) > (longest.textContent?.length || 0) ? current : longest; }, elements[0]); if (mainElement.textContent && mainElement.textContent.trim().length > 200) { break; } } } // Fallback to body if no content found mainElement = mainElement || document.body; // Extract the main content text const mainContent = mainElement ? cleanText(mainElement.textContent || '') : ''; // Extract links with relevance score const relatedLinks = Array.from(document.querySelectorAll('a[href]')) .map(link => { const href = link.getAttribute('href'); const text = link.textContent || ''; if (!href || href.startsWith('#') || href.startsWith('javascript:') || href.startsWith('mailto:') || href.startsWith('tel:')) { return null; } // Calculate relevance score based on various factors let relevance = 0; // Text length (longer text = more informative) relevance += Math.min(text.length / 10, 5); // Link appears in the main content area if (mainElement?.contains(link)) { relevance += 5; } // Link text hints at being informative content const informativeWords = ['guide', 'docs', 'tutorial', 'reference', 'example', 'api', 'learn', 'how to']; for (const word of informativeWords) { if (text.toLowerCase().includes(word)) { relevance += 2; } } return { url: href, text: text.trim(), relevance }; }) .filter(Boolean); // Remove null entries return { mainContent, relatedLinks }; }, CONTENT_SELECTORS); return content as ExtractedContent; } /** * Clean text by removing excessive whitespace * @private */ private cleanText(text: string): string { return text .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') .trim(); } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wolfyy970/docs-fetch-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server