// extraction.ts
/**
 * Content extraction utilities for Puppeteer-based scraping and recursive exploration.
 */
import { Readability } from "@mozilla/readability";
import axios from "axios";
import { JSDOM } from "jsdom";
import type { Page } from "puppeteer";
import { CONFIG } from "../server/config.js";
import type { PageContentResult, PuppeteerContext } from "../types/index.js";
import { fetchSimpleContent } from "./fetch.js";
import { initializeBrowser } from "./puppeteer.js";
// Helper functions for content extraction
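/**
 * Detects top-level GitHub repository URLs (github.com/<owner>/<repo>) and rewrites them
 * to the equivalent gitingest.com URL so the repository can be extracted as plain text.
 * Any other URL is returned unchanged.
 */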
function detectAndRewriteGitHubUrl(
originalUrl: string,
ctx: PuppeteerContext,
): { extractionUrl: string; isGitHubRepo: boolean } {
try {
const parsedUrl = new URL(originalUrl);
if (parsedUrl.hostname === "github.com") {
const pathParts = parsedUrl.pathname.split("/").filter((part) => part.length > 0);
if (pathParts.length === 2) {
const gitingestUrl = `https://gitingest.com${parsedUrl.pathname}`;
ctx.log("info", `Detected GitHub repo URL. Rewriting to: ${gitingestUrl}`);
return { extractionUrl: gitingestUrl, isGitHubRepo: true };
}
}
} catch (urlParseError) {
ctx.log("warn", `Failed to parse URL for GitHub check: ${urlParseError}`);
}
return { extractionUrl: originalUrl, isGitHubRepo: false };
}
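/**
 * Issues a quick HEAD request to reject URLs whose Content-Type is neither HTML nor plain
 * text before a full Puppeteer navigation is attempted. Returns an error result for
 * unsupported types and null otherwise; a failed HEAD request is logged and extraction
 * proceeds. Skipped entirely for rewritten GitHub repo URLs.
 */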
async function performContentTypeCheck(
extractionUrl: string,
isGitHubRepo: boolean,
originalUrl: string,
ctx: PuppeteerContext,
): Promise<PageContentResult | null> {
if (isGitHubRepo) {
return null; // Skip content type check for GitHub repos
}
try {
ctx.log("info", `Performing HEAD request for ${extractionUrl}...`);
const headResponse = await axios.head(extractionUrl, {
timeout: 5000, // keep the pre-check fast; a slow HEAD simply falls through to Puppeteer
headers: { "User-Agent": CONFIG.USER_AGENT },
});
const contentType = headResponse.headers["content-type"];
ctx.log("info", `Content-Type: ${contentType}`);
if (contentType && !contentType.includes("html") && !contentType.includes("text/plain")) {
const errorMsg = `Unsupported content type: ${contentType}`;
ctx.log("error", errorMsg);
return { url: originalUrl, error: errorMsg };
}
} catch (headError) {
ctx.log(
"warn",
`HEAD request failed for ${extractionUrl}: ${headError instanceof Error ? headError.message : String(headError)}. Proceeding with Puppeteer.`,
);
}
return null;
}
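/**
 * Returns the context's active Puppeteer page, re-initializing the browser first when no
 * page exists or the existing page has been closed.
 */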
async function initializePageIfNeeded(ctx: PuppeteerContext): Promise<Page> {
let page = ctx.page;
if (!page || page.isClosed()) {
ctx.log("info", "No active page, initializing browser...");
ctx.setPage(null);
ctx.setBrowser(null);
ctx.setIsInitializing(false);
await initializeBrowser(ctx);
page = ctx.page;
if (!page) {
throw new Error("Failed to initialize Puppeteer page");
}
}
return page;
}
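/**
 * Navigates to the extraction URL (waiting for DOMContentLoaded) and reads the page title.
 * Returns an error result when the response status is outside the 2xx range.
 */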
async function navigateToUrl(
page: Page,
extractionUrl: string,
originalUrl: string,
ctx: PuppeteerContext,
): Promise<{ pageTitle: string; error?: PageContentResult }> {
ctx.log("info", `Navigating to ${extractionUrl} for extraction...`);
const response = await page.goto(extractionUrl, {
waitUntil: "domcontentloaded",
timeout: CONFIG.TIMEOUT_PROFILES.navigation,
});
const pageTitle = await page.title();
if (response && !response.ok()) {
const statusCode = response.status();
const errorMsg = `HTTP error ${statusCode} received when accessing URL: ${extractionUrl}`;
ctx.log("error", errorMsg);
return { pageTitle, error: { url: originalUrl, error: errorMsg } };
}
return { pageTitle };
}
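/**
 * For rewritten GitHub repo URLs, waits for gitingest's ".result-text" textarea to appear.
 * A timeout here is logged as a warning and extraction continues anyway.
 */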
async function waitForGitHubContent(
page: Page,
isGitHubRepo: boolean,
ctx: PuppeteerContext,
): Promise<void> {
if (!isGitHubRepo) return;
ctx.log("info", "Waiting for gitingest content selector (.result-text)...");
try {
await page.waitForSelector(".result-text", {
timeout: CONFIG.TIMEOUT_PROFILES.content,
});
ctx.log("info", "Gitingest content selector found.");
} catch (waitError) {
ctx.log("warn", `Timeout waiting for gitingest selector: ${waitError}. Proceeding anyway.`);
}
}
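/**
 * Reads the gitingest digest from the ".result-text" textarea. Returns null for
 * non-GitHub URLs or when the textarea is missing or empty, so the caller can fall
 * back to Readability-based extraction.
 */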
async function extractGitHubContent(
page: Page,
isGitHubRepo: boolean,
originalUrl: string,
pageTitle: string,
ctx: PuppeteerContext,
): Promise<PageContentResult | null> {
if (!isGitHubRepo) return null;
const gitingestContent = await page.evaluate(() => {
const resultTextArea = document.querySelector(".result-text") as HTMLTextAreaElement | null;
return resultTextArea ? resultTextArea.value : null;
});
if (gitingestContent && gitingestContent.trim().length > 0) {
ctx.log("info", `Gitingest specific extraction successful (${gitingestContent.length} chars)`);
return {
url: originalUrl,
title: pageTitle,
textContent: gitingestContent.trim(),
error: null,
};
}
ctx.log("warn", "Gitingest specific extraction failed. Falling back to Readability.");
return null;
}
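/**
 * Runs Mozilla Readability over the JSDOM document. Returns the extracted article text,
 * or null when Readability yields nothing more substantial than the title.
 */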
function extractGeneralContent(
dom: JSDOM,
originalUrl: string,
pageTitle: string,
ctx: PuppeteerContext,
): PageContentResult | null {
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (article?.textContent && article.textContent.trim().length > (article.title?.length || 0)) {
ctx.log("info", `Readability extracted content (${article.textContent.length} chars)`);
return {
url: originalUrl,
title: article.title || pageTitle,
textContent: article.textContent.trim(),
error: null,
};
}
return null;
}
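/**
 * Last-resort extraction inside the live page: tries a list of common content-container
 * selectors, then falls back to the body text with navigation, script, and other
 * boilerplate elements removed. Returns null when neither strategy yields enough text.
 */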
async function extractFallbackContent(
page: Page,
originalUrl: string,
pageTitle: string,
ctx: PuppeteerContext,
): Promise<PageContentResult | null> {
ctx.log("warn", "Readability failed. Attempting sophisticated fallback selectors...");
const fallbackResult = await page.evaluate(() => {
const selectors = [
"article",
"main",
'[role="main"]',
"#content",
".content",
"#main",
".main",
"#article-body",
".article-body",
".post-content",
".entry-content",
];
for (const selector of selectors) {
const element = document.querySelector(selector) as HTMLElement | null;
if (element?.innerText && element.innerText.trim().length > 100) {
console.error(`Fallback using selector: ${selector}`);
return { text: element.innerText.trim(), selector: selector };
}
}
// Advanced body text cleanup
const bodyClone = document.body.cloneNode(true) as HTMLElement;
const elementsToRemove = bodyClone.querySelectorAll(
'nav, header, footer, aside, script, style, noscript, button, form, [role="navigation"], [role="banner"], [role="contentinfo"], [aria-hidden="true"]',
);
for (const el of elementsToRemove) {
el.remove();
}
const bodyText = bodyClone.innerText.trim();
if (bodyText.length > 200) {
console.error("Fallback using filtered body text.");
return { text: bodyText, selector: "body (filtered)" };
}
return null;
});
if (fallbackResult) {
ctx.log(
"info",
`Fallback extracted content (${fallbackResult.text.length} chars) using selector: ${fallbackResult.selector}`,
);
return {
url: originalUrl,
title: pageTitle,
textContent: fallbackResult.text,
error: null,
};
}
return null;
}
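/**
 * Maps a low-level extraction failure (timeout, network/navigation error, or any other
 * exception) to a human-readable error result attached to the original URL.
 */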
function formatExtractionError(
error: unknown,
extractionUrl: string,
originalUrl: string,
): PageContentResult {
let errorMessage = `Failed to extract content from ${extractionUrl}.`;
let errorReason = "Unknown error";
if (error instanceof Error) {
if (error.message.includes("timeout")) {
errorReason = "Navigation or content loading timed out.";
} else if (error.message.includes("net::") || error.message.includes("Failed to load")) {
errorReason = "Could not resolve or load the URL.";
} else if (error.message.includes("extract meaningful content")) {
errorReason = "Readability and fallback selectors failed.";
} else {
errorReason = error.message;
}
}
errorMessage += ` Reason: ${errorReason}`;
return { url: originalUrl, error: errorMessage };
}
/**
 * Extracts content from a single page using Puppeteer and Readability.
 * Includes GitHub/Gitingest URL rewriting, a Content-Type pre-check, and selector-based
 * fallback extraction for pages where Readability finds nothing useful.
 */
export async function fetchSinglePageContent(
url: string,
ctx: PuppeteerContext,
): Promise<PageContentResult> {
const originalUrl = url;
// GitHub URL detection and rewriting
const { extractionUrl, isGitHubRepo } = detectAndRewriteGitHubUrl(originalUrl, ctx);
// Content-Type pre-check (skip for GitHub)
const contentTypeError = await performContentTypeCheck(
extractionUrl,
isGitHubRepo,
originalUrl,
ctx,
);
if (contentTypeError) {
return contentTypeError;
}
try {
// Initialize page if needed
const page = await initializePageIfNeeded(ctx);
// Navigate to URL
const navigationResult = await navigateToUrl(page, extractionUrl, originalUrl, ctx);
if (navigationResult.error) {
return navigationResult.error;
}
const { pageTitle } = navigationResult;
// Wait for GitHub content if needed
await waitForGitHubContent(page, isGitHubRepo, ctx);
// Get page HTML and create DOM
const html = await page.content();
// Suppress console output while constructing the JSDOM so CSS/HTML parse warnings
// do not flood the logs; restore the console in a finally block so a parse failure
// cannot leave it permanently silenced.
const originalConsoleError = console.error;
const originalConsoleWarn = console.warn;
console.error = () => {}; // Suppress JSDOM errors
console.warn = () => {}; // Suppress JSDOM warnings
let dom: JSDOM;
try {
dom = new JSDOM(html, {
url: extractionUrl,
resources: "usable",
runScripts: "outside-only",
});
} finally {
// Restore console methods
console.error = originalConsoleError;
console.warn = originalConsoleWarn;
}
// Try GitHub-specific extraction first
const gitHubResult = await extractGitHubContent(
page,
isGitHubRepo,
originalUrl,
pageTitle,
ctx,
);
if (gitHubResult) {
return gitHubResult;
}
// Try general Readability extraction
const generalResult = extractGeneralContent(dom, originalUrl, pageTitle, ctx);
if (generalResult) {
return generalResult;
}
// Try selector-based fallback extraction
const fallbackResult = await extractFallbackContent(page, originalUrl, pageTitle, ctx);
if (fallbackResult) {
return fallbackResult;
}
return { url: originalUrl, error: "No meaningful content extracted" };
} catch (error) {
ctx.log(
"error",
`Error extracting content from ${extractionUrl}: ${error instanceof Error ? error.message : String(error)}`,
);
return formatExtractionError(error, extractionUrl, originalUrl);
}
}
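/*
 * Example usage (illustrative sketch, not part of this module's API): `ctx` is assumed to
 * be a PuppeteerContext supplied by the surrounding server code; the wrapper below only
 * shows how a PageContentResult is typically consumed.
 *
 *   async function summarizeUrl(url: string, ctx: PuppeteerContext): Promise<string> {
 *     const result = await fetchSinglePageContent(url, ctx);
 *     if (result.error) {
 *       throw new Error(`Extraction failed for ${result.url}: ${result.error}`);
 *     }
 *     return `${result.title ?? url}\n\n${result.textContent ?? ""}`;
 *   }
 */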
/**
 * Extracts same-domain links from a Puppeteer page.
 * Skips anchor-only, javascript:, data:, vbscript:, mailto:, and tel: links, resolves
 * relative URLs against the base URL, and keeps only links on the same hostname.
 * @param page - Puppeteer Page instance
 * @param baseUrl - The base URL for resolving relative links
 * @returns Up to 10 same-domain links as { url, text }, sorted by link text length (longest first)
 */
export async function extractSameDomainLinks(
page: Page,
baseUrl: string,
): Promise<{ url: string; text: string }[]> {
try {
const baseHostname = new URL(baseUrl).hostname;
const links = await page.evaluate(() => {
return Array.from(document.querySelectorAll("a[href]"))
.map((link) => {
const href = link.getAttribute("href");
const text = (link as HTMLElement).innerText || link.textContent || "";
if (
!href ||
href.startsWith("#") ||
href.startsWith("javascript:") ||
href.startsWith("data:") ||
href.startsWith("vbscript:") ||
href.startsWith("mailto:") ||
href.startsWith("tel:")
) {
return null;
}
return { url: href, text: text.trim() };
})
.filter(Boolean);
});
const resolvedLinks: { url: string; text: string }[] = [];
for (const link of links) {
if (!link) continue;
try {
const absoluteUrl = new URL(link.url, baseUrl).href;
if (new URL(absoluteUrl).hostname === baseHostname) {
resolvedLinks.push({ url: absoluteUrl, text: link.text || absoluteUrl });
}
} catch {
// Ignore invalid URLs
}
}
// Prioritize links with longer text, limit count
resolvedLinks.sort((a, b) => b.text.length - a.text.length);
return resolvedLinks.slice(0, 10);
} catch {
// On error, return empty array
return [];
}
}
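/*
 * Example usage (illustrative sketch): assumes `page` has already been navigated by the
 * caller; the returned array is capped at 10 links, longest link text first.
 *
 *   const links = await extractSameDomainLinks(page, "https://example.com/docs/");
 *   // e.g. [{ url: "https://example.com/docs/getting-started", text: "Getting started guide" }]
 */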
/**
 * Recursively fetches content from a root URL, exploring same-domain links up to maxDepth.
 * Depth 1 uses fetchSinglePageContent (Puppeteer + Readability) and gathers links via
 * extractSameDomainLinks; deeper levels use the lighter fetchSimpleContent. At most three
 * links are followed per page, and crawling stops once globalTimeoutSignal fires.
 * @param startUrl - The root URL to start crawling
 * @param maxDepth - Maximum recursion depth
 * @param currentDepth - Current recursion depth (1 for the root URL)
 * @param visitedUrls - Set of already visited URLs
 * @param results - Array collecting a PageContentResult per visited URL
 * @param globalTimeoutSignal - Object whose .timedOut flag aborts further crawling
 * @param ctx - PuppeteerContext
 */
export async function recursiveFetch(
startUrl: string,
maxDepth: number,
currentDepth: number,
visitedUrls: Set<string>,
results: PageContentResult[],
globalTimeoutSignal: { timedOut: boolean },
ctx: PuppeteerContext,
): Promise<void> {
if (currentDepth > maxDepth || visitedUrls.has(startUrl) || globalTimeoutSignal.timedOut) {
return;
}
ctx.log("info", `[Depth ${currentDepth}] Fetching: ${startUrl}`);
visitedUrls.add(startUrl);
const pageResult: PageContentResult = {
url: startUrl,
title: null,
textContent: null,
error: null,
};
let linksToExplore: { url: string; text: string }[] = [];
try {
if (currentDepth === 1) {
// Use Puppeteer/Readability for the initial page
const result = await fetchSinglePageContent(startUrl, ctx);
pageResult.title = result.title;
pageResult.textContent = result.textContent;
pageResult.error = result.error || null;
if (ctx.page && !ctx.page.isClosed()) {
linksToExplore = await extractSameDomainLinks(ctx.page, startUrl);
}
} else {
// Use the simpler fetch for deeper levels
const result = await fetchSimpleContent(startUrl, ctx);
pageResult.title = result.title;
pageResult.textContent = result.textContent;
pageResult.error = result.error || null;
}
if (pageResult.textContent === null && pageResult.error === null) {
pageResult.error = "Failed to extract content";
}
} catch (error) {
ctx.log(
"error",
`[Depth ${currentDepth}] Error fetching ${startUrl}: ${error instanceof Error ? error.message : String(error)}`,
);
pageResult.error = error instanceof Error ? error.message : String(error);
}
results.push(pageResult);
// Explore links only if depth allows and initial fetch was successful
if (currentDepth < maxDepth && !pageResult.error && linksToExplore.length > 0) {
const linksToFollow = linksToExplore.slice(0, 3); // Limit to 3 links per page
const promises = linksToFollow.map((link) => {
if (globalTimeoutSignal.timedOut) return Promise.resolve();
return recursiveFetch(
link.url,
maxDepth,
currentDepth + 1,
visitedUrls,
results,
globalTimeoutSignal,
ctx,
);
});
await Promise.all(promises);
}
}
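/*
 * Example usage (illustrative sketch): the depth of 2 and the 60-second budget are
 * arbitrary values for the example, and `ctx` is assumed to come from the server's
 * Puppeteer setup.
 *
 *   async function crawl(rootUrl: string, ctx: PuppeteerContext): Promise<PageContentResult[]> {
 *     const results: PageContentResult[] = [];
 *     const globalTimeoutSignal = { timedOut: false };
 *     const timer = setTimeout(() => {
 *       globalTimeoutSignal.timedOut = true;
 *     }, 60_000);
 *     try {
 *       await recursiveFetch(rootUrl, 2, 1, new Set<string>(), results, globalTimeoutSignal, ctx);
 *     } finally {
 *       clearTimeout(timer);
 *     }
 *     return results;
 *   }
 */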