Scientific Paper Harvester MCP Server

html-extractor.ts•8.14 KiB

import * as cheerio from "cheerio";
import axios from "axios";
import {
  BaseExtractor,
  TextExtractionResult,
  ExtractionConfig,
} from "./base-extractor.js";
import { TextCleaner } from "./text-cleaner.js";
import { PdfExtractor } from "./pdf-extractor.js";
import { logger } from "../core/logger.js";

/** Timeout applied to every HTML page download, in milliseconds. */
const HTML_FETCH_TIMEOUT_MS = 10000;

/** Elements stripped from every page before any text is read. */
const BOILERPLATE_SELECTOR =
  "nav, header, footer, aside, script, style, .sidebar, .navigation";

/** Container written by LaTeX-to-HTML conversions (common in ar5iv). */
const ARXIV_LATEX_SELECTOR = ".ltx_document";

/** Generic content containers tried when no LaTeX document is present. */
const ARXIV_CONTENT_SELECTOR = 'article, main, [role="main"], #content';

/** Candidate content containers for non-arXiv pages; the longest one wins. */
const OPENALEX_CONTENT_SELECTORS: readonly string[] = [
  "article",
  '[role="main"]',
  ".paper-content",
  ".article-body",
  ".content",
  "main",
  "#content",
  ".paper-text",
  ".fulltext",
];

/**
 * Turns any thrown value into a loggable message.
 *
 * Catch clauses receive `unknown`; reading `.message` through a cast yields
 * `undefined` for non-Error values and throws for `null`/`undefined`.
 */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Reports whether `url` is served from arxiv.org or one of its subdomains
 * (which includes ar5iv.labs.arxiv.org).
 *
 * The hostname is inspected so that "arxiv.org" appearing in a path or query
 * string of another site does not select the arXiv strategy. URLs that cannot
 * be parsed fall back to the plain substring test.
 */
function isArxivUrl(url: string): boolean {
  try {
    const host = new URL(url).hostname.toLowerCase();
    return host === "arxiv.org" || host.endsWith(".arxiv.org");
  } catch {
    return url.includes("arxiv.org");
  }
}

/**
 * Returns the combined text of the elements matching `selector`, skipping any
 * match that is nested inside another match.
 *
 * `.text()` on a multi-element selection concatenates the text of every
 * element, so a nested match (e.g. `<main><article>`) would otherwise
 * contribute its text twice.
 */
function outermostText($: cheerio.CheerioAPI, selector: string): string {
  return $(selector)
    .filter((_, element) => $(element).parents(selector).length === 0)
    .text();
}

/**
 * Extracts paper text from HTML pages, with an optional PDF fallback.
 *
 * arXiv-hosted URLs use the arXiv strategy (with an optional ar5iv retry);
 * every other URL uses the generic "OpenAlex" strategy. Failures are reported
 * through the returned result rather than by throwing.
 */
export class HtmlExtractor extends BaseExtractor {
  private readonly textCleaner: TextCleaner;
  private readonly pdfExtractor: PdfExtractor | null;

  /**
   * @param config Extraction settings. `cleaningOptions` is passed to the
   *   text cleaner; `enablePdfExtraction` decides whether a PDF extractor is
   *   created for fallback use.
   */
  constructor(config: ExtractionConfig) {
    super(config);
    this.textCleaner = new TextCleaner(config.cleaningOptions);
    this.pdfExtractor = config.enablePdfExtraction
      ? new PdfExtractor(config, {
          maxSizeMB: 50,
          timeoutMs: 120000,
          maxPages: 100,
          requireConfirmation: false, // No confirmation in automatic fallback
          interactive: false,
        })
      : null;
  }

  /**
   * Extracts text from `url`, falling back to `fallbackPdfUrl` when the HTML
   * attempt is unsuccessful and PDF extraction is enabled.
   *
   * @param url Page to extract from.
   * @param fallbackPdfUrl Optional PDF tried only after an HTML failure.
   * @returns The HTML result, a successful PDF result, or a failed result.
   */
  async extractText(
    url: string,
    fallbackPdfUrl?: string,
  ): Promise<TextExtractionResult> {
    try {
      logger.info("Starting HTML text extraction", { url });

      const result = isArxivUrl(url)
        ? await this.extractArxivText(url)
        : await this.extractOpenAlexText(url);

      if (result.extractionSuccess || !this.pdfExtractor || !fallbackPdfUrl) {
        return result;
      }

      const pdfResult = await this.attemptPdfFallback(
        this.pdfExtractor,
        url,
        fallbackPdfUrl,
      );
      // Keep the (failed) HTML result when the PDF attempt did not help.
      return pdfResult ?? result;
    } catch (error) {
      logger.error("HTML text extraction failed", {
        url,
        error: describeError(error),
      });
      return this.createFailedResult();
    }
  }

  /**
   * Runs the PDF extractor for a page whose HTML extraction failed.
   *
   * @returns The PDF result when it reports success, otherwise `null`.
   *   Exceptions from the PDF extractor are logged and reported as `null`.
   */
  private async attemptPdfFallback(
    pdfExtractor: PdfExtractor,
    htmlUrl: string,
    pdfUrl: string,
  ): Promise<TextExtractionResult | null> {
    logger.info("HTML extraction failed, attempting PDF fallback", {
      htmlUrl,
      pdfUrl,
    });

    try {
      const pdfResult = await pdfExtractor.extractText(pdfUrl);
      if (pdfResult.extractionSuccess) {
        logger.info("PDF fallback extraction successful", { htmlUrl, pdfUrl });
        return pdfResult;
      }
      logger.warn("PDF fallback extraction was unsuccessful", {
        htmlUrl,
        pdfUrl,
      });
    } catch (pdfError) {
      logger.warn("PDF fallback extraction failed", {
        htmlUrl,
        pdfUrl,
        error: describeError(pdfError),
      });
    }
    return null;
  }

  /**
   * Fetches an arXiv page and, if that download fails and
   * `enableArxivFallback` is set, retries against the matching ar5iv page.
   */
  private async extractArxivText(url: string): Promise<TextExtractionResult> {
    let htmlContent: string;
    let source: "arxiv-html" | "ar5iv" = "arxiv-html";

    try {
      // Try main arXiv HTML first
      htmlContent = await this.fetchHtml(url);
    } catch (error) {
      if (!this.config.enableArxivFallback) {
        logger.warn("arXiv HTML extraction failed and fallback disabled", {
          url,
        });
        return this.createFailedResult();
      }

      // Try ar5iv fallback
      try {
        const arxivId = this.extractArxivId(url);
        const fallbackUrl = `https://ar5iv.labs.arxiv.org/html/${arxivId}`;
        logger.info("Trying ar5iv fallback", { originalUrl: url, fallbackUrl });
        htmlContent = await this.fetchHtml(fallbackUrl);
        source = "ar5iv";
      } catch (fallbackError) {
        logger.error("Both arXiv and ar5iv extraction failed", {
          url,
          originalError: describeError(error),
          fallbackError: describeError(fallbackError),
        });
        return this.createFailedResult();
      }
    }

    return this.processArxivHtml(htmlContent, source);
  }

  /**
   * Fetches and processes a non-arXiv page. Returns a failed result when
   * `enableOpenAlexExtraction` is off or the download fails.
   */
  private async extractOpenAlexText(
    url: string,
  ): Promise<TextExtractionResult> {
    if (!this.config.enableOpenAlexExtraction) {
      logger.warn("OpenAlex text extraction disabled", { url });
      return this.createFailedResult();
    }

    try {
      const htmlContent = await this.fetchHtml(url);
      return this.processOpenAlexHtml(htmlContent);
    } catch (error) {
      logger.error("OpenAlex HTML extraction failed", {
        url,
        error: describeError(error),
      });
      return this.createFailedResult();
    }
  }

  /**
   * Downloads `url` and returns the response body as a string.
   *
   * @throws Error when the status is not 200 or the body is not a string
   *   (for example when the HTTP client has parsed it into an object).
   */
  private async fetchHtml(url: string): Promise<string> {
    const response = await axios.get<unknown>(url, {
      timeout: HTML_FETCH_TIMEOUT_MS,
      // Ask for the raw body so non-HTML payloads are not parsed into objects.
      responseType: "text",
      headers: {
        "User-Agent": "Scientific Paper Harvester MCP Server/1.0",
      },
    });

    if (response.status !== 200) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const body: unknown = response.data;
    if (typeof body !== "string") {
      throw new Error(
        `Expected an HTML string from ${url} but received ${typeof body}`,
      );
    }
    return body;
  }

  /**
   * Cleans and length-limits raw page text.
   *
   * @returns The final text with its bookkeeping values, or `null` when
   *   nothing but whitespace is left, so callers can report a failure and let
   *   the PDF fallback run.
   */
  private finalizeText(content: string) {
    const cleanedText = this.textCleaner.cleanText(content);
    const { text, truncated } = this.checkTextLength(cleanedText);
    if (!text.trim()) {
      return null;
    }
    return { text, truncated, cleanedLength: cleanedText.length };
  }

  /**
   * Pulls the paper text out of an arXiv or ar5iv HTML page.
   *
   * Containers are tried in priority order: the LaTeX document, then generic
   * article/main containers, then the whole body. The first one with
   * non-blank text is used.
   */
  private processArxivHtml(
    html: string,
    source: "arxiv-html" | "ar5iv",
  ): TextExtractionResult {
    try {
      const $ = cheerio.load(html);

      // Remove unwanted elements
      $(BOILERPLATE_SELECTOR).remove();

      let content = "";
      for (const selector of [ARXIV_LATEX_SELECTOR, ARXIV_CONTENT_SELECTOR]) {
        const candidate = outermostText($, selector);
        if (candidate.trim()) {
          content = candidate;
          break;
        }
      }
      if (!content) {
        // Last resort: whatever remains of the body after boilerplate removal.
        content = $("body").text();
      }

      const finalized = this.finalizeText(content);
      if (!finalized) {
        logger.warn("arXiv HTML contained no extractable text", { source });
        return this.createFailedResult();
      }

      const { text, truncated, cleanedLength } = finalized;
      logger.info("arXiv text extraction successful", {
        source,
        originalLength: content.length,
        cleanedLength,
        finalLength: text.length,
        truncated,
      });

      return {
        text,
        truncated,
        extractionSuccess: true,
        source,
      };
    } catch (error) {
      logger.error("arXiv HTML processing failed", {
        source,
        error: describeError(error),
      });
      return this.createFailedResult();
    }
  }

  /**
   * Pulls the paper text out of a generic publisher page.
   *
   * Every candidate selector is evaluated and the one with the most
   * non-whitespace-trimmed text wins; the body is used when none match.
   */
  private processOpenAlexHtml(html: string): TextExtractionResult {
    try {
      const $ = cheerio.load(html);

      // Remove unwanted elements
      $(BOILERPLATE_SELECTOR).remove();

      let content = "";
      let bestLength = 0;
      for (const selector of OPENALEX_CONTENT_SELECTORS) {
        const candidate = outermostText($, selector);
        // Compare trimmed lengths on both sides so padding cannot skew the pick.
        const candidateLength = candidate.trim().length;
        if (candidateLength > bestLength) {
          content = candidate;
          bestLength = candidateLength;
        }
      }

      // Fallback if no specific content found
      if (bestLength === 0) {
        content = $("body").text();
      }

      const finalized = this.finalizeText(content);
      if (!finalized) {
        logger.warn("OpenAlex HTML contained no extractable text");
        return this.createFailedResult();
      }

      const { text, truncated, cleanedLength } = finalized;
      logger.info("OpenAlex text extraction successful", {
        originalLength: content.length,
        cleanedLength,
        finalLength: text.length,
        truncated,
      });

      return {
        text,
        truncated,
        extractionSuccess: true,
        source: "openalex-html",
      };
    } catch (error) {
      logger.error("OpenAlex HTML processing failed", {
        error: describeError(error),
      });
      return this.createFailedResult();
    }
  }

  /**
   * Extracts a new-style arXiv identifier (e.g. `2301.01234`) from an
   * arxiv.org `html`/`abs`/`pdf` URL or an ar5iv `html` URL. Any version
   * suffix after the identifier is not captured.
   *
   * @throws Error when the URL does not contain such an identifier.
   */
  private extractArxivId(url: string): string {
    const matches = url.match(
      /(?:arxiv\.org\/(?:html|abs|pdf)\/|ar5iv\.labs\.arxiv\.org\/html\/)([0-9]{4}\.[0-9]{4,5})/,
    );
    const arxivId = matches ? matches[1] : undefined;

    if (!arxivId) {
      throw new Error(`Could not extract arXiv ID from URL: ${url}`);
    }

    return arxivId;
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/benedict2310/Scientific-Papers-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

html-extractor.ts•8.14 KiB