Scientific Paper Harvester MCP Server

text-cleaner.ts•3.01 KiB

import type { ExtractionConfig } from "./base-extractor.js";

/**
 * Regex-based clean-up helpers for extracted paper text and raw HTML.
 *
 * The text passes that run in {@link TextCleaner.cleanText} are selected by
 * the `cleaningOptions` flags supplied to the constructor.
 */
export class TextCleaner {
  private readonly config: ExtractionConfig["cleaningOptions"];

  /**
   * @param config - Flags read by `cleanText`: `removeExtraWhitespace`,
   *   `normalizeLineBreaks` and `removeSpecialChars`.
   */
  constructor(config: ExtractionConfig["cleaningOptions"]) {
    this.config = config;
  }

  /**
   * Run the enabled cleaning passes over `text` and trim the result.
   *
   * Pass order is fixed: whitespace, then line breaks, then special
   * characters.
   *
   * @param text - Raw extracted text.
   * @returns The cleaned text with leading/trailing whitespace removed.
   */
  cleanText(text: string): string {
    let cleaned = text;

    if (this.config.removeExtraWhitespace) {
      cleaned = this.normalizeWhitespace(cleaned);
    }

    if (this.config.normalizeLineBreaks) {
      cleaned = this.normalizeLineBreaks(cleaned);
    }

    if (this.config.removeSpecialChars) {
      cleaned = this.removeSpecialCharacters(cleaned);
    }

    return cleaned.trim();
  }

  /**
   * Turn tabs into spaces, collapse runs of spaces, and strip leading and
   * trailing blanks from every line. Line breaks are left untouched.
   */
  private normalizeWhitespace(text: string): string {
    return text
      .replace(/\t/g, " ") // Replace tabs with spaces
      .replace(/ +/g, " ") // Multiple spaces to single space
      .replace(/^[ \t]+|[ \t]+$/gm, ""); // Remove leading/trailing whitespace per line
  }

  /**
   * Convert all line endings to `\n` and reduce any run of blank or
   * whitespace-only lines to a single empty line between paragraphs.
   */
  private normalizeLineBreaks(text: string): string {
    return text
      .replace(/\r\n/g, "\n") // Windows to Unix line endings
      .replace(/\r/g, "\n") // Mac to Unix line endings
      .replace(/\n{3,}/g, "\n\n") // Multiple line breaks to double
      .replace(/\n\s*\n/g, "\n\n"); // Clean up whitespace between paragraphs
  }

  /**
   * Drop every character that is not a word character, whitespace, or basic
   * punctuation, then tidy the spaces left behind.
   *
   * Only horizontal whitespace is collapsed, so line and paragraph breaks in
   * the input survive this pass. Note that `\w` is ASCII-only here (no `u`
   * flag), so non-ASCII letters and symbols are removed as well.
   */
  private removeSpecialCharacters(text: string): string {
    return text
      .replace(/[^\w\s\.,;:!?()[\]{}"'-]/g, "") // Keep basic punctuation
      .replace(/[^\S\r\n]+/g, " ") // Collapse runs of spaces/tabs, but keep line breaks
      .replace(/^ | $/gm, ""); // Drop the single space a removal may leave at a line edge
  }

  /**
   * Clean HTML content by removing navigation, sidebar, and other
   * non-content elements.
   *
   * This is pattern matching, not HTML parsing: each rule removes from an
   * opening tag to the first matching closing tag, so nested `<div>`s inside
   * a sidebar/nav `<div>` are only removed up to the first `</div>`.
   *
   * @param html - Raw HTML markup.
   * @returns The markup with the matched sections removed.
   */
  cleanHtmlContent(html: string): string {
    return html
      .replace(/<nav[^>]*>.*?<\/nav>/gis, "") // Navigation
      .replace(/<header[^>]*>.*?<\/header>/gis, "") // Headers
      .replace(/<footer[^>]*>.*?<\/footer>/gis, "") // Footers
      .replace(/<aside[^>]*>.*?<\/aside>/gis, "") // Sidebars
      .replace(/<div[^>]*class[^>]*sidebar[^>]*>.*?<\/div>/gis, "") // Sidebar divs
      .replace(/<div[^>]*class[^>]*nav[^>]*>.*?<\/div>/gis, "") // Navigation divs
      .replace(/<script[^>]*>.*?<\/script>/gis, "") // Scripts
      .replace(/<style[^>]*>.*?<\/style>/gis, "") // Styles
      .replace(/<!--.*?-->/gis, ""); // Comments (may span several lines)
  }

  /**
   * Extract text content from specific academic paper sections.
   *
   * Simplified version: it only applies `cleanHtmlContent`. Selector-based
   * extraction is expected to be done elsewhere with an HTML parser; the
   * selectors of interest are `article`, `[role="main"]`, `.paper-content`,
   * `.article-body`, `.content`, `main`, `#content` and `.ltx_document`
   * (LaTeX-generated arXiv pages).
   *
   * @param html - Raw HTML markup of a paper page.
   * @returns The markup after non-content sections have been removed.
   */
  extractAcademicContent(html: string): string {
    return this.cleanHtmlContent(html);
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/benedict2310/Scientific-Papers-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

text-cleaner.ts•3.01 KiB