Spider MCP Server

Overview Schema Related Servers Score Discussions

spider-mcp
src
extractors

readability.ts•3.91 KiB

import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import { BaseExtractor, ExtractorResult, ExtractorOptions } from './base.js';
import { logger } from '@/utils/logger.js';

/**
 * Replacement text for the character references decoded by `htmlToText`,
 * keyed by the reference name without the leading `&` and trailing `;`.
 * A non-breaking space is flattened to a regular space.
 */
const ENTITY_TEXT: Readonly<Record<string, string>> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  '#39': "'",
};

/**
 * Matches every reference listed in `ENTITY_TEXT`. Decoding is done in a
 * single pass so that the output of one replacement is never re-scanned
 * (an escaped ampersand followed by `lt;` must stay literal text, not
 * turn into `<`).
 */
const ENTITY_PATTERN = /&(nbsp|amp|lt|gt|quot|#39);/g;

/**
 * Scans `selectors` in order and returns the first non-blank value found.
 *
 * For each selector only the first matching element is inspected. Its value
 * is the first non-empty attribute from `attributes`, falling back to the
 * element's text content. A blank value moves on to the next selector.
 *
 * @param document - Document to query.
 * @param selectors - CSS selectors, highest priority first.
 * @param attributes - Attribute names to try before falling back to text.
 * @returns The trimmed value, or `undefined` when nothing usable is found.
 */
function firstNonEmptyValue(
  document: Document,
  selectors: readonly string[],
  attributes: readonly string[],
): string | undefined {
  for (const selector of selectors) {
    const element = document.querySelector(selector);
    if (!element) {
      continue;
    }

    let raw: string | null = null;
    for (const attribute of attributes) {
      raw = element.getAttribute(attribute);
      if (raw) {
        break;
      }
    }

    const value = (raw || element.textContent)?.trim();
    if (value) {
      return value;
    }
  }
  return undefined;
}

/**
 * Extractor that uses Mozilla Readability (running on a jsdom document) to
 * pull the main article out of an HTML page.
 */
export class ReadabilityExtractor extends BaseExtractor {
  constructor(options: ExtractorOptions = {}) {
    super(options);
  }

  /**
   * Parses `html` and returns the article title, text and basic metadata.
   *
   * The text is `article.textContent` by default. When
   * `options.preserveFormatting` is set and Readability produced HTML, the
   * HTML is converted with `htmlToText` instead. The text is then passed
   * through `sanitizeText` and, when `options.maxLength` is truthy,
   * `truncateContent` (both are not defined in this class; presumably
   * inherited from `BaseExtractor`).
   *
   * @param html - Raw HTML of the page.
   * @param url - Page URL; handed to jsdom as the document URL and used in
   *   log messages.
   * @returns The extracted article.
   * @throws Error when Readability cannot parse the page; any other failure
   *   is logged and rethrown unchanged.
   */
  async extract(html: string, url: string): Promise<ExtractorResult> {
    try {
      const dom = new JSDOM(html, { url });
      const document = dom.window.document;

      // Readability mutates the document it is given, so parse a deep copy
      // and keep the original intact for the metadata lookups below.
      const documentClone = document.cloneNode(true) as Document;
      const reader = new Readability(documentClone);
      const article = reader.parse();

      if (!article) {
        throw new Error('Readability failed to extract content');
      }

      let content = article.textContent || '';

      if (this.options.preserveFormatting && article.content) {
        // Convert HTML to plain text while preserving some structure
        content = this.htmlToText(article.content);
      }

      content = this.sanitizeText(content);

      if (this.options.maxLength) {
        content = this.truncateContent(content, this.options.maxLength);
      }

      const result: ExtractorResult = {
        title: article.title || 'Untitled',
        content,
        excerpt: article.excerpt || this.extractExcerpt(content),
        author: this.extractAuthor(document),
        publishedTime: this.extractPublishTime(document),
        language: this.extractLanguage(document),
      };

      logger.debug(`Readability extracted ${content.length} characters from ${url}`);
      return result;
    } catch (error) {
      logger.error(`Readability extraction failed for ${url}:`, error);
      throw error;
    }
  }

  /**
   * Converts article HTML to plain text with light Markdown-style markers:
   * headings become `# ` lines, list items `- ` lines, `<code>` is wrapped
   * in backticks and `<pre>` in fenced blocks. Remaining tags are dropped
   * and the character references in `ENTITY_TEXT` are decoded.
   *
   * Tag names are followed by a lookahead for whitespace, `/` or `>` so a
   * rule only fires for its own element: the `<p>` rule must not consume
   * `<pre>` (which would lose the opening code fence) and the `<li>` rule
   * must not consume `<link>`.
   *
   * @param html - HTML fragment to convert.
   * @returns Plain-text rendering of the fragment.
   */
  private htmlToText(html: string): string {
    return html
      .replace(/<h[1-6](?=[\s/>])[^>]*>/gi, '\n# ')
      .replace(/<\/h[1-6]>/gi, '\n')
      .replace(/<p(?=[\s/>])[^>]*>/gi, '\n')
      .replace(/<\/p>/gi, '\n')
      .replace(/<br(?=[\s/>])[^>]*>/gi, '\n')
      .replace(/<li(?=[\s/>])[^>]*>/gi, '\n- ')
      .replace(/<\/li>/gi, '')
      .replace(/<code(?=[\s/>])[^>]*>/gi, '`')
      .replace(/<\/code>/gi, '`')
      .replace(/<pre(?=[\s/>])[^>]*>/gi, '\n```\n')
      .replace(/<\/pre>/gi, '\n```\n')
      // Strip whatever markup is left before decoding, so decoded `<` and
      // `>` characters are never mistaken for tags.
      .replace(/<[^>]+>/g, '')
      .replace(ENTITY_PATTERN, (match, name: string) => ENTITY_TEXT[name] ?? match);
  }

  /**
   * Looks for an author name in common meta tags and byline elements.
   *
   * @param document - Parsed page document.
   * @returns The trimmed author string, or `undefined` if none is found.
   */
  private extractAuthor(document: Document): string | undefined {
    const selectors = [
      'meta[name="author"]',
      'meta[property="article:author"]',
      '[rel="author"]',
      '.author',
      '.byline',
      '[data-author]',
    ];
    return firstNonEmptyValue(document, selectors, ['content']);
  }

  /**
   * Looks for a publication timestamp in common meta tags and `<time>`
   * elements. The value is returned as found, without date parsing.
   *
   * @param document - Parsed page document.
   * @returns The trimmed timestamp string, or `undefined` if none is found.
   */
  private extractPublishTime(document: Document): string | undefined {
    const selectors = [
      'meta[property="article:published_time"]',
      'meta[name="date"]',
      'meta[name="publish_date"]',
      'time[datetime]',
      '[data-date]',
    ];
    return firstNonEmptyValue(document, selectors, ['content', 'datetime']);
  }

  /**
   * Reads the page language from the root element's `lang` attribute,
   * falling back to the `content-language` meta tag.
   *
   * @param document - Parsed page document.
   * @returns The language tag, or `undefined` when neither source is set.
   */
  private extractLanguage(document: Document): string | undefined {
    return (
      document.documentElement.lang ||
      document.querySelector('meta[http-equiv="content-language"]')?.getAttribute('content') ||
      undefined
    );
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/oeo/spider-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

readability.ts•3.91 KiB