MCP Content Analyzer

my-mcp
src
utils

content-extractor.ts•4.38 kB

import * as cheerio from 'cheerio';
import type { ScrapedContent } from '../types/content.js';

/** Handle returned by `cheerio.load`; derived so it tracks the installed cheerio version. */
type LoadedDocument = ReturnType<typeof cheerio.load>;

/** Elements removed before text extraction (scripts, styles, page chrome, ads). */
const NOISE_SELECTOR = 'script, style, nav, header, footer, aside, .ads, .advertisement';

/** Candidate main-content containers, tried in order of preference. */
const CONTENT_SELECTORS: readonly string[] = [
  'main',
  'article',
  '.content',
  '.post-content',
  '.entry-content',
  '.article-content',
  '#content',
  '.main-content',
];

/** Marker appended when a summary has to be cut mid-sentence. */
const ELLIPSIS = '...';

/**
 * Stateless helpers for pulling metadata, readable text, titles and summaries
 * out of raw HTML, plus small URL utilities.
 */
export class ContentExtractor {
  /**
   * Extracts page metadata from HTML.
   *
   * `wordCount` is always set; `description`, `author`, `publishDate` and
   * `language` are only set when a non-blank value is found. Values are
   * whitespace-normalized.
   *
   * @param html - Raw HTML of the page.
   * @returns The metadata object for the page.
   */
  static extractMetaData(html: string): ScrapedContent['metadata'] {
    const $ = cheerio.load(html);
    const metaContent = (selector: string): string | undefined => $(selector).attr('content');

    const description = ContentExtractor.firstNonEmpty(
      metaContent('meta[name="description"]'),
      metaContent('meta[property="og:description"]'),
      metaContent('meta[name="twitter:description"]'),
    );
    const author = ContentExtractor.firstNonEmpty(
      metaContent('meta[name="author"]'),
      metaContent('meta[property="article:author"]'),
      metaContent('meta[name="twitter:creator"]'),
    );
    const publishDate = ContentExtractor.firstNonEmpty(
      metaContent('meta[property="article:published_time"]'),
      metaContent('meta[name="date"]'),
      $('time[datetime]').attr('datetime'),
    );
    const language = ContentExtractor.firstNonEmpty(
      $('html').attr('lang'),
      metaContent('meta[http-equiv="content-language"]'),
    );

    // Must run last: text extraction removes elements from the shared document
    // (e.g. a <time> inside <header>), and reusing `$` avoids parsing the HTML twice.
    const text = ContentExtractor.textFromDocument($);
    const wordCount = text.match(/\S+/g)?.length ?? 0;

    const metadata: ScrapedContent['metadata'] = { wordCount };
    if (description) metadata.description = description;
    if (author) metadata.author = author;
    if (publishDate) metadata.publishDate = publishDate;
    if (language) metadata.language = language;
    return metadata;
  }

  /**
   * Extracts the readable text of a page as a single whitespace-normalized line.
   *
   * @param html - Raw HTML of the page.
   * @returns Text of the first non-blank main-content container, or of
   *   `<body>` when no container yields text.
   */
  static extractTextContent(html: string): string {
    return ContentExtractor.textFromDocument(cheerio.load(html));
  }

  /**
   * Determines a title for the page.
   *
   * Sources are tried in order: the document `<title>`, `og:title`,
   * `twitter:title`, the first `<h1>`, and finally the URL's hostname
   * (or the raw URL when it cannot be parsed).
   *
   * @param html - Raw HTML of the page.
   * @param url - URL the page was fetched from; used only as a last resort.
   * @returns A trimmed, whitespace-normalized title.
   */
  static extractTitle(html: string, url: string): string {
    const $ = cheerio.load(html);

    const title = ContentExtractor.firstNonEmpty(
      // Scoped to <head> so <title> elements inside inline SVG are not picked up.
      $('head > title').first().text(),
      metaContent($, 'meta[property="og:title"]'),
      metaContent($, 'meta[name="twitter:title"]'),
      $('h1').first().text(),
    );
    if (title) return title;

    try {
      return new URL(url).hostname || url.trim();
    } catch {
      // Unparseable URL: the raw string is the best label left.
      return url.trim();
    }
  }

  /**
   * Produces a summary of at most `maxLength` characters.
   *
   * Whole sentences are taken from the start of `content`, keeping their
   * original punctuation, for as long as they fit. If not even the first
   * sentence fits, the text is cut at a word boundary and `...` is appended,
   * still within `maxLength`.
   *
   * @param content - Text to summarize.
   * @param maxLength - Maximum length of the result in characters.
   * @returns `content` unchanged when it is empty or already short enough,
   *   otherwise the summary.
   */
  static generateSummary(content: string, maxLength: number = 500): string {
    if (!content || content.length <= maxLength) {
      return content;
    }

    // A sentence ends at . ! or ? followed by whitespace or end of input, so
    // decimals ("3.14") and domains ("example.com") are not split. The regex is
    // created per call because the `g` flag makes it stateful.
    const sentencePattern = /[\s\S]+?(?:[.!?](?=\s|$)|$)/g;

    let summary = '';
    let match: RegExpExecArray | null;
    while ((match = sentencePattern.exec(content)) !== null) {
      const sentence = match[0].trim();
      if (!sentence) continue;

      const candidate = summary ? `${summary} ${sentence}` : sentence;
      if (candidate.length > maxLength) break;
      summary = candidate;
    }

    return summary || ContentExtractor.truncateAtWord(content, maxLength);
  }

  /**
   * Checks whether a string is an absolute `http:` or `https:` URL.
   *
   * @param url - Candidate URL.
   * @returns `true` only for parseable URLs using the http or https scheme.
   */
  static isValidUrl(url: string): boolean {
    try {
      const { protocol } = new URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Reduces a URL to scheme, host and path.
   *
   * The query string, fragment and any credentials are all dropped, and one
   * trailing slash is removed unless the path is just `/`.
   *
   * @param url - URL to normalize.
   * @returns The normalized URL, or `url` unchanged when it cannot be parsed.
   */
  static normalizeUrl(url: string): string {
    try {
      const parsed = new URL(url);
      // Clear components on the parsed URL instead of rebuilding the string, so
      // non-hierarchical schemes (mailto:, data:, ...) do not gain a spurious "//".
      parsed.search = '';
      parsed.hash = '';
      parsed.username = '';
      parsed.password = '';

      const normalized = parsed.href;
      return parsed.pathname !== '/' && normalized.endsWith('/')
        ? normalized.slice(0, -1)
        : normalized;
    } catch {
      return url;
    }
  }

  /** Strips noise elements from the document (mutating it) and returns its main text. */
  private static textFromDocument($: LoadedDocument): string {
    $(NOISE_SELECTOR).remove();

    for (const selector of CONTENT_SELECTORS) {
      const region = $(selector);
      if (region.length === 0) continue;

      const text = collapseWhitespace(region.text());
      // A matched but blank container (e.g. an empty app shell) must not hide real content.
      if (text) return text;
    }

    return collapseWhitespace($('body').text());
  }

  /** Returns the first candidate that is non-blank after whitespace normalization. */
  private static firstNonEmpty(...candidates: Array<string | undefined>): string | undefined {
    for (const candidate of candidates) {
      if (candidate === undefined) continue;
      const cleaned = collapseWhitespace(candidate);
      if (cleaned) return cleaned;
    }
    return undefined;
  }

  /** Cuts text at a word boundary so that text plus the ellipsis fits in `maxLength`. */
  private static truncateAtWord(text: string, maxLength: number): string {
    if (maxLength <= ELLIPSIS.length) {
      // No room for the ellipsis; a hard cut is all that fits.
      return text.slice(0, Math.max(0, maxLength));
    }

    const window = text.slice(0, maxLength - ELLIPSIS.length);
    const lastSpace = window.lastIndexOf(' ');
    const head = lastSpace > 0 ? window.slice(0, lastSpace) : window;
    return head.trimEnd() + ELLIPSIS;
  }
}

/** Reads the `content` attribute of the first element matching `selector`. */
function metaContent($: LoadedDocument, selector: string): string | undefined {
  return $(selector).attr('content');
}

/** Collapses every whitespace run to a single space and trims both ends. */
function collapseWhitespace(text: string): string {
  return text.replace(/\s+/g, ' ').trim();
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DuncanDam/my-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server