Web Tools MCP Server

helpers.ts•5.94 KiB

// Helper functions for research analysis and processing interface ExtractedContent { title: string; content: string; quality: 'high' | 'medium' | 'low'; wordCount: number; keyPoints: string[]; } export function enhancedContentExtraction(rawContent: string, url: string): ExtractedContent { if (!rawContent || typeof rawContent !== 'string') { throw new Error('Raw content must be a non-empty string'); } if (!url || typeof url !== 'string') { throw new Error('URL must be a non-empty string'); } const lines = rawContent.split('\n'); let title = ''; const contentLines: string[] = []; const keyPoints: string[] = []; // Enhanced title extraction for (let i = 0; i < Math.min(10, lines.length); i++) { const line = lines[i].trim(); if (!line) continue; // Look for title patterns (improved) if (!title && ( line.includes('<title>') || line.includes('<h1>') || line.includes('**') || line.includes('##') || (line.match(/^[A-Z][^.!?]*[.!?]?$/) && line.length > 10 && line.length < 100) )) { title = line.replace(/<[^>]+>/g, '') .replace(/\*\*/g, '') .replace(/##/g, '') .replace(/&[^;]+;/g, '') .trim(); } } // Enhanced content extraction for (const line of lines) { const cleanLine = line.trim(); if (!cleanLine || cleanLine.length < 10) continue; // Skip navigation, ads, and boilerplate if ( cleanLine.includes('cookie') || cleanLine.includes('advertisement') || cleanLine.includes('subscribe') || cleanLine.includes('newsletter') || cleanLine.match(/^(menu|nav|header|footer|sidebar)/i) || cleanLine.match(/^\d+\s*(min|mins|minutes?)\s*(read|ago)/i) ) continue; // Collect substantial content if (cleanLine.length > 30) { contentLines.push(cleanLine); // Extract key points (sentences with important indicators) if ( cleanLine.length > 50 && cleanLine.length < 300 && (cleanLine.includes(':') || cleanLine.match(/\b(because|therefore|however|important|key|main|primary|essential)\b/i) || cleanLine.match(/^\d+\./) || cleanLine.includes('•') || cleanLine.includes('-')) ) { keyPoints.push(cleanLine); } } } const content = contentLines.join('\n\n'); const wordCount = content.split(/\s+/).length; // Determine content quality let quality: 'high' | 'medium' | 'low' = 'low'; if (wordCount > 200 && keyPoints.length > 2) quality = 'medium'; if (wordCount > 500 && keyPoints.length > 5) quality = 'high'; try { return { title: title || `Content from ${new URL(url).hostname}`, content, keyPoints: keyPoints.slice(0, 10), // Limit key points wordCount, quality }; } catch { return { title: title || 'Unknown Source', content, keyPoints: keyPoints.slice(0, 10), wordCount, quality }; } } // URL utilities (consolidated from url-utilities.ts) interface ScoredUrl { url: string; score: number; } // Extract URLs from general content with regex export function extractRelevantLinks(content: string, maxLinks: number = 5): string[] { const urlPattern = /https?:\/\/[^\s<>"{}|\\^`[\]]+/g; const urls = content.match(urlPattern) || []; // Filter out common non-content URLs const relevantUrls = urls.filter(url => !url.includes('google.com/search') && !url.includes('javascript:') && !url.includes('.css') && !url.includes('.js') && !url.includes('.png') && !url.includes('.jpg') && !url.includes('.gif') ); // Return unique URLs, limited to maxLinks return [...new Set(relevantUrls)].slice(0, maxLinks); } // Consolidated URL relevance scoring (enhanced version) export function scoreUrlRelevance(url: string, query: string, title?: string, position?: number): number { let score = 0; const queryTerms = query.toLowerCase().split(' '); const urlLower = url.toLowerCase(); // Position factor (if provided - earlier results are generally more relevant) if (typeof position === 'number') { score += Math.max(0, (10 - position) / 10) * 0.2; } // Title factor (if provided) if (title) { const titleLower = title.toLowerCase(); const titleLength = title.length; // Title length factor (reasonable length titles are often better) if (titleLength >= 20 && titleLength <= 80) { score += 0.15; } // Query term presence in title const matchingTerms = queryTerms.filter(term => titleLower.includes(term)).length; score += (matchingTerms / queryTerms.length) * 0.25; // Avoid obviously commercial or low-quality patterns if (title.toLowerCase().includes('buy now') || title.toLowerCase().includes('click here')) { score -= 0.2; } } // Score based on query terms in URL queryTerms.forEach(term => { if (urlLower.includes(term)) { score += 0.2; } }); // Domain credibility (enhanced list) const domain = url.split('/')[2] || ''; const reputableDomains = ['wikipedia.org', 'github.com', 'stackoverflow.com', '.edu', '.gov', 'arxiv.org', 'medium.com']; if (reputableDomains.some(reputableDomain => domain.includes(reputableDomain))) { score += 0.2; } // Penalty for suspicious domains if (domain.includes('ads') || domain.includes('tracker') || domain.includes('spam')) { score -= 0.3; } // Penalty for very long URLs (often tracking/advertising) if (url.length > 100) { score -= 0.1; } return Math.max(0, Math.min(1, score)); } // Extract links with priority scoring export function extractLinksWithPriority(content: string, query: string, maxLinks: number = 5): ScoredUrl[] { const urls = extractRelevantLinks(content, maxLinks * 2); // Score each URL and return top-scored ones const scoredUrls = urls.map(url => ({ url, score: scoreUrlRelevance(url, query) })).sort((a, b) => b.score - a.score); return scoredUrls.slice(0, maxLinks); }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/prestonfong/cc-webtools-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

helpers.ts•5.94 KiB