Web Crawler MCP Server

content-extractor.ts•5.53 KiB

/**
 * Content extraction utilities for web crawling.
 *
 * Turns a page's markdown plus a list of pre-extracted link URLs into a compact
 * structure (filtered links, plain text, title) suitable for AI analysis.
 */

/** Result of extracting content from a single page. */
export interface ExtractedContent {
  /** De-duplicated content links found on the page, as absolute URLs. */
  links: ExtractedLink[];
  /** Plain text of the page with markdown syntax removed, length-limited. */
  pageText: string;
  /** Text of the first level-1 heading, when the page has one. */
  title?: string;
}

/** A single link discovered on a page. */
export interface ExtractedLink {
  /** Absolute URL of the link target, without any `#fragment`. */
  url: string;
  /** Human-readable label derived from the URL. */
  text: string;
  /** `internal` when the target hostname equals the page's hostname. */
  type: 'internal' | 'external';
}

/** Maximum number of characters of page text kept for AI processing. */
const MAX_PAGE_TEXT_LENGTH = 3000;

/** Maximum length of the raw-URL fallback used as link text. */
const MAX_FALLBACK_LINK_TEXT_LENGTH = 50;

/** File extensions (lower-case) that mark a link as a non-content asset. */
const NON_CONTENT_EXTENSIONS: readonly string[] = [
  '.css', '.js', '.json', '.xml', '.rss', '.atom',
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
  '.pdf', '.zip', '.tar', '.gz', '.rar', '.7z',
  '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv',
  '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
];

/** URL schemes that never lead to a crawlable page. */
const NON_CONTENT_SCHEMES: readonly string[] = [
  'mailto:', 'tel:', 'ftp:', 'file:', 'javascript:', 'data:',
];

/** Substrings (lower-case) that mark feeds, asset folders and export endpoints. */
const NON_CONTENT_URL_PATTERNS: readonly string[] = [
  '/feed', '/rss', '/sitemap',
  '/wp-content/', '/wp-includes/',
  '/assets/', '/static/', '/media/',
  '?format=', '&format=',
  '/print/', '/download/',
];

/**
 * Extract content from markdown with pre-extracted links.
 *
 * @param markdown - Markdown rendering of the page.
 * @param baseUrl - Absolute URL of the page; used to resolve relative links and
 *   to classify links as internal or external.
 * @param extractedLinks - Raw hrefs found on the page (absolute or relative).
 * @returns Filtered links, cleaned page text and the page title (if any).
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
export function extractPageContentFromMarkdown(
  markdown: string,
  baseUrl: string,
  extractedLinks: readonly string[]
): ExtractedContent {
  const links = processExtractedLinks(extractedLinks, baseUrl);
  const pageText = extractTextFromMarkdown(markdown);
  const title = extractTitleFromMarkdown(markdown);

  return { links, pageText, title };
}

/**
 * Resolve a possibly-relative href to an absolute URL.
 *
 * Hrefs that already start with `http://` or `https://` are returned unchanged.
 * Everything else (protocol-relative, root-relative, `../x`, `./x`, `?query`)
 * is resolved against `baseUrl` with the standard URL resolver.
 *
 * @throws TypeError if the href cannot be resolved against `baseUrl`.
 */
function resolveUrl(href: string, baseUrl: string): string {
  if (href.startsWith('http://') || href.startsWith('https://')) {
    return href;
  }
  return new URL(href, baseUrl).href;
}

/** Drop the `#fragment` part of a URL; the first `#` always starts the fragment. */
function stripFragment(url: string): string {
  const hashIndex = url.indexOf('#');
  return hashIndex === -1 ? url : url.slice(0, hashIndex);
}

/**
 * Check if a link should be filtered out (CSS, JS, images, feeds, etc.).
 *
 * @param href - Raw href, absolute or relative.
 * @returns `true` when the link does not point at crawlable page content.
 */
function isNonContentLink(href: string): boolean {
  const url = href.trim().toLowerCase();

  // Empty and fragment-only hrefs point back at the current page.
  if (url === '' || url.startsWith('#')) {
    return true;
  }

  if (NON_CONTENT_SCHEMES.some((scheme) => url.startsWith(scheme))) {
    return true;
  }

  // Compare the extension against the path only, so "style.css?v=3" and
  // "logo.png#x" are still recognised as assets.
  const path = url.split(/[?#]/, 1)[0] ?? url;
  if (NON_CONTENT_EXTENSIONS.some((ext) => path.endsWith(ext))) {
    return true;
  }

  return NON_CONTENT_URL_PATTERNS.some((pattern) => url.includes(pattern));
}

/**
 * Process pre-extracted links from Cloudflare Browser API.
 *
 * Filters out non-content links, resolves the rest to absolute URLs, removes
 * fragments so that anchors into the same page collapse to one entry, and
 * de-duplicates while preserving first-seen order.
 *
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
function processExtractedLinks(
  extractedLinks: readonly string[],
  baseUrl: string
): ExtractedLink[] {
  const links: ExtractedLink[] = [];
  const seenUrls = new Set<string>();
  const baseHostname = new URL(baseUrl).hostname;

  for (const rawHref of extractedLinks) {
    const href = rawHref.trim();

    if (isNonContentLink(href)) {
      continue;
    }

    let absoluteUrl: string;
    let hostname: string;
    try {
      absoluteUrl = stripFragment(resolveUrl(href, baseUrl));
      hostname = new URL(absoluteUrl).hostname;
    } catch {
      // Best effort: a malformed href must not abort processing of the page.
      continue;
    }

    if (seenUrls.has(absoluteUrl)) {
      continue;
    }
    seenUrls.add(absoluteUrl);

    links.push({
      url: absoluteUrl,
      // The real anchor text is not available, so derive a label from the URL.
      text: generateLinkText(absoluteUrl),
      type: hostname === baseHostname ? 'internal' : 'external',
    });
  }

  return links;
}

/**
 * Generate link text from a URL when the actual link text is not available.
 *
 * Uses the last path segment (extension removed, dashes/underscores turned into
 * spaces, words capitalised); falls back to the hostname without `www.`, and to
 * the (truncated) raw string when the URL cannot be parsed.
 */
function generateLinkText(url: string): string {
  try {
    const urlObj = new URL(url);
    const pathParts = urlObj.pathname.split('/').filter(Boolean);
    const lastPart = pathParts[pathParts.length - 1];

    if (lastPart !== undefined) {
      return lastPart
        .replace(/\.[^.]*$/, '') // Remove file extension
        .replace(/[-_]/g, ' ') // Dashes and underscores become spaces
        .replace(/\b\w/g, (letter) => letter.toUpperCase()); // Title case
    }

    return urlObj.hostname.replace(/^www\./, '');
  } catch {
    return url.length > MAX_FALLBACK_LINK_TEXT_LENGTH
      ? url.substring(0, MAX_FALLBACK_LINK_TEXT_LENGTH) + '...'
      : url;
  }
}

/**
 * Extract readable text content from markdown.
 *
 * Removes code, images and markdown punctuation, keeps the visible text of
 * links/headings/emphasis, collapses whitespace and limits the result to
 * `MAX_PAGE_TEXT_LENGTH` characters (plus a trailing `...` when truncated).
 */
function extractTextFromMarkdown(markdown: string): string {
  const text = markdown
    // Remove fenced code blocks, then inline code
    .replace(/```[\s\S]*?```/g, '')
    .replace(/`[^`]*`/g, '')
    // Remove header markers but keep the heading text
    .replace(/^#{1,6}\s+(.*)$/gm, '$1')
    // Remove images BEFORE links: an image is a link prefixed with "!", so the
    // reverse order would leave a stray "!alt" behind.
    .replace(/!\[[^\]]*\]\([^)]+\)/g, '')
    // Remove links but keep their text (empty text, e.g. a linked image, vanishes)
    .replace(/\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Remove bold/italic markers
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    .replace(/\*([^*]+)\*/g, '$1')
    .replace(/__([^_]+)__/g, '$1')
    .replace(/_([^_]+)_/g, '$1')
    // Remove horizontal rules
    .replace(/^---+$/gm, '')
    // Remove blockquote markers
    .replace(/^>\s+/gm, '')
    // Normalize whitespace
    .replace(/\s+/g, ' ')
    .trim();

  return text.length > MAX_PAGE_TEXT_LENGTH
    ? text.substring(0, MAX_PAGE_TEXT_LENGTH) + '...'
    : text;
}

/**
 * Extract page title from markdown (first `#` heading).
 *
 * Fenced code blocks are ignored so that a `# comment` inside a code sample is
 * not mistaken for the page title.
 *
 * @returns The trimmed heading text, or `undefined` when there is no heading.
 */
function extractTitleFromMarkdown(markdown: string): string | undefined {
  const withoutCodeBlocks = markdown.replace(/```[\s\S]*?```/g, '');
  const titleMatch = /^#\s+(.+)$/m.exec(withoutCodeBlocks);
  return titleMatch?.[1]?.trim();
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/dhannusch/smart-web-crawler-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

content-extractor.ts•5.53 KiB