/**
* Web Scraping Tool
* Extract data from web pages, parse HTML, and scrape content
*/
import { Tool } from '@modelcontextprotocol/sdk/types.js';
import * as https from 'https';
import * as http from 'http';
import { URL } from 'url';
import { ServerConfig } from '../config.js';
import { WorkspaceError, ErrorCode } from '../utils/errors.js';
/**
 * Arguments accepted by the `web_scrape` tool.
 */
export interface WebScrapeArgs {
  /** URL of the page to scrape (must parse as a valid URL). */
  url: string;
  /** Which scraping operation to perform on the fetched HTML. */
  operation: 'fetch' | 'extract' | 'links' | 'images' | 'text' | 'metadata';
  /** CSS selector (tag, `.class`, or `#id`) — required for 'extract'. */
  selector?: string;
  /** HTML attribute to pull from matched elements (e.g. "href", "src"). */
  attribute?: string;
  /** Request timeout in milliseconds (default 10000). */
  timeout?: number;
  /** Extra HTTP request headers; these override the built-in defaults. */
  headers?: Record<string, string>;
  /** Whether to follow 3xx redirects (default true). */
  followRedirects?: boolean;
  /** Maximum number of redirects to follow (default 5). */
  maxRedirects?: number;
}
/**
 * MCP tool definition for `web_scrape`.
 *
 * The JSON schema below mirrors {@link WebScrapeArgs}; keep the two in sync
 * when adding fields. Defaults here are informational — the actual defaults
 * are applied in `executeWebScrape`.
 */
export const webScrapeTool: Tool = {
  name: 'web_scrape',
  description: 'Scrape web pages - fetch HTML, extract elements, get links, images, text, metadata',
  inputSchema: {
    type: 'object',
    properties: {
      url: {
        type: 'string',
        description: 'URL to scrape'
      },
      operation: {
        type: 'string',
        enum: ['fetch', 'extract', 'links', 'images', 'text', 'metadata'],
        description: 'Scraping operation to perform'
      },
      selector: {
        type: 'string',
        description: 'CSS selector for element extraction'
      },
      attribute: {
        type: 'string',
        description: 'HTML attribute to extract (e.g., "href", "src")'
      },
      timeout: {
        type: 'number',
        description: 'Request timeout in milliseconds',
        default: 10000
      },
      headers: {
        type: 'object',
        description: 'HTTP headers to send',
        additionalProperties: { type: 'string' }
      },
      followRedirects: {
        type: 'boolean',
        description: 'Follow HTTP redirects',
        default: true
      },
      maxRedirects: {
        type: 'number',
        description: 'Maximum number of redirects to follow',
        default: 5
      }
    },
    required: ['url', 'operation']
  }
};
/**
 * Execute a web scraping operation.
 *
 * Fetches the page HTML once, then dispatches to the extraction routine
 * selected by `args.operation`.
 *
 * @param args    Scrape request: URL, operation, and optional selector,
 *                attribute, timeout, headers, and redirect settings.
 * @param _config Server configuration (currently unused).
 * @returns Operation-specific payload plus a human-readable `message`.
 * @throws WorkspaceError INVALID_INPUT for malformed URLs, unsupported
 *         protocols, missing selector, or unknown operations;
 *         UNEXPECTED_ERROR for network/HTTP failures.
 */
export async function executeWebScrape(
  args: WebScrapeArgs,
  _config: ServerConfig
): Promise<{ message: string; data?: any; html?: string; links?: string[]; images?: string[]; text?: string; metadata?: any }> {
  const { url, operation, selector, attribute, timeout = 10000, headers, followRedirects = true, maxRedirects = 5 } = args;
  // Validate both URL syntax AND scheme. fetchHtml falls back to plain HTTP
  // for anything that is not https:, so ftp:/file:/etc. must be rejected
  // here rather than silently fetched over the wrong protocol.
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(url);
  } catch {
    throw new WorkspaceError(ErrorCode.INVALID_INPUT, `Invalid URL: ${url}`);
  }
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
    throw new WorkspaceError(ErrorCode.INVALID_INPUT, `Unsupported protocol: ${parsedUrl.protocol} (only http and https are allowed)`);
  }
  try {
    const html = await fetchHtml(url, timeout, headers, followRedirects, maxRedirects);
    switch (operation) {
      case 'fetch':
        return {
          message: `HTML fetched successfully from ${url}`,
          html
        };
      case 'extract':
        if (!selector) {
          throw new WorkspaceError(ErrorCode.INVALID_INPUT, 'CSS selector is required for extract operation');
        }
        return await extractElements(html, selector, attribute, url);
      case 'links':
        return await extractLinks(html, url);
      case 'images':
        return await extractImages(html, url);
      case 'text':
        return await extractText(html, url);
      case 'metadata':
        return await extractMetadata(html, url);
      default:
        throw new WorkspaceError(ErrorCode.INVALID_INPUT, `Unknown operation: ${operation}`);
    }
  } catch (error) {
    // Re-throw our own errors untouched; wrap everything else with context.
    if (error instanceof WorkspaceError) {
      throw error;
    }
    throw new WorkspaceError(
      ErrorCode.UNEXPECTED_ERROR,
      `Web scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
  }
}
/**
 * Fetch a URL over http/https and resolve with the response body as a
 * UTF-8 string.
 *
 * @param url             Absolute URL to fetch.
 * @param timeout         Socket timeout in milliseconds; on expiry the
 *                        request is destroyed and the promise rejects.
 * @param headers         Extra request headers (override built-in defaults).
 * @param followRedirects Follow 3xx responses with a Location header.
 * @param maxRedirects    Remaining redirect budget (decremented per hop).
 * @throws Error on network failure, timeout, HTTP >= 400, or redirect loop.
 */
async function fetchHtml(
  url: string,
  timeout: number,
  headers?: Record<string, string>,
  followRedirects: boolean = true,
  maxRedirects: number = 5
): Promise<string> {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url);
    const isHttps = urlObj.protocol === 'https:';
    const httpModule = isHttps ? https : http;
    const options = {
      hostname: urlObj.hostname,
      port: urlObj.port || (isHttps ? 443 : 80),
      path: urlObj.pathname + urlObj.search,
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        // BUG FIX: previously advertised "gzip, deflate" but the response
        // was never decompressed, producing binary garbage for any server
        // that honored it. Request an uncompressed body instead.
        'Accept-Encoding': 'identity',
        'Connection': 'keep-alive',
        ...headers
      }
    };
    const req = httpModule.request(options, (res) => {
      // Handle redirects (3xx with a Location header).
      if (followRedirects && res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        if (maxRedirects <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        // Location may be relative; resolve against the current URL.
        const redirectUrl = new URL(res.headers.location, url).toString();
        // Drain the redirect response so the socket is released.
        res.resume();
        fetchHtml(redirectUrl, timeout, headers, followRedirects, maxRedirects - 1)
          .then(resolve)
          .catch(reject);
        return;
      }
      if (res.statusCode && res.statusCode >= 400) {
        reject(new Error(`HTTP ${res.statusCode}: ${res.statusMessage}`));
        return;
      }
      // Accumulate raw chunks and decode once at the end: converting each
      // Buffer chunk to a string individually can split multi-byte UTF-8
      // sequences at chunk boundaries and corrupt the text.
      const chunks: Buffer[] = [];
      res.on('data', (chunk: Buffer) => {
        chunks.push(chunk);
      });
      res.on('end', () => {
        resolve(Buffer.concat(chunks).toString('utf8'));
      });
    });
    req.on('error', (error) => {
      reject(error);
    });
    req.setTimeout(timeout, () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
    req.end();
  });
}
/**
 * Extract elements matching a simple CSS selector from raw HTML.
 *
 * When `attribute` is given, returns that attribute's value for each match
 * (elements lacking the attribute are dropped); otherwise returns each
 * match's text content.
 *
 * NOTE: relies on the regex-based parser below — a real HTML parser
 * (e.g. cheerio) would be more robust.
 */
async function extractElements(
  html: string,
  selector: string,
  attribute?: string,
  _url?: string
): Promise<{ message: string; data: any[] }> {
  const matched = parseHtmlElements(html, selector);
  const data: any[] = attribute
    ? matched.map(el => extractAttribute(el, attribute)).filter(val => val !== null)
    : matched.map(el => extractTextContent(el));
  const attributeSuffix = attribute ? ` (attribute: ${attribute})` : '';
  return {
    message: `Extracted ${data.length} elements using selector "${selector}"${attributeSuffix}`,
    data
  };
}
/**
 * Collect all unique anchor (`<a href="...">`) targets in the HTML,
 * resolved to absolute URLs against the page URL.
 *
 * @param html    Raw HTML of the page.
 * @param baseUrl URL the page was fetched from (base for relative links).
 * @returns Unique absolute link URLs in first-seen order.
 */
async function extractLinks(html: string, baseUrl: string): Promise<{ message: string; links: string[] }> {
  const linkRegex = /<a[^>]+href=["']([^"']+)["'][^>]*>/gi;
  // Set preserves insertion order and dedups in O(1) per link
  // (previously Array.includes — O(n) per link, O(n^2) overall).
  const seen = new Set<string>();
  let match;
  while ((match = linkRegex.exec(html)) !== null) {
    try {
      seen.add(new URL(match[1], baseUrl).toString());
    } catch {
      // Unparseable href — skip it.
    }
  }
  const links = [...seen];
  return {
    message: `Extracted ${links.length} unique links`,
    links
  };
}
/**
 * Collect all unique image (`<img src="...">`) sources in the HTML,
 * resolved to absolute URLs against the page URL.
 *
 * @param html    Raw HTML of the page.
 * @param baseUrl URL the page was fetched from (base for relative sources).
 * @returns Unique absolute image URLs in first-seen order.
 */
async function extractImages(html: string, baseUrl: string): Promise<{ message: string; images: string[] }> {
  const imgRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
  // Set preserves insertion order and dedups in O(1) per image
  // (previously Array.includes — O(n) per image, O(n^2) overall).
  const seen = new Set<string>();
  let match;
  while ((match = imgRegex.exec(html)) !== null) {
    try {
      seen.add(new URL(match[1], baseUrl).toString());
    } catch {
      // Unparseable src — skip it.
    }
  }
  const images = [...seen];
  return {
    message: `Extracted ${images.length} unique images`,
    images
  };
}
/**
 * Extract the visible text content of a page: strips <script>/<style>
 * blocks and all tags, decodes the most common HTML entities, and
 * collapses whitespace.
 *
 * @param html Raw HTML of the page.
 * @param url  Page URL (used only in the result message).
 */
async function extractText(html: string, url: string): Promise<{ message: string; text: string }> {
  // Remove non-visible content before stripping tags.
  let cleanHtml = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  cleanHtml = cleanHtml.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const text = cleanHtml
    .replace(/<[^>]*>/g, ' ')
    // Decode common HTML entities so e.g. "&amp;" does not leak into the
    // text output. "&amp;" is decoded LAST so "&amp;lt;" yields the
    // literal "&lt;" instead of being double-decoded.
    .replace(/&nbsp;/gi, ' ')
    .replace(/&lt;/gi, '<')
    .replace(/&gt;/gi, '>')
    .replace(/&quot;/gi, '"')
    .replace(/&#0*39;|&apos;/gi, "'")
    .replace(/&amp;/gi, '&')
    .replace(/\s+/g, ' ')
    .trim();
  return {
    message: `Extracted text content from ${url} (${text.length} characters)`,
    text
  };
}
/**
 * Extract page metadata: <title>, standard meta tags (description,
 * keywords, author), Open Graph properties, and Twitter Card tags.
 *
 * All fields default to '' when absent; later duplicates overwrite
 * earlier ones (same as the original branch-per-field implementation).
 *
 * @param html Raw HTML of the page.
 * @param url  Page URL (echoed into the metadata and the message).
 */
async function extractMetadata(html: string, url: string): Promise<{ message: string; metadata: any }> {
  const metadata: any = {
    url,
    title: '',
    description: '',
    keywords: '',
    author: '',
    ogTitle: '',
    ogDescription: '',
    ogImage: '',
    twitterTitle: '',
    twitterDescription: '',
    twitterImage: ''
  };
  // Extract <title> text.
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    metadata.title = titleMatch[1].trim();
  }
  // Table mapping a meta tag's identifying attribute/value pair to the
  // metadata key it populates. Replaces nine copy-pasted if-blocks.
  const metaFields: Array<{ attr: string; value: string; key: string }> = [
    { attr: 'name', value: 'description', key: 'description' },
    { attr: 'name', value: 'keywords', key: 'keywords' },
    { attr: 'name', value: 'author', key: 'author' },
    { attr: 'property', value: 'og:title', key: 'ogTitle' },
    { attr: 'property', value: 'og:description', key: 'ogDescription' },
    { attr: 'property', value: 'og:image', key: 'ogImage' },
    { attr: 'name', value: 'twitter:title', key: 'twitterTitle' },
    { attr: 'name', value: 'twitter:description', key: 'twitterDescription' },
    { attr: 'name', value: 'twitter:image', key: 'twitterImage' }
  ];
  const metaRegex = /<meta[^>]+>/gi;
  let match;
  while ((match = metaRegex.exec(html)) !== null) {
    const metaTag = match[0];
    for (const { attr, value, key } of metaFields) {
      // Accept both double- and single-quoted attribute values,
      // matching the original's includes() checks.
      if (metaTag.includes(`${attr}="${value}"`) || metaTag.includes(`${attr}='${value}'`)) {
        const contentMatch = metaTag.match(/content=["']([^"']+)["']/i);
        if (contentMatch) metadata[key] = contentMatch[1];
      }
    }
  }
  return {
    message: `Extracted metadata from ${url}`,
    metadata
  };
}
/**
 * Very basic CSS selector matching over raw HTML. Supports class (`.x`),
 * id (`#x`), and tag (`div`) selectors; returns the matched element
 * strings (opening tag through the first closing tag).
 *
 * NOTE: regex-based — does not handle nested same-tag elements; a real
 * HTML parser (e.g. cheerio) would be more robust.
 */
function parseHtmlElements(html: string, selector: string): string[] {
  // Escape regex metacharacters so selectors like "my.class" cannot
  // alter or break the generated pattern.
  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const elements: string[] = [];
  const collect = (regex: RegExp): void => {
    let match;
    while ((match = regex.exec(html)) !== null) {
      elements.push(match[0]);
    }
  };
  if (selector.startsWith('.')) {
    // Class selector. [\s\S] instead of "." so elements whose content
    // spans multiple lines still match (BUG FIX: "." does not cross
    // newlines without the /s flag).
    const className = escapeRegExp(selector.substring(1));
    collect(new RegExp(`<[^>]+class=["'][^"']*\\b${className}\\b[^"']*["'][^>]*>[\\s\\S]*?</[^>]+>`, 'gi'));
  } else if (selector.startsWith('#')) {
    // ID selector.
    const id = escapeRegExp(selector.substring(1));
    collect(new RegExp(`<[^>]+id=["']${id}["'][^>]*>[\\s\\S]*?</[^>]+>`, 'gi'));
  } else {
    // Tag selector.
    const tag = escapeRegExp(selector);
    collect(new RegExp(`<${tag}[^>]*>[\\s\\S]*?</${tag}>`, 'gi'));
  }
  return elements;
}
/**
 * Return the value of `attribute` in an element's markup, or null if the
 * attribute is absent (or has an empty/unquoted value).
 */
function extractAttribute(element: string, attribute: string): string | null {
  // Escape metacharacters, and require the attribute name to start the
  // string or follow whitespace (BUG FIX: previously "ref" would match
  // inside 'href="..."').
  const escaped = attribute.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(`(?:^|\\s)${escaped}=["']([^"']+)["']`, 'i');
  const match = element.match(regex);
  return match ? match[1] : null;
}
/**
 * Strip all HTML tags from an element string and return the trimmed
 * remaining text.
 */
function extractTextContent(element: string): string {
  const withoutTags = element.replace(/<[^>]*>/g, '');
  return withoutTags.trim();
}