MCP Server for Crawl4AI

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

utility-handlers.ts•9.98 kB

import { BaseHandler } from './base-handler.js';
import { JSExecuteEndpointOptions, JSExecuteEndpointResponse, CrawlResultItem } from '../types.js';

/**
 * Text-only tool response returned by both handlers in this file.
 * Declared as a type alias (not an interface) so it stays structurally
 * identical to the object literal the handlers previously returned.
 */
type TextToolResult = { content: Array<{ type: string; text: string }> };

/** Buckets reported by `extractLinks` when `categorize` is set; key order is the output order. */
type LinkCategory = 'internal' | 'external' | 'social' | 'documents' | 'images' | 'scripts';

const SOCIAL_DOMAINS: readonly string[] = [
  'facebook.com',
  'twitter.com',
  'linkedin.com',
  'instagram.com',
  'youtube.com',
];
const DOCUMENT_EXTENSIONS: readonly string[] = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
const IMAGE_EXTENSIONS: readonly string[] = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp'];
const SCRIPT_EXTENSIONS: readonly string[] = ['.js', '.css'];

/** Wraps a string in the single-item text response shape. */
function textResult(text: string): TextToolResult {
  return { content: [{ type: 'text', text }] };
}

/** Renders one entry of the JS execution results as display text. */
function describeJsResult(res: unknown): string {
  if (res && typeof res === 'object' && 'success' in res) {
    // Status object (e.g. from a null return or a script without `return`).
    const status = res as { success: unknown; error?: unknown };
    return status.success
      ? 'Executed successfully (no return value)'
      : `Error: ${status.error || 'Unknown error'}`;
  }
  // Anything else is the script's actual return value.
  return JSON.stringify(res, null, 2);
}

/** Heuristic: does the crawled page look like a JSON endpoint rather than HTML? */
function looksLikeJson(url: string, markdownContent: string, htmlContent: string): boolean {
  return (
    // URL pattern
    url.includes('/api/') ||
    url.includes('/api.') ||
    // Content type (often shown in markdown conversion)
    markdownContent.includes('application/json') ||
    // JSON structure patterns
    (markdownContent.startsWith('{') && markdownContent.endsWith('}')) ||
    (markdownContent.startsWith('[') && markdownContent.endsWith(']')) ||
    // HTML JSON indicators
    htmlContent.includes('application/json') ||
    // Common JSON keys
    markdownContent.includes('"links"') ||
    markdownContent.includes('"url"') ||
    markdownContent.includes('"data"')
  );
}

/**
 * Pulls `href="..."` targets out of raw content and splits them into
 * same-host and other-host URLs, resolving relative links against `pageUrl`.
 * Throws if links were found and `pageUrl` itself is not a valid URL.
 */
function extractHrefLinks(content: string, pageUrl: string): { internal: string[]; external: string[] } {
  const internal: string[] = [];
  const external: string[] = [];

  const hrefPattern = /href=["']([^"']+)["']/g;
  const foundLinks: string[] = [];
  let match: RegExpExecArray | null;
  while ((match = hrefPattern.exec(content)) !== null) {
    foundLinks.push(match[1]);
  }
  if (foundLinks.length === 0) {
    return { internal, external };
  }

  const currentDomain = new URL(pageUrl).hostname;
  for (const link of foundLinks) {
    try {
      const linkUrl = new URL(link, pageUrl);
      (linkUrl.hostname === currentDomain ? internal : external).push(linkUrl.href);
    } catch {
      // Could not be resolved as a URL; keep the raw value and treat it as internal.
      internal.push(link);
    }
  }
  return { internal, external };
}

/**
 * True when `href` ends with one of `extensions`, either as written or after
 * dropping a trailing query string / fragment (so `a.pdf?dl=1` counts as `.pdf`).
 */
function hasExtension(href: string, extensions: readonly string[]): boolean {
  const lowered = href.toLowerCase();
  const withoutQuery = lowered.replace(/[?#].*$/, '');
  return extensions.some((ext) => lowered.endsWith(ext) || withoutQuery.endsWith(ext));
}

/**
 * True when the link's host is one of the known social domains or a subdomain
 * of one. Comparing the hostname avoids false positives such as
 * `nottwitter.com` or a social domain that only appears in a query string.
 * Unparsable links fall back to a plain substring test.
 */
function isSocialLink(href: string): boolean {
  let hostname: string;
  try {
    hostname = new URL(href).hostname.toLowerCase();
  } catch {
    return SOCIAL_DOMAINS.some((domain) => href.includes(domain));
  }
  return SOCIAL_DOMAINS.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
}

/** Picks the file-type bucket for a link, or `fallback` when no extension matches. */
function categorizeByExtension(href: string, fallback: LinkCategory): LinkCategory {
  if (hasExtension(href, DOCUMENT_EXTENSIONS)) return 'documents';
  if (hasExtension(href, IMAGE_EXTENSIONS)) return 'images';
  if (hasExtension(href, SCRIPT_EXTENSIONS)) return 'scripts';
  return fallback;
}

/**
 * Handlers for utility tools: running JavaScript on a page and extracting links.
 * Relies on `this.service`, `this.axiosClient` and `this.formatError`, none of
 * which are declared in this class (so they come from `BaseHandler`).
 */
export class UtilityHandlers extends BaseHandler {
  /**
   * Executes the given script(s) via `this.service.executeJS` and formats the
   * per-script results, followed by the page markdown when the response has any.
   *
   * @param options - Request options; `scripts` must be provided (string or array).
   * @returns A text response listing each script with its returned value or status.
   * @throws The result of `this.formatError(...)` for any failure, including a missing `scripts`.
   */
  async executeJS(options: JSExecuteEndpointOptions): Promise<TextToolResult> {
    try {
      if (!options.scripts) {
        throw new Error(
          'scripts is required. Please provide JavaScript code to execute. Use "return" statements to get values back.',
        );
      }

      const result: JSExecuteEndpointResponse = await this.service.executeJS(options);

      const jsResults = result.js_execution_result?.results || [];

      // Normalise to an array so results can be paired with their script by index.
      const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts];

      let formattedResults = 'No results returned';
      if (jsResults.length > 0) {
        formattedResults = jsResults
          .map((res: unknown, idx: number) => {
            const script = scripts[idx] || 'Script ' + (idx + 1);
            return `Script: ${script}\nReturned: ${describeJsResult(res)}`;
          })
          .join('\n\n');
      }

      // Markdown may arrive as a plain string or as an object carrying `raw_markdown`.
      let markdownContent = '';
      if (result.markdown) {
        if (typeof result.markdown === 'string') {
          markdownContent = result.markdown;
        } else if (typeof result.markdown === 'object' && result.markdown.raw_markdown) {
          markdownContent = result.markdown.raw_markdown;
        }
      }

      return textResult(
        `JavaScript executed on: ${options.url}\n\nResults:\n${formattedResults}${markdownContent ? `\n\nPage Content After Execution:\n${markdownContent}` : ''}`,
      );
    } catch (error) {
      throw this.formatError(error, 'execute JavaScript');
    }
  }

  /**
   * Crawls `options.url` (POST `/crawl`, cache bypassed) and reports its links.
   *
   * Links supplied in the crawl response are preferred. When there are none, the
   * page is first checked against a JSON-endpoint heuristic (which yields an
   * explanatory note instead of links), then `href=` attributes are scraped from
   * the markdown as a fallback.
   *
   * @param options.url - Page to crawl.
   * @param options.categorize - When truthy, links are grouped into categories
   *   (at most 10 shown per category). When falsy or omitted, a flat list of
   *   all links is returned; no default is applied here.
   * @returns A text response with the link list, the categorized analysis, or the JSON note.
   * @throws The result of `this.formatError(...)` for any failure.
   */
  async extractLinks(options: { url: string; categorize?: boolean }): Promise<TextToolResult> {
    try {
      // Use the crawl endpoint (not md) to get full link data.
      const response = await this.axiosClient.post('/crawl', {
        urls: [options.url],
        crawler_config: {
          cache_mode: 'bypass',
        },
      });

      const results = response.data.results || [response.data];
      const result: CrawlResultItem = results[0] || {};

      // Tolerate a `links` object that is missing either array.
      const apiInternal = result.links?.internal ?? [];
      const apiExternal = result.links?.external ?? [];

      let internalUrls: string[] = [];
      let externalUrls: string[] = [];

      if (apiInternal.length > 0 || apiExternal.length > 0) {
        internalUrls = apiInternal.map((link) => (typeof link === 'string' ? link : link.href));
        externalUrls = apiExternal.map((link) => (typeof link === 'string' ? link : link.href));
      } else {
        const markdownContent = result.markdown?.raw_markdown || result.markdown?.fit_markdown || '';
        const htmlContent = result.html || '';

        if (looksLikeJson(options.url, markdownContent, htmlContent)) {
          return textResult(
            `Note: ${options.url} appears to return JSON data rather than HTML. The extract_links tool is designed for HTML pages with <a> tags. To extract URLs from JSON, you would need to parse the JSON structure directly.`,
          );
        }

        // No links from the server: scrape href attributes out of the markdown.
        if (markdownContent && markdownContent.includes('href=')) {
          const extracted = extractHrefLinks(markdownContent, options.url);
          internalUrls = extracted.internal;
          externalUrls = extracted.external;
        }
      }

      if (!options.categorize) {
        const allUrls = [...internalUrls, ...externalUrls];
        return textResult(`All links from ${options.url}:\n${allUrls.join('\n')}`);
      }

      const categorized: Record<LinkCategory, string[]> = {
        internal: [],
        external: [],
        social: [],
        documents: [],
        images: [],
        scripts: [],
      };

      for (const href of internalUrls) {
        categorized[categorizeByExtension(href, 'internal')].push(href);
      }
      for (const href of externalUrls) {
        // The social check takes precedence over file-type buckets for external links.
        const category = isSocialLink(href) ? 'social' : categorizeByExtension(href, 'external');
        categorized[category].push(href);
      }

      const sections = Object.entries(categorized)
        .map(
          ([category, links]: [string, string[]]) =>
            `${category} (${links.length}):\n${links.slice(0, 10).join('\n')}${links.length > 10 ? '\n...' : ''}`,
        )
        .join('\n\n');

      return textResult(`Link analysis for ${options.url}:\n\n${sections}`);
    } catch (error) {
      throw this.formatError(error, 'extract links');
    }
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/omgwtfwow/mcp-crawl4ai-ts'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.