
Bright Data Web MCP

by dsouza-anush
server.js (28.1 kB)
#!/usr/bin/env node
'use strict'; /*jslint node:true es9:true*/
import {FastMCP} from 'fastmcp';
import {z} from 'zod';
import axios from 'axios';
import {tools as browser_tools} from './browser_tools.js';
import {createRequire} from 'node:module';
const require = createRequire(import.meta.url);
const package_json = require('./package.json');
const api_token = process.env.API_TOKEN;
const unlocker_zone = process.env.WEB_UNLOCKER_ZONE || 'mcp_unlocker';
const browser_zone = process.env.BROWSER_ZONE || 'mcp_browser';

// All tools are enabled by default in this version
function parse_rate_limit(rate_limit_str) {
    if (!rate_limit_str)
        return null;
    const match = rate_limit_str.match(/^(\d+)\/(\d+)([mhs])$/);
    if (!match)
        throw new Error('Invalid RATE_LIMIT format. Use: 100/1h or 50/30m');
    const [, limit, time, unit] = match;
    const multiplier = unit==='h' ? 3600 : unit==='m' ? 60 : 1;
    return {
        limit: parseInt(limit),
        window: parseInt(time) * multiplier * 1000,
        display: rate_limit_str,
    };
}

const rate_limit_config = parse_rate_limit(process.env.RATE_LIMIT);

if (!api_token)
    throw new Error('Cannot run MCP server without API_TOKEN env');

const api_headers = ()=>({
    'user-agent': `${package_json.name}/${package_json.version}`,
    authorization: `Bearer ${api_token}`,
});

function check_rate_limit(){
    if (!rate_limit_config)
        return true;
    const now = Date.now();
    const window_start = now - rate_limit_config.window;
    debug_stats.call_timestamps = debug_stats.call_timestamps
        .filter(timestamp=>timestamp>window_start);
    if (debug_stats.call_timestamps.length>=rate_limit_config.limit)
        throw new Error(`Rate limit exceeded: ${rate_limit_config.display}`);
    debug_stats.call_timestamps.push(now);
    return true;
}

async function ensure_required_zones(){
    try {
        console.error('Checking for required zones...');
        let response = await axios({
            url: 'https://api.brightdata.com/zone/get_active_zones',
            method: 'GET',
            headers: api_headers(),
        });
        let zones = response.data || [];
        let has_unlocker_zone = zones.some(zone=>zone.name==unlocker_zone);
        let has_browser_zone = zones.some(zone=>zone.name==browser_zone);
        if (!has_unlocker_zone)
        {
            console.error(`Required zone "${unlocker_zone}" not found, `
                +`creating it...`);
            await axios({
                url: 'https://api.brightdata.com/zone',
                method: 'POST',
                headers: {
                    ...api_headers(),
                    'Content-Type': 'application/json',
                },
                data: {
                    zone: {name: unlocker_zone, type: 'unblocker'},
                    plan: {type: 'unblocker'},
                },
            });
            console.error(`Zone "${unlocker_zone}" created successfully`);
        }
        else
            console.error(`Required zone "${unlocker_zone}" already exists`);
        if (!has_browser_zone)
        {
            console.error(`Required zone "${browser_zone}" not found, `
                +`creating it...`);
            await axios({
                url: 'https://api.brightdata.com/zone',
                method: 'POST',
                headers: {
                    ...api_headers(),
                    'Content-Type': 'application/json',
                },
                data: {
                    zone: {name: browser_zone, type: 'browser_api'},
                    plan: {type: 'browser_api'},
                },
            });
            console.error(`Zone "${browser_zone}" created successfully`);
        }
        else
            console.error(`Required zone "${browser_zone}" already exists`);
    } catch(e){
        console.error('Error checking/creating zones:',
            e.response?.data||e.message);
    }
}

await ensure_required_zones();

let server = new FastMCP({
    name: 'Bright Data',
    version: package_json.version,
});

let debug_stats = {tool_calls: {}, session_calls: 0, call_timestamps: []};

const addTool = (tool) => {
    // Register all tools without restriction
    server.addTool(tool);
};

addTool({
    name: 'search_engine',
    description: 'Scrape search results from Google, Bing or Yandex. Returns '
        +'SERP results in markdown (URL, title, description)',
    parameters: z.object({
        query: z.string(),
        engine: z.enum(['google', 'bing', 'yandex']).optional()
            .default('google'),
        cursor: z.string().optional()
            .describe('Pagination cursor for next page'),
    }),
    execute: tool_fn('search_engine', async({query, engine, cursor})=>{
        let response = await axios({
            url: 'https://api.brightdata.com/request',
            method: 'POST',
            data: {
                url: search_url(engine, query, cursor),
                zone: unlocker_zone,
                format: 'raw',
                data_format: 'markdown',
            },
            headers: api_headers(),
            responseType: 'text',
        });
        return response.data;
    }),
});
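// Single-URL scrapers: like search_engine above, the two tools below proxy
// the request through the Web Unlocker zone; they differ only in whether
// data_format is set to 'markdown' or the response is left as raw HTML.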
addTool({
    name: 'scrape_as_markdown',
    description: 'Scrape a single webpage URL with advanced options for '
        +'content extraction and get back the results in MarkDown language. '
        +'This tool can unlock any webpage even if it uses bot detection or '
        +'CAPTCHA.',
    parameters: z.object({url: z.string().url()}),
    execute: tool_fn('scrape_as_markdown', async({url})=>{
        let response = await axios({
            url: 'https://api.brightdata.com/request',
            method: 'POST',
            data: {
                url,
                zone: unlocker_zone,
                format: 'raw',
                data_format: 'markdown',
            },
            headers: api_headers(),
            responseType: 'text',
        });
        return response.data;
    }),
});

addTool({
    name: 'scrape_as_html',
    description: 'Scrape a single webpage URL with advanced options for '
        +'content extraction and get back the results in HTML. '
        +'This tool can unlock any webpage even if it uses bot detection or '
        +'CAPTCHA.',
    parameters: z.object({url: z.string().url()}),
    execute: tool_fn('scrape_as_html', async({url})=>{
        let response = await axios({
            url: 'https://api.brightdata.com/request',
            method: 'POST',
            data: {
                url,
                zone: unlocker_zone,
                format: 'raw',
            },
            headers: api_headers(),
            responseType: 'text',
        });
        return response.data;
    }),
});
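// The extract tool chains two steps: it scrapes the page as markdown, then
// asks the connected client (via MCP sampling on the first active session)
// to convert that markdown into structured JSON.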
addTool({
    name: 'extract',
    description: 'Scrape a webpage and extract structured data as JSON. '
        + 'First scrapes the page as markdown, then uses AI sampling to convert '
        + 'it to structured JSON format. This tool can unlock any webpage even '
        + 'if it uses bot detection or CAPTCHA.',
    parameters: z.object({
        url: z.string().url(),
        extraction_prompt: z.string().optional().describe(
            'Custom prompt to guide the extraction process. If not provided, '
            + 'will extract general structured data from the page.'),
    }),
    execute: tool_fn('extract', async ({url, extraction_prompt}, ctx) => {
        let scrape_response = await axios({
            url: 'https://api.brightdata.com/request',
            method: 'POST',
            data: {
                url,
                zone: unlocker_zone,
                format: 'raw',
                data_format: 'markdown',
            },
            headers: api_headers(),
            responseType: 'text',
        });
        let markdown_content = scrape_response.data;
        let system_prompt = 'You are a data extraction specialist. You MUST respond with ONLY valid JSON, no other text or formatting. '
            + 'Extract the requested information from the markdown content and return it as a properly formatted JSON object. '
            + 'Do not include any explanations, markdown formatting, or text outside the JSON response.';
        let user_prompt = extraction_prompt ||
            'Extract the requested information from this markdown content and return ONLY a JSON object:';
        let session = server.sessions[0]; // Get the first active session
        if (!session)
            throw new Error('No active session available for sampling');
        let sampling_response = await session.requestSampling({
            messages: [{
                role: "user",
                content: {
                    type: "text",
                    text: `${user_prompt}\n\nMarkdown content:\n${markdown_content}\n\nRemember: Respond with ONLY valid JSON, no other text.`,
                },
            }],
            systemPrompt: system_prompt,
            includeContext: "thisServer",
        });
        return sampling_response.content.text;
    }),
});

addTool({
    name: 'session_stats',
    description: 'Tell the user about the tool usage during this session',
    parameters: z.object({}),
    execute: tool_fn('session_stats', async()=>{
        let used_tools = Object.entries(debug_stats.tool_calls);
        let lines = ['Tool calls this session:'];
        for (let [name, calls] of used_tools)
            lines.push(`- ${name} tool: called ${calls} times`);
        return lines.join('\n');
    }),
});

const datasets = [{
    id: 'amazon_product',
    dataset_id: 'gd_l7q7dkf244hwjntr0',
    description: [
        'Quickly read structured amazon product data.',
        'Requires a valid product URL with /dp/ in it.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'amazon_product_reviews',
    dataset_id: 'gd_le8e811kzy4ggddlq',
    description: [
        'Quickly read structured amazon product review data.',
        'Requires a valid product URL with /dp/ in it.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'amazon_product_search',
    dataset_id: 'gd_lwdb4vjm1ehb499uxs',
    description: [
        'Quickly read structured amazon product search data.',
        'Requires a valid search keyword and amazon domain URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['keyword', 'url', 'pages_to_search'],
    defaults: {pages_to_search: '1'},
}, {
    id: 'walmart_product',
    dataset_id: 'gd_l95fol7l1ru6rlo116',
    description: [
        'Quickly read structured walmart product data.',
        'Requires a valid product URL with /ip/ in it.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'walmart_seller',
    dataset_id: 'gd_m7ke48w81ocyu4hhz0',
    description: [
        'Quickly read structured walmart seller data.',
        'Requires a valid walmart seller URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'ebay_product',
    dataset_id: 'gd_ltr9mjt81n0zzdk1fb',
    description: [
        'Quickly read structured ebay product data.',
        'Requires a valid ebay product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'homedepot_products',
    dataset_id: 'gd_lmusivh019i7g97q2n',
    description: [
        'Quickly read structured homedepot product data.',
        'Requires a valid homedepot product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'zara_products',
    dataset_id: 'gd_lct4vafw1tgx27d4o0',
    description: [
        'Quickly read structured zara product data.',
        'Requires a valid zara product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'etsy_products',
    dataset_id: 'gd_ltppk0jdv1jqz25mz',
    description: [
        'Quickly read structured etsy product data.',
        'Requires a valid etsy product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'bestbuy_products',
    dataset_id: 'gd_ltre1jqe1jfr7cccf',
    description: [
        'Quickly read structured bestbuy product data.',
        'Requires a valid bestbuy product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'linkedin_person_profile',
    dataset_id: 'gd_l1viktl72bvl7bjuj0',
    description: [
        'Quickly read structured linkedin people profile data.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'linkedin_company_profile',
    dataset_id: 'gd_l1vikfnt1wgvvqz95w',
    description: [
        'Quickly read structured linkedin company profile data',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'linkedin_job_listings',
    dataset_id: 'gd_lpfll7v5hcqtkxl6l',
    description: [
        'Quickly read structured linkedin job listings data',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'linkedin_posts',
    dataset_id: 'gd_lyy3tktm25m4avu764',
    description: [
        'Quickly read structured linkedin posts data',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'linkedin_people_search',
    dataset_id: 'gd_m8d03he47z8nwb5xc',
    description: [
        'Quickly read structured linkedin people search data',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url', 'first_name', 'last_name'],
}, {
    id: 'crunchbase_company',
    dataset_id: 'gd_l1vijqt9jfj7olije',
    description: [
        'Quickly read structured crunchbase company data',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'zoominfo_company_profile',
    dataset_id: 'gd_m0ci4a4ivx3j5l6nx',
    description: [
        'Quickly read structured ZoomInfo company profile data.',
        'Requires a valid ZoomInfo company URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'instagram_profiles',
    dataset_id: 'gd_l1vikfch901nx3by4',
    description: [
        'Quickly read structured Instagram profile data.',
        'Requires a valid Instagram URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'instagram_posts',
    dataset_id: 'gd_lk5ns7kz21pck8jpis',
    description: [
        'Quickly read structured Instagram post data.',
        'Requires a valid Instagram URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'instagram_reels',
    dataset_id: 'gd_lyclm20il4r5helnj',
    description: [
        'Quickly read structured Instagram reel data.',
        'Requires a valid Instagram URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'instagram_comments',
    dataset_id: 'gd_ltppn085pokosxh13',
    description: [
        'Quickly read structured Instagram comments data.',
        'Requires a valid Instagram URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'facebook_posts',
    dataset_id: 'gd_lyclm1571iy3mv57zw',
    description: [
        'Quickly read structured Facebook post data.',
        'Requires a valid Facebook post URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'facebook_marketplace_listings',
    dataset_id: 'gd_lvt9iwuh6fbcwmx1a',
    description: [
        'Quickly read structured Facebook marketplace listing data.',
        'Requires a valid Facebook marketplace listing URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'facebook_company_reviews',
    dataset_id: 'gd_m0dtqpiu1mbcyc2g86',
    description: [
        'Quickly read structured Facebook company reviews data.',
        'Requires a valid Facebook company URL and number of reviews.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url', 'num_of_reviews'],
}, {
    id: 'facebook_events',
    dataset_id: 'gd_m14sd0to1jz48ppm51',
    description: [
        'Quickly read structured Facebook events data.',
        'Requires a valid Facebook event URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'tiktok_profiles',
    dataset_id: 'gd_l1villgoiiidt09ci',
    description: [
        'Quickly read structured Tiktok profiles data.',
        'Requires a valid Tiktok profile URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'tiktok_posts',
    dataset_id: 'gd_lu702nij2f790tmv9h',
    description: [
        'Quickly read structured Tiktok post data.',
        'Requires a valid Tiktok post URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'tiktok_shop',
    dataset_id: 'gd_m45m1u911dsa4274pi',
    description: [
        'Quickly read structured Tiktok shop data.',
        'Requires a valid Tiktok shop product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'tiktok_comments',
    dataset_id: 'gd_lkf2st302ap89utw5k',
    description: [
        'Quickly read structured Tiktok comments data.',
        'Requires a valid Tiktok video URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'google_maps_reviews',
    dataset_id: 'gd_luzfs1dn2oa0teb81',
    description: [
        'Quickly read structured Google maps reviews data.',
        'Requires a valid Google maps URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url', 'days_limit'],
    defaults: {days_limit: '3'},
}, {
    id: 'google_shopping',
    dataset_id: 'gd_ltppk50q18kdw67omz',
    description: [
        'Quickly read structured Google shopping data.',
        'Requires a valid Google shopping product URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'google_play_store',
    dataset_id: 'gd_lsk382l8xei8vzm4u',
    description: [
        'Quickly read structured Google play store data.',
        'Requires a valid Google play store app URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'apple_app_store',
    dataset_id: 'gd_lsk9ki3u2iishmwrui',
    description: [
        'Quickly read structured apple app store data.',
        'Requires a valid apple app store app URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'reuter_news',
    dataset_id: 'gd_lyptx9h74wtlvpnfu',
    description: [
        'Quickly read structured reuter news data.',
        'Requires a valid reuter news report URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'github_repository_file',
    dataset_id: 'gd_lyrexgxc24b3d4imjt',
    description: [
        'Quickly read structured github repository data.',
        'Requires a valid github repository file URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'yahoo_finance_business',
    dataset_id: 'gd_lmrpz3vxmz972ghd7',
    description: [
        'Quickly read structured yahoo finance business data.',
        'Requires a valid yahoo finance business URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'x_posts',
    dataset_id: 'gd_lwxkxvnf1cynvib9co',
    description: [
        'Quickly read structured X post data.',
        'Requires a valid X post URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'zillow_properties_listing',
    dataset_id: 'gd_lfqkr8wm13ixtbd8f5',
    description: [
        'Quickly read structured zillow properties listing data.',
        'Requires a valid zillow properties listing URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'booking_hotel_listings',
    dataset_id: 'gd_m5mbdl081229ln6t4a',
    description: [
        'Quickly read structured booking hotel listings data.',
        'Requires a valid booking hotel listing URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'youtube_profiles',
    dataset_id: 'gd_lk538t2k2p1k3oos71',
    description: [
        'Quickly read structured youtube profiles data.',
        'Requires a valid youtube profile URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'youtube_comments',
    dataset_id: 'gd_lk9q0ew71spt1mxywf',
    description: [
        'Quickly read structured youtube comments data.',
        'Requires a valid youtube video URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url', 'num_of_comments'],
    defaults: {num_of_comments: '10'},
}, {
    id: 'reddit_posts',
    dataset_id: 'gd_lvz8ah06191smkebj4',
    description: [
        'Quickly read structured reddit posts data.',
        'Requires a valid reddit post URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}, {
    id: 'youtube_videos',
    dataset_id: 'gd_lk56epmy2i5g7lzu0k',
    description: [
        'Quickly read structured YouTube videos data.',
        'Requires a valid YouTube video URL.',
        'This can be a cache lookup, so it can be more reliable than scraping',
    ].join('\n'),
    inputs: ['url'],
}];
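// Each dataset above becomes a web_data_<id> tool: it triggers a dataset
// collection run, then polls the snapshot endpoint once per second, for up
// to 600 attempts, until the snapshot leaves the running/building state.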
for (let {dataset_id, id, description, inputs, defaults = {}} of datasets)
{
    let parameters = {};
    for (let input of inputs)
    {
        let param_schema = input=='url' ? z.string().url() : z.string();
        parameters[input] = defaults[input] !== undefined ?
            param_schema.default(defaults[input]) : param_schema;
    }
    addTool({
        name: `web_data_${id}`,
        description,
        parameters: z.object(parameters),
        execute: tool_fn(`web_data_${id}`, async(data, ctx)=>{
            let trigger_response = await axios({
                url: 'https://api.brightdata.com/datasets/v3/trigger',
                params: {dataset_id, include_errors: true},
                method: 'POST',
                data: [data],
                headers: api_headers(),
            });
            if (!trigger_response.data?.snapshot_id)
                throw new Error('No snapshot ID returned from request');
            let snapshot_id = trigger_response.data.snapshot_id;
            console.error(`[web_data_${id}] triggered collection with `
                +`snapshot ID: ${snapshot_id}`);
            let max_attempts = 600;
            let attempts = 0;
            while (attempts < max_attempts)
            {
                try {
                    if (ctx && ctx.reportProgress)
                    {
                        await ctx.reportProgress({
                            progress: attempts,
                            total: max_attempts,
                            message: `Polling for data (attempt `
                                +`${attempts + 1}/${max_attempts})`,
                        });
                    }
                    let snapshot_response = await axios({
                        url: `https://api.brightdata.com/datasets/v3`
                            +`/snapshot/${snapshot_id}`,
                        params: {format: 'json'},
                        method: 'GET',
                        headers: api_headers(),
                    });
                    if (['running', 'building'].includes(
                        snapshot_response.data?.status))
                    {
                        console.error(`[web_data_${id}] snapshot not ready, `
                            +`polling again (attempt `
                            +`${attempts + 1}/${max_attempts})`);
                        attempts++;
                        await new Promise(resolve=>setTimeout(resolve, 1000));
                        continue;
                    }
                    console.error(`[web_data_${id}] snapshot data received `
                        +`after ${attempts + 1} attempts`);
                    let result_data = JSON.stringify(snapshot_response.data);
                    return result_data;
                } catch(e){
                    console.error(`[web_data_${id}] polling error: `
                        +`${e.message}`);
                    attempts++;
                    await new Promise(resolve=>setTimeout(resolve, 1000));
                }
            }
            throw new Error(`Timeout after ${max_attempts} seconds waiting `
                +`for data`);
        }),
    });
}

for (let tool of browser_tools)
    addTool(tool);

console.error('Starting server...');
server.start({transportType: 'stdio'});

function tool_fn(name, fn){
    return async(data, ctx)=>{
        check_rate_limit();
        debug_stats.tool_calls[name] = debug_stats.tool_calls[name]||0;
        debug_stats.tool_calls[name]++;
        debug_stats.session_calls++;
        let ts = Date.now();
        console.error(`[%s] executing %s`, name, JSON.stringify(data));
        try { return await fn(data, ctx); }
        catch(e){
            if (e.response)
            {
                console.error(`[%s] error %s %s: %s`, name,
                    e.response.status, e.response.statusText,
                    e.response.data);
                let message = e.response.data;
                if (message?.length)
                    throw new Error(`HTTP ${e.response.status}: ${message}`);
            }
            else
                console.error(`[%s] error %s`, name, e.stack);
            throw e;
        } finally {
            let dur = Date.now()-ts;
            console.error(`[%s] tool finished in %sms`, name, dur);
        }
    };
}

function search_url(engine, query, cursor){
    let q = encodeURIComponent(query);
    let page = cursor ? parseInt(cursor) : 0;
    let start = page * 10;
    if (engine=='yandex')
        return `https://yandex.com/search/?text=${q}&p=${page}`;
    if (engine=='bing')
        return `https://www.bing.com/search?q=${q}&first=${start + 1}`;
    return `https://www.google.com/search?q=${q}&start=${start}`;
}
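
Example client usage (a minimal sketch, not part of this repository): the snippet below assumes the @modelcontextprotocol/sdk package is installed, that server.js is in the current directory, and that API_TOKEN is set in the environment. It spawns the server over the stdio transport that server.start() exposes.

import {Client} from '@modelcontextprotocol/sdk/client/index.js';
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';

// Spawn the server as a child process over stdio (assumed paths/env)
const transport = new StdioClientTransport({
    command: 'node',
    args: ['server.js'],
    env: process.env,
});
const client = new Client({name: 'example-client', version: '1.0.0'});
await client.connect(transport);

// List the registered tools (search_engine, scrape_as_markdown, ...)
const {tools} = await client.listTools();
console.log(tools.map(t => t.name));

// Call one tool: scrape a page as markdown through the unlocker zone
const result = await client.callTool({
    name: 'scrape_as_markdown',
    arguments: {url: 'https://example.com'},
});
console.log(result.content[0].text);
await client.close();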

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/dsouza-anush/brightdata-mcp-heroku'
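
The same lookup from JavaScript (a sketch assuming Node 18+ with built-in fetch, and that the endpoint returns JSON as the curl example suggests):

const res = await fetch('https://glama.ai/api/mcp/v1/servers/dsouza-anush/brightdata-mcp-heroku');
if (!res.ok)
    throw new Error(`Directory API returned HTTP ${res.status}`);
const server_info = await res.json();
console.log(server_info);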

If you have feedback or need assistance with the MCP directory API, please join our Discord server.