Glama
index.js (16.6 kB)
#!/usr/bin/env node

/*
 * MCP WebSearch Server
 * Search & scrape the web via SearchAPI.io
 */

import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import axios from "axios";
import * as cheerio from "cheerio";

const API_KEY = process.env.SEARCHAPI_KEY;
const API_URL = "https://www.searchapi.io/api/v1/search";

// warn if no key
if (!API_KEY) console.error("[warn] SEARCHAPI_KEY not set");

const mcp = new McpServer({
  name: "mcp-websearch",
  version: "2.0.0",
});

// selectors we try when looking for main content
const CONTENT_SELECTORS = [
  "article", "main", "[role='main']", ".post-content", ".article-content",
  ".entry-content", ".content", "#content", ".post-body", ".markdown-body"
];

// junk we strip out
const JUNK_SELECTORS = [
  "script", "style", "noscript", "iframe", "svg", "nav", "footer", "header",
  "aside", ".ads", ".sidebar", ".comments", ".cookie-banner"
].join(", ");

// browser-ish headers
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0",
  "Accept": "text/html,application/xhtml+xml",
  "Accept-Language": "en-US,en;q=0.9"
};

function trim(str, len = 10000) {
  return str.replace(/[\t ]+/g, " ").replace(/\n{3,}/g, "\n\n").trim().slice(0, len);
}

async function req(url, params, tries = 2) {
  let err;
  for (let i = 0; i <= tries; i++) {
    try {
      let res = await axios.get(url, { params, timeout: 15000 });
      return res.data;
    } catch (e) {
      err = e;
      if (i < tries) await new Promise(r => setTimeout(r, 1000 * (i + 1)));
    }
  }
  throw err;
}

function fmtResults(items) {
  if (!items || !items.length) return "Nothing found.";
  let out = "";
  for (let i = 0; i < items.length; i++) {
    let r = items[i];
    out += `[${i + 1}] ${r.title}\n URL: ${r.link}\n ${r.snippet || "-"}\n\n`;
  }
  return out.trim();
}

function parseError(e) {
  if (e.code === "ENOTFOUND") return "Domain not found";
  if (e.code === "ETIMEDOUT") return "Timed out";
  let status = e.response?.status;
  if (status === 403) return "Forbidden (403)";
  if (status === 404) return "Not found (404)";
  if (status === 429) return "Rate limited";
  return e.message;
}

// ---- web_search ----
mcp.tool(
  "web_search",
  `Search the web using Google Search via SearchAPI.io.

USE THIS TOOL WHEN:
- User asks to find information, news, or current events
- User needs to research a topic
- User asks "search for...", "find...", "look up..."
- You need factual information that may have changed recently

RETURNS: List of search results with titles, URLs, and snippets.
After searching, you can use web_scrape to get full content from relevant URLs.`,
  {
    query: z.string().describe("The search query - be specific for better results"),
    num_results: z.number().min(1).max(20).default(10).optional()
      .describe("Number of results to return (1-20, default 10)"),
    page: z.number().min(1).max(10).default(1).optional()
      .describe("Page number for pagination (1-10, default 1). Use if first page doesn't have what you need."),
    time_period: z.enum(["last_hour", "last_day", "last_week", "last_month", "last_year"]).optional()
      .describe("Filter results by time period"),
    site: z.string().optional()
      .describe("Limit search to specific site (e.g., 'github.com', 'stackoverflow.com')"),
  },
  async (args) => {
    if (!API_KEY) {
      return { content: [{ type: "text", text: "Error: SEARCHAPI_KEY not configured." }], isError: true };
    }
    let { query, num_results, page, time_period, site } = args;
    num_results = num_results || 10;
    page = page || 1;
    try {
      let q = site ? `site:${site} ${query}` : query;
      let data = await req(API_URL, { engine: "google", q, api_key: API_KEY, page, time_period, hl: "en" });
      let results = (data.organic_results || []).slice(0, num_results);
      let total = data.search_information?.total_results || "?";
      let header = `Search: "${query}"`;
      if (site) header += ` (site:${site})`;
      if (time_period) header += ` [${time_period}]`;
      header += `\nPage ${page} | ~${total} results\n${"=".repeat(50)}\n\n`;
      let body = fmtResults(results.map(r => ({ title: r.title, link: r.link, snippet: r.snippet })));
      let tip = results.length < 5 ? `\n\nTip: try page=${page + 1} or use ai_search for better answers` : "";
      return { content: [{ type: "text", text: header + body + tip }] };
    } catch (e) {
      return { content: [{ type: "text", text: "Search failed: " + parseError(e) }], isError: true };
    }
  }
);

// ---- ai_search ----
mcp.tool(
  "ai_search",
  `Search using Google AI Mode - get AI-generated answers with sources.

USE THIS TOOL WHEN:
- User needs a comprehensive, synthesized answer (not just links)
- Complex questions that need explanation
- "Explain...", "What is...", "How does... work?"
- When regular search results aren't giving good answers
- Need code examples or technical explanations

RETURNS: AI-generated answer with markdown formatting and reference links.
This is more expensive than web_search, use wisely.`,
  {
    query: z.string().describe("The question or topic to get AI-generated answer for"),
    image_url: z.string().url().optional()
      .describe("Optional image URL for visual questions (e.g., 'What is in this image?')"),
    location: z.string().optional()
      .describe("Location for local queries (e.g., 'New York' for 'restaurants near me')"),
  },
  async (args) => {
    if (!API_KEY) {
      return { content: [{ type: "text", text: "Error: SEARCHAPI_KEY not configured." }], isError: true };
    }
    let { query, image_url, location } = args;
    try {
      let params = { engine: "google_ai_mode", q: query, api_key: API_KEY };
      if (image_url) params.url = image_url;
      if (location) params.location = location;
      let data = await req(API_URL, params);
      let out = `AI Answer: "${query}"\n${"=".repeat(50)}\n\n`;
      // prefer markdown response
      if (data.markdown) {
        out += data.markdown.slice(0, 8000);
      } else if (data.text_blocks) {
        // fallback: parse blocks
        for (let b of data.text_blocks.slice(0, 10)) {
          if (b.type === "header") out += `\n### ${b.answer}\n`;
          else if (b.type === "paragraph") out += b.answer + "\n\n";
          else if (b.type === "code_blocks") out += "```" + (b.language || "") + "\n" + b.code + "\n```\n\n";
          else if (b.type === "unordered_list" && b.items) {
            b.items.forEach(it => { out += "- " + (it.answer || it) + "\n"; });
            out += "\n";
          }
        }
      }
      // sources
      if (data.reference_links?.length) {
        out += "\n---\nSources:\n";
        data.reference_links.slice(0, 8).forEach(ref => {
          out += `[${ref.index}] ${ref.title}\n ${ref.link}\n`;
        });
      }
      // local results (restaurants etc)
      if (data.local_results?.length) {
        out += "\n---\nNearby:\n";
        data.local_results.slice(0, 5).forEach(loc => {
          out += `- ${loc.title}`;
          if (loc.rating) out += ` (${loc.rating}★)`;
          if (loc.address) out += ` - ${loc.address}`;
          out += "\n";
        });
      }
      return { content: [{ type: "text", text: trim(out, 12000) }] };
    } catch (e) {
      return { content: [{ type: "text", text: "AI search failed: " + parseError(e) }], isError: true };
    }
  }
);

// ---- web_scrape ----
mcp.tool(
  "web_scrape",
  `Scrape and extract readable content from a webpage URL.

USE THIS TOOL WHEN:
- You have a URL and need to read its full content
- User shares a link and asks about its content
- After web_search, to get detailed information from a result
- User asks to "read", "summarize", or "extract" from a URL

RETURNS: Page title, meta description, and main text content (cleaned and truncated).
The content is automatically cleaned of scripts, styles, and navigation elements.`,
  {
    url: z.string().url().describe("Full URL to scrape (must start with http:// or https://)"),
    selector: z.string().optional()
      .describe("Optional CSS selector to extract specific elements (e.g., 'article', '.main-content', '#post-body')"),
    extract_mode: z.enum(["text", "markdown", "structured"]).default("text").optional()
      .describe("Output format: 'text' (plain), 'markdown' (preserve formatting), 'structured' (headings + paragraphs)"),
    include_links: z.boolean().default(false).optional()
      .describe("Include links found in the content"),
    max_length: z.number().min(1000).max(50000).default(10000).optional()
      .describe("Maximum content length (1000-50000, default 10000)"),
  },
  async (args) => {
    let { url, selector, extract_mode, include_links, max_length } = args;
    extract_mode = extract_mode || "text";
    max_length = max_length || 10000;
    try {
      let res = await axios.get(url, {
        headers: HEADERS,
        timeout: 20000,
        maxRedirects: 5,
        validateStatus: s => s < 400
      });
      let $ = cheerio.load(res.data);
      $(JUNK_SELECTORS).remove();

      // meta
      let title = $("title").text().trim() || $('meta[property="og:title"]').attr("content") || "Untitled";
      let desc = $('meta[name="description"]').attr("content") || $('meta[property="og:description"]').attr("content") || "";
      let author = $('meta[name="author"]').attr("content") || "";
      let date = $('meta[property="article:published_time"]').attr("content") || $("time[datetime]").attr("datetime") || "";

      // find content container
      let $main;
      if (selector) {
        $main = $(selector);
      } else {
        for (let sel of CONTENT_SELECTORS) {
          let el = $(sel);
          if (el.length && el.text().trim().length > 300) { $main = el; break; }
        }
      }
      if (!$main || !$main.length) $main = $("body");

      let body = "";
      let links = [];

      if (extract_mode === "structured") {
        $main.find("h1,h2,h3,h4,h5,h6,p,li,pre,blockquote").each((_, el) => {
          let tag = el.tagName.toLowerCase();
          let txt = $(el).text().trim();
          if (!txt) return;
          if (tag[0] === "h") body += "\n" + "#".repeat(+tag[1]) + " " + txt + "\n\n";
          else if (tag === "li") body += "- " + txt + "\n";
          else if (tag === "pre") body += "```\n" + txt + "\n```\n\n";
          else if (tag === "blockquote") body += "> " + txt + "\n\n";
          else body += txt + "\n\n";
        });
      } else if (extract_mode === "markdown") {
        $main.find("h1,h2,h3,h4,h5,h6").each((_, el) => {
          let lvl = +el.tagName[1];
          $(el).replaceWith("\n" + "#".repeat(lvl) + " " + $(el).text().trim() + "\n\n");
        });
        $main.find("strong,b").each((_, el) => $(el).replaceWith("**" + $(el).text() + "**"));
        $main.find("em,i").each((_, el) => $(el).replaceWith("*" + $(el).text() + "*"));
        $main.find("code").each((_, el) => $(el).replaceWith("`" + $(el).text() + "`"));
        $main.find("pre").each((_, el) => $(el).replaceWith("\n```\n" + $(el).text() + "\n```\n"));
        body = $main.text();
      } else {
        body = $main.text();
      }

      if (include_links) {
        $main.find("a[href]").each((_, el) => {
          let href = $(el).attr("href");
          let txt = $(el).text().trim();
          if (!href || href[0] === "#" || href.startsWith("javascript:")) return;
          if (!txt) return;
          try {
            links.push({ text: txt.slice(0, 80), url: new URL(href, url).href });
          } catch {}
        });
      }

      body = trim(body, max_length);

      // build output
      let out = `URL: ${url}\nTitle: ${title}\n`;
      if (desc) out += `Description: ${desc}\n`;
      if (author) out += `Author: ${author}\n`;
      if (date) out += `Date: ${date}\n`;
      out += "=".repeat(50) + "\n\n" + (body || "No content found.");

      if (include_links && links.length) {
        let uniq = [...new Map(links.map(l => [l.url, l])).values()].slice(0, 20);
        out += "\n\n---\nLinks:\n";
        uniq.forEach((l, i) => { out += `[${i + 1}] ${l.text}\n ${l.url}\n`; });
      }

      return { content: [{ type: "text", text: out }] };
    } catch (e) {
      return { content: [{ type: "text", text: `Scrape failed (${url}): ${parseError(e)}` }], isError: true };
    }
  }
);

// ---- get_links ----
mcp.tool(
  "get_links",
  `Extract all links from a webpage.

USE THIS TOOL WHEN:
- User wants to see what pages are linked from a URL
- You need to find related pages or navigation options
- Exploring a website's structure
- Finding specific resources linked on a page

RETURNS: List of links with their anchor text and URLs (max 50 links).`,
  {
    url: z.string().url().describe("Full URL to extract links from"),
    filter: z.string().optional()
      .describe("Optional text filter - only return links containing this text in URL or anchor"),
  },
  async (args) => {
    let { url, filter } = args;
    try {
      let res = await axios.get(url, { headers: HEADERS, timeout: 15000 });
      let $ = cheerio.load(res.data);
      let links = [];
      $("a[href]").each((_, el) => {
        let href = $(el).attr("href");
        let txt = $(el).text().trim().replace(/\s+/g, " ");
        if (!href || href[0] === "#" || href.startsWith("javascript:") || href.startsWith("mailto:")) return;
        try {
          let abs = new URL(href, url).href;
          txt = txt.slice(0, 100) || "(no text)";
          if (filter) {
            let f = filter.toLowerCase();
            if (!abs.toLowerCase().includes(f) && !txt.toLowerCase().includes(f)) return;
          }
          links.push({ text: txt, url: abs });
        } catch {}
      });
      let uniq = [...new Map(links.map(l => [l.url, l])).values()];
      let show = uniq.slice(0, 50);
      let out = `Links from: ${url}\n`;
      out += `Found: ${uniq.length}`;
      if (uniq.length > 50) out += " (showing 50)";
      if (filter) out += ` | filter: "${filter}"`;
      out += "\n" + "=".repeat(50) + "\n\n";
      show.forEach((l, i) => { out += `[${i + 1}] ${l.text}\n ${l.url}\n\n`; });
      return { content: [{ type: "text", text: out.trim() }] };
    } catch (e) {
      return { content: [{ type: "text", text: "Failed: " + parseError(e) }], isError: true };
    }
  }
);

// ---- scrape_multiple ----
mcp.tool(
  "scrape_multiple",
  `Scrape multiple URLs at once and return combined results.

USE THIS TOOL WHEN:
- You need to compare content from multiple pages
- Gathering information from several search results
- Research that requires reading multiple sources

RETURNS: Combined content from all URLs with clear separation.
Limited to 5 URLs to avoid timeout.`,
  {
    urls: z.array(z.string().url()).min(1).max(5)
      .describe("Array of URLs to scrape (max 5)"),
    max_per_page: z.number().min(500).max(5000).default(2000).optional()
      .describe("Max content length per page (500-5000, default 2000)"),
  },
  async (args) => {
    let { urls, max_per_page } = args;
    max_per_page = max_per_page || 2000;
    let jobs = urls.map(async (u) => {
      let res = await axios.get(u, { headers: HEADERS, timeout: 15000 });
      let $ = cheerio.load(res.data);
      $(JUNK_SELECTORS).remove();
      let title = $("title").text().trim() || "Untitled";
      let body = "";
      for (let sel of ["article", "main", ".content", "#content", "body"]) {
        if ($(sel).length) { body = $(sel).text().trim(); break; }
      }
      return { url: u, title, body: trim(body, max_per_page) };
    });
    let results = await Promise.allSettled(jobs);
    let out = "Multi-page scrape\n" + "=".repeat(50) + "\n\n";
    results.forEach((r, i) => {
      if (r.status === "fulfilled") {
        let { url, title, body } = r.value;
        out += `[${i + 1}] ${title}\n${url}\n${"─".repeat(40)}\n${body}\n\n`;
      } else {
        out += `[${i + 1}] FAILED: ${urls[i]}\n${r.reason?.message || "error"}\n\n`;
      }
    });
    return { content: [{ type: "text", text: out }] };
  }
);

// ---- start ----
(async () => {
  let transport = new StdioServerTransport();
  await mcp.connect(transport);
  console.error("websearch mcp ready");
})();
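
Because the server speaks MCP over stdio, it can be exercised with the client side of the same SDK. The following is a minimal smoke-test sketch, not part of this repository: it assumes the file above is saved as index.js, that SEARCHAPI_KEY is set in your shell so it can be passed through to the child process, and that the hypothetical file name test-client.js is used for the script.

// test-client.js — illustrative smoke test (assumed file, not shipped with the server)
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

// launch the server as a child process and forward the API key
const transport = new StdioClientTransport({
  command: "node",
  args: ["index.js"],
  env: { SEARCHAPI_KEY: process.env.SEARCHAPI_KEY ?? "" },
});

const client = new Client({ name: "websearch-smoke-test", version: "0.0.1" }, { capabilities: {} });
await client.connect(transport);

// list the registered tools, then run a small search
console.log((await client.listTools()).tools.map(t => t.name));
const result = await client.callTool({
  name: "web_search",
  arguments: { query: "model context protocol", num_results: 5 },
});
console.log(result.content[0].text);

await client.close();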


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/galangw/mcp-websearch'
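
The same lookup can be made from Node.js. A minimal sketch, assuming Node 18+ (built-in fetch) and simply printing whatever JSON the endpoint returns:

// Fetch this server's directory entry from the Glama MCP API (Node 18+).
const res = await fetch("https://glama.ai/api/mcp/v1/servers/galangw/mcp-websearch");
if (!res.ok) throw new Error(`Request failed: ${res.status}`);
console.log(await res.json());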

If you have feedback or need assistance with the MCP directory API, please join our Discord server.