import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
import { z } from "zod";
import express from "express";
import { fetch, fetchMarkdown, type CrawlResult as ExternalCrawlResult } from "@just-every/crawl";
import { type CrawlOptions, type CrawlResult, type ToolResult } from "./types.js";
import { cdpManager } from "./cdp.js";
// Minimal structural stand-in for a browser DOM element, used to type the
// callbacks passed to page.evaluate() without enabling the full "dom" lib
// in this Node-targeted build. Only the members actually used below exist.
interface DOMElement {
textContent: string | null;
querySelector: (selector: string) => DOMElement | null;
querySelectorAll: (selector: string) => DOMElement[];
// Present on anchor elements; optional because most elements lack it.
href?: string;
}
// Ambient declarations for the browser globals referenced inside
// page.evaluate() callbacks (which execute in the page, not in Node).
// Deliberately narrow: just the properties this file reads.
declare global {
const window: {
location: {
href: string;
};
};
const document: {
querySelector: (selector: string) => DOMElement | null;
querySelectorAll: (selector: string) => DOMElement[];
title: string;
body: {
textContent: string | null;
};
};
}
/**
 * Convert a crawl result from the external @just-every/crawl package into
 * the local CrawlResult shape, copying only the fields this server exposes.
 */
function mapExternalCrawlResult(result: ExternalCrawlResult): CrawlResult {
  const { url, markdown, title, links, error } = result;
  return { url, markdown, title, links, error };
}
// Input schema for the crawl_fetch_markdown tool: a required URL plus an
// optional timeout. .strict() rejects unknown keys so client typos surface
// as validation errors instead of being silently ignored.
const FetchMarkdownInputSchema = z.object({
url: z.string()
.url("Must be a valid URL")
.describe("The URL to fetch and convert to markdown"),
options: z.object({
timeout: z.number()
.int()
.min(1000)
.max(300000)
.default(30000)
.describe("Request timeout in milliseconds"),
})
.partial()
.optional()
.describe("Optional crawling configuration")
}).strict();
// Static type derived from the schema so handler params stay in sync with it.
type FetchMarkdownInput = z.infer<typeof FetchMarkdownInputSchema>;
// Input schema for the crawl_fetch tool: starting URL plus crawl tuning
// options (page budget, concurrency, origin filtering, timeout, result cap).
const FetchInputSchema = z.object({
url: z.string()
.url("Must be a valid URL")
.describe("The URL to crawl"),
options: z.object({
pages: z.number()
.int()
.min(1)
.max(100)
.default(1)
.describe("Maximum number of pages to crawl"),
maxConcurrency: z.number()
.int()
.min(1)
.max(20)
.default(3)
.describe("Maximum concurrent requests"),
sameOriginOnly: z.boolean()
.default(true)
.describe("Whether to only crawl same-origin URLs"),
timeout: z.number()
.int()
.min(1000)
.max(300000)
.default(30000)
.describe("Request timeout in milliseconds"),
maxResults: z.number()
.int()
.min(1)
.max(50)
.optional()
.describe("Maximum results to return (for context management)"),
})
.partial()
.optional()
.describe("Optional crawling configuration")
}).strict();
// Static type derived from the schema so handler params stay in sync with it.
type FetchInput = z.infer<typeof FetchInputSchema>;
// Input schema for the crawl_read tool. Intentionally the same shape as
// FetchMarkdownInputSchema (URL + optional timeout); kept separate so the
// two tools can diverge independently.
const ReadInputSchema = z.object({
url: z.string()
.url("Must be a valid URL")
.describe("The URL to read and extract content from"),
options: z.object({
timeout: z.number()
.int()
.min(1000)
.max(300000)
.default(30000)
.describe("Request timeout in milliseconds"),
})
.partial()
.optional()
.describe("Optional reading configuration")
}).strict();
// Static type derived from the schema so handler params stay in sync with it.
type ReadInput = z.infer<typeof ReadInputSchema>;
// Input schema for the crawl_read_batch tool: 1-50 URLs plus optional
// concurrency/timeout tuning and a result cap.
const ReadBatchInputSchema = z.object({
urls: z.array(z.string().url("Must be a valid URL"))
.min(1)
.max(50)
.describe("Array of URLs to read (1-50 URLs)"),
options: z.object({
maxConcurrency: z.number()
.int()
.min(1)
.max(20)
.default(5)
.describe("Maximum concurrent requests for batch processing"),
timeout: z.number()
.int()
.min(1000)
.max(300000)
.default(30000)
.describe("Request timeout in milliseconds"),
maxResults: z.number()
.int()
.min(1)
.max(50)
.optional()
.describe("Maximum results to return (for context management)"),
})
.partial()
.optional()
.describe("Optional batch reading configuration")
}).strict();
// Static type derived from the schema so handler params stay in sync with it.
type ReadBatchInput = z.infer<typeof ReadBatchInputSchema>;
/**
 * Render an ordered list of crawl results as one Markdown document.
 *
 * Each result becomes a "## Result N: <url>" section. A failed result shows
 * only its error; a successful one shows its optional title and markdown
 * body. When `truncated` is set, a trailing note tells the caller the
 * response was cut short.
 */
function formatMarkdownOutput(results: CrawlResult[], truncated: boolean): string {
  const parts: string[] = ["# Web Crawl Results", ""];
  for (const [index, result] of results.entries()) {
    parts.push(`## Result ${index + 1}: ${result.url}`, "");
    if (result.error) {
      parts.push(`**Error**: ${result.error}`, "");
      continue;
    }
    if (result.title) {
      parts.push(`**Title**: ${result.title}`, "");
    }
    parts.push(result.markdown, "");
  }
  if (truncated) {
    parts.push("\n*Response truncated due to length. Consider using more specific URLs or reducing the number of pages.*");
  }
  return parts.join("\n");
}
// Single MCP server instance; every tool below is registered on it, and it is
// later connected to either a stdio or streamable-HTTP transport.
const server = new McpServer({
name: "crawl-mcp-server",
version: "1.0.0"
});
// Tool registration: fetch one URL and return its readable content as Markdown.
server.registerTool(
  "crawl_fetch_markdown",
  {
    title: "Fetch URL as Markdown",
    description: `Fetch a single URL and convert it to clean Markdown.
WORKFLOW:
1. Fetch URL with Mozilla Readability algorithm
2. Convert to clean Markdown
3. Return Markdown result
Args:
- url (string, required): The URL to fetch
- options.timeout (number, optional): Request timeout in ms (default: 30000)
Returns Markdown with url, markdown, title, and error fields.`,
    inputSchema: FetchMarkdownInputSchema,
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: true
    }
  },
  async (params: FetchMarkdownInput) => {
    try {
      // Single page only; robots.txt is deliberately ignored and results are
      // cached under .cache.
      const options: Partial<CrawlOptions> = {
        pages: 1,
        respectRobots: false,
        cacheDir: ".cache",
        timeout: params.options?.timeout ?? 30000
      };
      const markdown = await fetchMarkdown(params.url, options);
      const structured: CrawlResult = { url: params.url, markdown };
      const reply: ToolResult = {
        content: [{ type: "text", text: markdown }],
        structuredContent: structured
      };
      return reply;
    } catch (error) {
      // Errors are reported as tool text rather than thrown, per MCP convention.
      const message = error instanceof Error ? error.message : String(error);
      return {
        content: [{
          type: "text",
          text: `Error fetching ${params.url}: ${message}`
        }]
      };
    }
  }
);
// Tool registration: crawl multiple pages starting from one URL.
server.registerTool(
  "crawl_fetch",
  {
    title: "Crawl Multiple Web Pages",
    description: `Fetch a URL and crawl multiple pages with intelligent link extraction.
WORKFLOW:
1. Analyze starting URL and extract links
2. Execute concurrent crawling (configurable 1-20)
3. Aggregate and format results
ARGS:
- url (string, required): The starting URL to crawl
- options.pages (number, default: 1): Maximum pages to crawl (1-100)
- options.maxConcurrency (number, default: 3): Concurrent requests (1-20)
- options.sameOriginOnly (boolean, default: true): Same-origin filtering
- options.timeout (number, default: 30000): Request timeout (ms)
- options.maxResults (number, optional): Limit returned results (1-50)
Returns Markdown with results array and summary statistics.`,
    inputSchema: FetchInputSchema,
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: true
    }
  },
  async (params: FetchInput) => {
    try {
      const opts = params.options ?? {};
      // Crawl configuration: robots.txt deliberately ignored, results cached.
      const crawlOptions: CrawlOptions = {
        pages: opts.pages ?? 1,
        maxConcurrency: opts.maxConcurrency ?? 3,
        respectRobots: false,
        sameOriginOnly: opts.sameOriginOnly ?? true,
        cacheDir: ".cache",
        timeout: opts.timeout ?? 30000
      };
      const all = (await fetch(params.url, crawlOptions)).map(mapExternalCrawlResult);
      // Optionally cap the number of results returned (context management).
      const cap = opts.maxResults;
      const capped = cap ? all.length > cap : false;
      const shown = cap ? all.slice(0, cap) : all;
      const output = {
        results: shown,
        count: shown.length,
        summary: {
          successful: shown.filter(r => !r.error).length,
          failed: shown.filter(r => r.error).length,
          ...(capped ? {
            note: `Results limited to ${cap} of ${all.length} total`
          } : {})
        }
      };
      const reply: ToolResult = {
        content: [{ type: "text", text: formatMarkdownOutput(shown, capped) }],
        structuredContent: output
      };
      return reply;
    } catch (error) {
      // Errors are reported as tool text rather than thrown, per MCP convention.
      const message = error instanceof Error ? error.message : String(error);
      return {
        content: [{
          type: "text",
          text: `Error crawling ${params.url}: ${message}`
        }]
      };
    }
  }
);
// Tool registration: simple single-URL reader (same engine as crawl_fetch_markdown).
server.registerTool(
  "crawl_read",
  {
    title: "Simple URL Reader",
    description: `Simple, fast URL content reader for basic use cases.
WORKFLOW:
1. Read URL with default settings
2. Extract content using Mozilla Readability
3. Return Markdown result
ARGS:
- url (string, required): The URL to read
- options.timeout (number, optional): Request timeout in ms (default: 30000)
Returns Markdown with url, markdown, title, and error fields.`,
    inputSchema: ReadInputSchema,
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: true
    }
  },
  async (params: ReadInput) => {
    try {
      // Single page only; robots.txt is deliberately ignored and results are
      // cached under .cache.
      const options: Partial<CrawlOptions> = {
        pages: 1,
        respectRobots: false,
        cacheDir: ".cache",
        timeout: params.options?.timeout ?? 30000
      };
      const markdown = await fetchMarkdown(params.url, options);
      const structured: CrawlResult = { url: params.url, markdown };
      const reply: ToolResult = {
        content: [{ type: "text", text: markdown }],
        structuredContent: structured
      };
      return reply;
    } catch (error) {
      // Errors are reported as tool text rather than thrown, per MCP convention.
      const message = error instanceof Error ? error.message : String(error);
      return {
        content: [{
          type: "text",
          text: `Error reading ${params.url}: ${message}`
        }]
      };
    }
  }
);
// Tool registration: read many URLs in one call with bounded concurrency.
server.registerTool(
  "crawl_read_batch",
  {
    title: "Batch URL Reader",
    description: `Efficiently read multiple URLs in a single call.
WORKFLOW:
1. Validate all URLs (1-50 URLs)
2. Execute concurrent reading (configurable 1-20)
3. Aggregate and summarize results
ARGS:
- urls (array of strings, required): URLs to read (1-50 URLs)
- options.maxConcurrency (number, default: 5): Concurrent requests (1-20)
- options.timeout (number, default: 30000): Request timeout (ms)
- options.maxResults (number, optional): Limit returned results (1-50)
Returns JSON with results array and summary statistics.`,
    inputSchema: ReadBatchInputSchema,
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: true
    }
  },
  async (params: ReadBatchInput) => {
    try {
      const maxConcurrency = params.options?.maxConcurrency ?? 5;
      const crawlOptions: CrawlOptions = {
        pages: 1,
        maxConcurrency,
        respectRobots: false,
        sameOriginOnly: false,
        cacheDir: ".cache",
        timeout: params.options?.timeout ?? 30000
      };
      // FIX: URLs were previously fetched strictly sequentially, making the
      // maxConcurrency option a no-op. Run a small worker pool instead; the
      // workers share one iterator, and results are written by index so the
      // output order matches the input order.
      const results: CrawlResult[] = new Array(params.urls.length);
      const pending = params.urls.entries();
      const worker = async (): Promise<void> => {
        for (const [i, url] of pending) {
          try {
            const markdown = await fetchMarkdown(url, crawlOptions);
            results[i] = { url, markdown, title: undefined };
          } catch (error) {
            // Per-URL failures become error entries; the batch keeps going.
            const errorMessage = error instanceof Error ? error.message : String(error);
            results[i] = { url, markdown: "", error: errorMessage };
          }
        }
      };
      const workerCount = Math.min(maxConcurrency, params.urls.length);
      await Promise.all(Array.from({ length: workerCount }, () => worker()));
      const maxResults = params.options?.maxResults;
      const limitedResults = maxResults ? results.slice(0, maxResults) : results;
      // Human-readable Markdown rendering of the batch.
      const lines: string[] = ["# Batch Reading Results", "", `Total URLs: ${params.urls.length}`, ""];
      limitedResults.forEach((result, index) => {
        lines.push(`## Result ${index + 1}: ${result.url}`);
        lines.push("");
        if (result.error) {
          lines.push(`**Error**: ${result.error}`);
          lines.push("");
          return;
        }
        lines.push(result.markdown);
        lines.push("");
      });
      if (maxResults && results.length > maxResults) {
        lines.push(`\n*Results limited to ${maxResults} of ${results.length} total*`);
      }
      const toolResult: ToolResult = {
        content: [{ type: "text", text: lines.join("\n") }],
        structuredContent: {
          results: limitedResults,
          count: limitedResults.length,
          summary: {
            successful: limitedResults.filter(r => !r.error).length,
            // FIX: 'failed' previously reused the success predicate (!r.error),
            // so it always reported the same number as 'successful'.
            failed: limitedResults.filter(r => r.error).length,
            total_urls: params.urls.length,
            ...(maxResults && results.length > maxResults ? {
              note: `Results limited to ${maxResults} of ${results.length} total`
            } : {})
          }
        }
      };
      return toolResult;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      return {
        content: [{
          type: "text",
          text: `Error reading batch: ${errorMessage}`
        }]
      };
    }
  }
);
// User-Agent string of a current desktop Chrome on macOS, sent so automated
// requests look like a real browser.
const REALISTIC_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
// Chromium launch flag that removes the navigator.webdriver automation marker
// exposed by default automation builds.
const ANTI_DETECTION_ARGS = [
"--disable-blink-features=AutomationControlled"
];
// Headers a typical interactive browser would send, attached to every request
// made by locally launched contexts.
const EXTRA_HTTP_HEADERS = {
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9",
"DNT": "1"
};
import type { Browser, BrowserContext, Page } from "playwright-core";
// How the current browser session was established:
//  - "cdp_system": system Chrome launched via cdpManager
//  - "cdp_remote": attached to an external CDP endpoint
//  - "local":      bundled Playwright Chromium launched by this process
type ConnectionType = "cdp_system" | "cdp_remote" | "local" | null;
// Mutable module-level browser session shared by all browser-backed tools.
type State = {
browser: Browser | null;
context: BrowserContext | null;
page: Page | null;
connectedVia: ConnectionType;
};
// Singleton session; closeAll() resets the slots, ensurePage() fills them lazily.
const state: State = {
browser: null,
context: null,
page: null,
connectedVia: null
};
// Lazily resolve a usable Page for the current session, reusing an existing
// context/page when the connected browser already has one (common after
// attaching over CDP). Throws if no browser session has been established.
async function ensurePage() {
  if (state.page) return state.page;
  const browser = state.browser;
  if (!browser) throw new Error("No browser session. Call launch_chrome_cdp, connect_cdp, or launch_local first.");
  const [existingContext] = browser.contexts();
  state.context = existingContext ?? (await browser.newContext());
  const [existingPage] = state.context.pages();
  state.page = existingPage ?? (await state.context.newPage());
  return state.page;
}
// Injects a script that runs in every new document before page scripts, to
// mask common headless-automation signals that bot-detection code probes for.
async function setupAntiDetection(page: Page): Promise<void> {
await page.addInitScript(() => {
// Hide the automation flag; real browsers report undefined/false here.
Object.defineProperty(navigator, "webdriver", { get: () => false });
// NOTE(review): plugins are faked as plain numbers, not Plugin objects —
// this only defeats simple `navigator.plugins.length` checks.
Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] });
// Headless Chromium lacks window.chrome; real Chrome exposes it.
(window as { chrome?: { runtime: Record<string, never> } }).chrome = { runtime: {} };
});
}
// Tear down the current session in page -> context -> browser order.
// Each stage is wrapped in try/finally so a failing close still clears the
// corresponding state slot and lets the later stages run.
async function closeAll(): Promise<void> {
try {
if (state.page) {
await state.page.close({ runBeforeUnload: true }).catch(() => {});
}
} finally {
state.page = null;
}
try {
if (state.context) {
await state.context.close().catch(() => {});
}
} finally {
state.context = null;
}
try {
// Only a locally launched Chromium is actually closed; CDP-attached
// browsers are left running because this process does not own them.
if (state.browser && state.connectedVia === "local") {
await state.browser.close().catch(() => {});
}
} finally {
// Keep the browser handle for system Chrome so the managed CDP process
// can be reused across sessions; drop it for every other connection type.
if (state.connectedVia !== "cdp_system") {
state.browser = null;
}
state.connectedVia = null;
}
}
// Input schema for launch_chrome_cdp; privileged ports (< 1024) are rejected.
const LaunchChromeCDPInputSchema = z.object({
headless: z.boolean().default(true),
port: z.number().int().min(1024).max(65535).default(9222),
userDataDir: z.string().optional()
}).strict();
// Input schema for connect_cdp; per its description the endpoint may be a
// WebSocket URL or an HTTP endpoint.
const ConnectCDPInputSchema = z.object({
cdpWsUrl: z.string().min(1).describe("CDP WebSocket URL or HTTP endpoint")
}).strict();
// Input schema for search_searx: query plus SearXNG category/time/language
// filters and a result cap.
const SearchInputSchema = z.object({
query: z.string().min(1),
category: z.enum(["general", "images", "videos", "news", "map", "music", "it", "science"]).default("general"),
maxResults: z.number().int().min(1).max(50).default(20),
timeRange: z.enum(["day", "week", "month", "year"]).optional(),
language: z.string().default("en")
}).strict();
server.registerTool(
"launch_chrome_cdp",
{
title: "Launch system Chrome via CDP",
description: "Launch local system Chrome with remote debugging enabled for SearX search. Uses your installed Chrome.",
inputSchema: LaunchChromeCDPInputSchema,
annotations: {
readOnlyHint: true,
destructiveHint: false,
idempotentHint: false,
openWorldHint: false
}
},
async ({ headless, port, userDataDir }) => {
await closeAll();
const runningChrome = await cdpManager.startChrome({ headless, port, userDataDir });
state.connectedVia = "cdp_system";
const output = {
ok: true,
endpoint: runningChrome.endpointURL,
port: runningChrome.port,
headless,
connected: await cdpManager.isHealthy()
};
return {
content: [{ type: "text", text: JSON.stringify(output, null, 2) }],
structuredContent: output
};
}
);
// Tool registration: attach to an already-running browser over CDP.
server.registerTool(
  "connect_cdp",
  {
    title: "Connect to remote CDP browser",
    description: "Connect to remote CDP browser (Browserbase, remote Chrome, etc.).",
    inputSchema: ConnectCDPInputSchema,
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: false,
      openWorldHint: false
    }
  },
  async ({ cdpWsUrl }) => {
    // Drop any previous session before attaching to the new endpoint.
    await closeAll();
    const { chromium } = await import("playwright-core");
    state.browser = await chromium.connectOverCDP(cdpWsUrl, { timeout: 30000 });
    state.connectedVia = "cdp_remote";
    const payload = { ok: true, connectedVia: "cdp_remote" as const, endpoint: cdpWsUrl };
    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }],
      structuredContent: payload
    };
  }
);
// Tool registration: launch the Playwright-bundled Chromium with a
// realistic-looking context and anti-detection init script.
server.registerTool(
  "launch_local",
  {
    title: "Launch local Chromium (bundled)",
    description: "Launch bundled Chromium for SearX search. Works with SearX but DuckDuckGo requires CDP.",
    inputSchema: z.object({
      headless: z.boolean().default(true),
      userAgent: z.string().optional()
    }),
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: false,
      openWorldHint: false
    }
  },
  async ({ headless, userAgent }) => {
    // Drop any previous session before launching a fresh browser.
    await closeAll();
    const { chromium } = await import("playwright-core");
    state.browser = await chromium.launch({ headless, args: ANTI_DETECTION_ARGS });
    state.connectedVia = "local";
    state.context = await state.browser.newContext({
      userAgent: userAgent || REALISTIC_USER_AGENT,
      viewport: { width: 1920, height: 1080 },
      locale: "en-US",
      extraHTTPHeaders: EXTRA_HTTP_HEADERS,
      ignoreHTTPSErrors: true
    });
    state.page = await state.context.newPage();
    await setupAntiDetection(state.page);
    const payload = { ok: true, connectedVia: "local" as const, hasPage: true };
    return {
      content: [{ type: "text", text: JSON.stringify(payload) }],
      structuredContent: payload
    };
  }
);
// Tool registration: scrape search results from the searx.fmhy.net SearXNG
// instance using whichever browser session is active (auto-launching a local
// headless Chromium if none exists).
server.registerTool(
"search_searx",
{
title: "Search SearXNG (Headless-Capable)",
description: `Search SearX using searx.fmhy.net. Works with local Chromium, system Chrome CDP, or remote CDP.
WORKFLOW:
1. Launch/connect browser (launch_local, launch_chrome_cdp, or connect_cdp)
2. Navigate to searx.fmhy.net
3. Execute search query
4. Parse and extract results
ARGS:
- query (string, required): Search query
- category (enum, default: general): Search category
- maxResults (number, default: 20): Maximum results (1-50)
- timeRange (enum, optional): Time filter
- language (string, default: en): Language code
Returns JSON with search results array and metadata.`,
inputSchema: SearchInputSchema,
annotations: {
readOnlyHint: true,
destructiveHint: false,
idempotentHint: true,
openWorldHint: true
}
},
async ({ query, category, maxResults, timeRange, language }) => {
// No browser session yet: fall back to launching bundled headless Chromium.
// NOTE(review): this duplicates the launch_local setup; keep both in sync.
if (!state.browser) {
const { chromium } = await import("playwright-core");
state.browser = await chromium.launch({ headless: true, args: ANTI_DETECTION_ARGS });
state.connectedVia = "local";
const contextOptions = {
userAgent: REALISTIC_USER_AGENT,
viewport: { width: 1920, height: 1080 },
locale: "en-US",
extraHTTPHeaders: EXTRA_HTTP_HEADERS,
ignoreHTTPSErrors: true
} as const;
state.context = await state.browser.newContext(contextOptions);
state.page = await state.context.newPage();
await setupAntiDetection(state.page);
}
const page = await ensurePage();
// Build the SearXNG query URL; time_range is only appended when provided.
const searchUrl = `https://searx.fmhy.net/?q=${encodeURIComponent(query)}&category=${category}${timeRange ? `&time_range=${timeRange}` : ""}&lang=${language}`;
await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 30000 });
// SearXNG themes mark results differently; try progressively more generic
// selectors until one appears (5s each).
const resultSelectors = ["article.result", "div.result", "li.result", ".result"];
let resultsFound = false;
for (const selector of resultSelectors) {
try {
await page.waitForSelector(selector, { timeout: 5000 });
resultsFound = true;
break;
} catch {
continue;
}
}
// Give JS-rendered content a moment to settle; wait longer if nothing matched.
await page.waitForTimeout(resultsFound ? 1000 : 3000);
// Basic page info, returned only when zero results to help debug blocks/redirects.
const diagnostic = await page.evaluate(() => ({
finalUrl: window.location.href,
pageTitle: document.title,
bodyLength: document.body.textContent?.length ?? 0
}));
// Extract results in the page context. The first selector that yields any
// items wins; javascript:/# pseudo-links are filtered out.
const results = await page.evaluate((max: number) => {
const items: Array<{ title: string; url: string; snippet: string; engine?: string }> = [];
const selectors = ["article.result", "div.result", "li.result", ".result"];
for (const sel of selectors) {
const elements = document.querySelectorAll(sel);
if (elements.length > 0) {
elements.forEach((el: DOMElement) => {
const titleEl = el.querySelector("h3 a, h2 a, a");
const title = titleEl?.textContent?.trim() ?? "";
const urlEl = el.querySelector("a");
const url = urlEl?.href ?? "";
const snippetEl = el.querySelector("p, .content, .description");
const snippet = snippetEl?.textContent?.trim() ?? "";
const engineEl = el.querySelector(".engine, .badge, .result_engine");
const engine = engineEl?.textContent?.trim() || undefined;
if (title && url && !url.startsWith("javascript:") && !url.startsWith("#")) {
items.push({ title, url, snippet, engine });
}
});
if (items.length > 0) break;
}
}
return items.slice(0, max);
}, maxResults).catch(() => []);
const output = {
ok: true,
engine: "searx" as const,
query,
url: page.url(),
resultsCount: results.length,
diagnostic: results.length === 0 ? diagnostic : undefined,
results: results as Array<{ title: string; url: string; snippet: string; engine?: string }>
};
return {
content: [{ type: "text", text: JSON.stringify(output, null, 2) }],
structuredContent: output
};
}
);
// Tool registration: report whether the managed system Chrome CDP process is
// running and responding.
server.registerTool(
  "chrome_status",
  {
    title: "Get Chrome CDP status",
    description: "Check if system Chrome CDP is running and healthy.",
    inputSchema: z.object({}),
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: false
    }
  },
  async () => {
    // Build the status payload once and reuse it for both content forms.
    const status = {
      running: cdpManager.chrome !== null,
      healthy: await cdpManager.isHealthy(),
      endpoint: cdpManager.chrome?.endpointURL,
      pid: cdpManager.chrome?.proc.pid
    };
    return {
      content: [{ type: "text", text: JSON.stringify(status, null, 2) }],
      structuredContent: status
    };
  }
);
// Tool registration: close the current browser session without stopping a
// cdpManager-launched Chrome process.
server.registerTool(
  "close",
  {
    title: "Close browser session",
    description: "Close current browser session/context. Keeps Chrome CDP running if launched.",
    inputSchema: z.object({}),
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: false
    }
  },
  async () => {
    await closeAll();
    const payload = {
      ok: true,
      chromeStillRunning: cdpManager.chrome !== null
    };
    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }],
      structuredContent: payload
    };
  }
);
// Tool registration: fully stop the managed Chrome CDP process after closing
// the current session.
server.registerTool(
  "shutdown_chrome_cdp",
  {
    title: "Shutdown system Chrome CDP",
    description: "Shutdown the system Chrome CDP process and cleanup temp directories.",
    inputSchema: z.object({}),
    annotations: {
      readOnlyHint: true,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: false
    }
  },
  async () => {
    await closeAll();
    await cdpManager.shutdown();
    const payload = { ok: true, message: "Chrome CDP shutdown complete" };
    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }],
      structuredContent: payload
    };
  }
);
// Serve MCP over stdio — the default transport for local clients.
async function runStdio() {
  await server.connect(new StdioServerTransport());
  // Log to stderr: stdout is reserved for the MCP protocol stream.
  console.error("MCP server running via stdio");
}
/**
 * Serve MCP over streamable HTTP on POST /mcp.
 *
 * Runs in stateless mode: a fresh transport per request
 * (sessionIdGenerator: undefined) with JSON responses enabled.
 * The listen port comes from the PORT env var, defaulting to 3000.
 */
async function runHTTP() {
  const app = express();
  app.use(express.json());
  app.post('/mcp', async (req: express.Request, res: express.Response) => {
    const transport = new StreamableHTTPServerTransport({
      sessionIdGenerator: undefined,
      enableJsonResponse: true
    });
    // Tear the transport down when the client disconnects.
    res.on('close', () => transport.close());
    await server.connect(transport);
    await transport.handleRequest(req, res, req.body);
  });
  // FIX: parseInt previously had no radix and no NaN guard, so a malformed
  // PORT value (e.g. "abc") produced NaN and made app.listen throw at startup.
  const parsed = Number.parseInt(process.env.PORT ?? '', 10);
  const port = Number.isNaN(parsed) ? 3000 : parsed;
  app.listen(port, () => {
    console.error(`MCP server running on http://localhost:${port}/mcp`);
  });
}
// Pick the transport from the environment: TRANSPORT=http enables streamable
// HTTP; anything else (including unset) means stdio. Startup failures are
// fatal.
const selectedTransport = process.env.TRANSPORT || 'stdio';
const start = selectedTransport === 'http' ? runHTTP : runStdio;
start().catch(error => {
  console.error("Server error:", error);
  process.exit(1);
});
// Graceful shutdown: on SIGINT/SIGTERM, best-effort stop the managed Chrome
// CDP process before exiting.
for (const signal of ['SIGINT', 'SIGTERM'] as const) {
  process.on(signal, async () => {
    console.error('Shutting down...');
    await cdpManager.shutdown().catch(() => {});
    process.exit(0);
  });
}