// RAG Documentation MCP Server — by rahulretnan
// Path: src/tools
import { ErrorCode, McpError } from "@modelcontextprotocol/sdk/types.js";
import * as cheerio from "cheerio";
import fs from "fs/promises";
import path from "path";
import { fileURLToPath } from "url";
import { ApiClient } from "../api-client.js";
import { McpToolResponse, ToolDefinition } from "../types.js";
import { BaseTool } from "./base-tool.js";
// Get current directory in ES modules
// (import.meta.url is a file:// URL; fileURLToPath converts it to a plain
// filesystem path so the CommonJS-style __filename/__dirname names can be
// reconstructed under ESM, where they do not exist natively.)
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Queue file sits two directories above this module — presumably the project
// root at runtime (e.g. dist/tools -> project root); confirm against the
// build output layout.
const QUEUE_FILE = path.join(__dirname, "..", "..", "queue.txt");
/**
 * MCP tool that loads a web page in a headless browser and extracts every
 * same-origin link from it, optionally appending the results to the crawl
 * queue file (QUEUE_FILE).
 */
export class ExtractUrlsTool extends BaseTool {
  constructor(private readonly apiClient: ApiClient) {
    super();
  }

  /** Tool metadata (name, description, JSON input schema) exposed to the MCP client. */
  get definition(): ToolDefinition {
    return {
      name: "extract_urls",
      description: "Extract all URLs from a given web page",
      inputSchema: {
        type: "object",
        properties: {
          url: {
            type: "string",
            description: "URL of the page to extract URLs from",
          },
          add_to_queue: {
            type: "boolean",
            description:
              "If true, automatically add extracted URLs to the queue",
            default: false,
          },
        },
        required: ["url"],
      },
    };
  }

  /**
   * Render `args.url`, collect all same-origin anchor hrefs (fragments
   * stripped, duplicates removed), and either return them as text or append
   * them to the queue file when `args.add_to_queue` is true.
   *
   * @param args - Expected shape `{ url: string; add_to_queue?: boolean }`;
   *   validated at runtime because it arrives untyped from the MCP client.
   * @returns A text response with the URL list, a queue confirmation, or an
   *   error response (`isError: true`) when navigation or the queue write fails.
   * @throws McpError (InvalidParams) when `url` is missing or not a string.
   */
  async execute(args: any): Promise<McpToolResponse> {
    if (!args.url || typeof args.url !== "string") {
      throw new McpError(ErrorCode.InvalidParams, "URL is required");
    }

    await this.apiClient.initBrowser();
    const page = await this.apiClient.browser.newPage();
    try {
      await page.goto(args.url, { waitUntil: "networkidle" });
      const content = await page.content();
      const $ = cheerio.load(content);

      // Hoisted out of the per-link loop: the base origin never changes.
      const baseOrigin = new URL(args.url).origin;
      const urls = new Set<string>();

      $("a[href]").each((_, element) => {
        const href = $(element).attr("href");
        if (!href) return;
        try {
          const url = new URL(href, args.url);
          // Only keep same-origin links to avoid crawling external sites.
          // mailto:, javascript:, etc. resolve to a different origin and are
          // filtered out here too.
          if (url.origin === baseOrigin) {
            // Fix: strip the fragment instead of discarding the whole link,
            // so pages reachable only via "#section" anchors are still
            // collected; the Set then deduplicates fragment variants of the
            // same page. (Setting hash to "" removes the trailing "#".)
            url.hash = "";
            urls.add(url.href);
          }
        } catch {
          // Ignore hrefs that are not valid URLs.
        }
      });

      const urlArray = Array.from(urls);

      if (args.add_to_queue) {
        try {
          // fs.appendFile creates the file when it does not exist, so the
          // previous access()/writeFile() existence check was redundant (and
          // racy). Appending "" when there are no URLs preserves the original
          // side effect that the queue file exists afterwards.
          const urlsToAdd =
            urlArray.length > 0 ? urlArray.join("\n") + "\n" : "";
          await fs.appendFile(QUEUE_FILE, urlsToAdd);
          return {
            content: [
              {
                type: "text",
                text: `Successfully added ${urlArray.length} URLs to the queue`,
              },
            ],
          };
        } catch (error) {
          return {
            content: [
              {
                type: "text",
                text: `Failed to add URLs to queue: ${error}`,
              },
            ],
            isError: true,
          };
        }
      }

      return {
        content: [
          {
            type: "text",
            text: urlArray.join("\n") || "No URLs found on this page.",
          },
        ],
      };
    } catch (error) {
      return {
        content: [
          {
            type: "text",
            text: `Failed to extract URLs: ${error}`,
          },
        ],
        isError: true,
      };
    } finally {
      // Always release the browser page, even when extraction fails.
      await page.close();
    }
  }
}