DocsFetcher MCP Server

import * as cheerio from "cheerio";
import { APISignature, CodeExample } from "../types/index.js";

/** Selector matching every HTML heading level. */
const DOCS_HEADING_SELECTOR = "h1, h2, h3, h4, h5, h6";

/** Elements that are treated as potential code examples. */
const DOCS_CODE_EXAMPLE_SELECTOR =
  'pre code, pre, code, .highlight, .code-example, [class*="code"], [class*="example"]';

/** Elements that are treated as an API signature when they follow a heading. */
const DOCS_SIGNATURE_SELECTOR = "pre, code, .signature, .function-signature";

/** Code blocks whose trimmed text is shorter than this are ignored. */
const DOCS_MIN_CODE_LENGTH = 10;

/** Headings longer than this are assumed not to be API names. */
const DOCS_MAX_HEADING_LENGTH = 100;

/** Path / link-text fragments that mark a link as documentation-related. */
const DOCS_LINK_KEYWORDS = [
  "api",
  "reference",
  "doc",
  "guide",
  "tutorial",
  "example",
  "usage",
  "getting-started",
  "introduction",
  "started",
];

/** Heading fragments (lower-case) that mark a section as non-API prose. */
const DOCS_SKIPPED_HEADING_FRAGMENTS = ["introduction", "getting started"];

/**
 * Matches a whole class token of the form `language-xxx`, `lang-xxx` or
 * `syntax-xxx`. Anchored to a token start so that e.g. `slang-foo` does not
 * match, and capturing the full token so that `language-c++` yields `c++`.
 */
const DOCS_LANGUAGE_CLASS_PATTERN = /(?:^|\s)(?:language|lang|syntax)-(\S+)/i;

/** Bare class tokens that name a language without a `language-` prefix. */
const DOCS_CLASS_TOKEN_LANGUAGES = new Map<string, string>([
  ["js", "javascript"],
  ["javascript", "javascript"],
  ["ts", "typescript"],
  ["typescript", "typescript"],
]);

/** Returns `url` with any `#fragment` suffix removed. */
function stripUrlFragment(url: string): string {
  const hashIndex = url.indexOf("#");
  return hashIndex === -1 ? url : url.slice(0, hashIndex);
}

/** Collapses every run of whitespace to a single space and trims the ends. */
function collapseWhitespace(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Derives a language name from an element's `class` attribute value.
 * Returns an empty string when no class token identifies a language.
 */
function languageFromClassName(className: string): string {
  const prefixed = DOCS_LANGUAGE_CLASS_PATTERN.exec(className);
  if (prefixed !== null && prefixed[1]) {
    return prefixed[1];
  }

  // Compare whole tokens: a substring test would misread classes such as
  // "hljs" as JavaScript or "snippets" as TypeScript.
  for (const token of className.toLowerCase().split(/\s+/)) {
    const language = DOCS_CLASS_TOKEN_LANGUAGES.get(token);
    if (language !== undefined) {
      return language;
    }
  }

  return "";
}

/**
 * Extract relevant links from HTML content.
 *
 * A link is kept when it points at the same hostname as `baseUrl`, is not
 * merely a fragment of the base page itself, and either its path or link text
 * contains a documentation keyword or its path contains the library name.
 *
 * @param html HTML content
 * @param baseUrl Base URL of the page; relative links are resolved against it
 * @param libraryName Name of the library; ignored for matching when blank
 * @returns De-duplicated absolute URLs in document order
 * @throws TypeError if `baseUrl` is not a valid absolute URL
 */
export function extractRelevantLinks(
  html: string,
  baseUrl: string,
  libraryName: string
): string[] {
  const $ = cheerio.load(html);
  const links = new Set<string>();
  const baseUrlObj = new URL(baseUrl);
  // Compare against the *normalized* base href so that e.g. "https://host"
  // and the resolved "https://host/#section" are recognized as the same page.
  const basePageUrl = stripUrlFragment(baseUrlObj.href);
  const libraryNameLower = libraryName.trim().toLowerCase();

  $("a[href]").each((_, element) => {
    const $link = $(element);
    const href = $link.attr("href");
    if (!href) return;

    let urlObj: URL;
    try {
      // Convert relative URLs to absolute
      urlObj = new URL(href, baseUrl);
    } catch {
      // Unparseable href: skip this link, keep processing the others
      return;
    }

    // Only include links from the same hostname
    if (urlObj.hostname !== baseUrlObj.hostname) return;

    const linkText = $link.text().toLowerCase();
    const linkPath = urlObj.pathname.toLowerCase();

    const matchesKeyword = DOCS_LINK_KEYWORDS.some(
      (keyword) => linkPath.includes(keyword) || linkText.includes(keyword)
    );
    // Guard against an empty name: "".includes("") is always true and would
    // mark every same-host link as relevant.
    const matchesLibrary =
      libraryNameLower.length > 0 && linkPath.includes(libraryNameLower);
    if (!matchesKeyword && !matchesLibrary) return;

    // Avoid hash links to the same page
    if (stripUrlFragment(urlObj.href) !== basePageUrl) {
      links.add(urlObj.href);
    }
  });

  return Array.from(links);
}

/**
 * Extract code examples from HTML content.
 *
 * Every element matching the code-example selector is considered. Identical
 * code text found on several elements (typically `<pre>` and its nested
 * `<code>`, or a highlight wrapper around them) produces a single example;
 * language and description found on the later duplicates fill in whatever the
 * first occurrence was missing.
 *
 * @param html HTML content
 * @returns Code examples in document order of first occurrence, with
 *   lower-cased language (empty when unknown) and a best-effort description
 */
export function extractCodeExamples(html: string): CodeExample[] {
  const $ = cheerio.load(html);
  // Keyed by code text; Map iteration preserves first-insertion order.
  const examplesByCode = new Map<
    string,
    { code: string; language: string; description: string }
  >();

  $(DOCS_CODE_EXAMPLE_SELECTOR).each((_, element) => {
    const $elem = $(element);

    // Skip non-pre/code markup nested inside a code block (for example
    // highlighted token spans whose class happens to contain "code").
    if (!$elem.is("pre, code") && $elem.parents("pre, code").length > 0) {
      return;
    }

    const code = $elem.text().trim();
    if (code.length < DOCS_MIN_CODE_LENGTH) return; // Skip very short code blocks

    // Prefer the class attribute, then fall back to explicit attributes.
    const language = (
      languageFromClassName($elem.attr("class") || "") ||
      $elem.attr("data-language") ||
      $elem.attr("data-lang") ||
      $elem.attr("language") ||
      $elem.attr("lang") ||
      ""
    ).toLowerCase();

    // Description: the immediately preceding heading/paragraph sibling, or
    // failing that the first heading found inside the parent element.
    const $previous = $elem.prev(`${DOCS_HEADING_SELECTOR}, p`);
    const description =
      $previous.length > 0
        ? $previous.text().trim()
        : $elem.parent().find(DOCS_HEADING_SELECTOR).first().text().trim();

    const existing = examplesByCode.get(code);
    if (existing === undefined) {
      examplesByCode.set(code, { code, language, description });
      return;
    }

    // Same code seen again: merge any metadata the earlier element lacked.
    if (!existing.language) existing.language = language;
    if (!existing.description) existing.description = description;
  });

  return Array.from(examplesByCode.values());
}

/**
 * Extract API signatures from HTML content.
 *
 * Each heading is treated as a candidate API name. Its section is the run of
 * following sibling elements up to the next heading of any level; the first
 * signature-like element and the first paragraph in that section become the
 * signature and description. Headings with neither are dropped.
 *
 * @param html HTML content
 * @param libraryName Name of the library (currently unused; kept so existing
 *   callers keep working)
 * @returns API signatures in document order
 */
export function extractAPISignatures(
  html: string,
  libraryName: string
): APISignature[] {
  const $ = cheerio.load(html);
  const signatures: APISignature[] = [];

  $(DOCS_HEADING_SELECTOR).each((_, heading) => {
    const $heading = $(heading);
    const name = collapseWhitespace($heading.text());
    const nameLower = name.toLowerCase();

    // Skip empty headings, very long headings and common non-API sections
    if (
      name.length === 0 ||
      name.length > DOCS_MAX_HEADING_LENGTH ||
      DOCS_SKIPPED_HEADING_FRAGMENTS.some((fragment) =>
        nameLower.includes(fragment)
      )
    ) {
      return;
    }

    // Siblings after this heading, stopping before the next heading, so that
    // content belonging to a later section is never attributed to this one.
    const $section = $heading.nextUntil(DOCS_HEADING_SELECTOR);

    // .text() on an empty selection is "", so missing parts become "".
    const signature = collapseWhitespace(
      $section.filter(DOCS_SIGNATURE_SELECTOR).first().text()
    );
    const description = collapseWhitespace(
      $section.filter("p").first().text()
    );

    // Only add if we have either a signature or description
    if (signature || description) {
      signatures.push({ name, signature, description });
    }
  });

  return signatures;
}