// DocsFetcher MCP Server
// by cdugo
// src / utils
import * as cheerio from "cheerio";
import { APISignature, CodeExample } from "../types/index.js";
/**
 * Extract links that look like documentation pages from HTML content.
 *
 * A link is kept when it points at the same hostname as `baseUrl`, does not
 * merely point back at the current page (ignoring the fragment), and either
 * its path or its visible text contains one of the documentation keywords, or
 * its path contains the library name.
 *
 * @param html HTML content
 * @param baseUrl Base URL of the page; relative hrefs are resolved against it
 * @param libraryName Name of the library; ignored for matching when blank
 * @returns De-duplicated array of absolute URLs, in document order
 * @throws TypeError if `baseUrl` is not a valid absolute URL
 */
export function extractRelevantLinks(
  html: string,
  baseUrl: string,
  libraryName: string
): string[] {
  const $ = cheerio.load(html);
  const links = new Set<string>();
  const baseUrlObj = new URL(baseUrl);
  // Use the parsed (normalized) form of the base so that "https://host" and
  // "https://host/" are recognized as the same page in the check below.
  const basePage = baseUrlObj.href.split("#")[0];
  const libraryNameLower = libraryName.trim().toLowerCase();

  // Keywords that indicate important documentation pages
  const apiKeywords = [
    "api",
    "reference",
    "doc",
    "guide",
    "tutorial",
    "example",
    "usage",
    "getting-started",
    "introduction",
    "started",
  ];

  $("a[href]").each((_, element) => {
    const href = $(element).attr("href");
    if (!href) return;

    // Convert relative URLs to absolute; unparsable hrefs are skipped.
    let urlObj: URL;
    try {
      urlObj = new URL(href, baseUrlObj);
    } catch {
      return;
    }

    // Only include links from the same hostname
    if (urlObj.hostname !== baseUrlObj.hostname) return;

    // Avoid hash links to the same page
    const absoluteUrl = urlObj.href;
    if (absoluteUrl.split("#")[0] === basePage) return;

    const linkText = $(element).text().toLowerCase();
    const linkPath = urlObj.pathname.toLowerCase();

    const matchesKeyword = apiKeywords.some(
      (keyword) => linkPath.includes(keyword) || linkText.includes(keyword)
    );
    // An empty needle is contained in every string, so a blank library name
    // must not be allowed to mark every link as relevant.
    const matchesLibrary =
      libraryNameLower.length > 0 && linkPath.includes(libraryNameLower);

    if (matchesKeyword || matchesLibrary) {
      links.add(absoluteUrl);
    }
  });

  return Array.from(links);
}
/**
 * Extract code examples from HTML content.
 *
 * Candidates are `<pre>`/`<code>` elements plus elements whose class suggests
 * a code or example container. Snippets shorter than 10 characters are
 * ignored. Because the selector matches both a `<pre>` and the `<code>` inside
 * it, candidates with identical text are merged into a single example: the
 * first occurrence wins, and a later duplicate only fills in a language or
 * description the first one was missing.
 *
 * @param html HTML content
 * @returns Array of code examples, in document order of first occurrence
 */
export function extractCodeExamples(html: string): CodeExample[] {
  const $ = cheerio.load(html);
  const headingSelector = "h1, h2, h3, h4, h5, h6";
  // Keyed by the trimmed code text so nested matches collapse into one entry.
  const byCode = new Map<
    string,
    { code: string; language: string; description: string }
  >();

  $(
    'pre code, pre, code, .highlight, .code-example, [class*="code"], [class*="example"]'
  ).each((_, element) => {
    const $elem = $(element);

    // Skip class-matched elements (e.g. highlighter spans) nested inside a
    // pre/code block; the enclosing block already carries their text.
    if (!$elem.is("pre, code") && $elem.parents("pre, code").length > 0) {
      return;
    }

    const code = $elem.text().trim();
    if (code.length < 10) return; // Skip very short code blocks

    // Try to determine the language from class attributes, then from
    // language-carrying attributes.
    const language = (
      detectLanguageFromClass($elem.attr("class") ?? "") ||
      $elem.attr("data-language") ||
      $elem.attr("data-lang") ||
      $elem.attr("language") ||
      $elem.attr("lang") ||
      ""
    ).toLowerCase();

    // Try to find a description for this code block: the element directly
    // before it, else the nearest preceding sibling heading, else the first
    // heading inside the parent element.
    let $heading = $elem.prev(`${headingSelector}, p`);
    if ($heading.length === 0) {
      $heading = $elem.prevAll(headingSelector).first();
    }
    if ($heading.length === 0) {
      $heading = $elem.parent().find(headingSelector).first();
    }
    const description = $heading.text().trim();

    const existing = byCode.get(code);
    if (existing) {
      if (!existing.language) existing.language = language;
      if (!existing.description) existing.description = description;
      return;
    }
    byCode.set(code, { code, language, description });
  });

  return Array.from(byCode.values());
}

/**
 * Derive a language name from a `class` attribute value.
 *
 * Recognizes `language-x` / `lang-x` / `syntax-x` prefixes first, then falls
 * back to whole-word `js`/`javascript` and `ts`/`typescript` tokens. Matching
 * whole tokens (split on non-alphanumerics) avoids false hits such as "hljs",
 * "json", "snippets" or "contents".
 *
 * @param className Raw value of the element's class attribute
 * @returns The detected language, or an empty string when none is found
 */
function detectLanguageFromClass(className: string): string {
  const prefixed = /(?:language|lang|syntax)-([\w+#]+)/i.exec(className);
  if (prefixed?.[1]) return prefixed[1];

  const tokens = className
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter(Boolean);
  if (tokens.includes("js") || tokens.includes("javascript")) {
    return "javascript";
  }
  if (tokens.includes("ts") || tokens.includes("typescript")) {
    return "typescript";
  }
  return "";
}
/**
 * Extract API signatures from HTML content.
 *
 * Every heading (h1-h6) is treated as a potential API entry. Its signature is
 * the first following sibling that is a `pre`, `code`, `.signature` or
 * `.function-signature` element, and its description is the first following
 * sibling paragraph. Either is only accepted when no other heading sits
 * between it and the heading being processed. Headings that are empty, longer
 * than 100 characters, or that mention "introduction" / "getting started" are
 * skipped, as are headings for which neither a signature nor a description
 * was found.
 *
 * @param html HTML content
 * @param libraryName Name of the library (currently not used by the
 *   extraction; kept so existing callers keep working)
 * @returns Array of API signatures, in document order
 */
export function extractAPISignatures(
  html: string,
  libraryName: string
): APISignature[] {
  const $ = cheerio.load(html);
  const headingSelector = "h1, h2, h3, h4, h5, h6";
  const signatures: APISignature[] = [];

  // Collapse runs of whitespace so multi-line markup yields single-line text.
  const cleanText = (text: string): string => text.replace(/\s+/g, " ").trim();

  $(headingSelector).each((_, heading) => {
    const $heading = $(heading);
    const headingText = cleanText($heading.text());
    const headingLower = headingText.toLowerCase();

    // Skip empty headings (they would produce a nameless entry), very long
    // headings, and common non-API sections.
    if (
      headingText.length === 0 ||
      headingText.length > 100 ||
      headingLower.includes("introduction") ||
      headingLower.includes("getting started")
    ) {
      return;
    }

    let signature = "";
    let description = "";

    // Look for code blocks after the heading. The prevAll(...).first() check
    // confirms the nearest heading before the code block is this one, i.e.
    // the block was not claimed by a later sibling heading.
    const $code = $heading
      .nextAll("pre, code, .signature, .function-signature")
      .first();
    if (
      $code.length > 0 &&
      $code.prevAll(headingSelector).first().is($heading)
    ) {
      signature = cleanText($code.text());
    }

    // Look for description paragraphs, with the same ownership check.
    const $description = $heading.nextAll("p").first();
    if (
      $description.length > 0 &&
      $description.prevAll(headingSelector).first().is($heading)
    ) {
      description = cleanText($description.text());
    }

    // Only add if we have either a signature or description
    if (signature || description) {
      signatures.push({
        name: headingText,
        signature,
        description,
      });
    }
  });

  return signatures;
}