import { cache } from "./cache.js";
/** Origin of the Fumadocs site; every request URL in this module is built from it. */
const FUMADOCS_BASE_URL = "https://www.fumadocs.dev";
/** Milliseconds after which a pending request is aborted (30 seconds). */
const REQUEST_TIMEOUT = 30_000;
/** Default number of additional attempts `fetchWithRetry` may make. */
const MAX_RETRIES = 2;
/** Base delay in milliseconds between network-error retries (1 second); scaled per attempt. */
const RETRY_DELAY = 1_000;
/**
 * Error type thrown by this module's fetch helpers.
 *
 * Carries a machine-readable `code` (for example "PAGE_NOT_FOUND" or
 * "TIMEOUT") and an optional bag of structured `details` next to the
 * human-readable message.
 */
export class FumadocsError extends Error {
  /** Machine-readable error category. */
  public code: string;
  /** Optional structured context, such as the URL or HTTP status involved. */
  public details?: Record<string, unknown>;

  constructor(
    message: string,
    code: string,
    details?: Record<string, unknown>
  ) {
    super(message);
    this.code = code;
    this.details = details;
    this.name = "FumadocsError";
  }
}
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Fetches `url` and returns the response body as text.
 *
 * Each attempt is guarded by an abort timer of `REQUEST_TIMEOUT` ms that stays
 * armed until the body has been fully read, so a server that sends headers and
 * then stalls mid-body is reported as a timeout instead of hanging forever.
 *
 * Retry policy:
 * - HTTP 429: wait 5 seconds and retry while `retries` remain.
 * - Network-level failures: wait `RETRY_DELAY` times the attempt number, then
 *   retry while `retries` remain.
 * - 404, other non-2xx statuses and timeouts are not retried.
 *
 * @param url - Absolute URL to request.
 * @param retries - Remaining retry budget; defaults to `MAX_RETRIES`.
 * @returns The response body decoded as text.
 * @throws FumadocsError with code `PAGE_NOT_FOUND`, `HTTP_ERROR`, `TIMEOUT`,
 *   `NETWORK_ERROR` or `UNKNOWN_ERROR`.
 */
async function fetchWithRetry(
  url: string,
  retries = MAX_RETRIES
): Promise<string> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "fumadocs-mcp/1.0.0",
        Accept: "text/html,text/plain,text/markdown,*/*",
      },
    });

    if (!response.ok) {
      // The body of an error response is never read, so this attempt is over;
      // stop the timer before any rate-limit sleep below.
      clearTimeout(timeoutId);
      if (response.status === 404) {
        throw new FumadocsError(
          `Page not found: ${url}`,
          "PAGE_NOT_FOUND",
          { url, status: 404 }
        );
      }
      if (response.status === 429 && retries > 0) {
        await sleep(5000); // Wait 5 seconds for rate limiting
        // Returned without `await` on purpose: failures of the nested attempt
        // must not be re-handled (and re-retried) by this attempt's catch.
        return fetchWithRetry(url, retries - 1);
      }
      throw new FumadocsError(
        `HTTP error ${response.status}: ${response.statusText}`,
        "HTTP_ERROR",
        { url, status: response.status }
      );
    }

    // Keep the abort timer running while the body streams in; only disarm it
    // once the whole payload has been received.
    const body = await response.text();
    clearTimeout(timeoutId);
    return body;
  } catch (error) {
    clearTimeout(timeoutId);
    if (error instanceof FumadocsError) {
      throw error;
    }
    if (error instanceof Error) {
      if (error.name === "AbortError") {
        throw new FumadocsError(
          `Request timeout after ${REQUEST_TIMEOUT}ms`,
          "TIMEOUT",
          { url }
        );
      }
      // Network error - retry with a delay that grows with the attempt number
      if (retries > 0) {
        await sleep(RETRY_DELAY * (MAX_RETRIES - retries + 1));
        return fetchWithRetry(url, retries - 1);
      }
      throw new FumadocsError(
        `Network error: ${error.message}`,
        "NETWORK_ERROR",
        { url, originalError: error.message }
      );
    }
    throw new FumadocsError(
      "Unknown error occurred",
      "UNKNOWN_ERROR",
      { url }
    );
  }
}
/**
 * Structured form of the llms.txt index as produced by `parseDocsIndex`:
 * a list of sections, each holding the documentation entries listed under it.
 */
export interface DocsIndex {
  sections: Array<{
    /** Section heading text. */
    name: string;
    /** Identifier derived from the heading (lower-cased, whitespace replaced by "-"). */
    id: string;
    /** Links listed under the section. */
    entries: Array<{
      title: string;
      path: string;
      /** Text after the link's colon; empty string when absent. */
      description: string;
    }>;
  }>;
}
/**
 * Returns the contents of `/llms.txt` from the Fumadocs site.
 *
 * A truthy cached copy stored under the "llms-txt" key is returned as-is;
 * otherwise the file is downloaded and stored in the cache with the
 * "DOCS_INDEX" tag before being returned.
 *
 * @throws FumadocsError when the download fails.
 */
export async function fetchLlmsTxt(): Promise<string> {
  const key = "llms-txt";
  const hit = cache.get<string>(key);
  if (!hit) {
    const body = await fetchWithRetry(`${FUMADOCS_BASE_URL}/llms.txt`);
    cache.set(key, body, "DOCS_INDEX");
    return body;
  }
  return hit;
}
/**
 * Returns the contents of `/llms-full.txt` from the Fumadocs site.
 *
 * A truthy cached copy stored under the "llms-full-txt" key is returned as-is;
 * otherwise the file is downloaded and stored in the cache with the
 * "FULL_DOCS" tag before being returned.
 *
 * @throws FumadocsError when the download fails.
 */
export async function fetchLlmsFullTxt(): Promise<string> {
  const key = "llms-full-txt";
  const hit = cache.get<string>(key);
  if (!hit) {
    const body = await fetchWithRetry(`${FUMADOCS_BASE_URL}/llms-full.txt`);
    cache.set(key, body, "FULL_DOCS");
    return body;
  }
  return hit;
}
/**
 * Returns the content of a single documentation page.
 *
 * Lookup order:
 * 1. the cache, under `page:<path>`;
 * 2. the page's section inside llms-full.txt (already markdown);
 * 3. the HTML page itself, converted by `extractContentFromHtml`.
 *
 * Any failure in step 2 is deliberately ignored so that step 3 still runs.
 * Results from steps 2 and 3 are cached with the "PAGE_CONTENT" tag.
 *
 * @param path - Page path, with or without a leading slash.
 * @throws FumadocsError when the direct HTML fetch fails.
 */
export async function fetchPage(path: string): Promise<string> {
  const pagePath = path.startsWith("/") ? path : `/${path}`;
  const key = `page:${pagePath}`;

  const hit = cache.get<string>(key);
  if (hit) {
    return hit;
  }

  // Stores the text for this page and hands it back, so both sources share one exit path.
  const remember = (text: string): string => {
    cache.set(key, text, "PAGE_CONTENT");
    return text;
  };

  try {
    const fromFullDocs = extractPageFromFullDocs(await fetchLlmsFullTxt(), pagePath);
    if (fromFullDocs) {
      return remember(fromFullDocs);
    }
  } catch {
    // Best effort only: continue with the direct HTML fetch below.
  }

  const html = await fetchWithRetry(`${FUMADOCS_BASE_URL}${pagePath}`);
  return remember(extractContentFromHtml(html));
}
/**
 * Extracts one page from the concatenated llms-full.txt dump.
 *
 * Pages in that file are laid out as:
 *
 *   # Title
 *   URL: /path
 *   Source: https://...
 *
 * The page is located by its `URL:` marker. The returned slice starts at the
 * `# ` heading that directly precedes the marker (when there is one) and ends
 * just before the next "# heading + URL:" pair, or at the end of the file.
 *
 * @param fullDocs - Entire llms-full.txt content.
 * @param path - Page path, with or without a leading slash.
 * @returns The trimmed page text, or `null` when the page is not found or the
 *   extracted text is 50 characters or shorter.
 */
function extractPageFromFullDocs(fullDocs: string, path: string): string | null {
  const normalizedPath = path.startsWith("/") ? path : `/${path}`;

  // Locate the page by its URL marker. The trailing `\s*\n` keeps "/docs"
  // from matching the marker of "/docs/ui".
  const urlPattern = new RegExp(`URL:\\s*${escapeRegex(normalizedPath)}\\s*\\n`);
  const urlMatch = urlPattern.exec(fullDocs);
  if (!urlMatch) {
    return null;
  }
  const urlIndex = urlMatch.index;

  // The heading must be the last non-blank line before the marker. `(?:^|\n)`
  // also accepts a heading at the very start of the file, so the first page
  // keeps its title.
  const headingMatch = /(?:^|\n)(# [^\n]+)\s*$/.exec(fullDocs.slice(0, urlIndex));
  let startIndex = urlIndex;
  if (headingMatch) {
    // Skip the newline that introduced the heading, if the match has one.
    startIndex = headingMatch.index + (headingMatch[0].startsWith("\n") ? 1 : 0);
  }

  // Search for the next page from the marker itself rather than from after the
  // whitespace the marker match consumed; otherwise a heading that directly
  // follows the URL line would be missed and the next page glued onto this one.
  const nextPageMatch = /\n# [^\n]+\nURL:/.exec(fullDocs.slice(urlIndex));
  const endIndex = nextPageMatch ? urlIndex + nextPageMatch.index : fullDocs.length;

  const content = fullDocs.slice(startIndex, endIndex).trim();
  // Anything this short is just the header lines, not a real page body.
  return content.length > 50 ? content : null;
}

/**
 * Escapes every regular-expression metacharacter in `str` so it can be
 * embedded in a `RegExp` source and matched literally.
 */
function escapeRegex(str: string): string {
  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
/** Named entities understood by `decodeHtmlEntities`; any other name is left untouched. */
const NAMED_HTML_ENTITIES = new Map<string, string>([
  ["lt", "<"],
  ["gt", ">"],
  ["amp", "&"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/**
 * Decodes HTML character references in a single pass.
 *
 * Handles the named entities in `NAMED_HTML_ENTITIES` plus decimal (`&#39;`)
 * and hexadecimal (`&#x27;`) references. Decoding in one pass means text such
 * as `&amp;lt;` becomes `&lt;` and is not decoded a second time. Unknown names
 * and out-of-range code points are returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (!body.startsWith("#")) {
      return NAMED_HTML_ENTITIES.get(body) ?? entity;
    }
    const isHex = body.charAt(1).toLowerCase() === "x";
    const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
    // fromCodePoint (unlike fromCharCode) handles astral characters such as
    // emoji, but throws above U+10FFFF, so guard the range.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Converts a Fumadocs HTML page into markdown-like text.
 *
 * Steps: read the title and meta description, drop non-content elements,
 * narrow to the main content area (article > main > "prose" div), turn code
 * blocks, headings, tables, lists, links and emphasis into markdown, strip the
 * remaining tags, decode entities and tidy whitespace.
 *
 * @param html - Raw HTML document.
 * @returns `# title`, the description and the converted body, joined by
 *   newlines; parts that are empty are omitted.
 */
function extractContentFromHtml(html: string): string {
  // Extract page title; drop a trailing " | Site" / " - Site" style suffix.
  // Dashes only count as separators when surrounded by whitespace, so
  // hyphenated words ("Built-in") stay intact.
  let title = "";
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    title = decodeHtmlEntities(titleMatch[1])
      .replace(/\s*(?:\||\s[-–—]\s).*$/, "")
      .trim();
  }

  // Extract description from meta tag
  let description = "";
  const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]*)"[^>]*>/i);
  if (descMatch) {
    description = decodeHtmlEntities(descMatch[1]);
  }

  // Remove script, style, nav, and other non-content elements
  let content = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, "")
    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, "")
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, "")
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, "");

  // Narrow to the main content area. Priority: article > main > div with prose class
  const articleMatch = content.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
  if (articleMatch) {
    content = articleMatch[1];
  } else {
    const mainMatch = content.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
    if (mainMatch) {
      content = mainMatch[1];
    } else {
      // Try to find prose content (common in Tailwind-based docs)
      const proseMatch = content.match(/<div[^>]*class="[^"]*prose[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
      if (proseMatch) {
        content = proseMatch[1];
      }
    }
  }

  // Extract code blocks with language hints before general processing
  // Match pre > code with data-language or class="language-xxx"
  content = content.replace(
    /<pre[^>]*>[\s\S]*?<code[^>]*(?:data-language="([^"]*)"[^>]*|class="[^"]*language-([^"\s]*)[^"]*"[^>]*)>([\s\S]*?)<\/code>[\s\S]*?<\/pre>/gi,
    (_match: string, lang1: string | undefined, lang2: string | undefined, code: string) => {
      const lang = lang1 || lang2 || "";
      return `\n\`\`\`${lang}\n${code}\n\`\`\`\n`;
    }
  );
  // Handle remaining code blocks without language
  content = content.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, "\n```\n$1\n```\n");

  // Convert HTML to markdown-like format, then strip whatever tags are left.
  const withoutTags = content
    // Headings (preserve id for anchors)
    .replace(/<h1[^>]*id="([^"]*)"[^>]*>([\s\S]*?)<\/h1>/gi, "\n# $2 [#$1]\n")
    .replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, "\n# $1\n")
    .replace(/<h2[^>]*id="([^"]*)"[^>]*>([\s\S]*?)<\/h2>/gi, "\n## $2 [#$1]\n")
    .replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, "\n## $1\n")
    .replace(/<h3[^>]*id="([^"]*)"[^>]*>([\s\S]*?)<\/h3>/gi, "\n### $2 [#$1]\n")
    .replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, "\n### $1\n")
    .replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, "\n#### $1\n")
    // Inline code
    .replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, "`$1`")
    // Tables
    .replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (_match: string, tableContent: string) =>
      convertTableToMarkdown(tableContent)
    )
    // Lists
    .replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, "$1")
    .replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, "$1")
    .replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, "- $1\n")
    // Paragraphs and breaks
    .replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, "\n$1\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<hr\s*\/?>/gi, "\n---\n")
    // Links
    .replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, "[$2]($1)")
    // Bold and italic
    .replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, "**$1**")
    .replace(/<b[^>]*>([\s\S]*?)<\/b>/gi, "**$1**")
    .replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, "*$1*")
    .replace(/<i[^>]*>([\s\S]*?)<\/i>/gi, "*$1*")
    // Blockquotes
    .replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_match: string, quote: string) =>
      quote.split("\n").map((line: string) => `> ${line}`).join("\n")
    )
    // Definition lists (common in API docs)
    .replace(/<dt[^>]*>([\s\S]*?)<\/dt>/gi, "\n**$1**\n")
    .replace(/<dd[^>]*>([\s\S]*?)<\/dd>/gi, ": $1\n")
    // Remove divs but keep content
    .replace(/<div[^>]*>([\s\S]*?)<\/div>/gi, "$1")
    .replace(/<span[^>]*>([\s\S]*?)<\/span>/gi, "$1")
    // Remove remaining tags
    .replace(/<[^>]+>/g, "");

  // Entities are decoded only after tag stripping, so escaped markup such as
  // "&lt;Card&gt;" survives as literal text instead of being removed as a tag.
  content = decodeHtmlEntities(withoutTags)
    // Clean up whitespace
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]+/g, " ")
    .trim();

  // Build final output with metadata
  const output: string[] = [];
  if (title) {
    output.push(`# ${title}`);
  }
  if (description) {
    output.push(`\n${description}\n`);
  }
  if (content) {
    output.push(content);
  }
  return output.join("\n").trim();
}

/**
 * Reduces the HTML of one table cell to a single line of text that is safe
 * inside a markdown table: tags removed, whitespace (including newlines)
 * collapsed, and `|` escaped so it is not read as a column separator.
 */
function tableCellText(cellHtml: string): string {
  return cellHtml
    .replace(/<[^>]+>/g, "")
    .replace(/\s+/g, " ")
    .trim()
    .replace(/\|/g, "\\|");
}

/**
 * Converts the inner HTML of a `<table>` into a markdown table.
 *
 * The header comes from the `<th>` cells of `<thead>` when present; otherwise
 * the first body row acts as the header, because markdown tables require one.
 * Body rows are read from the first `<tbody>`, or from the whole table (minus
 * an already-consumed `<thead>`) when there is no `<tbody>`.
 *
 * @param tableHtml - Markup found between `<table ...>` and `</table>`.
 * @returns The markdown table wrapped in newlines, or "" when no rows exist.
 */
function convertTableToMarkdown(tableHtml: string): string {
  const rows: string[][] = [];
  const headerMatch = tableHtml.match(/<thead[^>]*>([\s\S]*?)<\/thead>/i);
  const bodyMatch = tableHtml.match(/<tbody[^>]*>([\s\S]*?)<\/tbody>/i);

  // Extract header row
  let headerTaken = false;
  if (headerMatch) {
    const headerCells: string[] = [];
    for (const match of headerMatch[1].matchAll(/<th[^>]*>([\s\S]*?)<\/th>/gi)) {
      headerCells.push(tableCellText(match[1]));
    }
    if (headerCells.length > 0) {
      rows.push(headerCells);
      headerTaken = true;
    }
  }

  // Extract body rows. Without a <tbody>, scan the whole table but leave out
  // the <thead> that was just consumed so the header row is not emitted twice.
  let rowSource = tableHtml;
  if (bodyMatch) {
    rowSource = bodyMatch[1];
  } else if (headerMatch && headerTaken) {
    rowSource = tableHtml.replace(headerMatch[0], "");
  }
  for (const rowMatch of rowSource.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)) {
    const cells: string[] = [];
    for (const cellMatch of rowMatch[1].matchAll(/<t[dh][^>]*>([\s\S]*?)<\/t[dh]>/gi)) {
      cells.push(tableCellText(cellMatch[1]));
    }
    if (cells.length > 0) {
      rows.push(cells);
    }
  }

  if (rows.length === 0) return "";

  // Build markdown table
  const colCount = Math.max(...rows.map((r) => r.length));
  const lines: string[] = [];
  rows.forEach((row, i) => {
    // Pad row to have consistent columns
    while (row.length < colCount) row.push("");
    lines.push(`| ${row.join(" | ")} |`);
    // Add separator after header
    if (i === 0) {
      lines.push(`| ${Array<string>(colCount).fill("---").join(" | ")} |`);
    }
  });
  return "\n" + lines.join("\n") + "\n";
}
/**
 * Parses the text of llms.txt into a `DocsIndex`.
 *
 * Recognised lines (after trimming):
 * - `## Name` starts a new section; its id is the lower-cased name with each
 *   run of whitespace replaced by "-".
 * - `- [title](path): description` adds an entry to the current section; the
 *   `: description` part is optional and defaults to "".
 *
 * Entries that appear before the first section, and all other lines, are
 * ignored.
 *
 * @param llmsTxt - Raw llms.txt content.
 * @returns Sections in document order.
 */
export function parseDocsIndex(llmsTxt: string): DocsIndex {
  const entryPattern = /^-\s*\[([^\]]+)\]\(([^)]+)\)(?::\s*(.*))?$/;
  const sections: DocsIndex["sections"] = [];
  let active: DocsIndex["sections"][0] | null = null;

  for (const rawLine of llmsTxt.split("\n")) {
    const text = rawLine.trim();

    // Section header (## section-name)
    if (text.startsWith("## ")) {
      const heading = text.slice(3).trim();
      active = {
        name: heading,
        id: heading.toLowerCase().replace(/\s+/g, "-"),
        entries: [],
      };
      sections.push(active);
      continue;
    }

    // Entries only count once a section has been opened.
    if (active === null) {
      continue;
    }

    // Entry line (- [title](path): description)
    const parts = entryPattern.exec(text);
    if (parts === null) {
      continue;
    }
    const [, title, path, description] = parts;
    active.entries.push({ title, path, description: description || "" });
  }

  return { sections };
}