generate_llms_txt
Generate a spec-compliant llms.txt for any domain by reading its sitemap and synthesizing a grouped summary of sampled pages. Optionally includes the expanded llms-full.txt variant.
Instructions
Generate a spec-compliant llms.txt (and optionally llms-full.txt) for a domain by reading its sitemap, sampling up to max_pages pages, and synthesizing a grouped, sectioned summary.
Read-only. Issues one HTTP GET for the sitemap then one per sampled page.
Deterministic; no LLM. Output is the file content as a string - this tool does NOT write to disk or upload anywhere. The caller is responsible for hosting the resulting file at https://<domain>/llms.txt.
When to use: bootstrapping llms.txt for a site you own. To check an existing llms.txt, use validate_llms_txt instead.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| domain | Yes | Hostname or origin to generate llms.txt for. Examples: `example.com`, `https://example.com`. The tool reads the domain's sitemap, fetches up to `max_pages` of them, and synthesizes a spec-compliant llms.txt grouped by section. Issues N+1 HTTP GETs: one for the sitemap, then one per sampled page. Read-only. | |
| max_pages | No | How many pages to sample from the sitemap when building section groupings. Default 30. Each page is fetched (one HTTP GET per page) - keep this low for large sites or rate-limited hosts. | |
| include_full | No | If true, also generate llms-full.txt (the expanded variant containing full page text, not just URLs and titles). Default false. The llms-full.txt output can be large; only enable when you actually plan to host both files. | |
| site_name | No | Override the site name used in the generated llms.txt header. If omitted, inferred from the homepage's <title> tag. | |
| site_description | No | Override the site description used in the generated llms.txt header. If omitted, inferred from the homepage's meta description. |
Implementation Reference
- src/tools/generate-llms-txt.ts:67-211 (handler) Main handler function for the generate_llms_txt tool. Fetches sitemap, samples up to max_pages pages, groups by section, and generates llms.txt (and optionally llms-full.txt).
/**
 * Handler for the `generate_llms_txt` MCP tool.
 *
 * Discovers page URLs from the domain's sitemap (falling back to the root
 * page), fetches up to `input.max_pages` of them, and synthesizes a
 * spec-compliant llms.txt (plus, optionally, llms-full.txt). Read-only:
 * performs HTTP GETs but writes nothing to disk.
 *
 * @param input       Validated tool input (domain, max_pages, include_full, header overrides).
 * @param hostDelays  Per-host delay map passed through to politeFetch — presumably a shared
 *                    rate-limiting structure; confirm against politeFetch's contract.
 * @param robotsCache Robots.txt cache keyed by host, shared across fetches.
 * @returns Generated file content(s), count of pages indexed, and validation findings.
 */
export async function generateLlmsTxtTool(
  input: GenerateLlmsTxtInput,
  hostDelays?: HostDelayMap,
  robotsCache?: Map<string, string>
): Promise<LlmsTxtResult> {
  const hostname = normalizeDomain(input.domain);
  const baseUrl = `https://${hostname}`;
  const validation_issues: Finding[] = [];

  // Discover pages from sitemap
  let pageUrls: string[] = [];
  try {
    const sitemapResult = await checkSitemap(
      { domain: hostname, max_urls_to_check: input.max_pages },
      hostDelays,
      robotsCache
    );
    if (sitemapResult.status === "found" && sitemapResult.sitemap_url) {
      // Re-fetch the sitemap to get priority-sorted URLs
      try {
        const sRes = await politeFetch(sitemapResult.sitemap_url, {
          respectRobots: false,
          hostDelays,
          robotsCache,
        });
        const parser = new XMLParser({ ignoreAttributes: false });
        const parsed = parser.parse(sRes.body) as Record<string, unknown>;
        const urlset = parsed["urlset"] as Record<string, unknown> | undefined;
        if (urlset) {
          // A single <url> element parses as an object, multiple as an array;
          // normalize both shapes to an array before sorting.
          const urlEntries = urlset["url"];
          const urlList: Array<Record<string, unknown>> = Array.isArray(urlEntries)
            ? (urlEntries as Array<Record<string, unknown>>)
            : urlEntries
              ? [urlEntries as Record<string, unknown>]
              : [];
          // Highest <priority> first ("0.5" when absent, matching the sitemap
          // spec default), then cap the sample at max_pages.
          pageUrls = urlList
            .sort((a, b) => {
              const pa = parseFloat(String(a["priority"] ?? "0.5"));
              const pb = parseFloat(String(b["priority"] ?? "0.5"));
              return pb - pa;
            })
            .slice(0, input.max_pages)
            .map((u) => String(u["loc"] ?? ""))
            .filter(Boolean);
        }
      } catch {
        // fall through to root page fallback
      }
    }
  } catch {
    // sitemap unavailable
  }

  // Fallback to root page if no sitemap
  if (pageUrls.length === 0) {
    pageUrls = [baseUrl];
    validation_issues.push({
      severity: "warning",
      category: "sitemap",
      where: `https://${hostname}/sitemap.xml`,
      message: "No sitemap found - llms.txt generated from root page only.",
      fix: "Create a sitemap.xml to enable comprehensive llms.txt generation.",
      estimated_impact: "medium",
    });
  }

  // Fetch each page
  const pages: Array<LlmsPage & { fullText?: string }> = [];
  // Header fields start from the explicit overrides (or hostname-derived
  // defaults) and may be refined from the home page's metadata below.
  let siteName = input.site_name ?? hostname;
  let siteDescription = input.site_description ?? `Content from ${hostname}.`;
  for (const url of pageUrls) {
    if (pages.length >= input.max_pages) break;
    try {
      const res = await politeFetch(url, {
        respectRobots: true,
        hostDelays: hostDelays ?? new Map(),
        robotsCache,
      });
      const head = parseHead(res.body);
      const body = parseBody(res.body, url);
      // Last-resort title: the URL path with slashes turned into spaces.
      let pathFallback = "";
      try {
        pathFallback = new URL(url).pathname.replace(/\//g, " ").trim();
      } catch {
        pathFallback = url;
      }
      const title = head.ogTitle ?? head.title ?? (pathFallback || url);
      const description =
        head.metaDescription ??
        head.ogDescription ??
        (body.paragraphs[0] ? body.paragraphs[0].substring(0, 120) : "");
      // Use site root for name/description if this is the home page
      if (url === baseUrl || url === `${baseUrl}/`) {
        if (!input.site_name && head.ogTitle) siteName = head.ogTitle;
        if (!input.site_description && head.metaDescription) {
          siteDescription = head.metaDescription;
        }
      }
      pages.push({
        url,
        title,
        description,
        fullText: body.bodyText.substring(0, 5000),
      });
    } catch {
      // skip pages that fail to fetch
    }
  }

  // Guarantee at least one entry so the generated file is never empty.
  if (pages.length === 0) {
    pages.push({ url: baseUrl, title: siteName, description: siteDescription });
  }

  const groups = groupPagesBySection(pages);
  const llms_txt = buildLlmsTxt(siteName, siteDescription, groups);

  let llms_full_txt: string | null = null;
  if (input.include_full) {
    const { content, truncated } = generateLlmsFullTxt(siteName, siteDescription, pages);
    llms_full_txt = content;
    if (truncated) {
      validation_issues.push({
        severity: "info",
        category: "llms_txt",
        where: "llms-full.txt",
        message: "llms-full.txt was truncated at 500KB.",
        fix: "Reduce max_pages or trim per-page body text extraction.",
      });
    }
  }

  // Validate generated output
  const structuralIssues = validateLlmsTxtContent(llms_txt);
  validation_issues.push(...structuralIssues);

  return {
    llms_txt,
    llms_full_txt,
    pages_indexed: pages.length,
    validation_issues,
    suggested_path: "/llms.txt",
  };
}
- src/tools/generate-llms-txt.ts:18-44
(schema) Zod schema for generate_llms_txt input: domain (required), max_pages (default 30), include_full (default false), site_name (optional), site_description (optional).
/**
 * Zod input schema for the `generate_llms_txt` tool.
 *
 * Constraints and defaults visible here: `domain` is a string of at least 3
 * characters; `max_pages` is an integer in [1, 100] defaulting to 30;
 * `include_full` defaults to false; `site_name` / `site_description` are
 * optional header overrides. The `.describe()` strings double as the
 * user-facing parameter documentation surfaced to MCP clients.
 */
export const generateLlmsTxtInputSchema = z.object({
  domain: z
    .string()
    .min(3)
    .describe("Hostname or origin to generate llms.txt for. Examples: `example.com`, `https://example.com`. The tool reads the domain's sitemap, fetches up to `max_pages` of them, and synthesizes a spec-compliant llms.txt grouped by section. Issues N+1 HTTP GETs: one for the sitemap, then one per sampled page. Read-only."),
  max_pages: z
    .number()
    .int()
    .min(1)
    .max(100)
    .optional()
    .default(30)
    .describe("How many pages to sample from the sitemap when building section groupings. Default 30. Each page is fetched (one HTTP GET per page) - keep this low for large sites or rate-limited hosts."),
  include_full: z
    .boolean()
    .optional()
    .default(false)
    .describe("If true, also generate llms-full.txt (the expanded variant containing full page text, not just URLs and titles). Default false. The llms-full.txt output can be large; only enable when you actually plan to host both files."),
  site_name: z
    .string()
    .optional()
    .describe("Override the site name used in the generated llms.txt header. If omitted, inferred from the homepage's <title> tag."),
  site_description: z
    .string()
    .optional()
    .describe("Override the site description used in the generated llms.txt header. If omitted, inferred from the homepage's meta description."),
});
- src/index.ts:168-179 (registration)MCP server registration for the generate_llms_txt tool, connecting schema and handler via server.tool().
// --- Tool 8: generate_llms_txt ---
// Registers the tool with the MCP server. The joined multi-paragraph string is
// the client-facing tool description; input validation comes from
// generateLlmsTxtInputSchema.shape, and execution is delegated to
// generateLlmsTxtTool via wrapHandler.
server.tool(
  "generate_llms_txt",
  [
    "Generate a spec-compliant llms.txt (and optionally llms-full.txt) for a domain by reading its sitemap, sampling up to `max_pages` pages, and synthesizing a grouped, sectioned summary.",
    "Read-only. Issues one HTTP GET for the sitemap then one per sampled page.",
    "Deterministic; no LLM. Output is the file content as a string - this tool does NOT write to disk or upload anywhere. The caller is responsible for hosting the resulting file at `https://<domain>/llms.txt`.",
    "When to use: bootstrapping llms.txt for a site you own. To check an existing llms.txt, use `validate_llms_txt` instead.",
  ].join("\n\n"),
  generateLlmsTxtInputSchema.shape,
  async (input) => wrapHandler(() => generateLlmsTxtTool(input))
);
- src/lib/llms-txt.ts:34-65 (helper)Core llms.txt content builder (generateLlmsTxt): formats site name, description, and grouped page sections into spec-compliant markdown.
/**
 * Build spec-compliant llms.txt markdown: an H1 site name, a blockquote
 * description, then one H2 section per group with a link-list of its pages.
 *
 * @param siteName        Site name for the `# ` header line.
 * @param siteDescription Short description for the `> ` blockquote line.
 * @param groups          Pages keyed by section ("/docs/"-style prefix, or "Root").
 * @returns The complete llms.txt content, trimmed of trailing whitespace.
 */
export function generateLlmsTxt(
  siteName: string,
  siteDescription: string,
  groups: Map<string, LlmsPage[]>
): string {
  // Turn a "/section/" key into a human-readable heading; the synthetic
  // "Root" bucket is labeled "Pages".
  const headingFor = (section: string): string =>
    section === "Root"
      ? "Pages"
      : section
          .replace(/^\/|\/$/g, "")
          .replace(/-/g, " ")
          .replace(/\b\w/g, (c) => c.toUpperCase());

  // Render one page as a markdown list entry, appending a description
  // capped at 120 characters when one is present.
  const entryFor = (page: LlmsPage): string => {
    const link = `- [${page.title}](${page.url})`;
    return page.description
      ? `${link}: ${page.description.substring(0, 120)}`
      : link;
  };

  const out: string[] = [`# ${siteName}`, "", `> ${siteDescription}`, ""];
  for (const [section, pages] of groups.entries()) {
    out.push(`## ${headingFor(section)}`, "");
    for (const page of pages) {
      out.push(entryFor(page));
    }
    out.push("");
  }
  return out.join("\n").trim();
}
- src/lib/llms-txt.ts:12-31 (helper)Groups fetched pages by URL path prefix into named sections for structured llms.txt output.
/**
 * Bucket pages by their first URL path segment for sectioned llms.txt output.
 *
 * Pages whose path has two or more segments are grouped under "/<first>/";
 * shallower pages (home page, top-level pages) and pages whose URL cannot be
 * parsed fall into the synthetic "Root" bucket. Insertion order of both keys
 * and pages within a bucket follows the input order.
 *
 * @param pages Fetched pages with absolute URLs.
 * @returns Map from section key ("/docs/" or "Root") to its pages.
 */
export function groupPagesBySection(pages: LlmsPage[]): Map<string, LlmsPage[]> {
  // Resolve the section key for one page; unparsable URLs land in "Root".
  const sectionOf = (page: LlmsPage): string => {
    try {
      const segments = new URL(page.url).pathname.split("/").filter(Boolean);
      return segments.length >= 2 ? `/${segments[0]}/` : "Root";
    } catch {
      return "Root";
    }
  };

  const bySection = new Map<string, LlmsPage[]>();
  for (const page of pages) {
    const key = sectionOf(page);
    const bucket = bySection.get(key);
    if (bucket) {
      bucket.push(page);
    } else {
      bySection.set(key, [page]);
    }
  }
  return bySection;
}