get_manifest
Retrieve the structural manifest of a webpage with title, summary, keywords, language, and heading sections for token-efficient content extraction.
Instructions
Fetch the WASP structural index for a webpage. Returns a manifest with the page title, summary, keywords, language, and a list of heading sections (chunks) with their anchors and token estimates. Checks /.well-known/wasp.json first (native manifest); falls back to DOM-generated manifest if not found.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | Fully-qualified URL of the webpage to index | |
Implementation Reference
- index.ts:90-99 (handler) The CallToolRequestSchema handler for 'get_manifest'. Extracts the 'url' argument from the request, calls getManifest(url), and returns the manifest as a JSON-stringified text response.
server.setRequestHandler(CallToolRequestSchema, async (request) => { const { name, arguments: args } = request.params; if (name === "get_manifest") { const { url } = args as { url: string }; const { manifest } = await getManifest(url); return { content: [{ type: "text", text: JSON.stringify(manifest, null, 2) }], }; } - manifest.ts:140-179 (helper)The exported getManifest() function that fetches the WASP manifest. Tries a native manifest from /.well-known/wasp.json first (3s timeout), falls back to DOM-generated manifest via generateManifest(). Returns both the manifest object and the page HTML.
/**
 * Resolve the WASP manifest and page HTML for `url`, consulting the cache first.
 *
 * On a cache miss the page is downloaded and, concurrently, the site is probed
 * for a native manifest. A native manifest is returned as-is alongside the raw
 * page HTML; otherwise a manifest is generated from the parsed DOM and the HTML
 * is re-serialized so it carries the heading IDs stamped during generation.
 * Either result is cached under `url` before being returned.
 *
 * @param url - Fully-qualified URL of the page to index.
 * @returns The manifest together with the HTML that was cached with it.
 * @throws Error when the page request answers with a non-2xx status.
 */
export async function getManifest(
  url: string
): Promise<{ manifest: Manifest; html: string }> {
  const cached = getCached(url);
  if (cached) return { manifest: cached.manifest, html: cached.html };

  // Page HTML is needed regardless of which manifest wins (chunk content
  // extraction reads it), so it does not have to wait for the native probe.
  const fetchPageHtml = async (): Promise<string> => {
    const res = await fetch(url, {
      headers: { "User-Agent": USER_AGENT },
      redirect: "follow",
    });
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} fetching ${url}`);
    }
    return res.text();
  };

  // Run the native-manifest probe and the page download side by side so a slow
  // or missing /.well-known/wasp.json no longer delays the page request.
  const [native, rawHtml] = await Promise.all([
    checkNativeManifest(url),
    fetchPageHtml(),
  ]);

  if (native) {
    // Use the native manifest but cache the page HTML for chunk extraction
    setCached(url, { manifest: native, html: rawHtml });
    return { manifest: native, html: rawHtml };
  }

  // Build client manifest from DOM, stamp IDs, then serialize
  const dom = new JSDOM(rawHtml, { url });
  const manifest = generateManifest(dom, url);

  // Serialize after ID stamping so fetch_chunk sees the same anchors
  const html = dom.serialize();

  // No headings and no meta description: most likely a JavaScript-rendered page.
  if (manifest.chunks.length === 0 && !manifest.summary) {
    manifest.summary =
      "This page appears to be a SPA — JavaScript-rendered content may not be indexed.";
  }

  setCached(url, { manifest, html });
  return { manifest, html };
}
- manifest.ts:58-138 (helper)generateManifest() builds a client-side manifest from the JSDOM-parsed HTML, extracting title, summary, keywords, language, heading chunks (with synthetic IDs), and interactive actions.
/**
 * Build a client-side manifest from an already-parsed page.
 *
 * Title falls back from `og:title` to the first `<h1>`, then `document.title`,
 * then the URL itself. Keywords come from the `keywords` meta tag when present,
 * otherwise from the de-duplicated heading texts; either way at most ten are
 * kept. Every `h1`–`h4` becomes a chunk, and at most ten
 * `input` / `form` / `button` / `a[href]` elements become actions.
 *
 * Side effect: headings without an `id` get a synthetic `wasp-NNN` id written
 * onto the live DOM, so the caller can serialize the document afterwards and
 * have the chunk anchors resolve.
 *
 * @param dom - Parsed page whose document is inspected (and ID-stamped).
 * @param url - Page URL; recorded in the manifest and used as last-resort title.
 * @returns A manifest marked `generated: "client"`.
 */
export function generateManifest(dom: JSDOM, url: string): Manifest {
  const doc = dom.window.document;

  // Small local helpers shared by the sections below.
  const metaContent = (selector: string) =>
    doc.querySelector(selector)?.getAttribute("content");
  const pad3 = (n: number): string => String(n).padStart(3, "0");
  const selectHeadings = () => Array.from(doc.querySelectorAll("h1,h2,h3,h4"));

  const firstH1 = doc.querySelector("h1")?.textContent?.trim();
  const title =
    metaContent('meta[property="og:title"]') || firstH1 || doc.title || url;
  const summary = metaContent('meta[name="description"]') ?? "";
  const language = doc.documentElement.getAttribute("lang") ?? "en";

  // Keywords: explicit meta list wins; otherwise unique heading texts.
  const declaredKeywords = metaContent('meta[name="keywords"]');
  const keywordPool: string[] = declaredKeywords
    ? declaredKeywords
        .split(",")
        .map((part) => part.trim())
        .filter(Boolean)
    : [
        ...new Set(
          selectHeadings()
            .map((h) => h.textContent?.trim() ?? "")
            .filter(Boolean)
        ),
      ];
  const keywords = keywordPool.slice(0, 10);

  const chunks: Chunk[] = [];
  selectHeadings().forEach((headingEl, index) => {
    const ordinal = index + 1;
    // Stamp synthetic IDs onto the live DOM so fetch_chunk can resolve them
    if (!headingEl.id) {
      headingEl.id = "wasp-" + pad3(ordinal);
    }
    const text = headingEl.textContent?.trim() ?? "";
    chunks.push({
      id: "chunk_" + pad3(ordinal),
      heading: text,
      anchor: "#" + headingEl.id,
      type: isInsideNav(headingEl) ? "nav" : "heading",
      // Heading level is the digit in the tag name (H1 -> 1).
      depth: parseInt(headingEl.tagName.charAt(1), 10),
      // Estimate based on the heading text alone, at four characters per token.
      tokens: Math.ceil(text.length / 4),
      order: ordinal,
    });
  });

  const actionKindFor = (tagName: string): "fill" | "click" | "navigate" => {
    switch (tagName) {
      case "INPUT":
      case "FORM":
        return "fill";
      case "A":
        return "navigate";
      default:
        return "click";
    }
  };

  const actions: Action[] = Array.from(
    doc.querySelectorAll("input, form, button, a[href]")
  )
    .slice(0, 10)
    .map((actionEl, index) => ({
      id: "action_" + pad3(index + 1),
      type: actionKindFor(actionEl.tagName),
      label: getActionLabel(actionEl),
      selector: buildActionSelector(actionEl),
      description: "",
    }));

  return {
    wasp: "1.0",
    url,
    title,
    summary,
    keywords,
    language,
    generatedAt: new Date().toISOString(),
    generated: "client",
    chunks,
    actions,
  };
}
- index.ts:16-36 (registration)Tool registration metadata for 'get_manifest' in the ListToolsRequestSchema handler, defining the tool name, description, and input schema (url string required).
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: [ { name: "get_manifest", description: "Fetch the WASP structural index for a webpage. Returns a manifest with the " + "page title, summary, keywords, language, and a list of heading sections " + "(chunks) with their anchors and token estimates. " + "Checks /.well-known/wasp.json first (native manifest); falls back to " + "DOM-generated manifest if not found.", inputSchema: { type: "object", properties: { url: { type: "string", description: "Fully-qualified URL of the webpage to index", }, }, required: ["url"], }, }, - manifest.ts:9-29 (helper)checkNativeManifest() attempts to fetch the native WASP manifest from /.well-known/wasp.json with a 3-second timeout, returning null on failure.
/**
 * Probe the site for a native WASP manifest at `/.well-known/wasp.json`.
 *
 * The lookup is best-effort: a non-2xx status, a network error, a timeout, a
 * non-JSON body, or a JSON value without a `wasp` key all yield `null` so the
 * caller can fall back to DOM generation.
 *
 * The abort timer stays armed until the body has been read and parsed, so a
 * server that sends headers and then stalls cannot hang the caller; it is
 * always cleared on the way out, including when `fetch` itself rejects.
 *
 * @param url - Any URL on the target site; only its protocol and host are used.
 * @returns The parsed manifest, or `null` when none could be obtained.
 * @throws TypeError if `url` cannot be parsed (the `URL` constructor runs
 *   outside the try block).
 */
async function checkNativeManifest(url: string): Promise<Manifest | null> {
  const { protocol, host } = new URL(url);
  const nativeUrl = `${protocol}//${host}/.well-known/wasp.json`;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), NATIVE_TIMEOUT_MS);
  try {
    const res = await fetch(nativeUrl, {
      signal: controller.signal,
      headers: { "User-Agent": USER_AGENT },
    });
    if (!res.ok) return null;

    // Still under the abort timer here: a stalled body rejects instead of hanging.
    const json = (await res.json()) as unknown;
    if (json && typeof json === "object" && "wasp" in json) {
      return json as Manifest;
    }
  } catch {
    // Network error, timeout, or non-JSON — fall through to DOM generation
  } finally {
    clearTimeout(timer);
  }
  return null;
}