extract_page_content
Extract structured content from a webpage, including tables and links, for data analysis or archiving. Solves the problem of retrieving clean, organized page content without manual scraping.
Instructions
Extrae el contenido estructurado de una página.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | URL of the page to fetch and extract. | — |
| include_tables | No | Include parsed HTML tables in the result. | true |
| include_links | No | Include deduplicated internal links in the result. | true |
Implementation Reference
- src/index.js:299-330 (registration) — Tool registration for 'extract_page_content' via server.tool() with name, description, schema, and handler.
server.tool( 'extract_page_content', 'Extrae el contenido estructurado de una p�gina.', { url: z.string(), include_tables: z.boolean().default(true), include_links: z.boolean().default(true) }, async ({ url, include_tables, include_links }) => { try { const { html, finalUrl } = await fetchPage(url); const extracted = extractReadableContent(html, finalUrl); return { content: [ { type: 'text', text: JSON.stringify({ title: extracted.title, url: finalUrl, sections: extracted.sections, tables: include_tables ? extractTables(html) : [], links: include_links ? extracted.links : [] }, null, 2) } ] }; } catch (error) { return mcpError(error); } } ); - src/index.js:302-306 (schema)Input schema for extract_page_content: url (required string), include_tables (boolean, default true), include_links (boolean, default true).
{ url: z.string(), include_tables: z.boolean().default(true), include_links: z.boolean().default(true) }, - src/index.js:307-329 (handler)Handler function that fetches the page, extracts readable content, and returns title, sections, tables (optional), and links (optional) as JSON.
async ({ url, include_tables, include_links }) => { try { const { html, finalUrl } = await fetchPage(url); const extracted = extractReadableContent(html, finalUrl); return { content: [ { type: 'text', text: JSON.stringify({ title: extracted.title, url: finalUrl, sections: extracted.sections, tables: include_tables ? extractTables(html) : [], links: include_links ? extracted.links : [] }, null, 2) } ] }; } catch (error) { return mcpError(error); } } - src/index.js:114-164 (helper)extractReadableContent() helper: parses HTML with cheerio, extracts title, heading-based sections, and deduplicated internal links.
function extractReadableContent(html, pageUrl) { const $ = cheerio.load(html); $('script, style, noscript').remove(); const title = $('title').first().text().trim() || $('h1').first().text().trim() || pageUrl; const sections = []; $('h1, h2, h3').each((_, el) => { const heading = $(el).text().trim(); const texts = []; let current = $(el).next(); while (current.length && !['h1', 'h2', 'h3'].includes(current.get(0)?.tagName)) { const text = current.text().trim(); if (text) texts.push(text); current = current.next(); } if (heading || texts.length) { sections.push({ heading, text: texts.join('\n\n') }); } }); const links = []; $('a[href]').each((_, el) => { const href = $(el).attr('href'); const label = $(el).text().trim(); if (!href) return; try { const url = new URL(href, pageUrl).toString(); if (url.startsWith(BASE_URL)) { links.push({ label: label || url, url }); } } catch { } }); const contentText = $('body') .text() .replace(/\s+\n/g, '\n') .replace(/\n\s+/g, '\n') .replace(/\n{3,}/g, '\n\n') .trim(); return { title, contentText, sections, links: dedupeLinks(links) }; } - src/index.js:175-196 (helper)extractTables() helper: parses HTML tables into headers/rows structure.
/**
 * Parse every <table> in the HTML into a {title, headers, rows} structure.
 *
 * Headers come from <thead><th> cells; rows from <tbody><tr><td> cells.
 * Tables that yield neither headers nor rows are skipped.
 *
 * @param {string} html - Raw HTML to scan.
 * @returns {Array<{title: null, headers: string[], rows: string[][]}>}
 */
function extractTables(html) {
  const $ = cheerio.load(html);
  const tables = [];

  $('table').each((_, table) => {
    const headers = [];
    $(table)
      .find('thead th')
      .each((__, th) => headers.push($(th).text().trim()));

    const rows = [];
    $(table)
      .find('tbody tr')
      .each((__, tr) => {
        const row = [];
        $(tr)
          .find('td')
          .each((___, td) => row.push($(td).text().trim()));
        if (row.length) rows.push(row);
      });

    if (headers.length || rows.length) {
      // title is always null here; no caption extraction is performed.
      tables.push({ title: null, headers, rows });
    }
  });

  return tables;
}