get_page_markdown
Extract clean markdown from web pages by removing navigation, headers, and sidebars while preserving main content and formatting for readability.
Instructions
Extract clean markdown content from a URL. Returns only the main content without navigation, headers, footers, or sidebars.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL to extract markdown from | |
| includeImages | No | Whether to include image references in markdown (default: true) | true |
| includeLinks | No | Whether to include hyperlinks in markdown (default: true) | true |
| waitForSelector | No | Optional CSS selector to wait for before extracting content | |
| timeout | No | Navigation timeout in milliseconds (default: 30000) | 30000 |
Implementation Reference
- markdown-mcp.js:86-432 (handler)Main implementation of the get_page_markdown tool handler. Uses Playwright to load the page, evaluate JavaScript to find main content, skip navigation/UI elements, and convert HTML to clean Markdown format.async getPageMarkdown(args) { const { url, includeImages = true, includeLinks = true, waitForSelector, timeout = 30000, } = args; const browser = await this.ensureBrowser(); const context = await browser.newContext(); const page = await context.newPage(); try { await page.goto(url, { waitUntil: 'domcontentloaded', timeout }); if (waitForSelector) { await page.waitForSelector(waitForSelector, { timeout: 10000 }); } else { // Wait for content to load - especially important for JS-heavy sites await page.waitForTimeout(5000); } const markdown = await page.evaluate( ({ includeImages, includeLinks }) => { function extractMainContent() { // Confluence-specific selectors first, then general ones const mainSelectors = [ '#main-content', '.wiki-content', '[data-test-id="wiki-content"]', 'main[role="main"]', 'main', 'article', '[role="main"]', '.main-content', '.content', '#content', '.post-content', '.article-content', 'body', ]; for (const selector of mainSelectors) { const element = document.querySelector(selector); if (element && element.textContent.trim().length > 100) { return element; } } return document.body; } function shouldSkipElement(element) { if (!element || !element.tagName) return true; const tagName = element.tagName.toLowerCase(); // Never skip these content elements if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table', 'pre', 'code', 'blockquote'].includes(tagName)) { return false; } // Check for hidden elements if (element.offsetParent === null && tagName !== 'script' && tagName !== 'style') { const style = window.getComputedStyle(element); if (style.display === 'none' || style.visibility === 'hidden') { return true; } } // Skip technical elements if (['script', 'style', 'noscript', 
'iframe'].includes(tagName)) { return true; } // Check role attributes const role = element.getAttribute('role'); if (['navigation', 'banner', 'contentinfo', 'complementary'].includes(role)) { return true; } // Check specific element types if (tagName === 'nav' || tagName === 'header' || tagName === 'footer' || tagName === 'aside') { return true; } // Check class and id for common patterns (but be less aggressive) const className = (element.className || '').toString().toLowerCase(); const id = (element.id || '').toLowerCase(); const combined = className + ' ' + id; const strictSkipPatterns = [ 'cookie-banner', 'gdpr', 'advertisement', 'sponsored', ]; return strictSkipPatterns.some(pattern => combined.includes(pattern)); } function getTextContent(node) { let text = ''; for (const child of node.childNodes) { if (child.nodeType === Node.TEXT_NODE) { text += child.textContent; } else if (child.nodeType === Node.ELEMENT_NODE) { const tag = child.tagName.toLowerCase(); if (tag === 'br') { text += '\n'; } else if (!shouldSkipElement(child)) { text += getTextContent(child); } } } return text; } function convertToMarkdown(node, depth = 0, inList = false) { if (!node || shouldSkipElement(node)) return ''; let markdown = ''; const tagName = node.tagName?.toLowerCase(); // Headings if (tagName?.match(/^h[1-6]$/)) { const level = parseInt(tagName[1]); const text = getTextContent(node).trim(); if (text) { markdown += '\n' + '#'.repeat(level) + ' ' + text + '\n\n'; } return markdown; } // Paragraphs if (tagName === 'p') { let content = ''; for (const child of node.childNodes) { if (child.nodeType === Node.TEXT_NODE) { content += child.textContent; } else if (child.nodeType === Node.ELEMENT_NODE) { content += convertToMarkdown(child, depth + 1); } } const text = content.trim(); if (text) { markdown += text + '\n\n'; } return markdown; } // Code blocks if (tagName === 'pre') { const code = node.querySelector('code'); const text = (code || node).textContent.trim(); if (text) { const 
language = code?.className.match(/language-(\w+)/)?.[1] || ''; markdown += '\n```' + language + '\n' + text + '\n```\n\n'; } return markdown; } // Inline code if (tagName === 'code' && node.parentElement?.tagName !== 'PRE') { return '`' + node.textContent.trim() + '`'; } // Blockquotes if (tagName === 'blockquote') { const text = getTextContent(node).trim(); if (text) { const lines = text.split('\n').filter(l => l.trim()); markdown += '\n' + lines.map(line => '> ' + line.trim()).join('\n') + '\n\n'; } return markdown; } // Lists if (tagName === 'ul' || tagName === 'ol') { const items = Array.from(node.children).filter(child => child.tagName === 'LI'); items.forEach((li, idx) => { const prefix = tagName === 'ol' ? `${idx + 1}. ` : '- '; let itemContent = ''; for (const child of li.childNodes) { if (child.nodeType === Node.TEXT_NODE) { itemContent += child.textContent; } else if (child.nodeType === Node.ELEMENT_NODE) { itemContent += convertToMarkdown(child, depth + 1, true); } } const text = itemContent.trim(); if (text) { markdown += prefix + text + '\n'; } }); if (!inList) markdown += '\n'; return markdown; } // Images if (tagName === 'img' && includeImages) { const alt = node.getAttribute('alt') || ''; const src = node.getAttribute('src') || node.getAttribute('data-src') || ''; if (src) { try { const fullSrc = new URL(src, window.location.href).href; markdown += `\n\n`; } catch (e) { // Invalid URL, skip } } return markdown; } // Links if (tagName === 'a' && includeLinks) { const text = getTextContent(node).trim(); const href = node.getAttribute('href'); if (text && href) { try { const fullHref = new URL(href, window.location.href).href; return `[${text}](${fullHref})`; } catch (e) { return text; } } return text || ''; } // Strong/Bold if (tagName === 'strong' || tagName === 'b') { const text = getTextContent(node).trim(); return text ? 
`**${text}**` : ''; } // Emphasis/Italic if (tagName === 'em' || tagName === 'i') { const text = getTextContent(node).trim(); return text ? `*${text}*` : ''; } // Horizontal rule if (tagName === 'hr') { return '\n---\n\n'; } // Tables if (tagName === 'table') { const rows = Array.from(node.querySelectorAll('tr')); if (rows.length > 0) { rows.forEach((row, rowIdx) => { const cells = Array.from(row.querySelectorAll('th, td')); const cellTexts = cells.map(cell => getTextContent(cell).trim().replace(/\n/g, ' ')); if (cellTexts.some(t => t)) { markdown += '| ' + cellTexts.join(' | ') + ' |\n'; if (rowIdx === 0) { markdown += '| ' + cells.map(() => '---').join(' | ') + ' |\n'; } } }); markdown += '\n'; } return markdown; } // Line break if (tagName === 'br') { return '\n'; } // Container elements - process children if (['div', 'section', 'article', 'main', 'span', 'td', 'th', 'li'].includes(tagName)) { for (const child of node.childNodes) { if (child.nodeType === Node.ELEMENT_NODE) { markdown += convertToMarkdown(child, depth + 1, inList); } else if (child.nodeType === Node.TEXT_NODE && depth === 0 && !inList) { const text = child.textContent.trim(); if (text && text.length > 0) { markdown += text + ' '; } } } return markdown; } // For any other element, try to extract text from children if (node.childNodes && node.childNodes.length > 0) { for (const child of node.childNodes) { if (child.nodeType === Node.ELEMENT_NODE) { markdown += convertToMarkdown(child, depth + 1, inList); } } } return markdown; } const mainContent = extractMainContent(); let result = convertToMarkdown(mainContent); // Clean up excessive newlines and spaces result = result .replace(/ +/g, ' ') // Multiple spaces to single .replace(/\n\n\n+/g, '\n\n') // Multiple newlines to double .trim(); // If still empty, use fallback if (!result || result.length < 50) { const allText = mainContent.textContent.trim(); if (allText) { result = allText .split('\n') .map(line => line.trim()) .filter(line => 
line.length > 0) .join('\n\n'); } } return result; }, { includeImages, includeLinks } ); await context.close(); return { content: [ { type: 'text', text: markdown || 'No content could be extracted from this page.', }, ], }; } catch (error) { await context.close(); return { content: [ { type: 'text', text: `Error extracting markdown: ${error.message}`, }, ], isError: true, }; } }
- markdown-mcp.js:36-64 (schema)Input schema definition for the get_page_markdown tool, specifying parameters like url (required), includeImages, includeLinks, waitForSelector, and timeout.
// Schema for the arguments object of the get_page_markdown tool.
inputSchema: {
  type: 'object',
  properties: {
    // The only required argument (see `required` below).
    url: {
      type: 'string',
      description: 'The URL to extract markdown from',
    },
    // Optional boolean, declared default true.
    includeImages: {
      type: 'boolean',
      description: 'Whether to include image references in markdown (default: true)',
      default: true,
    },
    // Optional boolean, declared default true.
    includeLinks: {
      type: 'boolean',
      description: 'Whether to include hyperlinks in markdown (default: true)',
      default: true,
    },
    // Optional string; no default is declared for it.
    waitForSelector: {
      type: 'string',
      description: 'Optional CSS selector to wait for before extracting content',
    },
    // Optional number of milliseconds, declared default 30000.
    timeout: {
      type: 'number',
      description: 'Navigation timeout in milliseconds (default: 30000)',
      default: 30000,
    },
  },
  required: ['url'],
},
- markdown-mcp.js:31-67 (registration)Registration of the get_page_markdown tool in the ListToolsRequest handler, providing name, description, and input schema.this.server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: [ { name: 'get_page_markdown', description: 'Extract clean markdown content from a URL. Returns only the main content without navigation, headers, footers, or sidebars.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to extract markdown from', }, includeImages: { type: 'boolean', description: 'Whether to include image references in markdown (default: true)', default: true, }, includeLinks: { type: 'boolean', description: 'Whether to include hyperlinks in markdown (default: true)', default: true, }, waitForSelector: { type: 'string', description: 'Optional CSS selector to wait for before extracting content', }, timeout: { type: 'number', description: 'Navigation timeout in milliseconds (default: 30000)', default: 30000, }, }, required: ['url'], }, }, ], }));
- markdown-mcp.js:69-74 (registration)Registration of the CallToolRequest handler that dispatches calls to 'get_page_markdown' to the getPageMarkdown method.this.server.setRequestHandler(CallToolRequestSchema, async (request) => { if (request.params.name === 'get_page_markdown') { return await this.getPageMarkdown(request.params.arguments); } throw new Error(`Unknown tool: ${request.params.name}`); });
- markdown-mcp-gemini.js:39-341 (handler)Alternative implementation of the core extraction logic in markdown-mcp-gemini.js, used by the tool handler.async function extractMarkdownContent(url, options = {}) { const { includeImages = true, includeLinks = true, waitForSelector, timeout = 30000, } = options; const browserInstance = await ensureBrowser(); const context = await browserInstance.newContext({ userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' }); const page = await context.newPage(); try { await page.goto(url, { waitUntil: 'domcontentloaded', timeout }); if (waitForSelector) { await page.waitForSelector(waitForSelector, { timeout: 10000 }); } else { // Wait for content to load - especially important for JS-heavy sites await page.waitForTimeout(5000); } const markdown = await page.evaluate( ({ includeImages, includeLinks }) => { function extractMainContent() { // Confluence-specific selectors first, then general ones const mainSelectors = [ '#main-content', '.wiki-content', '[data-test-id="wiki-content"]', 'main[role="main"]', 'main', 'article', '[role="main"]', '.main-content', '.content', '#content', '.post-content', '.article-content', 'body', ]; for (const selector of mainSelectors) { const element = document.querySelector(selector); if (element && element.textContent.trim().length > 100) { return element; } } return document.body; } function shouldSkipElement(element) { if (!element || !element.tagName) return true; const tagName = element.tagName.toLowerCase(); // Never skip these content elements if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table', 'pre', 'code', 'blockquote'].includes(tagName)) { return false; } // Check for hidden elements if (element.offsetParent === null && tagName !== 'script' && tagName !== 'style') { const style = window.getComputedStyle(element); if (style.display === 'none' || style.visibility === 'hidden') { return true; } } 
// Skip technical elements if (['script', 'style', 'noscript', 'iframe'].includes(tagName)) { return true; } // Check role attributes const role = element.getAttribute('role'); if (['navigation', 'banner', 'contentinfo', 'complementary'].includes(role)) { return true; } // Check specific element types if (tagName === 'nav' || tagName === 'header' || tagName === 'footer' || tagName === 'aside') { return true; } // Check class and id for common patterns (but be less aggressive) const className = (element.className || '').toString().toLowerCase(); const id = (element.id || '').toLowerCase(); const combined = className + ' ' + id; const strictSkipPatterns = [ 'cookie-banner', 'gdpr', 'advertisement', 'sponsored', ]; return strictSkipPatterns.some(pattern => combined.includes(pattern)); } function getTextContent(node) { let text = ''; for (const child of node.childNodes) { if (child.nodeType === Node.TEXT_NODE) { text += child.textContent; } else if (child.nodeType === Node.ELEMENT_NODE) { const tag = child.tagName.toLowerCase(); if (tag === 'br') { text += '\n'; } else if (!shouldSkipElement(child)) { text += getTextContent(child); } } } return text; } function convertToMarkdown(node, depth = 0, inList = false) { if (!node || shouldSkipElement(node)) return ''; let markdown = ''; const tagName = node.tagName?.toLowerCase(); // Headings if (tagName?.match(/^h[1-6]$/)) { const level = parseInt(tagName[1]); const text = getTextContent(node).trim(); if (text) { markdown += '\n' + '#'.repeat(level) + ' ' + text + '\n\n'; } return markdown; } // Paragraphs if (tagName === 'p') { let content = ''; for (const child of node.childNodes) { if (child.nodeType === Node.TEXT_NODE) { content += child.textContent; } else if (child.nodeType === Node.ELEMENT_NODE) { content += convertToMarkdown(child, depth + 1); } } const text = content.trim(); if (text) { markdown += text + '\n\n'; } return markdown; } // Code blocks if (tagName === 'pre') { const code = node.querySelector('code'); 
const text = (code || node).textContent.trim(); if (text) { const language = code?.className.match(/language-(\w+)/)?.[1] || ''; markdown += '\n```' + language + '\n' + text + '\n```\n\n'; } return markdown; } // Inline code if (tagName === 'code' && node.parentElement?.tagName !== 'PRE') { return '`' + node.textContent.trim() + '`'; } // Blockquotes if (tagName === 'blockquote') { const text = getTextContent(node).trim(); if (text) { const lines = text.split('\n').filter(l => l.trim()); markdown += '\n' + lines.map(line => '> ' + line.trim()).join('\n') + '\n\n'; } return markdown; } // Lists if (tagName === 'ul' || tagName === 'ol') { const items = Array.from(node.children).filter(child => child.tagName === 'LI'); items.forEach((li, idx) => { const prefix = tagName === 'ol' ? `${idx + 1}. ` : '- '; let itemContent = ''; for (const child of li.childNodes) { if (child.nodeType === Node.TEXT_NODE) { itemContent += child.textContent; } else if (child.nodeType === Node.ELEMENT_NODE) { itemContent += convertToMarkdown(child, depth + 1, true); } } const text = itemContent.trim(); if (text) { markdown += prefix + text + '\n'; } }); if (!inList) markdown += '\n'; return markdown; } // Strong/Bold if (tagName === 'strong' || tagName === 'b') { const text = getTextContent(node).trim(); return text ? `**${text}**` : ''; } // Emphasis/Italic if (tagName === 'em' || tagName === 'i') { const text = getTextContent(node).trim(); return text ? 
`*${text}*` : ''; } // Horizontal rule if (tagName === 'hr') { return '\n---\n\n'; } // Tables if (tagName === 'table') { const rows = Array.from(node.querySelectorAll('tr')); if (rows.length > 0) { rows.forEach((row, rowIdx) => { const cells = Array.from(row.querySelectorAll('th, td')); const cellTexts = cells.map(cell => getTextContent(cell).trim().replace(/\n/g, ' ')); if (cellTexts.some(t => t)) { markdown += '| ' + cellTexts.join(' | ') + ' |\n'; if (rowIdx === 0) { markdown += '| ' + cells.map(() => '---').join(' | ') + ' |\n'; } } }); markdown += '\n'; } return markdown; } // Line break if (tagName === 'br') { return '\n'; } // Container elements - process children if (['div', 'section', 'article', 'main', 'span', 'td', 'th', 'li'].includes(tagName)) { for (const child of node.childNodes) { if (child.nodeType === Node.ELEMENT_NODE) { markdown += convertToMarkdown(child, depth + 1, inList); } else if (child.nodeType === Node.TEXT_NODE && depth === 0 && !inList) { const text = child.textContent.trim(); if (text && text.length > 0) { markdown += text + ' '; } } } return markdown; } // For any other element, try to extract text from children if (node.childNodes && node.childNodes.length > 0) { for (const child of node.childNodes) { if (child.nodeType === Node.ELEMENT_NODE) { markdown += convertToMarkdown(child, depth + 1, inList); } } } return markdown; } const mainContent = extractMainContent(); let result = convertToMarkdown(mainContent); // Clean up excessive newlines and spaces result = result .replace(/ +/g, ' ') // Multiple spaces to single .replace(/\n\n\n+/g, '\n\n') // Multiple newlines to double .trim(); // If still empty, use fallback if (!result || result.length < 50) { const allText = mainContent.textContent.trim(); if (allText) { result = allText .split('\n') .map(line => line.trim()) .filter(line => line.length > 0) .join('\n\n'); } } return result; }, { includeImages, includeLinks } ); await context.close(); return markdown || 'No content could 
be extracted from this page.'; } catch (error) { await context.close(); throw new Error(`Error extracting markdown: ${error.message}`); } }