Skip to main content
Glama

get_page_markdown

Extract clean markdown from web pages by removing navigation, headers, and sidebars while preserving main content and formatting for readability.

Instructions

Extract clean markdown content from a URL. Returns only the main content without navigation, headers, footers, or sidebars.

Input Schema

Table | JSON Schema
| Name | Required | Description | Default |
| --- | --- | --- | --- |
| url | Yes | The URL to extract markdown from | — |
| includeImages | No | Whether to include image references in markdown (default: true) | true |
| includeLinks | No | Whether to include hyperlinks in markdown (default: true) | true |
| waitForSelector | No | Optional CSS selector to wait for before extracting content | — |
| timeout | No | Navigation timeout in milliseconds (default: 30000) | 30000 |

Implementation Reference

  • Main implementation of the get_page_markdown tool handler. Uses Playwright to load the page, evaluate JavaScript to find main content, skip navigation/UI elements, and convert HTML to clean Markdown format.
    async getPageMarkdown(args) {
      const {
        url,
        includeImages = true,
        includeLinks = true,
        waitForSelector,
        timeout = 30000,
      } = args;
    
      const browser = await this.ensureBrowser();
      const context = await browser.newContext();
      const page = await context.newPage();
    
      try {
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout });
    
        if (waitForSelector) {
          await page.waitForSelector(waitForSelector, { timeout: 10000 });
        } else {
          // Wait for content to load - especially important for JS-heavy sites
          await page.waitForTimeout(5000);
        }
    
        const markdown = await page.evaluate(
          ({ includeImages, includeLinks }) => {
            function extractMainContent() {
              // Confluence-specific selectors first, then general ones
              const mainSelectors = [
                '#main-content',
                '.wiki-content',
                '[data-test-id="wiki-content"]',
                'main[role="main"]',
                'main',
                'article',
                '[role="main"]',
                '.main-content',
                '.content',
                '#content',
                '.post-content',
                '.article-content',
                'body',
              ];
    
              for (const selector of mainSelectors) {
                const element = document.querySelector(selector);
                if (element && element.textContent.trim().length > 100) {
                  return element;
                }
              }
    
              return document.body;
            }
    
            function shouldSkipElement(element) {
              if (!element || !element.tagName) return true;
    
              const tagName = element.tagName.toLowerCase();
              
              // Never skip these content elements
              if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table', 'pre', 'code', 'blockquote'].includes(tagName)) {
                return false;
              }
    
              // Check for hidden elements
              if (element.offsetParent === null && tagName !== 'script' && tagName !== 'style') {
                const style = window.getComputedStyle(element);
                if (style.display === 'none' || style.visibility === 'hidden') {
                  return true;
                }
              }
    
              // Skip technical elements
              if (['script', 'style', 'noscript', 'iframe'].includes(tagName)) {
                return true;
              }
    
              // Check role attributes
              const role = element.getAttribute('role');
              if (['navigation', 'banner', 'contentinfo', 'complementary'].includes(role)) {
                return true;
              }
    
              // Check specific element types
              if (tagName === 'nav' || tagName === 'header' || tagName === 'footer' || tagName === 'aside') {
                return true;
              }
    
              // Check class and id for common patterns (but be less aggressive)
              const className = (element.className || '').toString().toLowerCase();
              const id = (element.id || '').toLowerCase();
              const combined = className + ' ' + id;
    
              const strictSkipPatterns = [
                'cookie-banner',
                'gdpr',
                'advertisement',
                'sponsored',
              ];
    
              return strictSkipPatterns.some(pattern => combined.includes(pattern));
            }
    
            function getTextContent(node) {
              let text = '';
              for (const child of node.childNodes) {
                if (child.nodeType === Node.TEXT_NODE) {
                  text += child.textContent;
                } else if (child.nodeType === Node.ELEMENT_NODE) {
                  const tag = child.tagName.toLowerCase();
                  if (tag === 'br') {
                    text += '\n';
                  } else if (!shouldSkipElement(child)) {
                    text += getTextContent(child);
                  }
                }
              }
              return text;
            }
    
            function convertToMarkdown(node, depth = 0, inList = false) {
              if (!node || shouldSkipElement(node)) return '';
    
              let markdown = '';
              const tagName = node.tagName?.toLowerCase();
    
              // Headings
              if (tagName?.match(/^h[1-6]$/)) {
                const level = parseInt(tagName[1]);
                const text = getTextContent(node).trim();
                if (text) {
                  markdown += '\n' + '#'.repeat(level) + ' ' + text + '\n\n';
                }
                return markdown;
              }
    
              // Paragraphs
              if (tagName === 'p') {
                let content = '';
                for (const child of node.childNodes) {
                  if (child.nodeType === Node.TEXT_NODE) {
                    content += child.textContent;
                  } else if (child.nodeType === Node.ELEMENT_NODE) {
                    content += convertToMarkdown(child, depth + 1);
                  }
                }
                const text = content.trim();
                if (text) {
                  markdown += text + '\n\n';
                }
                return markdown;
              }
    
              // Code blocks
              if (tagName === 'pre') {
                const code = node.querySelector('code');
                const text = (code || node).textContent.trim();
                if (text) {
                  const language = code?.className.match(/language-(\w+)/)?.[1] || '';
                  markdown += '\n```' + language + '\n' + text + '\n```\n\n';
                }
                return markdown;
              }
    
              // Inline code
              if (tagName === 'code' && node.parentElement?.tagName !== 'PRE') {
                return '`' + node.textContent.trim() + '`';
              }
    
              // Blockquotes
              if (tagName === 'blockquote') {
                const text = getTextContent(node).trim();
                if (text) {
                  const lines = text.split('\n').filter(l => l.trim());
                  markdown += '\n' + lines.map(line => '> ' + line.trim()).join('\n') + '\n\n';
                }
                return markdown;
              }
    
              // Lists
              if (tagName === 'ul' || tagName === 'ol') {
                const items = Array.from(node.children).filter(child => child.tagName === 'LI');
                items.forEach((li, idx) => {
                  const prefix = tagName === 'ol' ? `${idx + 1}. ` : '- ';
                  let itemContent = '';
                  for (const child of li.childNodes) {
                    if (child.nodeType === Node.TEXT_NODE) {
                      itemContent += child.textContent;
                    } else if (child.nodeType === Node.ELEMENT_NODE) {
                      itemContent += convertToMarkdown(child, depth + 1, true);
                    }
                  }
                  const text = itemContent.trim();
                  if (text) {
                    markdown += prefix + text + '\n';
                  }
                });
                if (!inList) markdown += '\n';
                return markdown;
              }
    
              // Images
              if (tagName === 'img' && includeImages) {
                const alt = node.getAttribute('alt') || '';
                const src = node.getAttribute('src') || node.getAttribute('data-src') || '';
                if (src) {
                  try {
                    const fullSrc = new URL(src, window.location.href).href;
                    markdown += `![${alt}](${fullSrc})\n\n`;
                  } catch (e) {
                    // Invalid URL, skip
                  }
                }
                return markdown;
              }
    
              // Links
              if (tagName === 'a' && includeLinks) {
                const text = getTextContent(node).trim();
                const href = node.getAttribute('href');
                if (text && href) {
                  try {
                    const fullHref = new URL(href, window.location.href).href;
                    return `[${text}](${fullHref})`;
                  } catch (e) {
                    return text;
                  }
                }
                return text || '';
              }
    
              // Strong/Bold
              if (tagName === 'strong' || tagName === 'b') {
                const text = getTextContent(node).trim();
                return text ? `**${text}**` : '';
              }
    
              // Emphasis/Italic
              if (tagName === 'em' || tagName === 'i') {
                const text = getTextContent(node).trim();
                return text ? `*${text}*` : '';
              }
    
              // Horizontal rule
              if (tagName === 'hr') {
                return '\n---\n\n';
              }
    
              // Tables
              if (tagName === 'table') {
                const rows = Array.from(node.querySelectorAll('tr'));
                if (rows.length > 0) {
                  rows.forEach((row, rowIdx) => {
                    const cells = Array.from(row.querySelectorAll('th, td'));
                    const cellTexts = cells.map(cell => getTextContent(cell).trim().replace(/\n/g, ' '));
                    if (cellTexts.some(t => t)) {
                      markdown += '| ' + cellTexts.join(' | ') + ' |\n';
                      if (rowIdx === 0) {
                        markdown += '| ' + cells.map(() => '---').join(' | ') + ' |\n';
                      }
                    }
                  });
                  markdown += '\n';
                }
                return markdown;
              }
    
              // Line break
              if (tagName === 'br') {
                return '\n';
              }
    
              // Container elements - process children
              if (['div', 'section', 'article', 'main', 'span', 'td', 'th', 'li'].includes(tagName)) {
                for (const child of node.childNodes) {
                  if (child.nodeType === Node.ELEMENT_NODE) {
                    markdown += convertToMarkdown(child, depth + 1, inList);
                  } else if (child.nodeType === Node.TEXT_NODE && depth === 0 && !inList) {
                    const text = child.textContent.trim();
                    if (text && text.length > 0) {
                      markdown += text + ' ';
                    }
                  }
                }
                return markdown;
              }
    
              // For any other element, try to extract text from children
              if (node.childNodes && node.childNodes.length > 0) {
                for (const child of node.childNodes) {
                  if (child.nodeType === Node.ELEMENT_NODE) {
                    markdown += convertToMarkdown(child, depth + 1, inList);
                  }
                }
              }
    
              return markdown;
            }
    
            const mainContent = extractMainContent();
            let result = convertToMarkdown(mainContent);
    
            // Clean up excessive newlines and spaces
            result = result
              .replace(/ +/g, ' ')  // Multiple spaces to single
              .replace(/\n\n\n+/g, '\n\n')  // Multiple newlines to double
              .trim();
    
            // If still empty, use fallback
            if (!result || result.length < 50) {
              const allText = mainContent.textContent.trim();
              if (allText) {
                result = allText
                  .split('\n')
                  .map(line => line.trim())
                  .filter(line => line.length > 0)
                  .join('\n\n');
              }
            }
    
            return result;
          },
          { includeImages, includeLinks }
        );
    
        await context.close();
    
        return {
          content: [
            {
              type: 'text',
              text: markdown || 'No content could be extracted from this page.',
            },
          ],
        };
      } catch (error) {
        await context.close();
        return {
          content: [
            {
              type: 'text',
              text: `Error extracting markdown: ${error.message}`,
            },
          ],
          isError: true,
        };
      }
    }
  • Input schema definition for the get_page_markdown tool, specifying parameters like url (required), includeImages, includeLinks, waitForSelector, and timeout.
    // JSON Schema for the arguments of get_page_markdown. Only `url` is required.
    inputSchema: {
      type: 'object',
      properties: {
        // Address of the page to convert.
        url: {
          type: 'string',
          description: 'The URL to extract markdown from',
        },
        // Toggle for image references in the output.
        includeImages: {
          type: 'boolean',
          description: 'Whether to include image references in markdown (default: true)',
          default: true,
        },
        // Toggle for hyperlinks in the output.
        includeLinks: {
          type: 'boolean',
          description: 'Whether to include hyperlinks in markdown (default: true)',
          default: true,
        },
        // Optional readiness signal: a CSS selector to wait for before extraction.
        waitForSelector: {
          type: 'string',
          description: 'Optional CSS selector to wait for before extracting content',
        },
        // Navigation timeout, in milliseconds.
        timeout: {
          type: 'number',
          description: 'Navigation timeout in milliseconds (default: 30000)',
          default: 30000,
        },
      },
      required: ['url'],
    },
  • markdown-mcp.js:31-67 (registration)
    Registration of the get_page_markdown tool in the ListToolsRequest handler, providing name, description, and input schema.
    // Tool discovery: report the single tool this server exposes together with
    // the JSON Schema of its arguments.
    this.server.setRequestHandler(ListToolsRequestSchema, async () => {
      const properties = {
        url: {
          type: 'string',
          description: 'The URL to extract markdown from',
        },
        includeImages: {
          type: 'boolean',
          description: 'Whether to include image references in markdown (default: true)',
          default: true,
        },
        includeLinks: {
          type: 'boolean',
          description: 'Whether to include hyperlinks in markdown (default: true)',
          default: true,
        },
        waitForSelector: {
          type: 'string',
          description: 'Optional CSS selector to wait for before extracting content',
        },
        timeout: {
          type: 'number',
          description: 'Navigation timeout in milliseconds (default: 30000)',
          default: 30000,
        },
      };

      const getPageMarkdownTool = {
        name: 'get_page_markdown',
        description: 'Extract clean markdown content from a URL. Returns only the main content without navigation, headers, footers, or sidebars.',
        inputSchema: {
          type: 'object',
          properties,
          required: ['url'],
        },
      };

      return { tools: [getPageMarkdownTool] };
    });
  • markdown-mcp.js:69-74 (registration)
    Registration of the CallToolRequest handler that dispatches calls to 'get_page_markdown' to the getPageMarkdown method.
    // Tool invocation: route `get_page_markdown` calls to getPageMarkdown and
    // reject every other tool name.
    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name: toolName, arguments: toolArgs } = request.params;

      if (toolName !== 'get_page_markdown') {
        throw new Error(`Unknown tool: ${toolName}`);
      }

      return this.getPageMarkdown(toolArgs);
    });
  • Alternative implementation of the core extraction logic in markdown-mcp-gemini.js, used by the tool handler.
    /**
     * Load `url` in a fresh browser context and return its main content as Markdown.
     *
     * @param {string} url - Page to load.
     * @param {object} [options]
     * @param {boolean} [options.includeImages=true] - Emit `![alt](src)` for images.
     * @param {boolean} [options.includeLinks=true] - Emit `[text](href)` for anchors;
     *   when false the anchor text is kept and only the URL is dropped.
     * @param {string} [options.waitForSelector] - CSS selector to wait for (up to
     *   10 s) before extracting; without it a fixed 5 s delay is used instead.
     * @param {number} [options.timeout=30000] - Navigation timeout in milliseconds.
     * @returns {Promise<string>} The Markdown, or a fixed "no content" message
     *   when nothing could be extracted.
     * @throws {Error} `Error extracting markdown: <reason>` when loading or
     *   extraction fails; the underlying error is attached as `cause`.
     */
    async function extractMarkdownContent(url, options = {}) {
      const {
        includeImages = true,
        includeLinks = true,
        waitForSelector,
        timeout = 30000,
      } = options;

      const browserInstance = await ensureBrowser();
      const context = await browserInstance.newContext({
        userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      });

      try {
        // Created inside the try so a failure here still releases the context.
        const page = await context.newPage();
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout });

        if (waitForSelector) {
          await page.waitForSelector(waitForSelector, { timeout: 10000 });
        } else {
          // Wait for content to load - especially important for JS-heavy sites
          await page.waitForTimeout(5000);
        }

        // The callback is evaluated in the page, so it must be self-contained:
        // the two flags are handed over explicitly as the evaluate argument.
        const markdown = await page.evaluate(
          ({ includeImages, includeLinks }) => {
            // Returns the first candidate container with more than 100 characters
            // of text, falling back to <body>.
            function extractMainContent() {
              // Confluence-specific selectors first, then general ones
              const mainSelectors = [
                '#main-content',
                '.wiki-content',
                '[data-test-id="wiki-content"]',
                'main[role="main"]',
                'main',
                'article',
                '[role="main"]',
                '.main-content',
                '.content',
                '#content',
                '.post-content',
                '.article-content',
                'body',
              ];

              for (const selector of mainSelectors) {
                const element = document.querySelector(selector);
                if (element && element.textContent.trim().length > 100) {
                  return element;
                }
              }

              return document.body;
            }

            // Decides whether an element (and its whole subtree) is page chrome
            // or otherwise not part of the readable content.
            function shouldSkipElement(element) {
              if (!element || !element.tagName) return true;

              const tagName = element.tagName.toLowerCase();

              // Never skip these content elements
              if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table', 'pre', 'code', 'blockquote'].includes(tagName)) {
                return false;
              }

              // Check for hidden elements
              if (element.offsetParent === null && tagName !== 'script' && tagName !== 'style') {
                const style = window.getComputedStyle(element);
                if (style.display === 'none' || style.visibility === 'hidden') {
                  return true;
                }
              }

              // Skip technical elements
              if (['script', 'style', 'noscript', 'iframe'].includes(tagName)) {
                return true;
              }

              // Check role attributes
              const role = element.getAttribute('role');
              if (['navigation', 'banner', 'contentinfo', 'complementary'].includes(role)) {
                return true;
              }

              // Check specific element types
              if (tagName === 'nav' || tagName === 'header' || tagName === 'footer' || tagName === 'aside') {
                return true;
              }

              // Check class and id for common patterns (but be less aggressive)
              const className = (element.className || '').toString().toLowerCase();
              const id = (element.id || '').toLowerCase();
              const combined = className + ' ' + id;

              const strictSkipPatterns = [
                'cookie-banner',
                'gdpr',
                'advertisement',
                'sponsored',
              ];

              return strictSkipPatterns.some(pattern => combined.includes(pattern));
            }

            // Plain text of a subtree: <br> becomes a newline and skipped
            // elements contribute nothing.
            function getTextContent(node) {
              let text = '';
              for (const child of node.childNodes) {
                if (child.nodeType === Node.TEXT_NODE) {
                  text += child.textContent;
                } else if (child.nodeType === Node.ELEMENT_NODE) {
                  const tag = child.tagName.toLowerCase();
                  if (tag === 'br') {
                    text += '\n';
                  } else if (!shouldSkipElement(child)) {
                    text += getTextContent(child);
                  }
                }
              }
              return text;
            }

            // Recursively renders an element as Markdown.
            //   depth  - nesting level below the chosen root element
            //   inList - true while rendering the contents of a list item
            //   inline - true while rendering the contents of a paragraph or list
            //            item, where loose text nodes belong to the sentence and
            //            must be kept whatever element wraps them
            function convertToMarkdown(node, depth = 0, inList = false, inline = false) {
              if (!node || shouldSkipElement(node)) return '';

              let markdown = '';
              const tagName = node.tagName.toLowerCase();

              // Headings
              if (/^h[1-6]$/.test(tagName)) {
                const level = Number.parseInt(tagName[1], 10);
                const text = getTextContent(node).trim();
                if (text) {
                  markdown += '\n' + '#'.repeat(level) + ' ' + text + '\n\n';
                }
                return markdown;
              }

              // Paragraphs
              if (tagName === 'p') {
                let content = '';
                for (const child of node.childNodes) {
                  if (child.nodeType === Node.TEXT_NODE) {
                    content += child.textContent;
                  } else if (child.nodeType === Node.ELEMENT_NODE) {
                    content += convertToMarkdown(child, depth + 1, false, true);
                  }
                }
                const text = content.trim();
                if (text) {
                  markdown += text + '\n\n';
                }
                return markdown;
              }

              // Code blocks
              if (tagName === 'pre') {
                const code = node.querySelector('code');
                const text = (code || node).textContent.trim();
                if (text) {
                  const language = code?.className.match(/language-(\w+)/)?.[1] || '';
                  markdown += '\n```' + language + '\n' + text + '\n```\n\n';
                }
                return markdown;
              }

              // Inline code
              if (tagName === 'code' && node.parentElement?.tagName !== 'PRE') {
                return '`' + node.textContent.trim() + '`';
              }

              // Blockquotes
              if (tagName === 'blockquote') {
                const text = getTextContent(node).trim();
                if (text) {
                  const lines = text.split('\n').filter(l => l.trim());
                  markdown += '\n' + lines.map(line => '> ' + line.trim()).join('\n') + '\n\n';
                }
                return markdown;
              }

              // Lists
              if (tagName === 'ul' || tagName === 'ol') {
                const items = Array.from(node.children).filter(child => child.tagName === 'LI');
                items.forEach((li, idx) => {
                  const prefix = tagName === 'ol' ? `${idx + 1}. ` : '- ';
                  let itemContent = '';
                  for (const child of li.childNodes) {
                    if (child.nodeType === Node.TEXT_NODE) {
                      itemContent += child.textContent;
                    } else if (child.nodeType === Node.ELEMENT_NODE) {
                      itemContent += convertToMarkdown(child, depth + 1, true, true);
                    }
                  }
                  const text = itemContent.trim();
                  if (text) {
                    markdown += prefix + text + '\n';
                  }
                });
                if (!inList) markdown += '\n';
                return markdown;
              }

              // Images
              if (tagName === 'img') {
                if (!includeImages) return '';
                const alt = node.getAttribute('alt') || '';
                const src = node.getAttribute('src') || node.getAttribute('data-src') || '';
                if (src) {
                  try {
                    const fullSrc = new URL(src, window.location.href).href;
                    markdown += `![${alt}](${fullSrc})\n\n`;
                  } catch {
                    // Unresolvable image URL: leave the image out.
                  }
                }
                return markdown;
              }

              // Links
              if (tagName === 'a') {
                const text = getTextContent(node).trim();
                // With links disabled the anchor text is still content; only
                // the URL is dropped.
                if (!includeLinks) return text;
                const href = node.getAttribute('href');
                if (text && href) {
                  try {
                    const fullHref = new URL(href, window.location.href).href;
                    return `[${text}](${fullHref})`;
                  } catch {
                    return text;
                  }
                }
                return text;
              }

              // Strong/Bold
              if (tagName === 'strong' || tagName === 'b') {
                const text = getTextContent(node).trim();
                return text ? `**${text}**` : '';
              }

              // Emphasis/Italic
              if (tagName === 'em' || tagName === 'i') {
                const text = getTextContent(node).trim();
                return text ? `*${text}*` : '';
              }

              // Horizontal rule
              if (tagName === 'hr') {
                return '\n---\n\n';
              }

              // Tables
              if (tagName === 'table') {
                const rows = Array.from(node.querySelectorAll('tr'));
                if (rows.length > 0) {
                  rows.forEach((row, rowIdx) => {
                    const cells = Array.from(row.querySelectorAll('th, td'));
                    const cellTexts = cells.map(cell => getTextContent(cell).trim().replace(/\n/g, ' '));
                    if (cellTexts.some(t => t)) {
                      markdown += '| ' + cellTexts.join(' | ') + ' |\n';
                      if (rowIdx === 0) {
                        markdown += '| ' + cells.map(() => '---').join(' | ') + ' |\n';
                      }
                    }
                  });
                  markdown += '\n';
                }
                return markdown;
              }

              // Line break
              if (tagName === 'br') {
                return '\n';
              }

              // Container elements - process children
              if (['div', 'section', 'article', 'main', 'span', 'td', 'th', 'li'].includes(tagName)) {
                for (const child of node.childNodes) {
                  if (child.nodeType === Node.ELEMENT_NODE) {
                    markdown += convertToMarkdown(child, depth + 1, inList, inline);
                  } else if (child.nodeType === Node.TEXT_NODE) {
                    if (inline) {
                      // Part of a sentence: keep verbatim so word spacing survives.
                      markdown += child.textContent;
                    } else if (depth === 0 && !inList) {
                      const text = child.textContent.trim();
                      if (text) {
                        markdown += text + ' ';
                      }
                    }
                  }
                }
                // Keep the text of adjacent block wrappers inside a list item
                // from running together; surplus spaces are collapsed later.
                if (inline && tagName !== 'span' && markdown) {
                  markdown = ' ' + markdown + ' ';
                }
                return markdown;
              }

              // For any other element, descend into its children; loose text
              // is only kept in inline context.
              for (const child of node.childNodes) {
                if (child.nodeType === Node.ELEMENT_NODE) {
                  markdown += convertToMarkdown(child, depth + 1, inList, inline);
                } else if (inline && child.nodeType === Node.TEXT_NODE) {
                  markdown += child.textContent;
                }
              }

              return markdown;
            }

            const mainContent = extractMainContent();
            let result = convertToMarkdown(mainContent);

            // Collapse runs of spaces everywhere except inside fenced code
            // blocks, where they are indentation. Splitting on a capturing
            // pattern leaves the fenced blocks at the odd indices.
            result = result
              .split(/(\n```[\s\S]*?\n```\n)/)
              .map((segment, index) => (index % 2 === 1 ? segment : segment.replace(/ +/g, ' ')))
              .join('')
              .replace(/\n\n\n+/g, '\n\n')  // Multiple newlines to double
              .trim();

            // If still empty, use fallback
            if (!result || result.length < 50) {
              const allText = mainContent.textContent.trim();
              if (allText) {
                result = allText
                  .split('\n')
                  .map(line => line.trim())
                  .filter(line => line.length > 0)
                  .join('\n\n');
              }
            }

            return result;
          },
          { includeImages, includeLinks }
        );

        return markdown || 'No content could be extracted from this page.';
      } catch (error) {
        throw new Error(`Error extracting markdown: ${error.message}`, { cause: error });
      } finally {
        // Single cleanup point for every exit path. A failing close is ignored
        // on purpose so it cannot replace the result or error produced above.
        try {
          await context.close();
        } catch {
          // best-effort cleanup
        }
      }
    }
Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vishwajeetdabholkar/markdown-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.