Skip to main content
Glama
SAhmadUmass

Notion MCP Server

by SAhmadUmass

extract-url-metadata

Extract metadata such as publication, author, date, and summary from URLs stored in a Notion database. Automate data enrichment and enhance content organization within your Notion workspace.

Input Schema

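Table | JSON Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| authorPropertyName | No | | |
| batchSize | No | | 5 |
| databaseId | Yes | | |
| datePropertyName | No | | |
| generateSummary | No | | true |
| limit | No | | 50 |
| publicationPropertyName | No | | |
| silentErrors | No | | true |
| summaryPropertyName | No | | |
| urlPropertyName | No | | |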

Implementation Reference

  • src/index.ts:489-743 (registration)
    Registration of the 'extract-url-metadata' tool including inline input schema (lines 491-502) and complete handler function that processes Notion database pages, auto-detects URL properties, extracts metadata from URLs using helper functions, and updates Notion page properties with publication, author, date, and summary.
    /**
     * Registers the "extract-url-metadata" tool.
     *
     * The handler reads rows from a Notion database, takes the URL stored in each row,
     * extracts publication / author / date / content from that URL and writes the values
     * back to the row's properties. Extracted content is also appended to the page body
     * when the page has fewer than 3 blocks.
     *
     * Input:
     *   - databaseId: database to process (required).
     *   - urlPropertyName, publicationPropertyName, authorPropertyName, datePropertyName,
     *     summaryPropertyName: property names to use; each is auto-detected with
     *     findMatchingProperty when omitted.
     *   - batchSize (default 5): rows processed concurrently per batch; values below 1
     *     or fractional values are normalized to a whole number >= 1.
     *   - limit (default 50): maximum number of rows to process; values above 100 are
     *     fetched through several paginated queries.
     *   - generateSummary (default true): whether to write a summary property.
     *   - silentErrors (default true): when false, the first error on a row is reported
     *     in detail and processing of that row stops.
     *
     * Returns a text report with success/failure counts and one line per row. Errors
     * outside the per-row processing produce a response flagged with `isError: true`.
     */
    server.tool(
      "extract-url-metadata",
      { 
        databaseId: z.string(),
        urlPropertyName: z.string().optional(),
        publicationPropertyName: z.string().optional(),
        authorPropertyName: z.string().optional(),
        datePropertyName: z.string().optional(),
        summaryPropertyName: z.string().optional(),
        batchSize: z.number().default(5),
        limit: z.number().default(50),
        generateSummary: z.boolean().default(true),
        silentErrors: z.boolean().default(true)
      },
      async ({ 
        databaseId, 
        urlPropertyName, 
        publicationPropertyName, 
        authorPropertyName, 
        datePropertyName, 
        summaryPropertyName,
        batchSize,
        limit,
        generateSummary,
        silentErrors
      }) => {
        try {
          // Retrieve the database first so the type of each target property is known
          const databaseInfo = await notion.databases.retrieve({
            database_id: databaseId
          });
          
          // Property name -> property definition, as reported by the database
          const propertyInfoMap = databaseInfo.properties || {};
          
          // Use the caller-supplied property names, otherwise auto-detect them
          const urlPropertyName2 = urlPropertyName || findMatchingProperty(propertyInfoMap, [
            "URL", "Link", "Website", "Address", "Source Link"
          ]);
          
          const publicationProperty = publicationPropertyName || findMatchingProperty(propertyInfoMap, [
            "Publication", "Publisher", "Source", "Site", "Website Name", "Origin"
          ]);
          
          const authorProperty = authorPropertyName || findMatchingProperty(propertyInfoMap, [
            "Author", "Author(s)", "Writer", "Creator", "By"
          ]);
          
          const dateProperty = datePropertyName || findMatchingProperty(propertyInfoMap, [
            "Date", "Published", "Published Date", "Publish Date", "Release Date", "Post Date"
          ]);
          
          const summaryProperty = summaryPropertyName || findMatchingProperty(propertyInfoMap, [
            "Summary", "Article Summary", "TLDR", "Description", "Brief"
          ]);
          
          // Resolve the type of every target property; it decides how values are written
          const publicationPropertyType = getPropertyType(propertyInfoMap, publicationProperty);
          const authorPropertyType = getPropertyType(propertyInfoMap, authorProperty);
          const datePropertyType = getPropertyType(propertyInfoMap, dateProperty);
          const summaryPropertyType = getPropertyType(propertyInfoMap, summaryProperty);
          
          // Normalize the numeric options. A batch size below 1 would keep the batch loop
          // from ever advancing, and a fractional one would yield uneven or empty batches.
          const effectiveBatchSize = Math.max(1, Math.floor(batchSize) || 1);
          const maxRows = Math.max(0, Math.floor(limit) || 0);
          
          // Notion caps page_size at 100, so collect the rows page by page until
          // `limit` rows are gathered or the database has no more results.
          const NOTION_MAX_PAGE_SIZE = 100;
          const pages: any[] = [];
          let startCursor: string | undefined = undefined;
          while (pages.length < maxRows) {
            // Annotated as `any` so the cursor assignment below does not make the
            // inferred type of `queryResponse` circular.
            const queryResponse: any = await notion.databases.query({
              database_id: databaseId,
              page_size: Math.min(NOTION_MAX_PAGE_SIZE, maxRows - pages.length),
              start_cursor: startCursor
            });
            
            pages.push(...queryResponse.results);
            
            if (!queryResponse.has_more || !queryResponse.next_cursor) {
              break;
            }
            startCursor = queryResponse.next_cursor;
          }
          
          const results: string[] = [];
          let successCount = 0;
          let failureCount = 0;
          
          // Counts the row as failed and passes its report line through, so rows that
          // stop early on an error are included in the totals.
          const fail = (message: string): string => {
            failureCount++;
            return message;
          };
          
          // Log the property mapping being used
          results.push(`Using field mapping:
    - URLs: "${urlPropertyName2}" (${getPropertyType(propertyInfoMap, urlPropertyName2)})
    - Publication: "${publicationProperty}" (${publicationPropertyType})
    - Author: "${authorProperty}" (${authorPropertyType})
    - Date: "${dateProperty}" (${datePropertyType})
    - Summary: "${summaryProperty}" (${summaryPropertyType})`);
          
          // Process rows in batches
          for (let i = 0; i < pages.length; i += effectiveBatchSize) {
            const batch = pages.slice(i, i + effectiveBatchSize);
            
            // Process each row in the batch concurrently
            const batchPromises = batch.map(async (page: any) => {
              try {
                // Extract URL from the specified property
                const urlPropertyValue = page.properties[urlPropertyName2];
                let url = null;
                
                // The URL may live in a url, rich_text or title property
                if (urlPropertyValue?.type === 'url' && urlPropertyValue.url) {
                  url = urlPropertyValue.url;
                } else if (urlPropertyValue?.type === 'rich_text' && urlPropertyValue.rich_text.length > 0) {
                  url = urlPropertyValue.rich_text[0]?.plain_text;
                } else if (urlPropertyValue?.type === 'title' && urlPropertyValue.title.length > 0) {
                  url = urlPropertyValue.title[0]?.plain_text;
                }
                
                // Rows without a usable URL are reported but not counted as processed URLs
                if (!url || !url.startsWith('http')) {
                  return `Row ${page.id}: No valid URL found in property "${urlPropertyName2}"`;
                }
                
                // Fetch and extract metadata
                const metadata = await extractMetadataFromUrl(url);
                
                // Property values to write back to the row
                const properties: any = {};
                
                // Handle publication based on property type
                if (metadata.publication && publicationPropertyType) {
                  try {
                    if (publicationPropertyType === 'select') {
                      properties[publicationProperty] = createSelectProperty(metadata.publication);
                    } else if (publicationPropertyType === 'rich_text') {
                      properties[publicationProperty] = createRichTextProperty(metadata.publication);
                    } else if (publicationPropertyType === 'title') {
                      properties[publicationProperty] = createTitleProperty(metadata.publication);
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${publicationProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // Handle author based on property type
                if (metadata.author && authorPropertyType) {
                  try {
                    if (authorPropertyType === 'multi_select') {
                      properties[authorProperty] = createMultiSelectProperty(parseAuthors(metadata.author));
                    } else if (authorPropertyType === 'select') {
                      properties[authorProperty] = createSelectProperty(metadata.author);
                    } else if (authorPropertyType === 'rich_text') {
                      properties[authorProperty] = createRichTextProperty(metadata.author);
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${authorProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // The date is only written to a property of type 'date'
                if (metadata.date && datePropertyType === 'date') {
                  try {
                    properties[dateProperty] = createDateProperty(metadata.date);
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${dateProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // Get content for page update and summary generation
                let content = metadata.content || '';
                let summary = '';
                
                // Generate summary using extracted content
                if (content && generateSummary && summaryPropertyType) {
                  try {
                    // For now, use a simple summarization method
                    summary = createSimpleSummary(content);
                    
                    // Add summary to properties based on property type
                    if (summaryPropertyType === 'rich_text') {
                      properties[summaryProperty] = createRichTextProperty(summary);
                    } else if (summaryPropertyType === 'select') {
                      properties[summaryProperty] = createSelectProperty(summary);
                    } else if (summaryPropertyType === 'multi_select') {
                      properties[summaryProperty] = createMultiSelectProperty([summary]);
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${summaryProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // Update the page properties if we have any to update
                if (Object.keys(properties).length > 0) {
                  try {
                    await notion.pages.update({
                      page_id: page.id,
                      properties: properties
                    });
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error updating properties: ${err.message}`);
                    }
                    // If we fail to update properties, we'll still try to update content
                  }
                }
                
                // Append the extracted content to the page body when the page is nearly empty
                if (content) {
                  try {
                    // Get existing blocks
                    const blocks = await notion.blocks.children.list({
                      block_id: page.id
                    });
                    
                    // Only update if there are no blocks or fewer than 3 (assuming just a title)
                    if (blocks.results.length < 3) {
                      // Create content blocks (paragraphs)
                      const contentBlocks = createContentBlocks(content);
                      
                      await notion.blocks.children.append({
                        block_id: page.id,
                        children: contentBlocks
                      });
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error updating content: ${err.message}`);
                    }
                  }
                }
                
                successCount++;
                return `Row ${page.id}: Successfully extracted metadata from ${url}`;
              } catch (error: any) {
                // Anything not handled above (e.g. the page fetch itself failing)
                failureCount++;
                return `Row ${page.id}: Failed to extract metadata - ${silentErrors ? 'Error occurred' : error.message}`;
              }
            });
            
            // Wait for all pages in the batch to be processed
            const batchResults = await Promise.all(batchPromises);
            results.push(...batchResults);
            
            // Add a small delay between batches to avoid rate limiting
            if (i + effectiveBatchSize < pages.length) {
              await new Promise(resolve => setTimeout(resolve, 1000));
            }
          }
          
          return {
            content: [{
              type: "text",
              text: `Processed ${successCount + failureCount} URLs\n${successCount} successful\n${failureCount} failed\n\nDetails:\n${results.join('\n')}`
            }]
          };
        } catch (error: any) {
          return {
            content: [{
              type: "text",
              text: `Error extracting metadata: ${error.message}`
            }],
            isError: true
          };
        }
      }
    );
  • The core handler function for 'extract-url-metadata': queries Notion database, finds URLs in specified property, fetches each URL to extract metadata (publication, author, date, content/summary), updates the corresponding Notion pages with extracted data respecting property types.
      /**
       * Handler for the 'extract-url-metadata' tool.
       *
       * Reads rows from the Notion database `databaseId`, takes the URL stored in each
       * row, extracts publication / author / date / content from that URL and writes the
       * values back to the row's properties. Extracted content is also appended to the
       * page body when the page has fewer than 3 blocks.
       *
       * Property names that are not supplied are auto-detected with findMatchingProperty.
       * `batchSize` is normalized to a whole number >= 1, and `limit` values above 100
       * are fetched through several paginated queries. When `silentErrors` is false, the
       * first error on a row is reported in detail and processing of that row stops.
       *
       * Returns a text report with success/failure counts and one line per row. Errors
       * outside the per-row processing produce a response flagged with `isError: true`.
       */
      async ({ 
        databaseId, 
        urlPropertyName, 
        publicationPropertyName, 
        authorPropertyName, 
        datePropertyName, 
        summaryPropertyName,
        batchSize,
        limit,
        generateSummary,
        silentErrors
      }) => {
        try {
          // Retrieve the database first so the type of each target property is known
          const databaseInfo = await notion.databases.retrieve({
            database_id: databaseId
          });
          
          // Property name -> property definition, as reported by the database
          const propertyInfoMap = databaseInfo.properties || {};
          
          // Use the caller-supplied property names, otherwise auto-detect them
          const urlPropertyName2 = urlPropertyName || findMatchingProperty(propertyInfoMap, [
            "URL", "Link", "Website", "Address", "Source Link"
          ]);
          
          const publicationProperty = publicationPropertyName || findMatchingProperty(propertyInfoMap, [
            "Publication", "Publisher", "Source", "Site", "Website Name", "Origin"
          ]);
          
          const authorProperty = authorPropertyName || findMatchingProperty(propertyInfoMap, [
            "Author", "Author(s)", "Writer", "Creator", "By"
          ]);
          
          const dateProperty = datePropertyName || findMatchingProperty(propertyInfoMap, [
            "Date", "Published", "Published Date", "Publish Date", "Release Date", "Post Date"
          ]);
          
          const summaryProperty = summaryPropertyName || findMatchingProperty(propertyInfoMap, [
            "Summary", "Article Summary", "TLDR", "Description", "Brief"
          ]);
          
          // Resolve the type of every target property; it decides how values are written
          const publicationPropertyType = getPropertyType(propertyInfoMap, publicationProperty);
          const authorPropertyType = getPropertyType(propertyInfoMap, authorProperty);
          const datePropertyType = getPropertyType(propertyInfoMap, dateProperty);
          const summaryPropertyType = getPropertyType(propertyInfoMap, summaryProperty);
          
          // Normalize the numeric options. A batch size below 1 would keep the batch loop
          // from ever advancing, and a fractional one would yield uneven or empty batches.
          const effectiveBatchSize = Math.max(1, Math.floor(batchSize) || 1);
          const maxRows = Math.max(0, Math.floor(limit) || 0);
          
          // Notion caps page_size at 100, so collect the rows page by page until
          // `limit` rows are gathered or the database has no more results.
          const NOTION_MAX_PAGE_SIZE = 100;
          const pages: any[] = [];
          let startCursor: string | undefined = undefined;
          while (pages.length < maxRows) {
            // Annotated as `any` so the cursor assignment below does not make the
            // inferred type of `queryResponse` circular.
            const queryResponse: any = await notion.databases.query({
              database_id: databaseId,
              page_size: Math.min(NOTION_MAX_PAGE_SIZE, maxRows - pages.length),
              start_cursor: startCursor
            });
            
            pages.push(...queryResponse.results);
            
            if (!queryResponse.has_more || !queryResponse.next_cursor) {
              break;
            }
            startCursor = queryResponse.next_cursor;
          }
          
          const results: string[] = [];
          let successCount = 0;
          let failureCount = 0;
          
          // Counts the row as failed and passes its report line through, so rows that
          // stop early on an error are included in the totals.
          const fail = (message: string): string => {
            failureCount++;
            return message;
          };
          
          // Log the property mapping being used
          results.push(`Using field mapping:
    - URLs: "${urlPropertyName2}" (${getPropertyType(propertyInfoMap, urlPropertyName2)})
    - Publication: "${publicationProperty}" (${publicationPropertyType})
    - Author: "${authorProperty}" (${authorPropertyType})
    - Date: "${dateProperty}" (${datePropertyType})
    - Summary: "${summaryProperty}" (${summaryPropertyType})`);
          
          // Process rows in batches
          for (let i = 0; i < pages.length; i += effectiveBatchSize) {
            const batch = pages.slice(i, i + effectiveBatchSize);
            
            // Process each row in the batch concurrently
            const batchPromises = batch.map(async (page: any) => {
              try {
                // Extract URL from the specified property
                const urlPropertyValue = page.properties[urlPropertyName2];
                let url = null;
                
                // The URL may live in a url, rich_text or title property
                if (urlPropertyValue?.type === 'url' && urlPropertyValue.url) {
                  url = urlPropertyValue.url;
                } else if (urlPropertyValue?.type === 'rich_text' && urlPropertyValue.rich_text.length > 0) {
                  url = urlPropertyValue.rich_text[0]?.plain_text;
                } else if (urlPropertyValue?.type === 'title' && urlPropertyValue.title.length > 0) {
                  url = urlPropertyValue.title[0]?.plain_text;
                }
                
                // Rows without a usable URL are reported but not counted as processed URLs
                if (!url || !url.startsWith('http')) {
                  return `Row ${page.id}: No valid URL found in property "${urlPropertyName2}"`;
                }
                
                // Fetch and extract metadata
                const metadata = await extractMetadataFromUrl(url);
                
                // Property values to write back to the row
                const properties: any = {};
                
                // Handle publication based on property type
                if (metadata.publication && publicationPropertyType) {
                  try {
                    if (publicationPropertyType === 'select') {
                      properties[publicationProperty] = createSelectProperty(metadata.publication);
                    } else if (publicationPropertyType === 'rich_text') {
                      properties[publicationProperty] = createRichTextProperty(metadata.publication);
                    } else if (publicationPropertyType === 'title') {
                      properties[publicationProperty] = createTitleProperty(metadata.publication);
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${publicationProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // Handle author based on property type
                if (metadata.author && authorPropertyType) {
                  try {
                    if (authorPropertyType === 'multi_select') {
                      properties[authorProperty] = createMultiSelectProperty(parseAuthors(metadata.author));
                    } else if (authorPropertyType === 'select') {
                      properties[authorProperty] = createSelectProperty(metadata.author);
                    } else if (authorPropertyType === 'rich_text') {
                      properties[authorProperty] = createRichTextProperty(metadata.author);
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${authorProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // The date is only written to a property of type 'date'
                if (metadata.date && datePropertyType === 'date') {
                  try {
                    properties[dateProperty] = createDateProperty(metadata.date);
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${dateProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // Get content for page update and summary generation
                let content = metadata.content || '';
                let summary = '';
                
                // Generate summary using extracted content
                if (content && generateSummary && summaryPropertyType) {
                  try {
                    // For now, use a simple summarization method
                    summary = createSimpleSummary(content);
                    
                    // Add summary to properties based on property type
                    if (summaryPropertyType === 'rich_text') {
                      properties[summaryProperty] = createRichTextProperty(summary);
                    } else if (summaryPropertyType === 'select') {
                      properties[summaryProperty] = createSelectProperty(summary);
                    } else if (summaryPropertyType === 'multi_select') {
                      properties[summaryProperty] = createMultiSelectProperty([summary]);
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error setting ${summaryProperty} property: ${err.message}`);
                    }
                  }
                }
                
                // Update the page properties if we have any to update
                if (Object.keys(properties).length > 0) {
                  try {
                    await notion.pages.update({
                      page_id: page.id,
                      properties: properties
                    });
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error updating properties: ${err.message}`);
                    }
                    // If we fail to update properties, we'll still try to update content
                  }
                }
                
                // Append the extracted content to the page body when the page is nearly empty
                if (content) {
                  try {
                    // Get existing blocks
                    const blocks = await notion.blocks.children.list({
                      block_id: page.id
                    });
                    
                    // Only update if there are no blocks or fewer than 3 (assuming just a title)
                    if (blocks.results.length < 3) {
                      // Create content blocks (paragraphs)
                      const contentBlocks = createContentBlocks(content);
                      
                      await notion.blocks.children.append({
                        block_id: page.id,
                        children: contentBlocks
                      });
                    }
                  } catch (err: any) {
                    if (!silentErrors) {
                      return fail(`Row ${page.id}: Error updating content: ${err.message}`);
                    }
                  }
                }
                
                successCount++;
                return `Row ${page.id}: Successfully extracted metadata from ${url}`;
              } catch (error: any) {
                // Anything not handled above (e.g. the page fetch itself failing)
                failureCount++;
                return `Row ${page.id}: Failed to extract metadata - ${silentErrors ? 'Error occurred' : error.message}`;
              }
            });
            
            // Wait for all pages in the batch to be processed
            const batchResults = await Promise.all(batchPromises);
            results.push(...batchResults);
            
            // Add a small delay between batches to avoid rate limiting
            if (i + effectiveBatchSize < pages.length) {
              await new Promise(resolve => setTimeout(resolve, 1000));
            }
          }
          
          return {
            content: [{
              type: "text",
              text: `Processed ${successCount + failureCount} URLs\n${successCount} successful\n${failureCount} failed\n\nDetails:\n${results.join('\n')}`
            }]
          };
        } catch (error: any) {
          return {
            content: [{
              type: "text",
              text: `Error extracting metadata: ${error.message}`
            }],
            isError: true
          };
        }
      }
  • Key helper that orchestrates URL metadata extraction: fetches HTML with axios, parses with Cheerio, extracts publication/author/date/content using specialized helpers.
    /**
     * Downloads the page at `url` and derives article metadata from its HTML.
     *
     * The request is sent with browser-like headers, a 10 second timeout and at most
     * 5 redirects. Errors raised by the request are not caught here and propagate to
     * the caller.
     *
     * @param url - Address of the page to fetch.
     * @returns An object with the extracted `publication`, `author`, `date` and `content`.
     */
    async function extractMetadataFromUrl(url: string) {
      // Request settings: browser-like headers plus timeout and redirect limits
      const requestOptions = {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml',
          'Accept-Language': 'en-US,en;q=0.9'
        },
        timeout: 10000,
        maxRedirects: 5
      };
      
      // Download the page and load its markup into Cheerio
      const { data: html } = await axios.get(url, requestOptions);
      const $ = cheerio.load(html);
      
      // Each field is produced by its own dedicated extractor
      return {
        publication: extractPublication($, url),
        author: extractAuthor($),
        date: extractDate($),
        content: extractContent($)
      };
    }
  • Helper to auto-detect Notion database property names (e.g., for URL, publication, author) by matching against common names.
    /**
     * Picks the database property that best corresponds to a list of candidate names.
     *
     * Matching is attempted in three passes, each walking `possibleNames` in order:
     *   1. exact match (returns the candidate name),
     *   2. case-insensitive match (returns the property's own spelling),
     *   3. partial match, where either name contains the other, ignoring case.
     *
     * @param propertyInfoMap - Object whose keys are the available property names.
     * @param possibleNames - Candidate names, most preferred first.
     * @returns The matched property name, or the first candidate when nothing matches.
     */
    function findMatchingProperty(propertyInfoMap: any, possibleNames: string[]): string {
      const propertyKeys = Object.keys(propertyInfoMap);
      
      // Walks the candidates in priority order and returns the first property whose
      // lower-cased name satisfies `matches` against the lower-cased candidate.
      const firstMatch = (matches: (key: string, wanted: string) => boolean): string | undefined => {
        for (const candidate of possibleNames) {
          const wanted = candidate.toLowerCase();
          const hit = propertyKeys.find(key => matches(key.toLowerCase(), wanted));
          if (hit) {
            return hit;
          }
        }
        return undefined;
      };
      
      // Pass 1: a candidate that exists verbatim among the properties
      const exact = possibleNames.find(candidate => propertyKeys.includes(candidate));
      if (exact !== undefined) {
        return exact;
      }
      
      // Pass 2: same name apart from letter case
      const caseInsensitive = firstMatch((key, wanted) => key === wanted);
      if (caseInsensitive) {
        return caseInsensitive;
      }
      
      // Pass 3: one name contained in the other
      const partial = firstMatch((key, wanted) => key.includes(wanted) || wanted.includes(key));
      if (partial) {
        return partial;
      }
      
      // Nothing matched: fall back to the most preferred candidate
      return possibleNames[0];
    }
  • Helper extracts publication/site name prioritizing structured data (OG, JSON-LD), then headers/domain.
    /**
     * Determines the publication (site) name for a parsed page.
     *
     * Sources are tried in order and the first non-empty value wins:
     *   1. the Open Graph `og:site_name` meta tag,
     *   2. a publisher found in a JSON-LD script (via extractPublisherFromJsonLd),
     *   3. the `publisher`, `application-name` or `og:site` meta tags,
     *   4. common header/logo elements (image alt text or element text, under 50 chars),
     *   5. the first label of the URL's hostname, with dashes turned into spaces and
     *      each word capitalized.
     *
     * @param $ - Cheerio handle for the page's HTML.
     * @param url - Address the page was fetched from; used for the hostname fallback.
     * @returns The publication name, or 'Unknown Publication' when `url` cannot be parsed.
     */
    function extractPublication($: cheerio.CheerioAPI, url: string): string {
      // Try Open Graph site_name (highest confidence)
      const ogSiteName = $('meta[property="og:site_name"]').attr('content');
      if (ogSiteName) {
        return ogSiteName;
      }
      
      // Try JSON-LD for publisher name (high confidence)
      const jsonLdScripts = $('script[type="application/ld+json"]');
      if (jsonLdScripts.length) {
        let publisher = '';
        jsonLdScripts.each((_index, el) => {
          try {
            const scriptContent = $(el).html() || '';
            const cleanedJson = sanitizeJsonString(scriptContent);
            
            if (!cleanedJson) return; // Nothing parseable in this script; try the next one
            
            const possiblePublisher = extractPublisherFromJsonLd(JSON.parse(cleanedJson));
            if (possiblePublisher) {
              publisher = possiblePublisher;
              return false; // Returning false stops the each() iteration
            }
          } catch {
            // Best effort: a malformed JSON-LD script is skipped, later sources still apply
          }
        });
        
        if (publisher) return publisher;
      }
      
      // Try other common meta tags (medium confidence)
      const publisherMeta = $('meta[name="publisher"]').attr('content') ||
                            $('meta[name="application-name"]').attr('content') ||
                            $('meta[property="og:site"]').attr('content');
      
      if (publisherMeta) {
        return publisherMeta;
      }
      
      // Try to find publication name in the site header (medium confidence)
      const headerSelectors = [
        'header .logo', 'header .site-title', 'header .brand', 
        '.site-title', '.logo img', '.logo', '.brand', 
        '#logo', '[itemprop="publisher"]'
      ];
      
      for (const selector of headerSelectors) {
        const headerElement = $(selector).first();
        if (headerElement.length) {
          // For an image, prefer its alt text
          if (headerElement.is('img')) {
            const alt = headerElement.attr('alt');
            if (alt && alt.length < 50) {
              return alt;
            }
          }
          
          // Otherwise use text content; long text is unlikely to be just a name
          const text = headerElement.text().trim();
          if (text && text.length < 50) {
            return text;
          }
        }
      }
      
      // Extract from domain as fallback (low confidence)
      try {
        const domain = new URL(url).hostname.replace(/^www\./, '');
        
        // split() always yields at least one element, so the first label always exists;
        // the default only satisfies the type checker.
        const [firstLabel = domain] = domain.split('.');
        
        return firstLabel
          .replace(/-/g, ' ')
          .split(' ')
          .map(word => word.charAt(0).toUpperCase() + word.slice(1))
          .join(' ');
      } catch {
        // new URL() throws on an unparseable address
        return 'Unknown Publication';
      }
    }
Install Server

Other Tools

Related Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/SAhmadUmass/notion-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.