extract-url-metadata
Extract metadata such as publication, author, date, and summary from URLs stored in a Notion database. Automate data enrichment and enhance content organization within your Notion workspace.
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
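| authorPropertyName | No | Name of the Notion property that receives the author; auto-detected (e.g. "Author", "Writer", "Creator") when omitted. | |
| batchSize | No | Number of rows processed concurrently in each batch. | 5 |
| databaseId | Yes | ID of the Notion database whose rows are processed. | |
| datePropertyName | No | Name of the Notion property that receives the publication date; auto-detected (e.g. "Date", "Published") when omitted. | |
| generateSummary | No | Whether to generate a summary from the extracted page content. | true |
| limit | No | Maximum number of database rows to query. | 50 |
| publicationPropertyName | No | Name of the Notion property that receives the publication; auto-detected (e.g. "Publication", "Publisher", "Source") when omitted. | |
| silentErrors | No | When true, per-row error details are suppressed and processing continues. | true |
| summaryPropertyName | No | Name of the Notion property that receives the summary; auto-detected (e.g. "Summary", "TLDR", "Description") when omitted. | |
| urlPropertyName | No | Name of the Notion property that holds the URL; auto-detected (e.g. "URL", "Link", "Website") when omitted. | |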
Implementation Reference
/*
 * src/index.ts:489-743 (registration)
 * Registration of the 'extract-url-metadata' tool including inline input schema
 * (lines 491-502) and complete handler function that processes Notion database
 * pages, auto-detects URL properties, extracts metadata from URLs using helper
 * functions, and updates Notion page properties with publication, author, date,
 * and summary.
 *
 * Handler contract:
 * - Resolves the URL/publication/author/date/summary property names (explicit
 *   argument first, otherwise `findMatchingProperty`).
 * - Queries up to `limit` rows, following Notion cursors when more than one
 *   page of results is needed.
 * - Processes rows in concurrent batches of `batchSize`, pausing 1 s between
 *   batches.
 * - Returns a text report; rows without a usable URL are listed in the details
 *   but counted as neither successful nor failed.
 */
server.tool(
  "extract-url-metadata",
  {
    databaseId: z.string(),
    urlPropertyName: z.string().optional(),
    publicationPropertyName: z.string().optional(),
    authorPropertyName: z.string().optional(),
    datePropertyName: z.string().optional(),
    summaryPropertyName: z.string().optional(),
    batchSize: z.number().default(5),
    limit: z.number().default(50),
    generateSummary: z.boolean().default(true),
    silentErrors: z.boolean().default(true)
  },
  async ({
    databaseId,
    urlPropertyName,
    publicationPropertyName,
    authorPropertyName,
    datePropertyName,
    summaryPropertyName,
    batchSize,
    limit,
    generateSummary,
    silentErrors
  }) => {
    // Notion's query endpoint rejects page_size values above 100 (see API docs),
    // so larger limits have to be fetched page by page.
    const NOTION_MAX_PAGE_SIZE = 100;

    // Thrown values are not guaranteed to be Error instances.
    const describeError = (err: unknown): string =>
      err instanceof Error ? err.message : String(err);

    try {
      // First retrieve database to get property types
      const databaseInfo = await notion.databases.retrieve({ database_id: databaseId });

      // Get all available property names and types
      const propertyInfoMap = databaseInfo.properties || {};

      // Auto-detect or use specified property names
      const urlProperty = urlPropertyName || findMatchingProperty(propertyInfoMap, [
        "URL", "Link", "Website", "Address", "Source Link"
      ]);
      const publicationProperty = publicationPropertyName || findMatchingProperty(propertyInfoMap, [
        "Publication", "Publisher", "Source", "Site", "Website Name", "Origin"
      ]);
      const authorProperty = authorPropertyName || findMatchingProperty(propertyInfoMap, [
        "Author", "Author(s)", "Writer", "Creator", "By"
      ]);
      const dateProperty = datePropertyName || findMatchingProperty(propertyInfoMap, [
        "Date", "Published", "Published Date", "Publish Date", "Release Date", "Post Date"
      ]);
      const summaryProperty = summaryPropertyName || findMatchingProperty(propertyInfoMap, [
        "Summary", "Article Summary", "TLDR", "Description", "Brief"
      ]);

      // Get property types for the detected properties
      const publicationPropertyType = getPropertyType(propertyInfoMap, publicationProperty);
      const authorPropertyType = getPropertyType(propertyInfoMap, authorProperty);
      const datePropertyType = getPropertyType(propertyInfoMap, dateProperty);
      const summaryPropertyType = getPropertyType(propertyInfoMap, summaryProperty);

      // A batch size of 0 (or a negative one) would never advance the batch
      // loop below, so both sizes are clamped to at least one whole row.
      const rowsPerBatch = Math.max(1, Math.floor(batchSize));
      const rowLimit = Math.max(1, Math.floor(limit));

      // Query the database to get rows with URLs, following cursors until
      // `rowLimit` rows are collected or the database is exhausted.
      const pages: any[] = [];
      let cursor: string | undefined;
      do {
        const response = await notion.databases.query({
          database_id: databaseId,
          page_size: Math.min(NOTION_MAX_PAGE_SIZE, rowLimit - pages.length),
          ...(cursor ? { start_cursor: cursor } : {})
        });
        pages.push(...response.results);
        cursor = response.has_more && response.next_cursor ? response.next_cursor : undefined;
      } while (cursor && pages.length < rowLimit);

      const results: string[] = [];
      let successCount = 0;
      let failureCount = 0;

      // Every row that ends with a reported error goes through here so the
      // summary counters agree with the detail lines.
      const fail = (message: string): string => {
        failureCount++;
        return message;
      };

      // Log the property mapping being used
      results.push([
        `Using field mapping:`,
        `- URLs: "${urlProperty}" (${getPropertyType(propertyInfoMap, urlProperty)})`,
        `- Publication: "${publicationProperty}" (${publicationPropertyType})`,
        `- Author: "${authorProperty}" (${authorPropertyType})`,
        `- Date: "${dateProperty}" (${datePropertyType})`,
        `- Summary: "${summaryProperty}" (${summaryPropertyType})`
      ].join(' '));

      // Extracts metadata for one row and writes it back; resolves to the
      // row's report line and never rejects.
      const processRow = async (page: any): Promise<string> => {
        try {
          // Extract URL from the specified property
          const urlPropertyValue = page.properties?.[urlProperty];
          let rawUrl: unknown = null;

          // Handle different property types that could contain URLs
          if (urlPropertyValue?.type === 'url' && urlPropertyValue.url) {
            rawUrl = urlPropertyValue.url;
          } else if (urlPropertyValue?.type === 'rich_text' && urlPropertyValue.rich_text.length > 0) {
            rawUrl = urlPropertyValue.rich_text[0]?.plain_text;
          } else if (urlPropertyValue?.type === 'title' && urlPropertyValue.title.length > 0) {
            rawUrl = urlPropertyValue.title[0]?.plain_text;
          }

          // Require an explicit http(s) scheme; text cells may carry padding.
          const url = typeof rawUrl === 'string' ? rawUrl.trim() : '';
          if (!/^https?:\/\//i.test(url)) {
            return `Row ${page.id}: No valid URL found in property "${urlProperty}"`;
          }

          // Fetch and extract metadata
          const metadata = await extractMetadataFromUrl(url);

          // Update the row with extracted metadata
          const properties: Record<string, any> = {};

          // Handle publication based on property type
          if (metadata.publication && publicationPropertyType) {
            try {
              if (publicationPropertyType === 'select') {
                properties[publicationProperty] = createSelectProperty(metadata.publication);
              } else if (publicationPropertyType === 'rich_text') {
                properties[publicationProperty] = createRichTextProperty(metadata.publication);
              } else if (publicationPropertyType === 'title') {
                properties[publicationProperty] = createTitleProperty(metadata.publication);
              }
            } catch (err: unknown) {
              if (!silentErrors) {
                return fail(`Row ${page.id}: Error setting ${publicationProperty} property: ${describeError(err)}`);
              }
            }
          }

          // Handle author based on property type
          if (metadata.author && authorPropertyType) {
            try {
              if (authorPropertyType === 'multi_select') {
                properties[authorProperty] = createMultiSelectProperty(parseAuthors(metadata.author));
              } else if (authorPropertyType === 'select') {
                properties[authorProperty] = createSelectProperty(metadata.author);
              } else if (authorPropertyType === 'rich_text') {
                properties[authorProperty] = createRichTextProperty(metadata.author);
              }
            } catch (err: unknown) {
              if (!silentErrors) {
                return fail(`Row ${page.id}: Error setting ${authorProperty} property: ${describeError(err)}`);
              }
            }
          }

          // Handle date based on property type
          if (metadata.date && datePropertyType === 'date') {
            try {
              properties[dateProperty] = createDateProperty(metadata.date);
            } catch (err: unknown) {
              if (!silentErrors) {
                return fail(`Row ${page.id}: Error setting ${dateProperty} property: ${describeError(err)}`);
              }
            }
          }

          // Get content for page update and summary generation
          const content = metadata.content || '';

          // Generate summary using extracted content
          if (content && generateSummary && summaryPropertyType) {
            try {
              // For now, use a simple summarization method
              const summary = createSimpleSummary(content);

              // Add summary to properties based on property type
              if (summaryPropertyType === 'rich_text') {
                properties[summaryProperty] = createRichTextProperty(summary);
              } else if (summaryPropertyType === 'select') {
                properties[summaryProperty] = createSelectProperty(summary);
              } else if (summaryPropertyType === 'multi_select') {
                properties[summaryProperty] = createMultiSelectProperty([summary]);
              }
            } catch (err: unknown) {
              if (!silentErrors) {
                return fail(`Row ${page.id}: Error setting ${summaryProperty} property: ${describeError(err)}`);
              }
            }
          }

          // Update the page properties if we have any to update
          if (Object.keys(properties).length > 0) {
            try {
              await notion.pages.update({
                page_id: page.id,
                properties: properties
              });
            } catch (err: unknown) {
              if (!silentErrors) {
                return fail(`Row ${page.id}: Error updating properties: ${describeError(err)}`);
              }
              // If we fail to update properties, we'll still try to update content
            }
          }

          // Update the page content if we have content and it's not already in the page
          if (content) {
            try {
              // Get existing blocks
              const blocks = await notion.blocks.children.list({ block_id: page.id });

              // Only update if there are no blocks or fewer than 3 (assuming just a title)
              if (blocks.results.length < 3) {
                // Create content blocks (paragraphs)
                const contentBlocks = createContentBlocks(content);
                await notion.blocks.children.append({
                  block_id: page.id,
                  children: contentBlocks
                });
              }
            } catch (err: unknown) {
              if (!silentErrors) {
                return fail(`Row ${page.id}: Error updating content: ${describeError(err)}`);
              }
            }
          }

          successCount++;
          return `Row ${page.id}: Successfully extracted metadata from ${url}`;
        } catch (error: unknown) {
          return fail(
            `Row ${page.id}: Failed to extract metadata - ${silentErrors ? 'Error occurred' : describeError(error)}`
          );
        }
      };

      // Process rows in batches
      for (let start = 0; start < pages.length; start += rowsPerBatch) {
        // Process each row in the batch concurrently and wait for all of them
        const batchResults = await Promise.all(
          pages.slice(start, start + rowsPerBatch).map((page) => processRow(page))
        );
        results.push(...batchResults);

        // Add a small delay between batches to avoid rate limiting
        if (start + rowsPerBatch < pages.length) {
          await new Promise(resolve => setTimeout(resolve, 1000));
        }
      }

      return {
        content: [{
          type: "text",
          text: `Processed ${successCount + failureCount} URLs\n${successCount} successful\n${failureCount} failed\n\nDetails:\n${results.join('\n')}`
        }]
      };
    } catch (error: unknown) {
      return {
        content: [{
          type: "text",
          text: `Error extracting metadata: ${describeError(error)}`
        }],
        isError: true
      };
    }
  }
);
/*
 * src/index.ts:503-742 (handler)
 * The core handler function for 'extract-url-metadata': queries Notion database,
 * finds URLs in specified property, fetches each URL to extract metadata
 * (publication, author, date, content/summary), updates the corresponding Notion
 * pages with extracted data respecting property types.
 *
 * Notes:
 * - Up to `limit` rows are queried, following Notion cursors when more than one
 *   page of results is needed.
 * - Rows are processed in concurrent batches of `batchSize` with a 1 s pause
 *   between batches.
 * - Rows without a usable URL are listed in the details but counted as neither
 *   successful nor failed.
 */
async ({
  databaseId,
  urlPropertyName,
  publicationPropertyName,
  authorPropertyName,
  datePropertyName,
  summaryPropertyName,
  batchSize,
  limit,
  generateSummary,
  silentErrors
}) => {
  // Notion's query endpoint rejects page_size values above 100 (see API docs),
  // so larger limits have to be fetched page by page.
  const NOTION_MAX_PAGE_SIZE = 100;

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    // First retrieve database to get property types
    const databaseInfo = await notion.databases.retrieve({ database_id: databaseId });

    // Get all available property names and types
    const propertyInfoMap = databaseInfo.properties || {};

    // Auto-detect or use specified property names
    const urlProperty = urlPropertyName || findMatchingProperty(propertyInfoMap, [
      "URL", "Link", "Website", "Address", "Source Link"
    ]);
    const publicationProperty = publicationPropertyName || findMatchingProperty(propertyInfoMap, [
      "Publication", "Publisher", "Source", "Site", "Website Name", "Origin"
    ]);
    const authorProperty = authorPropertyName || findMatchingProperty(propertyInfoMap, [
      "Author", "Author(s)", "Writer", "Creator", "By"
    ]);
    const dateProperty = datePropertyName || findMatchingProperty(propertyInfoMap, [
      "Date", "Published", "Published Date", "Publish Date", "Release Date", "Post Date"
    ]);
    const summaryProperty = summaryPropertyName || findMatchingProperty(propertyInfoMap, [
      "Summary", "Article Summary", "TLDR", "Description", "Brief"
    ]);

    // Get property types for the detected properties
    const publicationPropertyType = getPropertyType(propertyInfoMap, publicationProperty);
    const authorPropertyType = getPropertyType(propertyInfoMap, authorProperty);
    const datePropertyType = getPropertyType(propertyInfoMap, dateProperty);
    const summaryPropertyType = getPropertyType(propertyInfoMap, summaryProperty);

    // A batch size of 0 (or a negative one) would never advance the batch
    // loop below, so both sizes are clamped to at least one whole row.
    const rowsPerBatch = Math.max(1, Math.floor(batchSize));
    const rowLimit = Math.max(1, Math.floor(limit));

    // Query the database to get rows with URLs, following cursors until
    // `rowLimit` rows are collected or the database is exhausted.
    const pages: any[] = [];
    let cursor: string | undefined;
    do {
      const response = await notion.databases.query({
        database_id: databaseId,
        page_size: Math.min(NOTION_MAX_PAGE_SIZE, rowLimit - pages.length),
        ...(cursor ? { start_cursor: cursor } : {})
      });
      pages.push(...response.results);
      cursor = response.has_more && response.next_cursor ? response.next_cursor : undefined;
    } while (cursor && pages.length < rowLimit);

    const results: string[] = [];
    let successCount = 0;
    let failureCount = 0;

    // Every row that ends with a reported error goes through here so the
    // summary counters agree with the detail lines.
    const fail = (message: string): string => {
      failureCount++;
      return message;
    };

    // Log the property mapping being used
    results.push([
      `Using field mapping:`,
      `- URLs: "${urlProperty}" (${getPropertyType(propertyInfoMap, urlProperty)})`,
      `- Publication: "${publicationProperty}" (${publicationPropertyType})`,
      `- Author: "${authorProperty}" (${authorPropertyType})`,
      `- Date: "${dateProperty}" (${datePropertyType})`,
      `- Summary: "${summaryProperty}" (${summaryPropertyType})`
    ].join(' '));

    // Extracts metadata for one row and writes it back; resolves to the
    // row's report line and never rejects.
    const processRow = async (page: any): Promise<string> => {
      try {
        // Extract URL from the specified property
        const urlPropertyValue = page.properties?.[urlProperty];
        let rawUrl: unknown = null;

        // Handle different property types that could contain URLs
        if (urlPropertyValue?.type === 'url' && urlPropertyValue.url) {
          rawUrl = urlPropertyValue.url;
        } else if (urlPropertyValue?.type === 'rich_text' && urlPropertyValue.rich_text.length > 0) {
          rawUrl = urlPropertyValue.rich_text[0]?.plain_text;
        } else if (urlPropertyValue?.type === 'title' && urlPropertyValue.title.length > 0) {
          rawUrl = urlPropertyValue.title[0]?.plain_text;
        }

        // Require an explicit http(s) scheme; text cells may carry padding.
        const url = typeof rawUrl === 'string' ? rawUrl.trim() : '';
        if (!/^https?:\/\//i.test(url)) {
          return `Row ${page.id}: No valid URL found in property "${urlProperty}"`;
        }

        // Fetch and extract metadata
        const metadata = await extractMetadataFromUrl(url);

        // Update the row with extracted metadata
        const properties: Record<string, any> = {};

        // Handle publication based on property type
        if (metadata.publication && publicationPropertyType) {
          try {
            if (publicationPropertyType === 'select') {
              properties[publicationProperty] = createSelectProperty(metadata.publication);
            } else if (publicationPropertyType === 'rich_text') {
              properties[publicationProperty] = createRichTextProperty(metadata.publication);
            } else if (publicationPropertyType === 'title') {
              properties[publicationProperty] = createTitleProperty(metadata.publication);
            }
          } catch (err: unknown) {
            if (!silentErrors) {
              return fail(`Row ${page.id}: Error setting ${publicationProperty} property: ${describeError(err)}`);
            }
          }
        }

        // Handle author based on property type
        if (metadata.author && authorPropertyType) {
          try {
            if (authorPropertyType === 'multi_select') {
              properties[authorProperty] = createMultiSelectProperty(parseAuthors(metadata.author));
            } else if (authorPropertyType === 'select') {
              properties[authorProperty] = createSelectProperty(metadata.author);
            } else if (authorPropertyType === 'rich_text') {
              properties[authorProperty] = createRichTextProperty(metadata.author);
            }
          } catch (err: unknown) {
            if (!silentErrors) {
              return fail(`Row ${page.id}: Error setting ${authorProperty} property: ${describeError(err)}`);
            }
          }
        }

        // Handle date based on property type
        if (metadata.date && datePropertyType === 'date') {
          try {
            properties[dateProperty] = createDateProperty(metadata.date);
          } catch (err: unknown) {
            if (!silentErrors) {
              return fail(`Row ${page.id}: Error setting ${dateProperty} property: ${describeError(err)}`);
            }
          }
        }

        // Get content for page update and summary generation
        const content = metadata.content || '';

        // Generate summary using extracted content
        if (content && generateSummary && summaryPropertyType) {
          try {
            // For now, use a simple summarization method
            const summary = createSimpleSummary(content);

            // Add summary to properties based on property type
            if (summaryPropertyType === 'rich_text') {
              properties[summaryProperty] = createRichTextProperty(summary);
            } else if (summaryPropertyType === 'select') {
              properties[summaryProperty] = createSelectProperty(summary);
            } else if (summaryPropertyType === 'multi_select') {
              properties[summaryProperty] = createMultiSelectProperty([summary]);
            }
          } catch (err: unknown) {
            if (!silentErrors) {
              return fail(`Row ${page.id}: Error setting ${summaryProperty} property: ${describeError(err)}`);
            }
          }
        }

        // Update the page properties if we have any to update
        if (Object.keys(properties).length > 0) {
          try {
            await notion.pages.update({
              page_id: page.id,
              properties: properties
            });
          } catch (err: unknown) {
            if (!silentErrors) {
              return fail(`Row ${page.id}: Error updating properties: ${describeError(err)}`);
            }
            // If we fail to update properties, we'll still try to update content
          }
        }

        // Update the page content if we have content and it's not already in the page
        if (content) {
          try {
            // Get existing blocks
            const blocks = await notion.blocks.children.list({ block_id: page.id });

            // Only update if there are no blocks or fewer than 3 (assuming just a title)
            if (blocks.results.length < 3) {
              // Create content blocks (paragraphs)
              const contentBlocks = createContentBlocks(content);
              await notion.blocks.children.append({
                block_id: page.id,
                children: contentBlocks
              });
            }
          } catch (err: unknown) {
            if (!silentErrors) {
              return fail(`Row ${page.id}: Error updating content: ${describeError(err)}`);
            }
          }
        }

        successCount++;
        return `Row ${page.id}: Successfully extracted metadata from ${url}`;
      } catch (error: unknown) {
        return fail(
          `Row ${page.id}: Failed to extract metadata - ${silentErrors ? 'Error occurred' : describeError(error)}`
        );
      }
    };

    // Process rows in batches
    for (let start = 0; start < pages.length; start += rowsPerBatch) {
      // Process each row in the batch concurrently and wait for all of them
      const batchResults = await Promise.all(
        pages.slice(start, start + rowsPerBatch).map((page) => processRow(page))
      );
      results.push(...batchResults);

      // Add a small delay between batches to avoid rate limiting
      if (start + rowsPerBatch < pages.length) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
    }

    return {
      content: [{
        type: "text",
        text: `Processed ${successCount + failureCount} URLs\n${successCount} successful\n${failureCount} failed\n\nDetails:\n${results.join('\n')}`
      }]
    };
  } catch (error: unknown) {
    return {
      content: [{
        type: "text",
        text: `Error extracting metadata: ${describeError(error)}`
      }],
      isError: true
    };
  }
}
/**
 * src/index.ts:1105-1127 (helper)
 * Key helper that orchestrates URL metadata extraction: fetches HTML with axios,
 * parses with Cheerio, extracts publication/author/date/content using
 * specialized helpers.
 *
 * @param url - Address of the page to download.
 * @returns The `publication`, `author`, `date` and `content` values produced by
 *   the corresponding `extract*` helpers.
 * @throws Whatever `axios.get` or `cheerio.load` throws; nothing is caught here.
 */
async function extractMetadataFromUrl(url: string) {
  // Browser-like request headers plus the timeout/redirect settings for the fetch.
  const requestOptions = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml',
      'Accept-Language': 'en-US,en;q=0.9'
    },
    timeout: 10000,
    maxRedirects: 5
  };

  // Download the page and hand its body to Cheerio.
  const { data: html } = await axios.get(url, requestOptions);
  const $ = cheerio.load(html);

  // Object-literal fields are evaluated top to bottom, so the extractors run
  // in the order publication -> author -> date -> content.
  return {
    publication: extractPublication($, url),
    author: extractAuthor($),
    date: extractDate($),
    content: extractContent($)
  };
}
/**
 * src/index.ts:951-984 (helper)
 * Helper to auto-detect Notion database property names (e.g., for URL,
 * publication, author) by matching against common names.
 *
 * Candidates are tried in three passes, each in `possibleNames` order:
 * exact match, case-insensitive match, then partial match (either string
 * contains the other, ignoring case).
 *
 * @param propertyInfoMap - Object whose keys are the available property names;
 *   `null`/`undefined` is treated as "no properties".
 * @param possibleNames - Candidate names, most preferred first.
 * @returns The matching property name as spelled in `propertyInfoMap`, or the
 *   first candidate when nothing matches (`''` if there are no candidates).
 */
function findMatchingProperty(propertyInfoMap: any, possibleNames: string[]): string {
  const availableProperties = Object.keys(propertyInfoMap ?? {});

  // Lower-case everything once instead of inside the nested searches below.
  const properties = availableProperties.map(prop => ({ prop, lower: prop.toLowerCase() }));
  const candidates = possibleNames.map(name => name.toLowerCase());

  // First try exact match
  for (const name of possibleNames) {
    if (availableProperties.includes(name)) {
      return name;
    }
  }

  // Then try case-insensitive match
  for (const candidate of candidates) {
    const match = properties.find(({ lower }) => lower === candidate);
    if (match) {
      return match.prop;
    }
  }

  // Then try partial match (contains). Blank names are skipped: every string
  // "contains" the empty string, so a blank name would match anything.
  for (const candidate of candidates) {
    if (candidate.trim() === '') {
      continue;
    }
    const match = properties.find(({ lower }) =>
      lower.trim() !== '' && (lower.includes(candidate) || candidate.includes(lower))
    );
    if (match) {
      return match.prop;
    }
  }

  // Default to the first possible name if no match found
  return possibleNames[0] ?? '';
}
/**
 * src/index.ts:1903-2000 (helper)
 * Helper extracts publication/site name prioritizing structured data (OG,
 * JSON-LD), then headers/domain.
 *
 * Sources are tried in this order and the first non-blank value wins:
 * 1. `og:site_name` meta tag
 * 2. publisher found in a JSON-LD script (via `extractPublisherFromJsonLd`)
 * 3. `publisher`, `application-name`, `og:site` meta tags
 * 4. logo/site-title style elements (image `alt` or text shorter than 50 chars)
 * 5. the first label of the URL's hostname, title-cased
 *
 * @param $ - Cheerio handle for the loaded page.
 * @param url - Page URL, used only for the hostname fallback.
 * @returns The publication name, or `'Unknown Publication'` when `url` cannot
 *   be parsed and nothing else matched.
 */
function extractPublication($: cheerio.CheerioAPI, url: string): string {
  // Try Open Graph site_name (highest confidence). Values are trimmed so a
  // whitespace-only attribute falls through to the next source.
  const ogSiteName = $('meta[property="og:site_name"]').attr('content')?.trim();
  if (ogSiteName) {
    return ogSiteName;
  }

  // Try JSON-LD for publisher name (high confidence)
  const jsonLdScripts = $('script[type="application/ld+json"]');
  if (jsonLdScripts.length) {
    let publisher = '';

    jsonLdScripts.each((_index, element) => {
      try {
        const cleanedJson = sanitizeJsonString($(element).html() || '');
        if (!cleanedJson) return; // Skip if we couldn't clean it

        const possiblePublisher = extractPublisherFromJsonLd(JSON.parse(cleanedJson));
        if (possiblePublisher) {
          publisher = possiblePublisher;
          return false; // Break the each loop
        }
      } catch {
        // Malformed JSON-LD is common in the wild; move on to the next script.
      }
    });

    if (publisher) return publisher;
  }

  // Try other common meta tags (medium confidence)
  const metaSelectors = [
    'meta[name="publisher"]',
    'meta[name="application-name"]',
    'meta[property="og:site"]'
  ];
  for (const selector of metaSelectors) {
    const metaValue = $(selector).attr('content')?.trim();
    if (metaValue) {
      return metaValue;
    }
  }

  // Try to find publication name in the site header (medium confidence)
  const headerSelectors = [
    'header .logo',
    'header .site-title',
    'header .brand',
    '.site-title',
    '.logo img',
    '.logo',
    '.brand',
    '#logo',
    '[itemprop="publisher"]'
  ];

  for (const selector of headerSelectors) {
    const headerElement = $(selector).first();
    if (!headerElement.length) {
      continue;
    }

    // Check for alt text in image
    if (headerElement.is('img')) {
      const alt = headerElement.attr('alt')?.trim();
      if (alt && alt.length < 50) {
        return alt;
      }
    }

    // Otherwise use text content
    const text = headerElement.text().trim();
    if (text && text.length < 50) {
      return text;
    }
  }

  // Extract from domain as fallback (low confidence)
  try {
    const domain = new URL(url).hostname.replace(/^www\./, '');

    // split() always yields at least one element, so the first label exists.
    const [firstLabel = domain] = domain.split('.');
    return firstLabel
      .replace(/-/g, ' ')
      .split(' ')
      .map(word => word.charAt(0).toUpperCase() + word.slice(1))
      .join(' ');
  } catch {
    return 'Unknown Publication';
  }
}