Skip to main content
Glama
enhanced-content-extractor.service.ts•17.9 kB
import * as cheerio from 'cheerio'; import { JSDOM } from 'jsdom'; import { ContentExtractor } from './content-extractor.service.js'; import { EnhancedWebpageContent, ExtractedLink, ExtractedImage, TableData, ListData, HierarchyData, KeyValuePair, StructuredData, HierarchyNode } from '../types/enhanced-types.js'; import { WebpageContent, OutputFormat } from '../types.js'; /** * EnhancedContentExtractor * * Extends the ContentExtractor to provide: * 1. Structured data preservation (tables, lists, hierarchies) * 2. Visual context integration (image descriptions) * 3. Link extraction and analysis * 4. Source credibility assessment */ export class EnhancedContentExtractor extends ContentExtractor { constructor() { super(); } /** * Extract enhanced content from a webpage */ async extractEnhancedContent(url: string, format: OutputFormat = 'markdown'): Promise<EnhancedWebpageContent> { // First get the base content using the parent class const baseContent = await super.extractContent(url, format); // Now enhance it with additional information return await this.enhanceContent(baseContent, url); } /** * Batch extract enhanced content from multiple webpages */ async batchExtractEnhancedContent( urls: string[], format: OutputFormat = 'markdown' ): Promise<Record<string, EnhancedWebpageContent | { error: string }>> { const results: Record<string, EnhancedWebpageContent | { error: string }> = {}; await Promise.all( urls.map(async (url) => { try { results[url] = await this.extractEnhancedContent(url, format); } catch (error) { results[url] = { error: error instanceof Error ? error.message : 'Unknown error occurred' }; } }) ); return results; } /** * Enhance content with additional information */ private async enhanceContent(baseContent: WebpageContent, url: string): Promise<EnhancedWebpageContent> { try { // Fetch the raw HTML again to parse for structured data const response = await fetch(url); const html = await response.text(); // Parse the HTML const $ = cheerio.load(html); const dom = new JSDOM(html); // Extract structured data const structuredData = this.extractStructuredData($, dom); // Extract links const links = this.extractLinks($, url); // Extract images with context const images = this.extractImages($); // Assess source credibility const sourceCredibility = this.assessSourceCredibility(baseContent, url, $); // Create enhanced content const enhancedContent: EnhancedWebpageContent = { ...baseContent, links, images, structuredData, lastVisited: new Date(), sourceCredibility }; return enhancedContent; } catch (error) { console.error(`Error enhancing content for ${url}:`, error); // If enhancement fails, return the base content with empty enhancements const fallbackContent: EnhancedWebpageContent = { ...baseContent, links: [], images: [], structuredData: { tables: [], lists: [], hierarchies: [], keyValuePairs: [] }, lastVisited: new Date() }; return fallbackContent; } } /** * Extract structured data from HTML */ private extractStructuredData($: any, dom: JSDOM): StructuredData { return { tables: this.extractTables($), lists: this.extractLists($), hierarchies: this.extractHierarchies($), keyValuePairs: this.extractKeyValuePairs($) }; } /** * Extract tables from HTML */ private extractTables($: any): TableData[] { const tables: TableData[] = []; $('table').each((index: number, element: any) => { try { const $table = $(element); const id = `table-${index}`; const caption = $table.find('caption').text().trim(); // Get context (text before the table) const prevText = $table.prev().text().trim(); // Extract headers const headers: string[] = []; $table.find('thead tr th, tr th').each((_: number, headerCell: any) => { headers.push($(headerCell).text().trim()); }); // Extract rows const rows: string[][] = []; $table.find('tbody tr, tr').each((_: number, row: any) => { const rowData: string[] = []; // Skip header rows if ($(row).find('th').length === 0 || rows.length > 0) { $(row).find('td').each((_: number, cell: any) => { rowData.push($(cell).text().trim()); }); if (rowData.length > 0) { rows.push(rowData); } } }); // Create markdown representation let markdownTable = ''; if (caption) { markdownTable += `**${caption}**\n\n`; } // Add headers if (headers.length > 0) { markdownTable += '| ' + headers.join(' | ') + ' |\n'; markdownTable += '| ' + headers.map(() => '---').join(' | ') + ' |\n'; } // Add rows rows.forEach(row => { markdownTable += '| ' + row.join(' | ') + ' |\n'; }); tables.push({ id, caption, context: prevText, headers, rows, markdownRepresentation: markdownTable }); } catch (error) { console.error('Error extracting table:', error); } }); return tables; } /** * Extract lists from HTML */ private extractLists($: any): ListData[] { const lists: ListData[] = []; $('ul, ol, dl').each((index: number, element: any) => { try { const $list = $(element); const id = `list-${index}`; // Determine list type let type: 'ordered' | 'unordered' | 'definition'; if ($list.is('ol')) { type = 'ordered'; } else if ($list.is('ul')) { type = 'unordered'; } else { type = 'definition'; } // Extract items const items: string[] = []; if (type === 'definition') { // Handle definition lists $list.find('dt').each((_: number, dt: any) => { const term = $(dt).text().trim(); const definition = $(dt).next('dd').text().trim(); items.push(`**${term}**: ${definition}`); }); } else { // Handle ordered and unordered lists $list.find('li').each((_: number, li: any) => { // Skip nested list items if ($(li).parents('li').length === 0) { items.push($(li).text().trim()); } }); } // Create markdown representation let markdownList = ''; items.forEach((item, idx) => { if (type === 'ordered') { markdownList += `${idx + 1}. ${item}\n`; } else { markdownList += `- ${item}\n`; } }); lists.push({ id, type, items, markdownRepresentation: markdownList }); } catch (error) { console.error('Error extracting list:', error); } }); return lists; } /** * Extract hierarchical structures from HTML */ private extractHierarchies($: any): HierarchyData[] { const hierarchies: HierarchyData[] = []; // Look for common hierarchy patterns (nested navs, menus, etc.) $('nav, .menu, .tree, .hierarchy').each((index: number, element: any) => { try { const $hierarchy = $(element); const id = `hierarchy-${index}`; // Try to determine the type let type = 'menu'; if ($hierarchy.is('nav')) { type = 'navigation'; } else if ($hierarchy.attr('class')?.includes('tree')) { type = 'tree'; } // Extract nodes const nodes = this.extractHierarchyNodes($, $hierarchy); // Create markdown representation const markdownHierarchy = this.hierarchyToMarkdown(nodes); hierarchies.push({ id, type, nodes, markdownRepresentation: markdownHierarchy }); } catch (error) { console.error('Error extracting hierarchy:', error); } }); return hierarchies; } /** * Extract key-value pairs from HTML */ private extractKeyValuePairs($: any): KeyValuePair[] { const keyValuePairs: KeyValuePair[] = []; // Look for definition lists $('dl').each((_: number, dl: any) => { let currentKey = ''; $(dl).children().each((_: number, child: any) => { if ($(child).is('dt')) { currentKey = $(child).text().trim(); } else if ($(child).is('dd') && currentKey) { keyValuePairs.push({ key: currentKey, value: $(child).text().trim() }); } }); }); // Look for meta tags $('meta').each((_: number, meta: any) => { const name = $(meta).attr('name') || $(meta).attr('property'); const content = $(meta).attr('content'); if (name && content) { keyValuePairs.push({ key: name, value: content }); } }); return keyValuePairs; } /** * Extract links from HTML */ private extractLinks($: any, baseUrl: string): ExtractedLink[] { const links: ExtractedLink[] = []; $('a[href]').each((_: number, element: any) => { try { const $link = $(element); const href = $link.attr('href'); if (!href || href.startsWith('#') || href.startsWith('javascript:')) { return; } // Resolve relative URLs const url = new URL(href, baseUrl).toString(); const text = $link.text().trim(); // Get surrounding context let context = ''; const parent = $link.parent(); if (parent.length > 0) { // Get the nearest paragraph or section const container = parent.closest('p, section, div, li'); if (container.length > 0) { context = container.text().trim(); // Truncate if too long if (context.length > 300) { context = context.substring(0, 300) + '...'; } } else { // If no container found, use the parent text context = parent.text().trim(); } } links.push({ url, text, context }); } catch (error) { console.error('Error extracting link:', error); } }); return links; } /** * Extract images with context */ private extractImages($: any): ExtractedImage[] { const images: ExtractedImage[] = []; $('img').each((_: number, element: any) => { try { const $img = $(element); const src = $img.attr('src'); if (!src) { return; } const alt = $img.attr('alt') || ''; const width = parseInt($img.attr('width') || '0', 10) || undefined; const height = parseInt($img.attr('height') || '0', 10) || undefined; // Generate description based on context let generatedAlt = ''; if (!alt) { const parent = $img.parent(); // Check for caption const figcaption = parent.find('figcaption').text().trim(); if (figcaption) { generatedAlt = `Image with caption: ${figcaption}`; } else { // Look for nearby text const nearbyText = parent.text().trim(); if (nearbyText) { generatedAlt = `Image related to: ${nearbyText.substring(0, 100)}`; } else { generatedAlt = 'Image without description'; } } } // Get context let context = ''; const parent = $img.parent(); if (parent.length > 0) { // Get the nearest paragraph or section const container = parent.closest('p, section, div, figure'); if (container.length > 0) { context = container.text().trim(); // Truncate if too long if (context.length > 300) { context = context.substring(0, 300) + '...'; } } } // Find nearest heading let nearestHeading = ''; let currentElement = $img; while (currentElement.length > 0 && !nearestHeading) { const prevHeading = currentElement.prev('h1, h2, h3, h4, h5, h6'); if (prevHeading.length > 0) { nearestHeading = prevHeading.text().trim(); break; } currentElement = currentElement.parent(); } images.push({ url: src, alt, generatedAlt: generatedAlt || undefined, context: context || 'No context available', dimensions: { width, height }, position: { nearestHeading, sectionContext: nearestHeading } }); } catch (error) { console.error('Error extracting image:', error); } }); return images; } /** * Assess source credibility */ private assessSourceCredibility( content: WebpageContent, url: string, $: any ): { score: number; factors: string[] } { const factors: string[] = []; let score = 0.5; // Default neutral score // Check for https if (url.startsWith('https://')) { score += 0.05; factors.push('Secure connection (HTTPS)'); } // Check domain reputation const domain = new URL(url).hostname; const educationalDomain = domain.endsWith('.edu') || domain.endsWith('.gov'); const newsDomain = domain.includes('news') || ['cnn.com', 'bbc.com', 'nytimes.com', 'reuters.com'].some(d => domain.includes(d)); if (educationalDomain) { score += 0.1; factors.push('Educational or government domain'); } else if (newsDomain) { score += 0.05; factors.push('Established news source'); } // Check for author information const hasAuthor = $('*[rel="author"], .author, .byline').length > 0 || content.meta_tags['author'] || content.meta_tags['article:author']; if (hasAuthor) { score += 0.1; factors.push('Author information present'); } // Check for publication date const hasDate = content.meta_tags['article:published_time'] || $('time, .date, .published').length > 0; if (hasDate) { score += 0.05; factors.push('Publication date present'); } // Check for citations or references const hasCitations = $('cite, .citation, .reference, .footnote').length > 0; if (hasCitations) { score += 0.1; factors.push('Citations or references present'); } // Check for contact information const hasContact = $('*[itemprop="email"], .contact, .email').length > 0; if (hasContact) { score += 0.05; factors.push('Contact information present'); } // Cap score between 0 and 1 score = Math.max(0, Math.min(1, score)); return { score, factors }; } /** * Extract hierarchy nodes recursively */ private extractHierarchyNodes($: any, element: any, level: number = 0): HierarchyNode[] { const nodes: HierarchyNode[] = []; element.children().each((index: number, child: any) => { const $child = $(child); // Skip empty text nodes if (child.type === 'text' && !$child.text().trim()) { return; } const node: HierarchyNode = { id: `node-${level}-${index}`, text: $child.text().trim(), level, children: [] }; // Recursively extract children if ($child.children().length > 0) { node.children = this.extractHierarchyNodes($, $child, level + 1); } nodes.push(node); }); return nodes; } /** * Convert hierarchy to markdown */ private hierarchyToMarkdown(nodes: HierarchyNode[], level: number = 0): string { let markdown = ''; nodes.forEach(node => { const indent = ' '.repeat(level); markdown += `${indent}- ${node.text}\n`; if (node.children && node.children.length > 0) { markdown += this.hierarchyToMarkdown(node.children, level + 1); } }); return markdown; } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mixelpixx/Google-Research-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server