RichardDillman

SEO Audit MCP Server

page-capture.ts (10.1 kB)
// src/utils/page-capture.ts
// Single-fetch page capture with HAR recording and comprehensive data extraction

import type { BrowserContext, Page } from 'playwright';
import type { PageSnapshot, StoredPageAnalysis, AuditStorage } from './storage.js';
import {
  createPage,
  extractMetaTags,
  extractHeadings,
  extractJsonLd,
  extractLinks,
  extractImages,
  checkMixedContent,
  detectFramework,
} from './browser.js';
import {
  savePageSnapshot,
  savePageAnalysis,
  saveHarFile,
  hasPageSnapshot,
  loadPageSnapshot,
  loadPageAnalysis,
} from './storage.js';

// ============================================================================
// Types
// ============================================================================

export interface CaptureOptions {
  timeout?: number;
  device?: 'desktop' | 'mobile';
  captureHar?: boolean;
  captureScreenshot?: boolean;
  waitForSelector?: string;
}

export interface CaptureResult {
  snapshot: PageSnapshot;
  analysis: StoredPageAnalysis;
  cached: boolean; // True if loaded from storage instead of fetched
  error?: string; // Populated by capturePages when a URL fails to capture
}

// ============================================================================
// Main Capture Function
// ============================================================================

/**
 * Capture a page once and extract all data needed for SEO analysis
 * Returns cached data if already captured
 */
export async function capturePage(
  storage: AuditStorage,
  url: string,
  options: CaptureOptions = {}
): Promise<CaptureResult> {
  const {
    timeout = 30000,
    device = 'desktop',
    captureHar = false,
    captureScreenshot = false,
    waitForSelector,
  } = options;

  // Check if already captured
  if (await hasPageSnapshot(storage, url)) {
    const snapshot = await loadPageSnapshot(storage, url);
    const analysis = await loadPageAnalysis(storage, url);
    if (snapshot && analysis) {
      console.error(`[CACHE HIT] ${url}`);
      return { snapshot, analysis, cached: true };
    }
  }

  console.error(`[FETCHING] ${url}`);

  const { context, page } = await createPage({ device, timeout });

  // Set up HAR recording if requested
  if (captureHar) {
    await context.tracing.start({ screenshots: false, snapshots: false });
  }

  try {
    // Capture initial HTML before JS execution
    let initialHtml = '';
    page.on('response', async (response) => {
      if (response.url() === url && response.headers()['content-type']?.includes('text/html')) {
        try {
          initialHtml = await response.text();
        } catch {
          // Ignore
        }
      }
    });

    // Navigate to the page
    const startTime = Date.now();
    const response = await page.goto(url, {
      waitUntil: 'networkidle',
      timeout,
    });

    // Wait for optional selector
    if (waitForSelector) {
      await page.waitForSelector(waitForSelector, { timeout: 10000 }).catch(() => {});
    }

    // Additional wait for dynamic content
    await page.waitForLoadState('networkidle').catch(() => {});

    const loadTimeMs = Date.now() - startTime;
    const httpStatus = response?.status() || 0;

    // Get response headers
    const headers: Record<string, string> = {};
    if (response) {
      const respHeaders = response.headers();
      for (const [key, value] of Object.entries(respHeaders)) {
        headers[key] = value;
      }
    }

    // Get rendered HTML (after JS execution)
    const renderedHtml = await page.content();

    // Create snapshot
    const snapshot: PageSnapshot = {
      url,
      fetchedAt: new Date().toISOString(),
      httpStatus,
      headers,
      initialHtml,
      renderedHtml,
    };

    // Save HAR if requested
    if (captureHar) {
      const harPath = `${storage.paths.har}/${Date.now()}.har`;
      await context.tracing.stop({ path: harPath });
      snapshot.harFile = harPath;
    }
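    // NOTE (assumption): context.tracing.start()/stop() write a Playwright
    // trace archive (viewable with `npx playwright show-trace`), not a W3C
    // HAR file, even though the path above ends in .har. Capturing a genuine
    // HAR would require passing the recordHar option when the context is
    // created, e.g. browser.newContext({ recordHar: { path } }), which would
    // need to be plumbed through createPage() in browser.js.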
    // Take screenshot if requested
    if (captureScreenshot) {
      const screenshotPath = `${storage.paths.pages}/${Date.now()}.png`;
      await page.screenshot({ path: screenshotPath, fullPage: true });
      snapshot.screenshotFile = screenshotPath;
    }

    // Extract all SEO data from the page (single pass)
    const analysis = await extractAllSeoData(page, url, snapshot, loadTimeMs);

    // Save to storage
    await savePageSnapshot(storage, snapshot);
    await savePageAnalysis(storage, url, analysis);

    return { snapshot, analysis, cached: false };
  } finally {
    await context.close();
  }
}

/**
 * Extract all SEO-relevant data from a page in a single pass
 */
async function extractAllSeoData(
  page: Page,
  url: string,
  snapshot: PageSnapshot,
  loadTimeMs: number
): Promise<StoredPageAnalysis> {
  // Run all extractions in parallel for efficiency
  const [
    metaTags,
    headings,
    jsonLd,
    links,
    images,
    hasMixedContent,
    framework,
    openGraph,
    twitterCard,
    language,
  ] = await Promise.all([
    extractMetaTags(page),
    extractHeadings(page),
    extractJsonLd(page),
    extractLinks(page, url),
    extractImages(page),
    checkMixedContent(page),
    detectFramework(page),
    extractOpenGraph(page),
    extractTwitterCard(page),
    page.$eval('html', el => el.getAttribute('lang')).catch(() => null),
  ]);

  // Analyze structured data
  const structuredData = analyzeStructuredData(jsonLd);

  // Determine if JS rendering is required
  const jsRenderingRequired = snapshot.renderedHtml.length > snapshot.initialHtml.length * 1.5;

  return {
    snapshot,
    seo: {
      title: metaTags.title,
      metaDescription: metaTags.description,
      canonical: metaTags.canonical,
      robots: metaTags.robots,
      headings,
      viewport: metaTags.viewport,
      language,
    },
    structuredData,
    openGraph,
    twitter: twitterCard,
    technical: {
      isHttps: url.startsWith('https'),
      hasMixedContent,
      jsRenderingRequired,
      framework,
      loadTimeMs,
    },
    links: {
      internal: links.internal,
      external: links.external,
      totalCount: links.internal.length + links.external.length,
    },
    images: {
      total: images.total,
      withAlt: images.withAlt,
      withoutAlt: images.withoutAlt,
      images: images.images.slice(0, 50), // Limit for storage
    },
  };
}

/**
 * Extract Open Graph meta tags
 */
async function extractOpenGraph(page: Page): Promise<StoredPageAnalysis['openGraph']> {
  return page.evaluate(() => {
    const getMeta = (property: string): string | null => {
      const el = document.querySelector(`meta[property="${property}"]`);
      return el?.getAttribute('content') ?? null;
    };
    return {
      title: getMeta('og:title'),
      description: getMeta('og:description'),
      image: getMeta('og:image'),
      type: getMeta('og:type'),
      url: getMeta('og:url'),
    };
  });
}

/**
 * Extract Twitter Card meta tags
 */
async function extractTwitterCard(page: Page): Promise<StoredPageAnalysis['twitter']> {
  return page.evaluate(() => {
    const getMeta = (name: string): string | null => {
      const el = document.querySelector(`meta[name="${name}"]`);
      return el?.getAttribute('content') ?? null;
    };
    return {
      card: getMeta('twitter:card'),
      title: getMeta('twitter:title'),
      description: getMeta('twitter:description'),
      image: getMeta('twitter:image'),
    };
  });
}
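// NOTE (assumption): real-world JSON-LD often wraps items in a top-level
// `@graph` array and may declare `@type` as an array (e.g. ["JobPosting"]).
// The strict string match in analyzeStructuredData() below assumes that
// extractJsonLd() in browser.js already flattens and normalizes types; if it
// does not, a normalization pass along these lines (hypothetical helpers,
// not part of this file) would catch more postings:
//
//   const flattened = jsonLd.flatMap(d => Array.isArray(d?.['@graph']) ? d['@graph'] : [d]);
//   const isType = (t: unknown, name: string) =>
//     t === name || (Array.isArray(t) && t.includes(name));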
/**
 * Analyze structured data with JobPosting focus
 */
function analyzeStructuredData(jsonLd: any[]): StoredPageAnalysis['structuredData'] {
  const result = {
    jsonLd,
    hasJobPosting: false,
    jobPostings: [] as any[],
    jobPostingErrors: [] as any[],
    jobPostingWarnings: [] as any[],
  };

  for (const item of jsonLd) {
    const type = item['@type'];
    if (type === 'JobPosting') {
      result.hasJobPosting = true;
      const { errors, warnings } = validateJobPosting(item);
      result.jobPostings.push(item);
      result.jobPostingErrors.push(...errors);
      result.jobPostingWarnings.push(...warnings);
    }
  }

  return result;
}

/**
 * Validate JobPosting schema
 */
function validateJobPosting(schema: any): { errors: any[]; warnings: any[] } {
  const errors: any[] = [];
  const warnings: any[] = [];

  // Required fields
  const required = ['title', 'description', 'datePosted', 'hiringOrganization', 'jobLocation'];
  for (const field of required) {
    if (!schema[field]) {
      errors.push({ field, message: `Missing required field: ${field}`, severity: 'error' });
    }
  }

  // Recommended fields
  const recommended = ['validThrough', 'baseSalary', 'employmentType', 'directApply'];
  for (const field of recommended) {
    if (!schema[field]) {
      warnings.push({ field, message: `Missing recommended field: ${field}`, severity: 'warning' });
    }
  }

  // Validate expiration
  if (schema.validThrough) {
    const expiry = new Date(schema.validThrough);
    if (expiry < new Date()) {
      warnings.push({ field: 'validThrough', message: 'Job posting has expired', severity: 'warning' });
    }
  }

  return { errors, warnings };
}

/**
 * Batch capture multiple pages
 */
export async function capturePages(
  storage: AuditStorage,
  urls: string[],
  options: CaptureOptions & { concurrency?: number } = {}
): Promise<CaptureResult[]> {
  const { concurrency = 2, ...captureOptions } = options;
  const results: CaptureResult[] = [];

  // Process in chunks for controlled concurrency
  for (let i = 0; i < urls.length; i += concurrency) {
    const chunk = urls.slice(i, i + concurrency);
    const chunkResults = await Promise.all(
      chunk.map(url =>
        capturePage(storage, url, captureOptions).catch(err => ({
          snapshot: {
            url,
            fetchedAt: new Date().toISOString(),
            httpStatus: 0,
            headers: {},
            initialHtml: '',
            renderedHtml: '',
          },
          analysis: null as any,
          cached: false,
          error: err.message,
        }))
      )
    );
    results.push(...chunkResults);

    // Small delay between chunks
    if (i + concurrency < urls.length) {
      await new Promise(r => setTimeout(r, 500));
    }
  }

  return results;
}

export default {
  capturePage,
  capturePages,
};
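Usage sketch

capturePage handles a single URL and capturePages runs a bounded-concurrency batch. A minimal driver is sketched below; openAuditStorage() is a hypothetical factory standing in for however storage.js actually constructs an AuditStorage, so adjust that import to the real API.

// example.ts - hypothetical driver for page-capture.ts
import { capturePage, capturePages } from './utils/page-capture.js';
import { openAuditStorage } from './utils/storage.js'; // hypothetical factory

async function main(): Promise<void> {
  const storage = await openAuditStorage('./audits/example.com');

  // Single page on a mobile viewport; repeated calls hit the snapshot cache.
  const { analysis, cached } = await capturePage(storage, 'https://example.com/jobs/123', {
    device: 'mobile',
    captureHar: true,
  });
  console.log(cached ? 'cache hit' : 'fetched', analysis.seo.title);
  console.log('JobPosting present:', analysis.structuredData.hasJobPosting);

  // Batch capture; per-URL failures come back as entries with `error` set.
  const results = await capturePages(
    storage,
    ['https://example.com/', 'https://example.com/careers'],
    { concurrency: 2, captureScreenshot: true }
  );
  for (const r of results) {
    console.log(r.snapshot.url, r.error ?? `HTTP ${r.snapshot.httpStatus}`);
  }
}

main().catch(console.error);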
