Skip to main content
Glama
citationValidator.js (7.61 kB)
/** * Citation Validator * * Validates URLs in research citations to detect dead links and hallucinated sources. * Marks invalid citations as [DEAD LINK] or [UNVERIFIED]. * * @module citationValidator * @version 1.8.0 */ 'use strict'; const logger = require('./logger').child('CitationValidator'); // Regex patterns for citation extraction const CITATION_PATTERNS = [ // [Source: Title — https://...] /\[Source:\s*([^\]]+?)\s*—\s*(https?:\/\/[^\]\s]+)\]/gi, // [Title](https://...) /\[([^\]]+)\]\((https?:\/\/[^\)]+)\)/gi, // Inline URLs /(https?:\/\/[^\s\]\)\"<>]+)/gi ]; // Domains that are known-good and don't need validation const TRUSTED_DOMAINS = [ 'github.com', 'npmjs.com', 'docs.python.org', 'developer.mozilla.org', 'pglite.dev', 'spec.modelcontextprotocol.io', 'openrouter.ai', 'anthropic.com', 'openai.com' ]; // Domains that are known to block HEAD requests const SKIP_VALIDATION_DOMAINS = [ 'linkedin.com', 'twitter.com', 'x.com', 'facebook.com', 'instagram.com' ]; /** * Extract all citations from text * @param {string} text - The text to extract citations from * @returns {Array<Object>} Array of citation objects with title and url */ function extractCitations(text) { if (!text || typeof text !== 'string') return []; const citations = []; const seen = new Set(); for (const pattern of CITATION_PATTERNS) { // Reset lastIndex for global patterns pattern.lastIndex = 0; let match; while ((match = pattern.exec(text)) !== null) { let title, url; if (match.length === 3) { // Patterns with title and URL title = match[1].trim(); url = match[2].trim(); } else if (match.length === 2) { // URL-only pattern url = match[1].trim(); title = url; } else { continue; } // Normalize URL url = url.replace(/[.,;:]+$/, ''); // Remove trailing punctuation // Skip duplicates if (seen.has(url)) continue; seen.add(url); try { new URL(url); // Validate URL format citations.push({ title, url, original: match[0] }); } catch { // Invalid URL format, skip } } } return citations; } /** * Check if 
a domain is trusted and can skip validation * @param {string} url - The URL to check * @returns {boolean} True if the domain is trusted */ function isTrustedDomain(url) { try { const hostname = new URL(url).hostname.toLowerCase(); return TRUSTED_DOMAINS.some(d => hostname.includes(d)); } catch { return false; } } /** * Check if a domain should skip validation due to blocking * @param {string} url - The URL to check * @returns {boolean} True if validation should be skipped */ function shouldSkipValidation(url) { try { const hostname = new URL(url).hostname.toLowerCase(); return SKIP_VALIDATION_DOMAINS.some(d => hostname.includes(d)); } catch { return false; } } /** * Validate a single URL via HEAD request * @param {string} url - The URL to validate * @param {number} timeout - Request timeout in ms * @returns {Promise<Object>} Validation result */ async function validateUrl(url, timeout = 5000) { // Skip validation for certain domains if (shouldSkipValidation(url)) { return { url, valid: true, status: 'skipped', reason: 'domain blocks validation' }; } // Trust known domains if (isTrustedDomain(url)) { return { url, valid: true, status: 'trusted', reason: 'trusted domain' }; } try { const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), timeout); const response = await fetch(url, { method: 'HEAD', signal: controller.signal, headers: { 'User-Agent': 'OpenRouter-Agents-CitationValidator/1.0' }, redirect: 'follow' }); clearTimeout(timeoutId); if (response.ok || response.status === 403 || response.status === 405) { // 403/405 often mean the page exists but blocks HEAD return { url, valid: true, status: response.status, reason: 'accessible' }; } else if (response.status === 404) { return { url, valid: false, status: 404, reason: 'not found' }; } else { return { url, valid: false, status: response.status, reason: `HTTP ${response.status}` }; } } catch (error) { if (error.name === 'AbortError') { return { url, valid: false, status: 
'timeout', reason: 'request timed out' }; } return { url, valid: false, status: 'error', reason: error.message }; } } /** * Validate all citations in text * @param {string} text - Text containing citations * @param {Object} options - Validation options * @returns {Promise<Object>} Validation results */ async function validateCitations(text, options = {}) { const { maxConcurrent = 5, timeout = 5000, requestId = 'unknown' } = options; const citations = extractCitations(text); if (citations.length === 0) { return { total: 0, valid: 0, invalid: 0, results: [] }; } logger.debug('Validating citations', { requestId, count: citations.length }); const results = []; const batches = []; // Process in batches for rate limiting for (let i = 0; i < citations.length; i += maxConcurrent) { batches.push(citations.slice(i, i + maxConcurrent)); } for (const batch of batches) { const batchResults = await Promise.all( batch.map(c => validateUrl(c.url, timeout).then(r => ({ ...c, ...r }))) ); results.push(...batchResults); } const valid = results.filter(r => r.valid).length; const invalid = results.filter(r => !r.valid).length; logger.info('Citation validation complete', { requestId, total: results.length, valid, invalid }); return { total: results.length, valid, invalid, results }; } /** * Annotate text with citation validity markers * @param {string} text - Original text with citations * @param {Array<Object>} validationResults - Results from validateCitations * @returns {string} Annotated text */ function annotateText(text, validationResults) { if (!text || !validationResults?.results) return text; let annotated = text; for (const result of validationResults.results) { if (!result.valid && result.original) { const marker = result.status === 404 ? 
'[DEAD LINK]' : '[UNVERIFIED]'; annotated = annotated.replace( result.original, `${result.original} ${marker}` ); } } return annotated; } /** * Get a summary of citation quality * @param {Object} validationResults - Results from validateCitations * @returns {Object} Quality summary */ function getQualitySummary(validationResults) { if (!validationResults?.total) { return { score: 1.0, level: 'unknown', message: 'No citations to validate' }; } const { total, valid, invalid } = validationResults; const score = total > 0 ? valid / total : 0; let level, message; if (score >= 0.9) { level = 'high'; message = `${valid}/${total} citations verified`; } else if (score >= 0.7) { level = 'medium'; message = `${invalid} of ${total} citations could not be verified`; } else if (score >= 0.5) { level = 'low'; message = `Warning: ${invalid}/${total} citations unverified or dead`; } else { level = 'very-low'; message = `Critical: Most citations (${invalid}/${total}) could not be verified`; } return { score, level, message }; } module.exports = { extractCitations, validateUrl, validateCitations, annotateText, getQualitySummary, isTrustedDomain };

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wheattoast11/openrouter-deep-research-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.