mcp-open-webresearch

citationExtractor.ts•13.7 KiB

/** * CitationExtractor Agent * * Visits search result URLs, extracts relevant quotes, and assesses source quality. * This agent uses LLM to analyze page content and extract citations. */ import { visitPage } from "../../../infrastructure/visit_page/visit.js"; import { callLLM } from "../../../infrastructure/callLLM.js"; import { getConfig } from "../../../config/index.js"; import { normalizeUrlForDedup } from "../../../utils/url.js"; import { CITATION_EXTRACTOR_SYSTEM_PROMPT, buildCitationExtractorUserPrompt, buildCitationRetryPrompt, } from "../prompts/citationExtractor.prompt.js"; import type { MergedSearchResult } from "../../../types/MergedSearchResult.js"; import type { CitationEntry } from "../contextSheet.js"; export interface CitationExtractorResult { citations: CitationEntry[]; visitedUrls: string[]; failedUrls: string[]; } interface LLMCitationResponse { quality: "HIGH" | "MEDIUM" | "LOW" | "REJECTED"; qualityNote: string; quotes: string[]; } /** * Parse LLM response to extract citation data. */ function parseCitationResponse(responseText: string): LLMCitationResponse { let jsonText = responseText.trim(); // Remove markdown code fencing if present const jsonMatch = jsonText.match(/```(?:json)?\s*([\s\S]*?)```/); if (jsonMatch) { jsonText = jsonMatch[1].trim(); } try { const parsed = JSON.parse(jsonText); // Validate required fields const quality = parsed.quality?.toUpperCase() || "LOW"; const validQualities = ["HIGH", "MEDIUM", "LOW", "REJECTED"]; const normalizedQuality = validQualities.includes(quality) ? (quality as LLMCitationResponse["quality"]) : "LOW"; return { quality: normalizedQuality, qualityNote: parsed.qualityNote || "No quality assessment provided", quotes: Array.isArray(parsed.quotes) ? parsed.quotes : [], }; } catch { console.debug("[CitationExtractor] Failed to parse JSON response"); return { quality: "REJECTED", qualityNote: "Failed to parse LLM response", quotes: [], }; } } /** * Normalize text for verbatim matching. * Strips markdown formatting, handles Unicode variants, and normalizes whitespace. */ function normalizeForMatching(text: string): string { return ( text .toLowerCase() // Case insensitive matching // Replace all non-alphanumeric characters (not letters, numbers, or whitespace) with a space // Using /u flag for Unicode support (\p{L} = Any Letter, \p{N} = Any Number) .replace(/[^\p{L}\p{N}\s]/gu, " ") // Collapse multiple spaces into one and trim .replace(/\s+/g, " ") .trim() ); } /** * Verify that a quote exists verbatim in the source text. * Returns true if the quote (or a punctuation-flexible variant) is found. * * @param alternateSource Optional second source (e.g. plain text) to check if primary source (markdown) fails. */ function verifyQuoteInSource( quote: string, sourceText: string, alternateSource?: string, ): boolean { // 1. Check Primary Source (Markdown) if (sourceText.includes(quote)) return true; const normalizedQuote = normalizeForMatching(quote); const normalizedSource = normalizeForMatching(sourceText); if (normalizedSource.includes(normalizedQuote)) return true; // 2. Check Alternate Source (Plain Text) if provided // This handles cases where markdown links break word adjacency (e.g. "chicken [link] shawarma") if (alternateSource) { if (alternateSource.includes(quote)) return true; const normalizedAlt = normalizeForMatching(alternateSource); if (normalizedAlt.includes(normalizedQuote)) return true; } return false; } /** * Filter quotes to only include those that exist verbatim in the source. * Returns verified quotes and count of rejected quotes. */ function filterVerifiedQuotes( quotes: string[], sourceText: string, ): { verified: string[]; rejectedCount: number } { const verified: string[] = []; let rejectedCount = 0; for (const quote of quotes) { if (verifyQuoteInSource(quote, sourceText)) { verified.push(quote); } else { rejectedCount++; console.debug( `[CitationExtractor] Rejected non-verbatim quote: "${quote}"`, ); } } return { verified, rejectedCount }; } /** * Filter quotes with full rejection list for retry logic. * Returns both verified and rejected quote arrays. */ function filterVerifiedQuotesWithRejected( quotes: string[], sourceText: string, alternateSource?: string, ): { verified: string[]; rejected: string[] } { const verified: string[] = []; const rejected: string[] = []; for (const quote of quotes) { if (verifyQuoteInSource(quote, sourceText, alternateSource)) { verified.push(quote); } else { rejected.push(quote); // Detailed debug logging for rejected quotes const normalizedQuote = normalizeForMatching(quote); const normalizedSource = normalizeForMatching(sourceText); const quoteInNormalized = normalizedSource.includes(normalizedQuote); console.debug( `[CitationExtractor] Rejected non-verbatim quote: "${quote}"`, ); console.debug( `[CitationExtractor] DEBUG - Quote (${quote.length} chars): "${quote}"`, ); console.debug( `[CitationExtractor] DEBUG - Normalized quote: "${normalizedQuote}"`, ); console.debug( `[CitationExtractor] DEBUG - Quote in normalized source: ${quoteInNormalized}`, ); // Try to find similar text in source const searchTerm = quote.slice(0, 30); const idx = sourceText.indexOf(searchTerm); if (idx >= 0) { console.debug( `[CitationExtractor] DEBUG - Similar text found at ${idx}: "${sourceText.slice(idx - 20, idx + quote.length + 20)}"`, ); } else { console.debug( `[CitationExtractor] DEBUG - No similar text found for: "${searchTerm}"`, ); } } } return { verified, rejected }; } /** * Execute the CitationExtractor agent. * Visits each URL, extracts content, and uses LLM to find relevant quotes. */ export async function executeCitationExtractor( results: MergedSearchResult[], objective: string, maxUrls?: number, allCitations: CitationEntry[] = [], startingId: number = 1, signal?: AbortSignal, onProgress?: (batch: { url: string; count: number }[]) => void, ): Promise<CitationExtractorResult> { const citations: CitationEntry[] = []; const visitedUrls: string[] = []; const failedUrls: string[] = []; let nextId = startingId; // Limit URLs to visit: undefined or -1 = all, positive number = limit const urlsToVisit = maxUrls === undefined || maxUrls < 0 ? results : results.slice(0, maxUrls); console.debug( `[CitationExtractor] Processing ${urlsToVisit.length} URLs for citations`, ); // Parallel processing with concurrency limit from configuration const CONCURRENCY = getConfig().browser.concurrency; // Progress batching: collect citations per URL for batched progress reports let progressBatch: { url: string; count: number }[] = []; // Helper to process a single URL async function processUrl( result: MergedSearchResult, urlIndex: number, ): Promise<{ citation?: CitationEntry; visited: boolean; failed: boolean; progressItem: { url: string; count: number }; }> { console.debug( `[CitationExtractor] [${urlIndex + 1}/${urlsToVisit.length}] Visiting: ${result.url}`, ); try { // Visit the page and extract content const page = await visitPage(result.url, false); console.debug( `[CitationExtractor] Got ${page.content.length} chars from "${page.title}"`, ); // Skip empty pages if (!page.content || page.content.trim().length < 100) { console.debug( `[CitationExtractor] Page content too short, skipping LLM analysis`, ); return { citation: { id: 0, // Will be assigned later url: result.url, title: page.title || result.title, quality: "REJECTED", qualityNote: "Page content too short or empty", quotes: [], rawMarkdown: page.content || "", }, visited: true, failed: false, progressItem: { url: normalizeUrlForDedup(result.url), count: 0 }, }; } const existingCitations = allCitations.filter( (c) => c.url === result.url, ); if (existingCitations.length > 0) { console.debug( `[CitationExtractor] Skipping ${result.url} because it has already been visited`, ); return { visited: false, failed: false, progressItem: { url: normalizeUrlForDedup(result.url), count: 0 }, }; } // Use LLM to extract citations const userPrompt = buildCitationExtractorUserPrompt( objective, result.url, page.title || result.title, page.content, existingCitations, ); const llmResult = await callLLM({ systemPrompt: CITATION_EXTRACTOR_SYSTEM_PROMPT, userPrompt, temperature: 0.3, maxTokens: 2000, }); console.debug( `[CitationExtractor] LLM response via ${llmResult.provider}`, ); const extracted = parseCitationResponse(llmResult.text); // Filter quotes to only include those that exist verbatim in the source let { verified, rejected } = filterVerifiedQuotesWithRejected( extracted.quotes, page.content, page.textContent, ); // Retry logic: if some quotes were rejected, ask LLM for replacements const maxRetries = 2; let retryCount = 0; while (rejected.length > 0 && retryCount < maxRetries) { retryCount++; console.debug( `[CitationExtractor] Retry ${retryCount}/${maxRetries}: ${rejected.length} quotes need replacement`, ); const retryPrompt = buildCitationRetryPrompt( objective, result.url, page.title || result.title, page.content, rejected, verified, ); const retryResult = await callLLM({ systemPrompt: CITATION_EXTRACTOR_SYSTEM_PROMPT, userPrompt: retryPrompt, temperature: 0.2, maxTokens: 2000, }); const retryExtracted = parseCitationResponse(retryResult.text); const retryVerification = filterVerifiedQuotesWithRejected( retryExtracted.quotes, page.content, page.textContent, ); for (const newQuote of retryVerification.verified) { if (!verified.includes(newQuote)) { verified.push(newQuote); } } rejected = retryVerification.rejected; console.debug( `[CitationExtractor] Retry ${retryCount}: ${retryVerification.verified.length} verified, ${rejected.length} still rejected`, ); if (rejected.length === 0) { console.debug( `[CitationExtractor] ✓ RECTIFIED: All quotes now verified after retry ${retryCount}`, ); } } if (rejected.length > 0) { console.debug( `[CitationExtractor] ✗ UNRESOLVED: ${rejected.length} quotes could not be verified after ${maxRetries} retries`, ); for (const failedQuote of rejected) { console.debug(`[CitationExtractor] - FAILED: "${failedQuote}"`); } } console.debug( `[CitationExtractor] Extracted ${verified.length} verified quotes (${extracted.quality})`, ); return { citation: { id: 0, // Will be assigned later url: result.url, title: page.title || result.title, quality: extracted.quality, qualityNote: extracted.qualityNote, quotes: verified, rawMarkdown: page.content, }, visited: true, failed: false, progressItem: { url: normalizeUrlForDedup(result.url), count: verified.length, }, }; } catch (error: any) { console.debug( `[CitationExtractor] Failed to visit ${result.url}: ${error.message}`, ); return { visited: false, failed: true, progressItem: { url: normalizeUrlForDedup(result.url), count: 0 }, }; } } // Process URLs in batches for ( let batchStart = 0; batchStart < urlsToVisit.length; batchStart += CONCURRENCY ) { // Check for cancellation before each batch if (signal?.aborted) { console.debug(`[CitationExtractor] Cancelled after ${batchStart} URLs`); break; } const batchEnd = Math.min(batchStart + CONCURRENCY, urlsToVisit.length); const batch = urlsToVisit.slice(batchStart, batchEnd); console.debug( `[CitationExtractor] Processing batch ${Math.floor(batchStart / CONCURRENCY) + 1}: URLs ${batchStart + 1}-${batchEnd}`, ); // Process batch in parallel const batchResults = await Promise.all( batch.map((result, i) => processUrl(result, batchStart + i)), ); // Collect results and send progress for each URL immediately for (const res of batchResults) { if (res.citation) { res.citation.id = nextId++; citations.push(res.citation); } if (res.visited) { visitedUrls.push(res.citation?.url || ""); } if (res.failed) { failedUrls.push(res.progressItem.url); } // Send progress notification for EACH URL immediately if (onProgress) { onProgress([res.progressItem]); } } } console.debug( `[CitationExtractor] Complete: ${visitedUrls.length} visited, ${failedUrls.length} failed, ${citations.length} citations`, ); return { citations, visitedUrls, failedUrls }; }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/rinaldowouterson/mcp-open-webresearch'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

citationExtractor.ts•13.7 KiB