NCBI Entrez MCP Server

parsers.ts•20.7 KiB

// src/lib/parsers.ts
import type { ParsingDiagnostics } from "./types.js";

/** A single typed record produced by a parser. */
export interface ParsedEntity {
  type: string;
  // The payload shape differs per entity type. It stays `any` so existing
  // callers that read fields directly off `data` keep compiling.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  data: any;
}

/** Everything a parser returns: the extracted entities plus diagnostics. */
export interface ParseResult {
  entities: ParsedEntity[];
  diagnostics: ParsingDiagnostics;
}

/** Contract shared by all parsers: turn raw response text into typed entities with diagnostics. */
export interface IContentParser {
  parse(content: string): ParseResult;
}

/** Shape of a MeSH-like term emitted by the PubMed parser. */
interface MeshTerm {
  uid: string;
  descriptorname: string;
}

/** Shape of an author record emitted by the PubMed parser. */
interface AuthorRecord {
  uid: string;
  lastname: string | null;
  forename: string | null;
  affiliation: string | null;
}

// Named character references decoded by stripXmlTags. A Map (not an object
// literal) so names such as "constructor" cannot hit prototype properties.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["lt", "<"],
  ["gt", ">"],
  ["amp", "&"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/**
 * Removes XML/HTML tags, decodes character references and trims the result.
 *
 * Tags are removed first so that escaped markup (`&lt;b&gt;`) survives as
 * literal text. Entities are decoded in a single pass, so `&amp;lt;` becomes
 * `&lt;` rather than being decoded twice into `<`. Unknown named entities and
 * out-of-range numeric references are left untouched.
 *
 * @param text Raw text taken from an XML element.
 * @returns The plain text; falsy input is returned unchanged.
 */
function stripXmlTags(text: string): string {
  if (!text) return text;
  return text
    .replace(/<[^>]*>/g, "")
    .replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-z]+));/g,
      (entity: string, dec?: string, hex?: string, name?: string): string => {
        if (name !== undefined) return NAMED_ENTITIES.get(name) ?? entity;
        const codePoint =
          dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
        const isValid =
          Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
        return isValid ? String.fromCodePoint(codePoint) : entity;
      },
    )
    .trim();
}

/** Returns capture group 1 of a non-global pattern exactly as matched, or null when there is no match. */
function captureRaw(xml: string, pattern: RegExp): string | null {
  return pattern.exec(xml)?.[1] ?? null;
}

/** Like captureRaw, but the captured text is cleaned with stripXmlTags. */
function captureText(xml: string, pattern: RegExp): string | null {
  const raw = captureRaw(xml, pattern);
  return raw === null ? null : stripXmlTags(raw);
}

/** Like captureRaw, but the captured text is parsed as a base-10 integer. */
function captureInt(xml: string, pattern: RegExp): number | null {
  const raw = captureRaw(xml, pattern);
  return raw === null ? null : parseInt(raw, 10);
}

/** Builds a diagnostics record with zeroed counters and no MeSH availability. */
function createDiagnostics(
  methodUsed: ParsingDiagnostics["method_used"],
  indexingStatus: ParsingDiagnostics["indexing_status"],
  warnings: string[] = [],
): ParsingDiagnostics {
  return {
    method_used: methodUsed,
    terms_found: 0,
    failed_extractions: [],
    warnings,
    indexing_status: indexingStatus,
    mesh_availability: "none",
  };
}

// --- PubMed patterns (hoisted so they are not rebuilt per article) ---
const PUBMED_ARTICLE_RE = /<PubmedArticle>[\s\S]+?<\/PubmedArticle>/g;
const PMID_RE = /<PMID[^>]*>(\d+)<\/PMID>/;
const ARTICLE_TITLE_RE = /<ArticleTitle>([\s\S]+?)<\/ArticleTitle>/;
const JOURNAL_TITLE_RE = /<Title>([\s\S]+?)<\/Title>/;
const PUB_YEAR_RE = /<PubDate>[\s\S]*?<Year>(\d{4})<\/Year>/;
const PRIMARY_ABSTRACT_RE = /<Abstract>([\s\S]*?)<\/Abstract>/;
const ABSTRACT_TEXT_RE = /<AbstractText[^>]*>([\s\S]+?)<\/AbstractText>/i;
const ABSTRACT_TEXT_ALL_RE = /<AbstractText[^>]*>([\s\S]+?)<\/AbstractText>/gi;
// `(?:\s[^>]*)?` keeps this from also matching the `<AuthorList ...>` wrapper.
const AUTHOR_RE = /<Author(?:\s[^>]*)?>([\s\S]*?)<\/Author>/g;
const LAST_NAME_RE = /<LastName>([^<]+)<\/LastName>/;
const FORE_NAME_RE = /<ForeName>([^<]+)<\/ForeName>/;
const AFFILIATION_RE = /<Affiliation>([^<]+)<\/Affiliation>/g;
const MESH_HEADING_RE = /<MeshHeading>([\s\S]*?)<\/MeshHeading>/g;
const DESCRIPTOR_RE = /<DescriptorName[^>]*>([^<]+)<\/DescriptorName>/;
const QUALIFIER_RE = /<QualifierName[^>]*>([^<]+)<\/QualifierName>/g;
const KEYWORD_RE = /<Keyword[^>]*>([^<]+)<\/Keyword>/g;

// Vocabulary used for last-resort term extraction from title/abstract text.
const FALLBACK_MEDICAL_TERMS: readonly string[] = [
  "cancer",
  "tumor",
  "mutation",
  "protein",
  "gene",
  "therapy",
  "treatment",
  "drug",
  "pharmacology",
  "clinical",
  "patient",
  "disease",
  "syndrome",
  "receptor",
  "inhibitor",
  "biomarker",
  "expression",
  "pathway",
];

/**
 * Returns the raw inner text of every abstract section of an article.
 *
 * Structured abstracts carry several `<AbstractText>` elements; all sections
 * of the primary `<Abstract>` are returned in document order. When there is no
 * `<Abstract>` wrapper, the first `<AbstractText>` anywhere in the article is
 * used instead.
 */
function extractAbstractSections(articleXml: string): string[] {
  const primary = PRIMARY_ABSTRACT_RE.exec(articleXml)?.[1];
  if (primary !== undefined) {
    const sections = Array.from(
      primary.matchAll(ABSTRACT_TEXT_ALL_RE),
      (m) => m[1] ?? "",
    );
    if (sections.length > 0) return sections;
  }
  const first = ABSTRACT_TEXT_RE.exec(articleXml)?.[1];
  return first === undefined ? [] : [first];
}

/**
 * Parser for PubMed EFetch XML.
 *
 * For each `<PubmedArticle>` it emits `author` entities, then `meshterm`
 * entities, then one `article` entity. MeSH terms are taken from descriptors,
 * then qualifiers, then keywords; if none are found, a crude vocabulary scan
 * of title/abstract is used instead.
 */
export class PubMedXMLParser implements IContentParser {
  /**
   * @param content Raw EFetch XML text.
   * @returns Entities in emission order plus diagnostics. `terms_found` counts
   *   only real (non-fallback) MeSH terms.
   */
  parse(content: string): ParseResult {
    const entities: ParsedEntity[] = [];
    const diagnostics = createDiagnostics("mesh_descriptors", "unknown");

    const articleBlocks = content.match(PUBMED_ARTICLE_RE) ?? [];
    let totalMeshTerms = 0;
    let articlesWithMesh = 0;

    for (const articleXml of articleBlocks) {
      // Articles without a PMID get a random placeholder id.
      const articleUID =
        captureRaw(articleXml, PMID_RE) ?? `art_${Math.random()}`;
      const rawTitle = captureRaw(articleXml, ARTICLE_TITLE_RE);
      const abstractSections = extractAbstractSections(articleXml);

      const authors = this.extractAuthors(articleXml, articleUID);
      for (const author of authors) {
        entities.push({ type: "author", data: author });
      }

      let meshTerms = this.extractMeshTermsWithFallback(
        articleXml,
        articleUID,
        diagnostics,
      );
      if (meshTerms.length > 0) {
        articlesWithMesh++;
        totalMeshTerms += meshTerms.length;
      } else {
        // Nothing indexed: scan title/abstract for vocabulary terms.
        meshTerms = this.extractFallbackTerms(
          rawTitle ?? "",
          abstractSections.join(" "),
          articleUID,
          diagnostics,
        );
      }
      for (const term of meshTerms) {
        entities.push({ type: "meshterm", data: term });
      }

      entities.push({
        type: "article",
        data: {
          uid: articleUID,
          pmid: articleUID,
          title: rawTitle === null ? "No Title" : stripXmlTags(rawTitle),
          journal: captureText(articleXml, JOURNAL_TITLE_RE),
          year: captureInt(articleXml, PUB_YEAR_RE),
          abstract:
            abstractSections.length > 0
              ? abstractSections
                  .map((section) => stripXmlTags(section))
                  .join(" ")
              : null,
          authors,
          meshTerms,
        },
      });
    }

    diagnostics.terms_found = totalMeshTerms;
    const meshSuccessRate =
      articleBlocks.length > 0
        ? (articlesWithMesh / articleBlocks.length) * 100
        : 0;

    if (meshSuccessRate > 75) {
      diagnostics.mesh_availability = "full";
      diagnostics.indexing_status = "complete";
    } else if (meshSuccessRate > 25) {
      diagnostics.mesh_availability = "partial";
      diagnostics.indexing_status = "in_progress";
    } else {
      diagnostics.mesh_availability = "none";
      diagnostics.indexing_status = "not_indexed";
      diagnostics.warnings.push(
        "Low MeSH term availability - articles may be too recent or not yet indexed",
      );
    }

    return { entities, diagnostics };
  }

  /** Extracts one record per `<Author>` element; multiple affiliations are joined with "; ". */
  private extractAuthors(
    articleXml: string,
    articleUID: string,
  ): AuthorRecord[] {
    return Array.from(articleXml.matchAll(AUTHOR_RE), (block, index) => {
      const authorXml = block[1] ?? "";
      const affiliations = Array.from(
        authorXml.matchAll(AFFILIATION_RE),
        (m) => stripXmlTags((m[1] ?? "").trim()),
      );
      return {
        uid: `${articleUID}_auth_${index}`,
        lastname: captureText(authorXml, LAST_NAME_RE),
        forename: captureText(authorXml, FORE_NAME_RE),
        affiliation: affiliations.length > 0 ? affiliations.join("; ") : null,
      };
    });
  }

  /**
   * Runs the extraction strategies in priority order and returns the first
   * non-empty result, recording the winning strategy in
   * `diagnostics.method_used`. A strategy that throws is recorded in
   * `diagnostics.failed_extractions` and the next one is tried.
   */
  private extractMeshTermsWithFallback(
    articleXml: string,
    articleUID: string,
    diagnostics: ParsingDiagnostics,
  ): MeshTerm[] {
    const strategies: ReadonlyArray<{ name: string; run: () => MeshTerm[] }> = [
      {
        name: "mesh_descriptors",
        run: () => this.extractMeshDescriptors(articleXml, articleUID),
      },
      {
        name: "mesh_qualifiers",
        run: () => this.extractMeshQualifiers(articleXml, articleUID),
      },
      {
        name: "keywords_as_mesh",
        run: () => this.extractKeywordsAsMesh(articleXml, articleUID),
      },
    ];

    for (const [index, strategy] of strategies.entries()) {
      try {
        const terms = strategy.run();
        if (terms.length > 0) {
          diagnostics.method_used = strategy.name;
          return terms;
        }
      } catch (error: unknown) {
        diagnostics.failed_extractions.push(
          `Strategy ${index + 1}: ${error instanceof Error ? error.message : "Unknown error"}`,
        );
      }
    }
    return [];
  }

  /** One term per `<MeshHeading>` that has a non-empty `<DescriptorName>`; the uid index is the heading's position. */
  private extractMeshDescriptors(
    articleXml: string,
    articleUID: string,
  ): MeshTerm[] {
    return Array.from(articleXml.matchAll(MESH_HEADING_RE)).flatMap(
      (block, index) => {
        const descriptor = captureText(block[1] ?? "", DESCRIPTOR_RE);
        return descriptor
          ? [{ uid: `${articleUID}_mesh_${index}`, descriptorname: descriptor }]
          : [];
      },
    );
  }

  /** One term per `<QualifierName>` element in the article. */
  private extractMeshQualifiers(
    articleXml: string,
    articleUID: string,
  ): MeshTerm[] {
    return Array.from(articleXml.matchAll(QUALIFIER_RE), (m, index) => ({
      uid: `${articleUID}_mesh_qual_${index}`,
      descriptorname: stripXmlTags(m[1] ?? ""),
    }));
  }

  /** One term per `<Keyword>` element in the article. */
  private extractKeywordsAsMesh(
    articleXml: string,
    articleUID: string,
  ): MeshTerm[] {
    return Array.from(articleXml.matchAll(KEYWORD_RE), (m, index) => ({
      uid: `${articleUID}_mesh_kw_${index}`,
      descriptorname: stripXmlTags(m[1] ?? ""),
    }));
  }

  /**
   * Last-resort term extraction: a case-insensitive substring scan of the
   * title and abstract against a fixed vocabulary. Because it is a substring
   * test, "gene" also matches inside "general". When anything is found,
   * `method_used` is overwritten and a warning is recorded.
   */
  private extractFallbackTerms(
    title: string,
    abstract: string,
    articleUID: string,
    diagnostics: ParsingDiagnostics,
  ): MeshTerm[] {
    const haystack = `${title} ${abstract}`.toLowerCase();
    const fallbackTerms = FALLBACK_MEDICAL_TERMS.flatMap((term, index) =>
      haystack.includes(term)
        ? [
            {
              uid: `${articleUID}_mesh_fallback_${index}`,
              descriptorname: term.charAt(0).toUpperCase() + term.slice(1),
            },
          ]
        : [],
    );

    if (fallbackTerms.length > 0) {
      diagnostics.method_used = "fallback_extraction";
      diagnostics.warnings.push(
        "Used fallback term extraction from title/abstract",
      );
    }
    return fallbackTerms;
  }
}

// --- EInfo patterns ---
const EINFO_FIELD_RE = /<Field>[\s\S]*?<\/Field>/g;
const EINFO_LINK_RE = /<Link>[\s\S]*?<\/Link>/g;
const EINFO_NAME_RE = /<Name>([^<]+)<\/Name>/;

/**
 * Parser for EInfo XML. Emits at most one `database_info` entity, then one
 * `searchable_field` per `<Field>` and one `link_info` per `<Link>`.
 * Fields and links without a `<Name>` are skipped.
 */
export class EInfoXMLParser implements IContentParser {
  /**
   * @param content Raw EInfo XML text.
   * @returns Entities plus diagnostics; `terms_found` is the entity count.
   */
  parse(content: string): ParseResult {
    const entities: ParsedEntity[] = [];
    const diagnostics = createDiagnostics("einfo_extraction", "complete");

    const dbName = captureRaw(content, /<DbName>([^<]+)<\/DbName>/);
    if (dbName !== null) {
      entities.push({
        type: "database_info",
        data: {
          uid: `db_${dbName}`,
          name: stripXmlTags(dbName),
          description: captureText(
            content,
            /<Description>([^<]+)<\/Description>/,
          ),
          record_count: captureInt(content, /<Count>([^<]+)<\/Count>/),
          last_update: captureText(
            content,
            /<LastUpdate>([^<]+)<\/LastUpdate>/,
          ),
        },
      });
    }

    (content.match(EINFO_FIELD_RE) ?? []).forEach((fieldXml, index) => {
      const name = captureRaw(fieldXml, EINFO_NAME_RE);
      if (name === null) return;
      entities.push({
        type: "searchable_field",
        data: {
          uid: `field_${name}_${index}`,
          name: stripXmlTags(name),
          full_name: captureText(fieldXml, /<FullName>([^<]+)<\/FullName>/),
          is_date: captureRaw(fieldXml, /<IsDate>([^<]+)<\/IsDate>/) === "Y",
          is_numerical:
            captureRaw(fieldXml, /<IsNumerical>([^<]+)<\/IsNumerical>/) === "Y",
        },
      });
    });

    (content.match(EINFO_LINK_RE) ?? []).forEach((linkXml, index) => {
      const name = captureRaw(linkXml, EINFO_NAME_RE);
      if (name === null) return;
      entities.push({
        type: "link_info",
        data: {
          uid: `link_${name}_${index}`,
          name: stripXmlTags(name),
          menu_name: captureText(linkXml, /<Menu>([^<]+)<\/Menu>/),
          target_db: captureText(linkXml, /<DbTo>([^<]+)<\/DbTo>/),
        },
      });
    });

    diagnostics.terms_found = entities.length;
    return { entities, diagnostics };
  }
}

// --- ESummary patterns ---
const DOCSUM_RE = /<DocSum>[\s\S]*?<\/DocSum>/g;
const DOCSUM_AUTHOR_LIST_RE = /<Item Name="AuthorList"[^>]*>([\s\S]*?)<\/Item>/;
const DOCSUM_AUTHOR_RE = /<Item Name="Author"[^>]*>([^<]*)<\/Item>/g;

/**
 * Parser for ESummary XML. Emits one `document_summary` per `<DocSum>` that
 * has an `<Id>`.
 */
export class ESummaryXMLParser implements IContentParser {
  /**
   * @param content Raw ESummary XML text.
   * @returns Entities plus diagnostics; `terms_found` is the entity count.
   */
  parse(content: string): ParseResult {
    const entities: ParsedEntity[] = [];
    const diagnostics = createDiagnostics("esummary_extraction", "complete");

    for (const docSumXml of content.match(DOCSUM_RE) ?? []) {
      const uid = captureRaw(docSumXml, /<Id>([^<]+)<\/Id>/);
      if (uid === null) continue;

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const summary: any = {
        uid,
        pmid: uid,
        title: captureText(docSumXml, /<Item Name="Title"[^>]*>([^<]*)<\/Item>/),
        journal: captureText(
          docSumXml,
          /<Item Name="FullJournalName"[^>]*>([^<]*)<\/Item>/,
        ),
        pub_date: captureText(
          docSumXml,
          /<Item Name="PubDate"[^>]*>([^<]*)<\/Item>/,
        ),
        doi: captureText(docSumXml, /<Item Name="DOI"[^>]*>([^<]*)<\/Item>/),
      };

      // `authors` is only set when an AuthorList item is present. The author
      // items are collected from the whole DocSum: a lazy capture of the list
      // body would stop at the first inner `</Item>` and never contain a
      // complete Author item.
      if (DOCSUM_AUTHOR_LIST_RE.test(docSumXml)) {
        summary.authors = Array.from(
          docSumXml.matchAll(DOCSUM_AUTHOR_RE),
          (m) => stripXmlTags(m[1] ?? ""),
        ).filter((author) => author);
      }

      entities.push({ type: "document_summary", data: summary });
    }

    diagnostics.terms_found = entities.length;
    return { entities, diagnostics };
  }
}

/**
 * Parser for the BLAST submit response. Emits a single `blast_job` entity
 * when a request id (RID) can be found, otherwise records a failed extraction.
 */
export class BlastSubmitParser implements IContentParser {
  /**
   * @param content Raw BLAST submit response text.
   * @returns Zero or one entity plus diagnostics.
   */
  parse(content: string): ParseResult {
    const entities: ParsedEntity[] = [];
    const diagnostics = createDiagnostics("blast_submit_extraction", "complete");

    // The RID appears either as a form input value or as a `RID = ...` line.
    const ridMatch = content.match(
      /value="([A-Z0-9]+)"\s+id="rid"|RID\s*=\s*([A-Z0-9]+)/i,
    );

    if (ridMatch) {
      const rid = ridMatch[1] || ridMatch[2];
      entities.push({
        type: "blast_job",
        data: {
          uid: rid,
          rid,
          status: "submitted",
          estimated_time: captureInt(
            content,
            /We estimate that results will be ready in (\d+)/i,
          ),
        },
      });
      diagnostics.terms_found = 1;
    } else {
      diagnostics.failed_extractions.push(
        "Could not extract RID from BLAST response",
      );
    }

    return { entities, diagnostics };
  }
}

// --- Sequence-record patterns shared by the Gene/Protein/Nucleotide parsers ---
const ENTREZGENE_RE = /<Entrezgene>[\s\S]+?<\/Entrezgene>/g;
const BIOSEQ_RE = /<Bioseq>[\s\S]+?<\/Bioseq>/g;
const ORGANISM_RE =
  /<BioSource_org>[\s\S]*?<Org-ref_taxname>([^<]+)<\/Org-ref_taxname>/;
const ACCESSION_RE = /<Textseq-id_accession>([^<]+)<\/Textseq-id_accession>/;
const SEQ_TITLE_RE = /<Seqdesc_title>([^<]+)<\/Seqdesc_title>/;
const SEQ_LENGTH_RE = /<Bioseq_length>(\d+)<\/Bioseq_length>/;
// NCBI XML writes named integers as `<MolInfo_biomol value="mRNA">3</...>`,
// so attributes on the element must be tolerated.
const BIOMOL_RE = /<MolInfo_biomol[^>]*>(\d+)<\/MolInfo_biomol>/;

// MolInfo.biomol codes per the NCBI ASN.1 sequence specification. Labels for
// codes 1-5 are kept as previously emitted by this parser.
const BIOMOL_NAMES: ReadonlyMap<string, string> = new Map([
  ["1", "genomic DNA"],
  ["2", "pre-mRNA"],
  ["3", "mRNA"],
  ["4", "rRNA"],
  ["5", "tRNA"],
  ["6", "snRNA"],
  ["7", "scRNA"],
  ["8", "peptide"],
  ["9", "other-genetic"],
  ["10", "genomic-mRNA"],
  ["11", "cRNA"],
  ["12", "snoRNA"],
  ["13", "transcribed RNA"],
  ["14", "ncRNA"],
  ["15", "tmRNA"],
  ["255", "other"],
]);

/** Fields common to protein and nucleotide `<Bioseq>` records. */
function parseBioseqCore(
  seqXml: string,
  uidPrefix: string,
): {
  uid: string;
  accession: string;
  title: string | null;
  length: number | null;
  organism: string | null;
} {
  // Records without an accession get a random placeholder id.
  const uid =
    captureRaw(seqXml, ACCESSION_RE) ?? `${uidPrefix}_${Math.random()}`;
  return {
    uid,
    accession: uid,
    title: captureText(seqXml, SEQ_TITLE_RE),
    length: captureInt(seqXml, SEQ_LENGTH_RE),
    organism: captureText(seqXml, ORGANISM_RE),
  };
}

/** Parser for Gene database XML. Emits one `gene` entity per `<Entrezgene>`. */
export class GeneXMLParser implements IContentParser {
  /**
   * @param content Raw Gene XML text.
   * @returns Entities plus diagnostics; `terms_found` is the entity count.
   */
  parse(content: string): ParseResult {
    const entities: ParsedEntity[] = [];
    const diagnostics = createDiagnostics("gene_xml_extraction", "complete");

    for (const geneXml of content.match(ENTREZGENE_RE) ?? []) {
      const geneUID =
        captureRaw(geneXml, /<Gene-track_geneid>(\d+)<\/Gene-track_geneid>/) ??
        `gene_${Math.random()}`;
      const symbol = captureText(
        geneXml,
        /<Gene-ref_locus>([^<]+)<\/Gene-ref_locus>/,
      );

      entities.push({
        type: "gene",
        data: {
          uid: geneUID,
          gene_id: geneUID,
          symbol,
          name: symbol, // the locus symbol doubles as the name
          description: captureText(
            geneXml,
            /<Gene-ref_desc>([^<]+)<\/Gene-ref_desc>/,
          ),
          chromosome: captureText(
            geneXml,
            /<Maps_display-str>([^<]+)<\/Maps_display-str>/,
          ),
          organism: captureText(geneXml, ORGANISM_RE),
          gene_type: "protein-coding", // Default, could be enhanced
        },
      });
    }

    diagnostics.terms_found = entities.length;
    return { entities, diagnostics };
  }
}

/** Parser for Protein database XML. Emits one `protein` entity per `<Bioseq>`. */
export class ProteinXMLParser implements IContentParser {
  /**
   * @param content Raw Protein XML text.
   * @returns Entities plus diagnostics; `terms_found` is the entity count.
   */
  parse(content: string): ParseResult {
    const diagnostics = createDiagnostics("protein_xml_extraction", "complete");
    const entities: ParsedEntity[] = (content.match(BIOSEQ_RE) ?? []).map(
      (seqXml) => ({ type: "protein", data: parseBioseqCore(seqXml, "prot") }),
    );

    diagnostics.terms_found = entities.length;
    return { entities, diagnostics };
  }
}

/**
 * Parser for Nucleotide database XML. Emits one `nucleotide` entity per
 * `<Bioseq>`, with `molecule_type` mapped from the numeric biomol code
 * ("unknown" when absent or unmapped).
 */
export class NucleotideXMLParser implements IContentParser {
  /**
   * @param content Raw Nucleotide XML text.
   * @returns Entities plus diagnostics; `terms_found` is the entity count.
   */
  parse(content: string): ParseResult {
    const diagnostics = createDiagnostics(
      "nucleotide_xml_extraction",
      "complete",
    );
    const entities: ParsedEntity[] = (content.match(BIOSEQ_RE) ?? []).map(
      (seqXml) => {
        const biomolCode = captureRaw(seqXml, BIOMOL_RE);
        return {
          type: "nucleotide",
          data: {
            ...parseBioseqCore(seqXml, "nucl"),
            molecule_type:
              (biomolCode !== null ? BIOMOL_NAMES.get(biomolCode) : undefined) ??
              "unknown",
          },
        };
      },
    );

    diagnostics.terms_found = entities.length;
    return { entities, diagnostics };
  }
}

/**
 * Fallback for content no specialised parser handles. Wraps the
 * JSON-serialised input in a single `raw_data` entity.
 */
export class FallbackParser implements IContentParser {
  /**
   * @param content Any value; it is serialised with `JSON.stringify`.
   * @returns One `raw_data` entity and a warning diagnostic.
   */
  parse(content: unknown): ParseResult {
    return {
      entities: [
        {
          type: "raw_data",
          data: { uid: "raw_1", content: JSON.stringify(content) },
        },
      ],
      diagnostics: createDiagnostics("fallback_raw", "unknown", [
        "Content could not be parsed with specialized parser",
      ]),
    };
  }
}

/**
 * Selects a parser by database and return type.
 *
 * @param db Entrez database name.
 * @param rettype Requested return type; only "xml" has specialised parsers.
 * @returns A specialised parser, or FallbackParser for any other combination.
 */
export function getParserFor(db: string, rettype?: string): IContentParser {
  if (rettype === "xml") {
    switch (db) {
      case "pubmed":
        return new PubMedXMLParser();
      case "gene":
        return new GeneXMLParser();
      case "protein":
        return new ProteinXMLParser();
      case "nucleotide":
      case "nuccore":
        return new NucleotideXMLParser();
    }
  }
  return new FallbackParser();
}

/**
 * Selects a parser for a specific tool's response.
 *
 * @param toolName Tool that produced the response.
 * @param content Response text; for "EFetch" it is sniffed for PubMed XML.
 * @returns The matching parser, or FallbackParser when none applies.
 */
export function getParserForTool(
  toolName: string,
  content: string,
): IContentParser {
  switch (toolName) {
    case "EInfo":
      return new EInfoXMLParser();
    case "ESummary":
      return new ESummaryXMLParser();
    case "EFetch":
      // Only PubMed article XML is recognised from the content itself.
      return content.includes("<PubmedArticle>")
        ? new PubMedXMLParser()
        : new FallbackParser();
    case "BLAST Submit":
      return new BlastSubmitParser();
    default:
      return new FallbackParser();
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/QuentinCody/entrez-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

parsers.ts•20.7 KiB