Skip to main content
Glama
eSummaryResultParser.ts•13.9 kB
/**
 * @fileoverview Helper functions for parsing ESummary results from NCBI.
 * Handles different ESummary XML structures and formats the data into
 * consistent ParsedBriefSummary objects.
 * @module src/services/NCBI/parsing/eSummaryResultParser
 */

import {
  ESummaryArticleId,
  ESummaryDocSumOldXml,
  ESummaryDocumentSummary,
  ESummaryItem,
  ESummaryResult,
  ParsedBriefSummary,
  ESummaryAuthor as XmlESummaryAuthor, // This is the normalized output type
  XmlESummaryAuthorRaw, // This is the raw input type from XML parsing
} from "../../../types-global/pubmedXml.js";
import {
  dateParser,
  logger,
  RequestContext,
  requestContextService,
} from "../../../utils/index.js"; // Note: utils/index.js is the barrel file
import { ensureArray, getAttribute, getText } from "./xmlGenericHelpers.js";

/**
 * Intermediate summary shape produced by the per-record parsers: everything in
 * ParsedBriefSummary except the standardized dates, which are carried as the
 * raw strings found in the ESummary payload until they are normalized.
 */
type RawBriefSummary = Omit<ParsedBriefSummary, "pubDate" | "epubDate"> & {
  rawPubDate?: string;
  rawEPubDate?: string;
};

/** Maximum number of author names listed before "et al." is appended. */
const MAX_LISTED_AUTHORS = 3;

/**
 * Converts a raw record identifier into the string stored as `pmid`.
 * A missing identifier becomes the empty string rather than the literal
 * "undefined"/"null" that a bare `String(...)` call would produce, so that
 * truthiness checks on `pmid` correctly reject records without an id.
 * @param rawId - Identifier value as read from the parsed XML.
 * @returns The identifier as a string, or "" when it is absent.
 */
function normalizePmid(rawId: unknown): string {
  return rawId === undefined || rawId === null ? "" : String(rawId);
}

/**
 * Formats an array of ESummary authors into a string.
 * Limits to the first 3 authors and adds "et al." if more exist.
 * @param authors - Array of ESummary author objects (normalized).
 * @returns A string like "Doe J, Smith A, Brown B, et al." or empty if no authors.
 */
export function formatESummaryAuthors(authors?: XmlESummaryAuthor[]): string {
  if (!authors || authors.length === 0) return "";

  const listed = authors
    .slice(0, MAX_LISTED_AUTHORS)
    .map((author) => author.name) // Assumes author.name is the string representation
    .join(", ");

  return authors.length > MAX_LISTED_AUTHORS ? `${listed}, et al.` : listed;
}

/**
 * Standardizes date strings from ESummary to "YYYY-MM-DD" format.
 * Uses the dateParser utility.
 * @param dateStr - Date string from ESummary (e.g., "2023/01/15", "2023 Jan 15", "2023").
 * @param parentContext - Optional parent request context for logging.
 * @returns A promise resolving to a standardized date string ("YYYY-MM-DD") or
 *   undefined if the input is missing, blank, or cannot be parsed. This function
 *   never rejects: parser errors are logged and reported as undefined.
 */
export async function standardizeESummaryDate(
  dateStr?: string,
  parentContext?: RequestContext,
): Promise<string | undefined> {
  // Check for null as well: values come from parsed XML and may not honor the type.
  if (dateStr === undefined || dateStr === null) return undefined;

  const dateInputString = String(dateStr); // Ensure it's a string
  // A blank value carries no date information; skip the parser round trip.
  if (dateInputString.trim() === "") return undefined;

  const currentContext =
    parentContext ||
    requestContextService.createRequestContext({
      operation: "standardizeESummaryDateInternal",
      inputDate: dateInputString, // Log the stringified version
    });

  try {
    // Pass the stringified version to the date parser
    const parsedDate = await dateParser.parseDate(
      dateInputString,
      currentContext,
    );
    if (parsedDate) {
      return parsedDate.toISOString().split("T")[0]; // Format as YYYY-MM-DD
    }
    logger.debug(
      `standardizeESummaryDate: dateParser could not parse "${dateInputString}", returning undefined.`,
      currentContext,
    );
  } catch (e) {
    logger.warning(
      `standardizeESummaryDate: Error during dateParser.parseDate for "${dateInputString}", returning undefined.`,
      {
        ...currentContext,
        error: e instanceof Error ? e.message : String(e),
      },
    );
  }
  return undefined; // Return undefined if parsing fails
}

/**
 * Parses authors from an ESummary DocumentSummary structure.
 * Handles the representations seen for `Authors`: an array of strings or raw
 * author objects, an `{ Author: ... }` wrapper, a JSON-array string, or a
 * delimiter-separated string.
 * Returns an array of normalized XmlESummaryAuthor objects with non-empty names.
 * Internal helper function.
 */
function parseESummaryAuthorsFromDocumentSummary(
  docSummary: ESummaryDocumentSummary,
): XmlESummaryAuthor[] {
  const authorsProp = docSummary.Authors;
  if (!authorsProp) return [];

  const parsedAuthors: XmlESummaryAuthor[] = [];

  // Normalizes one raw author entry and appends it when a non-blank name is found.
  const processRawAuthor = (rawAuthInput: XmlESummaryAuthorRaw | string) => {
    let name = "";
    let authtype: string | undefined;
    let clusterid: string | undefined;

    if (typeof rawAuthInput === "string") {
      name = rawAuthInput;
    } else if (rawAuthInput && typeof rawAuthInput === "object") {
      const authorObj = rawAuthInput as XmlESummaryAuthorRaw; // Now typed

      // Try extracting text from the object itself (e.g., if it's { '#text': 'Author Name' })
      name = getText(authorObj, "");

      // If name is still empty, try common property names for author names
      if (!name) {
        name = getText(authorObj.Name || authorObj.name, "");
      }

      authtype = getText(authorObj.AuthType || authorObj.authtype, undefined);
      clusterid = getText(
        authorObj.ClusterId || authorObj.clusterid,
        undefined,
      );

      // Fallback for unhandled structures: log and try to stringify
      if (!name) {
        const authInputString = JSON.stringify(authorObj);
        logger.warning(
          `Unhandled author structure in parseESummaryAuthorsFromDocumentSummary. authInput: ${authInputString.substring(0, 100)}`,
          requestContextService.createRequestContext({
            operation: "parseESummaryAuthorsFromDocumentSummary",
            detail: "Unhandled author structure",
          }),
        );

        // As a last resort, if it's a simple object with a single value, that might be the name
        const keys = Object.keys(authorObj);
        if (
          keys.length === 1 &&
          keys[0] &&
          typeof (authorObj as Record<string, unknown>)[keys[0]] === "string"
        ) {
          name = (authorObj as Record<string, unknown>)[keys[0]] as string;
        } else if (authInputString.length < 100) {
          // Avoid overly long stringified objects
          name = authInputString; // Not ideal, but better than empty for debugging
        }
      }
    }

    const trimmedName = name.trim();
    if (trimmedName) {
      parsedAuthors.push({
        name: trimmedName,
        authtype,
        clusterid,
      });
    }
  };

  if (Array.isArray(authorsProp)) {
    // authorsProp could be Array<string> or Array<XmlESummaryAuthorRaw>
    (authorsProp as (XmlESummaryAuthorRaw | string)[]).forEach(
      processRawAuthor,
    );
  } else if (
    typeof authorsProp === "object" &&
    "Author" in authorsProp && // authorsProp is { Author: ... }
    authorsProp.Author
  ) {
    const rawAuthors = ensureArray(
      authorsProp.Author as
        | XmlESummaryAuthorRaw
        | XmlESummaryAuthorRaw[]
        | string,
    );
    rawAuthors.forEach(processRawAuthor);
  } else if (typeof authorsProp === "string") {
    try {
      // Attempt to parse if it looks like a JSON array string
      if (authorsProp.startsWith("[") && authorsProp.endsWith("]")) {
        const parsedJsonAuthors = JSON.parse(authorsProp) as unknown[];
        if (Array.isArray(parsedJsonAuthors)) {
          parsedJsonAuthors.forEach((authItem: unknown) => {
            if (typeof authItem === "string") {
              // Routed through processRawAuthor so blank entries are skipped.
              processRawAuthor(authItem);
            } else if (
              typeof authItem === "object" &&
              authItem !== null &&
              ((authItem as XmlESummaryAuthorRaw).name ||
                (authItem as XmlESummaryAuthorRaw).Name)
            ) {
              // If it's an object with a name property, treat as XmlESummaryAuthorRaw
              processRawAuthor(authItem as XmlESummaryAuthorRaw);
            }
          });
          // The input was a well-formed JSON array, so its result is final even
          // when empty. Falling through to the delimiter split would turn the
          // JSON text itself (e.g. "[]") into bogus author names.
          return parsedAuthors;
        }
      }
    } catch (e) {
      logger.debug(
        `Failed to parse Authors string as JSON: ${authorsProp.substring(0, 100)}`,
        requestContextService.createRequestContext({
          operation: "parseESummaryAuthorsFromString",
          input: authorsProp.substring(0, 100),
          error: e instanceof Error ? e.message : String(e),
        }),
      );
    }

    // Fallback: split string by common delimiters
    authorsProp
      .split(/[,;]/)
      .map((namePart: string) => namePart.trim())
      .filter((namePart) => namePart)
      .forEach((namePart) => parsedAuthors.push({ name: namePart }));
  }

  return parsedAuthors.filter((author) => author.name);
}

/**
 * Parses a single ESummary DocumentSummary (newer XML format) into a raw summary object.
 * The DOI is read from the `DOI` field first and, failing that, from the
 * `ArticleIds` entry whose `idtype` is "doi". A record without a `@_uid`
 * attribute yields an empty `pmid` so callers can discard it.
 * Internal helper function.
 */
function parseSingleDocumentSummary(
  docSummary: ESummaryDocumentSummary,
): RawBriefSummary {
  const authorsArray = parseESummaryAuthorsFromDocumentSummary(docSummary);

  let doiValue: string | undefined = getText(docSummary.DOI, undefined);
  if (!doiValue) {
    const articleIdsProp = docSummary.ArticleIds;
    if (articleIdsProp) {
      // ArticleIds is either a bare array or an { ArticleId: ... } wrapper.
      const idsArray = Array.isArray(articleIdsProp)
        ? articleIdsProp
        : ensureArray(
            (
              articleIdsProp as {
                ArticleId: ESummaryArticleId[] | ESummaryArticleId;
              }
            ).ArticleId,
          );

      const doiEntry = idsArray.find(
        (id) => (id as ESummaryArticleId).idtype === "doi",
      );
      if (doiEntry) {
        doiValue = getText((doiEntry as ESummaryArticleId).value, undefined);
      }
    }
  }

  return {
    pmid: normalizePmid(docSummary["@_uid"]),
    title: getText(docSummary.Title, undefined),
    authors: formatESummaryAuthors(authorsArray),
    source:
      getText(docSummary.Source, undefined) ||
      getText(docSummary.FullJournalName, undefined) ||
      getText(docSummary.SO, undefined) ||
      undefined,
    doi: doiValue,
    rawPubDate: getText(docSummary.PubDate, undefined),
    rawEPubDate: getText(docSummary.EPubDate, undefined),
  };
}

/**
 * Parses a single ESummary DocSum (older XML item-based format) into a raw summary object.
 * Values are looked up among the DocSum's `Item` entries by `_Name` (and
 * optionally `_Type`); items typed "ERROR" are never used. A record without an
 * `Id` yields an empty `pmid` so callers can discard it.
 * Internal helper function.
 */
function parseSingleDocSumOldXml(
  docSum: ESummaryDocSumOldXml,
): RawBriefSummary {
  const items = ensureArray(docSum.Item);

  // Returns the text of the first matching item, trying candidate names in order.
  const getItemValue = (
    name: string | string[],
    type?: ESummaryItem["_Type"],
  ): string | undefined => {
    for (const candidateName of ensureArray(name)) {
      const item = items.find(
        (i) =>
          i._Name === candidateName &&
          (!type || i._Type === type) &&
          i._Type !== "ERROR",
      );
      if (item) {
        const textVal = getText(item);
        if (textVal !== undefined) return String(textVal);
      }
    }
    return undefined;
  };

  // Collects author names, dropping blank entries so the formatted author
  // string never contains empty segments.
  const getAuthorList = (): XmlESummaryAuthor[] => {
    const authorListItem = items.find(
      (i) => i._Name === "AuthorList" && i._Type === "List",
    );
    if (authorListItem && authorListItem.Item) {
      return ensureArray(authorListItem.Item)
        .filter((a) => a._Name === "Author" && a._Type === "String")
        .map((a) => ({ name: getText(a, "").trim() }))
        .filter((author) => author.name);
    }
    // Fallback for authors directly under DocSum items
    return items
      .filter((i) => i._Name === "Author" && i._Type === "String")
      .map((a) => ({ name: getText(a, "").trim() }))
      .filter((author) => author.name);
  };

  const authorsArray = getAuthorList();

  let doiFromItems: string | undefined = getItemValue("DOI", "String");
  if (!doiFromItems) {
    const articleIdsItem = items.find(
      (i) => i._Name === "ArticleIds" && i._Type === "List",
    );
    if (articleIdsItem && articleIdsItem.Item) {
      const ids = ensureArray(articleIdsItem.Item);
      const doiIdItem = ids.find(
        (id) =>
          getAttribute(id as ESummaryItem, "idtype") === "doi" ||
          (id as ESummaryItem)._Name === "doi", // Some older formats might use Name="doi"
      );
      if (doiIdItem) {
        doiFromItems = getText(doiIdItem);
      }
    }
  }

  return {
    pmid: normalizePmid(docSum.Id),
    title: getItemValue("Title", "String"),
    authors: formatESummaryAuthors(authorsArray),
    source: getItemValue(["Source", "FullJournalName", "SO"], "String"),
    doi: doiFromItems,
    rawPubDate: getItemValue(["PubDate", "ArticleDate"], "Date"),
    rawEPubDate: getItemValue("EPubDate", "Date"),
  };
}

/**
 * Extracts and formats brief summaries from ESummary XML result.
 * Handles both DocumentSummarySet (newer) and older DocSum structures.
 * Records without an identifier are dropped. Dates are standardized
 * concurrently; the output keeps the order of the input records.
 * @param eSummaryResult - The parsed XML object from ESummary (eSummaryResult part).
 * @param context - Request context for logging and passing to date standardization.
 * @returns A promise resolving to an array of parsed brief summary objects
 *   (empty when the input is missing or reports an ERROR).
 */
export async function extractBriefSummaries(
  eSummaryResult?: ESummaryResult,
  context?: RequestContext,
): Promise<ParsedBriefSummary[]> {
  if (!eSummaryResult) return [];

  const opContext =
    context ||
    requestContextService.createRequestContext({
      operation: "extractBriefSummariesInternal",
    });

  if (eSummaryResult.ERROR) {
    logger.warning("ESummary result contains an error", {
      ...opContext,
      errorDetails: eSummaryResult.ERROR,
    });
    return [];
  }

  let rawSummaries: RawBriefSummary[] = [];

  if (eSummaryResult.DocumentSummarySet?.DocumentSummary) {
    const docSummaries = ensureArray(
      eSummaryResult.DocumentSummarySet.DocumentSummary,
    );
    rawSummaries = docSummaries
      .map(parseSingleDocumentSummary)
      .filter((s) => s.pmid);
  } else if (eSummaryResult.DocSum) {
    const docSums = ensureArray(eSummaryResult.DocSum);
    rawSummaries = docSums.map(parseSingleDocSumOldXml).filter((s) => s.pmid);
  }

  // standardizeESummaryDate handles its own errors and never rejects, so the
  // independent conversions can safely run together; Promise.all keeps order.
  return Promise.all(
    rawSummaries.map(
      async (rawSummary): Promise<ParsedBriefSummary> => {
        const [pubDate, epubDate] = await Promise.all([
          standardizeESummaryDate(rawSummary.rawPubDate, opContext),
          standardizeESummaryDate(rawSummary.rawEPubDate, opContext),
        ]);
        return {
          pmid: rawSummary.pmid,
          title: rawSummary.title,
          authors: rawSummary.authors,
          source: rawSummary.source,
          doi: rawSummary.doi,
          pubDate,
          epubDate,
        };
      },
    ),
  );
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cyanheads/pubmed-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.