StatPearls MCP Server

statpearls-parser.ts•14.7 KiB

/**
 * StatPearls HTML Parser
 *
 * This module provides functions for parsing StatPearls HTML content
 * and extracting structured data according to the defined types.
 */

import * as cheerio from 'cheerio';
import { ParsingError } from './error.js';
import type {
  StatPearlsArticle,
  Author,
  PublicationDetails,
  Section,
  Subsection,
  Figure,
  Table,
  Reference,
  Objectives,
  StatPearlsSearchResult
} from '../types/statpearls.js';

/** Base URL used to resolve site-relative links. */
const NCBI_BASE_URL = 'https://www.ncbi.nlm.nih.gov';

/** Matches the `/books/NBK<digits>` path segment of a Bookshelf URL. */
const NBK_URL_PATTERN = /\/books\/NBK(\d+)/;

/** Matches element ids that end in a numbered section suffix such as `.s3`. */
const SECTION_ID_PATTERN = /\.s\d+$/;

/**
 * Extract the article ID (NBK number) from a StatPearls URL or HTML content
 * @param input URL or HTML content
 * @returns The NBK ID or null if not found
 */
export function extractArticleId(input: string): string | null {
  // Try to extract from URL
  const urlMatch = input.match(NBK_URL_PATTERN);
  if (urlMatch) {
    return `NBK${urlMatch[1]}`;
  }

  // Try to extract from HTML meta tags
  const $ = cheerio.load(input);
  const metaContent = $('meta[name="ncbi_acc"]').attr('content');

  // A missing or empty attribute both count as "not found".
  return metaContent || null;
}

/**
 * Extract the title from StatPearls HTML
 * @param $ Cheerio instance
 * @returns The article title, or 'Untitled Article' when no selector yields text
 */
function extractTitle($: cheerio.CheerioAPI): string {
  // Try the specific selector first
  let title = $('h1[id^="_NBK"] span.title').text().trim();

  // Fall back to progressively less specific selectors
  if (!title) {
    title = $('h1 span.title').text().trim();
  }
  if (!title) {
    title = $('h1').first().text().trim();
  }
  if (!title) {
    title = $('title').text().trim().replace(' - StatPearls - NCBI Bookshelf', '');
  }

  return title || 'Untitled Article';
}

/**
 * Extract authors from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Array of Author objects
 */
function extractAuthors($: cheerio.CheerioAPI): Author[] {
  const authors: Author[] = [];

  // Extract author names
  $('p.contrib-group span[itemprop="author"]').each((_i, el) => {
    authors.push({ name: $(el).text().trim() });
  });

  // Try to extract affiliations if available. Affiliations are paired with
  // authors purely by position; extra affiliations are ignored.
  const affiliationsDiv = $('div[id$="_ai__"]');
  if (affiliationsDiv.length > 0) {
    affiliationsDiv.find('div.affiliation').each((i, el) => {
      const author = authors[i];
      if (author) {
        // Strip the leading footnote number before the affiliation text.
        author.affiliation = $(el).text().replace(/^\d+\s+/, '').trim();
      }
    });
  }

  return authors;
}

/**
 * Extract publication details from StatPearls HTML
 * @param $ Cheerio instance
 * @returns PublicationDetails object
 */
function extractPublicationDetails($: cheerio.CheerioAPI): PublicationDetails {
  const details: PublicationDetails = {
    lastUpdate: '',
    publisher: 'StatPearls Publishing',
    // Defaults to the current year until a year is found in the page header.
    publicationYear: new Date().getFullYear().toString()
  };

  // Extract last update date
  const lastUpdate = $('p.small span[itemprop="dateModified"]').text().trim();
  if (lastUpdate) {
    details.lastUpdate = lastUpdate;
  }

  // Extract publication info from the header
  $('div.bk_prnt p').each((_i, el) => {
    const text = $(el).text().trim();

    // Look for publisher and publication year. The parentheses around "FL"
    // are escaped so they match literally instead of acting as regex syntax.
    const publisherMatch = text.match(
      /StatPearls \[Internet\]\. Treasure Island \(FL\): StatPearls Publishing; (\d{4})/
    );
    const year = publisherMatch?.[1];
    if (year) {
      details.publicationYear = year;
      details.publicationLocation = 'Treasure Island (FL)';
    }
  });

  return details;
}

/**
 * Extract objectives from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Objectives object or undefined if not found
 */
function extractObjectives($: cheerio.CheerioAPI): Objectives | undefined {
  // Find the objectives section
  const objectivesHeader = $('p:contains("Objectives:")');
  if (objectivesHeader.length === 0) {
    return undefined;
  }

  // Get the list items following the objectives header
  const objectivesItems: string[] = [];
  objectivesHeader
    .next('ul')
    .find('li')
    .each((_i, el) => {
      objectivesItems.push($(el).text().trim());
    });

  return { items: objectivesItems };
}

/**
 * Join the text of the direct-child paragraphs of the element(s) matched by
 * a selector.
 * @param $ Cheerio instance
 * @param selector Selector of the containing section
 * @returns Paragraph text separated by blank lines, or undefined if the
 *          selector matches nothing
 */
function extractDirectParagraphs($: cheerio.CheerioAPI, selector: string): string | undefined {
  const section = $(selector);
  if (section.length === 0) {
    return undefined;
  }

  // Only direct <p> children are read, so the heading is skipped.
  const paragraphs: string[] = [];
  section.find('> p').each((_i, el) => {
    paragraphs.push($(el).text().trim());
  });

  return paragraphs.join('\n\n');
}

/**
 * Extract abstract from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Abstract text or undefined if not found
 */
function extractAbstract($: cheerio.CheerioAPI): string | undefined {
  // The continuing education activity section serves as the abstract
  return extractDirectParagraphs($, 'div[id$=".s1"]');
}

/**
 * Extract introduction from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Introduction text or undefined if not found
 */
function extractIntroduction($: cheerio.CheerioAPI): string | undefined {
  return extractDirectParagraphs($, 'div[id$=".s2"]');
}

/**
 * Extract figures from a section
 * @param $ Cheerio instance
 * @param sectionEl The section element
 * @returns Array of Figure objects
 */
// `any` is kept because no element type is imported in this module; the value
// is only ever handed straight back to `$()`.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function extractFigures($: cheerio.CheerioAPI, sectionEl: any): Figure[] {
  const figures: Figure[] = [];

  // Find all figure containers within the section
  $(sectionEl)
    .find('div.floats-group div.iconblock.fig')
    .each((_i, el) => {
      const id = $(el).attr('id') || '';
      const imgEl = $(el).find('img.small-thumb');
      const captionEl = $(el).find('p.float-caption');

      // A figure is only reported when both thumbnail and caption exist.
      if (imgEl.length > 0 && captionEl.length > 0) {
        figures.push({
          id,
          caption: captionEl.text().trim(),
          imageUrl: imgEl.attr('src') || '',
          altText: imgEl.attr('alt') || '',
          largeImageUrl: imgEl.attr('src-large') || ''
        });
      }
    });

  return figures;
}

/**
 * Extract tables from a section
 * @param $ Cheerio instance
 * @param sectionEl The section element
 * @returns Array of Table objects
 */
// See extractFigures for why `any` is retained here.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function extractTables($: cheerio.CheerioAPI, sectionEl: any): Table[] {
  const tables: Table[] = [];

  // Find all table containers within the section
  $(sectionEl)
    .find('div.iconblock.table-wrap')
    .each((_i, el) => {
      const id = $(el).attr('id') || '';
      const captionEl = $(el).find('p.float-caption');
      const tableUrl = $(el).find('a.img_link').attr('href') || '';

      if (captionEl.length > 0) {
        tables.push({
          id,
          caption: captionEl.text().trim(),
          tableHtml: '', // Actual table HTML would need to be fetched from tableUrl
          // Site-relative links are made absolute; anything else is kept as-is.
          tableUrl: tableUrl.startsWith('/') ? `${NCBI_BASE_URL}${tableUrl}` : tableUrl
        });
      }
    });

  return tables;
}

/**
 * Extract subsections from a section's content
 * @param $ Cheerio instance (not used by the body; the content string is
 *          loaded into its own document)
 * @param sectionContent The HTML content of the section
 * @returns Array of Subsection objects
 */
function extractSubsections($: cheerio.CheerioAPI, sectionContent: string): Subsection[] {
  const subsections: Subsection[] = [];
  const $content = cheerio.load(sectionContent);

  // Find all bold text that might be subsection headings
  $content('p > b').each((_i, el) => {
    const title = $content(el).text().trim();
    if (!title) {
      return;
    }

    // Start with the heading's own paragraph, minus the heading markup.
    // The heading's serialized markup is removed rather than a string rebuilt
    // from its text, so headings containing entities, nested tags or padding
    // whitespace are stripped too.
    const parentP = $content(el).parent();
    const headingHtml = $content.html(el);
    let content = parentP.html() || '';
    if (headingHtml) {
      content = content.replace(headingHtml, '');
    }
    content = content.trim();

    // Append following siblings until one contains bold text, which is
    // treated as the start of the next subsection.
    let nextEl = parentP.next();
    while (nextEl.length > 0 && !nextEl.find('b').length) {
      content += '\n\n' + (nextEl.html() || '');
      nextEl = nextEl.next();
    }

    subsections.push({ title, content });
  });

  return subsections;
}

/**
 * Extract all sections from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Array of Section objects, excluding the abstract (.s1),
 *          introduction (.s2) and references sections
 */
function extractSections($: cheerio.CheerioAPI): Section[] {
  const sections: Section[] = [];

  // Find all section divs. An attribute selector cannot express "ends in
  // .s<number>", so candidates are selected by prefix and filtered by id.
  $('div[id^="article-"]').each((_i, el) => {
    const id = $(el).attr('id') || '';
    if (!SECTION_ID_PATTERN.test(id)) {
      return;
    }

    const heading = $(el).find('> h2').first().text().trim();

    // Skip the abstract and introduction sections as they're handled separately
    if (id.endsWith('.s1') || id.endsWith('.s2')) {
      return;
    }

    // Skip the references section as it's handled separately
    if (heading.toLowerCase() === 'references') {
      return;
    }

    // Extract the content HTML: every child except the first, which is
    // taken to be the heading.
    let contentHtml = '';
    $(el)
      .children()
      .each((j, child) => {
        if (j > 0) {
          contentHtml += $.html(child);
        }
      });

    sections.push({
      id,
      title: heading,
      content: contentHtml,
      subsections: extractSubsections($, contentHtml),
      figures: extractFigures($, el),
      tables: extractTables($, el),
      level: 2 // Main sections are h2
    });
  });

  return sections;
}

/**
 * Extract references from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Array of Reference objects
 */
function extractReferences($: cheerio.CheerioAPI): Reference[] {
  const references: Reference[] = [];

  // Locate the references section by its heading, because its numeric
  // suffix depends on how many sections the article has.
  let referencesSection = $('div[id^="article-"]')
    .filter((_i, el) => $(el).find('> h2').first().text().trim().toLowerCase() === 'references')
    .first();

  // Fall back to the previously hard-coded id suffix.
  if (referencesSection.length === 0) {
    referencesSection = $('div[id$=".s26"]');
  }
  if (referencesSection.length === 0) {
    return references;
  }

  // Process each reference in the definition list
  referencesSection.find('dl.temp-labeled-list > dt').each((i, el) => {
    // The <dt> holds the label (e.g. "12."); fall back to list position
    // when it is not numeric so `number` is never NaN.
    const parsedNumber = parseInt($(el).text().trim().replace('.', ''), 10);
    const number = Number.isNaN(parsedNumber) ? i + 1 : parsedNumber;

    const refDiv = $(el).next('dd').find('div.bk_ref');
    const id = refDiv.attr('id') || '';
    const text = refDiv.text().trim();

    // Try to extract PMID and PMCID. The PMID pattern is anchored on the host
    // rather than end-of-string so a trailing slash or query string is fine.
    const pmidHref = refDiv.find('a[href*="pubmed.ncbi.nlm.nih.gov"]').attr('href');
    const pmid = pmidHref?.match(/pubmed\.ncbi\.nlm\.nih\.gov\/(\d+)/)?.[1] ?? '';

    // The PMCID is stored as digits only, without the "PMC" prefix.
    const pmcHref = refDiv.find('a[href*="pmc/articles/PMC"]').attr('href');
    const pmcid = pmcHref?.match(/PMC(\d+)/)?.[1] ?? '';

    references.push({ id, number, text, pmid, pmcid });
  });

  return references;
}

/**
 * Extract copyright information from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Copyright text, or a default notice when the page has none
 */
function extractCopyright($: cheerio.CheerioAPI): string {
  const copyrightDiv = $('div.post-content');
  if (copyrightDiv.length > 0) {
    return copyrightDiv.text().trim();
  }

  return 'Copyright © StatPearls Publishing LLC.';
}

/**
 * Extract disclosure information from StatPearls HTML
 * @param $ Cheerio instance
 * @returns Array of disclosure statements
 */
function extractDisclosures($: cheerio.CheerioAPI): string[] {
  const disclosures: string[] = [];

  $('dl.temp-labeled-list.small p:contains("Disclosure:")').each((_i, el) => {
    disclosures.push($(el).text().trim());
  });

  return disclosures;
}

/**
 * Parse StatPearls HTML content into a structured article
 * @param html The HTML content to parse
 * @param url The URL of the article
 * @returns Structured StatPearlsArticle object
 * @throws ParsingError when any extraction step throws
 */
export function parseStatPearlsArticle(html: string, url: string): StatPearlsArticle {
  try {
    const $ = cheerio.load(html);

    // Extract article ID: prefer the URL, then the page's meta tag
    const articleId = extractArticleId(url) || extractArticleId(html) || '';

    // Extract basic metadata
    const title = extractTitle($);
    const authors = extractAuthors($);
    const publicationDetails = extractPublicationDetails($);

    // Extract content sections
    const abstract = extractAbstract($);
    const objectives = extractObjectives($);
    const introduction = extractIntroduction($);
    const sections = extractSections($);
    const references = extractReferences($);

    // Extract additional information
    const copyright = extractCopyright($);
    const disclosures = extractDisclosures($);

    return {
      title,
      authors,
      publicationDetails,
      abstract,
      objectives,
      introduction,
      sections,
      references,
      copyright,
      disclosures,
      url,
      articleId
    };
  } catch (error) {
    throw new ParsingError(
      `Failed to parse StatPearls article: ${error instanceof Error ? error.message : String(error)}`
    );
  }
}

/**
 * Resolve a possibly relative link against the NCBI site.
 * @param href Link target taken from the page, if any
 * @returns Absolute URL, or an empty string when href is missing or cannot
 *          be parsed as a URL
 */
function resolveNcbiUrl(href: string | undefined): string {
  if (!href) {
    return '';
  }
  try {
    return new URL(href, NCBI_BASE_URL).toString();
  } catch {
    // One malformed link must not abort parsing of every other result.
    return '';
  }
}

/**
 * Parse StatPearls search results HTML
 * @param html The HTML content of the search results page
 * @returns Array of StatPearlsSearchResult objects; entries without both a
 *          title and a resolvable URL are omitted
 * @throws ParsingError when parsing throws
 */
export function parseStatPearlsSearchResults(html: string): StatPearlsSearchResult[] {
  try {
    const $ = cheerio.load(html);
    const results: StatPearlsSearchResult[] = [];

    // NCBI search results are typically in elements with class 'rslt'
    $('.rslt').each((_i, element) => {
      // Extract title and URL from the first link in the result
      const titleElement = $(element).find('a').first();
      const title = titleElement.text().trim();
      const url = resolveNcbiUrl(titleElement.attr('href'));

      // Extract description from the first paragraph of the result
      const description = $(element).find('p').first().text().trim();

      // Try to extract article ID from URL
      const idMatch = url.match(NBK_URL_PATTERN);
      const articleId = idMatch ? `NBK${idMatch[1]}` : '';

      // Try to extract authors if available ("... by A, B, C.")
      const authorList = $(element).find('.supp').text().match(/by (.+?)\.?$/)?.[1];
      const authors = authorList
        ? authorList
            .split(',')
            .map(author => author.trim())
            .filter(author => author.length > 0)
        : [];

      if (title && url) {
        results.push({
          title,
          url,
          description: description || 'No description available',
          authors,
          articleId
        });
      }
    });

    return results;
  } catch (error) {
    throw new ParsingError(
      `Failed to parse StatPearls search results: ${error instanceof Error ? error.message : String(error)}`
    );
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jpoles1/statpearls-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

statpearls-parser.ts•14.7 KiB