# Rijksmuseum MCP Server

# MIT License
# Overview InspectNew Schema Related Servers Reviews Score
# mcp_simple_pubmed
"""
Full text fetching functionality for PubMed articles.

This module focuses solely on retrieving full text content from PMC
using Bio.Entrez.
"""
import logging
from typing import Optional
import xml.etree.ElementTree as ET
from Bio import Entrez, Medline

# Module-wide logger used by everything in this file. Root logging is
# configured at INFO level as an import-time side effect.
logger = logging.getLogger("pubmed-fetch")
logging.basicConfig(level=logging.INFO)

class PubMedFetch:
    """Client for fetching full text from PubMed Central.

    The public entry point is :meth:`get_full_text`; the remaining methods
    are private helpers that turn PMC article XML into plain text.
    """

    def _clean_text(self, text: Optional[str]) -> Optional[str]:
        """Clean text content.

        Args:
            text: Text to clean

        Returns:
            Cleaned text with normalized whitespace, or None if *text* is None
        """
        if text is None:
            return None

        return ' '.join(text.split())

    def _element_text(self, elem: ET.Element) -> str:
        """Return all text inside an element, whitespace-normalized.

        ``elem.text`` alone stops at the first child element, so a paragraph
        such as ``<p>Hello <italic>big</italic> world</p>`` would be cut down
        to ``"Hello"``. ``itertext()`` also yields the text of inline children
        and the tails that follow them.

        Args:
            elem: Element whose text content is wanted

        Returns:
            The concatenated, cleaned text ("" when the element has no text)
        """
        return self._clean_text("".join(elem.itertext())) or ""

    def _collect_body_parts(self, parent: ET.Element, out: list) -> None:
        """Append section headings and paragraphs below *parent* to *out*.

        The tree is walked once in document order, so a paragraph inside a
        nested ``<sec>`` is emitted exactly once, and paragraphs that sit
        directly under ``<body>`` (articles without sections) are included.

        Args:
            parent: Element whose children are walked
            out: List that receives the extracted text fragments
        """
        for child in parent:
            if child.tag == "p":
                # itertext() already covers everything nested in the
                # paragraph, so do not descend into it again.
                paragraph = self._element_text(child)
                if paragraph:
                    out.append(paragraph)
                continue

            if child.tag == "title":
                # Section titles are emitted when their <sec> is entered.
                continue

            if child.tag == "sec":
                heading_elem = child.find("title")
                if heading_elem is not None:
                    heading = self._element_text(heading_elem)
                    if heading:
                        out.append(f"\n\n{heading}\n")

            # Descend into sections and any other container element.
            self._collect_body_parts(child, out)

    def _find_doi(self, record) -> Optional[str]:
        """Look up the DOI of a parsed MEDLINE record.

        MEDLINE records carry the DOI inside the ``AID`` (article identifier)
        and ``LID`` (location identifier) fields, tagged with a trailing
        ``[doi]``. An explicit ``DOI`` key is honoured first for backward
        compatibility with callers that supply one.

        Args:
            record: Mapping of MEDLINE field tags to values

        Returns:
            The DOI string, or None if the record does not contain one
        """
        explicit = record.get('DOI')
        if explicit:
            return explicit

        candidates = []
        for key in ('AID', 'LID'):
            value = record.get(key)
            if isinstance(value, str):
                candidates.append(value)
            elif value:
                candidates.extend(value)

        for entry in candidates:
            tokens = str(entry).split()
            for idx, token in enumerate(tokens):
                # The identifier is the token immediately before its tag.
                if token == '[doi]' and idx > 0:
                    return tokens[idx - 1]
        return None

    def _extract_text_from_pmc_xml(self, xml_content: bytes) -> str:
        """Extract readable text content from PMC XML.

        The result contains, when present, the article title, an
        ``ABSTRACT`` block and a ``MAIN TEXT`` block with section headings.

        Args:
            xml_content: PMC article XML

        Returns:
            Extracted text content

        Raises:
            ValueError: If the XML cannot be parsed, contains no extractable
                text, or any other error occurs while processing it
        """
        try:
            root = ET.fromstring(xml_content)

            text_parts = []

            # Article title (first one in document order)
            title_elem = root.find(".//article-title")
            if title_elem is not None:
                title = self._element_text(title_elem)
                if title:
                    text_parts.append(title)

            # Abstract paragraphs
            abstract_parts = [
                text
                for text in (
                    self._element_text(p)
                    for p in root.findall(".//abstract//p")
                )
                if text
            ]
            if abstract_parts:
                text_parts.append("\nABSTRACT\n" + " ".join(abstract_parts))

            # Main body: headings and paragraphs in document order
            body_parts = []
            for body in root.iter("body"):
                self._collect_body_parts(body, body_parts)
            if body_parts:
                text_parts.append("\nMAIN TEXT\n" + "\n\n".join(body_parts))

            if not text_parts:
                raise ValueError("No text content found in PMC XML")

            return "\n\n".join(text_parts)

        except ET.ParseError as e:
            logger.error("Error parsing PMC XML: %s", e)
            raise ValueError(f"Could not parse PMC XML content: {str(e)}") from e
        except Exception as e:
            # Everything else (including the "no text" case above) is
            # reported to callers as a ValueError.
            logger.error("Error extracting text from PMC XML: %s", e)
            raise ValueError(f"Error processing PMC content: {str(e)}") from e

    async def get_full_text(self, pmid: str) -> str:
        """Get full text of an article if available.

        The article's MEDLINE record is fetched first; if it names a PMC ID,
        the PMC XML is downloaded and converted to plain text.

        NOTE: the Entrez calls below are synchronous, so the event loop is
        blocked while each request is in flight.

        Args:
            pmid: PubMed ID of the article

        Returns:
            Full text content if available, otherwise a message explaining
            why the text is not available. Errors are logged and reported
            as an "Error retrieving full text: ..." message; this method
            does not raise.
        """
        try:
            # First get PMC ID if available
            logger.info("Fetching article %s", pmid)
            handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
            try:
                record = Medline.read(handle)
            finally:
                # Release the connection even when parsing fails.
                handle.close()

            if 'PMC' in record:
                pmc_id = record['PMC']
                logger.info("Found PMC ID %s, fetching full text", pmc_id)

                # Get full text from PMC
                pmc_handle = Entrez.efetch(db='pmc', id=pmc_id, rettype='full', retmode='xml')
                try:
                    xml_content = pmc_handle.read()
                finally:
                    pmc_handle.close()

                # Parse XML and extract text
                return self._extract_text_from_pmc_xml(xml_content)

            doi = self._find_doi(record)
            if doi:
                return f"Full text not available in PMC. Article has DOI {doi} - full text may be available through publisher"
            return "Full text not available - article is not in PMC and has no DOI"

        except Exception as e:
            logger.exception("Error getting full text for article %s", pmid)
            return f"Error retrieving full text: {str(e)}"