"""
PubMed Central Full-Text Access Module.
This module provides functions to:
1. Fetch full-text articles from PubMed Central (PMC)
2. Generate formatted literature reviews from PubMed searches
3. Get URLs for PMC articles and DOIs
Rate Limiting
-------------
PMC access should respect NCBI rate limits (same as Entrez):
- 3 requests/second without API key
- 10 requests/second with API key
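
A minimal throttling sketch (illustrative; ``pmc_ids`` is a placeholder list,
and any shared rate limiter used elsewhere in this package takes precedence):

    import time
    for pmc_id in pmc_ids:
        result = pubmed_fetch(pmc_id)
        time.sleep(0.34)  # stays under ~3 requests/second without an API key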
"""
import re
import time
from datetime import datetime
from typing import Any, TypedDict
import httpx
class ReviewStats(TypedDict):
"""Statistics tracking for pubmed_review."""
pmc_count: int
doi_count: int
years: list[int]
journals: dict[str, int]
def pubmed_fetch(pmc_id: str, format: str = "xml", timeout: int = 30) -> dict[str, Any]:
"""
Fetch full-text article from PubMed Central (PMC).
This function retrieves open access full-text articles from PMC using the
PMC OAI service. Only works for open access articles that have a PMC ID.
Args:
pmc_id: PMC identifier (with or without 'PMC' prefix, e.g., "PMC123456" or "123456")
format: Output format - "xml" for structured XML or "text" for plain text (default: "xml")
timeout: Request timeout in seconds (default: 30)
Returns:
Dictionary containing the full-text article and metadata:
- success (bool): Whether fetch was successful
- pmc_id (str): The PMC identifier
- format (str): Format of returned content
- content (str): Full-text article content
- content_length (int): Length of content in characters
- error (str): Error message if unsuccessful
Examples:
>>> result = pubmed_fetch("PMC3539452")
>>> if result["success"]:
... print(result["content"][:100])
>>> result = pubmed_fetch("3539452", format="text")
>>> print(result["content"])
Note:
- Only works for open access articles
- Articles without PMC IDs cannot be fetched
- Rate limiting applies (use with entrez_rate_limit context manager)
- XML format preserves structure (sections, figures, tables, references)
- Text format provides simplified plain text extraction
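
    A hedged fallback sketch (illustrative): when an article is not open access,
    fall back to linking its PMC page rather than fetching full text.
        >>> result = pubmed_fetch("PMC3539452")
        >>> if not result["success"]:
        ...     print(f"See article page: {get_pmc_url('PMC3539452')}")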
"""
try:
# Normalize PMC ID (ensure it starts with PMC)
if not pmc_id.startswith("PMC"):
pmc_id = f"PMC{pmc_id}"
# PMC OAI service URL
# This service provides full-text XML for open access articles
base_url = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
# Request parameters for OAI GetRecord
params = {
"verb": "GetRecord",
"identifier": f"oai:pubmedcentral.nih.gov:{pmc_id[3:]}", # Remove PMC prefix
"metadataPrefix": "pmc", # PMC XML format
}
# Make HTTP request
with httpx.Client(timeout=timeout) as client:
response = client.get(base_url, params=params)
response.raise_for_status()
content = response.text
# Check for errors in OAI response
if "error" in content.lower() and "idDoesNotExist" in content:
return {
"success": False,
"error": f"PMC ID {pmc_id} not found or not available in open access",
"pmc_id": pmc_id,
}
# For text format, extract plain text from XML
if format == "text":
# Simple text extraction (remove XML tags)
text_content = re.sub(r"<[^>]+>", " ", content)
text_content = re.sub(r"\s+", " ", text_content).strip()
content = text_content
return {
"success": True,
"pmc_id": pmc_id,
"format": format,
"content": content,
"content_length": len(content),
}
except httpx.TimeoutException as e:
return {
"success": False,
"error": f"Request timeout after {timeout} seconds: {str(e)}",
"pmc_id": pmc_id,
}
except httpx.HTTPStatusError as e:
return {
"success": False,
"error": f"HTTP error: {e.response.status_code} - {e.response.reason_phrase}",
"pmc_id": pmc_id,
}
except Exception as e:
return {
"success": False,
"error": f"Failed to fetch PMC article: {str(e)}",
"pmc_id": pmc_id,
}
def get_pmc_url(pmc_id: str) -> str:
"""
Get the URL for a PubMed Central article.
Args:
pmc_id: PMC identifier (with or without 'PMC' prefix)
Returns:
Full URL to PMC article page
Examples:
>>> get_pmc_url("PMC3539452")
'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3539452/'
>>> get_pmc_url("3539452")
'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3539452/'
"""
if not pmc_id.startswith("PMC"):
pmc_id = f"PMC{pmc_id}"
return f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
def get_doi_url(doi: str) -> str:
"""
Get the URL for a DOI.
Args:
doi: Digital Object Identifier
Returns:
Full URL to DOI resolver
Examples:
>>> get_doi_url("10.1371/journal.pone.0012345")
'https://doi.org/10.1371/journal.pone.0012345'
"""
return f"https://doi.org/{doi}"
def pubmed_review(
query: str,
output_path: str,
format: str = "summary",
max_results: int = 25,
sort: str = "pub_date",
) -> dict[str, Any]:
"""
Create a formatted literature review from PubMed search results.
This function searches PubMed, fetches article metadata, formats it as
markdown, and returns the content in JSON for Claude to write to the desired location.
Args:
query: PubMed search query (supports full Entrez syntax including year filters)
Example: "BRCA1 AND breast cancer AND 2020:2024[PDAT]"
output_path: Target filepath (for reference; Claude handles actual writing)
format: Output format - "full", "summary", or "minimal" (default: "summary")
- "summary": Title + key findings + metadata (~60 tokens/article)
- "full": Complete abstracts (~250 tokens/article)
- "minimal": Title + links only (~20 tokens/article)
max_results: Maximum number of articles to include (default: 25, max: 1000)
sort: Sort order - "pub_date", "relevance", etc. (default: "pub_date")
Returns:
Dictionary with review content and metadata:
- status: "success" or "error"
- content: Full markdown content (ready to write)
- filepath: Target path from output_path parameter
- articles_found: Total number of articles found
- articles_written: Number of articles successfully processed
- articles_with_pmc: Count of articles with PMC IDs
- articles_with_doi: Count of articles with DOIs
- query: Original search query
- format: Format used
            - file_size_kb: Size of the generated markdown content in kilobytes
- year_range: {"min": int, "max": int}
- top_journals: List of top 5 journals by article count
- execution_time_seconds: Time taken to generate review
Examples:
>>> # Returns content in JSON for Claude to write
>>> result = pubmed_review(
... query="COL4A3[Gene] AND Alport syndrome",
... output_path="KB/pubmed/alport_review.md"
... )
>>> # Claude automatically writes result["content"] to Obsidian or filesystem
>>> # Full format with more results
>>> result = pubmed_review(
... query="BRCA1 AND breast cancer AND 2020:2024[PDAT]",
... output_path="reviews/brca1_review.md",
... format="full",
... max_results=50
... )
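        >>> # A minimal write-out sketch if the caller saves the file itself
        >>> # (directory creation and path handling are up to the client):
        >>> if result["status"] == "success":
        ...     with open(result["filepath"], "w", encoding="utf-8") as fh:
        ...         _ = fh.write(result["content"])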
Notes:
- Returns content in JSON (MCP best practice: server generates, client writes)
- Uses memory-efficient content generation
        - Fetches article summaries in batches of 20 to keep individual NCBI requests small
- Respects NCBI rate limits (3/sec or 10/sec with API key)
- For very large reviews (>500 articles), consider splitting into multiple calls
- Includes Obsidian-compatible YAML frontmatter
"""
start_time = time.time()
articles_written = 0
try:
# Import here to avoid circular dependency
from biopython_mcp import database
# Validate output path
if not output_path.endswith(".md"):
return {
"status": "error",
"error_type": "validation_error",
"message": "Output path must end with .md",
}
# Validate format
if format not in ["full", "summary", "minimal"]:
return {
"status": "error",
"error_type": "validation_error",
"message": f"Invalid format '{format}'. Must be 'full', 'summary', or 'minimal'",
}
# Search PubMed for PMIDs
search_result = database.entrez_search(
"pubmed", query, max_results=min(max_results, 1000), sort=sort
)
if not search_result["success"]:
return {
"status": "error",
"error_type": "query_error",
"message": search_result.get("error", "Search failed"),
}
pmids = search_result["ids"]
total_found = search_result["total_found"]
if not pmids:
return {
"status": "error",
"error_type": "query_error",
"message": "No articles found for query",
}
# Statistics tracking
stats: ReviewStats = {
"pmc_count": 0,
"doi_count": 0,
"years": [],
"journals": {},
}
# Build content in memory
content_parts = []
# Step 1: Generate frontmatter (Obsidian YAML)
query_truncated = query[:50] + "..." if len(query) > 50 else query
content_parts.append("---")
content_parts.append(f"title: Literature Review - {query_truncated}")
content_parts.append("tags: [literature-review, pubmed, biopython-mcp]")
content_parts.append(f"date: {datetime.now().isoformat()}")
content_parts.append(f'query: "{query}"')
content_parts.append(f"total_articles: {len(pmids)}")
content_parts.append(f"format: {format}")
content_parts.append("status: complete")
content_parts.append("---\n")
# Step 2: Write header
content_parts.append(f"# Literature Review: {query}\n")
content_parts.append("## Query Details\n")
content_parts.append(f"- **Query:** `{query}`")
content_parts.append(f"- **Total Found:** {total_found:,}")
content_parts.append(f"- **Retrieved:** {len(pmids)}")
content_parts.append(f"- **Format:** {format}")
content_parts.append(f"- **Sort:** {sort}")
content_parts.append(f"- **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
content_parts.append("---\n")
# Step 3: Fetch and format articles in batches
batch_size = 20
for i in range(0, len(pmids), batch_size):
batch = pmids[i : i + batch_size]
# Fetch summaries for this batch
summary_result = database.entrez_summary("pubmed", batch)
if not summary_result["success"]:
                # Write an error note for this batch but continue with the rest
                content_parts.append(
                    f"\n**Error fetching batch {i // batch_size + 1}:** "
                    f"{summary_result.get('error', 'Unknown error')}\n"
                )
continue
# Process each article in the batch immediately
for idx, summary in enumerate(summary_result["summaries"]):
try:
article_num = i + idx + 1
# Extract metadata
pmid = summary.get("Id", "")
title = summary.get("Title", "Untitled")
authors = summary.get("AuthorList", [])
journal = summary.get("FullJournalName", summary.get("Source", "Unknown"))
year = summary.get("PubDate", "")[:4] if summary.get("PubDate") else "N/A"
# Extract IDs
article_ids = summary.get("ArticleIds", {})
pmc_id = article_ids.get("pmc", "")
doi = article_ids.get("doi", "")
# Track statistics
if pmc_id:
stats["pmc_count"] += 1
if doi:
stats["doi_count"] += 1
if year.isdigit():
stats["years"].append(int(year))
if journal:
stats["journals"][journal] = stats["journals"].get(journal, 0) + 1
# Format based on requested format type
                    if format == "minimal":
                        # Minimal format: build each article as a single line
                        line = f"[{article_num}] {title} | PMID: {pmid}"
                        if pmc_id:
                            line += f" | PMC: {pmc_id}"
                        if doi:
                            line += f" | [{doi}]({get_doi_url(doi)})"
                        content_parts.append(line + "\n")
                    elif format == "summary":
                        # Summary format: title + key info + first sentence
                        content_parts.append(f"### [{article_num}] {title}\n")
                        line = f"**PMID:** {pmid} | **Year:** {year}"
                        if pmc_id:
                            line += f" | **PMC:** [{pmc_id}]({get_pmc_url(pmc_id)})"
                        else:
                            line += " | **PMC:** null"
                        content_parts.append(line + "\n")
# Get first sentence from abstract if available
fetch_result = database.entrez_fetch(
"pubmed", pmid, rettype="abstract", retmode="text"
)
                        if fetch_result["success"]:
                            abstract = fetch_result["data"].strip()
                            # Extract first sentence (up to the first period + space),
                            # avoiding a doubled period for single-sentence abstracts
                            first_sentence = abstract.split(". ")[0].rstrip(".") + "."
                            content_parts.append(f"**Key:** {first_sentence}\n")
content_parts.append("---\n")
                    elif format == "full":
                        # Full format: complete abstract
                        content_parts.append(f"## [{article_num}] {title}\n")
                        content_parts.append(
                            f"**PMID:** [{pmid}](https://pubmed.ncbi.nlm.nih.gov/{pmid}/)"
                            f" | **Year:** {year} | **Journal:** {journal}\n"
                        )
                        id_parts = []
                        if doi:
                            id_parts.append(f"**DOI:** [{doi}]({get_doi_url(doi)})")
                        if pmc_id:
                            id_parts.append(f"**PMC:** [{pmc_id}]({get_pmc_url(pmc_id)})")
                        if id_parts:
                            content_parts.append(" | ".join(id_parts) + "\n")
                        # Authors (first 10, then et al.)
                        if authors:
                            author_names = [
                                f"{a.get('LastName', '')} {a.get('Initials', '')}".strip()
                                for a in authors[:10]
                            ]
                            author_line = f"**Authors:** {', '.join(author_names)}"
                            if len(authors) > 10:
                                author_line += f", et al. ({len(authors)} total)"
                            content_parts.append(author_line + "\n")
# Fetch full abstract
fetch_result = database.entrez_fetch(
"pubmed", pmid, rettype="abstract", retmode="text"
)
if fetch_result["success"]:
abstract = fetch_result["data"]
content_parts.append("**Full Abstract:**\n")
content_parts.append(f"{abstract}\n")
else:
content_parts.append("**Abstract:** Not available\n")
content_parts.append("---\n")
articles_written += 1
except Exception as e:
# Write error note for this article but continue
content_parts.append(
f"\n**Error processing article {article_num}:** {str(e)}\n"
)
continue
# Step 4: Write summary statistics at end
content_parts.append("\n## Summary Statistics\n")
content_parts.append(f"- **Total Articles:** {articles_written}")
content_parts.append(f"- **With PMC IDs:** {stats['pmc_count']}")
content_parts.append(f"- **With DOIs:** {stats['doi_count']}")
if stats["years"]:
content_parts.append(f"- **Year Range:** {min(stats['years'])} - {max(stats['years'])}")
if stats["journals"]:
top_journals = sorted(stats["journals"].items(), key=lambda x: x[1], reverse=True)[:5]
content_parts.append("\n**Top Journals:**")
for journal, count in top_journals:
content_parts.append(f"- {journal}: {count} articles")
# Step 5: Combine all parts
full_content = "\n".join(content_parts)
file_size_kb = round(len(full_content.encode("utf-8")) / 1024, 2)
# Calculate execution time
execution_time = round(time.time() - start_time, 2)
# Build top journals list
top_journals_list = [
{"name": name, "count": count}
for name, count in sorted(stats["journals"].items(), key=lambda x: x[1], reverse=True)[
:5
]
]
# Step 6: Return content in JSON
return {
"status": "success",
"content": full_content,
"filepath": output_path,
"articles_found": total_found,
"articles_written": articles_written,
"articles_with_pmc": stats["pmc_count"],
"articles_with_doi": stats["doi_count"],
"query": query,
"format": format,
"file_size_kb": file_size_kb,
"year_range": (
{"min": min(stats["years"]), "max": max(stats["years"])}
if stats["years"]
else {"min": 0, "max": 0}
),
"top_journals": top_journals_list,
"execution_time_seconds": execution_time,
}
except Exception as e:
return {
"status": "error",
"error_type": "unknown",
"message": f"Error creating literature review: {str(e)}",
}
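

# Illustrative smoke test (a minimal sketch, not part of the MCP tool surface):
# running this module directly exercises pubmed_review and pubmed_fetch against
# live NCBI services. The query and output path below are placeholders.
if __name__ == "__main__":
    demo = pubmed_review(
        query="COL4A3[Gene] AND Alport syndrome",
        output_path="alport_review.md",
        max_results=5,
    )
    print(demo["status"], demo.get("articles_written", 0), "articles")
    fetched = pubmed_fetch("PMC3539452", format="text")
    print("fetch success:", fetched["success"], "-", fetched.get("content_length", 0), "chars")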