"""Database access tools for NCBI, UniProt, and other biological databases.
NCBI Entrez System
------------------
The Entrez system provides programmatic access to NCBI databases including:
- **pubmed**: PubMed citations and abstracts
- **nucleotide**: GenBank nucleotide sequences
- **protein**: Protein sequences
- **gene**: Gene records with genomic information
- **clinvar**: ClinVar variant-phenotype relationships
- **snp**: dbSNP genetic variations
- **structure**: 3D molecular structures (PDB)
- **taxonomy**: Taxonomic information
Query Syntax Examples
---------------------
Basic searches:
"BRCA1[Gene]" - Search by gene symbol
"breast cancer" - Free text search
"2024/01/01:2024/12/31[PDAT]" - Publication date range
Boolean operators:
    "BRCA1 AND breast cancer" - Both terms required
    "diabetes OR obesity" - Either term matches
    "cancer NOT lung" - Cancer records, excluding any that mention lung
Field searches:
"Smith J[Author]" - Search by author
"Nature[Journal]" - Search by journal name
"review[Publication Type]" - Filter by publication type
Rate Limiting
-------------
NCBI enforces rate limits on API requests:
- **Default**: 3 requests/second (no API key)
- **With API key**: 10 requests/second
- Set `NCBI_API_KEY` environment variable for higher limits
- API keys available at: https://www.ncbi.nlm.nih.gov/account/settings/
Environment Variables
---------------------
- **NCBI_EMAIL**: Your email address (required by NCBI)
- **NCBI_API_KEY**: Your API key (optional, for higher rate limits)
Example:
export NCBI_EMAIL="your.email@example.com"
export NCBI_API_KEY="your_api_key_here"
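Or equivalently from Python (illustrative):
    import os
    os.environ["NCBI_EMAIL"] = "your.email@example.com"
    os.environ["NCBI_API_KEY"] = "your_api_key_here"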
"""
from typing import Any
from Bio import Entrez, SeqIO
from biopython_mcp.utils import entrez_rate_limit, format_entrez_error, parse_ids
def fetch_genbank(
accession: str, email: str = "user@example.com", rettype: str = "gb"
) -> dict[str, Any]:
"""
Fetch a sequence from GenBank by accession number.
Args:
accession: GenBank accession number
email: Email address for Entrez (required by NCBI)
rettype: Return type - 'gb' for GenBank, 'fasta' for FASTA (default: 'gb')
Returns:
Dictionary containing the sequence record and metadata
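    Examples:
        >>> fetch_genbank("NM_000207")  # illustrative accession; requires network access
        >>> fetch_genbank("NM_000207", rettype="fasta")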
"""
try:
Entrez.email = email # type: ignore[assignment]
handle = Entrez.efetch(db="nucleotide", id=accession, rettype=rettype, retmode="text")
record_text = handle.read()
handle.close()
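        # NCBI can return HTTP 200 with a plain-text error body (e.g. for an
        # unrecognized accession), so inspect the payload for error markers.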
normalized = "".join(record_text.split()).lower()
if normalized.startswith("error:") or "failedtounderstandid" in normalized:
return {
"success": False,
"error": record_text.strip() or "NCBI returned an error response",
"accession": accession,
"format": rettype,
}
return {
"success": True,
"accession": accession,
"format": rettype,
"data": record_text,
"length": len(record_text),
}
except Exception as e:
return {"success": False, "error": str(e), "accession": accession}
def fetch_uniprot(uniprot_id: str, format: str = "fasta") -> dict[str, Any]:
"""
Fetch a protein sequence from UniProt.
Args:
uniprot_id: UniProt accession or ID
format: Output format - 'fasta', 'txt', 'xml' (default: 'fasta')
Returns:
Dictionary containing the UniProt record
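    Examples:
        >>> fetch_uniprot("P01308")  # illustrative accession (human insulin); requires network access
        >>> fetch_uniprot("P01308", format="txt")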
"""
try:
import httpx
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.{format}"
response = httpx.get(url, timeout=30.0)
response.raise_for_status()
return {
"success": True,
"uniprot_id": uniprot_id,
"format": format,
"data": response.text,
"length": len(response.text),
}
except Exception as e:
return {"success": False, "error": str(e), "uniprot_id": uniprot_id}
def search_pubmed(
query: str, max_results: int = 10, email: str = "user@example.com"
) -> dict[str, Any]:
"""
Search PubMed for scientific articles.
Args:
query: Search query string
max_results: Maximum number of results to return (default: 10)
email: Email address for Entrez (required by NCBI)
Returns:
Dictionary containing search results with PMIDs and article information
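    Examples:
        >>> search_pubmed("BRCA1 AND breast cancer", max_results=5)  # requires network access
        >>> search_pubmed("Smith J[Author]", max_results=3)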
"""
try:
Entrez.email = email # type: ignore[assignment]
search_handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
search_results = Entrez.read(search_handle)
search_handle.close()
pmids = search_results["IdList"]
if not pmids:
return {
"success": True,
"query": query,
"count": 0,
"results": [],
}
fetch_handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract", retmode="xml")
articles = Entrez.read(fetch_handle)
fetch_handle.close()
results = []
for article in articles["PubmedArticle"]:
medline = article["MedlineCitation"]
pmid = str(medline["PMID"])
article_data = medline["Article"]
title = article_data.get("ArticleTitle", "No title")
            abstract_list = article_data.get("Abstract", {}).get("AbstractText", [])
            abstract = abstract_list[0] if abstract_list else "No abstract"
results.append({"pmid": pmid, "title": str(title), "abstract": str(abstract)[:500]})
return {
"success": True,
"query": query,
"count": len(results),
"total_found": int(search_results["Count"]),
"results": results,
}
except Exception as e:
return {"success": False, "error": str(e), "query": query}
def fetch_sequence_by_id(db: str, seq_id: str, email: str = "user@example.com") -> dict[str, Any]:
"""
Fetch a sequence from NCBI database by ID.
Args:
db: Database name ('nucleotide', 'protein', etc.)
seq_id: Sequence identifier
email: Email address for Entrez (required by NCBI)
Returns:
Dictionary containing sequence information
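    Examples:
        >>> fetch_sequence_by_id("protein", "NP_000198.1")  # illustrative ID; requires network access
        >>> fetch_sequence_by_id("nucleotide", "NM_000207")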
"""
try:
Entrez.email = email # type: ignore[assignment]
handle = Entrez.efetch(db=db, id=seq_id, rettype="fasta", retmode="text")
record = SeqIO.read(handle, "fasta")
handle.close()
return {
"success": True,
"database": db,
"id": seq_id,
"description": record.description,
"sequence": str(record.seq),
"length": len(record.seq),
}
except Exception as e:
return {"success": False, "error": str(e), "database": db, "id": seq_id}
# Core Entrez tools
def entrez_info(database: str = "") -> dict[str, Any]:
"""
Get information about NCBI Entrez databases.
Args:
database: Specific database name (empty string for list of all databases)
Returns:
Dictionary containing database information:
- If database="": List of all available databases with count
- If database specified: Detailed info including description, record count,
searchable fields, and available links
Examples:
>>> entrez_info() # List all databases
>>> entrez_info("pubmed") # Get PubMed database details
>>> entrez_info("gene") # Get Gene database details
"""
try:
with entrez_rate_limit():
handle = Entrez.einfo(db=database if database else None)
result = Entrez.read(handle)
handle.close()
if not database:
# Return list of all databases
return {
"success": True,
"databases": result["DbList"],
"count": len(result["DbList"]),
}
else:
# Return detailed info about specific database
db_info = result["DbInfo"]
return {
"success": True,
"database": database,
"description": db_info.get("Description", ""),
"record_count": int(db_info.get("Count", 0)),
"last_update": db_info.get("LastUpdate", ""),
"fields": [f["Name"] for f in db_info.get("FieldList", [])],
"links": [link["Name"] for link in db_info.get("LinkList", [])],
}
except Exception as e:
return format_entrez_error(e, {"database": database})
def entrez_search(
database: str,
query: str,
max_results: int = 20,
sort: str = "relevance",
use_cache: bool = True,
) -> dict[str, Any]:
"""
Search any NCBI Entrez database using query syntax.
Args:
database: Database to search (e.g., 'pubmed', 'nucleotide', 'gene', 'clinvar')
query: Search query using Entrez syntax (see module docstring for examples)
max_results: Maximum number of results to return (default: 20, max: 10000)
sort: Sort order - 'relevance', 'pub_date', 'Author', etc. (default: 'relevance')
use_cache: Whether to use cached results (default: True, TTL: 1 hour)
Returns:
Dictionary containing:
- ids: List of matching record IDs
- count: Number of IDs returned
- total_found: Total number of matches in database
- query: Original query string
- database: Database searched
- cached: Whether result was from cache (if use_cache=True)
Examples:
>>> entrez_search("pubmed", "BRCA1 AND breast cancer", max_results=10)
>>> entrez_search("gene", "BRCA1[Gene Name] AND Homo sapiens[Organism]")
>>> entrez_search("nucleotide", "Homo sapiens[Organism]", max_results=5)
>>> entrez_search("clinvar", "BRCA1[Gene] AND Pathogenic[Clinical Significance]")
Notes:
- Uses NCBI Entrez query syntax with field tags and Boolean operators
- Rate limited to 3 req/sec (or 10 req/sec with API key)
- See module docstring for comprehensive query syntax examples
- Cached results have 1 hour TTL to balance freshness and API usage
"""
try:
from biopython_mcp.utils import get_cached_result, set_cached_result
# Check cache if enabled
cache_params = {
"query": query,
"max_results": max_results,
"sort": sort,
}
if use_cache:
cached = get_cached_result(database, "search", cache_params, ttl=3600)
if cached:
cached["cached"] = True
return cached
# Perform search
with entrez_rate_limit():
handle = Entrez.esearch(
db=database, term=query, retmax=min(max_results, 10000), sort=sort
)
result = Entrez.read(handle)
handle.close()
response = {
"success": True,
"database": database,
"query": query,
"ids": result["IdList"],
"count": len(result["IdList"]),
"total_found": int(result["Count"]),
"sort": sort,
"cached": False,
}
# Cache successful result
if use_cache:
set_cached_result(database, "search", cache_params, response)
return response
except Exception as e:
return format_entrez_error(e, {"database": database, "query": query})
def entrez_fetch(
database: str,
ids: str | list[str],
rettype: str = "xml",
retmode: str = "xml",
use_cache: bool = True,
) -> dict[str, Any]:
"""
Fetch full records from NCBI Entrez by UID.
Args:
database: Database name (e.g., 'pubmed', 'nucleotide', 'gene', 'protein')
ids: Single ID, comma-separated string, or list of IDs
rettype: Return type - 'xml', 'gb', 'fasta', 'abstract', etc. (default: 'xml')
retmode: Return mode - 'xml', 'text', 'json' (default: 'xml')
use_cache: Whether to use cached results (default: True, TTL: 7 days)
Returns:
Dictionary containing:
- data: Raw data in requested format (parsed if XML, raw text otherwise)
- ids: List of IDs fetched
- count: Number of records retrieved
- format: Return type/mode used
- database: Database queried
- cached: Whether result was from cache (if use_cache=True)
Examples:
>>> entrez_fetch("pubmed", "12345678", rettype="abstract", retmode="xml")
>>> entrez_fetch("nucleotide", ["NM_000207", "NM_001127"], rettype="fasta", retmode="text")
>>> entrez_fetch("gene", "672", rettype="xml")
>>> entrez_fetch("protein", "NP_000198.1", rettype="fasta", retmode="text")
Notes:
        - For >100 IDs, consider batching to avoid timeouts (see the sketch after this function)
- Valid rettype/retmode combinations depend on database
- XML mode returns parsed Python dict/list structure
- Text mode returns raw string data
- Rate limited to 3 req/sec (or 10 req/sec with API key)
- Cached results have 7 day TTL since record data is relatively static
"""
try:
from biopython_mcp.utils import get_cached_result, set_cached_result
id_list = parse_ids(ids)
if not id_list:
return format_entrez_error(
ValueError("No valid IDs provided"), {"database": database, "ids": ids}
)
# Check cache if enabled
cache_params = {
"ids": sorted(id_list), # Sort for consistent cache keys
"rettype": rettype,
"retmode": retmode,
}
if use_cache:
cached = get_cached_result(database, "fetch", cache_params, ttl=604800) # 7 days
if cached:
cached["cached"] = True
return cached
# Perform fetch
with entrez_rate_limit():
handle = Entrez.efetch(db=database, id=id_list, rettype=rettype, retmode=retmode)
# Read based on mode
data = Entrez.read(handle) if retmode == "xml" else handle.read()
handle.close()
response = {
"success": True,
"database": database,
"ids": id_list,
"count": len(id_list),
"format": f"{rettype}/{retmode}",
"data": data,
"cached": False,
}
# Cache successful result
if use_cache:
set_cached_result(database, "fetch", cache_params, response)
return response
except Exception as e:
return format_entrez_error(e, {"database": database, "ids": str(ids)[:100]})
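# A minimal batching sketch for the ">100 IDs" note above: chunk the ID list and
# call entrez_fetch once per chunk. The helper name, default chunk size, and the
# fasta/text format are illustrative choices, not part of the documented tool surface.
def _fetch_in_batches(
    database: str,
    ids: list[str],
    batch_size: int = 100,
    rettype: str = "fasta",
    retmode: str = "text",
) -> list[dict[str, Any]]:
    """Fetch records in fixed-size chunks to keep individual efetch calls small."""
    results: list[dict[str, Any]] = []
    for start in range(0, len(ids), batch_size):
        chunk = ids[start : start + batch_size]
        results.append(entrez_fetch(database, chunk, rettype=rettype, retmode=retmode))
    return results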
def entrez_summary(database: str, ids: str | list[str], use_cache: bool = True) -> dict[str, Any]:
"""
Get document summaries (DocSums) from NCBI Entrez.
Document summaries are lightweight alternatives to full records, containing
key metadata without the full content. Much faster for metadata-only queries.
Args:
database: Database name (e.g., 'pubmed', 'gene', 'clinvar', 'nucleotide')
ids: Single ID, comma-separated string, or list of IDs
use_cache: Whether to use cached results (default: True, TTL: 7 days)
Returns:
Dictionary containing:
- summaries: List of document summary dictionaries
- ids: List of IDs requested
- count: Number of summaries returned
- database: Database queried
- cached: Whether result was from cache (if use_cache=True)
Examples:
>>> entrez_summary("pubmed", "12345678")
>>> entrez_summary("gene", ["672", "7157"]) # BRCA1, TP53
>>> entrez_summary("clinvar", "12345")
>>> entrez_summary("nucleotide", "NM_000207,NM_001127")
Notes:
- Much faster than entrez_fetch for metadata-only queries
- Fields returned vary by database type
- Rate limited to 3 req/sec (or 10 req/sec with API key)
- Use this instead of fetch when you don't need full sequence/text
- Cached results have 7 day TTL since summary data is relatively static
"""
try:
from biopython_mcp.utils import get_cached_result, set_cached_result
id_list = parse_ids(ids)
if not id_list:
return format_entrez_error(
ValueError("No valid IDs provided"), {"database": database, "ids": ids}
)
# Check cache if enabled
cache_params = {
"ids": sorted(id_list), # Sort for consistent cache keys
}
if use_cache:
cached = get_cached_result(database, "summary", cache_params, ttl=604800) # 7 days
if cached:
cached["cached"] = True
return cached
# Perform summary fetch
with entrez_rate_limit():
handle = Entrez.esummary(db=database, id=id_list)
result = Entrez.read(handle)
handle.close()
# esummary returns different formats depending on single vs multiple IDs
# Normalize to always return a list
summaries = result if isinstance(result, list) else [result]
response = {
"success": True,
"database": database,
"ids": id_list,
"count": len(summaries),
"summaries": summaries,
"cached": False,
}
# Cache successful result
if use_cache:
set_cached_result(database, "summary", cache_params, response)
return response
except Exception as e:
return format_entrez_error(e, {"database": database, "ids": str(ids)[:100]})
# Clinical Genomics Specialized Tools
def clinvar_variant_lookup(
variant: str = "",
gene: str = "",
condition: str = "",
significance: str = "",
max_results: int = 20,
use_cache: bool = True,
) -> dict[str, Any]:
"""
Search ClinVar for genetic variants and their clinical interpretations.
This specialized wrapper combines entrez_search and entrez_summary for
convenient ClinVar queries.
Args:
variant: Variant notation (e.g., "rs80357906", "NM_000059.3:c.1521_1523del")
gene: Gene symbol (e.g., "BRCA1", "TP53")
condition: Condition/phenotype (e.g., "breast cancer", "Lynch syndrome")
significance: Clinical significance filter:
- "pathogenic"
- "likely_pathogenic"
- "benign"
- "likely_benign"
- "uncertain"
max_results: Maximum results to return (default: 20)
use_cache: Whether to use cached results (default: True)
Returns:
Dictionary containing:
- variants: List of variant dictionaries with clinical information
- count: Number of variants returned
- total_found: Total matches in ClinVar
- query_terms: Dictionary of search terms used
- cached: Whether result was from cache (if use_cache=True)
Examples:
>>> clinvar_variant_lookup(gene="BRCA1", significance="pathogenic", max_results=5)
>>> clinvar_variant_lookup(variant="rs80357906")
>>> clinvar_variant_lookup(gene="TP53", condition="cancer", max_results=10)
Notes:
- At least one search parameter must be provided
- Multiple parameters are combined with AND logic
- Rate limited (3 req/sec or 10 req/sec with API key)
- Cached results inherit TTL from underlying entrez_search and entrez_summary calls
"""
try:
# Build query from parameters
query_parts = []
if variant:
query_parts.append(f'"{variant}"[Variant Name]')
if gene:
query_parts.append(f"{gene}[Gene Name]")
if condition:
query_parts.append(f"{condition}[Disease/Phenotype]")
if significance:
# Map to ClinVar terminology
sig_map = {
"pathogenic": "Pathogenic",
"likely_pathogenic": "Likely pathogenic",
"benign": "Benign",
"likely_benign": "Likely benign",
"uncertain": "Uncertain significance",
}
sig_term = sig_map.get(significance.lower(), significance)
query_parts.append(f'"{sig_term}"[Clinical Significance]')
if not query_parts:
return {
"success": False,
"error": "At least one search parameter required (variant, gene, condition, or significance)",
"query_terms": {},
}
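        # Example composed query: 'BRCA1[Gene Name] AND "Pathogenic"[Clinical Significance]'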
query = " AND ".join(query_parts)
# Search ClinVar using generic tool
search_result = entrez_search(
"clinvar", query, max_results=max_results, use_cache=use_cache
)
if not search_result["success"]:
return search_result
if not search_result["ids"]:
return {
"success": True,
"variants": [],
"count": 0,
"total_found": 0,
"query_terms": {
"variant": variant,
"gene": gene,
"condition": condition,
"significance": significance,
},
"query": query,
"cached": search_result.get("cached", False),
}
# Get summaries using generic tool
summary_result = entrez_summary("clinvar", search_result["ids"], use_cache=use_cache)
if not summary_result["success"]:
return summary_result
# Format variants for clinical use
variants = []
for summary in summary_result["summaries"]:
# Extract key clinical information
variant_info = {
"clinvar_id": str(summary.get("uid", "")),
"title": str(summary.get("title", "")),
"accession": str(summary.get("accession", "")),
"gene_symbol": (
str(summary.get("genes", [{}])[0].get("symbol", ""))
if summary.get("genes")
else ""
),
"variation_type": str(summary.get("obj_type", "")),
"clinical_significance": (
str(summary.get("clinical_significance", {}).get("description", ""))
if isinstance(summary.get("clinical_significance"), dict)
else str(summary.get("clinical_significance", ""))
),
}
variants.append(variant_info)
return {
"success": True,
"variants": variants,
"count": len(variants),
"total_found": search_result["total_found"],
"query_terms": {
"variant": variant,
"gene": gene,
"condition": condition,
"significance": significance,
},
"query": query,
"cached": search_result.get("cached", False) or summary_result.get("cached", False),
}
except Exception as e:
return format_entrez_error(e, {"gene": gene, "variant": variant, "condition": condition})
def gene_info_fetch(
gene_symbol: str = "",
gene_id: str = "",
organism: str = "Homo sapiens",
use_cache: bool = True,
) -> dict[str, Any]:
"""
Fetch comprehensive gene information from NCBI Gene database.
This specialized wrapper provides easy access to gene records with
structured output.
Args:
gene_symbol: Gene symbol (e.g., "BRCA1", "TP53")
gene_id: NCBI Gene ID (e.g., "672" for BRCA1)
organism: Organism name (default: "Homo sapiens")
use_cache: Whether to use cached results (default: True)
Returns:
Dictionary containing:
- gene_id: NCBI Gene ID
- symbol: Official gene symbol
- name: Full gene name
- summary: Gene summary/description
- organism: Organism name
- chromosome: Chromosomal location
- aliases: List of gene aliases
- type: Gene type (protein-coding, ncRNA, etc.)
- cached: Whether result was from cache (if use_cache=True)
Examples:
>>> gene_info_fetch(gene_symbol="BRCA1")
>>> gene_info_fetch(gene_id="672")
>>> gene_info_fetch(gene_symbol="Brca1", organism="Mus musculus")
Notes:
- Provide either gene_symbol or gene_id (gene_id takes precedence)
- Organism filter helps disambiguate gene symbols
- Rate limited (3 req/sec or 10 req/sec with API key)
- Cached results inherit TTL from underlying entrez_search and entrez_summary calls
"""
try:
if not gene_symbol and not gene_id:
return {
"success": False,
"error": "Either gene_symbol or gene_id required",
}
# If gene_id provided, fetch directly
if gene_id:
summary_result = entrez_summary("gene", gene_id, use_cache=use_cache)
if not summary_result["success"]:
return summary_result
if not summary_result["summaries"]:
return {
"success": False,
"error": f"Gene ID '{gene_id}' not found",
"gene_id": gene_id,
}
gene_summary = summary_result["summaries"][0]
cached = summary_result.get("cached", False)
else:
# Search by symbol + organism
query = f"{gene_symbol}[Gene Name] AND {organism}[Organism]"
search_result = entrez_search("gene", query, max_results=1, use_cache=use_cache)
if not search_result["success"] or not search_result["ids"]:
return {
"success": False,
"error": f"Gene '{gene_symbol}' not found for {organism}",
"gene_symbol": gene_symbol,
"organism": organism,
}
# Get summary of first result
summary_result = entrez_summary("gene", search_result["ids"][0], use_cache=use_cache)
if not summary_result["success"]:
return summary_result
gene_summary = summary_result["summaries"][0]
cached = search_result.get("cached", False) or summary_result.get("cached", False)
# Extract and structure gene information
gene_info = {
"success": True,
"gene_id": str(gene_summary.get("uid", "")),
"symbol": str(gene_summary.get("name", "")),
"name": str(gene_summary.get("description", "")),
"summary": str(gene_summary.get("summary", "")),
"organism": (
str(gene_summary.get("organism", {}).get("scientificname", ""))
if isinstance(gene_summary.get("organism"), dict)
else str(gene_summary.get("organism", ""))
),
"chromosome": str(gene_summary.get("chromosome", "")),
"map_location": str(gene_summary.get("maplocation", "")),
"gene_type": str(gene_summary.get("genetype", "")),
"aliases": (
gene_summary.get("otheraliases", "").split(", ")
if gene_summary.get("otheraliases")
else []
),
"cached": cached,
}
return gene_info
except Exception as e:
return format_entrez_error(
e, {"gene_symbol": gene_symbol, "gene_id": gene_id, "organism": organism}
)
def pubmed_search(
query: str,
max_results: int = 10,
sort: str = "relevance",
year_start: int = 0,
year_end: int = 0,
use_cache: bool = True,
) -> dict[str, Any]:
"""
Search PubMed with enhanced metadata extraction.
This specialized wrapper provides enriched PubMed search results with
structured article metadata.
Args:
query: PubMed search query (supports all Entrez query syntax)
max_results: Maximum results to return (default: 10)
sort: Sort order - "relevance", "pub_date", "first_author" (default: "relevance")
year_start: Filter by publication year start (e.g., 2020)
year_end: Filter by publication year end (e.g., 2024)
use_cache: Whether to use cached results (default: True, TTL: 1 hour)
Returns:
Dictionary containing:
- articles: List of article dictionaries with:
- pmid: PubMed ID
- title: Article title
- abstract: Full abstract text
- authors: List of author names
- journal: Journal name
- year: Publication year
- date: Publication date
- doi: DOI (if available)
- pmc_id: PMC ID (if available)
- count: Number of articles returned
- total_found: Total matches in PubMed
- cached: Whether result was from cache (if use_cache=True)
Examples:
>>> pubmed_search("BRCA1 AND breast cancer", max_results=5)
>>> pubmed_search("Smith J[Author]", sort="pub_date")
>>> pubmed_search("diabetes", year_start=2020, year_end=2024, max_results=20)
Notes:
- Uses comprehensive Entrez query syntax
- Returns full abstracts when available
- Rate limited (3 req/sec or 10 req/sec with API key)
- Cached results have 1 hour TTL to balance freshness and API usage
"""
try:
# Add year filters to query if provided
if year_start or year_end:
start_year = year_start if year_start else "1900"
end_year = year_end if year_end else "3000"
year_query = f"{start_year}:{end_year}[PDAT]"
query = f"({query}) AND {year_query}"
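            # e.g. query="diabetes", year_start=2020, year_end=2024 yields
            # "(diabetes) AND 2020:2024[PDAT]"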
# Search PubMed using generic tool
search_result = entrez_search(
"pubmed", query, max_results=max_results, sort=sort, use_cache=use_cache
)
if not search_result["success"]:
return search_result
if not search_result["ids"]:
return {
"success": True,
"articles": [],
"count": 0,
"total_found": 0,
"query": query,
"cached": search_result.get("cached", False),
}
# Fetch article details using generic tool
fetch_result = entrez_fetch(
"pubmed", search_result["ids"], rettype="abstract", retmode="xml", use_cache=use_cache
)
if not fetch_result["success"]:
return fetch_result
# Extract rich metadata from articles
articles = []
for article_data in fetch_result["data"]["PubmedArticle"]:
medline = article_data["MedlineCitation"]
article_info = medline["Article"]
# Extract authors
author_list = article_info.get("AuthorList", [])
authors = []
for author in author_list:
if "LastName" in author and "Initials" in author:
authors.append(f"{author['LastName']} {author['Initials']}")
elif "CollectiveName" in author:
authors.append(str(author["CollectiveName"]))
# Extract abstract
abstract_sections = article_info.get("Abstract", {}).get("AbstractText", [])
if abstract_sections:
if isinstance(abstract_sections, list):
abstract = " ".join(str(section) for section in abstract_sections)
else:
abstract = str(abstract_sections)
else:
abstract = "No abstract available"
# Extract publication info
pub_date = article_info.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
year = int(pub_date.get("Year", 0)) if pub_date.get("Year") else 0
# Extract DOI and PMC ID from ArticleIdList
doi = None
pmc_id = None
article_ids = article_data.get("PubmedData", {}).get("ArticleIdList", [])
for article_id in article_ids:
if article_id.attributes.get("IdType") == "doi":
doi = str(article_id)
elif article_id.attributes.get("IdType") == "pmc":
pmc_id = str(article_id)
            article = {
                "pmid": str(medline["PMID"]),
                "title": str(article_info.get("ArticleTitle", "No title")),
                "abstract": abstract,
                "authors": authors,
                "journal": str(article_info.get("Journal", {}).get("Title", "")),
                "year": year,
                "date": "-".join(
                    part
                    for part in (
                        pub_date.get("Year", ""),
                        pub_date.get("Month", ""),
                        pub_date.get("Day", ""),
                    )
                    if part
                ),
                "doi": doi,
                "pmc_id": pmc_id,
            }
articles.append(article)
return {
"success": True,
"articles": articles,
"count": len(articles),
"total_found": search_result["total_found"],
"query": query,
"sort": sort,
"cached": search_result.get("cached", False) or fetch_result.get("cached", False),
}
except Exception as e:
return format_entrez_error(e, {"query": query, "max_results": max_results})
def variant_literature_link(
variant_id: str, source_db: str = "clinvar", max_results: int = 10
) -> dict[str, Any]:
"""
Find literature (PubMed) articles linked to a specific variant.
Uses Entrez ELink to find cross-database relationships between
variant databases and PubMed.
Args:
variant_id: Variant ID (ClinVar ID or dbSNP rs number)
source_db: Source database - "clinvar" or "snp" (default: "clinvar")
max_results: Maximum articles to return (default: 10)
Returns:
Dictionary containing:
- variant_id: Input variant ID
- source_db: Source database used
- linked_pmids: List of linked PubMed IDs
- articles: List of article summaries
- count: Number of articles found
Examples:
>>> variant_literature_link("12345", source_db="clinvar")
>>> variant_literature_link("80357906", source_db="snp", max_results=5)
Notes:
- Not all variants have linked literature
- Uses Entrez ELink for database cross-referencing
- Rate limited (3 req/sec or 10 req/sec with API key)
"""
try:
# Validate source database
if source_db not in ["clinvar", "snp"]:
return {
"success": False,
"error": f"Invalid source_db '{source_db}'. Must be 'clinvar' or 'snp'",
"variant_id": variant_id,
}
# Use Entrez.elink to find linked PubMed IDs
with entrez_rate_limit():
handle = Entrez.elink(dbfrom=source_db, db="pubmed", id=variant_id)
result = Entrez.read(handle)
handle.close()
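        # A successful elink result looks roughly like (illustrative):
        # [{"LinkSetDb": [{"DbTo": "pubmed", "Link": [{"Id": "12345678"}, ...]}], ...}]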
# Extract linked PMIDs
linked_pmids = []
if result and result[0].get("LinkSetDb"):
for link_set in result[0]["LinkSetDb"]:
if link_set.get("Link"):
linked_pmids = [link["Id"] for link in link_set["Link"]]
break
if not linked_pmids:
return {
"success": True,
"variant_id": variant_id,
"source_db": source_db,
"linked_pmids": [],
"articles": [],
"count": 0,
}
# Limit to max_results
linked_pmids = linked_pmids[:max_results]
# Get article summaries using generic tool
summary_result = entrez_summary("pubmed", linked_pmids)
if not summary_result["success"]:
return summary_result
# Format article info
articles = []
for summary in summary_result["summaries"]:
article = {
"pmid": str(summary.get("uid", "")),
"title": str(summary.get("title", "")),
"authors": (
summary.get("authors", [{}])[0].get("name", "")
if summary.get("authors")
else ""
),
"journal": str(summary.get("fulljournalname", "")),
"year": str(summary.get("pubdate", ""))[:4],
}
articles.append(article)
return {
"success": True,
"variant_id": variant_id,
"source_db": source_db,
"linked_pmids": linked_pmids,
"articles": articles,
"count": len(articles),
}
except Exception as e:
return format_entrez_error(e, {"variant_id": variant_id, "source_db": source_db})
# Phase 3: Advanced Tools
def entrez_link(
source_db: str,
target_db: str,
ids: str | list[str],
link_name: str = "",
) -> dict[str, Any]:
"""
Find related records across NCBI databases using ELink.
This tool discovers relationships between records in different databases,
such as finding PubMed articles related to genes, or nucleotide sequences
related to proteins.
Args:
source_db: Source database (e.g., 'gene', 'protein', 'clinvar')
target_db: Target database to link to (e.g., 'pubmed', 'nucleotide')
ids: Single ID, comma-separated string, or list of IDs from source_db
link_name: Specific link type (optional, empty = all available links)
Returns:
Dictionary containing:
- source_db: Source database name
- target_db: Target database name
- source_ids: List of source IDs queried
- linked_ids: Dict mapping source IDs to lists of linked target IDs
- total_links: Total number of links found
- link_name: Link type used (if specified)
Examples:
>>> entrez_link("gene", "pubmed", "672") # BRCA1 gene to PubMed
>>> entrez_link("protein", "nucleotide", ["NP_000198.1", "NP_001121"])
>>> entrez_link("clinvar", "pubmed", "12345", link_name="clinvar_pubmed")
Notes:
- Discovers cross-database relationships automatically
- Use entrez_info() to see available link names for databases
- Rate limited (3 req/sec or 10 req/sec with API key)
- Different databases support different link types
"""
try:
id_list = parse_ids(ids)
if not id_list:
return format_entrez_error(
ValueError("No valid IDs provided"),
{"source_db": source_db, "target_db": target_db},
)
linked_ids: dict[str, list[str]] = {}
total_links = 0
# Link each ID individually for better tracking
for source_id in id_list:
with entrez_rate_limit():
if link_name:
handle = Entrez.elink(
dbfrom=source_db, db=target_db, id=source_id, linkname=link_name
)
else:
handle = Entrez.elink(dbfrom=source_db, db=target_db, id=source_id)
result = Entrez.read(handle)
handle.close()
# Extract linked IDs for this source ID
source_links = []
if result and result[0].get("LinkSetDb"):
for link_set in result[0]["LinkSetDb"]:
if link_set.get("Link"):
source_links.extend([link["Id"] for link in link_set["Link"]])
linked_ids[source_id] = source_links
total_links += len(source_links)
return {
"success": True,
"source_db": source_db,
"target_db": target_db,
"source_ids": id_list,
"linked_ids": linked_ids,
"total_links": total_links,
"link_name": link_name if link_name else "all",
}
except Exception as e:
return format_entrez_error(
e, {"source_db": source_db, "target_db": target_db, "ids": str(ids)[:100]}
)
def clear_entrez_cache(database: str = "") -> dict[str, Any]:
"""
Clear cached Entrez results.
The caching system stores Entrez query results to reduce API calls and
improve response times. Use this tool to clear stale cache data.
Args:
database: Database name to clear (empty string clears all databases)
Returns:
Dictionary containing:
- success: Whether operation succeeded
- cleared: Number of cache files removed
- database: Database cleared (or "all" if empty string)
- cache_location: Path to cache directory
Examples:
>>> clear_entrez_cache() # Clear all caches
>>> clear_entrez_cache("pubmed") # Clear only PubMed cache
>>> clear_entrez_cache("gene") # Clear only Gene cache
Notes:
- Caching is optional and controlled via use_cache parameter
- Default TTL: 1 hour for searches, 7 days for fetches
- Cache stored in ~/.biopython-mcp/cache/
- Cached data includes search results and summaries
"""
try:
from biopython_mcp.utils import _get_cache_dir, clear_cache
cache_dir = _get_cache_dir()
count = clear_cache(database)
return {
"success": True,
"cleared": count,
"database": database if database else "all",
"cache_location": str(cache_dir),
}
except Exception as e:
return {"success": False, "error": str(e), "database": database}
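# Minimal end-to-end usage sketch, assuming network access and NCBI_EMAIL set as
# described in the module docstring; the gene symbol and query are illustrative.
if __name__ == "__main__":
    gene = gene_info_fetch(gene_symbol="BRCA1")
    if gene.get("success"):
        print(f"{gene['symbol']}: {gene['name']} (chr {gene['chromosome']})")
    papers = pubmed_search("BRCA1 AND breast cancer", max_results=3)
    for article in papers.get("articles", []):
        print(article["pmid"], article["title"])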