#!/usr/bin/env python3
"""gget MCP Server - Bioinformatics query interface using the gget library."""
import os
from enum import Enum
from typing import List, Optional, Union, Dict, Any, Literal
from pathlib import Path
import uuid
import json
import typer
from typing_extensions import Annotated
from fastmcp import FastMCP
from eliot import start_action
import gget
class TransportType(str, Enum):
STDIO = "stdio"
STDIO_LOCAL = "stdio-local"
STREAMABLE_HTTP = "streamable-http"
SSE = "sse"
# Configuration
DEFAULT_HOST = os.getenv("MCP_HOST", "0.0.0.0")
DEFAULT_PORT = int(os.getenv("MCP_PORT", "3002"))
DEFAULT_TRANSPORT = os.getenv("MCP_TRANSPORT", "stdio")  # stdio is the default transport
# Type hints for common return patterns discovered in battle tests
SequenceResult = Union[Dict[str, str], List[str], str]
StructureResult = Union[Dict[str, Any], str]
SearchResult = Dict[str, Any]
LocalFileResult = Dict[Literal["path", "format", "success", "error"], Any]
class GgetMCPExtended(FastMCP):
"""gget MCP Server with bioinformatics tools."""
def __init__(
self,
name: str = "gget MCP Server",
prefix: str = "gget_",
transport_mode: str = "stdio",
output_dir: Optional[str] = None,
**kwargs
):
"""Initialize the gget tools with FastMCP functionality."""
super().__init__(name=name, **kwargs)
self.prefix = prefix
self.transport_mode = transport_mode
self.output_dir = Path(output_dir) if output_dir else Path.cwd() / "gget_output"
# Create output directory if in local mode
if self.transport_mode == "stdio-local":
self.output_dir.mkdir(parents=True, exist_ok=True)
self._register_gget_tools()
def _save_to_local_file(
self,
data: Any,
format_type: str,
output_path: Optional[str] = None,
default_prefix: str = "gget_output"
) -> LocalFileResult:
"""Helper function to save data to local files.
Args:
data: The data to save
format_type: File format ('fasta', 'afa', 'pdb', 'json', etc.)
output_path: Full output path (absolute or relative) or None to auto-generate
default_prefix: Prefix for auto-generated filenames
Returns:
LocalFileResult: Contains path, format, success status, and optional error information
"""
# Map format types to file extensions
format_extensions = {
'fasta': '.fasta',
'afa': '.afa',
'pdb': '.pdb',
'json': '.json',
'txt': '.txt',
'tsv': '.tsv'
}
extension = format_extensions.get(format_type, '.txt')
if output_path is None:
# Generate a unique filename in the default output directory
base_name = f"{default_prefix}_{str(uuid.uuid4())[:8]}"
file_path = self.output_dir / f"{base_name}{extension}"
else:
# Use the provided path
path_obj = Path(output_path)
if path_obj.is_absolute():
# Absolute path - use as is, but ensure it has the right extension
if path_obj.suffix != extension:
file_path = path_obj.with_suffix(extension)
else:
file_path = path_obj
else:
# Relative path - concatenate with output directory
if not str(output_path).endswith(extension):
file_path = self.output_dir / f"{output_path}{extension}"
else:
file_path = self.output_dir / output_path
try:
if format_type in ['fasta', 'afa']:
self._write_fasta_file(data, file_path)
elif format_type == 'pdb':
self._write_pdb_file(data, file_path)
elif format_type == 'json':
with open(file_path, 'w') as f:
json.dump(data, f, indent=2, default=str)
else:
# Default to text format
with open(file_path, 'w') as f:
if isinstance(data, dict):
json.dump(data, f, indent=2, default=str)
else:
f.write(str(data))
return {
"path": str(file_path),
"format": format_type,
"success": True
}
except Exception as e:
return {
"path": None,
"format": format_type,
"success": False,
"error": str(e)
}
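    # A minimal sketch of how the *_local wrappers below use this helper; the literal
    # data and the generated filename suffix are illustrative, not actual output.
    #
    #     saved = self._save_to_local_file(
    #         {"ENSG00000141510": "MEEPQSDPSVEPPLSQ"}, "fasta", default_prefix="sequences_demo"
    #     )
    #     # On success: {"path": ".../gget_output/sequences_demo_1a2b3c4d.fasta",
    #     #              "format": "fasta", "success": True}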
def _write_fasta_file(self, data: Any, file_path: Path) -> None:
"""Write sequence data in FASTA format.
Handles multiple data formats discovered in battle tests:
- Dict[str, str]: sequence_id -> sequence
- List[str]: [header, sequence, header, sequence, ...]
- str: raw data
"""
with open(file_path, 'w') as f:
if isinstance(data, dict):
for seq_id, sequence in data.items():
f.write(f">{seq_id}\n")
# Write sequence with line breaks every 80 characters
for i in range(0, len(sequence), 80):
f.write(f"{sequence[i:i+80]}\n")
elif isinstance(data, list):
# Handle FASTA list format from gget.seq
for i in range(0, len(data), 2):
if i + 1 < len(data):
header = data[i] if data[i].startswith('>') else f">{data[i]}"
sequence = data[i + 1]
f.write(f"{header}\n")
# Write sequence with line breaks every 80 characters
for j in range(0, len(sequence), 80):
f.write(f"{sequence[j:j+80]}\n")
elif data is None:
# For MUSCLE alignments, gget.muscle() returns None but prints to stdout
# We need to capture the stdout or use a different approach
f.write("# MUSCLE alignment completed\n# Output was printed to console\n")
else:
f.write(str(data))
def _write_pdb_file(self, data: Any, file_path: Path) -> None:
"""Write PDB structure data."""
with open(file_path, 'w') as f:
if isinstance(data, str):
f.write(data)
else:
# Convert data to string representation
f.write(str(data))
def _register_gget_tools(self):
"""Register gget-specific tools."""
# Gene information and search tools
self.tool(name=f"{self.prefix}search")(self.search_genes)
self.tool(name=f"{self.prefix}info")(self.get_gene_info)
# Sequence tools - use local wrapper if in local mode
if self.transport_mode == "stdio-local":
self.tool(name=f"{self.prefix}seq")(self.get_sequences_local)
else:
self.tool(name=f"{self.prefix}seq")(self.get_sequences)
# Reference genome tools
self.tool(name=f"{self.prefix}ref")(self.get_reference)
# Sequence analysis tools
self.tool(name=f"{self.prefix}blast")(self.blast_sequence)
self.tool(name=f"{self.prefix}blat")(self.blat_sequence)
# Alignment tools - use local wrappers if in local mode
if self.transport_mode == "stdio-local":
self.tool(name=f"{self.prefix}muscle")(self.muscle_align_local)
self.tool(name=f"{self.prefix}diamond")(self.diamond_align_local)
else:
self.tool(name=f"{self.prefix}muscle")(self.muscle_align)
self.tool(name=f"{self.prefix}diamond")(self.diamond_align)
# Expression and functional analysis
self.tool(name=f"{self.prefix}archs4")(self.archs4_expression)
self.tool(name=f"{self.prefix}enrichr")(self.enrichr_analysis)
self.tool(name=f"{self.prefix}bgee")(self.bgee_orthologs)
# Protein structure and function - use local wrappers if in local mode
if self.transport_mode == "stdio-local":
self.tool(name=f"{self.prefix}pdb")(self.get_pdb_structure_local)
self.tool(name=f"{self.prefix}alphafold")(self.alphafold_predict_local)
else:
self.tool(name=f"{self.prefix}pdb")(self.get_pdb_structure)
self.tool(name=f"{self.prefix}alphafold")(self.alphafold_predict)
self.tool(name=f"{self.prefix}elm")(self.elm_analysis)
# Cancer and mutation analysis
self.tool(name=f"{self.prefix}cosmic")(self.cosmic_search)
self.tool(name=f"{self.prefix}mutate")(self.mutate_sequences)
# Drug and disease analysis
self.tool(name=f"{self.prefix}opentargets")(self.opentargets_analysis)
# Single-cell analysis
self.tool(name=f"{self.prefix}cellxgene")(self.cellxgene_query)
# Setup and utility functions
self.tool(name=f"{self.prefix}setup")(self.setup_databases)
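    # With the default prefix, the registered tool names are "gget_search", "gget_info",
    # "gget_seq", and so on. A hedged construction sketch (the path and variable name are
    # illustrative assumptions):
    #
    #     server = GgetMCPExtended(prefix="gget_", transport_mode="stdio-local",
    #                              output_dir="/tmp/gget_output")
    #     # In "stdio-local" mode the seq/muscle/diamond/pdb/alphafold tools write files
    #     # under output_dir instead of returning large payloads.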
async def search_genes(
self,
search_terms: Union[str, List[str]],
species: str = "homo_sapiens",
release: Optional[int] = None,
id_type: str = "gene",
andor: str = "or",
limit: Optional[int] = None
) -> SearchResult:
"""Search for genes using gene symbols, names, or synonyms.
Use this tool FIRST when you have gene names/symbols and need to find their Ensembl IDs.
Returns Ensembl IDs which are required for get_gene_info and get_sequences tools.
Args:
search_terms: Gene symbols, names, or synonyms as string or list of strings (e.g., 'TP53' or ['TP53', 'BRCA1'])
species: Target species (e.g., 'homo_sapiens', 'mus_musculus') or specific core database name
release: Ensembl release number (e.g., 104). Default: None (latest release)
id_type: "gene" (default) or "transcript" - defines whether genes or transcripts are returned
andor: "or" (default) or "and" - "or" returns genes with ANY searchword, "and" requires ALL searchwords
limit: Maximum number of search results returned. Default: None (no limit)
Returns:
            SearchResult: Search results as a dictionary (DataFrame converted via to_dict) containing Ensembl IDs and descriptions
Example:
Input: search_terms='BRCA1', species='homo_sapiens'
Output: DataFrame with columns like 'ensembl_id', 'gene_name', 'description'
Downstream tools that need the Ensembl IDs from this search:
- get_gene_info: Get detailed gene information
- get_sequences: Get DNA/protein sequences
Note: Only searches in "gene name" and "description" sections of Ensembl database.
"""
with start_action(action_type="gget_search", search_terms=search_terms, species=species):
result = gget.search(
searchwords=search_terms,
species=species,
release=release,
id_type=id_type,
andor=andor,
limit=limit
)
return result.to_dict() if hasattr(result, 'to_dict') else result
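    # Hedged usage sketch (inside an async context; `server` and `hits` are illustrative
    # names): chaining search_genes into get_gene_info as the docstring recommends.
    #
    #     hits = await server.search_genes("TP53", species="homo_sapiens", limit=5)
    #     # `hits` is pandas DataFrame.to_dict() output, i.e. {column: {row_index: value}}
    #     ensembl_ids = list(hits.get("ensembl_id", {}).values())
    #     info = await server.get_gene_info(ensembl_ids)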
async def get_gene_info(
self,
ensembl_ids: Union[str, List[str]],
ncbi: bool = True,
uniprot: bool = True,
pdb: bool = False,
verbose: bool = True
) -> Dict[str, Any]:
"""Get detailed gene and transcript metadata using Ensembl IDs.
PREREQUISITE: Use search_genes first to get Ensembl IDs from gene names/symbols.
Args:
ensembl_ids: One or more Ensembl gene IDs as string or list (e.g., 'ENSG00000141510' or ['ENSG00000141510'])
Also supports WormBase and FlyBase IDs
ncbi: If True, includes data from NCBI. Default: True
uniprot: If True, includes data from UniProt. Default: True
pdb: If True, also returns PDB IDs (might increase runtime). Default: False
verbose: If True, prints progress information. Default: True
Returns:
            Dict[str, Any]: Gene information as a dictionary (DataFrame converted via to_dict) with metadata from multiple databases
Example workflow:
1. search_genes('TP53', 'homo_sapiens') → get Ensembl ID 'ENSG00000141510'
2. get_gene_info('ENSG00000141510')
Example output:
DataFrame with columns like 'ensembl_id', 'symbol', 'biotype', 'chromosome', 'start', 'end',
plus NCBI, UniProt, and optionally PDB information
"""
with start_action(action_type="gget_info", ensembl_ids=ensembl_ids):
result = gget.info(
ens_ids=ensembl_ids,
ncbi=ncbi,
uniprot=uniprot,
pdb=pdb,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def get_sequences(
self,
ensembl_ids: Union[str, List[str]],
translate: bool = False,
isoforms: bool = False,
verbose: bool = True
) -> SequenceResult:
"""Fetch nucleotide or amino acid sequence (FASTA) of genes or transcripts.
PREREQUISITE: Use search_genes first to get Ensembl IDs from gene names/symbols.
Args:
ensembl_ids: One or more Ensembl gene IDs as string or list (e.g., 'ENSG00000141510' or ['ENSG00000141510'])
Also supports WormBase and FlyBase IDs
translate: If True, returns amino acid sequences; if False, returns nucleotide sequences. Default: False
Nucleotide sequences fetched from Ensembl REST API, amino acid from UniProt REST API
isoforms: If True, returns sequences of all known transcripts (only for gene IDs). Default: False
verbose: If True, prints progress information. Default: True
Returns:
SequenceResult: List containing the requested sequences in FASTA format
            Battle testing showed gget.seq returns a flat list of FASTA lines (alternating headers and sequences)
Example workflow for protein sequence:
1. search_genes('TP53', 'homo_sapiens') → 'ENSG00000141510'
2. get_sequences('ENSG00000141510', translate=True)
Example output:
List of sequences in FASTA format: ['>ENSG00000141510', 'MEEPQSDPSVEPPLSQ...']
Downstream tools that use protein sequences:
- alphafold_predict: Predict 3D structure from protein sequence
- blast_sequence: Search for similar sequences
"""
with start_action(action_type="gget_seq", ensembl_ids=ensembl_ids, translate=translate):
result = gget.seq(
ens_ids=ensembl_ids,
translate=translate,
isoforms=isoforms,
verbose=verbose
)
return result
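    # Hedged sketch of the protein-sequence path described above (illustrative names,
    # async context assumed): translate=True yields amino acid FASTA lines that can be
    # fed to alphafold_predict or blast_sequence.
    #
    #     fasta_lines = await server.get_sequences("ENSG00000141510", translate=True)
    #     protein_seq = fasta_lines[1]  # list alternates header, sequence
    #     await server.alphafold_predict(protein_seq)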
async def get_reference(
self,
species: str = "homo_sapiens",
which: Union[str, List[str]] = "all",
release: Optional[int] = None,
ftp: bool = False,
list_species: bool = False,
list_iv_species: bool = False,
verbose: bool = True
) -> Union[Dict[str, Any], List[str]]:
"""Fetch FTPs for reference genomes and annotations by species from Ensembl.
Args:
species: Species in format "genus_species" (e.g., "homo_sapiens").
Shortcuts supported: "human", "mouse", "human_grch37"
which: Which results to return. Default: "all" (all available results)
Options: 'gtf' (annotation), 'cdna' (transcriptome), 'dna' (genome),
'cds' (coding sequences), 'cdrna' (non-coding RNA), 'pep' (protein translations)
Can be single string or list of strings
release: Ensembl release number (e.g., 104). Default: None (latest release)
ftp: If True, returns only requested FTP links as list. Default: False
list_species: If True and species=None, returns list of vertebrate species. Default: False
list_iv_species: If True and species=None, returns list of invertebrate species. Default: False
verbose: If True, prints progress information. Default: True
Returns:
Union[Dict[str, Any], List[str]]: Dictionary with URLs, versions, and metadata
(or list of URLs if ftp=True)
Example:
Input: species="homo_sapiens", which="gtf"
Output: Dictionary containing GTF URLs with Ensembl version and release info
"""
with start_action(action_type="gget_ref", species=species, which=which):
result = gget.ref(
species=species,
which=which,
release=release,
ftp=ftp,
list_species=list_species,
list_iv_species=list_iv_species,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def blast_sequence(
self,
sequence: str,
program: str = "default",
database: str = "default",
limit: int = 50,
expect: float = 10.0,
low_comp_filt: bool = False,
megablast: bool = True,
verbose: bool = True
) -> Dict[str, Any]:
"""BLAST a nucleotide or amino acid sequence against any BLAST database.
Args:
sequence: Nucleotide or amino acid sequence (string) or path to FASTA file
(If FASTA has multiple sequences, only first will be submitted)
program: BLAST program - 'blastn', 'blastp', 'blastx', 'tblastn', or 'tblastx'
Default: "default" (auto-detects: 'blastn' for nucleotide, 'blastp' for amino acid)
database: BLAST database - 'nt', 'nr', 'refseq_rna', 'refseq_protein', 'swissprot', 'pdbaa', 'pdbnt'
Default: "default" (auto-detects: 'nt' for nucleotide, 'nr' for amino acid)
limit: Maximum number of hits to return. Default: 50
expect: Expect value cutoff (float). Default: 10.0
low_comp_filt: Apply low complexity filter. Default: False
megablast: Use MegaBLAST algorithm (blastn only). Default: True
verbose: Print progress information. Default: True
Returns:
Dict[str, Any]: DataFrame with BLAST results including alignment details and scores
Example:
Input: sequence="ATGCGATCGTAGC", program="blastn", database="nt"
Output: DataFrame with BLAST hits, E-values, scores, and alignments
Note:
- NCBI server rule: Run scripts weekends or 9pm-5am ET weekdays for >50 searches
- More info on databases: https://ncbi.github.io/blast-cloud/blastdb/available-blastdbs.html
"""
with start_action(action_type="gget_blast", sequence_length=len(sequence), program=program):
result = gget.blast(
sequence=sequence,
program=program,
database=database,
limit=limit,
expect=expect,
low_comp_filt=low_comp_filt,
megablast=megablast,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
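    # Hedged sketch (illustrative sequence, async context assumed): BLAST auto-detects
    # the program and database when both are left at "default".
    #
    #     blast_hits = await server.blast_sequence("ATGCGATCGTAGCTAGCTAGCATCGATCG")
    #     # Nucleotide input → blastn against 'nt'; result is a dict converted from the DataFrame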
async def blat_sequence(
self,
sequence: str,
seqtype: str = "default",
assembly: str = "human",
verbose: bool = True
) -> Dict[str, Any]:
"""BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly.
Args:
sequence: Nucleotide or amino acid sequence (string) or path to FASTA file containing one sequence
seqtype: Sequence type - 'DNA', 'protein', 'translated%20RNA', or 'translated%20DNA'
Default: "default" (auto-detects: 'DNA' for nucleotide, 'protein' for amino acid)
assembly: Genome assembly - 'human' (hg38), 'mouse' (mm39), 'zebrafinch' (taeGut2),
or any assembly from https://genome.ucsc.edu/cgi-bin/hgBlat
Default: "human" (hg38)
verbose: Print progress information. Default: True
Returns:
Dict[str, Any]: DataFrame with BLAT results including genomic coordinates and alignment details
Example:
Input: sequence="ATGCGATCGTAGC", assembly="human"
Output: DataFrame with chromosome, start, end positions and alignment scores
"""
with start_action(action_type="gget_blat", sequence_length=len(sequence), assembly=assembly):
result = gget.blat(
sequence=sequence,
seqtype=seqtype,
assembly=assembly,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def muscle_align(
self,
sequences: Union[List[str], str],
super5: bool = False,
out: Optional[str] = None,
verbose: bool = True
) -> Optional[str]:
"""Align multiple nucleotide or amino acid sequences using MUSCLE v5 algorithm.
Args:
sequences: List of sequences or path to FASTA file containing sequences to be aligned
super5: If True, use Super5 algorithm instead of PPP (for large inputs with hundreds of sequences). Default: False
out: Path to save aligned FASTA (.afa) file (e.g., 'path/to/results.afa').
Default: None (results printed in Clustal format)
verbose: Print progress information. Default: True
Returns:
Optional[str]: Alignment results in aligned FASTA (.afa) format, or None if saved to file
Example:
Input: sequences=["ATGCGATC", "ATGCGTTC", "ATGCGATG"]
Output: Aligned sequences in FASTA format or saved to file if 'out' specified
"""
with start_action(action_type="gget_muscle", num_sequences=len(sequences) if isinstance(sequences, list) else None):
result = gget.muscle(
fasta=sequences,
super5=super5,
out=out,
verbose=verbose
)
return result
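    # Hedged sketch (sequences from the docstring example, async context assumed):
    # aligning a small set in memory; pass `out` to write an .afa file instead.
    #
    #     alignment = await server.muscle_align(["ATGCGATC", "ATGCGTTC", "ATGCGATG"])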
async def diamond_align(
self,
sequences: Union[str, List[str]],
reference: Union[str, List[str]],
translated: bool = False,
diamond_db: Optional[str] = None,
sensitivity: str = "very-sensitive",
threads: int = 1,
diamond_binary: Optional[str] = None,
verbose: bool = True,
out: Optional[str] = None
) -> Dict[str, Any]:
"""Align multiple protein or translated DNA sequences using DIAMOND.
Args:
sequences: Query sequences (string, list) or path to FASTA file with sequences to align against reference
reference: Reference sequences (string, list) or path to FASTA file with reference sequences
Set translated=True if reference is amino acids and query is nucleotides
translated: If True, performs translated alignment of nucleotide sequences to amino acid references. Default: False
diamond_db: Path to save DIAMOND database created from reference.
Default: None (temporary db deleted after alignment or saved in 'out' if provided)
sensitivity: DIAMOND alignment sensitivity - 'fast', 'mid-sensitive', 'sensitive',
'more-sensitive', 'very-sensitive', or 'ultra-sensitive'. Default: "very-sensitive"
threads: Number of threads for alignment. Default: 1
diamond_binary: Path to DIAMOND binary (e.g., 'path/bins/Linux/diamond').
Default: None (uses DIAMOND binary installed with gget)
verbose: Print progress information. Default: True
out: Path to folder to save DIAMOND results. Default: None (standard out, temporary files deleted)
Returns:
Dict[str, Any]: DataFrame with DIAMOND alignment results including similarity scores and positions
Example:
Input: sequences=["MKVLWA"], reference=["MKVLWAICAV"], sensitivity="sensitive"
Output: DataFrame with alignment scores, positions, and match details
"""
with start_action(action_type="gget_diamond", sensitivity=sensitivity):
result = gget.diamond(
query=sequences,
reference=reference,
translated=translated,
diamond_db=diamond_db,
sensitivity=sensitivity,
threads=threads,
diamond_binary=diamond_binary,
verbose=verbose,
out=out
)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def archs4_expression(
self,
gene: str,
ensembl: bool = False,
which: str = "correlation",
gene_count: int = 100,
species: str = "human",
verbose: bool = True
) -> Dict[str, Any]:
"""Find correlated genes or tissue expression atlas using ARCHS4 RNA-seq database.
Args:
gene: Gene symbol (e.g., 'STAT4') or Ensembl ID if ensembl=True (e.g., 'ENSG00000138378')
ensembl: If True, 'gene' parameter is treated as Ensembl gene ID. Default: False
which: Analysis type - 'correlation' (most correlated genes) or 'tissue' (tissue expression atlas). Default: "correlation"
gene_count: Number of correlated genes to return (only for correlation analysis). Default: 100
species: Target species - 'human' or 'mouse' (only for tissue expression atlas). Default: "human"
verbose: Print progress information. Default: True
Returns:
Dict[str, Any]: DataFrame with correlation table (100 most correlated genes with Pearson correlation)
or tissue expression atlas depending on 'which' parameter
Example (correlation):
Input: gene="STAT4", which="correlation"
Output: DataFrame with 100 most correlated genes and Pearson correlation coefficients
Example (tissue):
Input: gene="STAT4", which="tissue", species="human"
Output: DataFrame with tissue expression levels across human samples
"""
with start_action(action_type="gget_archs4", gene=gene, which=which):
result = gget.archs4(
gene=gene,
ensembl=ensembl,
which=which,
gene_count=gene_count,
species=species,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def enrichr_analysis(
self,
genes: List[str],
database: str = "KEGG_2021_Human",
species: str = "human",
background_list: Optional[List[str]] = None,
background: bool = False,
ensembl: bool = False,
ensembl_bkg: bool = False,
plot: bool = False,
kegg_out: Optional[str] = None,
kegg_rank: int = 1,
verbose: bool = True
) -> Dict[str, Any]:
"""Perform functional enrichment analysis on gene list using Enrichr.
Args:
genes: List of gene symbols (e.g., ['PHF14', 'RBM3']) or Ensembl IDs if ensembl=True
database: Reference database or shortcuts for human/mouse:
'pathway' (KEGG_2021_Human), 'transcription' (ChEA_2016),
'ontology' (GO_Biological_Process_2021), 'diseases_drugs' (GWAS_Catalog_2019),
'celltypes' (PanglaoDB_Augmented_2021), 'kinase_interactions' (KEA_2015)
Or full database name from https://maayanlab.cloud/Enrichr/#libraries
species: Species database - 'human', 'mouse', 'fly', 'yeast', 'worm', 'fish'. Default: "human"
background_list: Custom background genes (only for human/mouse). Default: None
background: Use >20,000 default background genes (only for human/mouse). Default: False
ensembl: If True, 'genes' are Ensembl gene IDs. Default: False
ensembl_bkg: If True, 'background_list' are Ensembl gene IDs. Default: False
plot: Create graphical overview of first 15 results. Default: False
kegg_out: Path to save highlighted KEGG pathway image (e.g., 'path/kegg_pathway.png'). Default: None
kegg_rank: Pathway rank to plot in KEGG image. Default: 1
verbose: Print progress information. Default: True
Returns:
Dict[str, Any]: DataFrame with enrichment results including pathways, p-values, and statistical measures
Battle testing confirmed functional analysis capabilities with cancer genes
Example:
Input: genes=['PHF14', 'RBM3', 'MSL1'], database='pathway'
Output: DataFrame with KEGG pathway enrichment results and statistics
"""
with start_action(action_type="gget_enrichr", genes=genes, database=database):
result = gget.enrichr(
genes=genes,
database=database,
species=species,
background_list=background_list,
background=background,
ensembl=ensembl,
ensembl_bkg=ensembl_bkg,
plot=plot,
kegg_out=kegg_out,
kegg_rank=kegg_rank,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
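    # Hedged sketch (gene list from the docstring example, async context assumed):
    # the 'pathway' shortcut maps to KEGG_2021_Human for human/mouse input.
    #
    #     enrichment = await server.enrichr_analysis(["PHF14", "RBM3", "MSL1"], database="pathway")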
async def bgee_orthologs(
self,
gene_id: str,
type: str = "orthologs",
verbose: bool = True
) -> Dict[str, Any]:
"""Get orthologs or expression data for a gene from Bgee database.
PREREQUISITE: Use search_genes to get Ensembl ID first.
Args:
gene_id: Ensembl gene ID (e.g., 'ENSG00000012048' for BRCA1)
type: Type of data to retrieve - 'orthologs' (ortholog information across species)
or 'expression' (expression data). Default: "orthologs"
verbose: Print progress information. Default: True
Returns:
Dict[str, Any]: DataFrame with ortholog information across species or expression data from Bgee
Example workflow:
1. search_genes('BRCA1') → 'ENSG00000012048'
2. bgee_orthologs('ENSG00000012048') → ortholog data across species
Example (expression):
Input: gene_id='ENSG00000012048', type='expression'
Output: DataFrame with expression data from Bgee database
"""
with start_action(action_type="gget_bgee", gene_id=gene_id, type=type):
result = gget.bgee(gene_id=gene_id, type=type, verbose=verbose)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def get_pdb_structure(
self,
pdb_id: str,
resource: str = "pdb",
identifier: Optional[str] = None,
save: bool = False
) -> StructureResult:
"""Query RCSB PDB for protein structure/metadata of a given PDB ID.
IMPORTANT: This tool requires a specific PDB ID (e.g., '7S7U'), NOT gene names.
For gene-to-structure workflows:
1. Use search_genes to get Ensembl ID
2. Use get_sequences with translate=True to get protein sequence
3. Use alphafold_predict for structure prediction, OR
4. Search external databases (PDB website) for known PDB IDs, then use this tool
Args:
pdb_id: PDB ID to query (e.g., '7S7U', '2GS6')
resource: Type of information to return:
'pdb' (protein structure in PDB format - default)
'entry' (top-level PDB structure information)
'pubmed' (PubMed annotations for primary citation)
'assembly' (quaternary structure information)
'branched_entity', 'nonpolymer_entity', 'polymer_entity' (entity data)
'uniprot' (UniProt annotations)
'*_instance' variants (chain-specific data)
identifier: Assembly/entity ID (numbers like 1) or chain ID (letters like 'A') if applicable. Default: None
save: Save JSON/PDB results to current directory. Default: False
Returns:
StructureResult: JSON format (except resource='pdb' returns PDB format structure)
Battle testing confirmed successful retrieval of real PDB structures
Example:
Input: pdb_id='7S7U', resource='pdb'
Output: Protein structure in PDB format
Example (metadata):
Input: pdb_id='7S7U', resource='entry'
Output: JSON with PDB entry information, resolution, method, etc.
Alternative workflow for gene structure prediction:
1. search_genes('EGFR') → get Ensembl ID
2. get_sequences(ensembl_id, translate=True) → get protein sequence
3. alphafold_predict(protein_sequence) → predict structure
"""
with start_action(action_type="gget_pdb", pdb_id=pdb_id):
result = gget.pdb(pdb_id=pdb_id, resource=resource, identifier=identifier, save=save)
return result
async def alphafold_predict(
self,
sequence: Union[str, List[str]],
out: Optional[str] = None,
multimer_for_monomer: bool = False,
relax: bool = False,
multimer_recycles: int = 3,
plot: bool = True,
show_sidechains: bool = True,
verbose: bool = True
) -> StructureResult:
"""Predict protein structure using simplified AlphaFold v2.3.0 algorithm.
PREREQUISITE: Use get_sequences with translate=True to get protein sequence first.
Workflow for gene structure prediction:
1. search_genes → get Ensembl ID
2. get_sequences with translate=True → get protein sequence
3. alphafold_predict → predict structure
Args:
sequence: Amino acid sequence (string), list of sequences, or path to FASTA file
out: Path to folder to save prediction results. Default: None (auto-generated with timestamp)
multimer_for_monomer: Use multimer model for monomer prediction. Default: False
relax: Apply AMBER relaxation to best model. Default: False
multimer_recycles: Max recycling iterations for multimer model (higher=more accurate but slower). Default: 3
plot: Create graphical overview of prediction. Default: True
show_sidechains: Show side chains in plot. Default: True
verbose: Print progress information. Default: True
Returns:
StructureResult: AlphaFold structure prediction - saves aligned error (JSON) and prediction (PDB) files
Battle testing confirmed successful structure predictions with small proteins
Example full workflow:
1. search_genes('TP53') → 'ENSG00000141510'
2. get_sequences('ENSG00000141510', translate=True) → 'MEEPQSDPSVEPPLSQ...'
3. alphafold_predict('MEEPQSDPSVEPPLSQ...')
Example output:
Saves PDB structure file and confidence scores JSON in specified output folder
Note: This uses simplified AlphaFold without templates and limited MSA database.
For best accuracy, use full AlphaFold or AlphaFold Protein Structure Database.
Please cite gget and AlphaFold papers when using this function.
"""
        sequence_length = len(sequence) if isinstance(sequence, (str, list)) else None
with start_action(action_type="gget_alphafold", sequence_length=sequence_length):
result = gget.alphafold(
sequence=sequence,
out=out,
multimer_for_monomer=multimer_for_monomer,
relax=relax,
multimer_recycles=multimer_recycles,
plot=plot,
show_sidechains=show_sidechains,
verbose=verbose
)
return result
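    # Hedged end-to-end sketch of the gene → protein → structure workflow from the
    # docstring (illustrative names and output path, async context assumed; results are
    # written to the folder given via `out`, or an auto-generated one):
    #
    #     fasta_lines = await server.get_sequences("ENSG00000141510", translate=True)
    #     await server.alphafold_predict(fasta_lines[1], out="/tmp/af_tp53", plot=False)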
async def elm_analysis(
self,
sequence: str,
uniprot: bool = False,
sensitivity: str = "very-sensitive",
threads: int = 1,
diamond_binary: Optional[str] = None,
expand: bool = False,
verbose: bool = True,
out: Optional[str] = None
) -> Dict[str, Any]:
"""Locally predict Eukaryotic Linear Motifs from amino acid sequence or UniProt ID.
Args:
sequence: Amino acid sequence or UniProt accession (if uniprot=True)
uniprot: If True, input is UniProt accession instead of amino acid sequence. Default: False
sensitivity: DIAMOND alignment sensitivity - 'fast', 'mid-sensitive', 'sensitive',
'more-sensitive', 'very-sensitive', or 'ultra-sensitive'. Default: "very-sensitive"
threads: Number of threads for DIAMOND alignment. Default: 1
diamond_binary: Path to DIAMOND binary. Default: None (uses DIAMOND installed with gget)
expand: Expand regex dataframe to include protein names, organisms, and references. Default: False
verbose: Print progress information. Default: True
out: Path to folder to save results. Default: None (standard out, temporary files deleted)
Returns:
Dict[str, Any]: Two dataframes - ortholog motifs (experimentally validated in orthologs)
and regex motifs (direct regex matches in sequence)
Example:
Input: sequence="MKVLWAICAVL", uniprot=False
Output: {'ortholog_df': {...}, 'regex_df': {...}} with motif predictions
Example (UniProt):
Input: sequence="P04637", uniprot=True
Output: Motif analysis results for UniProt entry P04637
Note: ELM data is for non-commercial use only (ELM Software License Agreement).
"""
with start_action(action_type="gget_elm", sequence_length=len(sequence) if not uniprot else None):
result = gget.elm(
sequence=sequence,
uniprot=uniprot,
sensitivity=sensitivity,
threads=threads,
diamond_binary=diamond_binary,
expand=expand,
verbose=verbose,
out=out
)
# ELM returns two dataframes: ortholog_df and regex_df
if isinstance(result, tuple) and len(result) == 2:
ortholog_df, regex_df = result
data = {
"ortholog_df": ortholog_df.to_dict() if hasattr(ortholog_df, 'to_dict') else ortholog_df,
"regex_df": regex_df.to_dict() if hasattr(regex_df, 'to_dict') else regex_df
}
else:
data = result
return data
async def cosmic_search(
self,
searchterm: Optional[str] = None,
cosmic_tsv_path: Optional[str] = None,
limit: int = 100,
download_cosmic: bool = False,
cosmic_project: str = "cancer",
cosmic_version: Optional[int] = None,
grch_version: int = 37,
email: Optional[str] = None,
password: Optional[str] = None,
gget_mutate: bool = False,
keep_genome_info: bool = False,
remove_duplicates: bool = False,
seq_id_column: str = "seq_ID",
mutation_column: str = "mutation",
mut_id_column: str = "mutation_id",
out: Optional[str] = None,
verbose: bool = True
) -> Dict[str, Any]:
"""Search COSMIC database for cancer mutations or download COSMIC databases.
NOTE: Licence fees apply for commercial use of COSMIC (https://www.cosmickb.org/licensing).
Database downloads require COSMIC account (https://cancer.sanger.ac.uk/cosmic/register).
Args:
searchterm: Search term - gene name/Ensembl ID, mutation, sample ID, etc.
Examples: 'EGFR', 'ENST00000275493', 'c.650A>T', 'p.Q217L'
Set to None when downloading databases (download_cosmic=True)
cosmic_tsv_path: Path to COSMIC TSV file (required when download_cosmic=False)
limit: Number of search hits to return. Default: 100
download_cosmic: If True, switches to database download mode. Default: False
cosmic_project: COSMIC database type - 'cancer' (CMC), 'cancer_example', 'census',
'resistance', 'cell_line', 'genome_screen', 'targeted_screen'. Default: "cancer"
cosmic_version: COSMIC database version. Default: None (latest version)
grch_version: Human GRCh reference genome version (37 or 38). Default: 37
email: COSMIC login email (avoids interactive input). Default: None
password: COSMIC login password (stored in plain text). Default: None
gget_mutate: Create database modified for 'gget mutate' use. Default: False
keep_genome_info: Keep genome location info in gget_mutate database. Default: False
remove_duplicates: Remove duplicate rows in gget_mutate database. Default: False
seq_id_column: Name of seq_id column for gget_mutate CSV. Default: "seq_ID"
mutation_column: Name of mutation column for gget_mutate CSV. Default: "mutation"
mut_id_column: Name of mutation_id column for gget_mutate CSV. Default: "mutation_id"
out: Output path for results/database. Default: None (stdout/current directory)
verbose: Print progress information. Default: True
Returns:
Dict[str, Any]: DataFrame with mutation data including positions, amino acid changes, cancer types
(for searches) or database download confirmation (for downloads)
Example (search):
Input: searchterm='PIK3CA', cosmic_tsv_path='path/to/cosmic.tsv'
Output: Mutation data for PIK3CA gene
Example (download):
Input: download_cosmic=True, cosmic_project='cancer_example'
Output: Downloads example COSMIC database to specified folder
Note: This tool accepts gene symbols directly, no need for Ensembl ID conversion.
"""
with start_action(action_type="gget_cosmic", searchterm=searchterm, limit=limit):
result = gget.cosmic(
searchterm=searchterm,
cosmic_tsv_path=cosmic_tsv_path,
limit=limit,
download_cosmic=download_cosmic,
cosmic_project=cosmic_project,
cosmic_version=cosmic_version,
grch_version=grch_version,
email=email,
password=password,
gget_mutate=gget_mutate,
keep_genome_info=keep_genome_info,
remove_duplicates=remove_duplicates,
seq_id_column=seq_id_column,
mutation_column=mutation_column,
mut_id_column=mut_id_column,
out=out,
verbose=verbose
)
return result.to_dict() if hasattr(result, 'to_dict') else result
async def mutate_sequences(
self,
sequences: Union[str, List[str]],
mutations: Union[str, List[str]],
mut_column: str = "mutation",
seq_id_column: str = "seq_ID",
mut_id_column: Optional[str] = None,
gtf: Optional[str] = None,
gtf_transcript_id_column: Optional[str] = None,
k: int = 30,
min_seq_len: Optional[int] = None,
optimize_flanking_regions: bool = False,
remove_seqs_with_wt_kmers: bool = False,
max_ambiguous: Optional[int] = None,
merge_identical: bool = True,
update_df: bool = False,
update_df_out: Optional[str] = None,
store_full_sequences: bool = False,
translate: bool = False,
translate_start: Optional[Union[int, str]] = None,
translate_end: Optional[Union[int, str]] = None,
out: Optional[str] = None,
verbose: bool = True
) -> Union[Dict[str, Any], List[str]]:
"""Mutate nucleotide sequences according to provided mutations in standard annotation.
Args:
sequences: Path to FASTA file or sequences as string/list (e.g., 'AGCTAGCT' or ['ACTG', 'ATCG'])
mutations: Path to CSV/TSV file or mutations as string/list (e.g., 'c.2C>T' or ['c.2C>T', 'c.1A>C'])
Standard mutation annotation format required (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1867422/)
mut_column: Name of mutation column in mutations file. Default: "mutation"
seq_id_column: Name of sequence ID column in mutations file. Default: "seq_ID"
mut_id_column: Name of mutation ID column. Default: None (uses mut_column)
gtf: Path to GTF file for transcript boundaries (when sequences is genome FASTA). Default: None
gtf_transcript_id_column: Column with transcript IDs when using GTF. Default: None
k: Length of flanking sequences around mutation. Default: 30
min_seq_len: Minimum length of output sequences (shorter sequences dropped). Default: None
optimize_flanking_regions: Remove nucleotides to avoid wildtype k-mers. Default: False
remove_seqs_with_wt_kmers: Remove sequences with wildtype (k+1)-mers. Default: False
max_ambiguous: Maximum 'N' characters allowed in output. Default: None
merge_identical: Merge identical mutant sequences. Default: True
update_df: Generate updated mutations DataFrame with additional columns. Default: False
update_df_out: Path for updated DataFrame output. Default: None (auto-generated)
store_full_sequences: Include complete sequences in updated DataFrame. Default: False
translate: Add amino acid sequences to updated DataFrame. Default: False
translate_start: Translation start position or column name. Default: None
translate_end: Translation end position or column name. Default: None
out: Path to output FASTA file. Default: None (returns sequences to stdout)
verbose: Print progress information. Default: True
Returns:
Union[Dict[str, Any], List[str]]: List of mutated sequences or updated DataFrame
(depending on update_df setting)
Example (simple):
Input: sequences=['ATGCGATC'], mutations=['c.2T>G']
Output: List of mutated sequences with flanking regions
Example (file-based):
Input: sequences='seqs.fa', mutations='mutations.csv'
Output: Mutated sequences according to mutations table
Note: Sequence IDs in FASTA must match seq_ID column in mutations file.
Supports complex mutations: substitutions (c.2C>T), insertions, deletions, inversions.
"""
num_sequences = len(sequences) if isinstance(sequences, list) else 1
with start_action(action_type="gget_mutate", num_sequences=num_sequences):
result = gget.mutate(
sequences=sequences,
mutations=mutations,
mut_column=mut_column,
seq_id_column=seq_id_column,
mut_id_column=mut_id_column,
gtf=gtf,
gtf_transcript_id_column=gtf_transcript_id_column,
k=k,
min_seq_len=min_seq_len,
optimize_flanking_regions=optimize_flanking_regions,
remove_seqs_with_wt_kmers=remove_seqs_with_wt_kmers,
max_ambiguous=max_ambiguous,
merge_identical=merge_identical,
update_df=update_df,
update_df_out=update_df_out,
store_full_sequences=store_full_sequences,
translate=translate,
translate_start=translate_start,
translate_end=translate_end,
out=out,
verbose=verbose
)
return result
async def opentargets_analysis(
self,
ensembl_id: str,
resource: str = "diseases",
limit: Optional[int] = None,
verbose: bool = True,
wrap_text: bool = False,
filters: Optional[Dict[str, str]] = None,
filter_mode: str = "and"
) -> Dict[str, Any]:
"""Query OpenTargets for diseases, drugs, and other data associated with a gene.
PREREQUISITE: Use search_genes to get Ensembl ID first.
Args:
ensembl_id: Ensembl gene ID (e.g., 'ENSG00000169194')
resource: Type of information to return:
'diseases' (gene-disease associations - default)
'drugs' (gene-drug associations)
'tractability' (druggability data)
'pharmacogenetics' (pharmacogenomics data)
'expression' (tissue/organ expression)
'depmap' (DepMap gene-disease effects)
'interactions' (protein-protein interactions)
limit: Maximum number of results. Default: None (no limit)
Note: Not compatible with 'tractability' and 'depmap' resources
verbose: Print progress messages. Default: True
wrap_text: Display DataFrame with wrapped text for readability. Default: False
filters: Filters to apply by resource type:
diseases: None
drugs: {'disease_id': 'EFO_0000274'}
tractability: None
pharmacogenetics: {'drug_id': 'CHEMBL535'}
expression: {'tissue_id': 'UBERON_0002245', 'anatomical_system': 'nervous system', 'organ': 'brain'}
depmap: {'tissue_id': 'UBERON_0002245'}
interactions: {'protein_a_id': 'ENSP00000304915', 'protein_b_id': 'ENSP00000379111', 'gene_b_id': 'ENSG00000077238'}
filter_mode: How to combine multiple filters - 'and' or 'or'. Default: "and"
Returns:
Dict[str, Any]: DataFrame with disease/drug associations, clinical evidence, and experimental data
Battle testing confirmed functional disease association analysis
Example workflow:
        1. search_genes('TP53') → 'ENSG00000141510'
2. opentargets_analysis('ENSG00000141510') → disease associations
Example (with filters):
Input: ensembl_id='ENSG00000169194', resource='expression', filters={'organ': 'brain'}
Output: Brain expression data for specified gene
"""
with start_action(action_type="gget_opentargets", ensembl_id=ensembl_id, resource=resource):
result = gget.opentargets(
ensembl_id=ensembl_id,
resource=resource,
limit=limit,
verbose=verbose,
wrap_text=wrap_text,
filters=filters,
filter_mode=filter_mode
)
return result.to_dict() if hasattr(result, 'to_dict') else result
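    # Hedged sketch (Ensembl ID and filter from the docstring example, async context
    # assumed): restricting expression results to a single organ.
    #
    #     expr = await server.opentargets_analysis(
    #         "ENSG00000169194", resource="expression", filters={"organ": "brain"}
    #     )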
async def cellxgene_query(
self,
species: str = "homo_sapiens",
gene: Optional[Union[str, List[str]]] = None,
ensembl: bool = False,
column_names: List[str] = ["dataset_id", "assay", "suspension_type", "sex", "tissue_general", "tissue", "cell_type"],
meta_only: bool = False,
tissue: Optional[Union[str, List[str]]] = None,
cell_type: Optional[Union[str, List[str]]] = None,
development_stage: Optional[Union[str, List[str]]] = None,
disease: Optional[Union[str, List[str]]] = None,
sex: Optional[Union[str, List[str]]] = None,
is_primary_data: bool = True,
dataset_id: Optional[Union[str, List[str]]] = None,
tissue_general_ontology_term_id: Optional[Union[str, List[str]]] = None,
tissue_general: Optional[Union[str, List[str]]] = None,
assay_ontology_term_id: Optional[Union[str, List[str]]] = None,
assay: Optional[Union[str, List[str]]] = None,
cell_type_ontology_term_id: Optional[Union[str, List[str]]] = None,
development_stage_ontology_term_id: Optional[Union[str, List[str]]] = None,
disease_ontology_term_id: Optional[Union[str, List[str]]] = None,
donor_id: Optional[Union[str, List[str]]] = None,
self_reported_ethnicity_ontology_term_id: Optional[Union[str, List[str]]] = None,
self_reported_ethnicity: Optional[Union[str, List[str]]] = None,
sex_ontology_term_id: Optional[Union[str, List[str]]] = None,
suspension_type: Optional[Union[str, List[str]]] = None,
tissue_ontology_term_id: Optional[Union[str, List[str]]] = None,
census_version: str = "stable",
verbose: bool = True,
out: Optional[str] = None
) -> Dict[str, Any]:
"""Query single-cell RNA-seq data from CZ CELLxGENE Discover using Census.
NOTE: Querying large datasets requires >16 GB RAM and >5 Mbps internet connection.
Use cell metadata attributes to define specific (sub)datasets of interest.
Args:
species: Target species - 'homo_sapiens' or 'mus_musculus'. Default: "homo_sapiens"
gene: Gene name(s) or Ensembl ID(s) (e.g., ['ACE2', 'SLC5A1'] or ['ENSG00000130234']). Default: None
Set ensembl=True when providing Ensembl IDs
ensembl: If True, genes are Ensembl IDs instead of gene names. Default: False
column_names: Metadata columns to return in AnnData.obs. Default: ["dataset_id", "assay", "suspension_type", "sex", "tissue_general", "tissue", "cell_type"]
meta_only: If True, returns only metadata DataFrame (AnnData.obs). Default: False
tissue: Tissue(s) to query (e.g., ['lung', 'blood']). Default: None
cell_type: Cell type(s) to query (e.g., ['mucus secreting cell']). Default: None
development_stage: Development stage(s) to filter. Default: None
disease: Disease(s) to filter. Default: None
sex: Sex(es) to filter (e.g., 'female'). Default: None
is_primary_data: If True, returns only canonical instance of cellular observation. Default: True
dataset_id: CELLxGENE dataset ID(s) to query. Default: None
tissue_general_ontology_term_id: High-level tissue UBERON ID(s). Default: None
tissue_general: High-level tissue label(s). Default: None
assay_ontology_term_id: Assay ontology term ID(s). Default: None
assay: Assay type(s) as defined in CELLxGENE schema. Default: None
cell_type_ontology_term_id: Cell type ontology term ID(s). Default: None
development_stage_ontology_term_id: Development stage ontology term ID(s). Default: None
disease_ontology_term_id: Disease ontology term ID(s). Default: None
donor_id: Donor ID(s) as defined in CELLxGENE schema. Default: None
self_reported_ethnicity_ontology_term_id: Ethnicity ontology ID(s). Default: None
self_reported_ethnicity: Self-reported ethnicity. Default: None
sex_ontology_term_id: Sex ontology ID(s). Default: None
suspension_type: Suspension type(s) as defined in CELLxGENE schema. Default: None
tissue_ontology_term_id: Tissue ontology term ID(s). Default: None
census_version: Census version ('stable', 'latest', or specific date like '2023-05-15'). Default: "stable"
verbose: Print progress information. Default: True
out: Path to save AnnData h5ad file (or CSV when meta_only=True). Default: None
Returns:
Dict[str, Any]: AnnData object (when meta_only=False) or DataFrame (when meta_only=True)
with single-cell expression data and metadata
Example:
Input: gene=['ACE2'], tissue=['lung'], cell_type=['alveolar epithelial cell']
Output: Single-cell expression data for ACE2 in lung alveolar epithelial cells
Example (metadata only):
Input: tissue=['brain'], meta_only=True
Output: Metadata DataFrame for brain tissue datasets
"""
with start_action(action_type="gget_cellxgene", genes=gene, tissues=tissue):
result = gget.cellxgene(
species=species,
gene=gene,
ensembl=ensembl,
column_names=column_names,
meta_only=meta_only,
tissue=tissue,
cell_type=cell_type,
development_stage=development_stage,
disease=disease,
sex=sex,
is_primary_data=is_primary_data,
dataset_id=dataset_id,
tissue_general_ontology_term_id=tissue_general_ontology_term_id,
tissue_general=tissue_general,
assay_ontology_term_id=assay_ontology_term_id,
assay=assay,
cell_type_ontology_term_id=cell_type_ontology_term_id,
development_stage_ontology_term_id=development_stage_ontology_term_id,
disease_ontology_term_id=disease_ontology_term_id,
donor_id=donor_id,
self_reported_ethnicity_ontology_term_id=self_reported_ethnicity_ontology_term_id,
self_reported_ethnicity=self_reported_ethnicity,
sex_ontology_term_id=sex_ontology_term_id,
suspension_type=suspension_type,
tissue_ontology_term_id=tissue_ontology_term_id,
census_version=census_version,
verbose=verbose,
out=out
)
return result
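    # Hedged sketch (filter values from the docstring example, async context assumed):
    # metadata-only queries skip the expression matrix and return AnnData.obs-style data.
    #
    #     meta = await server.cellxgene_query(
    #         gene=["ACE2"], tissue=["lung"], cell_type=["alveolar epithelial cell"], meta_only=True
    #     )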
async def setup_databases(
self,
module: str,
verbose: bool = True,
out: Optional[str] = None
) -> Dict[str, Any]:
"""Install third-party dependencies for specified gget modules.
Some modules require pip and curl to be installed on the system.
Args:
module: gget module to install dependencies for - 'alphafold', 'cellxgene', 'elm', 'gpt', or 'cbio'
verbose: Print progress information. Default: True
out: Path to directory for downloaded files (currently only applies to 'elm' module).
Default: None (files saved in gget installation directory)
NOTE: Do not use this argument when downloading files for use with 'gget.elm'
Returns:
Dict[str, Any]: Setup status with success indicator and messages
Battle testing confirmed setup functionality for ELM module
Example:
Input: module='elm'
Output: Downloads and installs ELM dependencies for motif analysis
Example:
Input: module='cellxgene'
Output: Installs CELLxGENE Census dependencies for single-cell data
Note: Available modules requiring setup: 'alphafold', 'cellxgene', 'elm', 'gpt', 'cbio'
"""
with start_action(action_type="gget_setup", module=module):
# Valid modules that require setup based on gget.setup help
valid_modules = ["alphafold", "cellxgene", "elm", "gpt", "cbio"]
if module not in valid_modules:
return {
"data": None,
"success": False,
"message": f"Invalid module '{module}'. Valid modules are: {', '.join(valid_modules)}"
}
try:
result = gget.setup(module, verbose=verbose, out=out)
return {
"data": result,
"success": True,
"message": f"Setup completed for {module} module"
}
except Exception as e:
return {
"data": None,
"success": False,
"message": f"Setup failed for {module} module: {str(e)}"
}
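    # Hedged sketch (async context assumed): the returned dict carries a success flag
    # rather than raising, so callers should check it before using the module.
    #
    #     status = await server.setup_databases("elm")
    #     if not status["success"]:
    #         raise RuntimeError(status["message"])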
# Local mode wrapper functions for large data
async def get_sequences_local(
self,
ensembl_ids: Union[str, List[str]],
translate: bool = False,
isoforms: bool = False,
verbose: bool = True,
output_path: Optional[str] = None,
format: Literal["fasta"] = "fasta"
) -> LocalFileResult:
"""Fetch sequences and save to local file in stdio-local mode.
PREREQUISITE: Use search_genes first to get Ensembl IDs from gene names/symbols.
Args:
ensembl_ids: One or more Ensembl gene IDs as string or list (e.g., 'ENSG00000141510' or ['ENSG00000141510'])
Also supports WormBase and FlyBase IDs
translate: If True, returns amino acid sequences; if False, returns nucleotide sequences. Default: False
isoforms: If True, returns sequences of all known transcripts (only for gene IDs). Default: False
verbose: If True, prints progress information. Default: True
output_path: Optional specific output path (will generate if not provided)
format: Output format (currently supports 'fasta')
Returns:
LocalFileResult: Contains path, format, and success information instead of sequence data
Battle testing confirmed reliable file creation with proper FASTA formatting
"""
# Get the sequence data using the original function
with start_action(action_type="gget_seq_local", ensembl_ids=ensembl_ids, translate=translate):
result = gget.seq(ens_ids=ensembl_ids, translate=translate, isoforms=isoforms, verbose=verbose)
# Save to file
ensembl_list = ensembl_ids if isinstance(ensembl_ids, list) else [ensembl_ids]
default_prefix = f"sequences_{'_'.join(ensembl_list[:3])}{'_protein' if translate else '_dna'}"
return self._save_to_local_file(result, format, output_path, default_prefix)
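    # Hedged sketch (async context assumed, illustrative path): in stdio-local mode this
    # tool returns a LocalFileResult pointing at the written FASTA instead of sequence data.
    #
    #     file_info = await server.get_sequences_local("ENSG00000141510", translate=True)
    #     # e.g. {"path": ".../gget_output/sequences_ENSG00000141510_protein_1a2b3c4d.fasta",
    #     #       "format": "fasta", "success": True}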
async def get_pdb_structure_local(
self,
pdb_id: str,
resource: str = "pdb",
identifier: Optional[str] = None,
save: bool = False,
output_path: Optional[str] = None,
format: Literal["pdb"] = "pdb"
) -> LocalFileResult:
"""Fetch PDB structure and save to local file in stdio-local mode.
Args:
pdb_id: PDB ID to query (e.g., '7S7U', '2GS6')
resource: Type of information to return - 'pdb' (structure), 'entry', 'pubmed', 'assembly', etc.
identifier: Assembly/entity ID (numbers like 1) or chain ID (letters like 'A') if applicable. Default: None
save: Save JSON/PDB results to current directory. Default: False
output_path: Optional specific output path (will generate if not provided)
format: Output format (currently supports 'pdb')
Returns:
LocalFileResult: Contains path, format, and success information instead of structure data
Battle testing confirmed successful retrieval of real PDB structures
"""
with start_action(action_type="gget_pdb_local", pdb_id=pdb_id):
result = gget.pdb(pdb_id=pdb_id, resource=resource, identifier=identifier, save=save)
default_prefix = f"structure_{pdb_id}_{resource}"
return self._save_to_local_file(result, format, output_path, default_prefix)
async def alphafold_predict_local(
self,
sequence: Union[str, List[str]],
out: Optional[str] = None,
multimer_for_monomer: bool = False,
relax: bool = False,
multimer_recycles: int = 3,
plot: bool = True,
show_sidechains: bool = True,
verbose: bool = True,
output_path: Optional[str] = None,
format: Literal["pdb"] = "pdb"
) -> LocalFileResult:
"""Predict protein structure using AlphaFold and save to local file.
Args:
sequence: Amino acid sequence (string), list of sequences, or path to FASTA file
out: Path to folder to save prediction results. Default: None (auto-generated with timestamp)
multimer_for_monomer: Use multimer model for monomer prediction. Default: False
relax: Apply AMBER relaxation to best model. Default: False
multimer_recycles: Max recycling iterations for multimer model (higher=more accurate but slower). Default: 3
plot: Create graphical overview of prediction. Default: True
show_sidechains: Show side chains in plot. Default: True
verbose: Print progress information. Default: True
output_path: Optional specific output path (will generate if not provided)
format: Output format (currently supports 'pdb')
Returns:
LocalFileResult: Contains path, format, and success information instead of structure data
Battle testing confirmed successful AlphaFold predictions with small proteins
"""
        sequence_length = len(sequence) if isinstance(sequence, (str, list)) else None
with start_action(action_type="gget_alphafold_local", sequence_length=sequence_length):
result = gget.alphafold(
sequence=sequence,
out=out,
multimer_for_monomer=multimer_for_monomer,
relax=relax,
multimer_recycles=multimer_recycles,
plot=plot,
show_sidechains=show_sidechains,
verbose=verbose
)
            default_prefix = "alphafold_prediction"  # _save_to_local_file appends a unique suffix
return self._save_to_local_file(result, format, output_path, default_prefix)
async def muscle_align_local(
self,
sequences: Union[List[str], str],
super5: bool = False,
verbose: bool = True,
output_path: Optional[str] = None,
format: Literal["fasta", "afa"] = "fasta"
) -> LocalFileResult:
"""Align sequences using MUSCLE and save to local file.
Args:
sequences: List of sequences or path to FASTA file containing sequences to be aligned
super5: If True, use Super5 algorithm instead of PPP (for large inputs with hundreds of sequences). Default: False
verbose: Print progress information. Default: True
output_path: Optional specific output path (will generate if not provided)
format: Output format ('fasta' for FASTA format, 'afa' for aligned FASTA format)
Returns:
LocalFileResult: Contains path, format, and success information instead of alignment data
Battle testing confirmed successful alignment of real biological sequences
"""
with start_action(action_type="gget_muscle_local", num_sequences=len(sequences) if isinstance(sequences, list) else None):
# Map format types to file extensions
format_extensions = {
'fasta': '.fasta',
'afa': '.afa'
}
extension = format_extensions.get(format, '.fasta')
# Handle output path
if output_path is None:
# Generate a unique filename in the default output directory
                num_label = f"{len(sequences)}seqs" if isinstance(sequences, list) else "fasta_input"
                base_name = f"muscle_alignment_{num_label}_{str(uuid.uuid4())[:8]}"
file_path = self.output_dir / f"{base_name}{extension}"
else:
# Use the provided path
path_obj = Path(output_path)
if path_obj.is_absolute():
# Absolute path - use as is, but ensure it has the right extension
if path_obj.suffix != extension:
file_path = path_obj.with_suffix(extension)
else:
file_path = path_obj
else:
# Relative path - concatenate with output directory
if not str(output_path).endswith(extension):
file_path = self.output_dir / f"{output_path}{extension}"
else:
file_path = self.output_dir / output_path
# Use gget.muscle with out parameter to save directly to file
result = gget.muscle(fasta=sequences, super5=super5, out=str(file_path), verbose=verbose)
return {
"path": str(file_path),
"format": format,
"success": True
}
async def diamond_align_local(
self,
sequences: Union[str, List[str]],
reference: Union[str, List[str]],
translated: bool = False,
diamond_db: Optional[str] = None,
sensitivity: str = "very-sensitive",
threads: int = 1,
diamond_binary: Optional[str] = None,
verbose: bool = True,
out: Optional[str] = None,
output_path: Optional[str] = None,
format: Literal["json", "tsv"] = "json"
) -> LocalFileResult:
"""Align sequences using DIAMOND and save to local file.
Args:
sequences: Query sequences (string, list) or path to FASTA file with sequences to align against reference
reference: Reference sequences (string, list) or path to FASTA file with reference sequences
translated: If True, performs translated alignment of nucleotide sequences to amino acid references. Default: False
diamond_db: Path to save DIAMOND database created from reference. Default: None
sensitivity: DIAMOND alignment sensitivity - 'fast', 'mid-sensitive', 'sensitive',
'more-sensitive', 'very-sensitive', or 'ultra-sensitive'. Default: "very-sensitive"
threads: Number of threads for alignment. Default: 1
diamond_binary: Path to DIAMOND binary. Default: None (uses DIAMOND binary installed with gget)
verbose: Print progress information. Default: True
out: Path to folder to save DIAMOND results. Default: None
output_path: Optional specific output path (will generate if not provided)
format: Output format ('json' recommended, 'tsv' also supported)
Returns:
LocalFileResult: Contains path, format, and success information instead of alignment data
Battle testing showed reliable DIAMOND alignment functionality
"""
with start_action(action_type="gget_diamond_local", sensitivity=sensitivity):
result = gget.diamond(
query=sequences,
reference=reference,
translated=translated,
diamond_db=diamond_db,
sensitivity=sensitivity,
threads=threads,
diamond_binary=diamond_binary,
verbose=verbose,
out=out
)
# Convert result to dict if it has to_dict method
if hasattr(result, 'to_dict'):
result = result.to_dict()
            default_prefix = "diamond_alignment"  # _save_to_local_file appends a unique suffix
return self._save_to_local_file(result, format, output_path, default_prefix)