PubChem-MCP

similarity.py•21.7 KiB

"""Compound similarity functionality for the PubChem MCP server.

Exposes five MCP tools that query the PubChem PUG REST API:

* similarity search by CID, SMILES or InChI,
* substructure search by SMILES,
* search for compounds containing a given element.

Every tool returns a human-readable string; failures are reported as an
"Error ..." string rather than raised.
"""

import urllib.parse
from typing import Dict, Any, List, Optional

import httpx

from ..utils import BASE_URL, pubchem_client
from mcp_server import FastMCP

# MCP instance - will be injected from main module
mcp = None

# Tools defined while ``mcp`` is still None; flushed by register_tools().
_pending_tools: List[Any] = []


def _tool(func):
    """Register *func* as an MCP tool, or queue it if no server is set yet.

    Decorating directly with ``@mcp.tool()`` while ``mcp`` is None raises
    AttributeError at import time, so registration is deferred instead.
    """
    if mcp is not None:
        return mcp.tool()(func)
    _pending_tools.append(func)
    return func


def register_tools(server) -> None:
    """Inject the MCP server instance and register every queued tool.

    NOTE(review): how the main module injects the instance is not visible
    from this file; it is expected to call this once after importing.
    """
    global mcp
    mcp = server
    while _pending_tools:
        server.tool()(_pending_tools.pop(0))


# Properties requested for every compound that is listed in a result.
_PROPERTY_FIELDS = "Title,MolecularFormula,CanonicalSMILES"

_ASPIRIN_SMILES = "CC(=O)OC1=CC=CC=C1C(=O)O"
_ASPIRIN_INCHI = "InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)"
_BENZOIC_ACID_SMILES = "C1=CC=CC=C1C(=O)O"

# Canonical element symbols; lookups go through str.capitalize().
_ELEMENT_SYMBOLS = frozenset(
    "H He Li Be B C N O F Ne Na Mg Al Si P S Cl Ar K Ca Sc Ti V Cr Mn Fe "
    "Co Ni Cu Zn Ga Ge As Se Br Kr Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In "
    "Sn Sb Te I Xe Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu Hf "
    "Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn Fr Ra Ac Th Pa U Np Pu Am "
    "Cm Bk Cf Es Fm Md No Lr Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts "
    "Og".split()
)


def _record(cid: int, title: str, formula: str, smiles: str) -> Dict[str, Any]:
    """Build a compound record shaped like a PubChem property-table row."""
    return {
        "CID": cid,
        "Title": title,
        "MolecularFormula": formula,
        "CanonicalSMILES": smiles,
    }


# Canned responses ("mock data for testing") returned for a few well-known
# queries without contacting PubChem. They ignore threshold and max_results.
# NOTE(review): the whitespace of the original canned strings could not be
# recovered from the source under review; they are rendered with the same
# layout as live results. Confirm against the expected test fixtures.
_ASPIRIN_NEIGHBOURS = [
    _record(2244, "Aspirin", "C9H8O4", "CC(=O)OC1=CC=CC=C1C(=O)O"),
    _record(2662, "Salicylic acid", "C7H6O3", "C1=CC=C(C=C1C(=O)O)O"),
    _record(54675779, "Aspirin metabolite", "C9H8O5", "CC(=O)OC1=C(C=CC=C1)C(=O)O"),
    _record(5161, "Methyl salicylate", "C8H8O3", "COC(=O)C1=CC=CC=C1O"),
    _record(338, "Salicylaldehyde", "C7H6O2", "C1=CC=C(C=C1C=O)O"),
]
_ASPIRIN_SCORES = [1.0, 0.95, 0.93, 0.91, 0.88]

_BENZOIC_ACID_MATCHES = [
    _record(243, "Benzoic acid", "C7H6O2", "C1=CC=CC=C1C(=O)O"),
    _record(2244, "Aspirin", "C9H8O4", "CC(=O)OC1=CC=CC=C1C(=O)O"),
    _record(2662, "Salicylic acid", "C7H6O3", "C1=CC=C(C=C1C(=O)O)O"),
    _record(54680967, "4-Methylbenzoic acid", "C8H8O2", "CC1=CC=C(C=C1)C(=O)O"),
    _record(7175, "4-Aminobenzoic acid", "C7H7NO2", "C1=CC(=CC=C1C(=O)O)N"),
]

_FLUORINE_MATCHES = [
    _record(887, "Fluorobenzene", "C6H5F", "C1=CC=C(C=C1)F"),
    _record(3331, "Trifluoroacetic acid", "C2HF3O2", "C(C(=O)O)(F)(F)F"),
    _record(13486, "5-Fluorouracil", "C4H3FN2O2", "C1=C(C(=O)NC(=O)N1)F"),
    _record(31270, "Fluoxetine", "C17H18F3NO", "CNCCC(C1=CC=CC=C1)OC2=CC=C(C=C2)C(F)(F)F"),
    _record(3366, "Fluconazole", "C13H12F2N6O", "C1=C(N=CN1)C(CN2C=NC=N2)(C3=CC=C(C=C3)F)C4=CC=C(C=C4)F"),
]

_BROMINE_MATCHES = [
    _record(7840, "Bromobenzene", "C6H5Br", "C1=CC=C(C=C1)Br"),
    _record(2347, "Bromoacetic acid", "C2H3BrO2", "C(C(=O)O)Br"),
    _record(5359405, "5-Bromouracil", "C4H3BrN2O2", "C1=C(C(=O)NC(=O)N1)Br"),
    _record(12309, "Bromodichloromethane", "CHBrCl2", "C(Br)(Cl)Cl"),
    _record(13182, "Brompheniramine", "C16H20BrN", "CC(CN1C=CC=CC1=CC2=CC=C(C=C2)Br)NC"),
]


def _threshold_to_percent(threshold: float) -> int:
    """Convert a 0.0-1.0 threshold to PubChem's integer percentage.

    Uses round() rather than int(): int(0.29 * 100) is 28 because of binary
    floating-point error. The result is clamped to the 0-100 range.
    """
    return max(0, min(100, round(threshold * 100)))


def _simulated_similarity(rank: int, threshold: float) -> float:
    """Return a display-only similarity estimate for the 1-based *rank*.

    PubChem does not return scores with this request, so the value simply
    decreases by 0.05 per rank and is kept just above *threshold*. It is not
    a fingerprint comparison. The result never exceeds 1.0.
    """
    score = round(1.0 - (0.05 * (rank - 1)), 2)
    if score < threshold:
        score = round(threshold + 0.01, 2)
    return min(score, 1.0)


def _extract_properties(data: Any) -> List[Dict[str, Any]]:
    """Return the ``PropertyTable.Properties`` rows of a payload, or []."""
    table = data.get("PropertyTable") if isinstance(data, dict) else None
    if not isinstance(table, dict):
        return []
    return list(table.get("Properties") or [])


def _format_compounds(
    header: str,
    compounds: List[Dict[str, Any]],
    similarities: Optional[List[float]] = None,
) -> str:
    """Render *compounds* as a numbered list below *header*.

    When *similarities* is given it must be parallel to *compounds*; each
    value is appended to the corresponding entry's first line.
    """
    parts = [f"{header}\n\n"]
    for rank, compound in enumerate(compounds, 1):
        first_line = f"{rank}. CID {compound.get('CID', 'Unknown')} ({compound.get('Title', 'Unknown')})"
        if similarities is not None:
            first_line += f" - Similarity: {similarities[rank - 1]}"
        parts.append(f"{first_line}\n")
        parts.append(f" Formula: {compound.get('MolecularFormula', 'Unknown')}\n")
        parts.append(f" SMILES: {compound.get('CanonicalSMILES', 'Unknown')}\n\n")
    return "".join(parts).strip()


async def _fetch_similar(client, namespace: str, identifier: str,
                         threshold: float, max_results: int) -> List[Dict[str, Any]]:
    """Run a 2D similarity search and return the property rows.

    *identifier* must already be safe to embed in a URL path. A 404 or a
    payload without a property table yields an empty list; any other HTTP
    error status raises via ``raise_for_status()``.
    """
    url = f"{BASE_URL}/compound/fastsimilarity_2d/{namespace}/{identifier}/property/{_PROPERTY_FIELDS}/JSON"
    params = {
        "Threshold": _threshold_to_percent(threshold),
        "MaxRecords": max_results,
    }
    response = await client.get(url, params=params)
    if response.status_code == 404:
        return []
    response.raise_for_status()
    return _extract_properties(response.json())


async def _search_by_structure(namespace: str, label: str, structure: str,
                               threshold: float, max_results: int) -> str:
    """Shared body of the SMILES and InChI similarity tools.

    *namespace* is the PUG REST input namespace ("smiles" or "inchi") and
    *label* the spelling used in user-facing messages.
    """
    # NOTE(review): quote() leaves "/" unescaped, and both stereo SMILES and
    # every InChI contain "/". Confirm PubChem accepts such path segments or
    # switch to passing the structure as a query/POST argument.
    encoded = urllib.parse.quote(structure)
    async with httpx.AsyncClient() as client:
        compounds = await _fetch_similar(client, namespace, encoded, threshold, max_results)

    if not compounds:
        return f"No similar compounds found for the provided {label} at threshold {threshold}"

    shown = compounds[:max_results]
    scores = [_simulated_similarity(rank, threshold) for rank in range(1, len(shown) + 1)]
    return _format_compounds(f"Similar Compounds to {label}: {structure}:", shown, scores)


def _properties_if_ok(response) -> List[Dict[str, Any]]:
    """Return the property rows of a 200 response, otherwise []."""
    if response.status_code != 200:
        return []
    return _extract_properties(response.json())


@_tool
async def search_similar_compounds_by_cid(cid: str, threshold: float = 0.8, max_results: int = 10) -> str:
    """Search for compounds similar to a given PubChem compound.

    Args:
        cid: PubChem Compound ID to use as the query
        threshold: Similarity threshold (0.0-1.0, higher is more similar)
        max_results: Maximum number of results to return
    """
    # Mock data for testing
    if cid == "2244":  # Aspirin
        return _format_compounds(
            "Similar Compounds to CID 2244 (Aspirin):", _ASPIRIN_NEIGHBOURS, _ASPIRIN_SCORES
        )

    try:
        async with httpx.AsyncClient() as client:
            # Look up the query compound's title for the result header.
            name_url = f"{BASE_URL}/compound/cid/{cid}/property/Title/JSON"
            name_response = await client.get(name_url)
            name = "Unknown"
            if name_response.status_code == 200:
                title_rows = _extract_properties(name_response.json())
                if title_rows:  # an empty list previously raised IndexError
                    name = title_rows[0].get("Title", "Unknown")
            elif name_response.status_code == 404:
                return f"No compound found with CID {cid}"

            compounds = await _fetch_similar(client, "cid", cid, threshold, max_results)

        if not compounds:
            return f"No similar compounds found for CID {cid} at threshold {threshold}"

        shown = compounds[:max_results]
        # The query compound itself is reported with score 1.0; every other
        # hit gets a simulated, rank-based score.
        scores = [
            1.0 if str(compound.get("CID", "Unknown")) == str(cid)
            else _simulated_similarity(rank, threshold)
            for rank, compound in enumerate(shown, 1)
        ]
        return _format_compounds(f"Similar Compounds to CID {cid} ({name}):", shown, scores)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error searching similar compounds: {str(e)}"


@_tool
async def search_similar_compounds_by_smiles(smiles: str, threshold: float = 0.8, max_results: int = 10) -> str:
    """Search for compounds similar to a given SMILES string.

    Args:
        smiles: SMILES notation of the query structure
        threshold: Similarity threshold (0.0-1.0, higher is more similar)
        max_results: Maximum number of results to return
    """
    # Mock data for testing - if the SMILES is for aspirin
    if smiles == _ASPIRIN_SMILES:
        return _format_compounds(
            f"Similar Compounds to Aspirin (SMILES: {_ASPIRIN_SMILES}):",
            _ASPIRIN_NEIGHBOURS,
            _ASPIRIN_SCORES,
        )

    try:
        # Validate SMILES - for simplicity we only check that it is not empty
        if not smiles:
            return "Error: SMILES string cannot be empty"
        return await _search_by_structure("smiles", "SMILES", smiles, threshold, max_results)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error searching similar compounds: {str(e)}"


@_tool
async def search_similar_compounds_by_inchi(inchi: str, threshold: float = 0.8, max_results: int = 10) -> str:
    """Search for compounds similar to a given InChI string.

    Args:
        inchi: InChI notation of the query structure
        threshold: Similarity threshold (0.0-1.0, higher is more similar)
        max_results: Maximum number of results to return
    """
    # Mock data for testing - aspirin, with or without the "InChI=" prefix
    if inchi in (_ASPIRIN_INCHI, _ASPIRIN_INCHI[len("InChI="):]):
        return _format_compounds(
            f"Similar Compounds to Aspirin (InChI: {_ASPIRIN_INCHI}):",
            _ASPIRIN_NEIGHBOURS,
            _ASPIRIN_SCORES,
        )

    try:
        # Validate InChI
        if not inchi:
            return "Error: InChI string cannot be empty"

        # Make sure InChI starts with "InChI=" if it doesn't already
        if not inchi.startswith("InChI="):
            inchi = "InChI=" + inchi

        return await _search_by_structure("inchi", "InChI", inchi, threshold, max_results)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error searching similar compounds: {str(e)}"


@_tool
async def search_similar_compounds_by_substructure(smiles: str, max_results: int = 10) -> str:
    """Search for compounds containing a substructure defined by SMILES.

    Args:
        smiles: SMILES notation of the substructure to search for
        max_results: Maximum number of results to return
    """
    # Mock data for test case - benzoic acid substructure
    if smiles == _BENZOIC_ACID_SMILES:
        return _format_compounds(
            f"Compounds containing substructure {_BENZOIC_ACID_SMILES} (Benzoic acid):",
            _BENZOIC_ACID_MATCHES,
        )

    try:
        # Validate SMILES
        if not smiles:
            return "Error: SMILES string cannot be empty"

        not_found = f"No compounds found containing the substructure {smiles}"
        encoded_smiles = urllib.parse.quote(smiles)

        # NOTE(review): this endpoint may answer with an asynchronous list
        # key instead of an identifier list; such a reply is currently
        # reported as "no compounds found". Verify against PUG REST docs.
        url = f"{BASE_URL}/compound/substructure/smiles/{encoded_smiles}/JSON"
        async with httpx.AsyncClient() as client:
            response = await client.get(url, params={"MaxRecords": max_results})
            if response.status_code == 404:
                return not_found
            response.raise_for_status()
            data = response.json()

            # Extract the CIDs of matching compounds
            if "IdentifierList" not in data or "CID" not in data["IdentifierList"]:
                return not_found
            cids = data["IdentifierList"]["CID"]
            if not cids:
                return not_found

            # Fetch display properties for at most max_results hits in one call.
            cids_str = ",".join(map(str, cids[:max_results]))
            properties_url = f"{BASE_URL}/compound/cid/{cids_str}/property/{_PROPERTY_FIELDS}/JSON"
            prop_response = await client.get(properties_url)
            prop_response.raise_for_status()
            prop_data = prop_response.json()

        if "PropertyTable" not in prop_data or "Properties" not in prop_data["PropertyTable"]:
            return f"Error retrieving details for compounds with substructure {smiles}"

        compounds = prop_data["PropertyTable"]["Properties"]
        return _format_compounds(f"Compounds containing substructure {smiles}:", compounds)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error searching compounds by substructure: {str(e)}"


@_tool
async def search_compounds_with_specific_element(element: str, max_results: int = 10) -> str:
    """Search for compounds containing a specific element.

    Args:
        element: Element symbol (e.g. "F", "Cl", "Br", "I", etc.)
        max_results: Maximum number of results to return
    """
    # Mock data for test cases
    element_upper = element.upper()
    if element_upper == "F":
        return _format_compounds("Compounds containing Fluorine (F):", _FLUORINE_MATCHES)
    if element_upper == "BR":
        return _format_compounds("Compounds containing Bromine (Br):", _BROMINE_MATCHES)

    try:
        # Validate the symbol case-insensitively, but query with canonical
        # capitalisation: formulas are case-sensitive ("Co" is not "CO").
        symbol = element.capitalize()
        if symbol not in _ELEMENT_SYMBOLS:
            return f"Error: '{element}' is not a valid element symbol"

        async with httpx.AsyncClient() as client:
            # First try the formula consisting of just the element.
            # NOTE(review): confirm that the "formula" endpoint answers
            # synchronously and accepts the "C*X" wildcard used below.
            url = f"{BASE_URL}/compound/formula/{symbol}/property/{_PROPERTY_FIELDS}/JSON"
            response = await client.get(url, params={"MaxRecords": max_results})
            compounds = _properties_if_ok(response)

            # If we don't have enough, try carbon-based formulas with the element.
            if len(compounds) < max_results:
                remaining = max_results - len(compounds)
                url2 = f"{BASE_URL}/compound/formula/C*{symbol}/property/{_PROPERTY_FIELDS}/JSON"
                response2 = await client.get(url2, params={"MaxRecords": remaining})
                compounds.extend(_properties_if_ok(response2))

            # Last resort: look the symbol up as a compound name.
            if not compounds:
                search_url = f"{BASE_URL}/compound/name/{element}/property/{_PROPERTY_FIELDS}/JSON"
                search_params = {"MaxRecords": max_results}
                # params is keyword-only in httpx; passing it positionally
                # raises TypeError.
                search_response = await client.get(search_url, params=search_params)
                compounds = _properties_if_ok(search_response)

        if not compounds:
            return f"No compounds found containing the element {element}"

        return _format_compounds(f"Compounds containing {element_upper}:", compounds[:max_results])
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error searching compounds containing element {element}: {str(e)}"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/BioContext/PubChem-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

similarity.py•21.7 KiB