# 'transformers' is an optional heavy dependency: guard the import so the
# tool can still be registered (and the module imported) without it.
# Availability is re-checked at model-load time in MoleculeCaptioner.
try:
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False

from ..utils.base_tool import BaseTool
from ..utils.errors import ChemMCPInputError, ChemMCPToolInitError
from ..utils.mcp_app import ChemMCPManager, run_mcp_server
@ChemMCPManager.register_tool
class MoleculeCaptioner(BaseTool):
    """Generate a natural-language caption for a molecule with MolT5.

    The MolT5 tokenizer and model are loaded lazily on first use, so the
    tool can be registered (and the module imported) without the optional
    'transformers' dependency installed.
    """

    __version__ = "0.1.0"
    name = "MoleculeCaptioner"
    func_name = 'caption_molecule'
    description = "Generate a natural language caption for a molecule with MolT5 based on its SMILES representation."
    implementation_description = "Uses [the MolT5-large model](laituan245/molt5-large-smiles2caption), a transformer-based neural network trained on molecule-text pairs, to generate natural language descriptions from SMILES representations."
    categories = ["Molecule"]
    tags = ["Molecular Information", "Text", "Neural Networks", "SMILES"]
    required_envs = []
    text_input_sig = [('smiles', 'str', 'N/A', 'SMILES representation of the molecule.')]
    code_input_sig = [('smiles', 'str', 'N/A', 'SMILES representation of the molecule.')]
    output_sig = [('caption', 'str', 'Natural language description of the molecule.')]
    examples = [
        {'code_input': {'smiles': 'CCOC(=O)c1ccc(cc1)N'}, 'text_input': {'smiles': 'CCOC(=O)c1ccc(cc1)N'}, 'output': {'caption': 'The molecule is an ethyl ester resulting from the formal condensation of the carboxy group of 4-aminobenzoic acid with ethanol. It has a role as a topical anaesthetic, an allergen, an antimalarial, an antirheumatic drug, a cardiovascular drug, a drug allergen, a drug metabolite, a hepatotoxic agent, a local anaesthetic, a non-steroidal anti-inflammatory drug, an ophthalmology drug, a plant metabolite and a topical ophthalmic anaesthetic.'}},
    ]
    oss_dependencies = [
        ("MolT5", "https://github.com/blender-nlp/MolT5", "BSD 3-Clause")
    ]
    services_and_software = []

    def __init__(self, init: bool = True, interface: str = 'code'):
        # Defer module initialization until the tool is actually used, so
        # that registering the tool does not require the 'transformers'
        # library. The `init` argument is accepted for signature
        # compatibility with BaseTool but is intentionally ignored:
        # initialization is always lazy for this tool.
        super().__init__(init=False, interface=interface)

    def _init_modules(self):
        """Load the pre-trained MolT5 tokenizer and model.

        Raises:
            ChemMCPToolInitError: If the 'transformers' library is not
                installed.
        """
        if not TRANSFORMERS_AVAILABLE:
            raise ChemMCPToolInitError("The 'transformers' library is required for MoleculeCaptioner. Please install it with: pip install transformers")
        # Load pre-trained model and tokenizer
        model_name = "laituan245/molt5-large-smiles2caption"
        self.tokenizer = T5Tokenizer.from_pretrained(model_name, model_max_length=512)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def _run_base(self, smiles: str) -> str:
        """Caption a molecule given its SMILES string.

        Args:
            smiles: SMILES representation of the molecule.

        Returns:
            A natural-language description of the molecule.

        Raises:
            ChemMCPInputError: If `smiles` is not a valid SMILES string.
            ChemMCPToolInitError: If the 'transformers' library is not
                installed (raised by _init_modules on first use).
        """
        # Validate the input before the (potentially very expensive) lazy
        # model download/load, so invalid input fails fast.
        from ..tool_utils.smiles import is_smiles
        if not is_smiles(smiles):
            raise ChemMCPInputError("The input is not a valid SMILES string.")
        # Lazy initialization on first use. _init_modules itself raises
        # ChemMCPToolInitError when 'transformers' is unavailable, so no
        # separate availability check is needed here.
        if not hasattr(self, 'tokenizer') or not hasattr(self, 'model'):
            self._init_modules()
        input_ids = self.tokenizer(smiles, return_tensors="pt").input_ids
        # Beam search with early stopping; 512 tokens matches the
        # tokenizer's model_max_length set in _init_modules.
        outputs = self.model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
        caption = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return caption
# Script entry point: serve the registered ChemMCP tools over MCP.
if __name__ == "__main__":
    run_mcp_server()