server.py•3.04 kB
import subprocess
import json
from fastmcp import FastMCP
# Module-level MCP server instance. The `@server.tool()` decorators further
# down in this file are applied through it, and `server.run()` starts it when
# the file is executed as a script.
server = FastMCP(
# Name given to the FastMCP server.
name="HPC/AI MCP Server",
# Free-text description handed to FastMCP as the server's instructions.
instructions="""
The Azure HPC/AI Model Context Protocol (MCP) Server is a centralized
platform designed to streamline the deployment and management of agent
skills within Azure’s HPC and AI environments.
"""
)
@server.tool()
def list_nodes() -> list[dict]:
    """
    List the nodes in the Kubernetes GPU node pool.

    Runs `kubectl get nodes -l accelerator=nvidia -o json` and condenses each
    returned node object into a small summary.

    Returns:
        A list with one dict per node:
        - name: node name ("" if the node object carries no name)
        - labels: node labels (empty dict if the node has none)
        - status: "Ready" if the node has a Ready condition whose status is
          "True", otherwise "NotReady"

        Failures are reported in-band instead of being raised:
        - kubectl exits non-zero: `[{"error": <kubectl stderr>}]`
        - any other failure (kubectl missing, timeout, invalid JSON, ...):
          `[{"error": <exception text>}]`
        - no matching nodes: `[{"message": "No GPU nodes found."}]`
    """
    try:
        completed = subprocess.run(
            ["kubectl", "get", "nodes", "-l", "accelerator=nvidia", "-o", "json"],
            capture_output=True,
            text=True,
            # Bound the call so an unreachable API server cannot hang the
            # tool forever; expiry raises TimeoutExpired, handled below.
            timeout=60,
        )
        if completed.returncode != 0:
            return [{"error": completed.stderr.strip()}]

        data = json.loads(completed.stdout)
        nodes = []
        # `or` fallbacks cover both a missing key and an explicit JSON null,
        # so one incomplete node object cannot fail the whole listing.
        for item in data.get("items") or []:
            meta = item.get("metadata") or {}
            conditions = (item.get("status") or {}).get("conditions") or []
            ready = any(
                cond.get("type") == "Ready" and cond.get("status") == "True"
                for cond in conditions
            )
            nodes.append({
                "name": meta.get("name", ""),
                "labels": meta.get("labels") or {},
                "status": "Ready" if ready else "NotReady",
            })
        return nodes if nodes else [{"message": "No GPU nodes found."}]
    except Exception as e:
        # Tool boundary: hand the failure back to the caller as data rather
        # than letting the exception escape the tool call.
        return [{"error": str(e)}]
@server.tool()
def get_node_topologies() -> list[dict]:
    """
    Return agentpool, pkey and torset for each GPU node.

    Runs `kubectl get nodes -l accelerator=nvidia -o json` and reads these
    node labels:
    - kubernetes.azure.com/agentpool
    - ib/pkey
    - ib/torset

    Returns:
        A list with one dict per node, shaped as
        { "name": str, "agentpool": str|None, "pkey": str|None, "torset": str|None }
        where a value is None when the node does not carry that label and
        name is "" when the node object carries no name.

        Failures are reported in-band instead of being raised:
        - kubectl exits non-zero: `[{"error": <kubectl stderr>}]`
        - any other failure (kubectl missing, timeout, invalid JSON, ...):
          `[{"error": <exception text>}]`
        - no matching nodes: `[{"message": "No GPU nodes found."}]`
    """
    try:
        completed = subprocess.run(
            # Restrict the query to GPU nodes, as the docstring and the
            # "No GPU nodes found." message promise; the selector is the
            # same one list_nodes uses.
            ["kubectl", "get", "nodes", "-l", "accelerator=nvidia", "-o", "json"],
            capture_output=True,
            text=True,
            # Bound the call so an unreachable API server cannot hang the
            # tool forever; expiry raises TimeoutExpired, handled below.
            timeout=60,
        )
        if completed.returncode != 0:
            return [{"error": completed.stderr.strip()}]

        data = json.loads(completed.stdout)
        results = []
        # `or` fallbacks cover both a missing key and an explicit JSON null.
        for item in data.get("items") or []:
            meta = item.get("metadata") or {}
            labels = meta.get("labels") or {}
            results.append({
                "name": meta.get("name", ""),
                "agentpool": labels.get("kubernetes.azure.com/agentpool"),
                "pkey": labels.get("ib/pkey"),
                "torset": labels.get("ib/torset"),
            })
        return results if results else [{"message": "No GPU nodes found."}]
    except Exception as e:
        # Tool boundary: hand the failure back to the caller as data rather
        # than letting the exception escape the tool call.
        return [{"error": str(e)}]
if __name__ == "__main__":
    # Script entry point. No transport is passed, so the server runs on its
    # default transport (stdio) and MCP tool calls travel over stdio.
    server.run()