#!/usr/bin/env python3
import asyncio
import logging
import json
from typing import Any, Sequence
from mcp.server import Server
try:
import httpx
except ImportError:
httpx = None
from mcp.types import (
CallToolRequest,
CallToolResult,
ListToolsRequest,
ListToolsResult,
Tool,
TextContent,
)
from mcp.server.stdio import stdio_server
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Create server instance
server = Server("llm-integration-server")
@server.list_tools()
async def list_tools() -> list[Tool]:
    """Return the catalog of tools this server exposes to LLM clients."""
    # JSON schema for the llm_predict tool's arguments.
    predict_schema = {
        "type": "object",
        "properties": {
            "prompt": {
                "type": "string",
                "description": "The text prompt to send to the LLM",
            },
            "max_tokens": {
                "type": "integer",
                "description": "Maximum number of tokens to generate",
                "default": 100,
            },
        },
        "required": ["prompt"],
    }
    # JSON schema for the echo tool's arguments.
    echo_schema = {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "Text to echo back",
            },
        },
        "required": ["text"],
    }
    return [
        Tool(
            name="llm_predict",
            description="Process text input through a local LLM",
            inputSchema=predict_schema,
        ),
        Tool(
            name="echo",
            description="Echo back the input text for testing",
            inputSchema=echo_schema,
        ),
    ]
@server.call_tool()
async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
    """Handle a tool invocation from the MCP client.

    Args:
        name: Tool name; one of "llm_predict" or "echo".
        arguments: Arguments matching the tool's declared inputSchema.

    Returns:
        A single-element list of TextContent holding the tool's output.

    Raises:
        ValueError: If *name* does not match a known tool.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info("Tool called: %s with arguments: %s", name, arguments)
    if name == "llm_predict":
        prompt = arguments.get("prompt", "")
        max_tokens = arguments.get("max_tokens", 100)
        # Delegate to the Ollama-backed helper defined below.
        response = await perform_llm_inference(prompt, max_tokens)
        return [
            TextContent(
                type="text",
                text=f"LLM Response: {response}",
            )
        ]
    elif name == "echo":
        text = arguments.get("text", "")
        return [
            TextContent(
                type="text",
                text=f"Echo: {text}",
            )
        ]
    else:
        raise ValueError(f"Unknown tool: {name}")
async def perform_llm_inference(
    prompt: str,
    max_tokens: int = 100,
    model: str = "llama3.2",
) -> str:
    """Generate a completion for *prompt* via a local Ollama instance.

    Make sure Ollama is running first: ``ollama serve``.

    Args:
        prompt: Text prompt to complete.
        max_tokens: Maximum number of tokens to generate.
        model: Ollama model name to use (defaults to "llama3.2").

    Returns:
        The generated text on success, or a human-readable error string
        on any failure (missing dependency, connection error, bad status).
    """
    # Guard the optional dependency OUTSIDE the try block: if httpx were
    # None, the `except httpx.ConnectError` clause below would itself
    # raise AttributeError while handling an exception.
    if httpx is None:
        return "Error: httpx not installed. Run: pip install httpx"
    # Ollama's non-streaming generate endpoint.
    ollama_url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # request one complete JSON response, not a token stream
        "options": {
            "num_predict": max_tokens
        }
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(ollama_url, json=payload)
            if response.status_code == 200:
                result = response.json()
                return result.get("response", "No response generated")
            else:
                return f"Error: Ollama request failed with status {response.status_code}"
    except httpx.ConnectError:
        return "Error: Could not connect to Ollama. Make sure Ollama is running (ollama serve)"
    except Exception as e:
        # Lazy %-style args keep formatting off the hot path.
        logger.error("LLM inference error: %s", e)
        return f"Error during LLM inference: {str(e)}"
async def main() -> None:
    """Launch the MCP server over stdio and serve until the client disconnects."""
    logger.info("Starting MCP LLM Integration Server...")
    # stdio transport: the client drives the server through stdin/stdout.
    async with stdio_server() as streams:
        reader, writer = streams
        init_options = server.create_initialization_options()
        await server.run(reader, writer, init_options)
# Entry point: run the asyncio event loop until the server shuts down.
if __name__ == "__main__":
    asyncio.run(main())