Skip to main content
Glama
voice_client.py (20.3 kB)
#!/usr/bin/env python3
"""
Voice client for managing Mac applications through MCP
Uses Ollama for understanding commands and MCP tools for management
"""

import json
import os
import re
import select
import subprocess
import sys
import time

# requests is needed to talk to Ollama; guard it like the other optional
# dependencies so a missing package yields a clear message, not a traceback.
try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    requests = None
    REQUESTS_AVAILABLE = False
    print("⚠️ requests is not installed. Install: pip install requests")

MCP_SERVER_PATH = os.path.join(os.path.dirname(__file__), "src", "server.py")
OLLAMA_API_URL = "http://localhost:11434"
OLLAMA_MODEL = "llama3.2"

# Seconds to wait for the MCP server subprocess to answer one request.
MCP_TIMEOUT_SECONDS = 10

# Name of the MCP tool whose "query" argument can be recovered from the
# user's own words when the model forgets to supply it.
SEARCH_TOOL_NAME = "search_google_in_safari"

# Value listen() returns when the user presses 'q'; main() treats it as a
# request to leave the loop.
EXIT_COMMAND = "exit"
EXIT_WORDS = ("exit", "quit", "stop")

try:
    import speech_recognition as sr
    SPEECH_RECOGNITION_AVAILABLE = True
except ImportError:
    SPEECH_RECOGNITION_AVAILABLE = False
    print("⚠️ speech_recognition is not installed. Install: pip install SpeechRecognition")
    print(" For voice input, pyaudio is also needed: pip install pyaudio")

try:
    import pyttsx3
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False
    print("⚠️ pyttsx3 is not installed. Install: pip install pyttsx3")
    print(" Or use macOS built-in say (already available)")


def speak(text, use_system=True):
    """Converts text to speech.

    Args:
        text: The text to say. Empty text is ignored.
        use_system: When True (default) use the macOS ``say`` command.
            When False, try pyttsx3 first and fall back to ``say`` if it is
            missing or fails.

    Never raises because of a missing ``say`` binary: speech is best-effort,
    and this function is also called from error handlers.
    """
    if not text:
        return

    if not use_system and TTS_AVAILABLE:
        try:
            engine = pyttsx3.init()
            engine.say(text)
            engine.runAndWait()
            return
        except Exception as e:
            print(f"TTS error: {e}")
            # fall through to the system command

    try:
        # Use built-in macOS say command
        subprocess.run(["say", text], check=False)
    except OSError as e:
        # e.g. FileNotFoundError when not running on macOS
        print(f"TTS error: {e}")


def _wait_for_activation(activation_key):
    """Blocks until the user chooses what to do next.

    Returns one of:
        "record" - start voice recording (space, Enter or the activation key)
        "text"   - switch to typed input (ESC)
        "quit"   - leave the assistant ('q')
    """
    start_keys = {' ', '\n', '\r'}
    if isinstance(activation_key, str) and len(activation_key) == 1:
        start_keys.add(activation_key)

    try:
        import termios
        import tty
    except ImportError:
        # Systems without termios (e.g., Windows)
        termios = None

    if termios is None or not sys.stdin.isatty():
        # Single-key reading is impossible here; fall back to line input.
        key = input("Press Enter to record voice: ")
        return "quit" if key.strip().lower() == 'q' else "record"

    fd = sys.stdin.fileno()
    old_settings = termios.tcgetattr(fd)
    try:
        # cbreak mode delivers key presses immediately, without Enter
        tty.setcbreak(fd)
        while True:
            # A short timeout lets the loop sleep instead of spinning the CPU.
            ready, _, _ = select.select([sys.stdin], [], [], 0.1)
            if not ready:
                continue
            key = sys.stdin.read(1)
            if key in start_keys:
                return "record"
            if key == 'q':
                return "quit"
            if key == '\x1b':  # ESC
                return "text"
    finally:
        # Always hand the terminal back in its original state.
        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)


def listen(use_microphone=True, activation_key='space'):
    """
    Listens to voice input and converts it to text
    Activated by pressing a key (default is space)

    Args:
        use_microphone: When False (or when speech_recognition is missing)
            the text is read from the keyboard instead.
        activation_key: 'space', 'enter', or a single character that starts
            the recording. Space and Enter are always accepted.

    Returns:
        The recognized (or typed) text; ``EXIT_COMMAND`` when the user
        pressed 'q'; ``None`` when nothing could be recognized.
    """
    if not use_microphone or not SPEECH_RECOGNITION_AVAILABLE:
        # Alternative: use text input
        return input("You: ")

    if activation_key == 'space':
        print("⌨️ Press SPACE to start voice recording (ESC for text input, q to quit)")
    elif activation_key == 'enter':
        print("⌨️ Press ENTER to start voice recording (ESC for text input, q to quit)")
    else:
        print(f"⌨️ Press {activation_key.upper()} to start voice recording (ESC for text input, q to quit)")

    action = _wait_for_activation(activation_key)
    if action == "quit":
        return EXIT_COMMAND
    if action == "text":
        return input("\nYou: ")

    print("\n🎤 Recording... (speak, recording stops when you pause)")
    recognizer = sr.Recognizer()
    try:
        with sr.Microphone() as source:
            # Adapt to ambient noise (faster for button activation)
            recognizer.adjust_for_ambient_noise(source, duration=0.3)
            # Generous limits since the user already pressed the button
            audio = recognizer.listen(source, timeout=30, phrase_time_limit=30)
    except sr.WaitTimeoutError:
        print("⏱️ Timeout. Didn't hear command.")
        return None

    # The microphone is released before the (network) recognition call.
    print("🔄 Recognizing speech...")
    try:
        # Use Google Speech Recognition (requires internet)
        text = recognizer.recognize_google(audio, language="en-US")
    except sr.UnknownValueError:
        print("❌ Could not recognize speech")
        return None
    except sr.RequestError as e:
        print(f"❌ Speech recognition service error: {e}")
        print("💡 Use text input or install offline recognition")
        return None

    print(f"📝 Recognized: {text}")
    return text


def _send_mcp_request(request):
    """Starts the MCP server, sends one JSON-RPC request, returns the replies.

    Returns:
        A list of the JSON objects (dicts) found on the server's stdout, one
        per line; lines that are not JSON objects are ignored.

    Raises:
        subprocess.TimeoutExpired: the server did not finish in time (the
            child process is killed and reaped before re-raising).
        OSError: the server process could not be started.
    """
    process = subprocess.Popen(
        ["python3", MCP_SERVER_PATH],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    try:
        stdout, _stderr = process.communicate(
            input=json.dumps(request) + "\n", timeout=MCP_TIMEOUT_SECONDS
        )
    except subprocess.TimeoutExpired:
        process.kill()
        process.communicate()  # reap the child and close its pipes
        raise

    responses = []
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            responses.append(parsed)
    return responses


def call_mcp_tool(tool_name, arguments):
    """Calls MCP tool via JSON-RPC

    Args:
        tool_name: Name of the MCP tool.
        arguments: Dict of arguments for the tool.

    Returns:
        The text of the first content item of the result, or a
        human-readable error string. Never raises.
    """
    request = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": tool_name,
            "arguments": arguments,
        },
    }
    try:
        for response in _send_mcp_request(request):
            if "result" in response:
                content = response["result"].get("content", [])
                if content:
                    return content[0].get("text", "")
            if "error" in response:
                return f"Error: {response['error'].get('message', 'Unknown error')}"
        return "No response from MCP server"
    except subprocess.TimeoutExpired:
        return "Timeout when calling MCP tool"
    except Exception as e:
        return f"Error calling MCP tool: {str(e)}"


# Ordered most-specific first: "search for X" must be tried before
# "search X", otherwise the captured query would start with "for".
_GOOGLE_PATTERNS = tuple(re.compile(p) for p in (
    r'find\s+(.+?)\s+in\s+google',
    r'search\s+for\s+(.+?)\s+in\s+google',
    r'search\s+(.+?)\s+in\s+google',
    r'look\s+up\s+(.+?)\s+in\s+google',
    r'search\s+google\s+for\s+(.+?)$',
    r'google\s+(.+?)$',
))
_SIMPLE_PATTERNS = tuple(re.compile(p) for p in (
    r'^find\s+(.+)$',
    r'^search\s+for\s+(.+)$',
    r'^search\s+(.+)$',
    r'^look\s+up\s+(.+)$',
))
_LEADING_VERBS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r'^find\s+',
    r'^search\s+for\s+',
    r'^search\s+',
    r'^look\s+up\s+',
))
_TRAILING_IN_GOOGLE = re.compile(r'\s+in\s+google.*$', re.IGNORECASE)


def extract_search_query(user_query):
    """Extracts search query from user text

    Recognizes phrases such as "find X in Google", "search for X",
    "look up X" and "search Google for X".

    Args:
        user_query: The user's request; may be empty or None.

    Returns:
        The search terms (lower-cased when one of the phrase patterns
        matched, otherwise in the user's original case), or None when
        there is nothing to search for.
    """
    if not user_query:
        return None

    user_query = user_query.strip()
    query_lower = user_query.lower()

    for pattern in _GOOGLE_PATTERNS + _SIMPLE_PATTERNS:
        match = pattern.search(query_lower)
        if not match:
            continue
        # Remove "in google" in case the pattern captured it
        query = _TRAILING_IN_GOOGLE.sub('', match.group(1).strip()).strip()
        if query:
            return query

    # No pattern matched: use the whole request minus the service words.
    query = user_query
    for verb in _LEADING_VERBS:
        query = verb.sub('', query)
    query = _TRAILING_IN_GOOGLE.sub('', query).strip()
    return query or None


def list_mcp_tools():
    """Gets list of available MCP tools

    Returns:
        The "tools" list from the server's tools/list result, or an empty
        list when the server cannot be reached or answers unexpectedly.
    """
    request = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/list",
        "params": {},
    }
    try:
        for response in _send_mcp_request(request):
            result = response.get("result")
            if isinstance(result, dict) and "tools" in result:
                return result["tools"]
        return []
    except Exception as e:
        print(f"Error getting list of tools: {e}", file=sys.stderr)
        return []


def _find_tool_call(answer):
    """Finds the first JSON object with a "tool" key inside the model answer.

    The answer is scanned from every "{" with a real JSON decoder, so
    arbitrarily nested arguments and surrounding prose are both handled.

    Returns:
        (tool_name, arguments_dict) or None when no tool call is present.
    """
    decoder = json.JSONDecoder()
    start = answer.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(answer, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and isinstance(candidate.get("tool"), str):
            arguments = candidate.get("arguments")
            if not isinstance(arguments, dict):
                arguments = {}
            return candidate["tool"], arguments
        start = answer.find("{", start + 1)
    return None


def _run_tool(tool_name, tool_args, verbose):
    """Calls one MCP tool, optionally logging it; returns (result, True)."""
    if verbose:
        print(f"🔧 Calling tool: {tool_name}")
        print(f"📝 Arguments: {tool_args}")
    return call_mcp_tool(tool_name, tool_args), True


def ask_ollama_with_tools(user_query, verbose=False):
    """Uses Ollama to understand the request and call appropriate MCP tools

    Args:
        user_query: The user's request in natural language.
        verbose: When True, print the model answer and the tool call.

    Returns:
        A ``(text, is_action)`` tuple. ``is_action`` is True when an MCP
        tool was called and ``text`` is its result; otherwise ``text`` is
        the model's plain answer or an error message.
    """
    if not REQUESTS_AVAILABLE:
        return "❌ requests is not installed. Install: pip install requests", False

    tools = list_mcp_tools()
    tools_description = "\n".join(
        f"- {tool.get('name', '')}: {tool.get('description', '')}"
        for tool in tools
        if isinstance(tool, dict)
    )

    system_prompt = f"""You are an assistant that can manage Mac applications through MCP tools.

Available tools:
{tools_description}

When the user asks to open an application, perform an action, or get information, determine which tool to use and return JSON in the format:
{{
  "tool": "tool_name",
  "arguments": {{"parameter": "value"}}
}}

If the request doesn't require using tools, just respond with regular text.

Examples:
- "Open Calculator" -> {{"tool": "open_application", "arguments": {{"appName": "Calculator"}}}}
- "Close Safari" -> {{"tool": "quit_application", "arguments": {{"appName": "Safari"}}}}
- "What applications are running?" -> {{"tool": "get_running_applications", "arguments": {{}}}}
- "Open MongoDB Compass" -> {{"tool": "open_application", "arguments": {{"appName": "MongoDB Compass"}}}}
- "Create database test" -> {{"tool": "mongodb_create_database", "arguments": {{"databaseName": "test"}}}}
- "Create collection users in database test" -> {{"tool": "mongodb_create_collection", "arguments": {{"databaseName": "test", "collectionName": "users"}}}}
- "Add document {{\"name\": \"John\"}} to collection users in database test" -> {{"tool": "mongodb_insert_document", "arguments": {{"databaseName": "test", "collectionName": "users", "document": "{{\\\"name\\\": \\\"John\\\"}}"}}}}
- "Find apple image in Google" -> {{"tool": "search_google_in_safari", "arguments": {{"query": "apple image"}}}}
- "Search Google for Python" -> {{"tool": "search_google_in_safari", "arguments": {{"query": "Python"}}}}
- "Find information about MCP in Google" -> {{"tool": "search_google_in_safari", "arguments": {{"query": "MCP"}}}}

IMPORTANT:
- For search_google_in_safari always extract the search query from the user's text and pass it in the "query" parameter. If the user says "find X in Google" or "search Y", then query should be "X" or "Y".
- ALWAYS return ONLY a valid JSON object in the format {{"tool": "...", "arguments": {{...}}}}. DO NOT return just text or tool name without JSON. DO NOT return empty arguments.

Respond ONLY with JSON, without additional explanations or text."""

    try:
        response = requests.post(
            f"{OLLAMA_API_URL}/api/generate",
            json={
                "model": OLLAMA_MODEL,
                "prompt": f"{system_prompt}\n\nUser: {user_query}\nAssistant:",
                "stream": False,
                "options": {
                    "temperature": 0.3,
                    # Ollama's name for the output-token limit
                    "num_predict": 500,
                },
            },
            timeout=30,
        )
        if response.status_code != 200:
            return f"Ollama error: {response.status_code}", False

        answer = response.json().get("response", "").strip()
        if verbose:
            print(f"🤖 Ollama response: {answer}")

        tool_call = _find_tool_call(answer)
        if tool_call is not None:
            tool_name, tool_args = tool_call
            # Fallback: the model named the search tool but gave no query,
            # so recover the query from the user's own words.
            if tool_name == SEARCH_TOOL_NAME and not tool_args.get("query"):
                query = extract_search_query(user_query)
                if query:
                    tool_args["query"] = query
                    if verbose:
                        print(f"🔍 Extracted search query from text: '{query}'")
                elif verbose:
                    print(f"⚠️ Failed to extract search query from: '{user_query}'")
            return _run_tool(tool_name, tool_args, verbose)

        # No JSON tool call: the model may have answered with just the
        # tool name.
        if "search_google" in answer.lower():
            query = extract_search_query(user_query)
            if query:
                return _run_tool(SEARCH_TOOL_NAME, {"query": query}, verbose)

        return answer, False
    except requests.exceptions.ConnectionError:
        return "❌ Failed to connect to Ollama. Start: ollama serve", False
    except Exception as e:
        return f"Error: {str(e)}", False


def main():
    """Runs the interactive assistant loop until the user exits."""
    print("🎤 Voice assistant for managing Mac applications")
    print("=" * 60)
    print(f"📦 Model: {OLLAMA_MODEL}")
    print(f"🌐 Ollama: {OLLAMA_API_URL}")
    print("=" * 60)
    print()

    # Check availability
    if not SPEECH_RECOGNITION_AVAILABLE:
        print("💡 For voice input, install:")
        print(" pip install SpeechRecognition pyaudio")
        print()
        print("📝 Text input will be used now")
        print()
        use_voice_input = False
    else:
        use_voice_input = True
        print("✅ Voice input available")

    print("✅ Voice output available (via macOS say)")
    print()

    while True:
        try:
            # Voice or text input
            if use_voice_input:
                query = listen()
                if query is None:
                    continue
            else:
                query = input("You (or 'exit' to quit): ").strip()

            if not query:
                continue

            if query.lower() in EXIT_WORDS:
                speak("Goodbye!")
                print("👋 Goodbye!")
                break

            print(f"\n💬 Request: {query}")
            print("-" * 60)

            # Process request
            result, is_action = ask_ollama_with_tools(query, verbose=True)
            print(f"\n📋 Result: {result}")

            # Voice output of result
            if is_action:
                # For actions, speak only the first line
                speak(result.split('\n')[0])
            elif len(result) < 200:
                # For regular answers, speak entire text (if short)
                speak(result)
            else:
                speak("Result shown on screen")

            print()
            time.sleep(0.5)  # Small pause between commands
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed, so no further input can arrive
            print("\n\n👋 Interrupted by user")
            speak("Goodbye!")
            break
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            print(f"\n❌ {error_msg}")
            speak("An error occurred")
            time.sleep(1)


if __name__ == "__main__":
    main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/TrueOleg/MCP-expirements'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.