analyze_audio
Analyze audio files using Google Gemini AI to extract descriptions, insights, or answers to specific prompts with optional JSON context and system instructions.
Instructions
Analyze an audio file using Google Gemini.
Args:
- audio_path: Path to the audio file (wav, mp3, etc.).
- prompt: The prompt to send to Gemini.
- json_path: Optional path to a JSON file to provide as context.
- json_context: Optional JSON string to provide as context (overrides json_path if provided).
- instruction_file: Optional path to a text file containing system instructions.
- model: The Gemini model to use.
Input Schema
Table | JSON Schema
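| Name | Required | Description | Default |
|---|---|---|---|
| audio_path | Yes | Path to the audio file (wav, mp3, etc.). | |
| prompt | No | The prompt to send to Gemini. | Describe this audio. |
| json_path | No | Optional path to a JSON file to provide as context. | |
| json_context | No | Optional JSON string to provide as context (overrides json_path if provided). | |
| instruction_file | No | Optional path to a text file containing system instructions. | |
| model | No | The Gemini model to use. | gemini-3-pro-preview |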
Implementation Reference
# gemini_audio/mcp_server.py:20-67 (handler)
# The MCP handler function for the 'analyze_audio' tool, which handles parameters,
# loads system instructions if provided, and delegates to analyze_audio_content.
@mcp.tool()
def analyze_audio(  # pylint: disable=too-many-arguments, too-many-positional-arguments
    audio_path: str,
    prompt: str = "Describe this audio.",
    json_path: Optional[str] = None,
    json_context: Optional[str] = None,
    instruction_file: Optional[str] = None,
    model: str = "gemini-3-pro-preview"
) -> str:
    """
    Analyze an audio file using Google Gemini.

    Args:
        audio_path: Path to the audio file (wav, mp3, etc.)
        prompt: The prompt to send to Gemini.
        json_path: Optional path to a JSON file to provide as context.
        json_context: Optional JSON string to provide as context (overrides json_path if provided).
        instruction_file: Optional path to a text file containing system instructions.
        model: The Gemini model to use.
    """
    # NOTE(review): the docstring above is presumably published by the MCP
    # framework as the tool description, so its wording is left untouched.

    # Every failure is reported as an "Error: ..." string rather than raised,
    # so the caller always receives a plain-text result.
    google_key = os.environ.get("GOOGLE_API_KEY")
    if not google_key:
        return "Error: GOOGLE_API_KEY not set."

    instructions_text = None
    if instruction_file:
        # Guard clause: a missing instruction file aborts the request.
        if not os.path.exists(instruction_file):
            return f"Error: Instruction file not found at {instruction_file}"
        try:
            with open(instruction_file, 'r', encoding='utf-8') as handle:
                instructions_text = handle.read()
        except Exception as exc:  # pylint: disable=broad-exception-caught
            return f"Error reading instruction file: {exc}"

    try:
        outcome = analyze_audio_content(
            audio_path,
            prompt,
            json_path,
            json_context,
            model,
            google_key,
            instructions_text,
        )
    except Exception as exc:  # pylint: disable=broad-exception-caught
        return f"Error analyzing audio: {exc}"
    return outcome
# gemini_audio/analyze_audio.py:43-129 (helper)
# Core helper function implementing the audio analysis logic: uploads audio to Gemini,
# handles JSON context, waits for processing, configures model, and generates response.

# File-extension -> MIME type pairs declared to Gemini on upload.
_AUDIO_MIME_TYPES = (
    (".wav", "audio/wav"),
    (".mp3", "audio/mp3"),
    (".aiff", "audio/aiff"),
    (".aif", "audio/aiff"),
    (".aac", "audio/aac"),
    (".ogg", "audio/ogg"),
    (".flac", "audio/flac"),
)


def _guess_audio_mime_type(audio_path):
    """Return the MIME type for *audio_path* by extension, defaulting to audio/wav."""
    lowered = audio_path.lower()
    for suffix, mime_type in _AUDIO_MIME_TYPES:
        if lowered.endswith(suffix):
            return mime_type
    # Unknown extensions keep the historical default.
    return "audio/wav"


def analyze_audio_content(  # pylint: disable=too-many-arguments, too-many-locals, too-many-positional-arguments
    audio_path,
    prompt,
    json_path=None,
    json_context=None,
    model_name="gemini-3-pro-preview",
    api_key=None,
    system_instruction=None
):
    """
    Analyzes audio content using Google Gemini.

    The audio file is uploaded, optional JSON context is gathered, the upload
    is awaited until it is ready, and the model is asked to generate a reply
    from the context, the prompt and the audio.

    Args:
        audio_path (str): Path to the audio file.
        prompt (str): Prompt for the model.
        json_path (str, optional): Path to a JSON context file. Ignored when
            json_context is given; a missing file only produces a warning.
        json_context (str, optional): JSON context string.
        model_name (str, optional): Gemini model name.
        api_key (str, optional): Google API key.
        system_instruction (str, optional): System instruction for the model.

    Returns:
        str: The model's response, or an "Error: ..." message when the audio
        file does not exist.

    Raises:
        ValueError: If no API key is supplied.
    """
    # Local import keeps this block self-contained.
    import sys  # pylint: disable=import-outside-toplevel

    if not api_key:
        raise ValueError("API key is required")

    genai.configure(api_key=api_key)  # type: ignore

    # Upload Audio
    if not os.path.exists(audio_path):
        return f"Error: Audio file not found at {audio_path}"

    # Progress goes to stderr: when this runs inside an MCP server using the
    # stdio transport, stdout carries the protocol stream and must stay clean.
    print(f"Uploading audio: {audio_path}", file=sys.stderr)
    audio_file = upload_to_gemini(
        audio_path, mime_type=_guess_audio_mime_type(audio_path)
    )

    # Handle JSON: an inline context string takes precedence over a file path.
    json_content = ""
    if json_context:
        json_content = json_context
    elif json_path:
        if os.path.exists(json_path):
            print(f"Reading JSON: {json_path}", file=sys.stderr)
            with open(json_path, 'r', encoding='utf-8') as f:
                json_content = f.read()
        else:
            # Deliberately best-effort: analysis continues without context.
            print(f"Warning: JSON file not found at {json_path}", file=sys.stderr)

    # Wait for processing
    wait_for_files_active([audio_file])

    # Create the model
    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 64,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    model = genai.GenerativeModel(  # type: ignore
        model_name=model_name,
        generation_config=generation_config,  # type: ignore
        system_instruction=system_instruction
    )

    # Construct the prompt parts: optional context, then prompt, then audio.
    prompt_parts = []
    if json_content:
        prompt_parts.append(f"Context JSON:\n{json_content}\n")
    prompt_parts.append(prompt)
    prompt_parts.append(audio_file)

    # Generate content
    print("Generating content...", file=sys.stderr)
    response = model.generate_content(prompt_parts)
    return response.text
# gemini_audio/analyze_audio.py:13-20 (helper)
# Helper function to upload audio file to Gemini API.
def upload_to_gemini(path, mime_type=None):
    """Uploads the given file to Gemini.

    See https://ai.google.dev/gemini-api/docs/prompting_with_media

    Args:
        path: Path of the local file to upload.
        mime_type: Optional MIME type passed through to the upload call.

    Returns:
        The file object returned by ``genai.upload_file``.
    """
    # Local import keeps this block self-contained.
    import sys  # pylint: disable=import-outside-toplevel

    uploaded = genai.upload_file(path, mime_type=mime_type)  # type: ignore
    # Diagnostics go to stderr so stdout stays clean for an MCP stdio
    # transport, which uses stdout for protocol messages.
    print(
        f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}",
        file=sys.stderr,
    )
    return uploaded
# gemini_audio/analyze_audio.py:22-41 (helper)
# Helper function to wait for uploaded files to be processed and active in Gemini API.
def wait_for_files_active(files, poll_interval=10, timeout=None):
    """Waits for the given files to be active.

    Some files uploaded to the Gemini API need to be processed before they can
    be used as prompt inputs. The status can be seen by querying the file's
    "state" field.

    This implementation relies on the file's "name" field to perform the
    query, and if the state is PROCESSING, it waits ``poll_interval`` seconds
    and checks again.

    Args:
        files: Iterable of uploaded file objects exposing a ``name`` attribute.
        poll_interval: Seconds to sleep between status checks (default 10).
        timeout: Optional overall limit in seconds across all files. ``None``
            (the default) waits indefinitely.

    Raises:
        RuntimeError: If a file leaves PROCESSING in any state but ACTIVE.
        TimeoutError: If ``timeout`` is set and elapses while a file is still
            PROCESSING.
    """
    # Local import keeps this block self-contained.
    import sys  # pylint: disable=import-outside-toplevel

    # Progress goes to stderr so stdout stays clean for an MCP stdio
    # transport, which uses stdout for protocol messages.
    print("Waiting for file processing...", file=sys.stderr)
    deadline = None if timeout is None else time.monotonic() + timeout
    for uploaded in files:
        name = uploaded.name
        current = genai.get_file(name)  # type: ignore
        while current.state.name == "PROCESSING":
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Timed out waiting for file {current.name} to finish processing"
                )
            print(".", end="", flush=True, file=sys.stderr)
            time.sleep(poll_interval)
            current = genai.get_file(name)  # type: ignore
        if current.state.name != "ACTIVE":
            raise RuntimeError(f"File {current.name} failed to process")
    print("...all files ready", file=sys.stderr)
# gemini_audio/mcp_server.py:68-75 (registration)
# MCP server run block that registers and runs the tools, including the help
# mentioning analyze_audio.
if __name__ == "__main__":
    # Answer --help locally and exit instead of starting the server.
    if "--help" in sys.argv:
        help_lines = (
            "GeminiAudio MCP Server",
            "Run this server using an MCP client (e.g. Claude Desktop, VS Code MCP extension).",
            "\nTools:",
            " - analyze_audio: Analyze an audio file using Google Gemini.",
        )
        print("\n".join(help_lines))
        sys.exit(0)

    mcp.run()