analyze_audio.py
""" Module for analyzing audio content using Google Gemini. """ import os import time import argparse from dotenv import load_dotenv import google.generativeai as genai # Load environment variables load_dotenv() def upload_to_gemini(path, mime_type=None): """Uploads the given file to Gemini. See https://ai.google.dev/gemini-api/docs/prompting_with_media """ file = genai.upload_file(path, mime_type=mime_type) # type: ignore print(f"Uploaded file '{file.display_name}' as: {file.uri}") return file def wait_for_files_active(files): """Waits for the given files to be active. Some files uploaded to the Gemini API need to be processed before they can be used as prompt inputs. The status can be seen by querying the file's "state" field. This implementation relies on the file's "name" field to perform the query, and if the state is not ACTIVE, it waits 10 seconds and checks again. """ print("Waiting for file processing...") for name in (file.name for file in files): file = genai.get_file(name) # type: ignore while file.state.name == "PROCESSING": print(".", end="", flush=True) time.sleep(10) file = genai.get_file(name) # type: ignore if file.state.name != "ACTIVE": raise RuntimeError(f"File {file.name} failed to process") print("...all files ready") def analyze_audio_content( # pylint: disable=too-many-arguments, too-many-locals, too-many-positional-arguments audio_path, prompt, json_path=None, json_context=None, model_name="gemini-3-pro-preview", api_key=None, system_instruction=None ): """ Analyzes audio content using Google Gemini. Args: audio_path (str): Path to the audio file. prompt (str): Prompt for the model. json_path (str, optional): Path to a JSON context file. json_context (str, optional): JSON context string. model_name (str, optional): Gemini model name. api_key (str, optional): Google API key. system_instruction (str, optional): System instruction for the model. Returns: str: The model's response. """ if not api_key: raise ValueError("API key is required") genai.configure(api_key=api_key) # type: ignore files_to_upload = [] # Upload Audio if not os.path.exists(audio_path): return f"Error: Audio file not found at {audio_path}" print(f"Uploading audio: {audio_path}") # Simple mime type detection or default to wav/mp3 mime_type = "audio/wav" if audio_path.lower().endswith(".mp3"): mime_type = "audio/mp3" audio_file = upload_to_gemini(audio_path, mime_type=mime_type) files_to_upload.append(audio_file) # Handle JSON json_content = "" if json_context: json_content = json_context elif json_path: if os.path.exists(json_path): print(f"Reading JSON: {json_path}") with open(json_path, 'r', encoding='utf-8') as f: json_content = f.read() else: print(f"Warning: JSON file not found at {json_path}") # Wait for processing wait_for_files_active(files_to_upload) # Create the model generation_config = { "temperature": 1, "top_p": 0.95, "top_k": 64, "max_output_tokens": 8192, "response_mime_type": "text/plain", } model = genai.GenerativeModel( # type: ignore model_name=model_name, generation_config=generation_config, # type: ignore system_instruction=system_instruction ) # Construct the prompt parts prompt_parts = [] if json_content: prompt_parts.append(f"Context JSON:\n{json_content}\n") prompt_parts.append(prompt) prompt_parts.append(audio_file) # Generate content print("Generating content...") response = model.generate_content(prompt_parts) return response.text def main(): """ Main function for command line usage. 
""" parser = argparse.ArgumentParser(description="Upload audio/JSON and prompt Gemini.") parser.add_argument("--audio", required=True, help="Path to the audio file") parser.add_argument("--json", help="Path to the JSON file (optional)") parser.add_argument("--prompt", default="Describe this audio.", help="The prompt to send") parser.add_argument("--model", default="gemini-3-pro-preview", help="The model to use") args = parser.parse_args() api_key = os.getenv("GOOGLE_API_KEY") if not api_key: print("Error: GOOGLE_API_KEY environment variable not set.") return try: result = analyze_audio_content( audio_path=args.audio, prompt=args.prompt, json_path=args.json, json_context=None, model_name=args.model, api_key=api_key ) print("\nResponse:") print(result) except Exception as e: # pylint: disable=broad-exception-caught print(f"Error: {e}") if __name__ == "__main__": main()
