This file is a merged representation of a subset of the codebase, containing files not matching ignore patterns, combined into a single document by Repomix.
<file_summary>
This section contains a summary of this file.
<purpose>
This file contains a packed representation of a subset of the repository's contents.
It is designed to be easily consumable by AI systems for analysis, code review,
or other automated processes.
</purpose>
<file_format>
The content is organized as follows:
1. This summary section
2. Repository information
3. Directory structure
4. Repository files, each consisting of:
- File path as an attribute
- Full contents of the file
</file_format>
<usage_guidelines>
- This file should be treated as read-only. Any changes should be made to the
original repository files, not this packed version.
- When processing this file, use the file path to distinguish
between different files in the repository.
- Be aware that this file may contain sensitive information. Handle it with
the same level of security as you would the original repository.
</usage_guidelines>
<notes>
- Some files may have been excluded based on .gitignore rules and Repomix's configuration
- Binary files are not included in this packed representation. Please refer to the Directory Structure section for a complete list of file paths, including binary files
- Files matching these patterns are excluded: server/modules/exbench_module.py
- Files matching patterns in .gitignore are excluded
- Files matching default ignore patterns are excluded
- Files are sorted by Git change count (files with more changes are at the bottom)
</notes>
<additional_info>
</additional_info>
</file_summary>
<directory_structure>
__init__.py
anthropic_llm.py
data_types.py
deepseek_llm.py
exbench_module.py
execution_evaluators.py
fireworks_llm.py
gemini_llm.py
llm_models.py
ollama_llm.py
openai_llm.py
tools.py
</directory_structure>
<files>
This section contains the contents of the repository's files.
<file path="__init__.py">
# Empty file to make tests a package
</file>
<file path="anthropic_llm.py">
import anthropic
import os
import json
from modules.data_types import ModelAlias, PromptResponse, ToolsAndPrompts
from utils import MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS, parse_markdown_backticks
from modules.data_types import (
SimpleToolCall,
ToolCallResponse,
BenchPromptResponse,
)
from utils import timeit
from modules.tools import (
anthropic_tools_list,
run_coder_agent,
run_git_agent,
run_docs_agent,
all_tools_list,
)
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Initialize Anthropic client
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
def get_anthropic_cost(model: str, input_tokens: int, output_tokens: int) -> float:
"""
Calculate the cost for Anthropic API usage.
Args:
model: The model name/alias used
input_tokens: Number of input tokens
output_tokens: Number of output tokens
Returns:
float: Total cost in dollars
"""
cost_map = MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS.get(model)
if not cost_map:
return 0.0
input_cost = (input_tokens / 1_000_000) * cost_map["input"]
output_cost = (output_tokens / 1_000_000) * cost_map["output"]
return round(input_cost + output_cost, 6)
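# Worked example (hypothetical pricing): if the cost map for a model were
# {"input": 3.0, "output": 15.0} dollars per million tokens, then 1,000 input
# tokens and 500 output tokens cost (1000/1e6)*3.0 + (500/1e6)*15.0 = 0.0105.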
def text_prompt(prompt: str, model: str) -> PromptResponse:
"""
Send a prompt to Anthropic and get a response.
"""
try:
with timeit() as t:
message = anthropic_client.messages.create(
model=model,
max_tokens=2048,
messages=[{"role": "user", "content": prompt}],
)
elapsed_ms = t()
input_tokens = message.usage.input_tokens
output_tokens = message.usage.output_tokens
cost = get_anthropic_cost(model, input_tokens, output_tokens)
return PromptResponse(
response=message.content[0].text,
runTimeMs=elapsed_ms,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"Anthropic error: {str(e)}")
return PromptResponse(
response=f"Error: {str(e)}", runTimeMs=0.0, inputAndOutputCost=0.0
)
def bench_prompt(prompt: str, model: str) -> BenchPromptResponse:
"""
Send a prompt to Anthropic and get detailed benchmarking response.
"""
try:
with timeit() as t:
message = anthropic_client.messages.create(
model=model,
max_tokens=2048,
messages=[{"role": "user", "content": prompt}],
)
elapsed_ms = t()
input_tokens = message.usage.input_tokens
output_tokens = message.usage.output_tokens
cost = get_anthropic_cost(model, input_tokens, output_tokens)
return BenchPromptResponse(
response=message.content[0].text,
tokens_per_second=0.0, # Anthropic doesn't provide this info
provider="anthropic",
total_duration_ms=elapsed_ms,
load_duration_ms=0.0,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"Anthropic error: {str(e)}")
return BenchPromptResponse(
response=f"Error: {str(e)}",
tokens_per_second=0.0,
provider="anthropic",
total_duration_ms=0.0,
load_duration_ms=0.0,
inputAndOutputCost=0.0,
errored=True,
)
def tool_prompt(prompt: str, model: str) -> ToolCallResponse:
"""
Run a chat model with tool calls using Anthropic's Claude.
Now supports JSON structured output variants by parsing the response.
"""
with timeit() as t:
if "-json" in model:
# Standard message request but expecting JSON response
message = anthropic_client.messages.create(
model=model.replace("-json", ""),
max_tokens=2048,
messages=[{"role": "user", "content": prompt}],
)
try:
# Parse raw response text into ToolsAndPrompts model
parsed_response = ToolsAndPrompts.model_validate_json(
parse_markdown_backticks(message.content[0].text)
)
tool_calls = [
SimpleToolCall(
tool_name=tap.tool_name, params={"prompt": tap.prompt}
)
for tap in parsed_response.tools_and_prompts
]
except Exception as e:
print(f"Failed to parse JSON response: {e}")
tool_calls = []
else:
# Original implementation for function calling
message = anthropic_client.messages.create(
model=model,
max_tokens=2048,
messages=[{"role": "user", "content": prompt}],
tools=anthropic_tools_list,
tool_choice={"type": "any"},
)
# Extract tool calls with parameters
tool_calls = []
for content in message.content:
if content.type == "tool_use":
tool_name = content.name
if tool_name in all_tools_list:
tool_calls.append(
SimpleToolCall(tool_name=tool_name, params=content.input)
)
# Calculate cost based on token usage
input_tokens = message.usage.input_tokens
output_tokens = message.usage.output_tokens
cost = get_anthropic_cost(model, input_tokens, output_tokens)
return ToolCallResponse(
tool_calls=tool_calls, runTimeMs=t(), inputAndOutputCost=cost
)
</file>
<file path="data_types.py">
from typing import Optional, Union
from pydantic import BaseModel
from enum import Enum
class ModelAlias(str, Enum):
haiku = "claude-3-5-haiku-latest"
haiku_3_legacy = "claude-3-haiku-20240307"
sonnet = "claude-3-5-sonnet-20241022"
gemini_pro_2 = "gemini-1.5-pro-002"
gemini_flash_2 = "gemini-1.5-flash-002"
gemini_flash_8b = "gemini-1.5-flash-8b-latest"
gpt_4o_mini = "gpt-4o-mini"
gpt_4o = "gpt-4o"
gpt_4o_predictive = "gpt-4o-predictive"
gpt_4o_mini_predictive = "gpt-4o-mini-predictive"
# JSON variants
o1_mini_json = "o1-mini-json"
gpt_4o_json = "gpt-4o-json"
gpt_4o_mini_json = "gpt-4o-mini-json"
gemini_pro_2_json = "gemini-1.5-pro-002-json"
gemini_flash_2_json = "gemini-1.5-flash-002-json"
sonnet_json = "claude-3-5-sonnet-20241022-json"
haiku_json = "claude-3-5-haiku-latest-json"
gemini_exp_1114_json = "gemini-exp-1114-json"
# ollama models
llama3_2_1b = "llama3.2:1b"
llama_3_2_3b = "llama3.2:latest"
qwen_2_5_coder_14b = "qwen2.5-coder:14b"
qwq_3db = "qwq:32b"
phi_4 = "vanilj/Phi-4:latest"
class Prompt(BaseModel):
prompt: str
model: Union[ModelAlias, str]
class ToolEnum(str, Enum):
run_coder_agent = "run_coder_agent"
run_git_agent = "run_git_agent"
run_docs_agent = "run_docs_agent"
class ToolAndPrompt(BaseModel):
tool_name: ToolEnum
prompt: str
class ToolsAndPrompts(BaseModel):
tools_and_prompts: list[ToolAndPrompt]
class PromptWithToolCalls(BaseModel):
prompt: str
model: ModelAlias | str
class PromptResponse(BaseModel):
response: str
runTimeMs: int
inputAndOutputCost: float
class SimpleToolCall(BaseModel):
tool_name: str
params: dict
class ToolCallResponse(BaseModel):
tool_calls: list[SimpleToolCall]
runTimeMs: int
inputAndOutputCost: float
class ThoughtResponse(BaseModel):
thoughts: str
response: str
error: Optional[str] = None
# ------------ Execution Evaluator Benchmarks ------------
class BenchPromptResponse(BaseModel):
response: str
tokens_per_second: float
provider: str
total_duration_ms: float
load_duration_ms: float
inputAndOutputCost: float
errored: Optional[bool] = None
class ModelProvider(str, Enum):
ollama = "ollama"
mlx = "mlx"
class ExeEvalType(str, Enum):
execute_python_code_with_num_output = "execute_python_code_with_num_output"
execute_python_code_with_string_output = "execute_python_code_with_string_output"
raw_string_evaluator = "raw_string_evaluator" # New evaluator type
python_print_execution_with_num_output = "python_print_execution_with_num_output"
json_validator_eval = "json_validator_eval"
class ExeEvalBenchmarkInputRow(BaseModel):
dynamic_variables: Optional[dict]
expectation: str | dict
class ExecEvalBenchmarkFile(BaseModel):
base_prompt: str
evaluator: ExeEvalType
prompts: list[ExeEvalBenchmarkInputRow]
benchmark_name: str
purpose: str
models: list[str] # List of model names/aliases
class ExeEvalBenchmarkOutputResult(BaseModel):
prompt_response: BenchPromptResponse
execution_result: str
expected_result: str
input_prompt: str
model: str
correct: bool
index: int
class ExecEvalBenchmarkCompleteResult(BaseModel):
benchmark_file: ExecEvalBenchmarkFile
results: list[ExeEvalBenchmarkOutputResult]
@property
def correct_count(self) -> int:
return sum(1 for result in self.results if result.correct)
@property
def incorrect_count(self) -> int:
return len(self.results) - self.correct_count
@property
def accuracy(self) -> float:
return self.correct_count / len(self.results)
class ExecEvalBenchmarkModelReport(BaseModel):
model: str # Changed from ModelAlias to str
results: list[ExeEvalBenchmarkOutputResult]
correct_count: int
incorrect_count: int
accuracy: float
average_tokens_per_second: float
average_total_duration_ms: float
average_load_duration_ms: float
total_cost: float
class ExecEvalPromptIteration(BaseModel):
dynamic_variables: dict
expectation: str | dict
class ExecEvalBenchmarkReport(BaseModel):
benchmark_name: str
purpose: str
base_prompt: str
prompt_iterations: list[ExecEvalPromptIteration]
models: list[ExecEvalBenchmarkModelReport]
overall_correct_count: int
overall_incorrect_count: int
overall_accuracy: float
average_tokens_per_second: float
average_total_duration_ms: float
average_load_duration_ms: float
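# Example (hypothetical values): a minimal benchmark file definition. The model string
# follows the "provider~model" format used by the benchmark runner in exbench_module.py.
# ExecEvalBenchmarkFile(
#     base_prompt="Add {{a}} and {{b}}. Reply with only the number.",
#     evaluator=ExeEvalType.raw_string_evaluator,
#     prompts=[ExeEvalBenchmarkInputRow(dynamic_variables={"a": 2, "b": 3}, expectation="5")],
#     benchmark_name="simple_addition",
#     purpose="Sanity-check basic arithmetic",
#     models=["ollama~llama3.2:latest"],
# )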
</file>
<file path="deepseek_llm.py">
from openai import OpenAI
from utils import MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS, timeit
from modules.data_types import BenchPromptResponse, PromptResponse, ThoughtResponse
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Initialize DeepSeek client
client = OpenAI(
api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com"
)
def get_deepseek_cost(model: str, input_tokens: int, output_tokens: int) -> float:
"""
Calculate the cost for DeepSeek API usage.
Args:
model: The model name/alias used
input_tokens: Number of input tokens
output_tokens: Number of output tokens
Returns:
float: Total cost in dollars
"""
cost_map = MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS.get(model)
if not cost_map:
return 0.0
input_cost = (input_tokens / 1_000_000) * cost_map["input"]
output_cost = (output_tokens / 1_000_000) * cost_map["output"]
return round(input_cost + output_cost, 6)
def bench_prompt(prompt: str, model: str) -> BenchPromptResponse:
"""
Send a prompt to DeepSeek and get detailed benchmarking response.
"""
try:
with timeit() as t:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
stream=False,
)
elapsed_ms = t()
input_tokens = response.usage.prompt_tokens
output_tokens = response.usage.completion_tokens
cost = get_deepseek_cost(model, input_tokens, output_tokens)
return BenchPromptResponse(
response=response.choices[0].message.content,
tokens_per_second=0.0, # DeepSeek doesn't provide this info
provider="deepseek",
total_duration_ms=elapsed_ms,
load_duration_ms=0.0,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"DeepSeek error: {str(e)}")
return BenchPromptResponse(
response=f"Error: {str(e)}",
tokens_per_second=0.0,
provider="deepseek",
total_duration_ms=0.0,
load_duration_ms=0.0,
inputAndOutputCost=0.0,
errored=True,
)
def text_prompt(prompt: str, model: str) -> PromptResponse:
"""
Send a prompt to DeepSeek and get the response.
"""
try:
with timeit() as t:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
stream=False,
)
elapsed_ms = t()
input_tokens = response.usage.prompt_tokens
output_tokens = response.usage.completion_tokens
cost = get_deepseek_cost(model, input_tokens, output_tokens)
return PromptResponse(
response=response.choices[0].message.content,
runTimeMs=elapsed_ms,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"DeepSeek error: {str(e)}")
return PromptResponse(
response=f"Error: {str(e)}",
runTimeMs=0.0,
inputAndOutputCost=0.0,
)
def thought_prompt(prompt: str, model: str) -> ThoughtResponse:
"""
Send a thought prompt to DeepSeek and parse structured response.
"""
try:
# Validate model
if model != "deepseek-reasoner":
raise ValueError(f"Invalid model for thought prompts: {model}. Must use 'deepseek-reasoner'")
# Make API call with reasoning_content=True
with timeit() as t:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
extra_body={"reasoning_content": True}, # Enable structured reasoning
stream=False,
)
elapsed_ms = t()
# Extract content and reasoning
message = response.choices[0].message
thoughts = getattr(message, "reasoning_content", "")
response_content = message.content
# Validate required fields
if not thoughts or not response_content:
raise ValueError("Missing thoughts or response in API response")
# Calculate costs
input_tokens = response.usage.prompt_tokens
output_tokens = response.usage.completion_tokens
cost = get_deepseek_cost("deepseek-reasoner", input_tokens, output_tokens)
return ThoughtResponse(
thoughts=thoughts,
response=response_content,
error=None,
)
except Exception as e:
print(f"DeepSeek thought error: {str(e)}")
return ThoughtResponse(
thoughts=f"Error processing request: {str(e)}",
response="",
error=str(e)
)
</file>
<file path="exbench_module.py">
# ------------------------- Imports -------------------------
from typing import List, Optional
from datetime import datetime
from pathlib import Path
import time
from concurrent.futures import ThreadPoolExecutor
from modules.data_types import (
ExecEvalBenchmarkFile,
ExecEvalBenchmarkCompleteResult,
ExeEvalBenchmarkOutputResult,
ExecEvalBenchmarkModelReport,
ExecEvalBenchmarkReport,
ExecEvalPromptIteration,
ModelAlias,
ExeEvalType,
ModelProvider,
BenchPromptResponse,
)
from modules.ollama_llm import bench_prompt
from modules.execution_evaluators import (
execute_python_code,
eval_result_compare,
)
from utils import parse_markdown_backticks
from modules import (
ollama_llm,
anthropic_llm,
deepseek_llm,
gemini_llm,
openai_llm,
fireworks_llm,
)
provider_delimiter = "~"
def parse_model_string(model: str) -> tuple[str, str]:
"""
Parse model string into provider and model name.
Format: "provider~model_name" or "model_name" (defaults to ollama)
Raises:
ValueError: If provider is not supported
"""
if provider_delimiter not in model:
# Default to ollama if no provider specified
return "ollama", model
provider, *model_parts = model.split(provider_delimiter)
model_name = provider_delimiter.join(model_parts)
# Validate provider
supported_providers = [
"ollama",
"anthropic",
"deepseek",
"openai",
"gemini",
"fireworks",
# "mlx",
# "groq",
]
if provider not in supported_providers:
raise ValueError(
f"Unsupported provider: {provider}. "
f"Supported providers are: {', '.join(supported_providers)}"
)
return provider, model_name
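# Examples (derived from the rules above):
#   parse_model_string("anthropic~claude-3-5-haiku-latest") -> ("anthropic", "claude-3-5-haiku-latest")
#   parse_model_string("llama3.2:latest") -> ("ollama", "llama3.2:latest")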
# ------------------------- File Operations -------------------------
def save_report_to_file(
report: ExecEvalBenchmarkReport, output_dir: str = "reports"
) -> str:
"""Save benchmark report to file with standardized naming.
Args:
report: The benchmark report to save
output_dir: Directory to save the report in
Returns:
Path to the saved report file
"""
# Create output directory if it doesn't exist
Path(output_dir).mkdir(exist_ok=True)
# Generate filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
safe_benchmark_name = report.benchmark_name.replace(" ", "_")
report_filename = f"{output_dir}/{safe_benchmark_name}_{timestamp}.json"
# Save report
with open(report_filename, "w") as f:
f.write(report.model_dump_json(indent=4))
return report_filename
# ------------------------- Benchmark Execution -------------------------
provider_bench_functions = {
"ollama": ollama_llm.bench_prompt,
"anthropic": anthropic_llm.bench_prompt,
"deepseek": deepseek_llm.bench_prompt,
"openai": openai_llm.bench_prompt,
"gemini": gemini_llm.bench_prompt,
"fireworks": fireworks_llm.bench_prompt,
}
def process_single_prompt(
prompt_row, benchmark_file, provider, model_name, index, total_tests
):
print(f" Running test {index}/{total_tests}...")
prompt = benchmark_file.base_prompt
if prompt_row.dynamic_variables:
for key, value in prompt_row.dynamic_variables.items():
prompt = prompt.replace(f"{{{{{key}}}}}", str(value))
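# Example (hypothetical values): a base prompt of "Add {{a}} and {{b}}" with
# dynamic_variables {"a": 2, "b": 3} becomes "Add 2 and 3" after substitution.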
bench_response = None
max_retries = 3
delay = 1
for attempt in range(max_retries + 1):
try:
bench_response = provider_bench_functions[provider](prompt, model_name)
break
except Exception as e:
if attempt < max_retries:
print(f"Retry {attempt+1} for test {index} due to error: {str(e)}")
time.sleep(delay * (attempt + 1))
else:
print(f"All retries failed for test {index}")
bench_response = BenchPromptResponse(
response=f"Error: {str(e)}",
tokens_per_second=0.0,
provider=provider,
total_duration_ms=0.0,
load_duration_ms=0.0,
inputAndOutputCost=0.0,
errored=True,
)
backtick_parsed_response = parse_markdown_backticks(bench_response.response)
execution_result = ""
expected_result = str(prompt_row.expectation).strip()
correct = False
try:
if benchmark_file.evaluator == ExeEvalType.execute_python_code_with_num_output:
execution_result = execute_python_code(backtick_parsed_response)
parsed_execution_result = str(execution_result).strip()
correct = eval_result_compare(
benchmark_file.evaluator, expected_result, parsed_execution_result
)
elif (
benchmark_file.evaluator
== ExeEvalType.execute_python_code_with_string_output
):
execution_result = execute_python_code(backtick_parsed_response)
correct = eval_result_compare(
benchmark_file.evaluator, expected_result, execution_result
)
elif benchmark_file.evaluator == ExeEvalType.raw_string_evaluator:
execution_result = backtick_parsed_response
correct = eval_result_compare(
benchmark_file.evaluator, expected_result, execution_result
)
elif benchmark_file.evaluator == ExeEvalType.json_validator_eval:
# For JSON validator, no code execution is needed;
# use the response directly and compare the JSON objects.
execution_result = backtick_parsed_response
# expectation is assumed to be a dict (or JSON string convertible to dict)
expected_result = prompt_row.expectation
correct = eval_result_compare(
ExeEvalType.json_validator_eval, expected_result, execution_result
)
elif (
benchmark_file.evaluator
== ExeEvalType.python_print_execution_with_num_output
):
wrapped_code = f"print({backtick_parsed_response})"
execution_result = execute_python_code(wrapped_code)
correct = eval_result_compare(
ExeEvalType.execute_python_code_with_num_output,
expected_result,
execution_result.strip(),
)
else:
raise ValueError(f"Unsupported evaluator: {benchmark_file.evaluator}")
except Exception as e:
print(f"Error executing code in test {index}: {e}")
execution_result = str(e)
correct = False
return ExeEvalBenchmarkOutputResult(
input_prompt=prompt,
prompt_response=bench_response,
execution_result=str(execution_result),
expected_result=str(expected_result),
model=f"{provider}{provider_delimiter}{model_name}",
correct=correct,
index=index,
)
def run_benchmark_for_model(
model: str, benchmark_file: ExecEvalBenchmarkFile
) -> List[ExeEvalBenchmarkOutputResult]:
results = []
total_tests = len(benchmark_file.prompts)
try:
provider, model_name = parse_model_string(model)
except ValueError as e:
print(f"Invalid model string {model}: {str(e)}")
return []
print(f"Running benchmark with provider: {provider}, model: {model_name}")
if provider == "ollama":
# Sequential processing for Ollama
for i, prompt_row in enumerate(benchmark_file.prompts, 1):
result = process_single_prompt(
prompt_row, benchmark_file, provider, model_name, i, total_tests
)
results.append(result)
else:
# Parallel processing for other providers
with ThreadPoolExecutor(max_workers=50) as executor:
futures = []
for i, prompt_row in enumerate(benchmark_file.prompts, 1):
futures.append(
executor.submit(
process_single_prompt,
prompt_row,
benchmark_file,
provider,
model_name,
i,
total_tests,
)
)
for future in futures:
results.append(future.result())
return results
# ------------------------- Report Generation -------------------------
def generate_report(
complete_result: ExecEvalBenchmarkCompleteResult,
) -> ExecEvalBenchmarkReport:
model_reports = []
# Group results by model
model_results = {}
for result in complete_result.results:
if result.model not in model_results:
model_results[result.model] = []
model_results[result.model].append(result)
# Create model reports
for model, results in model_results.items():
correct_count = sum(1 for r in results if r.correct)
incorrect_count = len(results) - correct_count
accuracy = correct_count / len(results)
avg_tokens_per_second = sum(
r.prompt_response.tokens_per_second for r in results
) / len(results)
avg_total_duration = sum(
r.prompt_response.total_duration_ms for r in results
) / len(results)
avg_load_duration = sum(
r.prompt_response.load_duration_ms for r in results
) / len(results)
model_total_cost = 0
try:
model_total_cost = sum(
(
r.prompt_response.inputAndOutputCost
if hasattr(r.prompt_response, "inputAndOutputCost")
else 0.0
)
for r in results
)
except Exception:
print(f"Error calculating model_total_cost for model: {model}")
model_total_cost = 0
model_reports.append(
ExecEvalBenchmarkModelReport(
model=model,
results=results,
correct_count=correct_count,
incorrect_count=incorrect_count,
accuracy=accuracy,
average_tokens_per_second=avg_tokens_per_second,
average_total_duration_ms=avg_total_duration,
average_load_duration_ms=avg_load_duration,
total_cost=model_total_cost,
)
)
# Calculate overall statistics
overall_correct = sum(r.correct_count for r in model_reports)
overall_incorrect = sum(r.incorrect_count for r in model_reports)
overall_accuracy = overall_correct / (overall_correct + overall_incorrect)
avg_tokens_per_second = sum(
r.average_tokens_per_second for r in model_reports
) / len(model_reports)
avg_total_duration = sum(r.average_total_duration_ms for r in model_reports) / len(
model_reports
)
avg_load_duration = sum(r.average_load_duration_ms for r in model_reports) / len(
model_reports
)
return ExecEvalBenchmarkReport(
benchmark_name=complete_result.benchmark_file.benchmark_name,
purpose=complete_result.benchmark_file.purpose,
base_prompt=complete_result.benchmark_file.base_prompt,
prompt_iterations=[
ExecEvalPromptIteration(
dynamic_variables=(
prompt.dynamic_variables
if prompt.dynamic_variables is not None
else {}
),
expectation=prompt.expectation,
)
for prompt in complete_result.benchmark_file.prompts
],
models=model_reports,
overall_correct_count=overall_correct,
overall_incorrect_count=overall_incorrect,
overall_accuracy=overall_accuracy,
average_tokens_per_second=avg_tokens_per_second,
average_total_duration_ms=avg_total_duration,
average_load_duration_ms=avg_load_duration,
)
</file>
<file path="execution_evaluators.py">
import subprocess
from modules.data_types import ExeEvalType
import json
from deepdiff import DeepDiff
def eval_result_compare(evalType: ExeEvalType, expected: str, actual: str) -> bool:
"""
Compare expected and actual results based on evaluation type.
For numeric outputs, compare with a small epsilon tolerance.
"""
try:
if (
evalType == ExeEvalType.execute_python_code_with_num_output
or evalType == ExeEvalType.python_print_execution_with_num_output
):
# Convert both values to float for numeric comparison
expected_num = float(expected)
actual_num = float(actual)
epsilon = 1e-6
return abs(expected_num - actual_num) < epsilon
elif evalType == ExeEvalType.execute_python_code_with_string_output:
return str(expected).strip() == str(actual).strip()
elif evalType == ExeEvalType.raw_string_evaluator:
return str(expected).strip() == str(actual).strip()
elif evalType == ExeEvalType.json_validator_eval:
if not isinstance(expected, dict):
expected = json.loads(expected)
actual_parsed = json.loads(actual) if isinstance(actual, str) else actual
print(f"Expected: {expected}")
print(f"Actual: {actual_parsed}")
deepdiffed = DeepDiff(expected, actual_parsed, ignore_order=False)
print(f"DeepDiff: {deepdiffed}")
return not deepdiffed
else:
return str(expected).strip() == str(actual).strip()
except (ValueError, TypeError):
return str(expected).strip() == str(actual).strip()
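# Examples (hypothetical inputs, derived from the comparisons above):
#   eval_result_compare(ExeEvalType.execute_python_code_with_num_output, "3.14", "3.1400000001") -> True (within 1e-6)
#   eval_result_compare(ExeEvalType.raw_string_evaluator, "hello", " hello \n") -> True (stripped string match)
#   eval_result_compare(ExeEvalType.json_validator_eval, '{"a": 1}', '{"a": 1}') -> True (empty DeepDiff)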
def execute_python_code(code: str) -> str:
"""
Execute Python code and return the numeric output as a string.
"""
# Remove any surrounding quotes and whitespace
code = code.strip().strip("'").strip('"')
# Create a temporary file with the code
import tempfile
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as tmp:
tmp.write(code)
tmp.flush()
# Execute the temporary file using uv
result = execute(f"uv run {tmp.name} --ignore-warnings")
# Try to parse the result as a number
try:
# Remove any extra whitespace or newlines
cleaned_result = result.strip()
# Convert to float and back to string to normalize format
return str(float(cleaned_result))
except (ValueError, TypeError):
# If conversion fails, return the raw result
return result
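# Example (assumes the `uv` CLI is installed): execute_python_code("print(2 + 2)")
# writes the snippet to a temp file, runs it via `uv run`, and returns "4.0"
# after the numeric normalization above.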
def execute(code: str) -> str:
"""Execute the tests and return the output as a string."""
try:
result = subprocess.run(
code.split(),
capture_output=True,
text=True,
)
if result.returncode != 0:
return f"Error: {result.stderr}"
return result.stdout
except Exception as e:
return f"Execution error: {str(e)}"
</file>
<file path="fireworks_llm.py">
import os
import requests
import json
from modules.data_types import (
BenchPromptResponse,
PromptResponse,
ThoughtResponse,
)
from utils import deepseek_r1_distil_separate_thoughts_and_response
import time
from dotenv import load_dotenv
load_dotenv()
FIREWORKS_API_KEY = os.getenv("FIREWORKS_AI_API_KEY", "")
API_URL = "https://api.fireworks.ai/inference/v1/completions"
def get_fireworks_cost(model: str, input_tokens: int, output_tokens: int) -> float:
# For now, just return 0.0 or substitute a real cost calculation if available
return 0.0
def bench_prompt(prompt: str, model: str) -> BenchPromptResponse:
start_time = time.time()
headers = {
"Accept": "application/json",
"Content-Type": "application/json",
"Authorization": f"Bearer {FIREWORKS_API_KEY}",
}
payload = {
"model": model,
"max_tokens": 20480,
"prompt": prompt,
"temperature": 0.2,
}
response = requests.post(API_URL, headers=headers, data=json.dumps(payload))
end_time = time.time()
resp_json = response.json()
content = ""
if "choices" in resp_json and len(resp_json["choices"]) > 0:
content = resp_json["choices"][0].get("text", "")
return BenchPromptResponse(
response=content,
tokens_per_second=0.0, # or compute if available
provider="fireworks",
total_duration_ms=(end_time - start_time) * 1000,
load_duration_ms=0.0,
inputAndOutputCost=0.0,  # required field; Fireworks cost calculation not implemented
errored=not response.ok,
)
def text_prompt(prompt: str, model: str) -> PromptResponse:
headers = {
"Accept": "application/json",
"Content-Type": "application/json",
"Authorization": f"Bearer {FIREWORKS_API_KEY}",
}
payload = {
"model": model,
"max_tokens": 20480,
"prompt": prompt,
"temperature": 0.0,
}
response = requests.post(API_URL, headers=headers, data=json.dumps(payload))
resp_json = response.json()
print("resp_json", resp_json)
# Extract just the text from the first choice
content = ""
if "choices" in resp_json and len(resp_json["choices"]) > 0:
content = resp_json["choices"][0].get("text", "")
return PromptResponse(
response=content,
runTimeMs=0, # or compute if desired
inputAndOutputCost=0.0, # or compute if you have cost details
)
def thought_prompt(prompt: str, model: str) -> ThoughtResponse:
headers = {
"Accept": "application/json",
"Content-Type": "application/json",
"Authorization": f"Bearer {FIREWORKS_API_KEY}",
}
payload = {
"model": model,
"max_tokens": 20480,
"prompt": prompt,
"temperature": 0.2,
}
response = requests.post(API_URL, headers=headers, data=json.dumps(payload))
resp_json = response.json()
content = ""
if "choices" in resp_json and len(resp_json["choices"]) > 0:
content = resp_json["choices"][0].get("text", "")
if "r1" in model:
thoughts, response_content = deepseek_r1_distil_separate_thoughts_and_response(
content
)
else:
thoughts = ""
response_content = content
return ThoughtResponse(
thoughts=thoughts,
response=response_content,
error=None if response.ok else str(resp_json.get("error", "Unknown error")),
)
</file>
<file path="gemini_llm.py">
import google.generativeai as genai
from google import genai as genai2
import os
import json
from modules.tools import gemini_tools_list
from modules.data_types import (
PromptResponse,
SimpleToolCall,
ModelAlias,
ToolsAndPrompts,
ThoughtResponse,
)
from utils import (
parse_markdown_backticks,
timeit,
MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS,
)
from modules.data_types import ToolCallResponse, BenchPromptResponse
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Initialize Gemini client
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
def get_gemini_cost(model: str, input_tokens: int, output_tokens: int) -> float:
"""
Calculate the cost for Gemini API usage.
Args:
model: The model name/alias used
input_tokens: Number of input tokens
output_tokens: Number of output tokens
Returns:
float: Total cost in dollars
"""
cost_map = MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS.get(model)
if not cost_map:
return 0.0
input_cost = (input_tokens / 1_000_000) * cost_map["input"]
output_cost = (output_tokens / 1_000_000) * cost_map["output"]
return round(input_cost + output_cost, 6)
def thought_prompt(prompt: str, model: str) -> ThoughtResponse:
"""
Handle thought prompts for Gemini thinking models.
"""
try:
# Validate model
if model != "gemini-2.0-flash-thinking-exp-01-21":
raise ValueError(
f"Invalid model for thought prompts: {model}. Must use 'gemini-2.0-flash-thinking-exp-01-21'"
)
# Configure thinking model
config = {"thinking_config": {"include_thoughts": True}}
client = genai2.Client(
api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
)
with timeit() as t:
response = client.models.generate_content(
model=model, contents=prompt, config=config
)
elapsed_ms = t()
# Parse thoughts and response
thoughts = []
response_content = []
for part in response.candidates[0].content.parts:
if hasattr(part, "thought") and part.thought:
thoughts.append(part.text)
else:
response_content.append(part.text)
return ThoughtResponse(
thoughts="\n".join(thoughts),
response="\n".join(response_content),
error=None,
)
except Exception as e:
print(f"Gemini thought error: {str(e)}")
return ThoughtResponse(
thoughts=f"Error processing request: {str(e)}", response="", error=str(e)
)
def text_prompt(prompt: str, model: str) -> PromptResponse:
"""
Send a prompt to Gemini and get a response.
"""
try:
with timeit() as t:
gemini_model = genai.GenerativeModel(model_name=model)
response = gemini_model.generate_content(prompt)
elapsed_ms = t()
input_tokens = response._result.usage_metadata.prompt_token_count
output_tokens = response._result.usage_metadata.candidates_token_count
cost = get_gemini_cost(model, input_tokens, output_tokens)
return PromptResponse(
response=response.text,
runTimeMs=elapsed_ms,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"Gemini error: {str(e)}")
return PromptResponse(
response=f"Error: {str(e)}", runTimeMs=0.0, inputAndOutputCost=0.0
)
def bench_prompt(prompt: str, model: str) -> BenchPromptResponse:
"""
Send a prompt to Gemini and get detailed benchmarking response.
"""
try:
with timeit() as t:
gemini_model = genai.GenerativeModel(model_name=model)
response = gemini_model.generate_content(prompt)
elapsed_ms = t()
input_tokens = response._result.usage_metadata.prompt_token_count
output_tokens = response._result.usage_metadata.candidates_token_count
cost = get_gemini_cost(model, input_tokens, output_tokens)
return BenchPromptResponse(
response=response.text,
tokens_per_second=0.0, # Gemini doesn't provide timing info
provider="gemini",
total_duration_ms=elapsed_ms,
load_duration_ms=0.0,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"Gemini error: {str(e)}")
return BenchPromptResponse(
response=f"Error: {str(e)}",
tokens_per_second=0.0,
provider="gemini",
total_duration_ms=0.0,
load_duration_ms=0.0,
inputAndOutputCost=0.0,
errored=True,
)
def tool_prompt(prompt: str, model: str, force_tools: list[str]) -> ToolCallResponse:
"""
Run a chat model with tool calls using Gemini's API.
Now supports JSON structured output variants by parsing the response.
"""
with timeit() as t:
if "-json" in model:
# Initialize model for JSON output
base_model = model.replace("-json", "")
if model == "gemini-exp-1114-json":
base_model = "gemini-exp-1114" # Map to actual model name
gemini_model = genai.GenerativeModel(
model_name=base_model,
)
# Send message and get JSON response
chat = gemini_model.start_chat()
response = chat.send_message(prompt)
try:
# Parse raw response text into ToolsAndPrompts model
parsed_response = ToolsAndPrompts.model_validate_json(
parse_markdown_backticks(response.text)
)
tool_calls = [
SimpleToolCall(
tool_name=tap.tool_name, params={"prompt": tap.prompt}
)
for tap in parsed_response.tools_and_prompts
]
except Exception as e:
print(f"Failed to parse JSON response: {e}")
tool_calls = []
else:
# Original implementation using function calling
gemini_model = genai.GenerativeModel(
model_name=model, tools=gemini_tools_list
)
chat = gemini_model.start_chat(enable_automatic_function_calling=True)
response = chat.send_message(prompt)
tool_calls = []
for part in response.parts:
if hasattr(part, "function_call"):
fc = part.function_call
tool_calls.append(SimpleToolCall(tool_name=fc.name, params=fc.args))
# Extract token counts and calculate cost
usage_metadata = response._result.usage_metadata
input_tokens = usage_metadata.prompt_token_count
output_tokens = usage_metadata.candidates_token_count
cost = get_gemini_cost(model, input_tokens, output_tokens)
return ToolCallResponse(
tool_calls=tool_calls, runTimeMs=t(), inputAndOutputCost=cost
)
</file>
<file path="llm_models.py">
import llm
from dotenv import load_dotenv
import os
from modules import ollama_llm
from modules.data_types import (
ModelAlias,
PromptResponse,
PromptWithToolCalls,
ToolCallResponse,
ThoughtResponse,
)
from modules import openai_llm, gemini_llm, deepseek_llm, fireworks_llm
from utils import MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS
from modules.tools import all_tools_list
from modules import anthropic_llm
# Load environment variables from .env file
load_dotenv()
def simple_prompt(prompt_str: str, model_alias_str: str) -> PromptResponse:
parts = model_alias_str.split(":", 1)
if len(parts) < 2:
raise ValueError("No provider prefix found in model string")
provider = parts[0]
model_name = parts[1]
# For special predictive cases:
if provider == "openai" and model_name in [
"gpt-4o-predictive",
"gpt-4o-mini-predictive",
]:
# Remove -predictive suffix when passing to API
clean_model_name = model_name.replace("-predictive", "")
return openai_llm.predictive_prompt(prompt_str, prompt_str, clean_model_name)
if provider == "openai":
return openai_llm.text_prompt(prompt_str, model_name)
elif provider == "ollama":
return ollama_llm.text_prompt(prompt_str, model_name)
elif provider == "anthropic":
return anthropic_llm.text_prompt(prompt_str, model_name)
elif provider == "gemini":
return gemini_llm.text_prompt(prompt_str, model_name)
elif provider == "deepseek":
return deepseek_llm.text_prompt(prompt_str, model_name)
elif provider == "fireworks":
return fireworks_llm.text_prompt(prompt_str, model_name)
else:
raise ValueError(f"Unsupported provider: {provider}")
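# Examples (hypothetical prompts): simple_prompt("Say hi", "openai:gpt-4o") routes to
# openai_llm.text_prompt; simple_prompt("Say hi", "ollama:llama3.2:latest") routes to
# ollama_llm.text_prompt with model_name "llama3.2:latest" (split on the first ":" only).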
def tool_prompt(prompt: PromptWithToolCalls) -> ToolCallResponse:
model_str = str(prompt.model)
parts = model_str.split(":", 1)
if len(parts) < 2:
raise ValueError("No provider prefix found in model string")
provider = parts[0]
model_name = parts[1]
if provider == "openai":
return openai_llm.tool_prompt(prompt.prompt, model_name, all_tools_list)
elif provider == "anthropic":
return anthropic_llm.tool_prompt(prompt.prompt, model_name)
elif provider == "gemini":
return gemini_llm.tool_prompt(prompt.prompt, model_name, all_tools_list)
elif provider == "deepseek":
raise ValueError("DeepSeek does not support tool calls")
elif provider == "ollama":
raise ValueError("Ollama does not support tool calls")
else:
raise ValueError(f"Unsupported provider for tool calls: {provider}")
def thought_prompt(prompt: str, model: str) -> ThoughtResponse:
"""
Handle thought prompt requests with specialized parsing for supported models.
Fall back to standard text prompts for other models.
"""
parts = model.split(":", 1)
if len(parts) < 2:
raise ValueError("No provider prefix found in model string")
provider = parts[0]
model_name = parts[1]
try:
if provider == "deepseek":
if model_name != "deepseek-reasoner":
# Fallback to standard text prompt for non-reasoner models
text_response = simple_prompt(prompt, model)
return ThoughtResponse(
thoughts="", response=text_response.response, error=None
)
# Proceed with reasoner-specific processing
response = deepseek_llm.thought_prompt(prompt, model_name)
return response
elif provider == "gemini":
if model_name != "gemini-2.0-flash-thinking-exp-01-21":
# Fallback to standard text prompt for non-thinking models
text_response = simple_prompt(prompt, model)
return ThoughtResponse(
thoughts="", response=text_response.response, error=None
)
# Proceed with thinking-specific processing
response = gemini_llm.thought_prompt(prompt, model_name)
return response
elif provider == "ollama":
if "deepseek-r1" not in model_name:
# Fallback to standard text prompt for non-R1 models
text_response = simple_prompt(prompt, model)
return ThoughtResponse(
thoughts="", response=text_response.response, error=None
)
# Proceed with R1-specific processing
response = ollama_llm.thought_prompt(prompt, model_name)
return response
elif provider == "fireworks":
text_response = simple_prompt(prompt, model)
return ThoughtResponse(
thoughts="", response=text_response.response, error=None
)
else:
# For all other providers, use standard text prompt and wrap in ThoughtResponse
text_response = simple_prompt(prompt, model)
return ThoughtResponse(
thoughts="", response=text_response.response, error=None
)
except Exception as e:
return ThoughtResponse(
thoughts=f"Error processing request: {str(e)}", response="", error=str(e)
)
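# Examples (hypothetical prompts): thought_prompt("Why is the sky blue?", "deepseek:deepseek-reasoner")
# returns separated reasoning via deepseek_llm.thought_prompt, while
# thought_prompt("Why is the sky blue?", "openai:gpt-4o") falls back to a plain text prompt
# wrapped in a ThoughtResponse with empty thoughts.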
</file>
<file path="ollama_llm.py">
from ollama import chat
from modules.data_types import PromptResponse, BenchPromptResponse, ThoughtResponse
from utils import timeit, deepseek_r1_distil_separate_thoughts_and_response
import json
def text_prompt(prompt: str, model: str) -> PromptResponse:
"""
Send a prompt to Ollama and get a response.
"""
try:
with timeit() as t:
response = chat(
model=model,
messages=[
{
"role": "user",
"content": prompt,
},
],
)
elapsed_ms = t()
return PromptResponse(
response=response.message.content,
runTimeMs=elapsed_ms, # Now using actual timing
inputAndOutputCost=0.0, # Ollama is free
)
except Exception as e:
print(f"Ollama error: {str(e)}")
return PromptResponse(
response=f"Error: {str(e)}", runTimeMs=0, inputAndOutputCost=0.0
)
def get_ollama_costs() -> tuple[int, int]:
"""
Return token costs for Ollama (always 0 since it's free)
"""
return 0, 0
def thought_prompt(prompt: str, model: str) -> ThoughtResponse:
"""
Handle thought prompts for DeepSeek R1 models running on Ollama.
"""
try:
# Validate model name contains deepseek-r1
if "deepseek-r1" not in model:
raise ValueError(
f"Model {model} not supported for thought prompts. Must contain 'deepseek-r1'"
)
with timeit() as t:
# Get raw response from Ollama
response = chat(
model=model,
messages=[
{
"role": "user",
"content": prompt,
},
],
)
# Extract content and parse thoughts/response
content = response.message.content
thoughts, response_content = (
deepseek_r1_distil_separate_thoughts_and_response(content)
)
return ThoughtResponse(
thoughts=thoughts,
response=response_content,
error=None,
)
except Exception as e:
print(f"Ollama thought error ({model}): {str(e)}")
return ThoughtResponse(
thoughts=f"Error processing request: {str(e)}", response="", error=str(e)
)
def bench_prompt(prompt: str, model: str) -> BenchPromptResponse:
"""
Send a prompt to Ollama and get detailed benchmarking response.
"""
try:
response = chat(
model=model,
messages=[
{
"role": "user",
"content": prompt,
},
],
)
# Calculate tokens per second using eval_count and eval_duration
eval_count = response.get("eval_count", 0)
eval_duration_ns = response.get("eval_duration", 0)
# Convert nanoseconds to seconds and calculate tokens per second
eval_duration_s = eval_duration_ns / 1_000_000_000
tokens_per_second = eval_count / eval_duration_s if eval_duration_s > 0 else 0
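# Example (hypothetical values): eval_count=100 tokens and eval_duration=2_000_000_000 ns
# give eval_duration_s=2.0 and tokens_per_second=50.0.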
# Create BenchPromptResponse
bench_response = BenchPromptResponse(
response=response.message.content,
tokens_per_second=tokens_per_second,
provider="ollama",
total_duration_ms=response.get("total_duration", 0)
/ 1_000_000, # Convert ns to ms
load_duration_ms=response.get("load_duration", 0)
/ 1_000_000, # Convert ns to ms
inputAndOutputCost=0.0, # Ollama is free
)
# print(json.dumps(bench_response.dict(), indent=2))
return bench_response
except Exception as e:
print(f"Ollama error: {str(e)}")
return BenchPromptResponse(
response=f"Error: {str(e)}",
tokens_per_second=0.0,
provider="ollama",
total_duration_ms=0.0,
load_duration_ms=0.0,
inputAndOutputCost=0.0,
errored=True,
)
</file>
<file path="openai_llm.py">
import openai
import os
import json
from modules.tools import openai_tools_list
from modules.data_types import SimpleToolCall, ToolsAndPrompts
from utils import parse_markdown_backticks, timeit, parse_reasoning_effort
from modules.data_types import (
PromptResponse,
ModelAlias,
ToolCallResponse,
BenchPromptResponse,
)
from utils import MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS
from modules.tools import all_tools_list
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
openai_client: openai.OpenAI = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# reasoning_effort_enabled_models = [
# "o3-mini",
# "o1",
# ]
def get_openai_cost(model: str, input_tokens: int, output_tokens: int) -> float:
"""
Calculate the cost for OpenAI API usage.
Args:
model: The model name/alias used
input_tokens: Number of input tokens
output_tokens: Number of output tokens
Returns:
float: Total cost in dollars
"""
# Direct model name lookup first
model_alias = model
# Only do special mapping for gpt-4 variants
if "gpt-4" in model:
if model == "gpt-4o-mini":
model_alias = ModelAlias.gpt_4o_mini
elif model == "gpt-4o":
model_alias = ModelAlias.gpt_4o
else:
model_alias = ModelAlias.gpt_4o
cost_map = MAP_MODEL_ALIAS_TO_COST_PER_MILLION_TOKENS.get(model_alias)
if not cost_map:
print(f"No cost map found for model: {model}")
return 0.0
input_cost = (input_tokens / 1_000_000) * float(cost_map["input"])
output_cost = (output_tokens / 1_000_000) * float(cost_map["output"])
# print(
# f"model: {model}, input_cost: {input_cost}, output_cost: {output_cost}, total_cost: {input_cost + output_cost}, total_cost_rounded: {round(input_cost + output_cost, 6)}"
# )
return round(input_cost + output_cost, 6)
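# Example (derived from the mapping above): get_openai_cost("gpt-4o-mini", 1_000_000, 1_000_000)
# uses ModelAlias.gpt_4o_mini pricing, while any other "gpt-4*" name falls back to
# ModelAlias.gpt_4o pricing.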
def tool_prompt(prompt: str, model: str, force_tools: list[str]) -> ToolCallResponse:
"""
Run a chat model forcing specific tool calls.
Now supports JSON structured output variants.
"""
base_model, reasoning_effort = parse_reasoning_effort(model)
with timeit() as t:
if base_model == "o1-mini-json":
# Manual JSON parsing for o1-mini
completion = openai_client.chat.completions.create(
model="o1-mini",
messages=[{"role": "user", "content": prompt}],
)
try:
# Parse raw response text into ToolsAndPrompts model
parsed_response = ToolsAndPrompts.model_validate_json(
parse_markdown_backticks(completion.choices[0].message.content)
)
tool_calls = [
SimpleToolCall(
tool_name=tap.tool_name.value, params={"prompt": tap.prompt}
)
for tap in parsed_response.tools_and_prompts
]
except Exception as e:
print(f"Failed to parse JSON response: {e}")
tool_calls = []
elif "-json" in base_model:
# Use structured output for JSON variants
completion = openai_client.beta.chat.completions.parse(
model=base_model.replace("-json", ""),
messages=[{"role": "user", "content": prompt}],
response_format=ToolsAndPrompts,
)
try:
tool_calls = [
SimpleToolCall(
tool_name=tap.tool_name.value, params={"prompt": tap.prompt}
)
for tap in completion.choices[0].message.parsed.tools_and_prompts
]
except Exception as e:
print(f"Failed to parse JSON response: {e}")
tool_calls = []
else:
# Original implementation for function calling
completion = openai_client.chat.completions.create(
model=base_model,
messages=[{"role": "user", "content": prompt}],
tools=openai_tools_list,
tool_choice="required",
)
tool_calls = [
SimpleToolCall(
tool_name=tool_call.function.name,
params=json.loads(tool_call.function.arguments),
)
for tool_call in completion.choices[0].message.tool_calls or []
]
# Calculate costs
input_tokens = completion.usage.prompt_tokens
output_tokens = completion.usage.completion_tokens
cost = get_openai_cost(model, input_tokens, output_tokens)
return ToolCallResponse(
tool_calls=tool_calls, runTimeMs=t(), inputAndOutputCost=cost
)
def bench_prompt(prompt: str, model: str) -> BenchPromptResponse:
"""
Send a prompt to OpenAI and get detailed benchmarking response.
"""
base_model, reasoning_effort = parse_reasoning_effort(model)
try:
with timeit() as t:
if reasoning_effort:
completion = openai_client.chat.completions.create(
model=base_model,
reasoning_effort=reasoning_effort,
messages=[{"role": "user", "content": prompt}],
stream=False,
)
else:
completion = openai_client.chat.completions.create(
model=base_model,
messages=[{"role": "user", "content": prompt}],
stream=False,
)
elapsed_ms = t()
input_tokens = completion.usage.prompt_tokens
output_tokens = completion.usage.completion_tokens
cost = get_openai_cost(base_model, input_tokens, output_tokens)
return BenchPromptResponse(
response=completion.choices[0].message.content,
tokens_per_second=0.0, # OpenAI doesn't provide timing info
provider="openai",
total_duration_ms=elapsed_ms,
load_duration_ms=0.0,
inputAndOutputCost=cost,
)
except Exception as e:
print(f"OpenAI error: {str(e)}")
return BenchPromptResponse(
response=f"Error: {str(e)}",
tokens_per_second=0.0,
provider="openai",
total_duration_ms=0.0,
load_duration_ms=0.0,
inputAndOutputCost=0.0,
errored=True,
)
def predictive_prompt(prompt: str, prediction: str, model: str) -> PromptResponse:
"""
Run a chat model with a predicted output to reduce latency.
Args:
prompt (str): The prompt to send to the OpenAI API.
prediction (str): The predicted output text.
model (str): The model ID to use for the API call.
Returns:
PromptResponse: The response including text, runtime, and cost.
"""
base_model, reasoning_effort = parse_reasoning_effort(model)
# Prepare the API call parameters outside the timing block
messages = [{"role": "user", "content": prompt}]
prediction_param = {"type": "content", "content": prediction}
# Only time the actual API call
with timeit() as t:
completion = openai_client.chat.completions.create(
model=base_model,
reasoning_effort=reasoning_effort,
messages=messages,
prediction=prediction_param,
)
# Process results after timing block
input_tokens = completion.usage.prompt_tokens
output_tokens = completion.usage.completion_tokens
cost = get_openai_cost(base_model, input_tokens, output_tokens)
return PromptResponse(
response=completion.choices[0].message.content,
runTimeMs=t(), # Get the elapsed time of just the API call
inputAndOutputCost=cost,
)
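# Example (hypothetical arguments): predictive_prompt("Rename the function in this file", current_file_text, "gpt-4o")
# sends current_file_text as the predicted output so that largely unchanged output can be
# returned with lower latency.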
def text_prompt(prompt: str, model: str) -> PromptResponse:
"""
Send a prompt to OpenAI and get a response.
"""
base_model, reasoning_effort = parse_reasoning_effort(model)
try:
with timeit() as t:
if reasoning_effort:
completion = openai_client.chat.completions.create(
model=base_model,
reasoning_effort=reasoning_effort,
messages=[{"role": "user", "content": prompt}],
)
else:
completion = openai_client.chat.completions.create(
model=base_model,
messages=[{"role": "user", "content": prompt}],
)
print("completion.usage", completion.usage.model_dump())
input_tokens = completion.usage.prompt_tokens
output_tokens = completion.usage.completion_tokens
cost = get_openai_cost(base_model, input_tokens, output_tokens)
return PromptResponse(
response=completion.choices[0].message.content,
runTimeMs=t(),
inputAndOutputCost=cost,
)
except Exception as e:
print(f"OpenAI error: {str(e)}")
return PromptResponse(
response=f"Error: {str(e)}", runTimeMs=0.0, inputAndOutputCost=0.0
)
</file>
<file path="tools.py">
def run_coder_agent(prompt: str) -> str:
"""
Run the coder agent with the given prompt.
Args:
prompt (str): The input prompt for the coder agent
Returns:
str: The response from the coder agent
"""
return "run_coder_agent"
def run_git_agent(prompt: str) -> str:
"""
Run the git agent with the given prompt.
Args:
prompt (str): The input prompt for the git agent
Returns:
str: The response from the git agent
"""
return "run_git_agent"
def run_docs_agent(prompt: str) -> str:
"""
Run the docs agent with the given prompt.
Args:
prompt (str): The input prompt for the docs agent
Returns:
str: The response from the docs agent
"""
return "run_docs_agent"
# Gemini tools list
gemini_tools_list = [
{
"function_declarations": [
{
"name": "run_coder_agent",
"description": "Run the coding agent with the given prompt. Use this when the user needs help writing, reviewing, or modifying code.",
"parameters": {
"type_": "OBJECT",
"properties": {
"prompt": {
"type_": "STRING",
"description": "The input prompt that describes what to code for the coder agent"
}
},
"required": ["prompt"]
}
},
{
"name": "run_git_agent",
"description": "Run the git agent with the given prompt. Use this when the user needs help with git operations, commits, or repository management.",
"parameters": {
"type_": "OBJECT",
"properties": {
"prompt": {
"type_": "STRING",
"description": "The input prompt that describes what to commit for the git agent"
}
},
"required": ["prompt"]
}
},
{
"name": "run_docs_agent",
"description": "Run the documentation agent with the given prompt. Use this when the user needs help creating, updating, or reviewing documentation.",
"parameters": {
"type_": "OBJECT",
"properties": {
"prompt": {
"type_": "STRING",
"description": "The input prompt that describes what to document for the documentation agent"
}
},
"required": ["prompt"]
}
}
]
}
]
# OpenAI tools list
openai_tools_list = [
{
"type": "function",
"function": {
"name": "run_coder_agent",
"description": "Run the coding agent with the given prompt",
"parameters": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The input prompt that describes what to code for the coder agent",
}
},
"required": ["prompt"],
},
},
},
{
"type": "function",
"function": {
"name": "run_git_agent",
"description": "Run the git agent with the given prompt",
"parameters": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The input prompt that describes what to commit for the git agent",
}
},
"required": ["prompt"],
},
},
},
{
"type": "function",
"function": {
"name": "run_docs_agent",
"description": "Run the documentation agent with the given prompt",
"parameters": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The input prompt that describes what to document for the documentation agent",
}
},
"required": ["prompt"],
},
},
},
]
anthropic_tools_list = [
{
"name": "run_coder_agent",
"description": "Run the coding agent with the given prompt",
"input_schema": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The input prompt that describes what to code for the coder agent",
}
},
"required": ["prompt"]
}
},
{
"name": "run_git_agent",
"description": "Run the git agent with the given prompt",
"input_schema": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The input prompt that describes what to commit for the git agent",
}
},
"required": ["prompt"]
}
},
{
"name": "run_docs_agent",
"description": "Run the documentation agent with the given prompt",
"input_schema": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The input prompt that describes what to document for the documentation agent",
}
},
"required": ["prompt"]
}
}
]
all_tools_list = [d["function"]["name"] for d in openai_tools_list]
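# Evaluates to ["run_coder_agent", "run_git_agent", "run_docs_agent"].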
</file>
</files>