"""Evaluation benchmarks for model testing."""
from __future__ import annotations
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


@dataclass
class EvalResult:
"""Result from running an evaluation."""
benchmark_name: str
model_name: str
score: float
max_score: float
metrics: dict[str, float] = field(default_factory=dict)
samples_evaluated: int = 0
errors: int = 0
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
details: list[dict[str, Any]] = field(default_factory=list)


class Benchmark(ABC):
"""Abstract base class for evaluation benchmarks."""
@property
@abstractmethod
def name(self) -> str:
"""Benchmark name."""
...
@property
@abstractmethod
def description(self) -> str:
"""Benchmark description."""
...
@abstractmethod
async def evaluate(
self,
model_path: str,
num_samples: int | None = None,
) -> EvalResult:
"""
Run the benchmark on a model.
Args:
model_path: Path to the model or model name.
num_samples: Optional limit on samples to evaluate.
Returns:
EvalResult with scores and metrics.
"""
...


class CustomEvalBenchmark(Benchmark):
"""
Custom evaluation benchmark from a JSONL file.
    Each line should be a JSON object with:
- "input": The input prompt
- "expected": The expected output or acceptable outputs
- "category": Optional category for breakdown
"""
def __init__(self, eval_file: str, name: str | None = None):
self.eval_file = Path(eval_file)
self._name = name or self.eval_file.stem
@property
def name(self) -> str:
return self._name
@property
def description(self) -> str:
return f"Custom evaluation from {self.eval_file.name}"
async def evaluate(
self,
model_path: str,
num_samples: int | None = None,
) -> EvalResult:
"""Run custom evaluation."""
        # Load JSONL records, honoring the optional num_samples cap.
        samples = []
        with open(self.eval_file, encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    samples.append(json.loads(line))
                    if num_samples and len(samples) >= num_samples:
                        break
        # Model inference is not wired in yet; see the illustrative scoring
        # sketch below this class. Return a placeholder result for now.
return EvalResult(
benchmark_name=self.name,
model_name=model_path,
score=0.0,
max_score=len(samples),
samples_evaluated=len(samples),
metrics={
"accuracy": 0.0,
"samples": len(samples),
},
)
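

# Illustrative sketch only (not used by this module): how CustomEvalBenchmark
# could score loaded samples once model inference is wired in. `generate` is a
# hypothetical async callable `(model_path, prompt) -> str`; it is not part of
# this codebase.
async def _score_samples_sketch(
    generate: Any,
    model_path: str,
    samples: list[dict[str, Any]],
) -> tuple[float, list[dict[str, Any]]]:
    score = 0.0
    details: list[dict[str, Any]] = []
    for sample in samples:
        output = await generate(model_path, sample["input"])
        expected = sample["expected"]
        # "expected" may be a single answer or a list of acceptable answers.
        accepted = expected if isinstance(expected, list) else [expected]
        correct = output.strip() in accepted
        score += 1.0 if correct else 0.0
        details.append({"input": sample["input"], "output": output, "correct": correct})
    return score, details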


class AccuracyBenchmark(Benchmark):
"""
Simple accuracy benchmark for classification/QA tasks.
Compares model outputs to expected answers.
"""
def __init__(
self,
test_data: list[dict[str, Any]],
name: str = "accuracy",
):
self.test_data = test_data
self._name = name
@property
def name(self) -> str:
return self._name
@property
def description(self) -> str:
return "Accuracy evaluation on test data"
async def evaluate(
self,
model_path: str,
num_samples: int | None = None,
) -> EvalResult:
samples = self.test_data
if num_samples:
samples = samples[:num_samples]
# Placeholder - would run inference
return EvalResult(
benchmark_name=self.name,
model_name=model_path,
score=0.0,
max_score=len(samples),
samples_evaluated=len(samples),
)


class PerplexityBenchmark(Benchmark):
"""Measure perplexity on a test set."""
def __init__(self, test_file: str):
self.test_file = Path(test_file)
@property
def name(self) -> str:
return "perplexity"
@property
def description(self) -> str:
return "Perplexity evaluation on held-out data"
async def evaluate(
self,
model_path: str,
num_samples: int | None = None,
) -> EvalResult:
        # Perplexity computation is not wired in yet; see the illustrative
        # sketch below this class for the intended formula.
return EvalResult(
benchmark_name=self.name,
model_name=model_path,
score=0.0, # Lower is better for perplexity
max_score=0.0,
metrics={"perplexity": 0.0},
)
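

# Illustrative sketch only (not used by this module): perplexity as the
# exponential of the mean negative log-likelihood over tokens. The per-token
# log-probabilities (natural log) would come from model inference, which is
# not wired in here.
def _perplexity_from_logprobs_sketch(token_logprobs: list[float]) -> float:
    import math  # local import to keep this sketch self-contained

    if not token_logprobs:
        return float("inf")
    mean_nll = -sum(token_logprobs) / len(token_logprobs)
    return math.exp(mean_nll)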


class EvalSuite:
"""
Collection of benchmarks for comprehensive evaluation.
"""
def __init__(self):
self.benchmarks: list[Benchmark] = []
def add(self, benchmark: Benchmark) -> None:
"""Add a benchmark to the suite."""
self.benchmarks.append(benchmark)
def add_custom(self, eval_file: str, name: str | None = None) -> None:
"""Add a custom evaluation from a file."""
self.add(CustomEvalBenchmark(eval_file, name))
async def run_all(
self,
model_path: str,
num_samples: int | None = None,
) -> list[EvalResult]:
"""Run all benchmarks on a model."""
results = []
for benchmark in self.benchmarks:
result = await benchmark.evaluate(model_path, num_samples)
results.append(result)
return results
async def compare_models(
self,
model_paths: list[str],
num_samples: int | None = None,
) -> dict[str, list[EvalResult]]:
"""Compare multiple models across all benchmarks."""
comparisons = {}
for model_path in model_paths:
results = await self.run_all(model_path, num_samples)
comparisons[model_path] = results
return comparisons
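

# Illustrative usage sketch (file paths and model name are hypothetical):
#
#     import asyncio
#
#     suite = EvalSuite()
#     suite.add_custom("evals/qa.jsonl")
#     suite.add(PerplexityBenchmark("data/heldout.txt"))
#     results = asyncio.run(suite.run_all("my-model"))
#     print(format_eval_results(results))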


def format_eval_results(results: list[EvalResult]) -> str:
"""Format evaluation results as a table."""
lines = ["Evaluation Results", "=" * 60]
for result in results:
lines.append(f"\n{result.benchmark_name}")
lines.append("-" * 40)
lines.append(f"Model: {result.model_name}")
lines.append(f"Score: {result.score:.2f} / {result.max_score:.2f}")
lines.append(f"Samples: {result.samples_evaluated} (errors: {result.errors})")
if result.metrics:
lines.append("Metrics:")
for key, value in result.metrics.items():
lines.append(f" {key}: {value:.4f}")
return "\n".join(lines)