#!/usr/bin/env python3
"""
Model Evaluation Script
Runs a fine-tuned model (local checkpoint or Together.ai API) on held-out test
prompts, scores the outputs with the eval framework, and compares the scores
against a saved baseline.
Usage:
# Local model
python eval_model.py --model_path training/output --samples 100
# Together.ai API
python eval_model.py --together --model Qwen/Qwen2.5-72B-Instruct-Turbo --samples 100
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
# Add this script's directory to sys.path so the local eval package is importable
sys.path.insert(0, str(Path(__file__).parent))
from eval.constraint_satisfaction import evaluate_batch as eval_constraints
from eval.diversity_metrics import evaluate_batch as eval_diversity, evaluate_cross_batch_diversity, parse_domain_names
from eval.pronounceability import evaluate_batch as eval_pronounceability
from eval.premium_score import evaluate_batch as eval_premium
def load_test_prompts(test_file: str, num_samples: int = 100) -> list[dict]:
"""Load test prompts from JSONL file."""
    prompts = []
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            if len(prompts) >= num_samples:
                break
            line = line.strip()
            if not line:
                continue  # skip blank lines (common at the end of JSONL files)
            data = json.loads(line)
prompts.append({
"prompt": data["prompt"],
"expected": data["response"], # Original response for reference
"meta": data.get("meta", {}),
})
return prompts
def generate_with_local_model(model_path: str, prompts: list[dict], max_new_tokens: int = 512) -> list[dict]:
"""Generate responses using local fine-tuned model."""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
print(f"Loading model from {model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
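    # Some chat tokenizers ship without a pad token; fall back to EOS so
    # generate() has a valid pad_token_id.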
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
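    # device_map="auto" places layers across available GPUs/CPU; bfloat16 assumes
    # Ampere-or-newer hardware (swap to float16 if that assumption does not hold).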
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map="auto",
torch_dtype=torch.bfloat16,
)
results = []
for i, item in enumerate(prompts):
if (i + 1) % 10 == 0:
print(f" Generating {i + 1}/{len(prompts)}...")
messages = [{"role": "user", "content": item["prompt"]}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
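        # Sampling parameters (temperature 0.7, top_p 0.9) are assumed defaults for
        # this eval; adjust them to match whatever the baseline run used.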
with torch.no_grad():
outputs = model.generate(
**inputs,
                max_new_tokens=max_new_tokens,
temperature=0.7,
top_p=0.9,
do_sample=True,
pad_token_id=tokenizer.pad_token_id,
)
        # outputs[0] contains the prompt tokens followed by the generated tokens.
        # skip_special_tokens strips the <|im_start|>/<|im_end|> markers, so splitting
        # on those strings would never match; slice off the prompt by token count instead.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
results.append({
"prompt": item["prompt"],
"response": response,
"meta": item.get("meta", {}),
})
return results
def generate_with_together(model: str, prompts: list[dict], api_key: str | None = None) -> list[dict]:
"""Generate responses using Together.ai API."""
import requests
api_key = api_key or os.environ.get("TOGETHER_API_KEY")
if not api_key:
raise ValueError("TOGETHER_API_KEY not set")
results = []
for i, item in enumerate(prompts):
if (i + 1) % 10 == 0:
print(f" Generating {i + 1}/{len(prompts)}...")
response = requests.post(
"https://api.together.ai/v1/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
json={
"model": model,
"messages": [{"role": "user", "content": item["prompt"]}],
"temperature": 0.7,
"max_tokens": 512,
},
timeout=60,
)
response.raise_for_status()
data = response.json()
results.append({
"prompt": item["prompt"],
"response": data["choices"][0]["message"]["content"],
"meta": item.get("meta", {}),
})
return results
def evaluate_results(samples: list[dict], name: str) -> dict:
"""Run evaluation on generated samples."""
print(f"\n{'='*60}")
print(f"Evaluating: {name} ({len(samples)} samples)")
print('='*60)
results = {
"name": name,
"num_samples": len(samples),
"timestamp": datetime.now().isoformat(),
}
# Run all evaluations
print("\n[1/4] Constraint satisfaction...")
constraint_results = eval_constraints(samples)
results["constraints"] = constraint_results
print(f" Score: {constraint_results['avg_overall']:.3f}")
print("\n[2/4] Diversity...")
diversity_results = eval_diversity(samples)
results["diversity"] = diversity_results
print(f" Score: {diversity_results['avg_overall']:.3f}")
    # Cross-batch diversity: pool every generated name and check how often the model repeats itself across prompts
all_names = []
for sample in samples:
all_names.extend(parse_domain_names(sample['response']))
cross_div = evaluate_cross_batch_diversity(all_names)
results["cross_batch_diversity"] = {
"total_names": cross_div["total_names"],
"unique_names": cross_div["unique_names"],
"unique_ratio": cross_div["unique_ratio"],
}
print(f" Cross-batch unique: {cross_div['unique_ratio']:.3f}")
print("\n[3/4] Pronounceability...")
pronounce_results = eval_pronounceability(samples)
results["pronounceability"] = pronounce_results
print(f" Score: {pronounce_results['avg_overall']:.3f}")
print("\n[4/4] Brandability...")
premium_results = eval_premium(samples)
results["premium"] = premium_results
print(f" Score: {premium_results['avg_overall']:.3f}")
    # Combined score: weighted average (constraints 30%, diversity 20%, pronounceability 25%, brandability 25%)
combined = (
constraint_results['avg_overall'] * 0.30 +
diversity_results['avg_overall'] * 0.20 +
pronounce_results['avg_overall'] * 0.25 +
premium_results['avg_overall'] * 0.25
)
results["combined_score"] = round(combined, 3)
print(f"\n{'='*60}")
print("SUMMARY")
print('='*60)
print(f" Constraint Satisfaction: {constraint_results['avg_overall']:.3f}")
print(f" Diversity: {diversity_results['avg_overall']:.3f}")
print(f" Pronounceability: {pronounce_results['avg_overall']:.3f}")
print(f" Brandability: {premium_results['avg_overall']:.3f}")
print(f" ---")
print(f" COMBINED SCORE: {combined:.3f} ({combined*10:.1f}/10)")
print('='*60)
return results
def compare_with_baseline(results: dict, baseline_file: str):
"""Compare results with baseline."""
if not os.path.exists(baseline_file):
print(f"\nNo baseline found at {baseline_file}")
return
with open(baseline_file, 'r') as f:
baseline = json.load(f)
print(f"\n{'='*60}")
print(f"COMPARISON: Baseline vs {results['name']}")
print('='*60)
metrics = [
("Constraint Satisfaction", "constraints", "avg_overall"),
("Diversity", "diversity", "avg_overall"),
("Pronounceability", "pronounceability", "avg_overall"),
("Brandability", "premium", "avg_overall"),
("Combined Score", None, "combined_score"),
]
for name, category, key in metrics:
if category:
v1 = baseline.get(category, {}).get(key, 0)
v2 = results.get(category, {}).get(key, 0)
else:
v1 = baseline.get(key, 0)
v2 = results.get(key, 0)
diff = v2 - v1
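        # ANSI escapes: green marks an improvement, red a regression, plain no change.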
arrow = "↑" if diff > 0.001 else "↓" if diff < -0.001 else "="
color = "\033[92m" if diff > 0.001 else "\033[91m" if diff < -0.001 else ""
reset = "\033[0m" if color else ""
print(f" {name:25s}: {v1:.3f} → {color}{v2:.3f}{reset} ({arrow} {abs(diff):.3f})")
print('='*60)
def main():
parser = argparse.ArgumentParser(description="Evaluate model on test prompts")
parser.add_argument("--model_path", help="Path to local fine-tuned model")
parser.add_argument("--together", action="store_true", help="Use Together.ai API")
parser.add_argument("--model", default="Qwen/Qwen2.5-72B-Instruct-Turbo", help="Together.ai model name")
parser.add_argument("--test_file", default="data/test.jsonl", help="Test prompts file")
parser.add_argument("--samples", type=int, default=100, help="Number of samples to evaluate")
parser.add_argument("--output", help="Output file for results")
parser.add_argument("--baseline", default="results/baseline_dataset_quality.json", help="Baseline results file")
args = parser.parse_args()
# Load test prompts
print(f"Loading {args.samples} test prompts from {args.test_file}...")
prompts = load_test_prompts(args.test_file, args.samples)
print(f"Loaded {len(prompts)} prompts")
# Generate responses
if args.together:
print(f"\nGenerating with Together.ai ({args.model})...")
samples = generate_with_together(args.model, prompts)
model_name = args.model.replace("/", "_")
elif args.model_path:
print(f"\nGenerating with local model ({args.model_path})...")
samples = generate_with_local_model(args.model_path, prompts)
model_name = Path(args.model_path).name
else:
print("Error: Specify --model_path or --together")
sys.exit(1)
# Evaluate
results = evaluate_results(samples, model_name)
# Compare with baseline
compare_with_baseline(results, args.baseline)
# Save results
    output_file = args.output or f"results/{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    out_dir = os.path.dirname(output_file)
    if out_dir:  # --output may be a bare filename with no directory component
        os.makedirs(out_dir, exist_ok=True)
with open(output_file, 'w') as f:
json.dump(results, f, indent=2)
print(f"\nResults saved to: {output_file}")
if __name__ == "__main__":
main()