Skip to main content
Glama

MemOS-MCP

by qinshu1109
Apache 2.0
3
  • Linux
  • Apple
test_locomo_evaluation.py11.3 kB
#!/usr/bin/env python3
"""
Tests for the LoCoMo evaluation script.

Smoke-tests the basic operation of the LoCoMo evaluation functionality:
dataset creation, memory population, per-sample and full evaluation,
metric calculation, report saving, and error handling.
"""
import os
import sys
import tempfile
import shutil  # NOTE(review): imported but unused in this file — possibly a leftover; confirm before removing
from pathlib import Path

# Add the project root directory to the Python path so that
# `locomo_evaluation` (expected to live next to this file) is importable.
sys.path.insert(0, str(Path(__file__).parent))


def test_basic_evaluation():
    """Exercise the end-to-end evaluation pipeline on a tiny dataset.

    Returns True if every step (dataset creation, memory population,
    single-sample evaluation, full evaluation, report saving) succeeds,
    False otherwise. Any exception is caught, printed, and reported as
    a failure rather than propagated.
    """
    print("🧪 测试基础评测功能")
    print("-" * 30)

    try:
        # Imported lazily so a missing module is reported as a test
        # failure instead of breaking the whole script at import time.
        from locomo_evaluation import LoCoMoEvaluator

        # Use a temporary directory so the test leaves no state behind.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"使用临时目录: {temp_dir}")

            # Create the evaluator.
            evaluator = LoCoMoEvaluator(
                data_dir=temp_dir,
                use_enhanced=True,
                verbose=True
            )

            try:
                # 1. Dataset creation.
                print("\n1. 测试数据集创建...")
                dataset = evaluator.create_test_dataset(size=5)

                if len(dataset) == 5:
                    print("✅ 数据集创建成功")

                    # Show one sample for manual inspection.
                    sample = dataset[0]
                    print(f" 样本示例: {sample['question']}")
                    print(f" 正确答案: {sample['correct_answer'][:50]}...")
                else:
                    print("❌ 数据集创建失败")
                    return False

                # 2. Memory population.
                print("\n2. 测试记忆填充...")
                success = evaluator.populate_memory(dataset)

                if success:
                    print("✅ 记忆填充成功")
                else:
                    print("❌ 记忆填充失败")
                    return False

                # 3. Single-sample evaluation.
                print("\n3. 测试单个样本评测...")
                sample = dataset[0]
                result = evaluator.evaluate_sample(sample)

                if result and 'accuracy' in result:
                    print("✅ 样本评测成功")
                    print(f" 准确率: {result['accuracy']:.3f}")
                    print(f" 响应时间: {result['response_time']:.3f}秒")
                    print(f" 结果数量: {result['num_results']}")
                else:
                    print("❌ 样本评测失败")
                    return False

                # 4. Full evaluation run (small scale to keep the test fast).
                print("\n4. 测试完整评测...")
                report = evaluator.run_evaluation(dataset_size=3)

                if report and 'summary_metrics' in report:
                    print("✅ 完整评测成功")

                    # Show the summary metrics; individual keys may be
                    # absent depending on which metrics were computed.
                    summary = report['summary_metrics']
                    if 'accuracy_mean' in summary:
                        print(f" 平均准确率: {summary['accuracy_mean']:.3f}")
                    if 'response_time_mean' in summary:
                        print(f" 平均响应时间: {summary['response_time_mean']:.3f}秒")
                else:
                    print("❌ 完整评测失败")
                    return False

                # 5. Report saving.
                print("\n5. 测试报告保存...")
                report_file = evaluator.save_report(report)

                if os.path.exists(report_file):
                    print("✅ 报告保存成功")
                    print(f" 报告文件: {report_file}")
                else:
                    print("❌ 报告保存失败")
                    return False

                print("\n✅ 所有基础功能测试通过")
                return True

            finally:
                # Always release evaluator resources, even on failure.
                evaluator.close()

    except Exception as e:
        print(f"❌ 基础评测测试失败: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_metrics_calculation():
    """Check the evaluator's metric helpers on fixed sample texts.

    Verifies that accuracy and Recall@K stay within [0, 1]; ROUGE and
    BERTScore results only produce a warning when malformed, since those
    may depend on optional dependencies.
    """
    print("\n🧮 测试评测指标计算")
    print("-" * 30)

    try:
        from locomo_evaluation import LoCoMoEvaluator

        # A temporary evaluator just to reach the metric helpers.
        with tempfile.TemporaryDirectory() as temp_dir:
            evaluator = LoCoMoEvaluator(temp_dir, verbose=False)

            try:
                # Accuracy: must be a valid fraction in [0, 1].
                correct = "Python是一种高级编程语言"
                predicted = "Python是编程语言,具有高级特性"

                accuracy = evaluator._calculate_accuracy(correct, predicted)
                print(f"准确率计算: {accuracy:.3f}")

                if 0.0 <= accuracy <= 1.0:
                    print("✅ 准确率计算正常")
                else:
                    print("❌ 准确率计算异常")
                    return False

                # ROUGE scores: warn (don't fail) on unexpected shape,
                # because the backing dependency may be missing.
                rouge_scores = evaluator._calculate_rouge_scores(correct, predicted)
                print(f"ROUGE分数: {rouge_scores}")

                if isinstance(rouge_scores, dict) and 'rouge_1' in rouge_scores:
                    print("✅ ROUGE分数计算正常")
                else:
                    print("⚠️ ROUGE分数计算可能有问题(可能是依赖缺失)")

                # BERTScore: same warn-only policy as ROUGE.
                bert_score = evaluator._calculate_bert_score(correct, predicted)
                print(f"BERTScore: {bert_score:.3f}")

                if isinstance(bert_score, (int, float)):
                    print("✅ BERTScore计算正常")
                else:
                    print("⚠️ BERTScore计算可能有问题(可能是依赖缺失)")

                # Recall@K over a small fixed retrieval result list.
                results = [
                    {"content": "Python是编程语言"},
                    {"content": "Java也是编程语言"},
                    {"content": "机器学习很重要"}
                ]

                recall = evaluator._calculate_recall_at_k(correct, results)
                print(f"Recall@K: {recall:.3f}")

                if 0.0 <= recall <= 1.0:
                    print("✅ Recall@K计算正常")
                else:
                    print("❌ Recall@K计算异常")
                    return False

                print("✅ 评测指标计算测试通过")
                return True

            finally:
                evaluator.close()

    except Exception as e:
        print(f"❌ 指标计算测试失败: {e}")
        return False


def test_dataset_creation():
    """Verify dataset creation for several sizes and the sample schema.

    Creates datasets of sizes 1, 5, and 10, then checks that the last
    dataset's samples contain all required fields.
    """
    print("\n📝 测试数据集创建")
    print("-" * 30)

    try:
        from locomo_evaluation import LoCoMoEvaluator

        with tempfile.TemporaryDirectory() as temp_dir:
            evaluator = LoCoMoEvaluator(temp_dir, verbose=False)

            try:
                # Datasets of several sizes; each must have exactly
                # the requested number of samples.
                sizes = [1, 5, 10]
                for size in sizes:
                    dataset = evaluator.create_test_dataset(size)

                    if len(dataset) == size:
                        print(f"✅ 大小为{size}的数据集创建成功")
                    else:
                        print(f"❌ 大小为{size}的数据集创建失败")
                        return False

                # Validate the sample schema on the last (largest)
                # dataset created by the loop above.
                if dataset:
                    sample = dataset[0]
                    required_fields = ['id', 'topic', 'question', 'correct_answer', 'context_facts']

                    for field in required_fields:
                        if field not in sample:
                            print(f"❌ 数据集缺少字段: {field}")
                            return False

                    print(f" 样本字段完整: {list(sample.keys())}")

                print("✅ 数据集创建测试通过")
                return True

            finally:
                evaluator.close()

    except Exception as e:
        print(f"❌ 数据集创建测试失败: {e}")
        return False


def test_error_handling():
    """Probe the evaluator's behavior on bad input.

    An invalid data directory may either raise (acceptable) or be handled
    gracefully (also acceptable); an empty dataset is expected to make
    populate_memory report failure.
    """
    print("\n🛡️ 测试错误处理")
    print("-" * 30)

    try:
        from locomo_evaluation import LoCoMoEvaluator

        # Invalid directory: both raising and degrading gracefully
        # count as correct behavior.
        try:
            evaluator = LoCoMoEvaluator("/invalid/path", verbose=False)
            # No exception raised — the evaluator has a fallback path.
            print("✅ 无效路径处理正常")
            evaluator.close()
        except Exception:
            print("✅ 无效路径正确抛出异常")

        # Empty dataset: populate_memory is expected to return falsy.
        with tempfile.TemporaryDirectory() as temp_dir:
            evaluator = LoCoMoEvaluator(temp_dir, verbose=False)

            try:
                empty_dataset = []
                success = evaluator.populate_memory(empty_dataset)

                if not success:
                    print("✅ 空数据集处理正常")
                else:
                    print("⚠️ 空数据集处理可能有问题")

                print("✅ 错误处理测试通过")
                return True

            finally:
                evaluator.close()

    except Exception as e:
        print(f"❌ 错误处理测试失败: {e}")
        return False


def main():
    """Run every test, print a summary, and return overall success.

    Returns True only when all tests pass. Exceptions escaping a test
    function are reported and counted as failures.
    """
    print("🧠 LoCoMo评测脚本测试")
    print("=" * 50)

    # (display name, test callable) pairs, run in order.
    tests = [
        ("数据集创建", test_dataset_creation),
        ("评测指标计算", test_metrics_calculation),
        ("基础评测功能", test_basic_evaluation),
        ("错误处理", test_error_handling)
    ]

    passed = 0
    total = len(tests)

    for test_name, test_func in tests:
        try:
            if test_func():
                passed += 1
                print(f"✅ {test_name} 测试通过")
            else:
                print(f"❌ {test_name} 测试失败")
        except Exception as e:
            # A crashing test is reported but does not stop the run.
            print(f"💥 {test_name} 测试异常: {e}")

    # Summarize the results.
    print("\n📊 测试结果汇总")
    print("=" * 50)
    print(f"总测试数: {total}")
    print(f"通过数: {passed}")
    print(f"失败数: {total - passed}")
    print(f"通过率: {passed/total*100:.1f}%")

    if passed == total:
        print("\n🎉 所有测试通过!LoCoMo评测功能正常")
        return True
    else:
        print(f"\n⚠️ {total - passed} 个测试失败,请检查相关功能")
        return False


if __name__ == "__main__":
    try:
        success = main()
        # Conventional exit codes: 0 on success, 1 on failure.
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n🛑 测试被中断")
        sys.exit(1)

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/qinshu1109/memos-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.