# test_locomo_evaluation.py (11.3 kB)
#!/usr/bin/env python3
"""
LoCoMo评测脚本测试
测试LoCoMo评测功能的基本运行
"""
import os
import sys
import tempfile
import shutil
from pathlib import Path
# 添加项目根目录到Python路径
sys.path.insert(0, str(Path(__file__).parent))
def test_basic_evaluation():
    """Run the end-to-end evaluation workflow against a temporary data dir.

    Steps, in order: build a 5-sample dataset, populate memory with it,
    evaluate the first sample, run a small full evaluation (3 samples) and
    save the resulting report. The first failing step aborts the run.

    Returns:
        bool: True when every step succeeds; False when a step reports
        failure or any exception is raised (the traceback is printed).
    """
    print("🧪 测试基础评测功能")
    print("-" * 30)

    try:
        from locomo_evaluation import LoCoMoEvaluator

        # Everything the evaluator writes lives in a throwaway directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"使用临时目录: {temp_dir}")

            evaluator = LoCoMoEvaluator(
                data_dir=temp_dir,
                use_enhanced=True,
                verbose=True
            )

            try:
                # Step 1: dataset creation.
                print("\n1. 测试数据集创建...")
                dataset = evaluator.create_test_dataset(size=5)
                if len(dataset) != 5:
                    print("❌ 数据集创建失败")
                    return False
                print("✅ 数据集创建成功")
                first = dataset[0]
                print(f" 样本示例: {first['question']}")
                print(f" 正确答案: {first['correct_answer'][:50]}...")

                # Step 2: memory population.
                print("\n2. 测试记忆填充...")
                if not evaluator.populate_memory(dataset):
                    print("❌ 记忆填充失败")
                    return False
                print("✅ 记忆填充成功")

                # Step 3: single-sample evaluation.
                print("\n3. 测试单个样本评测...")
                result = evaluator.evaluate_sample(dataset[0])
                if not result or 'accuracy' not in result:
                    print("❌ 样本评测失败")
                    return False
                print("✅ 样本评测成功")
                print(f" 准确率: {result['accuracy']:.3f}")
                print(f" 响应时间: {result['response_time']:.3f}秒")
                print(f" 结果数量: {result['num_results']}")

                # Step 4: full evaluation on a small dataset.
                print("\n4. 测试完整评测...")
                report = evaluator.run_evaluation(dataset_size=3)
                if not report or 'summary_metrics' not in report:
                    print("❌ 完整评测失败")
                    return False
                print("✅ 完整评测成功")
                summary = report['summary_metrics']
                # Summary keys are optional; print only the ones present.
                if 'accuracy_mean' in summary:
                    print(f" 平均准确率: {summary['accuracy_mean']:.3f}")
                if 'response_time_mean' in summary:
                    print(f" 平均响应时间: {summary['response_time_mean']:.3f}秒")

                # Step 5: report persistence.
                print("\n5. 测试报告保存...")
                report_file = evaluator.save_report(report)
                if not os.path.exists(report_file):
                    print("❌ 报告保存失败")
                    return False
                print("✅ 报告保存成功")
                print(f" 报告文件: {report_file}")

                print("\n✅ 所有基础功能测试通过")
                return True
            finally:
                # Release the evaluator on every exit path, including the
                # early returns above.
                evaluator.close()

    except Exception as exc:
        print(f"❌ 基础评测测试失败: {exc}")
        import traceback
        traceback.print_exc()
        return False
def test_metrics_calculation():
    """Exercise the evaluator's private metric helpers on a fixed text pair.

    Accuracy and Recall@K must fall within [0.0, 1.0]; a value outside that
    range fails the test. ROUGE and BERTScore are warning-only checks: an
    unexpected result prints a warning but does not fail the test, because
    those metrics may depend on optional packages.

    Returns:
        bool: True when the mandatory checks pass; False when a mandatory
        check fails or any exception is raised (the traceback is printed).
    """
    print("\n🧮 测试评测指标计算")
    print("-" * 30)

    try:
        from locomo_evaluation import LoCoMoEvaluator

        # Temporary evaluator; its data directory is discarded afterwards.
        with tempfile.TemporaryDirectory() as temp_dir:
            evaluator = LoCoMoEvaluator(temp_dir, verbose=False)

            try:
                correct = "Python是一种高级编程语言"
                predicted = "Python是编程语言,具有高级特性"

                # Accuracy (mandatory, must be within [0, 1]).
                accuracy = evaluator._calculate_accuracy(correct, predicted)
                print(f"准确率计算: {accuracy:.3f}")
                if 0.0 <= accuracy <= 1.0:
                    print("✅ 准确率计算正常")
                else:
                    print("❌ 准确率计算异常")
                    return False

                # ROUGE (warning-only).
                rouge_scores = evaluator._calculate_rouge_scores(correct, predicted)
                print(f"ROUGE分数: {rouge_scores}")
                if isinstance(rouge_scores, dict) and 'rouge_1' in rouge_scores:
                    print("✅ ROUGE分数计算正常")
                else:
                    print("⚠️ ROUGE分数计算可能有问题(可能是依赖缺失)")

                # BERTScore (warning-only). Check the type BEFORE applying
                # the numeric format spec: formatting a non-number such as
                # None with ':.3f' raises TypeError, which previously made
                # the warning branch unreachable and failed the whole test.
                bert_score = evaluator._calculate_bert_score(correct, predicted)
                if isinstance(bert_score, (int, float)):
                    print(f"BERTScore: {bert_score:.3f}")
                    print("✅ BERTScore计算正常")
                else:
                    print(f"BERTScore: {bert_score}")
                    print("⚠️ BERTScore计算可能有问题(可能是依赖缺失)")

                # Recall@K (mandatory, must be within [0, 1]).
                results = [
                    {"content": "Python是编程语言"},
                    {"content": "Java也是编程语言"},
                    {"content": "机器学习很重要"}
                ]
                recall = evaluator._calculate_recall_at_k(correct, results)
                print(f"Recall@K: {recall:.3f}")
                if 0.0 <= recall <= 1.0:
                    print("✅ Recall@K计算正常")
                else:
                    print("❌ Recall@K计算异常")
                    return False

                print("✅ 评测指标计算测试通过")
                return True
            finally:
                # Release the evaluator on every exit path.
                evaluator.close()

    except Exception as e:
        print(f"❌ 指标计算测试失败: {e}")
        # Show where the failure happened, as test_basic_evaluation does.
        import traceback
        traceback.print_exc()
        return False
def test_dataset_creation():
    """Check that test datasets have the requested size and sample schema.

    Datasets of size 1, 5 and 10 are generated. Each must contain exactly
    the requested number of samples, and every sample (not only the first)
    must provide the keys 'id', 'topic', 'question', 'correct_answer' and
    'context_facts'.

    Returns:
        bool: True when all datasets pass; False on a size mismatch, a
        missing field, or any exception (the traceback is printed).
    """
    print("\n📝 测试数据集创建")
    print("-" * 30)

    required_fields = ['id', 'topic', 'question', 'correct_answer', 'context_facts']

    try:
        from locomo_evaluation import LoCoMoEvaluator

        with tempfile.TemporaryDirectory() as temp_dir:
            evaluator = LoCoMoEvaluator(temp_dir, verbose=False)

            try:
                for size in (1, 5, 10):
                    dataset = evaluator.create_test_dataset(size)

                    if len(dataset) != size:
                        print(f"❌ 大小为{size}的数据集创建失败")
                        return False
                    print(f"✅ 大小为{size}的数据集创建成功")

                    # Validate the schema of every sample so that a
                    # malformed later entry cannot slip through.
                    for sample in dataset:
                        for field in required_fields:
                            if field not in sample:
                                print(f"❌ 数据集缺少字段: {field}")
                                return False

                    if dataset:
                        print(f" 样本字段完整: {list(dataset[0].keys())}")

                print("✅ 数据集创建测试通过")
                return True
            finally:
                # Release the evaluator on every exit path.
                evaluator.close()

    except Exception as e:
        print(f"❌ 数据集创建测试失败: {e}")
        # Show where the failure happened, as test_basic_evaluation does.
        import traceback
        traceback.print_exc()
        return False
def test_error_handling():
    """Probe the evaluator with an invalid data path and an empty dataset.

    Both probes are informational: constructing the evaluator on
    "/invalid/path" is accepted whether it raises or not, and a truthy
    result from populating memory with an empty dataset only prints a
    warning.

    Returns:
        bool: True unless an exception escapes the probes, in which case
        the error is printed and False is returned.
    """
    print("\n🛡️ 测试错误处理")
    print("-" * 30)

    try:
        from locomo_evaluation import LoCoMoEvaluator

        # Probe 1: invalid directory. Either outcome is acceptable: no
        # exception means the evaluator degraded gracefully.
        try:
            probe = LoCoMoEvaluator("/invalid/path", verbose=False)
            print("✅ 无效路径处理正常")
            probe.close()
        except Exception:
            print("✅ 无效路径正确抛出异常")

        # Probe 2: empty dataset.
        with tempfile.TemporaryDirectory() as temp_dir:
            evaluator = LoCoMoEvaluator(temp_dir, verbose=False)

            try:
                populated = evaluator.populate_memory([])
                print(
                    "⚠️ 空数据集处理可能有问题"
                    if populated
                    else "✅ 空数据集处理正常"
                )

                print("✅ 错误处理测试通过")
                return True
            finally:
                # Release the evaluator on every exit path.
                evaluator.close()

    except Exception as exc:
        print(f"❌ 错误处理测试失败: {exc}")
        return False
def main():
    """Run every test function in order and print a pass/fail summary.

    A test counts as passed only when it returns a truthy value; a test
    that raises is reported and counted as failed.

    Returns:
        bool: True when all tests passed, False otherwise.
    """
    print("🧠 LoCoMo评测脚本测试")
    print("=" * 50)

    suite = [
        ("数据集创建", test_dataset_creation),
        ("评测指标计算", test_metrics_calculation),
        ("基础评测功能", test_basic_evaluation),
        ("错误处理", test_error_handling)
    ]

    total = len(suite)
    passed = 0

    for label, runner in suite:
        try:
            if not runner():
                print(f"❌ {label} 测试失败")
            else:
                passed += 1
                print(f"✅ {label} 测试通过")
        except Exception as exc:
            print(f"💥 {label} 测试异常: {exc}")

    failed = total - passed

    # Summary.
    print("\n📊 测试结果汇总")
    print("=" * 50)
    print(f"总测试数: {total}")
    print(f"通过数: {passed}")
    print(f"失败数: {failed}")
    print(f"通过率: {passed/total*100:.1f}%")

    if failed:
        print(f"\n⚠️ {failed} 个测试失败,请检查相关功能")
        return False

    print("\n🎉 所有测试通过!LoCoMo评测功能正常")
    return True
if __name__ == "__main__":
    # Exit status: 0 when every test passed, 1 on any failure or on Ctrl-C.
    try:
        exit_code = 0 if main() else 1
    except KeyboardInterrupt:
        print("\n🛑 测试被中断")
        exit_code = 1
    sys.exit(exit_code)