# simple_locomo_demo.py (10.5 kB)
#!/usr/bin/env python3
"""
简化的LoCoMo评测演示
演示LoCoMo评测的基本概念和流程,不依赖完整的MemOS环境
"""
import json
import random
import re
import time
from datetime import datetime
from typing import Any, Dict, List, Optional
class SimpleMemorySystem:
    """Minimal in-memory stand-in for a memory store.

    Memories are plain dicts appended to ``self.memories`` in insertion
    order. Retrieval is purely lexical: the query and each stored text are
    tokenized and ranked by Jaccard similarity of their token sets.
    """

    # Han ideographs are emitted one character per token because Chinese text
    # has no spaces between words; any other run of word characters (Latin
    # letters, digits, underscore) stays together as a single token.
    # Punctuation and whitespace never become tokens.
    _TOKEN_RE = re.compile(
        r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"
        r"|[^\W\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]+"
    )

    def __init__(self):
        self.memories = []
        self.memory_id_counter = 0

    @classmethod
    def _tokenize(cls, text: str) -> set:
        """Return the set of lower-cased tokens found in ``text``."""
        return set(cls._TOKEN_RE.findall(text.lower()))

    def add_memory(
        self,
        content: str,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict] = None,
    ) -> bool:
        """Store one memory and return ``True``.

        Args:
            content: Text of the memory.
            tags: Optional labels; stored as a copy so later changes to the
                caller's list do not leak into the stored memory.
            metadata: Optional extra fields; stored as a shallow copy for the
                same reason.

        Returns:
            Always ``True``.
        """
        memory = {
            "id": self.memory_id_counter,
            "content": content,
            "tags": list(tags) if tags else [],
            "metadata": dict(metadata) if metadata else {},
            "timestamp": datetime.now().isoformat(),
        }
        self.memories.append(memory)
        self.memory_id_counter += 1
        return True

    def query_memories(self, query: str, limit: int = 5) -> List[Dict]:
        """Return up to ``limit`` memories ranked by token overlap with ``query``.

        Args:
            query: Free-text query.
            limit: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            Copies of the matching memory dicts with an added ``"score"`` key
            (Jaccard similarity in (0, 1]), best match first. Memories that
            share no token with the query are omitted. Ties keep insertion
            order because the sort is stable.
        """
        query_tokens = self._tokenize(query)
        if limit <= 0 or not query_tokens:
            return []

        results = []
        for memory in self.memories:
            content_tokens = self._tokenize(memory["content"])
            shared = query_tokens & content_tokens
            if not shared:
                continue
            # The union cannot be empty here because ``shared`` is non-empty.
            score = len(shared) / len(query_tokens | content_tokens)
            results.append({**memory, "score": score})

        results.sort(key=lambda item: item["score"], reverse=True)
        return results[:limit]
class SimpleLoCoMoEvaluator:
    """Toy LoCoMo-style evaluator that runs against a simple memory system.

    It builds a small fixed knowledge base, loads the facts into the memory
    system, asks templated questions and scores the top retrieved memory
    against a reference fact by token overlap.
    """

    # Han ideographs are emitted one character per token because Chinese text
    # has no spaces between words; any other run of word characters (Latin
    # letters, digits, underscore) stays together as a single token.
    _TOKEN_RE = re.compile(
        r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"
        r"|[^\W\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]+"
    )

    def __init__(self, seed: Optional[int] = None):
        """Create the memory system and the test samples.

        Args:
            seed: Optional seed for choosing reference answers. ``None``
                (the default) draws from the global ``random`` module.
        """
        self.memory_system = SimpleMemorySystem()
        self.test_data = self._create_test_data(seed)

    @classmethod
    def _tokenize(cls, text: str) -> set:
        """Return the set of lower-cased tokens found in ``text``."""
        return set(cls._TOKEN_RE.findall(text.lower()))

    def _create_test_data(self, seed: Optional[int] = None) -> List[Dict]:
        """Build the test samples: three questions for each of three topics.

        Args:
            seed: When given, reference answers are drawn from a private
                ``random.Random(seed)`` so the samples are reproducible.
                When ``None``, the global ``random`` module is used.

        Returns:
            A list of sample dicts with keys ``id``, ``topic``, ``question``,
            ``correct_answer`` (one fact of the topic, chosen at random) and
            ``context_facts`` (all facts of the topic).
        """
        rng = random if seed is None else random.Random(seed)
        knowledge_base = [
            {
                "topic": "Python编程",
                "facts": [
                    "Python是一种高级编程语言,由Guido van Rossum创建",
                    "Python支持面向对象、函数式和过程式编程范式",
                    "Python的语法简洁明了,易于学习和使用",
                    "Python有丰富的标准库和第三方包生态系统",
                ],
            },
            {
                "topic": "机器学习",
                "facts": [
                    "机器学习是人工智能的一个重要分支",
                    "监督学习需要标注数据来训练模型",
                    "无监督学习从未标注数据中发现模式",
                    "深度学习使用神经网络进行复杂模式识别",
                ],
            },
            {
                "topic": "数据结构",
                "facts": [
                    "数组是最基本的数据结构之一",
                    "链表允许动态内存分配",
                    "栈遵循后进先出(LIFO)原则",
                    "队列遵循先进先出(FIFO)原则",
                ],
            },
        ]

        test_samples = []
        for i, topic_data in enumerate(knowledge_base):
            topic = topic_data["topic"]
            facts = topic_data["facts"]
            # Three templated questions per topic.
            questions = [
                f"什么是{topic}?",
                f"请介绍{topic}的特点",
                f"关于{topic},你知道什么?",
            ]
            for j, question in enumerate(questions):
                test_samples.append({
                    "id": f"test_{i}_{j}",
                    "topic": topic,
                    "question": question,
                    "correct_answer": rng.choice(facts),
                    "context_facts": facts,
                })
        return test_samples

    def populate_memory(self):
        """Load every distinct (fact, topic) pair of the test data into memory.

        Pairs are added in first-seen order so memory ids are the same on
        every run. No de-duplication is done against what the memory system
        already holds, so each call adds the facts again.
        """
        print("📚 填充记忆系统...")
        # A dict keeps insertion order; iterating a set of strings would give
        # a different order from run to run under hash randomization.
        all_facts = {}
        for sample in self.test_data:
            for fact in sample["context_facts"]:
                all_facts[(fact, sample["topic"])] = None

        for fact, topic in all_facts:
            self.memory_system.add_memory(
                fact,
                tags=[topic, "知识库"],
                metadata={"topic": topic, "source": "test_data"},
            )
        print(f"✅ 已添加 {len(all_facts)} 条知识到记忆系统")

    def evaluate_sample(self, sample: Dict) -> Dict:
        """Query the memory system for one sample and score the top hit.

        Args:
            sample: A dict with at least ``id``, ``question`` and
                ``correct_answer``.

        Returns:
            A dict with ``sample_id``, ``question``, ``correct_answer``,
            ``predicted_answer`` (content of the best result, or ``""`` when
            nothing was retrieved), ``accuracy``, ``response_time`` in
            seconds and ``num_results``.
        """
        question = sample["question"]
        correct_answer = sample["correct_answer"]

        # perf_counter is monotonic and high-resolution, unlike the wall
        # clock, so short lookups are timed reliably.
        start_time = time.perf_counter()
        results = self.memory_system.query_memories(question, limit=5)
        response_time = time.perf_counter() - start_time

        predicted_answer = results[0]["content"] if results else ""
        accuracy = self._calculate_accuracy(correct_answer, predicted_answer)
        return {
            "sample_id": sample["id"],
            "question": question,
            "correct_answer": correct_answer,
            "predicted_answer": predicted_answer,
            "accuracy": accuracy,
            "response_time": response_time,
            "num_results": len(results),
        }

    def _calculate_accuracy(self, correct: str, predicted: str) -> float:
        """Return the share of reference tokens that appear in the prediction.

        Args:
            correct: Reference answer.
            predicted: Answer produced by the system.

        Returns:
            A value in [0.0, 1.0]; 0.0 when either text is empty or the
            reference contains no tokens (for example, only punctuation).
        """
        if not correct or not predicted:
            return 0.0
        correct_tokens = self._tokenize(correct)
        if not correct_tokens:
            return 0.0
        matched = correct_tokens & self._tokenize(predicted)
        return len(matched) / len(correct_tokens)

    def run_evaluation(self) -> Dict:
        """Populate memory, evaluate every sample and build the report.

        Returns:
            A dict with a ``summary`` (``total_samples``, ``avg_accuracy``,
            ``avg_response_time``, ``timestamp``) and ``detailed_results``
            (one entry per sample, as returned by ``evaluate_sample``).
        """
        print("🚀 开始简化LoCoMo评测演示")
        print("=" * 50)
        self.populate_memory()

        print(f"\n🔍 评测 {len(self.test_data)} 个样本...")
        results = []
        for index, sample in enumerate(self.test_data, start=1):
            result = self.evaluate_sample(sample)
            results.append(result)
            print(f" 样本 {index}: 准确率={result['accuracy']:.3f}, "
                  f"响应时间={result['response_time']:.3f}s")

        count = len(results)
        avg_accuracy = sum(r["accuracy"] for r in results) / count if count else 0.0
        avg_response_time = (
            sum(r["response_time"] for r in results) / count if count else 0.0
        )
        return {
            "summary": {
                "total_samples": len(self.test_data),
                "avg_accuracy": avg_accuracy,
                "avg_response_time": avg_response_time,
                "timestamp": datetime.now().isoformat(),
            },
            "detailed_results": results,
        }

    def print_summary(self, report: Dict):
        """Print the summary figures and the first three detailed results.

        Args:
            report: A report dict as returned by ``run_evaluation``.
        """
        summary = report["summary"]
        print("\n📊 评测结果摘要")
        print("=" * 50)
        print(f"总样本数: {summary['total_samples']}")
        print(f"平均准确率: {summary['avg_accuracy']:.3f}")
        print(f"平均响应时间: {summary['avg_response_time']:.3f}秒")
        print(f"评测时间: {summary['timestamp']}")

        print("\n📋 样本结果示例:")
        for index, result in enumerate(report["detailed_results"][:3], start=1):
            print(f"\n样本 {index}:")
            print(f" 问题: {result['question']}")
            print(f" 正确答案: {result['correct_answer'][:50]}...")
            print(f" 预测答案: {result['predicted_answer'][:50]}...")
            print(f" 准确率: {result['accuracy']:.3f}")

    def save_report(self, report: Dict, filename: str = "simple_locomo_report.json"):
        """Write ``report`` to ``filename`` as UTF-8, human-readable JSON.

        Args:
            report: A JSON-serializable report dict.
            filename: Destination path; an existing file is overwritten.
        """
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"\n📄 评测报告已保存: {filename}")
def demonstrate_locomo_concept():
    """Print a short overview of LoCoMo: concepts, metrics and MemOS fit."""
    overview_lines = (
        "🧠 LoCoMo评测概念演示",
        "=" * 50,
        # Core concepts
        "\n💡 LoCoMo评测的核心概念:",
        "1. 长期对话记忆评测 - 测试AI系统在长时间对话中保持记忆的能力",
        "2. 问答任务 - 基于历史对话内容回答问题",
        "3. 事件摘要 - 总结对话中的重要事件",
        "4. 多模态对话生成 - 生成包含文本和图像的对话",
        # Metrics
        "\n📊 评测指标:",
        "- 准确率 (Accuracy): 答案的正确性",
        "- ROUGE分数: 文本重叠度评估",
        "- BERTScore: 语义相似度评估",
        "- Recall@K: 前K个结果中的召回率",
        "- 响应时间: 系统响应速度",
        # MemOS adaptation
        "\n🎯 MemOS适配:",
        "- 使用MemOS的记忆管理能力",
        "- 测试增强版vs基础版的性能差异",
        "- 评估反馈机制、时间感知、主题漂移检测等功能",
        "- 量化改进后的正确率提升",
    )
    for line in overview_lines:
        print(line)
def main():
    """Run the demo: concept overview, evaluation, summary, saved report."""
    demonstrate_locomo_concept()
    separator = "=" * 50
    print("\n" + separator)

    # Run the simplified evaluation, then show and persist its report.
    demo_evaluator = SimpleLoCoMoEvaluator()
    evaluation_report = demo_evaluator.run_evaluation()
    demo_evaluator.print_summary(evaluation_report)
    demo_evaluator.save_report(evaluation_report)

    closing_messages = (
        "\n🎉 简化LoCoMo评测演示完成!",
        "\n💡 要运行完整的LoCoMo评测,请使用:",
        " ./venv/bin/python locomo_evaluation.py --dataset-size 20",
    )
    for message in closing_messages:
        print(message)


if __name__ == "__main__":
    main()