# test_semantic_search.py
#!/usr/bin/env python3
"""
测试语义搜索效果对比:哈希向量 vs 真正的嵌入向量
"""
import os
import numpy as np
from pathlib import Path
from openai import OpenAI
from usage_examples import SimpleMemOS
def load_env_file():
    """Load ``KEY=VALUE`` pairs from ``./.env`` into ``os.environ``.

    The file is looked up relative to the current working directory. If it
    is missing (or is not a regular file) the function does nothing.

    Parsing rules:
      * blank lines, lines starting with ``#`` and lines without ``=`` are
        ignored;
      * the line is split on the first ``=`` only, so values may contain ``=``;
      * whitespace around the key and the value is removed;
      * one pair of matching single or double quotes around the value is
        removed;
      * entries with an empty key are skipped.

    Variables that already exist in the environment are overwritten.
    """
    env_file = Path(".env")
    if not env_file.is_file():
        return

    # utf-8-sig reads plain UTF-8 as well, but drops a leading BOM so it
    # cannot end up glued to the first key.
    with open(env_file, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue

            key, _, value = line.partition('=')
            key = key.strip()
            value = value.strip()

            # os.environ rejects an empty variable name; skip instead of crashing.
            if not key:
                continue

            # KEY="value" / KEY='value' -> value
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            os.environ[key] = value
def get_real_embedding(text, client, model="BAAI/bge-large-zh-v1.5"):
    """Request an embedding vector for ``text`` from an embeddings API client.

    Args:
        text: The input passed to the API as ``input``.
        client: Object exposing ``embeddings.create(model=..., input=...)``
            whose response has ``data[0].embedding``.
        model: Name of the embedding model to request. Defaults to the model
            this script was written against.

    Returns:
        The ``embedding`` attribute of the first item in the response, or
        ``None`` if the request (or reading the response) raised.
    """
    try:
        response = client.embeddings.create(
            model=model,
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        # Best-effort: report the failure and let the caller decide whether
        # to skip or abort, instead of crashing the whole comparison run.
        print(f"❌ 获取嵌入向量失败: {e}")
        return None
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Both arguments are converted with ``np.array``. If either vector has a
    Euclidean norm of zero, the result is ``0`` rather than a division by
    zero.
    """
    a, b = np.array(vec1), np.array(vec2)
    inner = np.dot(a, b)
    len_a, len_b = np.linalg.norm(a), np.linalg.norm(b)

    # A zero-length vector has no direction; report "no similarity".
    if not (len_a != 0 and len_b != 0):
        return 0

    return inner / (len_a * len_b)
def test_semantic_search():
    """Print a side-by-side top-3 ranking comparison for a fixed corpus.

    For every document and query, two vectors are obtained: one from
    ``memos._get_embedding`` (referred to in this script as the hash vector)
    and one from ``get_real_embedding`` via the API client. Documents are
    ranked per query by cosine similarity under each representation, the top
    three of each ranking are printed, and the number of shared documents
    between the two top-3 lists is reported.

    If an API embedding cannot be obtained for any document, the comparison
    is abandoned; if it cannot be obtained for a query, that query is skipped.
    """

    def show_top3(title, ranking):
        # Print the three best-scoring entries of an already sorted ranking.
        print(title)
        for rank, (_, text, score) in enumerate(ranking[:3], 1):
            print(f" {rank}. [{score:.4f}] {text}")

    print("🧪 测试语义搜索效果对比")
    print("=" * 60)

    # Environment variables must be in place before the client reads them.
    load_env_file()

    api_key = os.getenv("SILICONFLOW_API_KEY")
    base_url = os.getenv("SILICONFLOW_BASE_URL")
    client = OpenAI(api_key=api_key, base_url=base_url)

    # The system under comparison.
    memos = SimpleMemOS()

    # Fixed document corpus.
    documents = [
        "今天学习了人工智能和机器学习的基础知识",
        "MemOS是一个智能记忆管理系统",
        "手动验证测试功能正常工作",
        "深度学习神经网络模型训练",
        "自然语言处理和文本分析技术",
        "数据库查询和向量搜索算法",
        "Python编程语言开发应用",
        "云计算和分布式系统架构",
    ]

    # Fixed query set.
    queries = [
        "AI和机器学习",
        "记忆管理",
        "测试验证",
        "深度学习",
        "文本处理",
    ]

    print("📚 测试文档集合:")
    for number, text in enumerate(documents, start=1):
        print(f" {number}. {text}")

    print("\n🔍 测试查询:")
    for number, text in enumerate(queries, start=1):
        print(f" {number}. {text}")

    print("\n📊 计算嵌入向量...")

    # Vectors from the current system, one per document.
    hash_embeddings = [memos._get_embedding(text) for text in documents]

    # Vectors from the embeddings API; a single failure aborts the comparison.
    real_embeddings = []
    for text in documents:
        vector = get_real_embedding(text, client)
        if not vector:
            print("❌ 无法获取真正的嵌入向量,跳过对比测试")
            return
        real_embeddings.append(vector)

    print("✅ 嵌入向量计算完成")

    for query in queries:
        print(f"\n🔍 查询: '{query}'")
        print("-" * 40)

        query_hash = memos._get_embedding(query)
        query_real = get_real_embedding(query, client)
        if not query_real:
            continue

        # Score every document under both representations, keeping the
        # document index so the two rankings can be compared afterwards.
        hash_scored = []
        real_scored = []
        triples = zip(documents, hash_embeddings, real_embeddings)
        for position, (text, hash_vec, real_vec) in enumerate(triples):
            hash_scored.append((position, text, cosine_similarity(query_hash, hash_vec)))
            real_scored.append((position, text, cosine_similarity(query_real, real_vec)))

        # Highest similarity first.
        hash_ranking = sorted(hash_scored, key=lambda entry: entry[2], reverse=True)
        real_ranking = sorted(real_scored, key=lambda entry: entry[2], reverse=True)

        show_top3("📈 哈希向量搜索结果 (Top 3):", hash_ranking)
        show_top3("\n🎯 真实嵌入搜索结果 (Top 3):", real_ranking)

        # How many documents appear in both top-3 lists.
        hash_best = {entry[0] for entry in hash_ranking[:3]}
        real_best = {entry[0] for entry in real_ranking[:3]}
        overlap = len(hash_best & real_best)
        print(f"\n📊 Top3重叠度: {overlap}/3 ({overlap/3*100:.1f}%)")
def test_current_search_quality():
    """Run a fixed list of queries through ``SimpleMemOS.search_memories``.

    For each query up to three results are requested. Every result is printed
    with its ``score`` (defaulting to 0) and the first 50 characters of its
    ``content`` (defaulting to ``'N/A'``); a notice is printed when a query
    yields nothing.
    """
    print("\n🧪 测试当前系统搜索质量")
    print("=" * 60)

    memos = SimpleMemOS()

    test_queries = (
        "MemOS功能",
        "测试验证",
        "记忆管理",
        "开发功能",
        "智能提取",
    )

    for query in test_queries:
        print(f"\n🔍 搜索: '{query}'")
        hits = memos.search_memories(query, limit=3)

        if not hits:
            print("❌ 未找到相关结果")
            continue

        print(f"✅ 找到 {len(hits)} 条结果:")
        for number, hit in enumerate(hits, start=1):
            text = hit.get('content', 'N/A')
            relevance = hit.get('score', 0)
            print(f" {number}. [{relevance:.4f}] {text[:50]}...")
def main():
    """Run both checks in order, then print the closing summary.

    The summary lines are fixed text; they are not computed from the results
    of the run.
    """
    print("🚀 开始语义搜索效果测试...")

    # Hash-vector vs. API-embedding comparison, then the current system's search.
    test_semantic_search()
    test_current_search_quality()

    closing_lines = (
        "\n" + "=" * 60,
        "📋 测试总结:",
        "1. SiliconFlow嵌入API可用,提供1024维高质量向量",
        "2. 当前系统使用384维哈希向量,搜索功能基本可用",
        "3. 真正的嵌入向量在语义理解上明显优于哈希向量",
        "4. 建议升级到真正的嵌入模型以提升搜索精度",
        "5. 可以考虑安装sentence-transformers支持本地重排",
    )
    for text in closing_lines:
        print(text)
# Run the comparison only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()