"""MCP Chat Analysis Server - tests for the embedding generator."""
import pytest
import numpy as np
from mcp_chat_analysis.embeddings import EmbeddingGenerator
@pytest.fixture
def embedding_generator():
    """Build the EmbeddingGenerator instance injected into each test.

    The generator is configured with the all-MiniLM-L6-v2
    sentence-transformers model, the CPU device, and a batch size of 2.
    """
    settings = {
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "device": "cpu",
        "batch_size": 2,
    }
    return EmbeddingGenerator(**settings)
@pytest.mark.asyncio
async def test_single_embedding_generation(embedding_generator):
    """generate_single yields a list of floats of length embedding_dim."""
    vector = await embedding_generator.generate_single("This is a test message")

    assert isinstance(vector, list)
    assert len(vector) == embedding_generator.embedding_dim
    # Every component must be a plain Python float.
    for component in vector:
        assert isinstance(component, float)
@pytest.mark.asyncio
async def test_batch_embedding_generation(embedding_generator):
    """generate returns one embedding_dim-sized vector per input text."""
    # Three inputs; the fixture's batch_size is 2.
    messages = [
        "First test message",
        "Second test message",
        "Third test message",
    ]

    vectors = await embedding_generator.generate(messages)

    assert isinstance(vectors, list)
    assert len(vectors) == len(messages)
    for vector in vectors:
        assert len(vector) == embedding_generator.embedding_dim
@pytest.mark.asyncio
async def test_similarity_computation(embedding_generator):
    """Related texts must score higher than unrelated ones, within [0, 1]."""
    anchor = "This is about artificial intelligence"
    related = "AI and machine learning are related topics"
    unrelated = "The weather is nice today"

    related_score = await embedding_generator.compute_similarity(anchor, related)
    unrelated_score = await embedding_generator.compute_similarity(anchor, unrelated)

    # Both scores are expected to lie in the closed unit interval.
    for score in (related_score, unrelated_score):
        assert 0 <= score <= 1

    # The on-topic pair should beat the off-topic pair.
    assert related_score > unrelated_score
@pytest.mark.asyncio
async def test_similar_chunks_finding(embedding_generator):
    """find_similar_chunks returns at most top_k scored dicts, best first."""
    candidates = [
        "AI and machine learning",
        "Natural language processing",
        "The weather forecast",
        "Today's temperature",
        "Deep learning algorithms",
    ]

    matches = await embedding_generator.find_similar_chunks(
        "artificial intelligence", candidates, threshold=0.3, top_k=3
    )

    assert isinstance(matches, list)
    assert len(matches) <= 3

    # Each hit must expose both its score and the matched text.
    for match in matches:
        assert "score" in match and "text" in match

    for match in matches:
        assert 0 <= match["score"] <= 1

    # Hits must already be in descending score order.
    observed = [match["score"] for match in matches]
    expected = list(observed)
    expected.sort(reverse=True)
    assert observed == expected
@pytest.mark.asyncio
async def test_long_text_handling(embedding_generator):
    """A 1000-character input still yields an embedding_dim-sized list."""
    # Meant to exceed the generator's max_length (per the original author's
    # note) -- TODO confirm the actual limit against EmbeddingGenerator.
    oversized = "a" * 1000

    vector = await embedding_generator.generate_single(oversized)

    assert isinstance(vector, list)
    assert len(vector) == embedding_generator.embedding_dim
@pytest.mark.asyncio
async def test_empty_text_handling(embedding_generator):
    """An empty string still yields an embedding_dim-sized list."""
    vector = await embedding_generator.generate_single("")

    assert isinstance(vector, list)
    assert len(vector) == embedding_generator.embedding_dim
@pytest.mark.asyncio
async def test_different_similarity_methods(embedding_generator):
    """Both the cosine and euclidean methods return a score within [0, 1]."""
    first = "This is a test"
    second = "Another test message"

    # Compute every score before asserting on any of them.
    scores = {}
    for method in ("cosine", "euclidean"):
        scores[method] = await embedding_generator.compute_similarity(
            first, second, method=method
        )

    for score in scores.values():
        assert 0 <= score <= 1
@pytest.mark.asyncio
async def test_invalid_similarity_method(embedding_generator):
    """An unrecognised similarity method name must raise ValueError."""
    unknown_method = "invalid"

    with pytest.raises(ValueError):
        await embedding_generator.compute_similarity(
            "text1", "text2", method=unknown_method
        )