MCP Chat Analysis Server
- src
  - mcp_chat_analysis
from typing import List, Dict, Any, Optional
import logging

import torch
from sentence_transformers import SentenceTransformer
import numpy as np

logger = logging.getLogger(__name__)
class EmbeddingGenerator:
    """Handles generation of vector embeddings for text."""

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: Optional[str] = None,
        batch_size: int = 32,
        max_length: int = 512
    ):
        """
        Initialize the embedding generator.

        Args:
            model_name: Name of the sentence-transformers model to use
            device: Device to run the model on ('cpu', 'cuda', or None for auto-detect)
            batch_size: Batch size for embedding generation
            max_length: Maximum text length in characters, applied during preprocessing
        """
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length

        # Determine device: prefer CUDA when available, else fall back to CPU
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        # Load model
        logger.info(f"Loading model {model_name} on {device}")
        self.model = SentenceTransformer(model_name, device=device)

        # Cache model properties
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        logger.info(f"Model loaded. Embedding dimension: {self.embedding_dim}")
    async def generate(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors
        """
        # Preprocess texts
        processed_texts = [self._preprocess_text(text) for text in texts]

        # Generate embeddings; encode() already batches internally according
        # to batch_size, so no manual chunking loop is needed. Note that
        # encode() is synchronous and will block the event loop; offload it
        # (e.g. via asyncio.to_thread) if this is called from a busy loop.
        embeddings = self.model.encode(
            processed_texts,
            batch_size=self.batch_size,
            show_progress_bar=False,
            convert_to_numpy=True
        )
        return embeddings.tolist()
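
    # Usage sketch (assumes an already-constructed generator `gen`; output
    # values are illustrative):
    #
    #     vectors = await gen.generate(["first message", "second message"])
    #     # len(vectors) == 2; each vector holds self.embedding_dim floats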
    async def generate_single(self, text: str) -> List[float]:
        """
        Generate an embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            Embedding vector
        """
        processed = self._preprocess_text(text)
        embedding = self.model.encode(
            processed,
            show_progress_bar=False,
            convert_to_numpy=True
        )
        return embedding.tolist()
    def _preprocess_text(self, text: str) -> str:
        """
        Preprocess text before embedding.

        Args:
            text: Input text

        Returns:
            Preprocessed text
        """
        # Basic preprocessing
        text = text.strip()

        # Truncate if needed. This is a character-level guard; the model's
        # tokenizer still enforces its own token limit downstream.
        if len(text) > self.max_length:
            text = text[:self.max_length]

        return text
    async def compute_similarity(
        self,
        text1: str,
        text2: str,
        method: str = "cosine"
    ) -> float:
        """
        Compute similarity between two texts.

        Args:
            text1: First text
            text2: Second text
            method: Similarity method ('cosine' or 'euclidean')

        Returns:
            Similarity score
        """
        # Generate embeddings
        emb1 = await self.generate_single(text1)
        emb2 = await self.generate_single(text2)

        # Convert to numpy arrays
        vec1 = np.array(emb1)
        vec2 = np.array(emb2)

        # Compute similarity
        if method == "cosine":
            # cos(v1, v2) = (v1 . v2) / (||v1|| * ||v2||), in [-1, 1]
            similarity = np.dot(vec1, vec2) / (
                np.linalg.norm(vec1) * np.linalg.norm(vec2)
            )
        elif method == "euclidean":
            # Map euclidean distance d to a similarity in (0, 1]: 1 / (1 + d)
            similarity = 1 / (1 + np.linalg.norm(vec1 - vec2))
        else:
            raise ValueError(f"Unknown similarity method: {method}")

        return float(similarity)
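
    # Worked example (illustrative): for vec1 = [1, 0] and vec2 = [1, 1],
    # the cosine similarity is 1 / sqrt(2) ≈ 0.707, while the euclidean
    # score is 1 / (1 + 1.0) = 0.5. Both methods map "closer" to "higher",
    # but their scales differ, so thresholds are not interchangeable.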
    async def find_similar_chunks(
        self,
        query: str,
        texts: List[str],
        threshold: float = 0.5,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Find the text chunks most similar to a query.

        Args:
            query: Query text
            texts: List of texts to search
            threshold: Minimum similarity threshold
            top_k: Maximum number of results

        Returns:
            List of similar texts with scores
        """
        # Generate the query embedding and cache its norm
        query_emb = np.array(await self.generate_single(query))
        query_norm = np.linalg.norm(query_emb)

        # Generate embeddings for all candidate texts
        chunk_embs = await self.generate(texts)

        # Compute cosine similarities and keep those above the threshold
        similarities = []
        for i, emb in enumerate(chunk_embs):
            emb = np.array(emb)
            score = np.dot(query_emb, emb) / (query_norm * np.linalg.norm(emb))
            if score >= threshold:
                similarities.append({
                    "text": texts[i],
                    "score": float(score)
                })

        # Sort by score (descending) and return the top_k results
        similarities.sort(key=lambda x: x["score"], reverse=True)
        return similarities[:top_k]
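

# ---------------------------------------------------------------------------
# Minimal usage sketch. Everything below is illustrative: it assumes the
# default model can be downloaded (or is already cached locally). The corpus,
# query strings, and threshold are placeholders, not values prescribed by
# the server.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        gen = EmbeddingGenerator()  # auto-selects cuda/cpu

        # Single-text embedding
        vec = await gen.generate_single("How do I reset my password?")
        print(f"embedding length: {len(vec)}")  # == gen.embedding_dim

        # Pairwise similarity
        score = await gen.compute_similarity(
            "reset my password", "recover account access", method="cosine"
        )
        print(f"cosine similarity: {score:.3f}")

        # Retrieval over a small corpus
        hits = await gen.find_similar_chunks(
            query="billing question",
            texts=["invoice is wrong", "app crashes on start", "refund request"],
            threshold=0.2,
            top_k=2
        )
        for hit in hits:
            print(f"{hit['score']:.3f}  {hit['text']}")

    asyncio.run(_demo())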