import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer


class DocumentSearcher:
"""
Handles semantic search of SOP documents using ChromaDB and Italian language embeddings.
"""

    def __init__(self,
                 db_path: str = "chroma_db",
                 collection_name: str = "sop_documents",
                 model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initialize the document searcher.

        Args:
            db_path: Path to the ChromaDB storage directory
            collection_name: Name of the ChromaDB collection
            model_name: Sentence-transformer model (multilingual for Italian support)
        """
        self.db_path = Path(db_path)
        self.collection_name = collection_name
        self.logger = logging.getLogger(__name__)

        # Initialize the ChromaDB client
        self.client = chromadb.PersistentClient(
            path=str(self.db_path),
            settings=Settings(anonymized_telemetry=False)
        )

        # Load a multilingual sentence transformer for Italian support
        self.logger.info(f"🤖 Loading embedding model: {model_name}")
        self.embedding_model = SentenceTransformer(model_name)

        # Get or create the collection
        self.collection = self._get_or_create_collection()

    def _get_or_create_collection(self):
        """Get the existing collection or create a new one."""
        try:
            collection = self.client.get_collection(name=self.collection_name)
            self.logger.info(f"Using existing collection: {self.collection_name}")
        except Exception:
            # The exact exception type varies across chromadb versions, so
            # catch broadly (but never bare) and create the collection.
            # Cosine space is set explicitly so query distances convert
            # cleanly to similarities as 1 - distance; ChromaDB defaults
            # to L2 otherwise.
            collection = self.client.create_collection(
                name=self.collection_name,
                metadata={
                    "description": "SOP documents collection with Italian language support",
                    "hnsw:space": "cosine"
                }
            )
            self.logger.info(f"📚 Created new collection: {self.collection_name}")
        return collection

    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
"""Generate embeddings for a list of texts."""
try:
embeddings = self.embedding_model.encode(texts, convert_to_tensor=False)
return embeddings.tolist()
except Exception as e:
self.logger.error(f"Error generating embeddings: {e}")
return []

    def add_documents(self, documents: List[Dict[str, Any]]) -> bool:
        """
        Add processed documents to the ChromaDB collection.

        Args:
            documents: List of processed document dictionaries; each must
                carry 'chunks' plus the 'sop_category', 'sop_name',
                'file_path', 'language', 'chunk_count', 'file_size' and
                'last_modified' fields used to build chunk metadata

        Returns:
            bool: True if the chunks were stored, False otherwise
        """
try:
all_texts = []
all_metadatas = []
all_ids = []
for doc in documents:
for i, chunk in enumerate(doc['chunks']):
# Create unique ID for each chunk
chunk_id = f"{doc['sop_category']}_{doc['sop_name']}_chunk_{i}"
# Prepare metadata
metadata = {
'sop_category': doc['sop_category'],
'sop_name': doc['sop_name'],
'file_path': doc['file_path'],
'language': doc['language'],
'chunk_index': i,
'total_chunks': doc['chunk_count'],
'file_size': doc['file_size'],
'last_modified': doc['last_modified']
}
all_texts.append(chunk)
all_metadatas.append(metadata)
all_ids.append(chunk_id)
if not all_texts:
self.logger.warning("No texts to add to collection")
return False
# Generate embeddings
self.logger.info(f"Generating embeddings for {len(all_texts)} text chunks...")
embeddings = self._generate_embeddings(all_texts)
if not embeddings:
self.logger.error("Failed to generate embeddings")
return False
            # Upsert so that re-ingesting the same documents updates existing
            # chunks instead of raising on duplicate IDs
            self.collection.upsert(
                documents=all_texts,
                metadatas=all_metadatas,
                ids=all_ids,
                embeddings=embeddings
            )

            self.logger.info(f"Successfully stored {len(all_texts)} document chunks in the collection")
return True
except Exception as e:
self.logger.error(f"Error adding documents to collection: {e}")
return False

    def search(self,
               query: str,
               n_results: int = 5,
               sop_category: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Search for relevant SOP documents based on a query.

        Args:
            query: Search query in Italian or English
            n_results: Number of results to return
            sop_category: Optional filter by SOP category

        Returns:
            List of relevant document chunks with metadata
        """
        try:
            # Generate an embedding for the query; bail out early if it fails
            query_embeddings = self._generate_embeddings([query])
            if not query_embeddings:
                self.logger.error("Failed to generate query embedding")
                return []
            query_embedding = query_embeddings[0]

            # Optional metadata filter by SOP category
            where_clause = {"sop_category": sop_category} if sop_category else None

            # Search the collection
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=n_results,
                where=where_clause,
                include=['documents', 'metadatas', 'distances']
            )
            # Format results; query() returns parallel lists per query embedding
            formatted_results = []
            if results['documents'] and results['documents'][0]:
                for content, metadata, distance in zip(results['documents'][0],
                                                       results['metadatas'][0],
                                                       results['distances'][0]):
                    formatted_results.append({
                        'content': content,
                        'metadata': metadata,
                        # With the cosine space set at collection creation,
                        # distance = 1 - cosine similarity
                        'similarity_score': 1 - distance,
                        'sop_category': metadata['sop_category'],
                        'sop_name': metadata['sop_name'],
                        'file_path': metadata['file_path']
                    })

            self.logger.info(f"Found {len(formatted_results)} results for query: {query[:50]}...")
            return formatted_results
except Exception as e:
self.logger.error(f"Error searching documents: {e}")
return []

    def get_sop_by_category(self, category: str) -> List[Dict[str, Any]]:
        """Get all SOPs in a specific category."""
        try:
            # Only metadata is needed here, so skip fetching document texts
            results = self.collection.get(
                where={"sop_category": category},
                include=['metadatas']
            )

            # Group chunks by SOP name so each document is listed only once
            sops = {}
            for metadata in results['metadatas'] or []:
                sop_name = metadata['sop_name']
                if sop_name not in sops:
                    sops[sop_name] = {
                        'sop_name': sop_name,
                        'sop_category': metadata['sop_category'],
                        'file_path': metadata['file_path'],
                        'language': metadata['language'],
                        'total_chunks': metadata['total_chunks']
                    }
            return list(sops.values())
except Exception as e:
self.logger.error(f"Error getting SOPs by category {category}: {e}")
return []

    def get_collection_stats(self) -> Dict[str, Any]:
"""Get statistics about the document collection."""
try:
count = self.collection.count()
# Get all metadata to calculate stats
results = self.collection.get(include=['metadatas'])
categories = set()
languages = set()
sop_names = set()
if results['metadatas']:
for metadata in results['metadatas']:
categories.add(metadata['sop_category'])
languages.add(metadata['language'])
sop_names.add(metadata['sop_name'])
return {
'total_chunks': count,
'total_documents': len(sop_names),
                'categories': sorted(categories),
                'languages': sorted(languages),
'category_count': len(categories)
}
except Exception as e:
self.logger.error(f"Error getting collection stats: {e}")
return {}

    def clear_collection(self) -> bool:
"""Clear all documents from the collection."""
try:
# Delete the collection and recreate it
self.client.delete_collection(name=self.collection_name)
self.collection = self._get_or_create_collection()
self.logger.info("Collection cleared successfully")
return True
except Exception as e:
self.logger.error(f"Error clearing collection: {e}")
return False
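

if __name__ == "__main__":
    # Minimal usage sketch. Assumes documents have already been processed
    # elsewhere into the chunk/metadata shape expected by add_documents();
    # the db path and query string below are only illustrative examples.
    logging.basicConfig(level=logging.INFO)

    searcher = DocumentSearcher(db_path="chroma_db")
    stats = searcher.get_collection_stats()
    print(f"Collection holds {stats.get('total_chunks', 0)} chunks "
          f"from {stats.get('total_documents', 0)} documents")

    # Queries may be in Italian or English thanks to the multilingual model
    for hit in searcher.search("procedura di emergenza in laboratorio", n_results=3):
        print(f"{hit['similarity_score']:.3f}  {hit['sop_name']}  ({hit['sop_category']})")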