MCP Server for Qdrant

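The patch script below adapts a local checkout of mcp-server-qdrant (expected at /tmp/mcp-server-qdrant) so it can run on Alpine Linux without PyTorch. It adds an Alpine-friendly embedding provider that tries fastembed first and falls back to a small quantized ONNX model, registers the new provider in the embedding type enum and factory, and writes a .env file that selects it. Run it after cloning the repository but before starting the server with uv.
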
#!/bin/sh
# Patch script for mcp-server-qdrant to run on Alpine Linux
# Run this after cloning the repository but before running with uv

# Make sure we're in the right repo directory
if [ ! -d "/tmp/mcp-server-qdrant" ]; then
    echo "Error: /tmp/mcp-server-qdrant directory not found"
    exit 1
fi

# Create the Alpine embedding provider file
cat > /tmp/mcp-server-qdrant/src/mcp_server_qdrant/embeddings/alpine_compat.py << 'EOF'
"""
Alpine Linux compatible embedding provider.

This module provides an embedding implementation that works well on Alpine Linux
with minimal dependencies and no PyTorch requirement.
"""
import asyncio
import logging
from typing import List, Dict, Any, Optional

from mcp_server_qdrant.embeddings.base import EmbeddingProvider

# Configure logging
logger = logging.getLogger(__name__)


class AlpineEmbedProvider(EmbeddingProvider):
    """
    Alpine-optimized embedding provider that works without PyTorch.

    This provider will attempt to use fastembed first, falling back to
    other lightweight options if necessary.

    :param model_name: The name of the embedding model to use.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name
        self.embedding_model = None
        self.vector_size = 384  # Default size for small models

        # Try to initialize fastembed
        try:
            from fastembed import TextEmbedding

            logger.info(f"Initializing FastEmbed with model {model_name}")
            self.embedding_model = TextEmbedding(model_name)
            self.embedding_type = "fastembed"
            logger.info("Successfully initialized FastEmbed")
        except Exception as e:
            logger.warning(f"Failed to initialize FastEmbed: {e}")
            logger.warning("Will attempt to use alternative lightweight embedding")
            self._init_fallback_embedding()

    def _init_fallback_embedding(self):
        """Initialize a fallback embedding model if fastembed fails."""
        try:
            # Try to import onnxruntime directly
            import onnxruntime as ort
            import numpy as np
            from urllib.request import urlretrieve
            import os

            # Use a small, pre-quantized ONNX model
            model_url = "https://huggingface.co/optimum/all-MiniLM-L6-v2-onnx/resolve/main/model_quantized.onnx"
            model_path = "/tmp/model_quantized.onnx"

            if not os.path.exists(model_path):
                logger.info(f"Downloading ONNX model to {model_path}")
                urlretrieve(model_url, model_path)

            logger.info("Initializing ONNX runtime session")
            self.ort_session = ort.InferenceSession(model_path)
            self.embedding_type = "onnx"
            logger.info("Successfully initialized ONNX fallback model")

            # Set more specific configuration
            self.tokenizer = None
            try:
                # Try to load tokenizer if available
                from tokenizers import Tokenizer

                tokenizer_url = "https://huggingface.co/optimum/all-MiniLM-L6-v2-onnx/resolve/main/tokenizer.json"
                tokenizer_path = "/tmp/tokenizer.json"

                if not os.path.exists(tokenizer_path):
                    urlretrieve(tokenizer_url, tokenizer_path)

                self.tokenizer = Tokenizer.from_file(tokenizer_path)
                logger.info("Successfully loaded tokenizer")
            except Exception as te:
                logger.warning(f"Could not load tokenizer: {te}")
                logger.warning("Will use a basic tokenization method")
        except Exception as e:
            logger.error(f"Failed to initialize any embedding model: {e}")
            raise RuntimeError(f"Could not initialize any embedding model: {e}")

    async def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """Embed a list of documents into vectors."""
        if not documents:
            return []

        if self.embedding_type == "fastembed":
            # Run in a thread pool since FastEmbed is synchronous
            loop = asyncio.get_event_loop()
            embeddings = await loop.run_in_executor(
                None, lambda: list(self.embedding_model.embed(documents))
            )
            return [embedding.tolist() for embedding in embeddings]
        elif self.embedding_type == "onnx":
            # Use ONNX runtime for inference
            embeddings = []
            for doc in documents:
                embedding = await self._embed_with_onnx(doc)
                embeddings.append(embedding)
            return embeddings
        else:
            raise ValueError(f"Unknown embedding type: {self.embedding_type}")

    async def embed_query(self, query: str) -> List[float]:
        """Embed a query into a vector."""
        if not query or not query.strip():
            # Return a zero vector of appropriate dimension to avoid errors
            return [0.0] * self.vector_size

        if self.embedding_type == "fastembed":
            # Run in a thread pool since FastEmbed is synchronous
            loop = asyncio.get_event_loop()
            embeddings = await loop.run_in_executor(
                None, lambda: list(self.embedding_model.embed([query]))
            )
            return embeddings[0].tolist()
        elif self.embedding_type == "onnx":
            # Use ONNX runtime for inference
            return await self._embed_with_onnx(query)
        else:
            raise ValueError(f"Unknown embedding type: {self.embedding_type}")

    async def _embed_with_onnx(self, text: str) -> List[float]:
        """Use ONNX runtime to get embeddings."""
        import numpy as np

        # Implement basic tokenization if no tokenizer available
        if self.tokenizer:
            # Use proper tokenizer
            encoding = self.tokenizer.encode(text)
            input_ids = encoding.ids
            attention_mask = encoding.attention_mask
            token_type_ids = [0] * len(input_ids)

            # Truncate if too long
            max_length = 512
            if len(input_ids) > max_length:
                input_ids = input_ids[:max_length]
                attention_mask = attention_mask[:max_length]
                token_type_ids = token_type_ids[:max_length]

            # Convert to numpy arrays
            input_ids = np.array([input_ids], dtype=np.int64)
            attention_mask = np.array([attention_mask], dtype=np.int64)
            token_type_ids = np.array([token_type_ids], dtype=np.int64)
        else:
            # Very basic tokenization as fallback
            # This is a simplified version and won't work as well as a proper tokenizer
            words = text.lower().split()[:512]  # Simple splitting and truncation

            # Create dummy input tensors (this is a simplified approach)
            input_ids = np.array([[i + 1 for i in range(len(words))]], dtype=np.int64)
            attention_mask = np.array([[1] * len(words)], dtype=np.int64)
            token_type_ids = np.array([[0] * len(words)], dtype=np.int64)

        # Run inference
        loop = asyncio.get_event_loop()
        outputs = await loop.run_in_executor(
            None,
            lambda: self.ort_session.run(
                None,
                {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "token_type_ids": token_type_ids,
                },
            ),
        )

        # Process the output
        embeddings = outputs[0]

        # Mean pooling over the token embeddings
        mask = attention_mask.reshape(-1, attention_mask.shape[-1])
        mask_expanded = np.expand_dims(mask, axis=-1)
        sum_embeddings = np.sum(embeddings * mask_expanded, axis=1)
        sum_mask = np.sum(mask, axis=1, keepdims=True)
        sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=None)
        mean_embeddings = sum_embeddings / sum_mask

        # Normalize the vector
        norm = np.linalg.norm(mean_embeddings, axis=1, keepdims=True)
        normalized_embeddings = mean_embeddings / norm

        return normalized_embeddings[0].tolist()

    def get_vector_name(self) -> str:
        """
        Return the name of the vector for the Qdrant collection.
        """
        model_name = self.model_name.split("/")[-1].lower()
        return f"alpine-{model_name}"
EOF

# Update the embedding types to include the Alpine provider
sed -i '/class EmbeddingProviderType(Enum):/,/SENTENCE_TRANSFORMERS/ s/SENTENCE_TRANSFORMERS = "sentence-transformers"/SENTENCE_TRANSFORMERS = "sentence-transformers"\n    ALPINE = "alpine"/' /tmp/mcp-server-qdrant/src/mcp_server_qdrant/embeddings/types.py

# Update the factory to support our new provider
sed -i '/def create_embedding_provider/,/return/ {
/if settings.provider_type == EmbeddingProviderType.FASTEMBED:/i\
    if settings.provider_type == EmbeddingProviderType.ALPINE:\
        try:\
            from mcp_server_qdrant.embeddings.alpine_compat import AlpineEmbedProvider\
            return AlpineEmbedProvider(settings.model_name)\
        except ImportError as e:\
            logger.error(f"Failed to import Alpine provider: {e}")\
            logger.info("Falling back to FastEmbed provider")\
            # Fall back to FastEmbed if Alpine provider fails\
            settings.provider_type = EmbeddingProviderType.FASTEMBED
}' /tmp/mcp-server-qdrant/src/mcp_server_qdrant/embeddings/factory.py

# Add logger import if not already there
sed -i '1,/import/ s/from mcp_server_qdrant.settings import EmbeddingProviderSettings/from mcp_server_qdrant.settings import EmbeddingProviderSettings\nimport logging\n\n# Set up logger\nlogger = logging.getLogger(__name__)/' /tmp/mcp-server-qdrant/src/mcp_server_qdrant/embeddings/factory.py

# Create a .env file with Alpine provider settings
cat > /tmp/mcp-server-qdrant/.env << EOF
EMBEDDING_PROVIDER=alpine
EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
COLLECTION_NAME=memories
LOG_LEVEL=INFO
EOF

echo "Patch applied successfully! The code should now work on Alpine Linux."
echo "Install the required dependencies with:"
echo "pip install fastembed \"onnxruntime>=1.14.0\" numpy tokenizers"