"""
Indexer module for local RAG MCP server.
"""
import os
from typing import Optional
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from indexer.abstract_tokenizer import AbstractTokenizer
from indexer.remote_tei_tokenizer import RemoteTeiTokenizer
from indexer.tiktoken_tokenizer import TikTokenTokenizer
# Default HuggingFace model id used for local embedding generation.
DEFAULT_LOCAL_EMBEDDING_MODEL_NAME="BAAI/bge-large-en-v1.5"
# Default chunk size for splitting documents at index time.
# NOTE(review): presumably measured in tokens — confirm against the splitter that consumes it.
DEFAULT_INDEXER_CHUNK_SIZE=384
# Default overlap between consecutive chunks (same unit as chunk size).
DEFAULT_INDEXER_OVERLAP=50
def build_embedding_function(embedding_model_name: str = DEFAULT_LOCAL_EMBEDDING_MODEL_NAME,
                             embed_endpoint: Optional[str] = None) -> BaseEmbedding:
    """
    Create the embedding model used by the indexer.

    Args:
        embedding_model_name (str): Name of the embedding model.
            Defaults to DEFAULT_LOCAL_EMBEDDING_MODEL_NAME.
        embed_endpoint (Optional[str]): API endpoint of a remote,
            OpenAI-compatible embedding service. When provided, an
            OpenAIEmbedding client is returned; when None, a local
            HuggingFaceEmbedding model is loaded instead.

    Returns:
        BaseEmbedding: The initialized embedding model.
    """
    if embed_endpoint:
        # Remote endpoint: the model name may be specific to that endpoint.
        # A dummy key is used as fallback so endpoints without auth still work.
        return OpenAIEmbedding(
            api_key=os.getenv("OPENAI_API_KEY", "DUMMY_API_KEY"),
            api_base=embed_endpoint,
            model_name=embedding_model_name,
        )

    # Local path — imported lazily so the HuggingFace stack is only a
    # requirement when no remote endpoint is configured.
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    return HuggingFaceEmbedding(model_name=embedding_model_name)
def build_tokenizer(embedding_model_name: str = DEFAULT_LOCAL_EMBEDDING_MODEL_NAME,
                    token_endpoint: Optional[str] = None,
                    tokenizer_model_name: Optional[str] = None) -> AbstractTokenizer:
    """
    Build the tokenizer matching the configured embedding setup.

    Selection order:
        1. token_endpoint given  -> RemoteTeiTokenizer against that endpoint.
        2. tokenizer_model_name is a known tiktoken encoding -> TikTokenTokenizer.
        3. otherwise -> HuggingFaceTokenizer for tokenizer_model_name, falling
           back to embedding_model_name when no explicit name was given.

    Args:
        embedding_model_name (str): Embedding model whose tokenizer is used
            when tokenizer_model_name is not provided.
            Defaults to DEFAULT_LOCAL_EMBEDDING_MODEL_NAME.
        token_endpoint (Optional[str]): URL of a remote TEI tokenization
            service. If provided, it takes precedence over all other options.
        tokenizer_model_name (Optional[str]): Explicit tokenizer model name,
            either a tiktoken encoding or a HuggingFace model id.

    Returns:
        AbstractTokenizer: The initialized tokenizer.
    """
    if token_endpoint:
        return RemoteTeiTokenizer(api_url=token_endpoint)

    if tokenizer_model_name:
        # Known tiktoken encodings are handled without the HuggingFace stack.
        if tokenizer_model_name in ('o200k_base', 'cl100k_base', 'p50k_base', 'r50k_base'):
            return TikTokenTokenizer(tokenizer_model_name)
    else:
        # No explicit tokenizer requested: use the embedding model's own.
        tokenizer_model_name = embedding_model_name

    # Lazy import keeps the HuggingFace dependency optional for
    # remote/tiktoken-only deployments.
    from indexer.hf_tokenizer import HuggingFaceTokenizer
    return HuggingFaceTokenizer(tokenizer_model_name)