"""
Tests for the indexer module.
"""
import os
import tempfile
import unittest
from pathlib import Path
import pytest
from dotenv import load_dotenv
from llama_index.core import (
Document
)
from indexer import DEFAULT_LOCAL_EMBEDDING_MODEL_NAME
from indexer.index import Indexer
from indexer.retriever import create_retriever_from_env
# Run at import time, before any test or decorator in this module calls
# os.getenv(); presumably this pulls EMBED_ENDPOINT / EMBED_MODEL from a local
# .env file when one exists (python-dotenv behaviour -- confirm).
load_dotenv()
def create_indexer_from_env(
    input_path: str,
    output_path: str,
    window_size=3
):
    """Build an Indexer whose embedding settings come from environment variables.

    Args:
        input_path: Passed through as the Indexer's first positional argument.
        output_path: Passed through as the Indexer's second positional argument.
        window_size: Forwarded unchanged as the Indexer's ``window_size``.

    Returns:
        The constructed ``Indexer``.
    """
    # EMBED_ENDPOINT has no fallback, so it is None when the variable is unset.
    endpoint = os.getenv('EMBED_ENDPOINT')
    # EMBED_MODEL falls back to the package's default local model name.
    model_name = os.getenv('EMBED_MODEL', DEFAULT_LOCAL_EMBEDDING_MODEL_NAME)
    return Indexer(
        input_path,
        output_path,
        embed_endpoint=endpoint,
        embedding_model_name=model_name,
        window_size=window_size,
    )
@pytest.fixture
def temp_dir():
    """Yield the path of a scratch directory that is removed after the test."""
    scratch = tempfile.TemporaryDirectory()
    try:
        # Hand the test the directory path (a str), not the manager object.
        yield scratch.name
    finally:
        scratch.cleanup()
@pytest.fixture
def sample_documents():
    """Return two small multi-sentence Document objects for use in tests."""
    texts = (
        "This is a test document. It contains multiple sentences. Each sentence is important.",
        "Another test document. With different content. For testing purposes.",
    )
    return [Document(text=body) for body in texts]
def test_indexer_initialization(temp_dir):
    """Verify that a new Indexer stores its settings and that its output path exists."""
    index_dir = os.path.join(temp_dir, "index")
    indexer = create_indexer_from_env(
        input_path=temp_dir, output_path=index_dir, window_size=2
    )

    base = Path(temp_dir)
    # The assertions compare against Path objects even though strings were passed in.
    assert indexer.input_path == base
    assert indexer.output_path == base / "index"
    assert indexer.window_size == 2
    # The output location must exist once construction has finished.
    assert os.path.exists(indexer.output_path)
def test_indexer_with_embed_endpoint(temp_dir):
    """Verify that an explicit embed endpoint becomes the embed model's ``api_base``."""
    endpoint_url = "http://test-endpoint"
    index_dir = os.path.join(temp_dir, "index")

    indexer = Indexer(
        input_path=temp_dir, output_path=index_dir, embed_endpoint=endpoint_url
    )

    assert indexer.embed_model.api_base == endpoint_url
# os.getenv() returns None or a str, never the bool False, so comparing the
# result to False can never trigger a skip. Test truthiness instead so the
# test is skipped when EMBED_ENDPOINT is unset or empty.
@pytest.mark.skipif(not os.getenv('EMBED_ENDPOINT'), reason='no EMBED_ENDPOINT defined')
def test_load_documents(temp_dir, sample_documents):
    """Test document loading.

    Writes each sample document to its own ``doc_<i>.txt`` file under a
    ``docs`` directory, then checks that ``Indexer._load_documents`` returns
    one ``Document`` per file written.
    """
    # Create test files
    doc_dir = os.path.join(temp_dir, "docs")
    os.makedirs(doc_dir)
    for i, doc in enumerate(sample_documents):
        # Explicit encoding keeps the written bytes independent of the platform locale.
        with open(os.path.join(doc_dir, f"doc_{i}.txt"), "w", encoding="utf-8") as f:
            f.write(doc.text)

    indexer = Indexer(
        input_path=doc_dir,
        output_path=os.path.join(temp_dir, "index"),
        embed_endpoint=os.getenv('EMBED_ENDPOINT'),
    )
    documents = indexer._load_documents()

    assert len(documents) == len(sample_documents)
    assert all(isinstance(doc, Document) for doc in documents)
def test_build_and_load_index(temp_dir, sample_documents):
    """Build an index from files on disk, reload it, and run one query against it."""
    # Write each sample document to its own text file.
    docs_path = os.path.join(temp_dir, "docs")
    os.makedirs(docs_path)
    for position, sample in enumerate(sample_documents):
        target = os.path.join(docs_path, f"doc_{position}.txt")
        with open(target, "w") as handle:
            handle.write(sample.text)

    index_path = os.path.join(temp_dir, "index")
    indexer = create_indexer_from_env(input_path=docs_path, output_path=index_path)

    # Build the index, then read it back through the same Indexer.
    indexer.build_index()
    reloaded = indexer.load_index()
    assert reloaded is not None

    # A retriever pointed at the same index directory should return some text.
    retriever = create_retriever_from_env(index_path)
    answer = retriever.query_as_text('what is the document about')
    assert len(answer) > 0