"""
Shared pytest fixtures for integration and e2e tests.
Provides both ephemeral (tmp_path) and persistent fixtures for different
testing scenarios:
- Ephemeral fixtures (tmp_path): Fast, isolated, used by default in unit tests
- Persistent fixtures: Realistic storage, shared across tests in a session/module
Use persistent fixtures when:
- Testing index persistence/loading behavior
- Testing manifest checking across test runs
- Simulating realistic production scenarios
- Testing index size/performance with larger datasets
Use ephemeral fixtures (tmp_path) when:
- Testing core logic in isolation
- Fast test iteration is the priority
- Each test needs complete isolation
"""
# These variables MUST be exported before any HuggingFace /
# sentence-transformers import below: per the original note, progress bars
# would otherwise pollute the JSON output that the E2E tests read.
# The values are assigned unconditionally, overriding anything already set
# in the environment of the test process.
import os

os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "TQDM_DISABLE": "1",
    }
)
from pathlib import Path
from typing import Any, Generator
import pytest
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from src.config import ChunkingConfig, Config, IndexingConfig, LLMConfig, SearchConfig, ServerConfig
from src.indexing.manager import IndexManager
from src.indices.graph import GraphStore
from src.indices.keyword import KeywordIndex
from src.indices.vector import VectorIndex
@pytest.fixture(autouse=True)
def isolate_xdg_data_home(tmp_path_factory, monkeypatch):
    """Isolate application data while preserving the HuggingFace model cache.

    For every test, ``XDG_DATA_HOME`` and ``HOME`` are pointed at fresh
    temporary directories so application data never leaks between tests or
    into the developer's real home directory. ``HF_HOME`` is pinned to the
    location that was in effect *before* ``HOME`` was replaced, so large
    embedding models are not re-downloaded (and rate limits are not hit)
    during parallel test execution.

    All changes go through ``monkeypatch`` and are undone after the test.
    """
    # Resolve the HuggingFace cache BEFORE touching HOME. The fallback chain
    # mirrors huggingface_hub: $HF_HOME, else $XDG_CACHE_HOME/huggingface,
    # else ~/.cache/huggingface. expanduser("~") is used instead of reading
    # $HOME directly so an unset HOME does not produce a relative path.
    default_cache_home = os.path.join(os.path.expanduser("~"), ".cache")
    cache_home = os.environ.get("XDG_CACHE_HOME", default_cache_home)
    original_hf_home = os.environ.get(
        "HF_HOME", os.path.join(cache_home, "huggingface")
    )

    # Isolated temp directories for application data.
    data_home = tmp_path_factory.mktemp("xdg-data-home")
    home_dir = tmp_path_factory.mktemp("home")
    monkeypatch.setenv("XDG_DATA_HOME", str(data_home))
    monkeypatch.setenv("HOME", str(home_dir))

    # Keep the HuggingFace cache at its original location (shared across
    # workers) now that HOME points somewhere else.
    monkeypatch.setenv("HF_HOME", original_hf_home)
# ============================================================================
# Test Fixture Factories
# ============================================================================
def make_test_config(tmp_path: Path, **overrides):
    """Build a ``Config`` whose storage lives under *tmp_path*.

    ``<tmp_path>/docs`` and ``<tmp_path>/index`` are created if they do not
    exist yet and are used as the indexing paths. Each keyword in
    *overrides* replaces the default section of the same name (``server``,
    ``indexing``, ``search``, ``chunking``) or is passed through to
    ``Config`` as an additional argument.
    """
    docs_dir, index_dir = tmp_path / "docs", tmp_path / "index"
    for directory in (docs_dir, index_dir):
        directory.mkdir(exist_ok=True)

    # Defaults first; caller-supplied sections win on key collisions.
    settings: dict[str, Any] = {
        "server": ServerConfig(host="localhost", port=8080),
        "indexing": IndexingConfig(
            documents_path=str(docs_dir),
            index_path=str(index_dir),
        ),
        "search": SearchConfig(),
        "chunking": ChunkingConfig(),
        **overrides,
    }
    return Config(**settings)
def create_test_document(docs_dir: Path | str, doc_id: str, content: str):
doc_path = Path(docs_dir) / f"{doc_id}.md"
doc_path.write_text(content)
return str(doc_path)
# ============================================================================
# Shared Embedding Model Fixture
# ============================================================================
@pytest.fixture(scope="session")
def shared_embedding_model():
    """Provide one embedding model instance for the whole test session.

    Model construction is guarded by a file lock stored in the HuggingFace
    cache directory (``$HF_HOME`` or ``~/.cache/huggingface``) so that only
    one pytest worker at a time builds/downloads the model. Acquiring the
    lock waits for at most 300 seconds.

    A throw-away embedding is computed before the model is returned so the
    first real test does not pay the first-call overhead.
    """
    import filelock

    default_cache = os.path.expanduser("~/.cache/huggingface")
    cache_root = os.environ.get("HF_HOME", default_cache)
    lock_file = os.path.join(cache_root, ".model_download.lock")
    # The lock's parent directory must exist before the lock can be created.
    os.makedirs(os.path.dirname(lock_file), exist_ok=True)

    download_lock = filelock.FileLock(lock_file, timeout=300)  # 5 minutes
    with download_lock:
        embedder = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
        # Warm-up call; the result is intentionally discarded.
        embedder.get_text_embedding("warmup")
    return embedder
@pytest.fixture(scope="module")
def module_vector_index(shared_embedding_model):
    """Provide a module-scoped ``VectorIndex`` built on the shared model.

    Prefer this over constructing ``VectorIndex()`` in function-scoped
    fixtures, which would reload the embedding model each time.

    Because the index is shared by every test in the module, use it only
    when tests do not mutate the index, or when they isolate their
    documents via ``tmp_path``.
    """
    index = VectorIndex(embedding_model=shared_embedding_model)
    return index
@pytest.fixture(scope="module")
def module_indices(shared_embedding_model):
    """Provide a module-scoped ``(vector, keyword, graph)`` index tuple.

    The vector index reuses the session-wide embedding model, so the model
    is not loaded again for each test in the module.

    The indices are shared by all tests in the module. Tests should either
    keep their documents in separate ``tmp_path`` directories, avoid
    mutating index state, or clear the indices explicitly between tests.
    """
    return (
        VectorIndex(embedding_model=shared_embedding_model),
        KeywordIndex(),
        GraphStore(),
    )
# ============================================================================
# Persistent Storage Fixtures
# ============================================================================
@pytest.fixture(scope="session")
def persistent_storage_root(tmp_path_factory) -> Path:
    """Return the root directory for session-wide persistent test storage.

    The directory is created once per session through ``tmp_path_factory``
    and is shared by every test that requests it, which lets tests exchange
    data and verify persistence behaviour.
    """
    storage_root = tmp_path_factory.mktemp("persistent_test_storage")
    return storage_root
@pytest.fixture(scope="session")
def persistent_docs_path(persistent_storage_root: Path) -> Path:
    """Return the session-scoped ``documents`` directory, creating it if needed.

    Files written here remain visible to later tests in the same session.
    """
    documents_dir = persistent_storage_root.joinpath("documents")
    documents_dir.mkdir(parents=True, exist_ok=True)
    return documents_dir
@pytest.fixture(scope="session")
def persistent_index_path(persistent_storage_root: Path) -> Path:
    """Return the session-scoped ``indices`` directory, creating it if needed.

    Index data written here remains visible to later tests in the same
    session.
    """
    indices_dir = persistent_storage_root.joinpath("indices")
    indices_dir.mkdir(parents=True, exist_ok=True)
    return indices_dir
# ============================================================================
# Persistent Configuration Fixtures
# ============================================================================
@pytest.fixture(scope="session")
def persistent_config(
    persistent_docs_path: Path,
    persistent_index_path: Path,
) -> Config:
    """Return a session-scoped ``Config`` bound to the persistent directories.

    Documents and indices point at the session-wide storage paths, so data
    written through this configuration is visible to every later test in
    the session.
    """
    server = ServerConfig()
    indexing = IndexingConfig(
        documents_path=str(persistent_docs_path),
        index_path=str(persistent_index_path),
        recursive=True,
    )
    parsers = {"**/*.md": "MarkdownParser"}
    search = SearchConfig(
        semantic_weight=1.0,
        keyword_weight=1.0,
        recency_bias=0.5,
        rrf_k_constant=60,
    )
    llm = LLMConfig(embedding_model="BAAI/bge-small-en-v1.5")
    return Config(
        server=server,
        indexing=indexing,
        parsers=parsers,
        search=search,
        llm=llm,
    )
# ============================================================================
# Module-Scoped Persistent Fixtures
# ============================================================================
@pytest.fixture(scope="module")
def persistent_indices_module(
    shared_embedding_model,
) -> Generator[tuple[VectorIndex, KeywordIndex, GraphStore], None, None]:
    """Yield one ``(vector, keyword, graph)`` index tuple per test module.

    The indices are created empty and are shared by all tests in the
    module, so data added by one test is visible to the tests that follow
    it. No teardown is performed.
    """
    indices = (
        VectorIndex(embedding_model=shared_embedding_model),
        KeywordIndex(),
        GraphStore(),
    )
    yield indices
@pytest.fixture(scope="module")
def persistent_manager_module(
    persistent_config: Config,
    persistent_indices_module: tuple[VectorIndex, KeywordIndex, GraphStore],
) -> IndexManager:
    """Return a module-scoped ``IndexManager`` on session-persistent storage.

    The manager combines the session-wide persistent configuration with the
    module-shared indices; useful for persistence and manifest-checking
    tests.
    """
    # The tuple is ordered (vector, keyword, graph), matching the
    # positional arguments expected after the config.
    return IndexManager(persistent_config, *persistent_indices_module)
# ============================================================================
# Function-Scoped Persistent Fixtures with Cleanup
# ============================================================================
@pytest.fixture
def persistent_indices_isolated(
    shared_embedding_model,
) -> Generator[tuple[VectorIndex, KeywordIndex, GraphStore], None, None]:
    """Yield a fresh ``(vector, keyword, graph)`` index tuple for each test.

    Every test gets new index objects (only the embedding model is shared),
    giving in-memory isolation while still allowing the indices to be
    persisted to or loaded from disk. No teardown is performed.
    """
    fresh_indices = (
        VectorIndex(embedding_model=shared_embedding_model),
        KeywordIndex(),
        GraphStore(),
    )
    yield fresh_indices
@pytest.fixture
def persistent_manager_isolated(
    persistent_config: Config,
    persistent_indices_isolated: tuple[VectorIndex, KeywordIndex, GraphStore],
) -> IndexManager:
    """Return a per-test ``IndexManager`` on session-persistent storage.

    The index objects are new for every test, but the configuration (and
    therefore the on-disk paths) is the session-scoped one, which allows
    testing persistence across separate manager instances.
    """
    # The tuple is ordered (vector, keyword, graph), matching the
    # positional arguments expected after the config.
    return IndexManager(persistent_config, *persistent_indices_isolated)
# ============================================================================
# Hybrid Fixtures (Module-Scoped Config + Function-Scoped Indices)
# ============================================================================
@pytest.fixture(scope="module")
def persistent_config_module(tmp_path_factory) -> Config:
    """Return a ``Config`` with storage dedicated to the current test module.

    A new base directory is created per module, containing ``documents``
    and ``indices`` sub-directories that are shared by all tests in that
    module.
    """
    module_root = tmp_path_factory.mktemp("module_persistent")
    documents_dir = module_root / "documents"
    indices_dir = module_root / "indices"
    for directory in (documents_dir, indices_dir):
        directory.mkdir(parents=True, exist_ok=True)

    server = ServerConfig()
    indexing = IndexingConfig(
        documents_path=str(documents_dir),
        index_path=str(indices_dir),
        recursive=True,
    )
    parsers = {"**/*.md": "MarkdownParser"}
    search = SearchConfig(
        semantic_weight=1.0,
        keyword_weight=1.0,
        recency_bias=0.5,
        rrf_k_constant=60,
    )
    llm = LLMConfig(embedding_model="BAAI/bge-small-en-v1.5")
    return Config(
        server=server,
        indexing=indexing,
        parsers=parsers,
        search=search,
        llm=llm,
    )
@pytest.fixture
def persistent_manager_with_module_config(
    persistent_config_module: Config,
    persistent_indices_isolated: tuple[VectorIndex, KeywordIndex, GraphStore],
) -> IndexManager:
    """Return a per-test ``IndexManager`` that uses module-level storage.

    Index objects are fresh for each test while the storage paths come from
    the module-scoped configuration, balancing isolation against realistic
    persistence testing.
    """
    # The tuple is ordered (vector, keyword, graph), matching the
    # positional arguments expected after the config.
    return IndexManager(persistent_config_module, *persistent_indices_isolated)
# ============================================================================
# Cleanup Utilities
# ============================================================================
@pytest.fixture
def cleanup_persistent_indices(persistent_index_path: Path) -> Generator[None, None, None]:
    """Empty the persistent index directory after the test has run.

    Request this fixture when a test needs guaranteed cleanup of the
    session-scoped index storage. The directory itself is kept; only its
    contents are removed.

    Example:
        def test_with_cleanup(
            persistent_manager_isolated,
            cleanup_persistent_indices
        ):
            # Test code here
            # Indices will be cleaned up after test
            pass
    """
    import shutil

    yield

    # Teardown: runs after the test body.
    if not persistent_index_path.exists():
        return
    for entry in persistent_index_path.iterdir():
        # is_dir() follows symlinks, but shutil.rmtree refuses to operate on
        # a symlink to a directory, so links are unlinked rather than
        # recursed into (which also leaves the link target untouched).
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()
@pytest.fixture
def cleanup_persistent_docs(persistent_docs_path: Path) -> Generator[None, None, None]:
    """Empty the persistent documents directory after the test has run.

    Request this fixture when a test needs guaranteed cleanup of the
    session-scoped documents storage. The directory itself is kept; only
    its contents are removed.

    Example:
        def test_with_doc_cleanup(
            persistent_docs_path,
            cleanup_persistent_docs
        ):
            # Test code here
            # Documents will be cleaned up after test
            pass
    """
    # Imported once up front rather than on every loop iteration.
    import shutil

    yield

    # Teardown: runs after the test body.
    if not persistent_docs_path.exists():
        return
    for entry in persistent_docs_path.iterdir():
        # is_dir() follows symlinks, but shutil.rmtree refuses to operate on
        # a symlink to a directory, so links are unlinked rather than
        # recursed into (which also leaves the link target untouched).
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()
# ============================================================================
# pytest-xdist hook to handle serial tests
# ============================================================================
def pytest_xdist_auto_num_workers(config):
    """pytest-xdist hook: supply no custom worker count.

    ``None`` is always returned, leaving it to pytest-xdist to determine
    the number of workers automatically. *config* is not inspected.
    """
    no_override = None
    return no_override
def pytest_collection_modifyitems(config, items):
    """Put every test marked ``serial`` into the ``serial`` xdist group.

    Each collected item carrying a ``serial`` marker (on the function,
    class or module) additionally receives
    ``pytest.mark.xdist_group(name="serial")``.

    The marker is looked up with ``get_closest_marker`` rather than
    ``"serial" in item.keywords``: ``keywords`` also contains node names,
    so a test, module or directory that merely happens to be called
    ``serial`` would otherwise be grouped as well.

    NOTE(review): ``xdist_group`` is presumably only honoured when xdist
    runs with ``--dist loadgroup``, and it keeps the grouped tests on one
    worker rather than stopping other workers from running other tests at
    the same time -- confirm against the project's pytest options.
    """
    for item in items:
        if item.get_closest_marker("serial") is not None:
            item.add_marker(pytest.mark.xdist_group(name="serial"))