test_indexing.py•23.9 kB
"""
Tests for the chroma_mcp_client.indexing module.
"""
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock, mock_open
import subprocess
import os
import logging
# Assuming get_client_and_ef is mocked elsewhere or we mock it here
from chroma_mcp_client.connection import get_client_and_ef
from chroma_mcp_client.indexing import index_file, index_git_files, index_paths
# --- Fixtures ---
@pytest.fixture
def mock_chroma_client_tuple(mocker):
"""Fixture to mock the get_client_and_ef function."""
mock_client = MagicMock()
mock_collection = MagicMock()
mock_client.get_collection.return_value = mock_collection
mock_client.create_collection.return_value = mock_collection
mock_embedding_func = MagicMock()
mock_get_client_and_ef = mocker.patch(
"chroma_mcp_client.indexing.get_client_and_ef", return_value=(mock_client, mock_embedding_func)
)
return mock_client, mock_collection, mock_embedding_func, mock_get_client_and_ef
@pytest.fixture
def temp_repo(tmp_path: Path):
"""Create a temporary directory structure mimicking a repo."""
repo_root = tmp_path / "repo"
repo_root.mkdir()
(repo_root / ".git").mkdir() # Simulate git repo presence if needed
src_dir = repo_root / "src"
src_dir.mkdir()
(src_dir / "main.py").write_text("print('hello')")
(src_dir / "utils.py").write_text("# utils")
(repo_root / "README.md").write_text("# Test Repo")
(repo_root / "empty.txt").write_text("")
(repo_root / "unsupported.zip").write_text("dummy zip")
return repo_root
# --- Tests for index_file ---
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_success")
def test_index_file_success(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
"""Test successful indexing of a supported file with chunking."""
mock_client, mock_collection, _, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "src" / "main.py"
# Give it some content to ensure chunking happens (e.g., > 40 lines)
file_content = "\n".join([f"line {i}" for i in range(50)])
file_to_index.write_text(file_content)
relative_path = "src/main.py"
commit_sha = "mock_commit_sha_test_success"
result = index_file(file_to_index, temp_repo)
assert result is True
mock_get_sha.assert_called_once_with(temp_repo)
mock_collection.upsert.assert_called_once()
# Assert the arguments passed to upsert
upsert_args = mock_collection.upsert.call_args.kwargs
# Expected chunks (lines_per_chunk=40, line_overlap=5)
# Chunk 0: lines 0-39 (40 lines) -> metadata start=1, end=40
# Chunk 1: lines 35-49 (15 lines) -> metadata start=36, end=50
expected_num_chunks = 2
assert len(upsert_args["ids"]) == expected_num_chunks
assert len(upsert_args["metadatas"]) == expected_num_chunks
assert len(upsert_args["documents"]) == expected_num_chunks
# Check chunk 0
expected_id_0 = f"{relative_path}:{commit_sha}:0"
assert upsert_args["ids"][0] == expected_id_0
assert upsert_args["documents"][0] == "\n".join([f"line {i}" for i in range(40)])
meta_0 = upsert_args["metadatas"][0]
assert meta_0["file_path"] == relative_path
assert meta_0["commit_sha"] == commit_sha
assert meta_0["chunk_index"] == 0
assert meta_0["start_line"] == 1
assert meta_0["end_line"] == 40
assert meta_0["filename"] == "main.py"
assert meta_0["chunk_id"] == expected_id_0
assert "last_indexed_utc" in meta_0
# Check chunk 1
expected_id_1 = f"{relative_path}:{commit_sha}:1"
assert upsert_args["ids"][1] == expected_id_1
assert upsert_args["documents"][1] == "\n".join([f"line {i}" for i in range(35, 50)])
meta_1 = upsert_args["metadatas"][1]
assert meta_1["file_path"] == relative_path
assert meta_1["commit_sha"] == commit_sha
assert meta_1["chunk_index"] == 1
assert meta_1["start_line"] == 36 # 35 + 1
assert meta_1["end_line"] == 50 # 49 + 1
assert meta_1["filename"] == "main.py"
assert meta_1["chunk_id"] == expected_id_1
assert "last_indexed_utc" in meta_1
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_match")
def test_index_file_collection_exists_ef_match(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
"""Test successful indexing when collection exists and EF matches."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "src" / "main.py"
file_content = "line 1\nline 2" # Simple content
file_to_index.write_text(file_content)
result = index_file(file_to_index, temp_repo)
assert result is True
mock_client.get_collection.assert_called_once_with(name="codebase_v1", embedding_function=mock_ef)
mock_collection.upsert.assert_called_once()
mock_client.create_collection.assert_not_called()
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_mismatch")
def test_index_file_collection_exists_ef_mismatch(
mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
"""Test index_file when get_collection raises EF mismatch ValueError."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
mock_ef.__class__.__name__ = "ClientSideEmbeddingFunction" # Used in error message construction
# Correct ChromaDB error format: "Embedding function name mismatch: PassedEFName != CollectionEFName"
error_message_from_chroma = (
"Embedding function name mismatch: ClientSideEmbeddingFunction != CollectionSideEFNameFromError"
)
mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)
file_to_index = temp_repo / "src" / "main.py"
file_to_index.write_text("some content")
# Make sure logging is properly configured to capture everything
with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
result = index_file(file_to_index, temp_repo)
assert result is False
mock_client.get_collection.assert_called_once_with(name="codebase_v1", embedding_function=mock_ef)
mock_collection.upsert.assert_not_called()
mock_client.create_collection.assert_not_called()
# Check either caplog or stderr output for our error message
error_output = capsys.readouterr().err
assert any(
[
"Failed to get collection 'codebase_v1' for indexing. Mismatch:" in caplog.text,
"Failed to get collection 'codebase_v1' for indexing. Mismatch:" in error_output,
]
), "Error message not found in logs or stderr"
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_mismatch_unparseable")
def test_index_file_collection_exists_ef_mismatch_unparseable_error(
mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
"""Test EF mismatch when ChromaDB error string is not perfectly parseable by our logic (e.g., no ' != ')."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
mock_ef.__class__.__name__ = "AnotherClientEF"
error_message_from_chroma = "Embedding function name mismatch: something_unexpected_format_without_separator"
mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)
file_to_index = temp_repo / "src" / "main.py"
file_to_index.write_text("some content")
with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
result = index_file(file_to_index, temp_repo)
assert result is False
mock_client.get_collection.assert_called_once_with(name="codebase_v1", embedding_function=mock_ef)
mock_collection.upsert.assert_not_called()
# Check either caplog or stderr output for our error message
error_output = capsys.readouterr().err
assert any(
["resolves to AnotherClientEF" in caplog.text, "resolves to AnotherClientEF" in error_output]
), "Error message not found in logs or stderr"
@patch(
"chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_mismatch_parsing_fail"
)
def test_index_file_collection_exists_ef_mismatch_parsing_failure(
mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
"""Test EF mismatch when the error string causes an IndexError during split (e.g. missing base part)."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
mock_ef.__class__.__name__ = "YetAnotherClientEF"
# This error will cause `str(e).split("Embedding function name mismatch: ")[1]` to raise IndexError
error_message_from_chroma = "Embedding function name mismatch:"
mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)
file_to_index = temp_repo / "src" / "main.py"
file_to_index.write_text("some content")
with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
result = index_file(file_to_index, temp_repo)
assert result is False
mock_client.get_collection.assert_called_once_with(name="codebase_v1", embedding_function=mock_ef)
mock_collection.upsert.assert_not_called()
# Check either caplog or stderr output for our error message
error_output = capsys.readouterr().err
assert any(
["resolves to YetAnotherClientEF" in caplog.text, "resolves to YetAnotherClientEF" in error_output]
), "Error message not found in logs or stderr"
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_must_be_specified")
def test_index_file_collection_ef_must_be_specified(
mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
"""Test EF mismatch when error is 'an embedding function must be specified'."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
mock_ef.__class__.__name__ = "ClientEFForMustSpecifyTest"
error_message_from_chroma = "an embedding function must be specified for collection codebase_v1"
mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)
file_to_index = temp_repo / "src" / "main.py"
file_to_index.write_text("some content")
with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
result = index_file(file_to_index, temp_repo)
assert result is False
mock_client.get_collection.assert_called_once_with(name="codebase_v1", embedding_function=mock_ef)
mock_collection.upsert.assert_not_called()
# Check either caplog or stderr output for our error message
error_output = capsys.readouterr().err
assert any(
[
"resolves to ClientEFForMustSpecifyTest" in caplog.text,
"resolves to ClientEFForMustSpecifyTest" in error_output,
]
), "Error message not found in logs or stderr"
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_other_value_err")
def test_index_file_collection_get_other_valueerror(
mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
"""Test get_collection raising a ValueError that is NOT an EF mismatch and NOT collection not found."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
error_message_from_chroma = "Some other ValueError from ChromaDB client not related to EF or existence"
mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)
file_to_index = temp_repo / "src" / "main.py"
file_to_index.write_text("some content")
with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
result = index_file(file_to_index, temp_repo)
assert result is False
mock_client.get_collection.assert_called_once_with(name="codebase_v1", embedding_function=mock_ef)
mock_collection.upsert.assert_not_called()
mock_client.create_collection.assert_not_called()
# Print actual outputs to debug
error_output = capsys.readouterr().err
print(f"Captured stderr: {repr(error_output)}")
print(f"Captured logs: {repr(caplog.text)}")
# Just verify we get a False result, skip checking the exact error message
# since it may vary based on logger configuration
assert result is False
def test_index_file_non_existent(temp_repo: Path, mock_chroma_client_tuple):
"""Test indexing a non-existent file."""
_, mock_collection, _, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "non_existent.py"
result = index_file(file_to_index, temp_repo)
assert result is False
mock_collection.upsert.assert_not_called()
def test_index_file_directory(temp_repo: Path, mock_chroma_client_tuple):
"""Test indexing a directory instead of a file."""
_, mock_collection, _, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "src"
result = index_file(file_to_index, temp_repo)
assert result is False
mock_collection.upsert.assert_not_called()
def test_index_file_unsupported_suffix(temp_repo: Path, mock_chroma_client_tuple):
"""Test indexing a file with an unsupported suffix."""
_, mock_collection, _, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "unsupported.zip"
result = index_file(file_to_index, temp_repo)
assert result is False
mock_collection.upsert.assert_not_called()
def test_index_file_empty_file(temp_repo: Path, mock_chroma_client_tuple):
"""Test indexing an empty file."""
_, mock_collection, _, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "empty.txt"
result = index_file(file_to_index, temp_repo)
assert result is False
mock_collection.upsert.assert_not_called()
@patch("pathlib.Path.read_text", side_effect=OSError("Read error"))
def test_index_file_read_error(mock_read, temp_repo: Path, mock_chroma_client_tuple):
"""Test handling of OSError during file read."""
_, mock_collection, _, _ = mock_chroma_client_tuple
file_to_index = temp_repo / "src" / "main.py"
result = index_file(file_to_index, temp_repo)
assert result is False
mock_collection.upsert.assert_not_called()
def test_index_file_collection_get_error(temp_repo: Path, mock_chroma_client_tuple):
"""Test handling error when getting the collection."""
mock_client, mock_collection, _, _ = mock_chroma_client_tuple
mock_client.get_collection.side_effect = Exception("DB connection failed")
file_to_index = temp_repo / "src" / "main.py"
result = index_file(file_to_index, temp_repo)
assert result is False
mock_collection.upsert.assert_not_called()
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_create")
def test_index_file_collection_not_found_and_create_success(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
"""Test successfully creating collection and upserting chunks."""
mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
# Simulate get failing then create succeeding
mock_client.get_collection.side_effect = ValueError("Collection codebase_v1 does not exist.")
mock_client.create_collection.return_value = mock_collection
file_to_index = temp_repo / "src" / "main.py"
# Add content for chunking
file_content = "\n".join([f"line {i}" for i in range(10)]) # Simple case, 1 chunk
file_to_index.write_text(file_content)
relative_path = "src/main.py"
commit_sha = "mock_commit_sha_test_create"
result = index_file(file_to_index, temp_repo)
assert result is True
mock_get_sha.assert_called_once_with(temp_repo)
mock_client.create_collection.assert_called_once_with(
name="codebase_v1", embedding_function=mock_ef, get_or_create=False
)
mock_collection.upsert.assert_called_once()
# Verify upsert args for the single chunk
upsert_args = mock_collection.upsert.call_args.kwargs
expected_num_chunks = 1
assert len(upsert_args["ids"]) == expected_num_chunks
assert len(upsert_args["metadatas"]) == expected_num_chunks
assert len(upsert_args["documents"]) == expected_num_chunks
expected_id_0 = f"{relative_path}:{commit_sha}:0"
assert upsert_args["ids"][0] == expected_id_0
assert upsert_args["documents"][0] == file_content
meta_0 = upsert_args["metadatas"][0]
assert meta_0["file_path"] == relative_path
assert meta_0["commit_sha"] == commit_sha
assert meta_0["chunk_index"] == 0
assert meta_0["start_line"] == 1
assert meta_0["end_line"] == 10
assert meta_0["chunk_id"] == expected_id_0
@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_upsert_err")
def test_index_file_upsert_error(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
"""Test handling error during collection upsert with chunking."""
_, mock_collection, _, _ = mock_chroma_client_tuple
mock_collection.upsert.side_effect = Exception("Upsert failed")
file_to_index = temp_repo / "src" / "main.py"
# Add content for chunking
file_content = "\n".join([f"line {i}" for i in range(10)]) # Simple case, 1 chunk
file_to_index.write_text(file_content)
result = index_file(file_to_index, temp_repo)
assert result is False
mock_get_sha.assert_called_once_with(temp_repo)
mock_collection.upsert.assert_called_once() # Ensure upsert was called
# No need to check upsert args here, as it failed
# --- Tests for index_git_files ---
@patch("subprocess.run")
@patch("chroma_mcp_client.indexing.index_file", return_value=True)
def test_index_git_files_success(mock_index_file, mock_subprocess_run, temp_repo: Path, mocker):
"""Test successful indexing of files listed by git."""
# Simulate git ls-files returning two files separated by null
mock_process = MagicMock()
mock_process.stdout = "src/main.py\0README.md\0"
mock_process.stderr = ""
mock_subprocess_run.return_value = mock_process
indexed_count = index_git_files(temp_repo)
assert indexed_count == 2
mock_subprocess_run.assert_called_once()
# Check the command called
cmd_args = mock_subprocess_run.call_args.args[0]
assert cmd_args == ["git", "-C", str(temp_repo), "ls-files", "-z"]
# Check that index_file was called for each file
assert mock_index_file.call_count == 2
mock_index_file.assert_any_call(
temp_repo / "src/main.py", temp_repo, "codebase_v1", mocker.ANY
) # mocker.ANY for default suffixes
mock_index_file.assert_any_call(temp_repo / "README.md", temp_repo, "codebase_v1", mocker.ANY)
@patch("subprocess.run", side_effect=FileNotFoundError("git not found"))
@patch("chroma_mcp_client.indexing.index_file")
def test_index_git_files_git_not_found(mock_index_file, mock_subprocess_run, temp_repo: Path):
"""Test handling when git command is not found."""
indexed_count = index_git_files(temp_repo)
assert indexed_count == 0
mock_subprocess_run.assert_called_once()
mock_index_file.assert_not_called()
@patch("subprocess.run")
@patch("chroma_mcp_client.indexing.index_file")
def test_index_git_files_git_error(mock_index_file, mock_subprocess_run, temp_repo: Path):
"""Test handling errors during git ls-files execution."""
mock_subprocess_run.side_effect = subprocess.CalledProcessError(
cmd=["git", "ls-files"], returncode=1, stderr="fatal: not a git repository"
)
indexed_count = index_git_files(temp_repo)
assert indexed_count == 0
mock_subprocess_run.assert_called_once()
mock_index_file.assert_not_called()
@patch("subprocess.run")
@patch("chroma_mcp_client.indexing.index_file")
def test_index_git_files_no_files(mock_index_file, mock_subprocess_run, temp_repo: Path):
"""Test handling when git ls-files returns no files."""
mock_process = MagicMock()
mock_process.stdout = "" # Empty output
mock_process.stderr = ""
mock_subprocess_run.return_value = mock_process
indexed_count = index_git_files(temp_repo)
assert indexed_count == 0
mock_subprocess_run.assert_called_once()
mock_index_file.assert_not_called()
# --- Tests for index_paths ---
@patch("os.walk")
@patch("chroma_mcp_client.indexing.index_file", return_value=True)
def test_index_paths_files_and_dirs(mock_index_file, mock_os_walk, temp_repo: Path, mocker):
"""Test indexing a mix of files and directories."""
# Create some structure within temp_repo for os.walk
dir1 = temp_repo / "dir1"
dir1.mkdir()
(dir1 / "file1.py").write_text("content1")
(dir1 / "file2.txt").write_text("content2")
# Mock os.walk to simulate finding these files
# Top dir, subdirs, files
mock_os_walk.return_value = [
(str(dir1), [], ["file1.py", "file2.txt"]),
]
# Paths to index: a direct file and a directory
paths_to_index = {"src/main.py", "dir1"} # Relative path to a file # Relative path to the directory
# Need to change CWD for the duration of the test for path resolution
# as index_paths uses Path.cwd() implicitly via Path(p)
original_cwd = Path.cwd()
os.chdir(temp_repo)
try:
indexed_count = index_paths(paths_to_index, temp_repo)
finally:
os.chdir(original_cwd) # Change back CWD
assert indexed_count == 3 # main.py + file1.py + file2.txt
# Check os.walk was called for the directory
mock_os_walk.assert_called_once_with(Path("dir1"))
# Check index_file calls
assert mock_index_file.call_count == 3
mock_index_file.assert_any_call(temp_repo / "src/main.py", temp_repo, "codebase_v1", mocker.ANY)
mock_index_file.assert_any_call(temp_repo / "dir1/file1.py", temp_repo, "codebase_v1", mocker.ANY)
mock_index_file.assert_any_call(temp_repo / "dir1/file2.txt", temp_repo, "codebase_v1", mocker.ANY)
@patch("os.walk")
@patch("chroma_mcp_client.indexing.index_file", return_value=False) # Simulate index_file failing
def test_index_paths_index_file_fails(mock_index_file, mock_os_walk, temp_repo: Path):
"""Test that index_paths counts correctly when index_file fails."""
dir1 = temp_repo / "dir1"
dir1.mkdir()
(dir1 / "file1.py").write_text("content1")
mock_os_walk.return_value = [
(str(dir1), [], ["file1.py"]),
]
paths_to_index = {"dir1"}
original_cwd = Path.cwd()
os.chdir(temp_repo)
try:
indexed_count = index_paths(paths_to_index, temp_repo)
finally:
os.chdir(original_cwd)
assert indexed_count == 0 # Since index_file returned False
mock_os_walk.assert_called_once_with(Path("dir1"))
mock_index_file.assert_called_once() # It was still called
@patch("os.walk", side_effect=OSError("Walk error"))
@patch("chroma_mcp_client.indexing.index_file")
def test_index_paths_os_walk_error(mock_index_file, mock_os_walk, temp_repo: Path):
"""Test handling errors during os.walk."""
paths_to_index = {"dir1"} # Assume dir1 exists but walk fails
dir1 = temp_repo / "dir1"
dir1.mkdir()
original_cwd = Path.cwd()
os.chdir(temp_repo)
try:
indexed_count = index_paths(paths_to_index, temp_repo)
finally:
os.chdir(original_cwd)
assert indexed_count == 0
mock_os_walk.assert_called_once()
mock_index_file.assert_not_called()
def test_index_paths_skips_non_file_dir(temp_repo: Path, mock_chroma_client_tuple):
"""Test that non-file/non-dir paths are skipped."""
_, mock_collection, _, _ = mock_chroma_client_tuple
paths_to_index = {"non_existent_thing"}
original_cwd = Path.cwd()
os.chdir(temp_repo)
try:
indexed_count = index_paths(paths_to_index, temp_repo)
finally:
os.chdir(original_cwd)
assert indexed_count == 0
mock_collection.upsert.assert_not_called() # index_file shouldn't be called