Skip to main content
Glama

Chroma MCP Server

by djm81
test_indexing.py (23.9 kB)
"""Tests for the chroma_mcp_client.indexing module."""

import contextlib
import logging
import os
import subprocess
from pathlib import Path
from unittest.mock import patch, MagicMock, mock_open

import pytest

# Assuming get_client_and_ef is mocked elsewhere or we mock it here
from chroma_mcp_client.connection import get_client_and_ef
from chroma_mcp_client.indexing import index_file, index_git_files, index_paths

# Collection name the tests expect index_file / index_git_files / index_paths
# to use when no explicit collection is passed.
COLLECTION_NAME = "codebase_v1"

# Failure message attached to every "was the error reported?" assertion.
_NOT_REPORTED = "Error message not found in logs or stderr"


# --- Helpers ---


def _error_reported(fragment: str, caplog, capsys) -> bool:
    """Return True when *fragment* appears in captured log text or captured stderr.

    The tests accept either channel, so both the ``caplog`` text and the
    ``capsys`` stderr stream are inspected.
    """
    stderr_text = capsys.readouterr().err
    return fragment in caplog.text or fragment in stderr_text


@contextlib.contextmanager
def _working_directory(path: Path):
    """Temporarily switch the process CWD to *path*, restoring it on exit.

    The previous working directory is restored even if the body raises.
    """
    previous_cwd = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous_cwd)


# --- Fixtures ---


@pytest.fixture
def mock_chroma_client_tuple(mocker):
    """Fixture to mock the get_client_and_ef function.

    Returns a tuple ``(client, collection, embedding_function, patched_getter)``.
    Both ``get_collection`` and ``create_collection`` on the client return the
    same mock collection.
    """
    mock_client = MagicMock()
    mock_collection = MagicMock()
    mock_client.get_collection.return_value = mock_collection
    mock_client.create_collection.return_value = mock_collection
    mock_embedding_func = MagicMock()
    mock_get_client_and_ef = mocker.patch(
        "chroma_mcp_client.indexing.get_client_and_ef",
        return_value=(mock_client, mock_embedding_func),
    )
    return mock_client, mock_collection, mock_embedding_func, mock_get_client_and_ef


@pytest.fixture
def temp_repo(tmp_path: Path):
    """Create a temporary directory structure mimicking a repo."""
    repo_root = tmp_path / "repo"
    repo_root.mkdir()
    (repo_root / ".git").mkdir()  # Simulate git repo presence if needed
    src_dir = repo_root / "src"
    src_dir.mkdir()
    (src_dir / "main.py").write_text("print('hello')")
    (src_dir / "utils.py").write_text("# utils")
    (repo_root / "README.md").write_text("# Test Repo")
    (repo_root / "empty.txt").write_text("")
    (repo_root / "unsupported.zip").write_text("dummy zip")
    return repo_root


# --- Tests for index_file ---


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_success")
def test_index_file_success(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
    """Test successful indexing of a supported file with chunking."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "src" / "main.py"
    # Give it some content to ensure chunking happens (e.g., > 40 lines)
    file_content = "\n".join(f"line {i}" for i in range(50))
    file_to_index.write_text(file_content)
    relative_path = "src/main.py"
    commit_sha = "mock_commit_sha_test_success"

    result = index_file(file_to_index, temp_repo)

    assert result is True
    mock_get_sha.assert_called_once_with(temp_repo)
    mock_collection.upsert.assert_called_once()

    # Assert the arguments passed to upsert
    upsert_args = mock_collection.upsert.call_args.kwargs

    # Expected chunks (lines_per_chunk=40, line_overlap=5)
    # Chunk 0: lines 0-39 (40 lines) -> metadata start=1, end=40
    # Chunk 1: lines 35-49 (15 lines) -> metadata start=36, end=50
    expected_num_chunks = 2
    assert len(upsert_args["ids"]) == expected_num_chunks
    assert len(upsert_args["metadatas"]) == expected_num_chunks
    assert len(upsert_args["documents"]) == expected_num_chunks

    # Check chunk 0
    expected_id_0 = f"{relative_path}:{commit_sha}:0"
    assert upsert_args["ids"][0] == expected_id_0
    assert upsert_args["documents"][0] == "\n".join(f"line {i}" for i in range(40))
    meta_0 = upsert_args["metadatas"][0]
    assert meta_0["file_path"] == relative_path
    assert meta_0["commit_sha"] == commit_sha
    assert meta_0["chunk_index"] == 0
    assert meta_0["start_line"] == 1
    assert meta_0["end_line"] == 40
    assert meta_0["filename"] == "main.py"
    assert meta_0["chunk_id"] == expected_id_0
    assert "last_indexed_utc" in meta_0

    # Check chunk 1
    expected_id_1 = f"{relative_path}:{commit_sha}:1"
    assert upsert_args["ids"][1] == expected_id_1
    assert upsert_args["documents"][1] == "\n".join(f"line {i}" for i in range(35, 50))
    meta_1 = upsert_args["metadatas"][1]
    assert meta_1["file_path"] == relative_path
    assert meta_1["commit_sha"] == commit_sha
    assert meta_1["chunk_index"] == 1
    assert meta_1["start_line"] == 36  # 35 + 1
    assert meta_1["end_line"] == 50  # 49 + 1
    assert meta_1["filename"] == "main.py"
    assert meta_1["chunk_id"] == expected_id_1
    assert "last_indexed_utc" in meta_1


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_match")
def test_index_file_collection_exists_ef_match(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
    """Test successful indexing when collection exists and EF matches."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "src" / "main.py"
    file_to_index.write_text("line 1\nline 2")  # Simple content

    result = index_file(file_to_index, temp_repo)

    assert result is True
    mock_client.get_collection.assert_called_once_with(name=COLLECTION_NAME, embedding_function=mock_ef)
    mock_collection.upsert.assert_called_once()
    mock_client.create_collection.assert_not_called()


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_mismatch")
def test_index_file_collection_exists_ef_mismatch(
    mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
    """Test index_file when get_collection raises EF mismatch ValueError."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
    mock_ef.__class__.__name__ = "ClientSideEmbeddingFunction"  # Used in error message construction

    # Correct ChromaDB error format: "Embedding function name mismatch: PassedEFName != CollectionEFName"
    error_message_from_chroma = (
        "Embedding function name mismatch: ClientSideEmbeddingFunction != CollectionSideEFNameFromError"
    )
    mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)

    file_to_index = temp_repo / "src" / "main.py"
    file_to_index.write_text("some content")

    # Make sure logging is properly configured to capture everything
    with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
        result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_client.get_collection.assert_called_once_with(name=COLLECTION_NAME, embedding_function=mock_ef)
    mock_collection.upsert.assert_not_called()
    mock_client.create_collection.assert_not_called()

    # Check either caplog or stderr output for our error message
    assert _error_reported(
        "Failed to get collection 'codebase_v1' for indexing. Mismatch:", caplog, capsys
    ), _NOT_REPORTED


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_mismatch_unparseable")
def test_index_file_collection_exists_ef_mismatch_unparseable_error(
    mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
    """Test EF mismatch when ChromaDB error string is not perfectly parseable by our logic (e.g., no ' != ')."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
    mock_ef.__class__.__name__ = "AnotherClientEF"

    error_message_from_chroma = "Embedding function name mismatch: something_unexpected_format_without_separator"
    mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)

    file_to_index = temp_repo / "src" / "main.py"
    file_to_index.write_text("some content")

    with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
        result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_client.get_collection.assert_called_once_with(name=COLLECTION_NAME, embedding_function=mock_ef)
    mock_collection.upsert.assert_not_called()

    # Check either caplog or stderr output for our error message
    assert _error_reported("resolves to AnotherClientEF", caplog, capsys), _NOT_REPORTED


@patch(
    "chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_mismatch_parsing_fail"
)
def test_index_file_collection_exists_ef_mismatch_parsing_failure(
    mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
    """Test EF mismatch when the error string causes an IndexError during split (e.g. missing base part)."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
    mock_ef.__class__.__name__ = "YetAnotherClientEF"

    # This error will cause `str(e).split("Embedding function name mismatch: ")[1]` to raise IndexError
    # (the message has no trailing space, so the separator never matches).
    error_message_from_chroma = "Embedding function name mismatch:"
    mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)

    file_to_index = temp_repo / "src" / "main.py"
    file_to_index.write_text("some content")

    with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
        result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_client.get_collection.assert_called_once_with(name=COLLECTION_NAME, embedding_function=mock_ef)
    mock_collection.upsert.assert_not_called()

    # Check either caplog or stderr output for our error message
    assert _error_reported("resolves to YetAnotherClientEF", caplog, capsys), _NOT_REPORTED


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_ef_must_be_specified")
def test_index_file_collection_ef_must_be_specified(
    mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
    """Test EF mismatch when error is 'an embedding function must be specified'."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
    mock_ef.__class__.__name__ = "ClientEFForMustSpecifyTest"

    error_message_from_chroma = "an embedding function must be specified for collection codebase_v1"
    mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)

    file_to_index = temp_repo / "src" / "main.py"
    file_to_index.write_text("some content")

    with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
        result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_client.get_collection.assert_called_once_with(name=COLLECTION_NAME, embedding_function=mock_ef)
    mock_collection.upsert.assert_not_called()

    # Check either caplog or stderr output for our error message
    assert _error_reported("resolves to ClientEFForMustSpecifyTest", caplog, capsys), _NOT_REPORTED


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_other_value_err")
def test_index_file_collection_get_other_valueerror(
    mock_get_sha, temp_repo: Path, mock_chroma_client_tuple, caplog, capsys
):
    """Test get_collection raising a ValueError that is NOT an EF mismatch and NOT collection not found."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple

    error_message_from_chroma = "Some other ValueError from ChromaDB client not related to EF or existence"
    mock_client.get_collection.side_effect = ValueError(error_message_from_chroma)

    file_to_index = temp_repo / "src" / "main.py"
    file_to_index.write_text("some content")

    with caplog.at_level(logging.ERROR, logger="chroma_mcp_client.indexing"):
        result = index_file(file_to_index, temp_repo)

    # Only the outcome and the absence of writes are verified; the exact error
    # message is deliberately not checked since it may vary based on logger
    # configuration.
    assert result is False
    mock_client.get_collection.assert_called_once_with(name=COLLECTION_NAME, embedding_function=mock_ef)
    mock_collection.upsert.assert_not_called()
    mock_client.create_collection.assert_not_called()


def test_index_file_non_existent(temp_repo: Path, mock_chroma_client_tuple):
    """Test indexing a non-existent file."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "non_existent.py"

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_collection.upsert.assert_not_called()


def test_index_file_directory(temp_repo: Path, mock_chroma_client_tuple):
    """Test indexing a directory instead of a file."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "src"

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_collection.upsert.assert_not_called()


def test_index_file_unsupported_suffix(temp_repo: Path, mock_chroma_client_tuple):
    """Test indexing a file with an unsupported suffix."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "unsupported.zip"

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_collection.upsert.assert_not_called()


def test_index_file_empty_file(temp_repo: Path, mock_chroma_client_tuple):
    """Test indexing an empty file."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "empty.txt"

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_collection.upsert.assert_not_called()


@patch("pathlib.Path.read_text", side_effect=OSError("Read error"))
def test_index_file_read_error(mock_read, temp_repo: Path, mock_chroma_client_tuple):
    """Test handling of OSError during file read."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    file_to_index = temp_repo / "src" / "main.py"

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_collection.upsert.assert_not_called()


def test_index_file_collection_get_error(temp_repo: Path, mock_chroma_client_tuple):
    """Test handling error when getting the collection."""
    mock_client, mock_collection, _, _ = mock_chroma_client_tuple
    mock_client.get_collection.side_effect = Exception("DB connection failed")
    file_to_index = temp_repo / "src" / "main.py"

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_collection.upsert.assert_not_called()


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_create")
def test_index_file_collection_not_found_and_create_success(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
    """Test successfully creating collection and upserting chunks."""
    mock_client, mock_collection, mock_ef, _ = mock_chroma_client_tuple
    # Simulate get failing then create succeeding
    mock_client.get_collection.side_effect = ValueError("Collection codebase_v1 does not exist.")
    mock_client.create_collection.return_value = mock_collection

    file_to_index = temp_repo / "src" / "main.py"
    # Add content for chunking
    file_content = "\n".join(f"line {i}" for i in range(10))  # Simple case, 1 chunk
    file_to_index.write_text(file_content)
    relative_path = "src/main.py"
    commit_sha = "mock_commit_sha_test_create"

    result = index_file(file_to_index, temp_repo)

    assert result is True
    mock_get_sha.assert_called_once_with(temp_repo)
    mock_client.create_collection.assert_called_once_with(
        name=COLLECTION_NAME, embedding_function=mock_ef, get_or_create=False
    )
    mock_collection.upsert.assert_called_once()

    # Verify upsert args for the single chunk
    upsert_args = mock_collection.upsert.call_args.kwargs
    expected_num_chunks = 1
    assert len(upsert_args["ids"]) == expected_num_chunks
    assert len(upsert_args["metadatas"]) == expected_num_chunks
    assert len(upsert_args["documents"]) == expected_num_chunks

    expected_id_0 = f"{relative_path}:{commit_sha}:0"
    assert upsert_args["ids"][0] == expected_id_0
    assert upsert_args["documents"][0] == file_content
    meta_0 = upsert_args["metadatas"][0]
    assert meta_0["file_path"] == relative_path
    assert meta_0["commit_sha"] == commit_sha
    assert meta_0["chunk_index"] == 0
    assert meta_0["start_line"] == 1
    assert meta_0["end_line"] == 10
    assert meta_0["chunk_id"] == expected_id_0


@patch("chroma_mcp_client.indexing.get_current_commit_sha", return_value="mock_commit_sha_test_upsert_err")
def test_index_file_upsert_error(mock_get_sha, temp_repo: Path, mock_chroma_client_tuple):
    """Test handling error during collection upsert with chunking."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    mock_collection.upsert.side_effect = Exception("Upsert failed")
    file_to_index = temp_repo / "src" / "main.py"
    # Add content for chunking
    file_content = "\n".join(f"line {i}" for i in range(10))  # Simple case, 1 chunk
    file_to_index.write_text(file_content)

    result = index_file(file_to_index, temp_repo)

    assert result is False
    mock_get_sha.assert_called_once_with(temp_repo)
    mock_collection.upsert.assert_called_once()  # Ensure upsert was called
    # No need to check upsert args here, as it failed


# --- Tests for index_git_files ---


@patch("subprocess.run")
@patch("chroma_mcp_client.indexing.index_file", return_value=True)
def test_index_git_files_success(mock_index_file, mock_subprocess_run, temp_repo: Path, mocker):
    """Test successful indexing of files listed by git."""
    # Simulate git ls-files returning two files separated by null
    mock_process = MagicMock()
    mock_process.stdout = "src/main.py\0README.md\0"
    mock_process.stderr = ""
    mock_subprocess_run.return_value = mock_process

    indexed_count = index_git_files(temp_repo)

    assert indexed_count == 2
    mock_subprocess_run.assert_called_once()
    # Check the command called
    cmd_args = mock_subprocess_run.call_args.args[0]
    assert cmd_args == ["git", "-C", str(temp_repo), "ls-files", "-z"]

    # Check that index_file was called for each file
    assert mock_index_file.call_count == 2
    mock_index_file.assert_any_call(
        temp_repo / "src/main.py", temp_repo, COLLECTION_NAME, mocker.ANY
    )  # mocker.ANY for default suffixes
    mock_index_file.assert_any_call(temp_repo / "README.md", temp_repo, COLLECTION_NAME, mocker.ANY)


@patch("subprocess.run", side_effect=FileNotFoundError("git not found"))
@patch("chroma_mcp_client.indexing.index_file")
def test_index_git_files_git_not_found(mock_index_file, mock_subprocess_run, temp_repo: Path):
    """Test handling when git command is not found."""
    indexed_count = index_git_files(temp_repo)

    assert indexed_count == 0
    mock_subprocess_run.assert_called_once()
    mock_index_file.assert_not_called()


@patch("subprocess.run")
@patch("chroma_mcp_client.indexing.index_file")
def test_index_git_files_git_error(mock_index_file, mock_subprocess_run, temp_repo: Path):
    """Test handling errors during git ls-files execution."""
    mock_subprocess_run.side_effect = subprocess.CalledProcessError(
        cmd=["git", "ls-files"], returncode=1, stderr="fatal: not a git repository"
    )

    indexed_count = index_git_files(temp_repo)

    assert indexed_count == 0
    mock_subprocess_run.assert_called_once()
    mock_index_file.assert_not_called()


@patch("subprocess.run")
@patch("chroma_mcp_client.indexing.index_file")
def test_index_git_files_no_files(mock_index_file, mock_subprocess_run, temp_repo: Path):
    """Test handling when git ls-files returns no files."""
    mock_process = MagicMock()
    mock_process.stdout = ""  # Empty output
    mock_process.stderr = ""
    mock_subprocess_run.return_value = mock_process

    indexed_count = index_git_files(temp_repo)

    assert indexed_count == 0
    mock_subprocess_run.assert_called_once()
    mock_index_file.assert_not_called()


# --- Tests for index_paths ---


@patch("os.walk")
@patch("chroma_mcp_client.indexing.index_file", return_value=True)
def test_index_paths_files_and_dirs(mock_index_file, mock_os_walk, temp_repo: Path, mocker):
    """Test indexing a mix of files and directories."""
    # Create some structure within temp_repo for os.walk
    dir1 = temp_repo / "dir1"
    dir1.mkdir()
    (dir1 / "file1.py").write_text("content1")
    (dir1 / "file2.txt").write_text("content2")

    # Mock os.walk to simulate finding these files
    # Top dir, subdirs, files
    mock_os_walk.return_value = [
        (str(dir1), [], ["file1.py", "file2.txt"]),
    ]

    # Paths to index: a relative path to a file and a relative path to a directory
    paths_to_index = {"src/main.py", "dir1"}

    # Need to change CWD for the duration of the call for path resolution
    # as index_paths uses Path.cwd() implicitly via Path(p)
    with _working_directory(temp_repo):
        indexed_count = index_paths(paths_to_index, temp_repo)

    assert indexed_count == 3  # main.py + file1.py + file2.txt

    # Check os.walk was called for the directory
    mock_os_walk.assert_called_once_with(Path("dir1"))

    # Check index_file calls
    assert mock_index_file.call_count == 3
    mock_index_file.assert_any_call(temp_repo / "src/main.py", temp_repo, COLLECTION_NAME, mocker.ANY)
    mock_index_file.assert_any_call(temp_repo / "dir1/file1.py", temp_repo, COLLECTION_NAME, mocker.ANY)
    mock_index_file.assert_any_call(temp_repo / "dir1/file2.txt", temp_repo, COLLECTION_NAME, mocker.ANY)


@patch("os.walk")
@patch("chroma_mcp_client.indexing.index_file", return_value=False)  # Simulate index_file failing
def test_index_paths_index_file_fails(mock_index_file, mock_os_walk, temp_repo: Path):
    """Test that index_paths counts correctly when index_file fails."""
    dir1 = temp_repo / "dir1"
    dir1.mkdir()
    (dir1 / "file1.py").write_text("content1")
    mock_os_walk.return_value = [
        (str(dir1), [], ["file1.py"]),
    ]
    paths_to_index = {"dir1"}

    with _working_directory(temp_repo):
        indexed_count = index_paths(paths_to_index, temp_repo)

    assert indexed_count == 0  # Since index_file returned False
    mock_os_walk.assert_called_once_with(Path("dir1"))
    mock_index_file.assert_called_once()  # It was still called


@patch("os.walk", side_effect=OSError("Walk error"))
@patch("chroma_mcp_client.indexing.index_file")
def test_index_paths_os_walk_error(mock_index_file, mock_os_walk, temp_repo: Path):
    """Test handling errors during os.walk."""
    paths_to_index = {"dir1"}
    # Assume dir1 exists but walk fails
    dir1 = temp_repo / "dir1"
    dir1.mkdir()

    with _working_directory(temp_repo):
        indexed_count = index_paths(paths_to_index, temp_repo)

    assert indexed_count == 0
    mock_os_walk.assert_called_once()
    mock_index_file.assert_not_called()


def test_index_paths_skips_non_file_dir(temp_repo: Path, mock_chroma_client_tuple):
    """Test that non-file/non-dir paths are skipped."""
    _, mock_collection, _, _ = mock_chroma_client_tuple
    paths_to_index = {"non_existent_thing"}

    with _working_directory(temp_repo):
        indexed_count = index_paths(paths_to_index, temp_repo)

    assert indexed_count == 0
    mock_collection.upsert.assert_not_called()  # index_file shouldn't be called

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/djm81/chroma_mcp_server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.