
mcp-nixos

by utensils
test_evals.py (31.9 kB)
"""Basic evaluation tests for MCP-NixOS to validate AI usability.""" from dataclasses import dataclass from unittest.mock import Mock, patch import pytest from mcp_nixos import server def get_tool_function(tool_name: str): """Get the underlying function from a FastMCP tool.""" tool = getattr(server, tool_name) if hasattr(tool, "fn"): return tool.fn return tool # Get the underlying functions for direct use darwin_search = get_tool_function("darwin_search") darwin_info = get_tool_function("darwin_info") darwin_list_options = get_tool_function("darwin_list_options") darwin_options_by_prefix = get_tool_function("darwin_options_by_prefix") home_manager_search = get_tool_function("home_manager_search") home_manager_info = get_tool_function("home_manager_info") home_manager_list_options = get_tool_function("home_manager_list_options") home_manager_options_by_prefix = get_tool_function("home_manager_options_by_prefix") nixos_info = get_tool_function("nixos_info") nixos_search = get_tool_function("nixos_search") nixos_stats = get_tool_function("nixos_stats") # Removed duplicate classes - kept the more comprehensive versions below class TestErrorHandlingEvals: """Evaluations for error scenarios.""" @pytest.fixture(autouse=True) def mock_channel_validation(self): """Mock channel validation to always pass for 'unstable'.""" with patch("mcp_nixos.server.channel_cache") as mock_cache: mock_cache.get_available.return_value = {"unstable": "latest-45-nixos-unstable"} mock_cache.get_resolved.return_value = {"unstable": "latest-45-nixos-unstable"} with patch("mcp_nixos.server.validate_channel") as mock_validate: mock_validate.return_value = True yield mock_cache @pytest.mark.asyncio async def test_invalid_channel_error(self): """User specifies invalid channel - should get clear error.""" result = await nixos_search("firefox", channel="invalid-channel") # Should get a clear error message assert "Error (ERROR): Invalid channel 'invalid-channel'" in result @patch("mcp_nixos.server.requests.post") @pytest.mark.asyncio async def test_package_not_found(self, mock_post): """User searches for non-existent package.""" mock_response = Mock() mock_response.json.return_value = {"hits": {"hits": []}} mock_response.raise_for_status = Mock() mock_post.return_value = mock_response result = await nixos_info("nonexistentpackage", type="package") # Should get informative not found error assert "Error (NOT_FOUND): Package 'nonexistentpackage' not found" in result class TestCompleteScenarioEval: """End-to-end scenario evaluation.""" @pytest.fixture(autouse=True) def mock_channel_validation(self): """Mock channel validation to always pass for 'unstable'.""" with patch("mcp_nixos.server.channel_cache") as mock_cache: mock_cache.get_available.return_value = {"unstable": "latest-45-nixos-unstable"} mock_cache.get_resolved.return_value = {"unstable": "latest-45-nixos-unstable"} with patch("mcp_nixos.server.validate_channel") as mock_validate: mock_validate.return_value = True yield mock_cache @patch("mcp_nixos.server.requests.post") @patch("mcp_nixos.server.requests.get") @pytest.mark.asyncio async def test_complete_firefox_installation_flow(self, mock_get, mock_post): """Complete flow: user wants Firefox with specific Home Manager config.""" # Step 1: Search for Firefox package search_resp = Mock() search_resp.json.return_value = { "hits": { "hits": [ { "_source": { "package_pname": "firefox", "package_pversion": "121.0", "package_description": "A web browser built from Firefox source tree", } } ] } } search_resp.raise_for_status = 
Mock() # Step 2: Get package details info_resp = Mock() info_resp.json.return_value = { "hits": { "hits": [ { "_source": { "package_pname": "firefox", "package_pversion": "121.0", "package_description": "A web browser built from Firefox source tree", "package_homepage": ["https://www.mozilla.org/firefox/"], "package_license_set": ["MPL-2.0"], } } ] } } info_resp.raise_for_status = Mock() # Step 3: Search Home Manager options hm_resp = Mock() hm_resp.text = """ <html> <dt>programs.firefox.enable</dt> <dd> <p>Whether to enable Firefox.</p> <span class="term">Type: boolean</span> </dd> </html> """ hm_resp.raise_for_status = Mock() mock_post.side_effect = [search_resp, info_resp] mock_get.return_value = hm_resp # Execute the flow # 1. Search for Firefox result1 = await nixos_search("firefox") assert "Found 1 packages matching 'firefox':" in result1 assert "• firefox (121.0)" in result1 # 2. Get detailed info result2 = await nixos_info("firefox") assert "Package: firefox" in result2 assert "Homepage: https://www.mozilla.org/firefox/" in result2 # 3. Check Home Manager options result3 = await home_manager_search("firefox") assert "• programs.firefox.enable" in result3 # AI should now have all info needed to guide user through installation # ===== Content from test_evals_comprehensive.py ===== @dataclass class EvalScenario: """Represents an evaluation scenario.""" name: str user_query: str expected_tool_calls: list[str] success_criteria: list[str] description: str = "" @dataclass class EvalResult: """Result of running an evaluation.""" scenario: EvalScenario passed: bool score: float # 0.0 to 1.0 tool_calls_made: list[tuple[str, dict, str]] # (tool_name, args, result) criteria_met: dict[str, bool] reasoning: str class MockAIAssistant: """Simulates an AI assistant using the MCP tools.""" def __init__(self): self.tool_calls = [] async def process_query(self, query: str) -> list[tuple[str, dict, str]]: """Process a user query and return tool calls made.""" self.tool_calls = [] # Simulate AI decision making based on query if ("install" in query.lower() or "get" in query.lower()) and any( pkg in query.lower() for pkg in ["vscode", "firefox", "git"] ): await self._handle_package_installation(query) elif ("configure" in query.lower() or "set up" in query.lower()) and "nginx" in query.lower(): await self._handle_service_configuration(query) elif ( "home manager" in query.lower() or "should i configure" in query.lower() or ("manage" in query.lower() and "home manager" in query.lower()) ): await self._handle_home_manager_query(query) elif "dock" in query.lower() and ("darwin" in query.lower() or "macos" in query.lower()): await self._handle_darwin_query(query) elif "difference between" in query.lower(): await self._handle_comparison_query(query) return self.tool_calls async def _make_tool_call(self, tool_name: str, **kwargs) -> str: """Make a tool call and record it.""" # Map tool names to actual functions tools = { "nixos_search": nixos_search, "nixos_info": nixos_info, "nixos_stats": nixos_stats, "home_manager_search": home_manager_search, "home_manager_info": home_manager_info, "home_manager_list_options": home_manager_list_options, "home_manager_options_by_prefix": home_manager_options_by_prefix, "darwin_search": darwin_search, "darwin_info": darwin_info, "darwin_list_options": darwin_list_options, "darwin_options_by_prefix": darwin_options_by_prefix, } if tool_name in tools: result = await tools[tool_name](**kwargs) self.tool_calls.append((tool_name, kwargs, result)) return result return "" async def 
_handle_package_installation(self, query: str): """Handle package installation queries.""" # Extract package name package = None if "vscode" in query.lower(): package = "vscode" elif "firefox" in query.lower(): package = "firefox" elif "git" in query.lower(): package = "git" if package: # Search for the package await self._make_tool_call("nixos_search", query=package, search_type="packages") # If it's a command, also search programs if package == "git": await self._make_tool_call("nixos_search", query=package, search_type="programs") # Get detailed info await self._make_tool_call("nixos_info", name=package, type="package") async def _handle_service_configuration(self, query: str): """Handle service configuration queries.""" if "nginx" in query.lower(): # Search for nginx options await self._make_tool_call("nixos_search", query="services.nginx", search_type="options") # Get specific option info await self._make_tool_call("nixos_info", name="services.nginx.enable", type="option") await self._make_tool_call("nixos_info", name="services.nginx.virtualHosts", type="option") async def _handle_home_manager_query(self, query: str): """Handle Home Manager related queries.""" if "git" in query.lower(): # Search both system and user options await self._make_tool_call("nixos_search", query="git", search_type="packages") await self._make_tool_call("home_manager_search", query="programs.git") await self._make_tool_call("home_manager_info", name="programs.git.enable") elif "shell" in query.lower(): # Handle shell configuration queries await self._make_tool_call("home_manager_search", query="programs.zsh") await self._make_tool_call("home_manager_info", name="programs.zsh.enable") await self._make_tool_call("home_manager_options_by_prefix", option_prefix="programs.zsh") async def _handle_darwin_query(self, query: str): """Handle Darwin/macOS queries.""" if "dock" in query.lower(): await self._make_tool_call("darwin_search", query="system.defaults.dock") await self._make_tool_call("darwin_info", name="system.defaults.dock.autohide") await self._make_tool_call("darwin_options_by_prefix", option_prefix="system.defaults.dock") async def _handle_comparison_query(self, query: str): """Handle package comparison queries.""" if "firefox" in query.lower(): await self._make_tool_call("nixos_search", query="firefox", search_type="packages") await self._make_tool_call("nixos_info", name="firefox", type="package") await self._make_tool_call("nixos_info", name="firefox-esr", type="package") class EvalFramework: """Framework for running and scoring evaluations.""" def __init__(self): self.assistant = MockAIAssistant() async def run_eval(self, scenario: EvalScenario) -> EvalResult: """Run a single evaluation scenario.""" # Have the assistant process the query tool_calls = await self.assistant.process_query(scenario.user_query) # Check which criteria were met criteria_met = self._check_criteria(scenario, tool_calls) # Calculate score score = sum(1 for met in criteria_met.values() if met) / len(criteria_met) passed = score >= 0.7 # 70% threshold # Generate reasoning reasoning = self._generate_reasoning(scenario, tool_calls, criteria_met) return EvalResult( scenario=scenario, passed=passed, score=score, tool_calls_made=tool_calls, criteria_met=criteria_met, reasoning=reasoning, ) def _check_criteria(self, scenario: EvalScenario, tool_calls: list[tuple[str, dict, str]]) -> dict[str, bool]: """Check which success criteria were met.""" criteria_met = {} # Check expected tool calls expected_tools = set() for expected_call in 
scenario.expected_tool_calls: # Parse expected call (handle "await" prefix) if expected_call.startswith("await "): tool_name = expected_call[6:].split("(")[0] # Skip "await " else: tool_name = expected_call.split("(")[0] expected_tools.add(tool_name) actual_tools = {call[0] for call in tool_calls} criteria_met["made_expected_tool_calls"] = expected_tools.issubset(actual_tools) # Check specific criteria based on scenario all_results = "\n".join(call[2] for call in tool_calls) for criterion in scenario.success_criteria: if "finds" in criterion and "package" in criterion: # Check if package was found criteria_met[criterion] = any("Found" in call[2] and "packages" in call[2] for call in tool_calls) elif "mentions" in criterion: # Check if certain text is mentioned key_term = criterion.split("mentions")[1].strip() criteria_met[criterion] = key_term.lower() in all_results.lower() elif "provides" in criterion: # Check if examples/syntax provided criteria_met[criterion] = bool(tool_calls) and len(all_results) > 100 elif "explains" in criterion: # Check if explanation provided (has meaningful content) criteria_met[criterion] = len(all_results) > 200 else: # Default: assume met if we have results criteria_met[criterion] = bool(tool_calls) return criteria_met def _generate_reasoning( self, scenario: EvalScenario, tool_calls: list[tuple[str, dict, str]], criteria_met: dict[str, bool] ) -> str: """Generate reasoning about the evaluation result.""" parts = [] # Tool usage if tool_calls: parts.append(f"Made {len(tool_calls)} tool calls") else: parts.append("No tool calls made") # Criteria summary met_count = sum(1 for met in criteria_met.values() if met) parts.append(f"Met {met_count}/{len(criteria_met)} criteria") # Specific issues for criterion, met in criteria_met.items(): if not met: parts.append(f"Failed: {criterion}") return "; ".join(parts) class TestPackageDiscoveryEvals: """Evaluations for package discovery scenarios.""" def setup_method(self): self.framework = EvalFramework() @patch("mcp_nixos.server.es_query") @pytest.mark.asyncio async def test_eval_find_vscode_package(self, mock_query): """Eval: User wants to install VSCode.""" # Mock responses mock_query.return_value = [ { "_source": { "package_pname": "vscode", "package_pversion": "1.85.0", "package_description": "Open source code editor by Microsoft", } } ] scenario = EvalScenario( name="find_vscode", user_query="I want to install VSCode on NixOS", expected_tool_calls=[ "await nixos_search(query='vscode', search_type='packages')", "await nixos_info(name='vscode', type='package')", ], success_criteria=["finds vscode package", "mentions configuration.nix", "provides installation syntax"], ) result = await self.framework.run_eval(scenario) # Verify evaluation assert result.passed assert result.score >= 0.7 assert len(result.tool_calls_made) >= 2 assert any("vscode" in str(call) for call in result.tool_calls_made) @patch("mcp_nixos.server.es_query") @pytest.mark.asyncio async def test_eval_find_git_command(self, mock_query): """Eval: User wants git command.""" # Mock different responses for different queries def query_side_effect(*args, **kwargs): query = args[1] if "program" in str(query): return [{"_source": {"package_programs": ["git"], "package_pname": "git"}}] return [ { "_source": { "package_pname": "git", "package_pversion": "2.43.0", "package_description": "Distributed version control system", } } ] mock_query.side_effect = query_side_effect scenario = EvalScenario( name="find_git_command", user_query="How do I get the 'git' command 
on NixOS?", expected_tool_calls=[ "await nixos_search(query='git', search_type='programs')", "await nixos_info(name='git', type='package')", ], success_criteria=[ "identifies git package", "explains system vs user installation", "shows both environment.systemPackages and Home Manager options", ], ) result = await self.framework.run_eval(scenario) assert result.passed assert any("programs" in str(call[1]) for call in result.tool_calls_made) @patch("mcp_nixos.server.es_query") @pytest.mark.asyncio async def test_eval_package_comparison(self, mock_query): """Eval: User needs to compare packages.""" # Mock responses for firefox variants def query_side_effect(*args, **kwargs): return [ { "_source": { "package": { "pname": "firefox", "version": "120.0", "description": "Mozilla Firefox web browser", } } } ] mock_query.side_effect = query_side_effect scenario = EvalScenario( name="compare_firefox_variants", user_query="What's the difference between firefox and firefox-esr?", expected_tool_calls=[ "await nixos_search(query='firefox', search_type='packages')", "await nixos_info(name='firefox', type='package')", "await nixos_info(name='firefox-esr', type='package')", ], success_criteria=[ "explains ESR vs regular versions", "mentions stability vs features trade-off", "provides configuration examples for both", ], ) result = await self.framework.run_eval(scenario) # Check that comparison tools were called assert len(result.tool_calls_made) >= 2 assert any("firefox-esr" in str(call) for call in result.tool_calls_made) class TestServiceConfigurationEvals: """Evaluations for service configuration scenarios.""" def setup_method(self): self.framework = EvalFramework() @patch("mcp_nixos.server.es_query") @pytest.mark.asyncio async def test_eval_nginx_setup(self, mock_query): """Eval: User wants to set up nginx.""" mock_query.return_value = [ { "_source": { "option_name": "services.nginx.enable", "option_type": "boolean", "option_description": "Whether to enable nginx web server", } } ] scenario = EvalScenario( name="nginx_setup", user_query="How do I set up nginx on NixOS to serve static files?", expected_tool_calls=[ "await nixos_search(query='services.nginx', search_type='options')", "await nixos_info(name='services.nginx.enable', type='option')", "await nixos_info(name='services.nginx.virtualHosts', type='option')", ], success_criteria=[ "enables nginx service", "configures virtual host", "explains directory structure", "mentions firewall configuration", "provides complete configuration.nix example", ], ) result = await self.framework.run_eval(scenario) assert len(result.tool_calls_made) >= 2 assert any("nginx" in call[2] for call in result.tool_calls_made) @patch("mcp_nixos.server.es_query") @pytest.mark.asyncio async def test_eval_database_setup(self, mock_query): """Eval: User wants PostgreSQL setup.""" mock_query.return_value = [ { "_source": { "option": { "option_name": "services.postgresql.enable", "option_type": "boolean", "option_description": "Whether to enable PostgreSQL", } } } ] scenario = EvalScenario( name="postgresql_setup", user_query="Set up PostgreSQL with a database for my app", expected_tool_calls=[ "await nixos_search(query='services.postgresql', search_type='options')", "await nixos_info(name='services.postgresql.enable', type='option')", "await nixos_info(name='services.postgresql.ensureDatabases', type='option')", "await nixos_info(name='services.postgresql.ensureUsers', type='option')", ], success_criteria=[ "enables postgresql service", "creates database", "sets up user with 
permissions", "explains connection details", "mentions backup considerations", ], ) # This scenario would need more complex mocking in real implementation # For now, just verify the structure works result = await self.framework.run_eval(scenario) assert isinstance(result, EvalResult) class TestHomeManagerIntegrationEvals: """Evaluations for Home Manager vs system configuration.""" def setup_method(self): self.framework = EvalFramework() @patch("mcp_nixos.server.es_query") @patch("mcp_nixos.server.parse_html_options") @pytest.mark.asyncio async def test_eval_user_vs_system_config(self, mock_parse, mock_query): """Eval: User confused about where to configure git.""" # Mock system package mock_query.return_value = [ { "_source": { "package": { "pname": "git", "version": "2.43.0", "description": "Distributed version control system", } } } ] # Mock Home Manager options mock_parse.return_value = [ {"name": "programs.git.enable", "type": "boolean", "description": "Enable git"}, {"name": "programs.git.userName", "type": "string", "description": "Git user name"}, ] scenario = EvalScenario( name="git_config_location", user_query="Should I configure git in NixOS or Home Manager?", expected_tool_calls=[ "await nixos_search(query='git', search_type='packages')", "await home_manager_search(query='programs.git')", "await home_manager_info(name='programs.git.enable')", ], success_criteria=[ "explains system vs user configuration", "recommends Home Manager for user configs", "shows both approaches", "explains when to use each", ], ) result = await self.framework.run_eval(scenario) assert len(result.tool_calls_made) >= 3 assert any("home_manager" in call[0] for call in result.tool_calls_made) @patch("mcp_nixos.server.parse_html_options") @pytest.mark.asyncio async def test_eval_dotfiles_management(self, mock_parse): """Eval: User wants to manage shell config.""" mock_parse.return_value = [ {"name": "programs.zsh.enable", "type": "boolean", "description": "Enable zsh"}, {"name": "programs.zsh.oh-my-zsh.enable", "type": "boolean", "description": "Enable Oh My Zsh"}, ] scenario = EvalScenario( name="shell_config", user_query="How do I manage my shell configuration with Home Manager?", expected_tool_calls=[ "await home_manager_search(query='programs.zsh')", "await home_manager_info(name='programs.zsh.enable')", "await home_manager_options_by_prefix(option_prefix='programs.zsh')", ], success_criteria=[ "enables shell program", "explains configuration options", "mentions aliases and plugins", "provides working example", ], ) result = await self.framework.run_eval(scenario) assert any("zsh" in str(call) for call in result.tool_calls_made) class TestDarwinPlatformEvals: """Evaluations for macOS/nix-darwin scenarios.""" def setup_method(self): self.framework = EvalFramework() @patch("mcp_nixos.server.parse_html_options") @pytest.mark.asyncio async def test_eval_macos_dock_settings(self, mock_parse): """Eval: User wants to configure macOS dock.""" mock_parse.return_value = [ {"name": "system.defaults.dock.autohide", "type": "boolean", "description": "Auto-hide dock"}, {"name": "system.defaults.dock.tilesize", "type": "integer", "description": "Dock icon size"}, ] scenario = EvalScenario( name="macos_dock_config", user_query="How do I configure dock settings with nix-darwin?", expected_tool_calls=[ "await darwin_search(query='system.defaults.dock')", "await darwin_info(name='system.defaults.dock.autohide')", "await darwin_options_by_prefix(option_prefix='system.defaults.dock')", ], success_criteria=[ "finds dock 
configuration options", "explains autohide and other settings", "provides darwin-configuration.nix example", "mentions darwin-rebuild command", ], ) result = await self.framework.run_eval(scenario) assert len(result.tool_calls_made) >= 2 assert any("darwin" in call[0] for call in result.tool_calls_made) assert any("dock" in str(call) for call in result.tool_calls_made) class TestEvalReporting: """Test evaluation reporting functionality.""" @pytest.mark.asyncio async def test_eval_result_generation(self): """Test that eval results are properly generated.""" scenario = EvalScenario( name="test_scenario", user_query="Test query", expected_tool_calls=["await nixos_search(query='test')"], success_criteria=["finds test package"], ) result = EvalResult( scenario=scenario, passed=True, score=1.0, tool_calls_made=[ ("nixos_search", {"query": "test"}, "Found 1 packages matching 'test':\n\n• test (1.0.0)") ], criteria_met={"finds test package": True}, reasoning="Made 1 tool calls; Met 1/1 criteria", ) assert result.passed assert result.score == 1.0 assert len(result.tool_calls_made) == 1 def test_eval_scoring(self): """Test evaluation scoring logic.""" # Create a scenario with multiple criteria EvalScenario( name="multi_criteria", user_query="Test with multiple criteria", expected_tool_calls=[], success_criteria=["criterion1", "criterion2", "criterion3"], ) # Test partial success criteria_met = {"criterion1": True, "criterion2": True, "criterion3": False} score = sum(1 for met in criteria_met.values() if met) / len(criteria_met) assert score == pytest.approx(0.666, rel=0.01) assert score < 0.7 # Below passing threshold def generate_eval_report(self, results: list[EvalResult]) -> str: """Generate a report from evaluation results.""" total = len(results) passed = sum(1 for r in results if r.passed) avg_score = sum(r.score for r in results) / total if total > 0 else 0 report = f"""# MCP-NixOS Evaluation Report ## Summary - Total Evaluations: {total} - Passed: {passed} ({passed / total * 100:.1f}%) - Average Score: {avg_score:.2f} ## Detailed Results """ for result in results: status = "✅ PASS" if result.passed else "❌ FAIL" report += f"\n### {status} {result.scenario.name} (Score: {result.score:.2f})\n" report += f"Query: {result.scenario.user_query}\n" report += f"Reasoning: {result.reasoning}\n" return report class TestCompleteEvalSuite: """Run complete evaluation suite.""" @pytest.mark.integration @pytest.mark.asyncio async def test_run_all_evals(self): """Run all evaluation scenarios and generate report.""" # This would run all eval scenarios and generate a comprehensive report # For brevity, just verify the structure exists all_scenarios = [ EvalScenario( name="basic_package_install", user_query="How do I install Firefox?", expected_tool_calls=["await nixos_search(query='firefox')"], success_criteria=["finds firefox package"], ), EvalScenario( name="service_config", user_query="Configure nginx web server", expected_tool_calls=["await nixos_search(query='nginx', search_type='options')"], success_criteria=["finds nginx options"], ), EvalScenario( name="home_manager_usage", user_query="Should I use Home Manager for git config?", expected_tool_calls=["await home_manager_search(query='git')"], success_criteria=["recommends Home Manager"], ), ] assert len(all_scenarios) >= 3 assert all(isinstance(s, EvalScenario) for s in all_scenarios) if __name__ == "__main__": pytest.main([__file__, "-v"])

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/utensils/mcp-nixos'
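
The same endpoint can be consumed programmatically. A minimal Python sketch, assuming the endpoint is publicly readable as the curl example suggests; the response schema is not documented here, so the JSON is simply printed as returned.

import json

import requests

# Fetch the directory entry for utensils/mcp-nixos and pretty-print the raw JSON.
resp = requests.get("https://glama.ai/api/mcp/v1/servers/utensils/mcp-nixos", timeout=30)
resp.raise_for_status()
print(json.dumps(resp.json(), indent=2))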
