"""Tests for RAG generation quality (Answer Correctness metric).
These tests evaluate whether the MCP client LLM generates factually correct
answers from retrieved context using the nc_semantic_search_answer tool.
Metric: Answer Correctness
- Measures: Is the generated answer factually correct?
- Method: LLM-as-judge - Compare RAG answer vs ground truth (binary true/false)
- Evaluation: External LLM evaluates semantic equivalence
"""
import pytest
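
# Note: the fixtures used below (mcp_sampling_client, evaluation_llm,
# nfcorpus_test_data) are assumed to be provided by the suite's conftest.py.
# Each nfcorpus_test_data item is expected to be a dict with at least
# "query_id", "query_text", and "ground_truth_answer" keys, matching the
# lookups in the tests below.
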
@pytest.mark.integration
async def test_answer_correctness(
    mcp_sampling_client,
    evaluation_llm,
    nfcorpus_test_data,
):
    """Test that the RAG system generates factually correct answers.

    For each test query:
    1. Execute the full RAG pipeline via the nc_semantic_search_answer MCP tool
    2. Extract the generated answer from the RAG response
    3. Use LLM-as-judge to compare it against the ground truth (binary TRUE/FALSE)
    4. Assert the answer is semantically equivalent to the ground truth

    This tests the quality of the generation component (the MCP client LLM).
    """
    results_summary = []

    for test_case in nfcorpus_test_data:
        query = test_case["query_text"]
        ground_truth = test_case["ground_truth_answer"]

        print(f"\n{'=' * 80}")
        print(f"Query: {query}")

        # Execute full RAG pipeline
        print("Executing RAG pipeline...")
        rag_result = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )
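        # The tool result is assumed to be a dict-like payload; "generated_answer"
        # is read here, and "sources" is checked separately in
        # test_answer_contains_sources below.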
        rag_answer = rag_result["generated_answer"]

        print(f"RAG Answer preview: {rag_answer[:200]}...")
        print(f"Ground Truth preview: {ground_truth[:200]}...")

        # LLM-as-judge evaluation
        evaluation_prompt = f"""Compare these two answers and respond with only TRUE or FALSE.
Question: {query}
Generated Answer: {rag_answer}
Ground Truth Answer: {ground_truth}
Are these answers semantically equivalent (do they convey the same factual information)?
Respond with only: TRUE or FALSE"""

        print("Evaluating answer correctness...")
        evaluation_result = await evaluation_llm.generate(
            evaluation_prompt,
            max_tokens=10,
        )
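        # Strict parsing: anything other than an exact TRUE verdict (after
        # stripping and uppercasing) is treated as incorrect.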
        is_correct = evaluation_result.strip().upper() == "TRUE"

        result = {
            "query_id": test_case["query_id"],
            "query": query,
            "rag_answer_length": len(rag_answer),
            "ground_truth_length": len(ground_truth),
            "is_correct": is_correct,
            "evaluation_result": evaluation_result.strip(),
        }
        results_summary.append(result)

        print(f"  Evaluation: {evaluation_result.strip()}")
        print(f"  Status: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")

        # Assert answer correctness
        assert is_correct, (
            f"Answer mismatch for query: {query}\n\n"
            f"Generated Answer:\n{rag_answer}\n\n"
            f"Ground Truth:\n{ground_truth}\n\n"
            f"Evaluation: {evaluation_result.strip()}"
        )

    # Print summary
    print(f"\n{'=' * 80}")
    print("Answer Correctness Summary:")
    print(f"  Total queries: {len(results_summary)}")
    print(f"  Correct: {sum(r['is_correct'] for r in results_summary)}")
    print(f"  Incorrect: {sum(not r['is_correct'] for r in results_summary)}")
    accuracy = sum(r["is_correct"] for r in results_summary) / len(results_summary)
    print(f"  Accuracy: {accuracy:.2%}")
    print(f"{'=' * 80}")


@pytest.mark.integration
async def test_answer_contains_sources(mcp_sampling_client, nfcorpus_test_data):
    """Test that RAG answers include source citations.

    This is a basic quality check: we verify that the nc_semantic_search_answer
    tool returns both a generated answer and source documents.
    """
    for test_case in nfcorpus_test_data:
        query = test_case["query_text"]

        # Execute RAG pipeline
        rag_result = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )

        # Check response structure
        assert "generated_answer" in rag_result, "Response missing 'generated_answer'"
        assert "sources" in rag_result, "Response missing 'sources'"

        # Check sources are provided
        sources = rag_result["sources"]
        assert len(sources) > 0, f"No sources returned for query: {query}"

        # Check each source has required fields
        for i, source in enumerate(sources):
            assert "document_id" in source or "id" in source, (
                f"Source {i} missing document ID"
            )
            assert "excerpt" in source or "content" in source or "text" in source, (
                f"Source {i} missing content"
            )

        print(f"Query: {query}")
        print(f"  Sources provided: {len(sources)}")
        print("  Status: ✓ PASS")