"""
Performance and load testing for MCP Wikipedia Server.
This module contains performance benchmarks and load tests to ensure
the server can handle production workloads efficiently.
"""
import asyncio
import time
import statistics
from typing import Any, Dict, List, Tuple
import sys
import os

# Add src to the import path so the server can be imported when this script is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
try:
    from mcp_server.mcp_server import WikipediaServer
except ImportError:
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src', 'mcp_server'))
    from mcp_server import WikipediaServer

class PerformanceTester:
    """Performance testing utilities for the Wikipedia server."""

    def __init__(self):
        self.server = WikipediaServer()
        self.results: List[Dict[str, Any]] = []

    async def measure_response_time(self, func, *args, **kwargs) -> Dict[str, Any]:
        """Measure the response time of a single tool call.

        Returns a dict with a success flag, the elapsed time in seconds,
        and the error message (None when the call succeeded).
        """
        # perf_counter() is a monotonic clock, better suited to elapsed-time
        # measurement than time.time().
        start_time = time.perf_counter()
        try:
            result = await func(*args, **kwargs)
            response_time = time.perf_counter() - start_time
            return {
                "success": result.get("success", False),
                "response_time": response_time,
                "error": result.get("error") if not result.get("success") else None,
            }
        except Exception as e:
            response_time = time.perf_counter() - start_time
            return {
                "success": False,
                "response_time": response_time,
                "error": str(e),
            }
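
    # Illustrative shape of the dict returned above (the values are made-up
    # examples, not real measurements):
    #
    #     {"success": True, "response_time": 0.42, "error": None}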

    async def benchmark_search_tool(self, queries: List[str], iterations: int = 3) -> Dict[str, Any]:
        """Benchmark the fetch_wikipedia_info tool."""
        print(f"\n🔍 Benchmarking fetch_wikipedia_info with {len(queries)} queries, {iterations} iterations each...")
        all_times = []
        success_count = 0
        total_requests = len(queries) * iterations
        for iteration in range(iterations):
            print(f"  Iteration {iteration + 1}/{iterations}")
            for i, query in enumerate(queries):
                result = await self.measure_response_time(
                    self.server.fetch_wikipedia_info, query
                )
                all_times.append(result["response_time"])
                if result["success"]:
                    success_count += 1
                print(f"    Query {i+1}: {result['response_time']:.3f}s {'✅' if result['success'] else '❌'}")
        return {
            "tool": "fetch_wikipedia_info",
            "total_requests": total_requests,
            "successful_requests": success_count,
            "success_rate": success_count / total_requests,
            "response_times": {
                "min": min(all_times),
                "max": max(all_times),
                "mean": statistics.mean(all_times),
                "median": statistics.median(all_times),
                "stdev": statistics.stdev(all_times) if len(all_times) > 1 else 0,
            },
        }

    async def benchmark_sections_tool(self, topics: List[str], iterations: int = 3) -> Dict[str, Any]:
        """Benchmark the list_wikipedia_sections tool."""
        print(f"\n📋 Benchmarking list_wikipedia_sections with {len(topics)} topics, {iterations} iterations each...")
        all_times = []
        success_count = 0
        total_requests = len(topics) * iterations
        for iteration in range(iterations):
            print(f"  Iteration {iteration + 1}/{iterations}")
            for i, topic in enumerate(topics):
                result = await self.measure_response_time(
                    self.server.list_wikipedia_sections, topic
                )
                all_times.append(result["response_time"])
                if result["success"]:
                    success_count += 1
                print(f"    Topic {i+1}: {result['response_time']:.3f}s {'✅' if result['success'] else '❌'}")
        return {
            "tool": "list_wikipedia_sections",
            "total_requests": total_requests,
            "successful_requests": success_count,
            "success_rate": success_count / total_requests,
            "response_times": {
                "min": min(all_times),
                "max": max(all_times),
                "mean": statistics.mean(all_times),
                "median": statistics.median(all_times),
                "stdev": statistics.stdev(all_times) if len(all_times) > 1 else 0,
            },
        }

    async def benchmark_content_tool(self, topic_sections: List[Tuple[str, str]], iterations: int = 2) -> Dict[str, Any]:
        """Benchmark the get_section_content tool."""
        print(f"\n📄 Benchmarking get_section_content with {len(topic_sections)} topic-section pairs, {iterations} iterations each...")
        all_times = []
        success_count = 0
        total_requests = len(topic_sections) * iterations
        for iteration in range(iterations):
            print(f"  Iteration {iteration + 1}/{iterations}")
            for i, (topic, section) in enumerate(topic_sections):
                result = await self.measure_response_time(
                    self.server.get_section_content, topic, section
                )
                all_times.append(result["response_time"])
                if result["success"]:
                    success_count += 1
                print(f"    Pair {i+1}: {result['response_time']:.3f}s {'✅' if result['success'] else '❌'}")
        return {
            "tool": "get_section_content",
            "total_requests": total_requests,
            "successful_requests": success_count,
            "success_rate": success_count / total_requests,
            "response_times": {
                "min": min(all_times),
                "max": max(all_times),
                "mean": statistics.mean(all_times),
                "median": statistics.median(all_times),
                "stdev": statistics.stdev(all_times) if len(all_times) > 1 else 0,
            },
        }

    async def test_concurrent_load(self, query: str, concurrent_requests: int = 10) -> Dict[str, Any]:
        """Test server performance under concurrent load."""
        print(f"\n⚡ Testing concurrent load: {concurrent_requests} simultaneous requests...")
        start_time = time.perf_counter()
        # Create one task per request; vary the query slightly so the requests are not identical.
        tasks = [
            self.measure_response_time(self.server.fetch_wikipedia_info, f"{query} {i}")
            for i in range(concurrent_requests)
        ]
        # Execute all tasks concurrently.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        total_time = time.perf_counter() - start_time
        # Separate successes from failures and collect timings.
        successful_results = []
        failed_results = []
        response_times = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                failed_results.append(f"Request {i}: {str(result)}")
            elif result.get("success"):
                successful_results.append(result)
                response_times.append(result["response_time"])
            else:
                failed_results.append(f"Request {i}: {result.get('error', 'Unknown error')}")
        success_rate = len(successful_results) / concurrent_requests
        print(f"  Total time: {total_time:.3f}s")
        print(f"  Successful requests: {len(successful_results)}/{concurrent_requests}")
        print(f"  Success rate: {success_rate:.1%}")
        return {
            "test": "concurrent_load",
            "concurrent_requests": concurrent_requests,
            "total_time": total_time,
            "successful_requests": len(successful_results),
            "failed_requests": len(failed_results),
            "success_rate": success_rate,
            "response_times": {
                "min": min(response_times) if response_times else 0,
                "max": max(response_times) if response_times else 0,
                "mean": statistics.mean(response_times) if response_times else 0,
                "median": statistics.median(response_times) if response_times else 0,
            },
            "failed_results": failed_results,
        }
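
    # Aggregate throughput for the concurrent run can be derived from the returned
    # dict if needed (it is not computed above), e.g.:
    #
    #     result["successful_requests"] / result["total_time"]  # requests per second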

    def print_benchmark_summary(self, results: List[Dict[str, Any]]):
        """Print a summary of benchmark results."""
        print("\n" + "=" * 60)
        print("📊 PERFORMANCE BENCHMARK SUMMARY")
        print("=" * 60)
        for result in results:
            if result.get("test") == "concurrent_load":
                print("\n⚡ Concurrent Load Test:")
                print(f"  Requests: {result['concurrent_requests']}")
                print(f"  Total Time: {result['total_time']:.3f}s")
                print(f"  Success Rate: {result['success_rate']:.1%}")
                print(f"  Avg Response Time: {result['response_times']['mean']:.3f}s")
            else:
                tool_name = result.get("tool", "Unknown")
                print(f"\n🛠️ {tool_name}:")
                print(f"  Total Requests: {result['total_requests']}")
                print(f"  Success Rate: {result['success_rate']:.1%}")
                times = result['response_times']
                print("  Response Times:")
                print(f"    Min: {times['min']:.3f}s")
                print(f"    Max: {times['max']:.3f}s")
                print(f"    Mean: {times['mean']:.3f}s")
                print(f"    Median: {times['median']:.3f}s")
                print(f"    Std Dev: {times['stdev']:.3f}s")

async def run_performance_benchmarks():
    """Run comprehensive performance benchmarks."""
    print("🚀 Starting MCP Wikipedia Server Performance Benchmarks")
    print("=" * 60)
    tester = PerformanceTester()
    results = []

    # Test queries for different complexity levels
    simple_queries = [
        "Python",
        "Java",
        "JavaScript",
    ]
    medium_queries = [
        "Machine Learning",
        "Artificial Intelligence",
        "Data Science",
    ]
    complex_queries = [
        "Quantum Computing Applications",
        "Climate Change Mitigation",
    ]
    # Common topics for section tests
    topics = [
        "Python (programming language)",
        "Machine Learning",
        "Artificial Intelligence",
    ]
    # Topic-section pairs for content tests
    topic_sections = [
        ("Python (programming language)", "History"),
        ("Machine Learning", "Applications"),
        ("Artificial Intelligence", "Ethics"),
    ]

    try:
        # Benchmark 1: simple queries
        result = await tester.benchmark_search_tool(simple_queries)
        results.append(result)
        # Benchmark 2: medium-complexity queries
        result = await tester.benchmark_search_tool(medium_queries)
        results.append(result)
        # Benchmark 3: complex queries
        result = await tester.benchmark_search_tool(complex_queries, iterations=1)
        results.append(result)
        # Benchmark 4: section listing
        result = await tester.benchmark_sections_tool(topics)
        results.append(result)
        # Benchmark 5: section content
        result = await tester.benchmark_content_tool(topic_sections)
        results.append(result)
        # Benchmark 6: concurrent load test
        result = await tester.test_concurrent_load("Machine Learning", concurrent_requests=5)
        results.append(result)

        # Print summary
        tester.print_benchmark_summary(results)
        print("\n✅ Performance benchmarks completed successfully!")

        # Performance guidelines check
        print("\n📋 Performance Guidelines Check:")
        for result in results:
            if result.get("tool"):
                avg_time = result['response_times']['mean']
                success_rate = result['success_rate']
                time_ok = "✅" if avg_time < 3.0 else "⚠️"
                success_ok = "✅" if success_rate > 0.8 else "⚠️"
                print(f"  {result['tool']}: {time_ok} Avg Time: {avg_time:.3f}s, {success_ok} Success: {success_rate:.1%}")
    except Exception as e:
        print(f"\n❌ Benchmark failed with error: {e}")
        raise

if __name__ == "__main__":
    # Run the benchmarks when the script is executed directly.
    asyncio.run(run_performance_benchmarks())
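
# Optional: to exercise a small benchmark under pytest, a thin async wrapper could be
# added along these lines. This is only a sketch and assumes pytest plus the
# pytest-asyncio plugin are installed; neither is required by the script above.
#
#     import pytest
#
#     @pytest.mark.asyncio
#     async def test_search_tool_benchmark_smoke():
#         tester = PerformanceTester()
#         result = await tester.benchmark_search_tool(["Python"], iterations=1)
#         assert result["total_requests"] == 1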