Model Control Plane (MCP) Server

MCP
scripts
#!/usr/bin/env python3
"""
Check Kubernetes Metrics Integration

This script verifies that Prometheus can access the dummy metrics
and that the data is being correctly scraped and stored.
"""

import os
import sys
import json
import logging
import argparse
import requests
import time
from datetime import datetime

# Defaults used when the matching command-line flags are not supplied
METRICS_PORT = 9092
PROMETHEUS_URL = "http://localhost:9090"

# Emit INFO and above, prefixed with timestamp, logger name and level
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('check-k8s-metrics')

def check_metrics_endpoint(port):
    """Check that the local metrics endpoint responds and count its samples.

    Args:
        port: Port on localhost where ``/metrics`` is served.

    Returns:
        tuple: ``(True, sample_count)`` when the endpoint answers with
        HTTP 200, otherwise ``(False, 0)``. ``sample_count`` is the number
        of sample lines in the response body; blank lines and ``#`` comment
        lines (for example ``# HELP`` / ``# TYPE``) are not counted.
    """
    url = f"http://localhost:{port}/metrics"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            logger.error(f"❌ Metrics endpoint returned status code {response.status_code}")
            return False, 0

        # Count only real sample lines. Counting every line of the body
        # would also count comment lines and the trailing empty line,
        # inflating the number reported as "metrics available".
        metrics_count = sum(
            1
            for line in response.text.splitlines()
            if line.strip() and not line.lstrip().startswith('#')
        )
        logger.info(f"✅ Metrics endpoint is UP - {metrics_count} metrics available")
        return True, metrics_count
    except Exception as e:
        # Best-effort diagnostic: any failure is reported as "endpoint down"
        # rather than aborting the script.
        logger.error(f"❌ Failed to access metrics endpoint: {e}")
        return False, 0

def check_prometheus_connection(prometheus_url):
    """Report whether the Prometheus HTTP API can be reached.

    Issues a GET against ``<prometheus_url>/api/v1/status/config`` with a
    5 second timeout.

    Args:
        prometheus_url: Base URL of the Prometheus server.

    Returns:
        bool: True when the request answers with HTTP 200; False for any
        other status code or when the request raises.
    """
    try:
        reply = requests.get(f"{prometheus_url}/api/v1/status/config", timeout=5)
    except Exception as exc:
        logger.error(f"❌ Failed to access Prometheus: {exc}")
        return False

    if reply.status_code != 200:
        logger.error(f"❌ Failed to access Prometheus: Status code {reply.status_code}")
        return False

    logger.info("✅ Prometheus is UP and accessible")
    return True

def check_prometheus_targets(prometheus_url, metrics_port=None):
    """Check that Prometheus has a healthy scrape target for the metrics endpoint.

    Looks through the active targets returned by
    ``<prometheus_url>/api/v1/targets`` for one whose ``scrapeUrl`` contains
    ``localhost:<metrics_port>``.

    Args:
        prometheus_url: Base URL of the Prometheus server.
        metrics_port: Port of the metrics endpoint to look for. ``None``
            (the default) falls back to ``METRICS_PORT``, so existing
            single-argument callers behave as before.

    Returns:
        bool: True when a matching target exists and its health is ``up``;
        False when it is missing, unhealthy, or the request fails.
    """
    # Resolve the default at call time so the lookup follows the port the
    # user actually asked for instead of a hard-coded 9092.
    if metrics_port is None:
        metrics_port = METRICS_PORT
    target_address = f"localhost:{metrics_port}"

    try:
        response = requests.get(f"{prometheus_url}/api/v1/targets", timeout=5)
        if response.status_code != 200:
            logger.error(f"❌ Failed to get Prometheus targets: Status code {response.status_code}")
            return False

        targets = response.json().get('data', {}).get('activeTargets', [])

        # First active target whose scrape URL mentions our endpoint, if any
        metrics_target = next(
            (t for t in targets if target_address in str(t.get('scrapeUrl', ''))),
            None,
        )

        if metrics_target is None:
            logger.warning(f"⚠️ No metrics target found for {target_address}")
            logger.info("You may need to add a scrape configuration to Prometheus")

            # Print a helpful snippet that matches the port being checked
            logger.info("\nAdd this to your Prometheus config:")
            logger.info(
                "\n"
                "  - job_name: 'kubernetes-dummy-metrics'\n"
                "    scrape_interval: 15s\n"
                "    static_configs:\n"
                f"      - targets: ['{target_address}']\n"
            )
            return False

        if metrics_target.get('health') == 'up':
            logger.info("✅ Metrics target is configured and UP")
            return True

        logger.error(f"❌ Metrics target is configured but health is {metrics_target.get('health')}")
        error_msg = metrics_target.get('lastError', 'No error message')
        if error_msg:
            logger.error(f"   Error: {error_msg}")
        return False
    except Exception as e:
        # Best-effort diagnostic: report the failure instead of aborting
        logger.error(f"❌ Failed to get Prometheus targets: {e}")
        return False


def check_metric_data(prometheus_url, metric_name="container_cpu_usage_seconds_total"):
    """Check that Prometheus returns at least one series for a metric.

    Runs an instant query for ``metric_name`` against
    ``<prometheus_url>/api/v1/query`` and, when series are found, logs the
    value and labels of the first one as a sample.

    Args:
        prometheus_url: Base URL of the Prometheus server.
        metric_name: Query expression to evaluate.

    Returns:
        bool: True when the query returns one or more series; False when it
        returns none, answers with a non-200 status, or raises.
    """
    try:
        response = requests.get(
            f"{prometheus_url}/api/v1/query",
            params={"query": metric_name},
            timeout=5
        )
        if response.status_code != 200:
            logger.error(f"❌ Failed to query Prometheus: Status code {response.status_code}")
            return False

        results = response.json().get('data', {}).get('result', [])
        if not results:
            logger.warning(f"⚠️ No data found for metric '{metric_name}'")
            return False

        logger.info(f"✅ Found {len(results)} series for metric '{metric_name}'")

        # Show a sample of the data. The metric name is left out of the
        # labels to keep the output clean; a filtered copy is built so the
        # parsed response is not modified.
        sample = results[0]
        metric_labels = {
            key: val
            for key, val in sample.get('metric', {}).items()
            if key != '__name__'
        }
        value = sample.get('value', [0, "0"])[1]
        logger.info(f"   Sample value: {value}")
        logger.info(f"   Sample labels: {json.dumps(metric_labels, indent=2)}")
        return True
    except Exception as e:
        # Best-effort diagnostic: report the failure instead of aborting
        logger.error(f"❌ Failed to query Prometheus: {e}")
        return False


def suggest_fixes(metrics_ok, prometheus_ok, targets_ok, data_ok):
    """Log remediation hints for whichever checks failed.

    Args:
        metrics_ok: Result of the metrics endpoint check.
        prometheus_ok: Result of the Prometheus connection check, or
            ``None`` when that check was not run. ``None`` suppresses the
            Prometheus hints, since nothing is known about its state.
        targets_ok: Result of the scrape target check.
        data_ok: Result of the metric data check.
    """
    if not metrics_ok:
        logger.info("\n🛠️ Suggestions to fix metrics endpoint:")
        logger.info("1. Check if the k8s_dummy_data_generator.py script is running")
        logger.info("2. Verify the port (default: 9092) is correct and not in use")
        logger.info("3. Run the generator script: uv run scripts/k8s_dummy_data_generator.py --http-port 9092")

    # Only advise on Prometheus when it was actually checked and failed
    if prometheus_ok is not None and not prometheus_ok:
        logger.info("\n🛠️ Suggestions to fix Prometheus connection:")
        logger.info("1. Check if Prometheus is running (docker ps | grep prometheus)")
        logger.info("2. Start Prometheus if it's not running: docker compose up -d prometheus")
        logger.info("3. Verify PROMETHEUS_URL is correct (default: http://localhost:9090)")

    if metrics_ok and prometheus_ok and not targets_ok:
        logger.info("\n🛠️ Suggestions to fix Prometheus target configuration:")
        logger.info("1. Add a scrape configuration for the metrics endpoint (see above snippet)")
        logger.info("2. Restart Prometheus: docker compose restart prometheus")

    if metrics_ok and prometheus_ok and targets_ok and not data_ok:
        logger.info("\n🛠️ Suggestions to fix missing metric data:")
        logger.info("1. Wait a few minutes for Prometheus to scrape the data")
        logger.info("2. Check if the metrics names in the dashboard match what's being generated")
        logger.info("3. Verify the metric labels match what's expected in the queries")


def main():
    """Parse CLI options, run the integration checks and log a summary.

    The checks run in order (metrics endpoint, Prometheus connection,
    scrape target, metric data); each later check is skipped when an
    earlier one fails.

    Returns:
        int: 0 when every check passes, 1 otherwise. Intended to be used as
        the process exit status.
    """
    parser = argparse.ArgumentParser(description='Check Kubernetes Metrics Integration')

    parser.add_argument('--port', type=int, default=METRICS_PORT,
                        help=f'Port where metrics are exposed (default: {METRICS_PORT})')

    parser.add_argument('--prometheus', type=str, default=PROMETHEUS_URL,
                        help=f'Prometheus URL (default: {PROMETHEUS_URL})')

    parser.add_argument('--metric', type=str, default="container_cpu_usage_seconds_total",
                        help='Metric name to check (default: container_cpu_usage_seconds_total)')

    args = parser.parse_args()

    # Use local prometheus_url variable instead of global
    prometheus_url = args.prometheus

    logger.info("=== Kubernetes Metrics Integration Check ===")
    logger.info(f"Metrics port: {args.port}")
    logger.info(f"Prometheus URL: {prometheus_url}")
    logger.info(f"Metric to check: {args.metric}")
    logger.info("")

    metrics_ok, metrics_count = check_metrics_endpoint(args.port)
    if not metrics_ok:
        # If metrics endpoint is down, no point in checking the rest.
        # Prometheus is passed as None ("not checked") so no Prometheus
        # advice is printed for a server that was never contacted.
        suggest_fixes(metrics_ok, None, False, False)
        logger.info("\n❌ Integration check FAILED - metrics endpoint is not accessible")
        return 1

    prometheus_ok = check_prometheus_connection(prometheus_url)
    targets_ok = False
    data_ok = False

    if prometheus_ok:
        # Look for the target on the port given via --port, not the default
        targets_ok = check_prometheus_targets(prometheus_url, args.port)
        if targets_ok:
            data_ok = check_metric_data(prometheus_url, args.metric)

    # Suggest fixes based on the check results
    suggest_fixes(metrics_ok, prometheus_ok, targets_ok, data_ok)

    # Summary
    logger.info("\n=== Summary ===")
    logger.info(f"Metrics endpoint: {'✅ UP' if metrics_ok else '❌ DOWN'} - {metrics_count} metrics")
    logger.info(f"Prometheus: {'✅ UP' if prometheus_ok else '❌ DOWN'}")
    logger.info(f"Target configuration: {'✅ OK' if targets_ok else '❌ Missing'}")
    logger.info(f"Metric data: {'✅ Present' if data_ok else '❌ Missing'}")

    # Overall status
    if metrics_ok and prometheus_ok and targets_ok and data_ok:
        logger.info("\n✅ Integration is WORKING CORRECTLY")
        return 0

    logger.info("\n⚠️ Integration has ISSUES that need to be fixed")
    return 1

# Script entry point: run the checks and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())