Skip to main content
Glama

Security MCP Server

by nordeim
alerts.yml (7.73 kB)
# Prometheus alerting rules for the MCP server: availability, reliability,
# latency and concurrency groups.
groups:
# Scrape-level availability, based on the built-in `up` series.
- name: general-availability
  rules:
  # Any scrape target reporting up == 0 for 2 minutes.
  # NOTE(review): `up == 0` also matches job="mcp-server", so an mcp-server
  # outage raises this alert in addition to MCPServerDown.
  - alert: TargetDown
    expr: up == 0
    for: 2m
    labels:
      severity: warning
      category: availability
    annotations:
      summary: "Target down: {{ $labels.job }} on {{ $labels.instance }}"
      description: "Prometheus target {{ $labels.job }} ({{ $labels.instance }}) has been down for 2 minutes."

  # Same check restricted to the mcp-server job, with a shorter hold time
  # and a higher severity.
  - alert: MCPServerDown
    expr: up{job="mcp-server"} == 0
    for: 1m
    labels:
      severity: critical
      category: availability
    annotations:
      summary: "MCP server metrics endpoint down"
      description: "The mcp-server metrics endpoint {{ $labels.instance }} is unreachable for 1 minute."

# Per-tool failure, timeout and error-rate alerts.
- name: mcp-reliability
  rules:
  # Ratio of failed executions to all executions per tool over 10m.
  # clamp_min floors the denominator at 1e-6 so the division never sees 0.
  - alert: MCPFailureRateHigh
    expr: |
      sum by (tool) (rate(mcp_tool_execution_total{success="false"}[10m]))
      / clamp_min(sum by (tool) (rate(mcp_tool_execution_total[10m])), 1e-6)
      > 0.2
    for: 10m
    labels:
      severity: warning
      category: reliability
    annotations:
      summary: "High failure rate for {{ $labels.tool }}"
      description: "Failure ratio > 20% over 10m for tool {{ $labels.tool }}."

  # Same ratio as MCPFailureRateHigh with a 50% threshold.
  - alert: MCPFailureRateSevere
    expr: |
      sum by (tool) (rate(mcp_tool_execution_total{success="false"}[10m]))
      / clamp_min(sum by (tool) (rate(mcp_tool_execution_total[10m])), 1e-6)
      > 0.5
    for: 10m
    labels:
      severity: critical
      category: reliability
    annotations:
      summary: "Severe failure rate for {{ $labels.tool }}"
      description: "Failure ratio > 50% over 10m for tool {{ $labels.tool }}."

  # Share of executions selected by timed_out="true", per tool.
  # NOTE(review): assumes mcp_tool_execution_total exposes a `timed_out`
  # label - confirm against the exporter.
  - alert: MCPTimeoutRateHigh
    expr: |
      sum by (tool) (rate(mcp_tool_execution_total{timed_out="true"}[10m]))
      / clamp_min(sum by (tool) (rate(mcp_tool_execution_total[10m])), 1e-6)
      > 0.1
    for: 10m
    labels:
      severity: warning
      category: reliability
    annotations:
      summary: "Timeouts > 10% for {{ $labels.tool }}"
      description: "Timeout rate exceeds 10% over 10m for tool {{ $labels.tool }}."

  # Any non-zero rate of validation errors per tool, held for 10m.
  - alert: MCPValidationErrorsObserved
    expr: |
      sum by (tool) (
        rate(mcp_tool_errors_total{error_type="validation_error"}[10m])
      ) > 0
    for: 10m
    labels:
      severity: info
      category: reliability
    annotations:
      summary: "Input validation errors for {{ $labels.tool }}"
      description: "Sustained validation errors over 10m. Investigate client inputs or configuration."

  # Errors whose error_type is neither validation_error nor timeout,
  # grouped by tool and error_type, above 0.05 per second.
  - alert: MCPExecutionErrorsSpike
    expr: |
      sum by (tool, error_type) (
        rate(mcp_tool_errors_total{error_type!~"validation_error|timeout"}[10m])
      ) > 0.05
    for: 10m
    labels:
      severity: warning
      category: reliability
    annotations:
      summary: "Execution errors for {{ $labels.tool }} (type={{ $labels.error_type }})"
      description: "Non-validation, non-timeout errors > 0.05/s over 10m."

# Per-tool latency quantiles computed from the execution-time histogram.
- name: mcp-latency
  rules:
  # p95 over a 5m rate window; `le` is kept in the aggregation because
  # histogram_quantile needs the bucket boundaries.
  - alert: MCPLatencyP95High
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, tool) (rate(mcp_tool_execution_seconds_bucket[5m]))
      ) > 5
    for: 10m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: "High p95 latency for {{ $labels.tool }}"
      description: "p95 execution time > 5s for 10m."

  # p99 variant with a 10s threshold.
  - alert: MCPLatencyP99Severe
    expr: |
      histogram_quantile(
        0.99,
        sum by (le, tool) (rate(mcp_tool_execution_seconds_bucket[5m]))
      ) > 10
    for: 10m
    labels:
      severity: critical
      category: performance
    annotations:
      summary: "Severe p99 latency for {{ $labels.tool }}"
      description: "p99 execution time > 10s for 10m."

# Thresholds on the mcp_tool_active series (evaluated per series, no
# aggregation).
- name: mcp-concurrency
  rules:
  - alert: MCPActiveExecutionsHigh
    expr: mcp_tool_active > 4
    for: 10m
    labels:
      severity: warning
      category: capacity
    annotations:
      summary: "High active executions for {{ $labels.tool }}"
      description: "Active executions > 4 for 10m. Review concurrency limits or workload spikes."

  - alert: MCPActiveExecutionsCritical
    expr: mcp_tool_active > 8
    for: 10m
    labels:
      severity: critical
      category: capacity
    annotations:
      summary: "Critical active executions for {{ $labels.tool }}"
      description: "Active executions > 8 for 10m. Possible runaway concurrency."
# Memory usage of the mcp-server container relative to its configured limit.
- name: container-observability
  rules:
  # The `> 0` filter on the denominator drops series whose limit is 0
  # (no memory limit configured). Without it the division yields +Inf,
  # which satisfies the threshold and keeps the alert firing permanently.
  - alert: MCPContainerMemoryPressure
    expr: |
      (
        container_memory_usage_bytes{container="mcp-server"}
        / (container_spec_memory_limit_bytes{container="mcp-server"} > 0)
      ) > 0.9
    for: 10m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "High memory usage in mcp-server container"
      description: "mcp-server memory usage > 90% of limit for 10m."

  # Same ratio and the same zero-limit guard, with a 95% threshold.
  - alert: MCPContainerMemoryCritical
    expr: |
      (
        container_memory_usage_bytes{container="mcp-server"}
        / (container_spec_memory_limit_bytes{container="mcp-server"} > 0)
      ) > 0.95
    for: 10m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: "Critical memory usage in mcp-server container"
      description: "mcp-server memory usage > 95% of limit for 10m."

# Host-level CPU and memory alerts built on node_* series.
- name: node-infrastructure
  rules:
  # 1 minus the average idle-mode CPU rate per instance. The parentheses
  # only make explicit the order PromQL already applies (arithmetic binds
  # tighter than comparison).
  - alert: NodeHighCPU
    expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.8
    for: 10m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "High CPU on {{ $labels.instance }}"
      description: "CPU usage > 80% for 10m."

  # Fraction of total memory reported as available, below 20%.
  - alert: NodeLowMemoryAvailable
    expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.2
    for: 10m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "Low memory available on {{ $labels.instance }}"
      description: "Available memory < 20% for 10m."

# Health of Prometheus' own rule evaluation.
- name: prometheus-self
  rules:
  # Any non-zero rate of rule evaluation failures, held for 5m.
  - alert: PrometheusRuleEvaluationErrors
    expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
      category: observability
    annotations:
      summary: "Prometheus rule evaluation errors"
      description: "Rule evaluation failures observed over 5m."
# Optional (enable if breaker metric exists):
# - name: mcp-circuit-breaker
#   rules:
#   - alert: MCPCircuitBreakerOpenLabeled
#     expr: mcp_circuit_breaker_state{state="open"} == 1
#     for: 5m
#     labels:
#       severity: critical
#       category: reliability
#     annotations:
#       summary: "Circuit breaker OPEN for {{ $labels.tool }}"
#       description: "Breaker has been OPEN for 5m for tool {{ $labels.tool }}."
#
#   - alert: MCPCircuitBreakerOpenNumeric
#     expr: mcp_circuit_breaker_state >= 2
#     for: 5m
#     labels:
#       severity: critical
#       category: reliability
#     annotations:
#       summary: "Circuit breaker OPEN (numeric) for {{ $labels.tool }}"
#       description: "Breaker numeric state indicates OPEN for 5m."

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nordeim/Security-MCP-Server-v3'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.