# metrics.yml - OVN Kubernetes Container Metrics Configuration
# This file defines the Prometheus queries for collecting OVN container metrics
# If this file is not found or empty, hardcoded PromQL queries will be used
# Top-level list of Prometheus queries; each item describes one collected metric.
metrics:
# CPU usage: per-second rate of consumed CPU seconds over a 5m window,
# multiplied by 100 to express it as a percentage. Pause ("POD") and
# unnamed containers are excluded by the label matchers.
- query: >-
    rate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*|ovnkube-node.*",
    container!="POD", container!=""}[5m]) * 100
  metricName: CONTAINER_CPU_USAGE_PERCENT
  unit: percentage
  description: 'Container CPU usage percentage over 5 minutes'
# Memory usage per container in the ovnkube-master / ovnkube-node pods.
# Two views are collected: resident set size and working set.
- query: >-
    container_memory_rss{pod=~"ovnkube-master.*|ovnkube-node.*",
    container!="POD", container!=""}
  metricName: CONTAINER_MEMORY_RSS_BYTES
  unit: bytes
  description: 'Container memory RSS usage'
- query: >-
    container_memory_working_set_bytes{pod=~"ovnkube-master.*|ovnkube-node.*",
    container!="POD", container!=""}
  metricName: CONTAINER_MEMORY_WORKING_SET_BYTES
  unit: bytes
  description: 'Container memory working set'
# OVN database size (as specified in requirements): one entry per database,
# selected through the db_name label of ovn_db_db_size_bytes.
- query: >-
    ovn_db_db_size_bytes{db_name=~"OVN_Northbound"}
  metricName: NBDB_DB_SIZE_BYTES
  unit: bytes
  description: 'OVN Northbound database size'
- query: >-
    ovn_db_db_size_bytes{db_name=~"OVN_Southbound"}
  metricName: SBDB_DB_SIZE_BYTES
  unit: bytes
  description: 'OVN Southbound database size'
# OVN container-specific memory (as specified in requirements): RSS summed
# per pod for one named container each (nbdb, sbdb, northd).
# Every entry has its own metricName so the three series stay distinguishable;
# names follow the NBDB_DB_SIZE_BYTES / SBDB_DB_SIZE_BYTES pattern used for
# the database size metrics.
# NOTE(review): any consumer that looks these up by the former shared name
# OVN_DB_DB_MEMROY_SIZE_BYTES must be updated - confirm against the caller.
- query: >-
    sum by(pod) (container_memory_rss{pod=~"ovnkube-master.*|ovnkube-node.*",
    container=~"nbdb"})
  metricName: NBDB_MEMORY_SIZE_BYTES
  unit: bytes
  description: "NBDB container memory usage"
- query: >-
    sum by(pod) (container_memory_rss{pod=~"ovnkube-master.*|ovnkube-node.*",
    container=~"sbdb"})
  metricName: SBDB_MEMORY_SIZE_BYTES
  unit: bytes
  description: "SBDB container memory usage"
- query: >-
    sum by(pod) (container_memory_rss{pod=~"ovnkube-master.*|ovnkube-node.*",
    container=~"northd"})
  metricName: NORTHD_MEMORY_SIZE_BYTES
  unit: bytes
  description: "Northd container memory usage"
# RSS summed per pod for the ovnkube-controller and ovn-controller containers.
# NOTE(review): "MEMROY" in the metricName values looks like a misspelling of
# "MEMORY"; kept unchanged because consumers may key on the exact name -
# confirm before renaming.
- query: >-
    sum by(pod) (container_memory_rss{pod=~"ovnkube-master.*|ovnkube-node.*",
    container=~"ovnkube-controller"})
  metricName: OVN_KUBE_CONTROLLER_MEMROY_SIZE_BYTES
  unit: bytes
  description: 'OVN Kube controller memory usage'
- query: >-
    sum by(pod) (container_memory_rss{pod=~"ovnkube-master.*|ovnkube-node.*",
    container=~"ovn-controller"})
  metricName: OVN_CONTROLLER_MEMROY_SIZE_BYTES
  unit: bytes
  description: 'OVN controller memory usage'
# Ten largest ovnkube-controller ready-duration samples.
- query: 'topk(10, ovnkube_controller_ready_duration_seconds)'
  metricName: ovnkube_controller_ready_duration_seconds
  unit: seconds
  description: 'Time taken for ovnkube-controller to become ready'
  component: controller
# Ten largest ovnkube-node ready-duration samples.
# The queried series name matches metricName exactly; a space inside the
# identifier would make the PromQL expression unparsable.
- query: 'topk(10, ovnkube_node_ready_duration_seconds)'
  metricName: ovnkube_node_ready_duration_seconds
  unit: seconds
  description: "Time taken for ovnkube-node to become ready"
  component: node
# Controller sync duration, restricted to the openshift-ovn-kubernetes
# namespace.
- query: 'ovnkube_controller_sync_duration_seconds{namespace="openshift-ovn-kubernetes"}'
  metricName: ovnkube_controller_sync_duration_seconds
  unit: seconds
  description: 'Duration of controller sync operations'
  component: controller
# Additional metrics can be added here
# 95th percentile derived from the histogram buckets of the same metric,
# using a 5m rate window.
- query: >-
    histogram_quantile(0.95,
    rate(ovnkube_controller_sync_duration_seconds_bucket[5m]))
  metricName: ovnkube_controller_sync_duration_p95
  unit: seconds
  description: '95th percentile of controller sync duration over 5 minutes'
  component: controller
# 99th-percentile latencies per pod; the trailing "> 0" drops zero-valued
# series from the result.
# NOTE(review): $interval is not valid PromQL on its own; presumably the
# consumer substitutes a range (e.g. 2m) before running the query - confirm.
- query: >-
    histogram_quantile(0.99, sum by (pod, le)
    (rate(ovnkube_controller_pod_creation_latency_seconds_bucket[$interval]))) > 0
  metricName: Pod_Annotation_Latency_p99
# CNI request latency, split by command (ADD / DEL).
- query: >-
    histogram_quantile(0.99,
    sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[$interval]))
    by (pod,le)) > 0
  metricName: CNI_Request_ADD_Latency_p99
- query: >-
    histogram_quantile(0.99,
    sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="DEL"}[$interval]))
    by (pod,le)) > 0
  metricName: CNI_Request_DEL_Latency_p99
# Pod creation latency stages, 99th percentile per pod over a 2m rate window:
# first-seen -> LSP created, then LSP created -> port binding.
- query: >-
    histogram_quantile(0.99, sum by(pod, le)
    (rate(ovnkube_controller_pod_first_seen_lsp_created_duration_seconds_bucket[2m])))
  metricName: Pod_creation_latency_first_seen_lsp_p99
- query: >-
    histogram_quantile(0.99, sum by(pod, le)
    (rate(ovnkube_controller_pod_lsp_created_port_binding_duration_seconds_bucket[2m])))
  metricName: pod_creation_latency_port_binding_p99
# Service sync latency: per-second rate of the histogram's _sum series over 2m.
- query: 'rate(ovnkube_controller_sync_service_latency_seconds_sum[2m])'
  metricName: sync_service_latency
# 99th percentile of service sync latency per pod. The quantile argument is
# 0.99 so the computed value matches the "_p99" suffix of metricName.
- query: >-
    histogram_quantile(0.99, sum by(pod, le)
    (rate(ovnkube_controller_sync_service_latency_seconds_bucket[2m])))
  metricName: sync_service_latency_p99
# Network programming duration, 99th percentile over a 2m rate window.
# The same histogram is aggregated twice: once by pod, once by service.
- query: >-
    histogram_quantile(0.99, sum by(pod, le)
    (rate(ovnkube_controller_network_programming_duration_seconds_bucket[2m])))
  metricName: apply_network_config_pod_duration_p99
- query: >-
    histogram_quantile(0.99, sum by(service, le)
    (rate(ovnkube_controller_network_programming_duration_seconds_bucket[2m])))
  metricName: apply_network_config_service_duration_p99
# Additional Custom Metrics (examples)
# - query: 'ovn_nb_logical_switch_ports'
# metricName: OVN_NB_LOGICAL_SWITCH_PORTS
# unit: count
# description: "Number of logical switch ports in NB database"
# - query: 'ovn_sb_datapath_binding'
# metricName: OVN_SB_DATAPATH_BINDING
# unit: count
# description: "Number of datapath bindings in SB database"
# - query: 'rate(container_network_receive_bytes_total{pod=~"ovnkube.*"}[5m])'
# metricName: OVN_CONTAINER_NETWORK_RX_BYTES_RATE
# unit: bytes_per_second
# description: "Container network receive rate"
# Configuration notes:
# - Container resource queries (container_*) should target OVN pods using pod=~"ovnkube-master.*|ovnkube-node.*"; OVN-native metrics (ovn_db_*, ovnkube_*) are queried without that pod filter
# - Use meaningful metricName values that will be used in the analysis
# - Specify appropriate units: bytes, percentage, count, bytes_per_second, etc.
# - Include helpful descriptions for documentation
# - Comment out unused metrics or add custom ones as needed