diff --git a/workloads/kube-burner-ocp-wrapper/alerts.yml b/workloads/kube-burner-ocp-wrapper/alerts.yml
new file mode 100644
index 00000000..12ff3f83
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/alerts.yml
@@ -0,0 +1,61 @@
+# etcd
+
+- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01
+  description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s
+  severity: warning
+
+- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.03
+  description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s
+  severity: warning
+
+- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0
+  description: etcd leader changes observed
+  severity: warning
+
+# API server
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
+  description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
+  severity: warning
+
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1
+  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s
+  severity: warning
+
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5
+  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s
+  severity: warning
+
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30
+  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s
+  severity: warning
+
+# Control plane pods
+- expr: up{apiserver=~"kube-apiserver|openshift-apiserver"} == 0
+  description: "{{$labels.apiserver}} {{$labels.instance}} down"
+  severity: warning
+
+- expr: up{namespace=~"openshift-etcd"} == 0
+  description: "{{$labels.namespace}}/{{$labels.pod}} down"
+  severity: error
+
+- expr: up{namespace=~"openshift-.*(kube-controller-manager|scheduler|controller-manager|sdn|ovn-kubernetes|dns)"} == 0
+  description: "{{$labels.namespace}}/{{$labels.pod}} down"
+  severity: warning
+
+- expr: up{job=~"crio|kubelet"} == 0
+  description: "{{$labels.node}}/{{$labels.job}} down"
+  severity: warning
+
+- expr: up{job="ovnkube-node"} == 0
+  description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down"
+  severity: warning
+
+# Service sync latency
+- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10
+  description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s
+  severity: warning
+
+# Prometheus alerts
+- expr: ALERTS{severity="critical", alertstate="firing"} > 0
+  description: Critical prometheus alert. {{$labels.alertname}}
+  severity: warning
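The expressions above are evaluated by kube-burner against the cluster Prometheus during a run. A quick way to sanity-check any of them by hand is to query the openshift-monitoring Prometheus route directly; the route lookup and token creation below are assumptions about a stock OpenShift monitoring stack (4.11+ for `oc create token`), not part of this change.

```bash
# Sketch: evaluate the etcd fsync alert expression manually.
TOKEN=$(oc create token prometheus-k8s -n openshift-monitoring)
PROM_URL=https://$(oc get route prometheus-k8s -n openshift-monitoring -o jsonpath='{.spec.host}')
QUERY='avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01'
curl -sk -H "Authorization: Bearer ${TOKEN}" \
  --data-urlencode "query=${QUERY}" \
  "${PROM_URL}/api/v1/query" | jq '.data.result'
```

An empty result set means the alert expression would not fire right now.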
diff --git a/workloads/kube-burner-ocp-wrapper/curl-deployment.yml b/workloads/kube-burner-ocp-wrapper/curl-deployment.yml
new file mode 100644
index 00000000..de96b719
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/curl-deployment.yml
@@ -0,0 +1,53 @@
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: curl-{{.Replica}}-{{.Iteration}}
+spec:
+  template:
+    metadata:
+      labels:
+        name: curl-{{.Replica}}-{{.Iteration}}
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: node-role.kubernetes.io/worker
+                operator: Exists
+              - key: node-role.kubernetes.io/infra
+                operator: DoesNotExist
+              - key: node-role.kubernetes.io/workload
+                operator: DoesNotExist
+      containers:
+      - name: curlapp
+        image: quay.io/cloud-bulldozer/curl:latest
+        command: ["sleep", "inf"]
+        resources:
+          requests:
+            memory: "10Mi"
+            cpu: "10m"
+        env:
+        - name: WEBSERVER_HOSTNAME
+          value: webserver-{{.Replica}}-{{.Iteration}}
+        - name: WEBSERVER_PORT
+          value: "8080"
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          privileged: false
+        startupProbe:
+          exec:
+            command:
+            - "/bin/sh"
+            - "-c"
+            - "curl --fail -sS ${WEBSERVER_HOSTNAME}:${WEBSERVER_PORT} -o /dev/null"
+          periodSeconds: 1
+          timeoutSeconds: 1
+          failureThreshold: 600
+      restartPolicy: Always
+  replicas: 1
+  selector:
+    matchLabels:
+      name: curl-{{.Replica}}-{{.Iteration}}
+  strategy:
+    type: RollingUpdate
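kube-burner renders {{.Replica}} and {{.Iteration}} when it instantiates the template, so each curl deployment targets the webserver service created in the same iteration, and the startupProbe keeps the pod from reporting started until that service answers. The command below is a hypothetical manual re-run of the same probe against an already-created iteration; the namespace and deployment name are assumptions about how the job renders them (with namespaced iterations the namespace carries an index suffix).

```bash
# Sketch: re-run the startup probe command inside an existing curl pod.
oc exec -n node-density-cni deploy/curl-1-0 -- \
  sh -c 'curl --fail -sS ${WEBSERVER_HOSTNAME}:${WEBSERVER_PORT} -o /dev/null && echo reachable'
```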
diff --git a/workloads/kube-burner-ocp-wrapper/metrics-report.yml b/workloads/kube-burner-ocp-wrapper/metrics-report.yml
new file mode 100644
index 00000000..8570a594
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/metrics-report.yml
@@ -0,0 +1,230 @@
+---
+# Kubelet & CRI-O
+
+# Average CPU usage across all workers' kubelets
+- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
+  metricName: cpu-kubelet
+  instant: true
+
+# Average memory usage across all workers' kubelets
+- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
+  metricName: memory-kubelet
+  instant: true
+
+# Max memory usage across all workers' kubelets
+- query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
+  metricName: max-memory-kubelet
+  instant: true
+
+# Average CPU usage across all workers' CRI-O
+- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
+  metricName: cpu-crio
+  instant: true
+
+# Average memory usage across all workers' CRI-O
+- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
+  metricName: memory-crio
+  instant: true
+
+# Max memory usage across all workers' CRI-O
+- query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
+  metricName: max-memory-crio
+  instant: true
+
+# Etcd
+
+- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
+  metricName: 99thEtcdDiskBackendCommit
+  instant: true
+
+- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
+  metricName: 99thEtcdDiskWalFsync
+  instant: true
+
+- query: avg(avg_over_time(histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))[{{.elapsed}}:]))
+  metricName: 99thEtcdRoundTripTime
+  instant: true
+
+# Control-plane
+
+- query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
+  metricName: cpu-kube-controller-manager
+  instant: true
+
+- query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[{{.elapsed}}:]))
+  metricName: memory-kube-controller-manager
+  instant: true
+
+- query: max(max_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[{{.elapsed}}:]))
+  metricName: max-memory-kube-controller-manager
+  instant: true
+
+- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-apiserver"}[2m])) by (pod))[{{.elapsed}}:]))
+  metricName: cpu-kube-apiserver
+  instant: true
+
+- query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-kube-apiserver"}) by (pod))[{{.elapsed}}:]))
+  metricName: memory-kube-apiserver
+  instant: true
+
+- query: max(max_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-kube-apiserver"}) by (pod))[{{.elapsed}}:]))
+  metricName: max-memory-kube-apiserver
+  instant: true
+
+- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-apiserver"}[2m])) by (pod))[{{.elapsed}}:]))
+  metricName: cpu-openshift-apiserver
+  instant: true
+
+- query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-apiserver"}) by (pod))[{{.elapsed}}:]))
+  metricName: memory-openshift-apiserver
+  instant: true
+
+- query: max(max_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-apiserver"}) by (pod))[{{.elapsed}}:]))
+  metricName: max-memory-openshift-apiserver
+  instant: true
+
+- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-etcd"}[2m])) by (pod))[{{.elapsed}}:]))
+  metricName: cpu-etcd
+  instant: true
+
+- query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
+  metricName: cpu-openshift-controller-manager
+  instant: true
+
+- query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-controller-manager"}) by (pod))[{{.elapsed}}:]))
+  metricName: memory-openshift-controller-manager
+  instant: true
+
+- query: max(max_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-controller-manager"}) by (pod))[{{.elapsed}}:]))
+  metricName: max-memory-openshift-controller-manager
+  instant: true
+
+- query: avg(avg_over_time(topk(3,sum(container_memory_rss{name!="", namespace="openshift-etcd"}) by (pod))[{{.elapsed}}:]))
+  metricName: memory-etcd
+  instant: true
+
+- query: max(max_over_time(topk(3,sum(container_memory_rss{name!="", namespace="openshift-etcd"}) by (pod))[{{.elapsed}}:]))
+  metricName: max-memory-etcd
+  instant: true
+
+# multus
+
+- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[2m])[{{.elapsed}}:])) by (container)
+  metricName: cpu-multus
+  instant: true
+
+- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[{{.elapsed}}:])) by (container)
+  metricName: memory-multus
+  instant: true
+
+- query: max(avg_over_time(container_memory_rss{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[{{.elapsed}}:])) by (container)
+  metricName: max-memory-multus
+  instant: true
+
+# OVNKubernetes - standard & IC
+
+- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[2m])[{{.elapsed}}:])) by (container)
+  metricName: cpu-ovn-control-plane
+  instant: true
+
+- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[{{.elapsed}}:])) by (container)
+  metricName: memory-ovn-control-plane
+  instant: true
+
+- query: max(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[{{.elapsed}}:])) by (container)
+  metricName: max-memory-ovn-control-plane
+  instant: true
+
+- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[2m])[{{.elapsed}}:])) by (container)
+  metricName: cpu-ovnkube-node
+  instant: true
+
+- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[{{.elapsed}}:])) by (container)
+  metricName: memory-ovnkube-node
+  instant: true
+
+- query: max(max_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[{{.elapsed}}:])) by (container)
+  metricName: max-memory-ovnkube-node
+  instant: true
+
+# Nodes
+
+- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
+  metricName: cpu-masters
+  instant: true
+
+- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
+  metricName: memory-masters
+  instant: true
+
+- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
+  metricName: max-memory-masters
+  instant: true
+
+- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
+  metricName: cpu-workers
+  instant: true
+
+- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+  metricName: memory-workers
+  instant: true
+
+- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+  metricName: max-memory-workers
+  instant: true
+
+- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
+  metricName: cpu-infra
+  instant: true
+
+- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+  metricName: memory-infra
+  instant: true
+
+- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+  metricName: max-memory-infra
+  instant: true
+
+# Monitoring and ingress
+
+- query: avg(avg_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}[2m])) by (pod)[{{.elapsed}}:]))
+  metricName: cpu-prometheus
+  instant: true
+
+- query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[{{.elapsed}}:]))
+  metricName: memory-prometheus
+  instant: true
+
+- query: max(max_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[{{.elapsed}}:]))
+  metricName: max-memory-prometheus
+  instant: true
+
+- query: avg(avg_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ingress", pod=~"router-default.+"}[2m])) by (pod)[{{.elapsed}}:]))
+  metricName: cpu-router
+  instant: true
+
+- query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-ingress", pod=~"router-default.+"}) by (pod)[{{.elapsed}}:]))
+  metricName: memory-router
+  instant: true
+
+- query: max(max_over_time(sum(container_memory_rss{name!="", namespace="openshift-ingress", pod=~"router-default.+"}) by (pod)[{{.elapsed}}:]))
+  metricName: max-memory-router
+  instant: true
+
+# Cluster
+
+- query: avg_over_time(cluster:memory_usage:ratio[{{.elapsed}}:])
+  metricName: memory-cluster-usage-ratio
+  instant: true
+
+- query: max_over_time(cluster:memory_usage:ratio[{{.elapsed}}:])
+  metricName: max-memory-cluster-usage-ratio
+  instant: true
+
+- query: avg_over_time(cluster:node_cpu:ratio[{{.elapsed}}:])
+  metricName: cpu-cluster-usage-ratio
+  instant: true
+
+- query: max_over_time(cluster:node_cpu:ratio[{{.elapsed}}:])
+  metricName: max-cpu-cluster-usage-ratio
+  instant: true
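Every query in this report profile aggregates over a `[{{.elapsed}}:]` subquery, which kube-burner fills in with the job duration, so each metricName collapses to a single instant value per run. Substituting a literal duration lets you preview a profile query by hand; the 30m window and the TOKEN/PROM_URL variables below are the same assumptions used in the earlier sketch.

```bash
# Sketch: preview an aggregated report query with a hard-coded 30m window.
QUERY='avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[30m:]))'
curl -sk -H "Authorization: Bearer ${TOKEN}" \
  --data-urlencode "query=${QUERY}" \
  "${PROM_URL}/api/v1/query" | jq '.data.result[0].value'
```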
diff --git a/workloads/kube-burner-ocp-wrapper/metrics.yml b/workloads/kube-burner-ocp-wrapper/metrics.yml
new file mode 100644
index 00000000..e3df522f
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/metrics.yml
@@ -0,0 +1,109 @@
+# API server
+
+- query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0
+  metricName: schedulingThroughput
+
+- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0
+  metricName: readOnlyAPICallsLatency
+
+- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0
+  metricName: mutatingAPICallsLatency
+
+- query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH"}[2m])) by (verb,resource,code) > 0
+  metricName: APIRequestRate
+
+# Kubeproxy and OVN service sync latency
+
+- query: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 0
+  metricName: serviceSyncLatency
+
+- query: histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket{kind="service"}[2m])) by (le))
+  metricName: serviceSyncLatency
+
+# Containers & pod metrics
+
+- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|network-node-identity|multus|.*apiserver|authentication|.*controller-manager|.*scheduler|image-registry|operator-lifecycle-manager)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
+  metricName: containerCPU-Masters
+
+- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(monitoring|sdn|ovn-kubernetes|multus|ingress)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
+  metricName: containerCPU-Infra
+
+- query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|network-node-identity|sdn|multus|ingress|authentication|.*controller-manager|.*scheduler|image-registry|operator-lifecycle-manager)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
+  metricName: containerMemory-Masters
+
+- query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|multus|ingress|monitoring|image-registry)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
+  metricName: containerMemory-Infra
+
+- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|multus)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="ovnic"}) > 0
+  metricName: containerCPU-Workers
+
+- query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|multus)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="ovnic"}) > 0
+  metricName: containerMemory-Workers
+
+# Node metrics: CPU & Memory
+
+- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="ovnic"}, "instance", "$1", "node", "(.+)")) > 0
+  metricName: nodeCPU-Workers
+
+- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
+  metricName: nodeCPU-Masters
+
+- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) > 0
+  metricName: nodeCPU-Infra
+
+# We compute memory utilization by subtracting available memory from the total
+#
+- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+  metricName: nodeMemoryUtilization-Masters
+
+- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")
+  metricName: nodeMemoryUtilization-Infra
+
+# Etcd metrics
+
+- query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
+  metricName: etcdLeaderChangesRate
+
+- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
+  metricName: 99thEtcdDiskBackendCommitDurationSeconds
+
+- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
+  metricName: 99thEtcdDiskWalFsyncDurationSeconds
+
+- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
+  metricName: 99thEtcdRoundTripTimeSeconds
+
+- query: sum by (cluster_version)(etcd_cluster_version)
+  metricName: etcdVersion
+  instant: true
+
+# Cluster metrics
+
+- query: sum(kube_namespace_status_phase) by (phase) > 0
+  metricName: namespaceCount
+
+- query: count(kube_secret_info{})
+  metricName: secretCount
+  instant: true
+
+- query: count(kube_deployment_labels{})
+  metricName: deploymentCount
+  instant: true
+
+- query: count(kube_configmap_info{})
+  metricName: configmapCount
+  instant: true
+
+- query: count(kube_service_info{})
+  metricName: serviceCount
+  instant: true
+
+- query: kube_node_role
+  metricName: nodeRoles
+
+- query: sum(kube_node_status_condition{status="true"}) by (condition)
+  metricName: nodeStatus
+
+- query: count(kube_replicaset_labels{})
+  metricName: replicaSetCount
+  instant: true
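Both profiles are meant to be handed to kube-burner alongside a job configuration such as node-density-cni.yml below. Roughly, the wrapper ends up assembling an invocation along these lines; the flag names follow current kube-burner releases, and the concrete values are placeholders rather than the wrapper's exact command line.

```bash
# Sketch of the kind of command run.sh assembles (values are illustrative).
kube-burner init -c node-density-cni.yml \
  -m metrics.yml -a alerts.yml \
  -u "${PROM_URL}" -t "${TOKEN}" \
  --uuid "${UUID}"
```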
diff --git a/workloads/kube-burner-ocp-wrapper/node-density-cni.yml b/workloads/kube-burner-ocp-wrapper/node-density-cni.yml
new file mode 100644
index 00000000..7386eacf
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/node-density-cni.yml
@@ -0,0 +1,39 @@
+---
+global:
+  gc: {{.GC}}
+  requestTimeout: 60s
+  gcMetrics: {{.GC_METRICS}}
+  indexerConfig:
+    esServers: ["{{.ES_SERVER}}"]
+    insecureSkipVerify: true
+    defaultIndex: {{.ES_INDEX}}
+    type: {{.INDEXING_TYPE}}
+  measurements:
+    - name: podLatency
+jobs:
+  - name: node-density-cni
+    namespace: node-density-cni
+    jobIterations: {{.JOB_ITERATIONS}}
+    qps: {{.QPS}}
+    burst: {{.BURST}}
+    namespacedIterations: {{.NAMESPACED_ITERATIONS}}
+    iterationsPerNamespace: {{.ITERATIONS_PER_NAMESPACE}}
+    podWait: false
+    waitWhenFinished: true
+    preLoadImages: true
+    preLoadPeriod: 15s
+    namespaceLabels:
+      security.openshift.io/scc.podSecurityLabelSync: false
+      pod-security.kubernetes.io/enforce: privileged
+      pod-security.kubernetes.io/audit: privileged
+      pod-security.kubernetes.io/warn: privileged
+    objects:
+
+      - objectTemplate: webserver-deployment.yml
+        replicas: 1
+
+      - objectTemplate: webserver-service.yml
+        replicas: 1
+
+      - objectTemplate: curl-deployment.yml
+        replicas: 1
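The placeholders ({{.JOB_ITERATIONS}}, {{.ES_SERVER}}, and so on) are filled from environment variables by run.sh. A hypothetical invocation of the wrapper for this workload could look like the following; the variable names are assumptions based on the wrapper script, except for the US_WEST_2* ones introduced in the run.sh change below.

```bash
# Sketch: run node-density-cni through the wrapper, pre-scaling two AZs to 120 workers each.
US_WEST_2A=120 US_WEST_2B=120 \
ITERATIONS=500 ES_SERVER=https://search.example.com:443 \
WORKLOAD=node-density-cni ./run.sh
```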
diff --git a/workloads/kube-burner-ocp-wrapper/run.sh b/workloads/kube-burner-ocp-wrapper/run.sh
index 9a36e35d..cdc828d8 100755
--- a/workloads/kube-burner-ocp-wrapper/run.sh
+++ b/workloads/kube-burner-ocp-wrapper/run.sh
@@ -13,6 +13,10 @@ GC=${GC:-true}
 EXTRA_FLAGS=${EXTRA_FLAGS:-}
 UUID=${UUID:-$(uuidgen)}
 KUBE_DIR=${KUBE_DIR:-/tmp}
+US_WEST_2A=${US_WEST_2A:-}
+US_WEST_2B=${US_WEST_2B:-}
+US_WEST_2C=${US_WEST_2C:-}
+US_WEST_2D=${US_WEST_2D:-}
 
 download_binary(){
   KUBE_BURNER_URL=https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_VERSION}/kube-burner-V${KUBE_BURNER_VERSION}-linux-x86_64.tar.gz
@@ -116,6 +120,60 @@ fi
 
 # Capture the exit code of the run, but don't exit the script if it fails.
 set +e
+# scale machineset
+for machineset_name in $(oc get -n openshift-machine-api machineset --no-headers -o custom-columns=":.metadata.name" | grep -i worker); do
+  region=$(oc get -n openshift-machine-api machineset --no-headers -o custom-columns=":.spec.template.spec.providerSpec.value.placement.availabilityZone" $machineset_name)
+  # region will be of the form us-west-2a. We need to match it to the user-provided var, i.e. replace "-" with "_" and convert to upper case.
+  # For example, us-west-2a will be converted to US_WEST_2A.
+  region_var=$(echo "$region" | tr '-' '_' | tr '[:lower:]' '[:upper:]')
+  # desired_replicas will be the value stored in US_WEST_2A (if provided by the user)
+  desired_replicas=${!region_var}
+  if [[ "${desired_replicas}" != "" ]]; then
+    echo "scale the ${machineset_name} to ${desired_replicas}"
+    current_replicas=$(oc get -n openshift-machine-api -o template machineset "$machineset_name" --template={{.status.replicas}})
+    # scale 50 at a time
+    while ((current_replicas < desired_replicas)); do
+      needed_replicas=$((desired_replicas - current_replicas))
+      scale_step=$((current_replicas + needed_replicas))
+
+      if ((needed_replicas > 50)); then
+        scale_step=$((current_replicas + 50))
+      fi
+      echo "Scaling from $current_replicas to $scale_step replicas."
+      oc scale -n openshift-machine-api machineset "$machineset_name" --replicas="${scale_step}"
+      # wait for up to 1 hour, i.e. 720 retries with a 5 second sleep between retries
+      for ((i = 1; i <= 720; i++)); do
+        available_replicas=$(oc get -n openshift-machine-api -o template machineset "$machineset_name" --template={{.status.availableReplicas}})
+        if [ "$available_replicas" -eq "$scale_step" ]; then
+          echo "Desired number of replicas ($scale_step) reached."
+          break
+        fi
+        sleep 5
+      done
+      current_replicas=$(oc get -n openshift-machine-api -o template machineset "$machineset_name" --template={{.status.replicas}})
+    done
+  fi
+done
+
+
+# Label a few workers with the ovnic role. Worker metrics are pulled only from these nodes.
+# node-density-cni on 500 nodes runs for about 2 hours 15 minutes, and scraping metrics from all 500 workers
+# for that whole duration is overkill, so we label (and therefore scrape) at most 2 worker nodes below.
+workers_to_label=$(oc get nodes --ignore-not-found -l node-role.kubernetes.io/worker --no-headers=true | wc -l) || true
+if [ "$workers_to_label" -gt 2 ]; then
+  workers_to_label=2
+fi
+
+count=0
+for node in $(oc get nodes --ignore-not-found -l node-role.kubernetes.io/worker --no-headers -o custom-columns=":.metadata.name"); do
+  if [ "$count" -eq "$workers_to_label" ]; then
+    break
+  fi
+  oc label nodes $node 'node-role.kubernetes.io/ovnic='
+  ((count++))
+done
+
+
 echo $cmd
 JOB_START=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 $cmd
diff --git a/workloads/kube-burner-ocp-wrapper/webserver-deployment.yml b/workloads/kube-burner-ocp-wrapper/webserver-deployment.yml
new file mode 100644
index 00000000..34aa5c8d
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/webserver-deployment.yml
@@ -0,0 +1,41 @@
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: webserver-{{.Replica}}-{{.Iteration}}
+spec:
+  template:
+    metadata:
+      labels:
+        name: webserver-{{.Replica}}-{{.Iteration}}
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: node-role.kubernetes.io/worker
+                operator: Exists
+              - key: node-role.kubernetes.io/infra
+                operator: DoesNotExist
+              - key: node-role.kubernetes.io/workload
+                operator: DoesNotExist
+      containers:
+      - name: webserver
+        image: quay.io/cloud-bulldozer/sampleapp:latest
+        resources:
+          requests:
+            memory: "10Mi"
+            cpu: "10m"
+        ports:
+        - containerPort: 8080
+          protocol: TCP
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          privileged: false
+      restartPolicy: Always
+  replicas: 1
+  selector:
+    matchLabels:
+      name: webserver-{{.Replica}}-{{.Iteration}}
+  strategy:
+    type: RollingUpdate
diff --git a/workloads/kube-burner-ocp-wrapper/webserver-service.yml b/workloads/kube-burner-ocp-wrapper/webserver-service.yml
new file mode 100644
index 00000000..a569151b
--- /dev/null
+++ b/workloads/kube-burner-ocp-wrapper/webserver-service.yml
@@ -0,0 +1,12 @@
+kind: Service
+apiVersion: v1
+metadata:
+  name: webserver-{{.Replica}}-{{.Iteration}}
+spec:
+  selector:
+    name: webserver-{{.Replica}}-{{.Iteration}}
+  ports:
+  - protocol: TCP
+    port: 8080
+    targetPort: 8080
+  type: ClusterIP
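Two bash idioms in the run.sh hunk above carry most of the logic: the availability-zone name is normalized into a variable name and dereferenced with indirect expansion (${!var}), and the machineset is grown in steps of at most 50 replicas. A standalone illustration, separate from the script itself:

```bash
#!/usr/bin/env bash
# Standalone illustration of the indirection and step logic used in run.sh.
US_WEST_2A=120                       # user-provided desired replica count for that AZ
region="us-west-2a"                  # as read from the machineset's providerSpec
region_var=$(echo "$region" | tr '-' '_' | tr '[:lower:]' '[:upper:]')   # -> US_WEST_2A
desired_replicas=${!region_var}      # indirect expansion -> 120

current_replicas=0
while ((current_replicas < desired_replicas)); do
  needed=$((desired_replicas - current_replicas))
  step=$((needed > 50 ? 50 : needed))          # never grow by more than 50 at a time
  current_replicas=$((current_replicas + step))
  echo "scaled to ${current_replicas}/${desired_replicas}"
done
```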