From a10905a95d6997c1beda07bbcb3e246bdee273fc Mon Sep 17 00:00:00 2001 From: David Espejo <82604841+davidmirror-ops@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:53:31 -0500 Subject: [PATCH] Update Grafana User dashboard (#5703) * Fix Accepted wf metric and succesful wf Signed-off-by: davidmirror-ops * Change successful wf execution to avg Signed-off-by: davidmirror-ops * Fix failed wf metric Signed-off-by: davidmirror-ops * 1st try to get kube pod label metric to work Signed-off-by: davidmirror-ops * Fixes pending tasks query Signed-off-by: davidmirror-ops * Fixes memory usage query Signed-off-by: davidmirror-ops * Fixes CPU usage query and panel type Signed-off-by: davidmirror-ops * Fixes CPU usage graph and remove node queuing graph Signed-off-by: davidmirror-ops * Turn success wf exec into bargauge viz Signed-off-by: davidmirror-ops * Turn failure duration quantile into bargauge Signed-off-by: davidmirror-ops * Minor fixes Signed-off-by: davidmirror-ops * Change to rates from counters Signed-off-by: davidmirror-ops * Remove stale file Signed-off-by: davidmirror-ops * Adapt CPU/mem metrics Signed-off-by: davidmirror-ops --------- Signed-off-by: davidmirror-ops --- .../stats/prometheus/flyteuser-dashboard.json | 781 +++++++----------- stats/flyteuser.dashboard.py | 109 +-- 2 files changed, 331 insertions(+), 559 deletions(-) diff --git a/deployment/stats/prometheus/flyteuser-dashboard.json b/deployment/stats/prometheus/flyteuser-dashboard.json index 55c7ad5851..36eb2bb7bf 100644 --- a/deployment/stats/prometheus/flyteuser-dashboard.json +++ b/deployment/stats/prometheus/flyteuser-dashboard.json @@ -12,7 +12,7 @@ "annotations": { "list": [] }, - "description": "Flyte User Dashboard. This is great to get a birds-eye and drill down view of executions in your Flyte cluster. Useful for the user.", + "description": "Flyte User Dashboard. It's designed to give an overview of execution status and resource consumption.", "editable": false, "gnetId": null, "graphTooltip": 0, @@ -40,7 +40,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -96,7 +97,7 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "expr": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", "format": "time_series", "hide": false, "instant": false, @@ -104,7 +105,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "query": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", "refId": "A", "step": 10, "target": "" @@ -113,7 +114,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Accepted Workflow", + "title": "Accepted Workflows (avg)", "tooltip": { "msResolution": true, "shared": true, @@ -167,7 +168,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -240,7 +242,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Successful Workflow", + "title": "Workflow success rate", "tooltip": { "msResolution": true, "shared": true, @@ -294,7 +296,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -367,7 +370,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Failed Workflow", + "title": "Workflow failure rate", "tooltip": { "msResolution": true, "shared": true, @@ -421,7 +424,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -477,7 +481,7 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "expr": "avg_over_time(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m])", "format": "time_series", "hide": false, "instant": false, @@ -485,7 +489,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "query": "avg_over_time(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m])", "refId": "A", "step": 10, "target": "" @@ -494,7 +498,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Aborted Workflow", + "title": "Aborted Workflows (avg)", "tooltip": { "msResolution": true, "shared": true, @@ -513,7 +517,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -536,8 +540,6 @@ } }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -547,64 +549,84 @@ "defaults": { "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ] } } }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, "gridPos": null, "height": null, "hideTimeOverride": false, "id": 5, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "decimals": null, + "links": [], + "max": 100, + "min": 0, + "title": null, + "unit": "s" + }, + "limit": null, + "mappings": [], + "override": {}, + "thresholds": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 2, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "expr": "(avg(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "format": "time_series", "hide": false, "instant": false, @@ -612,59 +634,20 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "query": "(avg(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Successful workflow execution time by Quantile", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "Successful wf execution duration by quantile", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "bargauge" }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -674,191 +657,84 @@ "defaults": { "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ] } } }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, "gridPos": null, "height": null, "hideTimeOverride": false, "id": 6, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Failed workflow execution time by Quantile", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "decimals": null, + "links": [], + "max": 100, + "min": 0, + "title": null, + "unit": "s" + }, + "limit": null, + "mappings": [], + "override": {}, + "thresholds": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ], + "values": false }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 7, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 2, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:queueing_latency_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "expr": "(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "format": "time_series", "hide": false, "instant": false, @@ -866,55 +742,18 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:queueing_latency_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "query": "(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node queuing latency by Quantile", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "Failed wf execution duration by quantile", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "bargauge" } ], "repeat": null, @@ -939,7 +778,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -952,7 +792,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 8, + "id": 7, "interval": null, "isNew": true, "legend": { @@ -1001,7 +841,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "max cpu", + "legendFormat": "CPU limit", "metric": "", "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"hard\"}", "refId": "A", @@ -1016,7 +856,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "used cpu", + "legendFormat": "CPU requested", "metric": "", "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"used\"}", "refId": "B", @@ -1027,7 +867,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "CPU Limits vs usage", + "title": "CPU Limit vs requested by namespace", "tooltip": { "msResolution": true, "shared": true, @@ -1046,7 +886,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1081,7 +921,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1094,7 +935,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 9, + "id": 8, "interval": null, "isNew": true, "legend": { @@ -1137,30 +978,30 @@ "targets": [ { "datasource": null, - "expr": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"}", + "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "max mem", + "legendFormat": "Memory limit (MiB)", "metric": "", - "query": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"}", + "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"}", + "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "used mem", + "legendFormat": "Memory requested (MiB)", "metric": "", - "query": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"}", + "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", "refId": "B", "step": 10, "target": "" @@ -1169,7 +1010,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Mem Limits vs usage", + "title": "Memory limit vs requested by namespace (MiB)", "tooltip": { "msResolution": true, "shared": true, @@ -1188,7 +1029,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1213,7 +1054,7 @@ ], "repeat": null, "showTitle": true, - "title": "Kubernetes Quota Usage stats" + "title": "Kubernetes Resource Quota Usage" }, { "collapse": true, @@ -1233,7 +1074,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1246,7 +1088,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 10, + "id": 9, "interval": null, "isNew": true, "legend": { @@ -1289,7 +1131,7 @@ "targets": [ { "datasource": null, - "expr": "sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0", + "expr": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", "format": "time_series", "hide": false, "instant": false, @@ -1297,7 +1139,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0", + "query": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", "refId": "A", "step": 10, "target": "" @@ -1306,7 +1148,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Pending tasks", + "title": "Pending Tasks", "tooltip": { "msResolution": true, "shared": true, @@ -1348,8 +1190,6 @@ } }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -1357,66 +1197,80 @@ "error": false, "fieldConfig": { "defaults": { + "color": { + "fixedColor": "none", + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "overrides": [] }, "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 11, + "id": 10, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] + "barRadius": 0.0, + "barWidth": 0.97, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "true", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 4, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "(100 * max(container_memory_rss{image!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "expr": "(100 * (max(container_memory_working_set_bytes{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "format": "time_series", "hide": false, "instant": false, @@ -1424,59 +1278,20 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(100 * max(container_memory_rss{image!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "query": "(100 * (max(container_memory_working_set_bytes{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Memory Usage Percentage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "Memory Usage per Task(%)", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "barchart" }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -1484,66 +1299,80 @@ "error": false, "fieldConfig": { "defaults": { + "color": { + "fixedColor": "none", + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "overrides": [] }, "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 12, + "id": 11, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] + "barRadius": 0.0, + "barWidth": 0.97, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "true", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 4, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "(100* sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "expr": "(100 * (sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "format": "time_series", "hide": false, "instant": false, @@ -1551,55 +1380,18 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(100* sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "query": "(100 * (sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "CPU Usage Percentage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "CPU Usage per Task(%)", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "barchart" } ], "repeat": null, @@ -1624,7 +1416,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1637,7 +1430,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 13, + "id": 12, "interval": null, "isNew": true, "legend": { @@ -1697,7 +1490,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "User errors", + "title": "User error rate", "tooltip": { "msResolution": true, "shared": true, @@ -1751,7 +1544,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1764,7 +1558,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 14, + "id": 13, "interval": null, "isNew": true, "legend": { @@ -1824,7 +1618,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "System errors", + "title": "System error rate", "tooltip": { "msResolution": true, "shared": true, @@ -1868,7 +1662,7 @@ ], "repeat": null, "showTitle": true, - "title": "Error (System vs user)" + "title": "Error (System vs User)" } ], "schemaVersion": 12, @@ -1971,6 +1765,7 @@ }, "timepicker": { "hidden": false, + "nowDelay": null, "refresh_intervals": [ "5s", "10s", diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index d554532b24..763362f004 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -1,6 +1,6 @@ import typing from grafanalib.core import ( - Alert, AlertCondition, Dashboard, Graph, + Alert, AlertCondition, Dashboard, Graph,BarChart,BarGauge, GreaterThan, OP_AND, OPS_FORMAT, Row, RTYPE_SUM, SECONDS_FORMAT, SHORT_FORMAT, single_y_axis, Target, TimeRange, YAxes, YAxis, MILLISECONDS_FORMAT, Templating, Template, DataSourceInput @@ -19,11 +19,11 @@ def workflow_stats(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="Accepted Workflow", + title="Accepted Workflows (avg)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:accepted{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='avg(flyte:propeller:all:workflow:accepted{project=~"$project", domain=~"$domain", wf=~"$workflow"})', refId='A', ), ], @@ -33,7 +33,7 @@ def workflow_stats(collapse: bool) -> Row: ), ), Graph( - title="Successful Workflow", + title="Workflow success rate", dataSource=DATASOURCE, targets=[ Target( @@ -41,13 +41,10 @@ def workflow_stats(collapse: bool) -> Row: refId='A', ), ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( - title="Failed Workflow", + title="Workflow failure rate", dataSource=DATASOURCE, targets=[ Target( @@ -55,105 +52,85 @@ def workflow_stats(collapse: bool) -> Row: refId='A', ), ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( - title="Aborted Workflow", + title="Aborted Workflows (avg)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:workflow_aborted{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='avg_over_time(flyte:propeller:all:workflow:workflow_aborted{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m])', refId='A', ), ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), + yAxes=single_y_axis(format=SHORT_FORMAT), ), - Graph( - title="Successful workflow execution time by Quantile", + BarGauge( + title="Successful wf execution duration by quantile", dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='(avg(flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by(quantile))/1000', refId='A', ), ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + orientation='horizontal', + format=SECONDS_FORMAT, ), - Graph( - title="Failed workflow execution time by Quantile", + BarGauge( + title="Failed wf execution duration by quantile", dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by(quantile))/1000', refId='A', ), ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), - ), - Graph( - title="Node queuing latency by Quantile", - dataSource=DATASOURCE, - targets=[ - Target( - expr='sum(flyte:propeller:all:node:queueing_latency_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', - refId='A', - ), - ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + orientation='horizontal', + format=SECONDS_FORMAT, ), ]) @staticmethod def quota_stats(collapse: bool) -> Row: return Row( - title="Kubernetes Quota Usage stats", + title="Kubernetes Resource Quota Usage", collapse=collapse, panels=[ Graph( - title="CPU Limits vs usage", + title="CPU Limit vs requested by namespace", dataSource=DATASOURCE, targets=[ Target( expr='kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="hard"}', refId='A', - legendFormat="max cpu", + legendFormat="CPU limit", ), Target( expr='kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="used"}', refId='B', - legendFormat="used cpu", + legendFormat="CPU requested", ), ], yAxes=YAxes( - YAxis(format=OPS_FORMAT), YAxis(format=SHORT_FORMAT), ), ), Graph( - title="Mem Limits vs usage", + title="Memory limit vs requested by namespace (MiB)", dataSource=DATASOURCE, targets=[ Target( - expr='kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="hard"}', + expr='(kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="hard"})*9.5367e-7', refId='A', - legendFormat="max mem", + legendFormat="Memory limit (MiB)", ), Target( - expr='kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="used"}', + expr='(kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="used"})*9.5367e-7', refId='B', - legendFormat="used mem", + legendFormat="Memory requested (MiB)", ), ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), ), ]) @@ -164,48 +141,48 @@ def resource_stats(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="Pending tasks", + title="Pending Tasks", dataSource=DATASOURCE, targets=[ Target( - expr='sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0', + expr='sum(kube_pod_status_phase{phase="Pending"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~"$workflow"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0', refId='A', ), ], yAxes=single_y_axis(format=SHORT_FORMAT), ), - Graph( - title="Memory Usage Percentage", + BarChart( + title="Memory Usage per Task(%)", dataSource=DATASOURCE, targets=[ Target( - expr='(100 * max(container_memory_rss{image!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(100 * (max(container_memory_working_set_bytes{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0', refId='A', ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + showValue='true', ), - Graph( - title="CPU Usage Percentage", + BarChart( + title="CPU Usage per Task(%)", dataSource=DATASOURCE, targets=[ Target( - expr='(100* sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(100 * (sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0', refId='A', ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + showValue='true', ), ]) @staticmethod def errors(collapse: bool) -> Row: return Row( - title="Error (System vs user)", + title="Error (System vs User)", collapse=collapse, panels=[ Graph( - title="User errors", + title="User error rate", dataSource=DATASOURCE, targets=[ Target( @@ -216,7 +193,7 @@ def errors(collapse: bool) -> Row: yAxes=single_y_axis(format=SHORT_FORMAT), ), Graph( - title="System errors", + title="System error rate", dataSource=DATASOURCE, targets=[ Target( @@ -280,7 +257,7 @@ def create_all_rows(interval: int) -> typing.List[Row]: domain_template, wf_template, ]), - description="Flyte User Dashboard. This is great to get a birds-eye and drill down view of executions in your Flyte cluster. Useful for the user.", + description="Flyte User Dashboard. It's designed to give an overview of execution status and resource consumption.", ).auto_panel_ids() if __name__ == "__main__":