Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update Grafana dashboard to reflect throttle metrics rename #147

Merged
merged 3 commits into from
Sep 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 30 additions & 30 deletions METRICS.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,36 +126,36 @@ nvidia_smi_clocks_max_memory_clock_hz{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa
# HELP nvidia_smi_clocks_max_sm_clock_hz clocks.max.sm [MHz]
# TYPE nvidia_smi_clocks_max_sm_clock_hz gauge
nvidia_smi_clocks_max_sm_clock_hz{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 2.28e+09
# HELP nvidia_smi_clocks_throttle_reasons_active clocks_throttle_reasons.active
# TYPE nvidia_smi_clocks_throttle_reasons_active gauge
nvidia_smi_clocks_throttle_reasons_active{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 4
# HELP nvidia_smi_clocks_throttle_reasons_applications_clocks_setting clocks_throttle_reasons.applications_clocks_setting
# TYPE nvidia_smi_clocks_throttle_reasons_applications_clocks_setting gauge
nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_throttle_reasons_gpu_idle clocks_throttle_reasons.gpu_idle
# TYPE nvidia_smi_clocks_throttle_reasons_gpu_idle gauge
nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown clocks_throttle_reasons.hw_power_brake_slowdown
# TYPE nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown gauge
nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_throttle_reasons_hw_slowdown clocks_throttle_reasons.hw_slowdown
# TYPE nvidia_smi_clocks_throttle_reasons_hw_slowdown gauge
nvidia_smi_clocks_throttle_reasons_hw_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown clocks_throttle_reasons.hw_thermal_slowdown
# TYPE nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown gauge
nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_throttle_reasons_supported clocks_throttle_reasons.supported
# TYPE nvidia_smi_clocks_throttle_reasons_supported gauge
nvidia_smi_clocks_throttle_reasons_supported{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 511
# HELP nvidia_smi_clocks_throttle_reasons_sw_power_cap clocks_throttle_reasons.sw_power_cap
# TYPE nvidia_smi_clocks_throttle_reasons_sw_power_cap gauge
nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 1
# HELP nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown clocks_throttle_reasons.sw_thermal_slowdown
# TYPE nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown gauge
nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_throttle_reasons_sync_boost clocks_throttle_reasons.sync_boost
# TYPE nvidia_smi_clocks_throttle_reasons_sync_boost gauge
nvidia_smi_clocks_throttle_reasons_sync_boost{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_active clocks_event_reasons.active
# TYPE nvidia_smi_clocks_event_reasons_active gauge
nvidia_smi_clocks_event_reasons_active{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 4
# HELP nvidia_smi_clocks_event_reasons_applications_clocks_setting clocks_event_reasons.applications_clocks_setting
# TYPE nvidia_smi_clocks_event_reasons_applications_clocks_setting gauge
nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_gpu_idle clocks_event_reasons.gpu_idle
# TYPE nvidia_smi_clocks_event_reasons_gpu_idle gauge
nvidia_smi_clocks_event_reasons_gpu_idle{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown clocks_event_reasons.hw_power_brake_slowdown
# TYPE nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown gauge
nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_hw_slowdown clocks_event_reasons.hw_slowdown
# TYPE nvidia_smi_clocks_event_reasons_hw_slowdown gauge
nvidia_smi_clocks_event_reasons_hw_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_hw_thermal_slowdown clocks_event_reasons.hw_thermal_slowdown
# TYPE nvidia_smi_clocks_event_reasons_hw_thermal_slowdown gauge
nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_supported clocks_event_reasons.supported
# TYPE nvidia_smi_clocks_event_reasons_supported gauge
nvidia_smi_clocks_event_reasons_supported{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 511
# HELP nvidia_smi_clocks_event_reasons_sw_power_cap clocks_event_reasons.sw_power_cap
# TYPE nvidia_smi_clocks_event_reasons_sw_power_cap gauge
nvidia_smi_clocks_event_reasons_sw_power_cap{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 1
# HELP nvidia_smi_clocks_event_reasons_sw_thermal_slowdown clocks_event_reasons.sw_thermal_slowdown
# TYPE nvidia_smi_clocks_event_reasons_sw_thermal_slowdown gauge
nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_clocks_event_reasons_sync_boost clocks_event_reasons.sync_boost
# TYPE nvidia_smi_clocks_event_reasons_sync_boost gauge
nvidia_smi_clocks_event_reasons_sync_boost{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
# HELP nvidia_smi_compute_mode compute_mode
# TYPE nvidia_smi_compute_mode gauge
nvidia_smi_compute_mode{uuid="df6e7a7c-7314-46f8-abc4-b88b36dcf3aa"} 0
Expand Down
52 changes: 31 additions & 21 deletions grafana/dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "11.1.0"
"version": "11.2.0"
},
{
"type": "datasource",
Expand Down Expand Up @@ -123,7 +123,7 @@
"textMode": "name",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -201,7 +201,7 @@
"textMode": "value",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -276,7 +276,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -351,7 +351,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -426,7 +426,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -501,7 +501,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -536,6 +536,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -672,7 +673,7 @@
"textMode": "name",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -741,7 +742,7 @@
"textMode": "name",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -823,15 +824,15 @@
"text": {},
"valueMode": "color"
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}",
"instant": false,
"interval": "",
"legendFormat": "Idle",
Expand All @@ -843,7 +844,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "HW Thermal Slowdown",
Expand All @@ -855,7 +856,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "SW Power Cap",
Expand All @@ -867,7 +868,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "App Clocks Setting",
Expand All @@ -879,7 +880,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "HW Power Brake",
Expand All @@ -891,7 +892,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "SW Thermal Slowdown",
Expand All @@ -903,7 +904,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}",
"expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "Sync Boost",
Expand Down Expand Up @@ -971,7 +972,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -1046,7 +1047,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -1121,7 +1122,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -1196,7 +1197,7 @@
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.1.0",
"pluginVersion": "11.2.0",
"targets": [
{
"datasource": {
Expand Down Expand Up @@ -1231,6 +1232,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1334,6 +1336,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1432,6 +1435,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1535,6 +1539,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1633,6 +1638,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1736,6 +1742,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1835,6 +1842,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -1934,6 +1942,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down Expand Up @@ -2033,6 +2042,7 @@
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
Expand Down