diff --git a/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/README.md b/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/README.md index 091d6c254..eb9c58211 100644 --- a/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/README.md +++ b/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/README.md @@ -4,11 +4,11 @@ This template is for Zabbix to monitor multiple NVidia GPUs -This template uses only one user parameter, receives all parameters in one request and requires no additional scripts +This template uses two user parameters. One to search for graphic cards and one to retrieve metrics from each card ### Features -* Low-level discovery of all the graphics Nvidia cards +* Low-level discovery of all the graphics NVidia cards * Prototype items and triggers for the most important parameters * General status panel @@ -18,7 +18,7 @@ This template uses only one user parameter, receives all parameters in one reque * Restart the zabbix-agent * Import template zbx_NVidia_GPUs.yaml and link this template to the monitored host -This template is set up and tested on a server with nine Nvidia graphics cards. Comments, suggestions and help to improve this template are welcome +This template is set up and tested on a server with nine NVidia graphics cards. Comments, suggestions and help to improve this template are welcome ## Author @@ -36,29 +36,31 @@ There are no template links in this template. |Name|Description|Type|Key and additional info| |----|-----------|----|----| -|GPU Data|

Data collection by GPUs

|`SNMP agent`|gpu.data

Update: 1m

| +|GPU Card|

GPU detection

|`Dependent item`|gpu.id| ## Items collected Common Items |Name|Description|Type|Key and additional info| |----|-----------|----|----| -|GPU Count|

Number of GPUs detected

|`Dependent items`|gpu.count| -|GPU Driver Version|

GPU driver version

|`Dependent items`|gpu.driver_version| -|GPU Power Total|

Power consumption of all GPUs

|`Dependent items`|gpu.power_total| -|GPUs Maximum Temperature|

Temperature of the hottest GPU

|`Dependent items`|gpu.temp_max| -|GPU Utilization Total|

Total GPU utilisation

|`Dependent items`|gpu.utilization_total| - -Items for each GPU found +|GPU Discovery|

GPU detection data

|`Zabbix agent`|gpu.discovery

Update: 1h

| +|GPU Count|

Number of GPUs detected

|`Dependent item`|gpu.count| +|GPU Driver Version|

GPU driver version

|`Dependent item`|gpu.driver_version| +|GPU Power Total|

Power consumption of all GPUs

|`Calculated item`|gpu.power_total

Update: 1m

| +|GPUs Maximum Temperature|

Temperature of the hottest GPU

|`Calculated item`|gpu.temp_max

Update: 1m

| +|GPU Utilization Total|

Total GPU utilisation

|`Calculated item`|gpu.utilization_total

Update: 1m

| + +Item prototypes for each GPU found |Name|Description|Type|Key and additional info| |----|-----------|----|----| -|GPU Power|Power consumption of the GPU|`Dependent items`|gpu.power| -|GPU Total Memory|GPU memory capacity|`Dependent items`|gpu.mtotal| -|GPU Used Memory|The amount of GPU memory used|`Dependent items`|gpu.mused| -|GPU Free Memory|Amount of free GPU memory|`Dependent items`|gpu.mfree| -|GPU Utilisation|GPU utilisation|`Dependent items`|gpu.utilization| -|GPU Temperature|GPU Temperature|`Dependent items`|gpu.temperature| -|GPU Fan Speed|GPU Fan Speed|`Dependent items`|gpu.fan| +|GPU {#GPUID} Data|

Data collection

|`Zabbix agent`|gpu.card[{#GPUID}]

Update: 1m

| +|GPU {#GPUID} Power - {#NAME}|

Power consumption of the GPU

|`Dependent item`|gpu.power.[{#GPUID}]| +|GPU {#GPUID} Memory Total - {#NAME}|

GPU memory capacity

|`Dependent item`|gpu.mtotal.[{#GPUID}]| +|GPU {#GPUID} Memory Used - {#NAME}|

The amount of GPU memory used

|`Dependent item`|gpu.mused.[{#GPUID}]| +|GPU {#GPUID} Memory Free - {#NAME}|

Amount of free GPU memory

|`Dependent item`|gpu.mfree.[{#GPUID}]| +|GPU {#GPUID} Utilization - {#NAME}|

GPU utilisation

|`Dependent item`|gpu.utilization.[{#GPUID}]| +|GPU {#GPUID} Temperature - {#NAME}|

GPU Temperature

|`Dependent item`|gpu.temperature.[{#GPUID}]| +|GPU {#GPUID} Fan Speed - {#NAME}|

GPU Fan Speed

|`Dependent item`|gpu.fan.[{#GPUID}]| ## Triggers @@ -68,4 +70,4 @@ Items for each GPU found |GPU {#GPUID} Temperature is extremely high|The temperature of the GPU is very high. Possibility of failure|last(/Nvidia Multi-GPU/gpu.temperature.[{#GPUID}])>=80|`High`| |GPU {#GPUID} Temperature is high|Temperature of the graphics processor is high|

last(/Nvidia Multi-GPU/gpu.temperature.[{#GPUID}])>=65

**Dependencies**: GPU {#GPUID} Temperature is extremely high

|`Average`| |Problem with the fan|Fan does not spin when GPU is hot|last(/Nvidia Multi-GPU/gpu.fan.[{#GPUID}])=0 and last(/Nvidia Multi-GPU/gpu.temperature.[{#GPUID}])>60|`High`| -|Data retrieval error|Problem with data retrieval|nodata(/Nvidia Multi-GPU/gpu.driver_version,3m)=1|`Disaster`| +|Error receiving data for GPU {#GPUID}|Problem with data retrieval|nodata(/Nvidia Multi-GPU/gpu.utilization.[{#GPUID}],3m)=1|`Disaster`| diff --git a/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/nvidia_gpus.conf b/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/nvidia_gpus.conf index a50a66b41..e8721b75b 100644 --- a/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/nvidia_gpus.conf +++ b/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/nvidia_gpus.conf @@ -1 +1,2 @@ -UserParameter=gpu.data,nvidia-smi --query-gpu=gpu_bus_id,temperature.gpu,memory.total,memory.used,memory.free,fan.speed,utilization.gpu,power.draw,name,driver_version --format=csv,noheader,nounits | sed -e 's/, /,/g' +UserParameter=gpu.discovery,nvidia-smi --query-gpu=gpu_bus_id,name,driver_version --format=csv,noheader,nounits | sed -e 's/, /,/g' +UserParameter=gpu.card[*],nvidia-smi --query-gpu=temperature.gpu,memory.total,memory.used,memory.free,fan.speed,utilization.gpu,power.draw --format=csv,noheader,nounits -i $1 | sed -e 's/, /,/g' diff --git a/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/zbx_NVidia_GPUs.yaml b/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/zbx_NVidia_GPUs.yaml index c2f389667..bbf86bf2a 100644 --- a/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/zbx_NVidia_GPUs.yaml +++ b/Server_Hardware/Other/template_nvidia-smi_multigpu/6.0/zbx_NVidia_GPUs.yaml @@ -4,13 +4,13 @@ zabbix_export: - uuid: e960332b3f6c46a1956486d4f3f99fce name: 'Templates/Server hardware' templates: - - uuid: 0d0a24bfd8c046038be50e02b0203765 + - uuid: 9f5f658638eb43af88ebf74ce8d2acf3 template: 'Nvidia Multi-GPU' name: 'Nvidia Multi-GPU' groups: - name: 'Templates/Server 
hardware' items: - - uuid: a6099bd5ed6a49b8a0b8d76064931161 + - uuid: 6ca47609e7564842bb51f1818bdad56c name: 'GPU Count' type: DEPENDENT key: gpu.count @@ -23,25 +23,23 @@ zabbix_export: parameters: - '$[*].1.length()' master_item: - key: gpu.data - - uuid: 650b6faad048440da328b100a72f496c - name: 'GPU Data' - key: gpu.data + key: gpu.discovery + - uuid: 6a68154deed448699a39aa4b6e522cd7 + name: 'GPU Discovery' + key: gpu.discovery + delay: 1h history: '0' trends: '0' value_type: TEXT - description: 'Data collection by GPUs' + description: 'GPUs detection' preprocessing: - - type: NOT_MATCHES_REGEX - parameters: - - '^.*Error.*$' - error_handler: DISCARD_VALUE - type: CSV_TO_JSON parameters: - ',' - '"' - '0' - - uuid: 8d3fe49bb34048afa8ed79502db34cf1 + error_handler: DISCARD_VALUE + - uuid: 59985a010958410699e80f3dc3f81069 name: 'GPU Driver Version' type: DEPENDENT key: gpu.driver_version @@ -54,78 +52,67 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[0].10' + - '$[0].3' + error_handler: DISCARD_VALUE master_item: - key: gpu.data + key: gpu.discovery triggers: - - uuid: 2d233106186349fb8cec84eeb79ca68e - expression: 'nodata(/Nvidia Multi-GPU/gpu.driver_version,3m)=1' - name: 'Data retrieval error' - priority: DISASTER - description: | - Problem with data retrieval - To diagnose the problem, run the nvidia-smi command on the host - - uuid: 226db5338016428ba86d2a8fa5f31da2 + - uuid: d529da30d00b4ffa8e72ea11085710d9 expression: 'change(/Nvidia Multi-GPU/gpu.driver_version)<>0' name: 'Driver version changed' priority: INFO description: 'The driver version has changed' - - uuid: 2342ff31e369484c90d63210c58eb77e + - uuid: d6926fcc2b6449ada019f1b96ff8be8a name: 'GPU Power Total' - type: DEPENDENT + type: CALCULATED key: gpu.power_total - delay: '0' history: 30d trends: 90d value_type: FLOAT units: W + params: 'sum(last_foreach(/*/gpu.power.[*]))' description: 'Power consumption of all GPUs' - preprocessing: - - type: JSONPATH - parameters: - - 
'$[*].8.sum()' - master_item: - key: gpu.data - - uuid: 8401ed7ac3a249ea92515b41fd7cc68a + - uuid: 5fb5abcae9e04feeb5651ce367d577b6 name: 'GPUs Maximum Temperature' - type: DEPENDENT + type: CALCULATED key: gpu.temp_max - delay: '0' history: 30d trends: 90d units: °C + params: 'max(last_foreach(/*/gpu.temperature.[*]))' description: 'Temperature of the hottest GPU' - preprocessing: - - type: JSONPATH - parameters: - - '$[*].2.max()' - master_item: - key: gpu.data - - uuid: 1d16ecd4f52b464a812dfc500a2a9f34 + - uuid: 4f63285a711045eab4d62c46127cc1da name: 'GPU Utilization Total' - type: DEPENDENT + type: CALCULATED key: gpu.utilization_total - delay: '0' history: 30d trends: 90d value_type: FLOAT units: '%' + params: 'avg(last_foreach(/*/gpu.utilization.[*]))' description: 'Total GPU utilisation' - preprocessing: - - type: JSONPATH - parameters: - - '$[*].7.avg()' - master_item: - key: gpu.data discovery_rules: - - uuid: f9f116239737447b8ddfa2c558d8e583 + - uuid: 10011d82dcad4162b7ca36a66dd4d618 name: 'GPU Card' type: DEPENDENT key: gpu.id delay: '0' lifetime: 1d item_prototypes: - - uuid: bcb70b06d9db454bae4fc0aa0d01f210 + - uuid: 082be4addefa43e98ded2ab15be17d38 + name: 'GPU {#GPUID} Data' + key: 'gpu.card[{#GPUID}]' + history: '0' + trends: '0' + value_type: TEXT + description: 'GPU Data Collection' + preprocessing: + - type: CSV_TO_JSON + parameters: + - '' + - '' + - '0' + - uuid: 4a2648721a0048c7a900f7a8f499d589 name: 'GPU {#GPUID} Fan Speed - {#NAME}' type: DEPENDENT key: 'gpu.fan.[{#GPUID}]' @@ -137,11 +124,11 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.1 == ''{#GPUID}'')].6.first()' + - '$[0].5' error_handler: DISCARD_VALUE master_item: - key: gpu.data - - uuid: a0601fa235114c8da7854df368dcbb91 + key: 'gpu.card[{#GPUID}]' + - uuid: e751483a4716408a9d10bb24564873a6 name: 'GPU {#GPUID} Memory Free - {#NAME}' type: DEPENDENT key: 'gpu.mfree.[{#GPUID}]' @@ -153,14 +140,14 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: 
- - '$[?(@.1 == ''{#GPUID}'')].5.first()' + - '$[0].4' error_handler: DISCARD_VALUE - type: MULTIPLIER parameters: - '1048576' master_item: - key: gpu.data - - uuid: da856164b416405591e4eefacb8d081a + key: 'gpu.card[{#GPUID}]' + - uuid: 5bb28babb98e4469bde899cb22977277 name: 'GPU {#GPUID} Memory Total - {#NAME}' type: DEPENDENT key: 'gpu.mtotal.[{#GPUID}]' @@ -172,14 +159,14 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.1 == ''{#GPUID}'')].3.first()' + - '$[0].2' error_handler: DISCARD_VALUE - type: MULTIPLIER parameters: - '1048576' master_item: - key: gpu.data - - uuid: 7d2c35e7c6b64ffc8acd4e49bb4fff4c + key: 'gpu.card[{#GPUID}]' + - uuid: da7fbbcfc1da46288b879640f49658ec name: 'GPU {#GPUID} Memory Used - {#NAME}' type: DEPENDENT key: 'gpu.mused.[{#GPUID}]' @@ -191,14 +178,14 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.1 == ''{#GPUID}'')].4.first()' + - '$[0].3' error_handler: DISCARD_VALUE - type: MULTIPLIER parameters: - '1048576' master_item: - key: gpu.data - - uuid: 3667831f6c834e3f8b43f5ea86d2bb65 + key: 'gpu.card[{#GPUID}]' + - uuid: 64e478f5ac654022bcd28b2ef4da208b name: 'GPU {#GPUID} Power - {#NAME}' type: DEPENDENT key: 'gpu.power.[{#GPUID}]' @@ -211,11 +198,11 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.1 == ''{#GPUID}'')].8.first()' + - '$[0].7' error_handler: DISCARD_VALUE master_item: - key: gpu.data - - uuid: 242afcbf5fb34b05bd0043c2cb703f2e + key: 'gpu.card[{#GPUID}]' + - uuid: f9d3d0b8811347ca8f415d08c48aa564 name: 'GPU {#GPUID} Temperature - {#NAME}' type: DEPENDENT key: 'gpu.temperature.[{#GPUID}]' @@ -228,17 +215,17 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.1 == ''{#GPUID}'')].2.first()' + - '$[0].1' error_handler: DISCARD_VALUE master_item: - key: gpu.data + key: 'gpu.card[{#GPUID}]' trigger_prototypes: - - uuid: 5929b2f1ceba49b6ba6bfdd623850c7c + - uuid: 7b25e87f388c41a5b3b9605cc02ac7fd expression: 'last(/Nvidia 
Multi-GPU/gpu.temperature.[{#GPUID}])>=80' name: 'GPU {#GPUID} Temperature is extremely high' priority: HIGH description: 'The temperature of the GPU is very high. Possibility of failure' - - uuid: 9beb7dd56a644d63b48fd7604e57a233 + - uuid: ffbe376213334c8f9bbbf3382b37e872 expression: 'last(/Nvidia Multi-GPU/gpu.temperature.[{#GPUID}])>=65' name: 'GPU {#GPUID} Temperature is high' priority: AVERAGE @@ -246,7 +233,7 @@ zabbix_export: dependencies: - name: 'GPU {#GPUID} Temperature is extremely high' expression: 'last(/Nvidia Multi-GPU/gpu.temperature.[{#GPUID}])>=80' - - uuid: a234ea02aba3420a84e55ba62185572d + - uuid: 64b4543c0bcf4e08b3c7902e2952c011 name: 'GPU {#GPUID} Utilization - {#NAME}' type: DEPENDENT key: 'gpu.utilization.[{#GPUID}]' @@ -258,18 +245,26 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.1 == ''{#GPUID}'')].7.first()' + - '$[0].6' error_handler: DISCARD_VALUE master_item: - key: gpu.data + key: 'gpu.card[{#GPUID}]' + trigger_prototypes: + - uuid: 20784c31e14d40f6bf7903bba23e23e6 + expression: 'nodata(/Nvidia Multi-GPU/gpu.utilization.[{#GPUID}],3m)=1' + name: 'Error receiving data for GPU {#GPUID}' + priority: DISASTER + description: | + Problem with data retrieval + To diagnose the problem, run the nvidia-smi command on the host trigger_prototypes: - - uuid: 3bc18bab7d744b9d83de72130289562a + - uuid: c9adbd6229e34983809ba171bac5207a expression: 'last(/Nvidia Multi-GPU/gpu.fan.[{#GPUID}])=0 and last(/Nvidia Multi-GPU/gpu.temperature.[{#GPUID}])>60' name: 'GPU {#GPUID} Problem with the fan' priority: HIGH description: 'Fan does not spin when GPU is hot' graph_prototypes: - - uuid: 0e301af6d1ad469a83d8ff8d215e68bc + - uuid: 69931017386e4dc8b792fc4c877e60ef name: 'GPU {#GPUID} Memory - {#NAME}' show_work_period: 'NO' show_triggers: 'NO' @@ -292,17 +287,17 @@ zabbix_export: host: 'Nvidia Multi-GPU' key: 'gpu.mfree.[{#GPUID}]' master_item: - key: gpu.data + key: gpu.discovery lld_macro_paths: - lld_macro: '{#NAME}' - path: 
$.9 + path: $.2 - lld_macro: '{#GPUID}' path: $.1 tags: - - tag: Application - value: Nvidia + - tag: application + value: nvidia dashboards: - - uuid: 04cb54bfd57042b98f3524f8d421749f + - uuid: f7a3512d10254206b41830ba4e683db6 name: 'GPU Panel' pages: - name: GPU