forked from henry0312/datadog-gpustat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpu_stat.py
32 lines (27 loc) · 1.21 KB
/
gpu_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# -*- coding: utf-8 -*-
# See:
# - https://docs.datadoghq.com/agent/agent_checks/
# - https://docs.datadoghq.com/ja/guides/agent_checks/
import time
from datadog_checks.checks import AgentCheck
from gpustat import GPUStatCollection
class GpuStat(AgentCheck):
    """Agent check that submits a fixed set of gauges for every GPU gpustat reports."""

    def check(self, instance):
        """Query gpustat once and submit one gauge per tracked field per GPU.

        Each gauge is tagged ``gpu:<index>``, where ``<index>`` comes from the
        ``index`` field of that GPU's entry. Exceptions raised while querying
        or submitting are caught here and reported as an event carrying the
        exception text instead of being propagated to the caller.

        Args:
            instance: Not used by this check.
        """
        # (metric name, key in the gpustat entry), listed in submission order.
        gauges = (
            ('gpu.memory.used', 'memory.used'),
            ('gpu.memory.total', 'memory.total'),
            ('gpu.utilization', 'utilization.gpu'),
            ('gpu.temperature', 'temperature.gpu'),
            ('gpu.power.draw', 'power.draw'),
            ('gpu.enforced.power.limit', 'enforced.power.limit'),
        )
        try:
            for device in GPUStatCollection.new_query().gpus:
                stats = device.entry
                device_tags = ['gpu:{}'.format(stats['index'])]
                for metric_name, stats_key in gauges:
                    self.gauge(metric_name, stats[stats_key], tags=device_tags)
        except Exception as err:
            # Best effort: the first failure stops the remaining submissions
            # for this run and is surfaced as an event rather than re-raised.
            failure = {
                'timestamp': int(time.time()),
                'event_type': 'gpu_stat',
                'msg_title': 'Error in gpu stat',
                'msg_text': str(err),
            }
            self.event(failure)