# compose.yaml
# GPU metrics stack: three services joined by the `gpu_metrics` bridge network.
# Every ${...} placeholder is resolved by Compose variable interpolation, so the
# variables must be supplied through the shell environment or an `.env` file.
services:
  # NVIDIA DCGM exporter; its container port 9400 is published on the host.
  dcgm_exporter:
    image: nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_IMAGE_TAG}
    deploy:
      resources:
        reservations:
          # Reserve every device exposed by the `nvidia` driver for this
          # container, requesting the `gpu` capability.
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
    environment:
      # NOTE(review): presumably stops the exporter from attaching a hostname
      # label to its metrics - confirm against the DCGM exporter docs.
      - DCGM_EXPORTER_NO_HOSTNAME=1
    cap_add:
      # NOTE(review): presumably needed by DCGM for privileged GPU counters -
      # confirm before removing.
      - SYS_ADMIN
    ports:
      - "${DCGM_EXPORTER_HOST_PORT}:9400"
    networks:
      - gpu_metrics

  # Prometheus server; container port 9090 is published on the host.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_IMAGE_TAG}
    ports:
      - "${PROMETHEUS_HOST_PORT}:9090"
    command:
      # Points at the config file bind-mounted under `volumes` below.
      - --config.file=/etc/prometheus/prometheus.yml
      # TSDB directory; backed by the `prometheus_data` named volume.
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
      - --storage.tsdb.retention.time=${PROMETHEUS_STORAGE_TSDB_RETENTION_TIME}
    restart: always
    volumes:
      # Host-side config file, mounted read-only.
      - ./gpu-metrics-prometheus-config.yaml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    networks:
      - gpu_metrics

  # Grafana; container port 3000 is published on the host.
  grafana:
    image: grafana/grafana:${GRAFANA_IMAGE_TAG}
    ports:
      - "${GRAFANA_HOST_PORT}:3000"
    restart: always
    environment:
      # Admin credentials come from the environment rather than being
      # hard-coded in this file.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      # Provisioning files and dashboards are mounted read-only from the
      # directories next to this compose file.
      - ./grafana_provisioning:/etc/grafana/provisioning:ro
      - ./grafana_dashboards:/etc/dashboards:ro
    networks:
      - gpu_metrics

# Named volumes holding Prometheus and Grafana data outside the containers.
volumes:
  prometheus_data:
    driver: local
  grafana_data:
    driver: local

# Network shared by all three services.
networks:
  gpu_metrics:
    driver: bridge