-
Notifications
You must be signed in to change notification settings - Fork 5
/
nvidia.yml
248 lines (214 loc) · 8.55 KB
/
nvidia.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
---
# Play: prepares hosts in the `homelab` group for NVIDIA GPU workloads
# (driver, container toolkit, NVENC/NvFBC patches, dcgm-exporter container).
- name: Install and configure NVIDIA GPU support
  hosts: homelab
  vars:
    application: nvidia
    # Driver release to install. Pick one from
    # https://www.nvidia.com/en-us/drivers/unix/
    # and check the compatibility table at
    # https://github.com/keylase/nvidia-patch
    nvidia_driver_version: "550.120"
    docker_network: "{{ networks.pub }}"
  tasks:
# Make sure the config directory exists; later tasks put the helper script,
# the driver installer and the patch checkout inside it.
- name: Create config folder
  ansible.builtin.file:
    state: directory
    path: "{{ config_directory }}"
    mode: "0771"
    owner: "{{ common_user }}"
    group: "{{ common_group }}"
# Install a helper script that acts on containers whose HostConfig has
# DeviceRequests set. It takes a single argument:
#   STOP    - stop running containers that request devices
#   START   - start every container (running or not) that requests devices
#   MONITOR - restart running containers in which `nvidia-smi` fails
# Any other argument makes the script do nothing.
- name: Copy Docker driver check and restart script
  ansible.builtin.copy:
    dest: "{{ config_directory }}/docker-container-check-restart.sh"
    mode: "0755"
    content: |
      #!/usr/bin/env bash
      FUNCTION=$1
      if [ "$FUNCTION" == "STOP" ]; then
      for i in $(docker ps -q); do
      if docker inspect "$i" | jq -re '.[].HostConfig.DeviceRequests?' > /dev/null; then
      docker stop "$i" > /dev/null;
      fi
      done
      fi
      if [ "$FUNCTION" == "START" ]; then
      for i in $(docker ps -a -q); do
      if docker inspect "$i" | jq -re '.[].HostConfig.DeviceRequests?' > /dev/null; then
      docker start "$i" > /dev/null;
      fi
      done
      fi
      if [ "$FUNCTION" == "MONITOR" ]; then
      for i in $(docker ps -q); do
      if docker inspect "$i" | jq -e '.[].State.Status == "running" and .[].HostConfig.DeviceRequests?' > /dev/null; then
      if ! docker exec "$i" nvidia-smi > /dev/null; then
      docker restart "$i" > /dev/null;
      fi
      fi
      done
      fi
# Refresh the package index unless it is younger than 600 seconds.
# The refresh is never reported as a change.
- name: Update apt cache
  changed_when: false
  ansible.builtin.apt:
    cache_valid_time: 600
    update_cache: true
# Write a modprobe blacklist entry for nouveau. The result is registered so
# the initramfs rebuild and the reboot below run only when the file changed.
- name: Disable nouveau
  register: __disable_nouveau
  ansible.builtin.copy:
    dest: /etc/modprobe.d/blacklist-nouveau.conf
    mode: "0644"
    content: |
      blacklist nouveau
      options nouveau modeset=0
# Rebuild the initramfs, only when the blacklist file was just changed.
- name: Regenerate the kernel initramfs
  when: __disable_nouveau.changed
  changed_when: true
  ansible.builtin.command:
    cmd: update-initramfs -u
# Reboot (waiting up to 360 seconds) only when the blacklist file changed.
- name: Reboot to disable nouveau
  when: __disable_nouveau.changed
  ansible.builtin.reboot:
    reboot_timeout: 360
# Install the packages needed before running the driver installer.
# The install is retried up to 5 times, 5 seconds apart, until it succeeds.
- name: Install prerequisites
  register: __install_nvidia_prerequisites
  retries: 5
  delay: 5
  until: __install_nvidia_prerequisites is success
  ansible.builtin.apt:
    state: present
    name:
      - pkg-config
      - xorg-dev
      - libglvnd-dev
      - dkms
      - libvulkan1
      - nvtop
# Ensure /usr/lib/xorg/modules exists, owned by the configured root
# user and group.
- name: Create xorg modules folder
  ansible.builtin.file:
    state: directory
    path: /usr/lib/xorg/modules
    mode: "0755"
    owner: "{{ common_root_id }}"
    group: "{{ common_root_group }}"
# Query GPU 0 for the installed driver version. The task never fails and
# never reports a change; later tasks compare its stdout with
# `nvidia_driver_version` to decide whether an install is needed.
- name: Get current driver version
  register: _nvidia_installed_version
  failed_when: false
  changed_when: false
  ansible.builtin.command:
    cmd: nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0
# https://www.nvidia.com/en-us/drivers/unix/
# Fetch the installer only when the reported driver version differs from
# the wanted one. The download is retried (5 attempts, 5 seconds apart) like
# the other network-dependent tasks in this play, so a transient failure
# does not abort the run.
- name: Download NVIDIA driver
  ansible.builtin.get_url:
    url: "https://international.download.nvidia.com/XFree86/Linux-x86_64/{{ nvidia_driver_version }}/NVIDIA-Linux-x86_64-{{ nvidia_driver_version }}.run"
    dest: "{{ config_directory }}/{{ nvidia_driver_version }}.run"
    mode: "0755"
  register: __download_nvidia_drivers
  until: __download_nvidia_drivers is success
  retries: 5
  delay: 5
  when: _nvidia_installed_version.stdout != nvidia_driver_version
# Run the helper script in STOP mode before the driver is replaced.
# Skipped when the installed version already matches.
- name: Stop containers using NVIDIA GPU
  when: nvidia_driver_version != _nvidia_installed_version.stdout
  changed_when: true
  ansible.builtin.command:
    cmd: "{{ config_directory }}/docker-container-check-restart.sh STOP"
# Run the downloaded installer with --silent when the version differs.
# The registered result later decides between a Docker restart and a reboot.
- name: Install NVIDIA driver
  when: nvidia_driver_version != _nvidia_installed_version.stdout
  register: __install_nvidia_drivers
  changed_when: true
  ansible.builtin.command:
    cmd: "{{ config_directory }}/{{ nvidia_driver_version }}.run --silent"
# Create the systemd drop-in directory for nvidia-persistenced.
# `state: directory` already creates missing parent directories, so
# `recurse` is not needed. Ownership and mode are set explicitly, matching
# the other root-owned directory this play creates.
- name: Create persistenced override directory
  ansible.builtin.file:
    path: /etc/systemd/system/nvidia-persistenced.service.d/
    state: directory
    owner: "{{ common_root_id }}"
    group: "{{ common_root_group }}"
    mode: "0755"
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker
# Download the repository signing key to the path that the apt source
# entries reference through `signed-by`. Retried up to 5 times, 5 seconds
# apart.
- name: Add NVIDIA's Docker GPG key
  register: result
  retries: 5
  delay: 5
  until: result is success
  ansible.builtin.get_url:
    url: https://nvidia.github.io/nvidia-docker/gpgkey
    dest: "/etc/apt/trusted.gpg.d/{{ application }}"
    mode: "0644"
    owner: "{{ common_root_id }}"
    group: "{{ common_root_group }}"
# modified from https://nvidia.github.io/nvidia-docker/ubuntu22.04/nvidia-docker.list
# Write the apt source list. The lines starting with `#deb` are part of the
# file content (disabled experimental repositories), not YAML comments.
# The result is registered so the cache refresh below runs only on change.
- name: Add NVIDIA's Docker Repository to APT
  register: __apt_repository
  ansible.builtin.copy:
    dest: "/etc/apt/sources.list.d/{{ application }}.list"
    mode: "0644"
    owner: "{{ common_root_id }}"
    group: "{{ common_root_group }}"
    content: |
      deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/{{ application }}] https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/$(ARCH) /
      #deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/{{ application }}] https://nvidia.github.io/libnvidia-container/experimental/ubuntu18.04/$(ARCH) /
      deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/{{ application }}] https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu18.04/$(ARCH) /
      #deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/{{ application }}] https://nvidia.github.io/nvidia-container-runtime/experimental/ubuntu18.04/$(ARCH) /
      deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/{{ application }}] https://nvidia.github.io/nvidia-docker/ubuntu18.04/$(ARCH) /
# Refresh the package index only when the NVIDIA source list was just
# written. Retried up to 5 times, 5 seconds apart.
- name: Update apt cache
  when: __apt_repository.changed
  register: result
  retries: 5
  delay: 5
  until: result is success
  ansible.builtin.apt:
    update_cache: true
# Install the toolkit package, retrying up to 5 times, 5 seconds apart.
# The registered result feeds the Docker restart condition below.
- name: Install NVIDIA Container Toolkit
  register: __install_nvidia_toolkit
  retries: 5
  delay: 5
  until: __install_nvidia_toolkit is success
  ansible.builtin.apt:
    state: present
    name: nvidia-container-toolkit
# Restart Docker when the toolkit was just installed, unless the driver
# was installed in this run too (that case is covered by the reboot task).
- name: Restart Docker to complete installation
  when:
    - __install_nvidia_toolkit.changed
    - not __install_nvidia_drivers.changed
  ansible.builtin.service:
    state: restarted
    name: docker
# Reboot (waiting up to 360 seconds) only when the driver installer ran.
- name: Reboot to load NVIDIA drivers
  when: __install_nvidia_drivers.changed
  ansible.builtin.reboot:
    reboot_timeout: 360
# Check out (and update on later runs) the keylase/nvidia-patch repository
# into the config directory. No `version` is given, so no revision is pinned.
- name: Clone NVIDIA patch repo
  ansible.builtin.git:
    dest: "{{ config_directory }}/nvidia-patch"
    repo: "https://github.com/keylase/nvidia-patch"
    update: true
# Run patch.sh from the nvidia-patch checkout on every play run.
# A change is reported only when the script prints 'Patched!'.
# The task name previously read "NVENV", a typo for NVENC.
- name: Run NVENC video encoding patch
  ansible.builtin.command: "bash {{ config_directory }}/nvidia-patch/patch.sh"
  register: _nvidia_patch_video_encoding
  changed_when: "'Patched!' in _nvidia_patch_video_encoding.stdout"
# Run patch-fbc.sh from the nvidia-patch checkout on every play run.
# A change is reported only when the script prints 'Patched!'.
- name: Run NvFBC unlock patch
  register: _nvidia_patch_nvfbc_unlock
  changed_when: "'Patched!' in _nvidia_patch_nvfbc_unlock.stdout"
  ansible.builtin.command:
    cmd: "bash {{ config_directory }}/nvidia-patch/patch-fbc.sh"
# Run the helper script in START mode, under the same version-mismatch
# condition that triggered the earlier STOP.
- name: Start containers using NVIDIA GPU
  when: nvidia_driver_version != _nvidia_installed_version.stdout
  changed_when: true
  ansible.builtin.command:
    cmd: "{{ config_directory }}/docker-container-check-restart.sh START"
# Hand the dcgm-exporter definition to the `docker_container` role.
# NOTE(review): the role is not visible here; how it interprets these
# variables (including `metrics`) should be confirmed against the role.
- name: Create dcgm-exporter container
  vars:
    name: nvidia-dcgm-exporter
    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.8-3.6.0-ubuntu22.04
    capabilities:
      - sys_admin
    device_requests:
      - driver: nvidia
        count: -1
        device_ids: []
        options: {}
        # A list of lists: one inner list holding four capability names.
        capabilities:
          - [gpu, video, compute, utility]
    metrics:
      - port: 9400
  ansible.builtin.include_role:
    name: docker_container
# Schedule the helper script in MONITOR mode every five minutes.
- name: Automatically restart containers when driver is not found
  ansible.builtin.cron:
    name: "nvidia gpu driver docker check"
    job: "{{ config_directory }}/docker-container-check-restart.sh MONITOR"
    minute: "*/5"
# The NVIDIA driver install can mess with the Docker daemon settings.
# Rerun the Docker playbook to be safe, but only when the driver installer
# actually ran.
- name: Reapply Docker configuration after a driver install
  ansible.builtin.import_playbook: docker.yml
  # NOTE(review): docker.yml is not visible here. If its plays target hosts
  # where the driver task never ran, the variable is not registered on them;
  # default(false) keeps the condition evaluable instead of erroring.
  when: __install_nvidia_drivers.changed | default(false)