Skip to content

Commit

Permalink
kmod-5.10-nvidia: move to R535 branch from R470
Browse files Browse the repository at this point in the history
The R470 branch is end of life. In order to keep variants using the 5.10
kernel on a supported NVIDIA driver, this commit moves the kmod package
for 5.10 to build the R535 branch and brings the driver in line with the
other two kernel kmod packages in packaging style.

Signed-off-by: Matthew Yeazel <[email protected]>
  • Loading branch information
yeazelm committed Oct 10, 2024
1 parent 62450eb commit d36b035
Show file tree
Hide file tree
Showing 16 changed files with 499 additions and 182 deletions.
2 changes: 2 additions & 0 deletions packages/kmod-5.10-nvidia/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
NVidiaEULAforAWS.pdf
COPYING
*.rpm
23 changes: 19 additions & 4 deletions packages/kmod-5.10-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,28 @@ url = "https://s3.amazonaws.com/EULA/NVidiaEULAforAWS.pdf"
sha512 = "e1926fe99afc3ab5b2f2744fcd53b4046465aefb2793e2e06c4a19455a3fde895e00af1415ff1a5804c32e6a2ed0657e475de63da6c23a0e9c59feeef52f3f58"

[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.256.02/NVIDIA-Linux-x86_64-470.256.02.run"
sha512 = "a837946dd24d7945c1962a695f1f31965f3ceb6927f52cd08fd51b8db138b7a888bbeab69243f5c8468a7bd7ccd47f5dbdb48a1ca81264866c1ebb7d88628f88"
url = "https://us.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-x86_64-535.183.06.run"
sha512 = "424950ef303ea39499e96f8c90c1e0c83aee12309779d4f335769ef554ad4f7c38e98f69c64b408adc85a7cf51ea600d85222792402b9c6b7941f1af066d2a33"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.256.02/NVIDIA-Linux-aarch64-470.256.02.run"
sha512 = "38eee5933355c34ca816a2ac0fbc4f55c19c20e1322891bfc98cb6b37d99a31218eea9314877ab0e3cf3ac6eb61f9d9d4d09d0af304b689f18b4efa721b65d5c"
url = "https://us.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-aarch64-535.183.06.run"
sha512 = "bb305f1703557461b0a0a29066c304658d9684841104c6f4d9ff44f9db90fee14ae619cd2fe3242823a5fe3a69b168b8174b163740014b15cdef36db88ba2d96"
force-upstream = true

# NVIDIA Fabric Manager 535.183.06 RPM for x86_64.
[[package.metadata.build-package.external-files]]
url = "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/nvidia-fabric-manager-535.183.06-1.x86_64.rpm"
sha512 = "c3d98878363f857b2963665a0e485cb7b1afeaabd0040a970478d00ffb870ab4130ab9dfe1b7a40d1b38734636ebccec39fd1b3fc8c06abc5c07470f749b6025"
force-upstream = true

# NVIDIA Fabric Manager 535.183.06 RPM for aarch64 (served from the "sbsa"
# repository directory).
[[package.metadata.build-package.external-files]]
url = "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/sbsa/nvidia-fabric-manager-535.183.06-1.aarch64.rpm"
sha512 = "6a646cd7ea11e668f7dbe6f6bb22516107a856e3c3755f8693c91d4bed706b8b3667b853f07e84c2d0da4de7ab1107337b6a1493879d75d8c201bfe9da071b32"
force-upstream = true

# COPYING file from the "535" ref of NVIDIA's open-gpu-kernel-modules
# repository.
[[package.metadata.build-package.external-files]]
url = "https://raw.githubusercontent.com/NVIDIA/open-gpu-kernel-modules/535/COPYING"
sha512 = "f9cee68cbb12095af4b4e92d01c210461789ef41c70b64efefd6719d0b88468b7a67a3629c432d4d9304c730b5d1a942228a5bcc74a03ab1c411c77c758cd938"
force-upstream = true

[build-dependencies]
Expand Down
20 changes: 20 additions & 0 deletions packages/kmod-5.10-nvidia/copy-open-gpu-kernel-modules.service.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# One-shot unit that runs `driverdog link-modules` for the two open GPU
# module sets (nvidia-open-gpu and nvidia-open-gpu-copy-only).
[Unit]
Description=Copy open GPU kernel modules
# NOTE(review): PREFIX is a template placeholder (this is a .in file);
# presumably it is substituted at package build time -- confirm.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded would overwrite
# the already linked kernel modules. This doesn't affect the running system,
# since kernel modules are linked early in the boot sequence, but we still
# disable manual restarts to prevent unnecessary kernel module rewrites.
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate: the ExecStart commands below run only if this check succeeds.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
# The two module sets are processed in order: nvidia-open-gpu first, then
# nvidia-open-gpu-copy-only.
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu link-modules
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu-copy-only link-modules
# Keep the unit active after the commands exit.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=preconfigured.target
520 changes: 352 additions & 168 deletions packages/kmod-5.10-nvidia/kmod-5.10-nvidia.spec

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Link additional kernel modules
Description=Link Tesla kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded will override
# the already linked kernel modules. This doesn't affect the running system,
Expand All @@ -10,7 +10,8 @@ RefuseManualStop=true

[Service]
Type=oneshot
ExecStart=/usr/bin/driverdog link-modules
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-tesla link-modules
RemainAfterExit=true
StandardError=journal+console

Expand Down
19 changes: 19 additions & 0 deletions packages/kmod-5.10-nvidia/load-open-gpu-kernel-modules.service.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# One-shot unit that runs `driverdog load-modules` for the nvidia-open-gpu
# module set after copy-open-gpu-kernel-modules.service.
[Unit]
Description=Load open GPU kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Start after, and require, the unit that copies the open GPU modules.
After=copy-open-gpu-kernel-modules.service
Requires=copy-open-gpu-kernel-modules.service
# Disable manual restarts to prevent loading kernel modules
# that weren't linked by the running system
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate: the ExecStart command below runs only if this check succeeds.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
# Only the nvidia-open-gpu set is passed to load-modules here; this unit
# does not reference the nvidia-open-gpu-copy-only set.
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu load-modules
# Keep the unit active after the command exits.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=preconfigured.target
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Load additional kernel modules
Description=Load Tesla kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
After=link-tesla-kernel-modules.service
Requires=link-tesla-kernel-modules.service
Expand All @@ -10,7 +10,8 @@ RefuseManualStop=true

[Service]
Type=oneshot
ExecStart=/usr/bin/driverdog load-modules
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-tesla load-modules
RemainAfterExit=true
StandardError=journal+console

Expand Down
34 changes: 34 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-fabricmanager.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# NVIDIA Fabric Manager configuration.
#
# Modern, systemd-aware settings:
# - Log to journal via stderr
# - Keep running in the foreground
LOG_LEVEL=4
LOG_FILE_NAME=
DAEMONIZE=0

# Use Unix domain sockets instead of localhost ports.
UNIX_SOCKET_PATH=/run/nvidia/fabricmanager.sock
FM_CMD_UNIX_SOCKET_PATH=/run/nvidia/fabricmanager-cmd.sock

# Start Fabric Manager in bare metal or full pass through virtualization mode.
FABRIC_MODE=0
FABRIC_MODE_RESTART=0

# Terminate on NVSwitch and GPU config failure.
FM_STAY_RESIDENT_ON_FAILURES=0

# When there is a GPU to NVSwitch NVLink failure, remove the affected GPU
# from NVLink P2P capability.
ACCESS_LINK_FAILURE_MODE=0

# When there is an NVSwitch to NVSwitch NVLink failure, exit Fabric Manager.
TRUNK_LINK_FAILURE_MODE=0

# When there is an NVSwitch failure or an NVSwitch is excluded, abort Fabric Manager.
NVSWITCH_FAILURE_MODE=0

# When Fabric Manager service is stopped or terminated, abort all running CUDA jobs.
ABORT_CUDA_JOBS_ON_FM_EXIT=1

# Path to topology and database files.
TOPOLOGY_FILE_PATH=/usr/share/nvidia/tesla/nvswitch
DATABASE_PATH=/usr/share/nvidia/tesla/nvswitch
16 changes: 16 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-fabricmanager.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Runs the NVIDIA Fabric Manager daemon (nv-fabricmanager) as a long-running
# Type=simple service.
[Unit]
Description=NVIDIA fabric manager service

[Service]
# Start the daemon with the configuration file at /etc/nvidia/fabricmanager.cfg.
ExecStart=/usr/libexec/nvidia/tesla/bin/nv-fabricmanager -c /etc/nvidia/fabricmanager.cfg
Type=simple
# NOTE(review): 0 appears intended to disable the start/stop timeouts --
# confirm against systemd.service(5).
TimeoutSec=0
# Always restart the daemon, waiting 5 seconds before each restart.
RestartSec=5
Restart=always
RemainAfterExit=true
StandardError=journal+console
# Treat exit status 255 as a successful exit.
SuccessExitStatus=255
# Do not limit the size of core dumps.
LimitCORE=infinity

[Install]
WantedBy=multi-user.target
2 changes: 1 addition & 1 deletion packages/kmod-5.10-nvidia/nvidia-ld.so.conf.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__LIBDIR__/nvidia/tesla/__NVIDIA_VERSION__/
__LIBDIR__/nvidia/tesla/
11 changes: 11 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-open-gpu-config.toml.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Module set "nvidia-open-gpu": the open GPU kernel modules nvidia.ko,
# nvidia-modeset.ko and nvidia-uvm.ko.
# NOTE(review): __NVIDIA_MODULES__ is a template placeholder (this is a .in
# file); presumably it is replaced with the directory holding the built
# modules at package build time -- confirm.
[nvidia-open-gpu]
# NOTE(review): presumably relative to the running kernel's modules
# directory -- confirm.
lib-modules-path = "kernel/drivers/extra/video/nvidia/open-gpu"

[nvidia-open-gpu.kernel-modules."nvidia.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu.kernel-modules."nvidia-modeset.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu.kernel-modules."nvidia-uvm.ko"]
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Module set "nvidia-open-gpu-copy-only": nvidia-drm.ko and nvidia-peermem.ko.
# It shares its lib-modules-path with the "nvidia-open-gpu" set. In the units
# shown with this change, this set is passed to `driverdog link-modules` only,
# never to `load-modules`.
# NOTE(review): __NVIDIA_MODULES__ is a template placeholder; presumably it
# is substituted at package build time -- confirm.
[nvidia-open-gpu-copy-only]
lib-modules-path = "kernel/drivers/extra/video/nvidia/open-gpu"

[nvidia-open-gpu-copy-only.kernel-modules."nvidia-drm.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu-copy-only.kernel-modules."nvidia-peermem.ko"]
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ After=load-tesla-kernel-modules.service load-open-gpu-kernel-modules.service

[Service]
Type=forking
ExecStart=__NVIDIA_BINDIR__/nvidia-persistenced --user nvidia --verbose
ExecStart=/usr/libexec/nvidia/tesla/bin/nvidia-persistenced --user nvidia --verbose

[Install]
RequiredBy=preconfigured.target
5 changes: 5 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-tesla-tmpfiles.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# tmpfiles.d entries that copy NVIDIA driver configuration files into /etc.
# A "C" line copies to the given path if it does not exist yet. When no
# source argument is given, the source is the same path under
# /usr/share/factory.

# driverdog module-set definitions.
C /etc/drivers/nvidia-tesla.toml
C /etc/drivers/nvidia-open-gpu.toml
C /etc/drivers/nvidia-open-gpu-copy-only.toml
# Explicit source: the file is copied from nvidia/tesla/nvidia-path.env in
# the factory directory rather than from the default factory path.
C /etc/containerd/nvidia.env - - - - /usr/share/factory/nvidia/tesla/nvidia-path.env
C /etc/ld.so.conf.d/nvidia-tesla.conf
3 changes: 0 additions & 3 deletions packages/kmod-5.10-nvidia/nvidia-tesla-tmpfiles.conf.in

This file was deleted.

6 changes: 5 additions & 1 deletion packages/kmod-5.10-nvidia/nvidia-tmpfiles.conf.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/tesla - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/tesla 0755 root root - -
D /var/run/nvidia-persistenced 0755 nvidia nvidia - -
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu 0755 root root - -
C /etc/nvidia/fabricmanager.cfg - - - -
d /run/nvidia 0700 root root -
D /var/run/nvidia-persistenced 0755 nvidia nvidia - -

0 comments on commit d36b035

Please sign in to comment.