From cddf5da67883f32b74358b65a83be08e1b9e6b63 Mon Sep 17 00:00:00 2001 From: Andrea Ieri Date: Tue, 17 Sep 2024 10:49:03 -0700 Subject: [PATCH] Reorganize the snap files for added clarity (#13) * drop wget dependency - curl is already available * separate installing dcgm in its own part * add comments around all parts * move the hostengine service script under bin/ for consistency * split local files into scripts/ and files/ to avoid shipping the configure_sources.sh script * move proftester cleanup under the dcgm part --- snap/local/{ => files}/run_dcgm_exporter.sh | 0 snap/local/{ => files}/run_nv_hostengine.sh | 0 snap/local/{ => scripts}/configure_sources.sh | 2 +- snap/snapcraft.yaml | 73 ++++++++++--------- 4 files changed, 40 insertions(+), 35 deletions(-) rename snap/local/{ => files}/run_dcgm_exporter.sh (100%) mode change 100644 => 100755 rename snap/local/{ => files}/run_nv_hostengine.sh (100%) mode change 100644 => 100755 rename snap/local/{ => scripts}/configure_sources.sh (88%) diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/files/run_dcgm_exporter.sh old mode 100644 new mode 100755 similarity index 100% rename from snap/local/run_dcgm_exporter.sh rename to snap/local/files/run_dcgm_exporter.sh diff --git a/snap/local/run_nv_hostengine.sh b/snap/local/files/run_nv_hostengine.sh old mode 100644 new mode 100755 similarity index 100% rename from snap/local/run_nv_hostengine.sh rename to snap/local/files/run_nv_hostengine.sh diff --git a/snap/local/configure_sources.sh b/snap/local/scripts/configure_sources.sh similarity index 88% rename from snap/local/configure_sources.sh rename to snap/local/scripts/configure_sources.sh index 3535409..c3bed86 100755 --- a/snap/local/configure_sources.sh +++ b/snap/local/scripts/configure_sources.sh @@ -18,7 +18,7 @@ fi echo "Architecture is $SYSTEM_ARCH. Downloading cuda-keyring package..." -wget "https://developer.download.nvidia.com/compute/cuda/repos/$DISTRIBUTION/$ARCH/$CUDA_PKG" +curl --remote-name "https://developer.download.nvidia.com/compute/cuda/repos/$DISTRIBUTION/$ARCH/$CUDA_PKG" # Run the checksum verification and install cuda-keyring if valid if echo "$SHA256SUM $CUDA_PKG" | sha256sum --check --status; then diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index 3902e52..6319608 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -1,6 +1,6 @@ name: dcgm base: core24 -adopt-info: dcgm-exporter +adopt-info: dcgm summary: Snap for NVIDIA DCGM and DCGM exporter license: Apache-2.0 contact: solutions-engineering@lists.canonical.com @@ -22,7 +22,7 @@ title: NVIDIA DCGM apps: dcgm-exporter: - command: run_dcgm_exporter.sh + command: bin/run_dcgm_exporter.sh plugs: - network-bind - opengl @@ -37,7 +37,7 @@ apps: - network-bind - opengl nv-hostengine: - command: run_nv_hostengine.sh + command: bin/run_nv_hostengine.sh plugs: - network-bind - opengl @@ -47,39 +47,28 @@ apps: DCGM_HOME_DIR: "${SNAP_COMMON}" parts: - wrapper: - plugin: dump - build-packages: - - wget - - dpkg - source: snap/local + # This is a workaround to package-repositories not supporting multiple architectures + # See https://forum.snapcraft.io/t/can-package-repositories-depend-on-architecture/27820 + cuda-sources: + plugin: nil + source: snap/local/scripts override-pull: | craftctl default ./configure_sources.sh - override-build: | - craftctl default - chmod +x run_nv_hostengine.sh - chmod +x run_dcgm_exporter.sh - dcgm-exporter: + + # This is the actual DCGM software. We don't build from source because the build + # process is very cumbersome and requires docker + # The deb is pulled from the sources configured in the cuda-sources part + dcgm: after: - - wrapper - plugin: go + - cuda-sources + plugin: nil stage-packages: [datacenter-gpu-manager=1:3.3.7] - build-snaps: - - go - source: https://github.com/NVIDIA/dcgm-exporter.git - source-type: git - source-tag: 3.3.7-3.5.0 - # override build to set custom csv file - override-build: | - craftctl default - mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter - cp etc/default-counters.csv etc/dcp-metrics-included.csv $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter/ # override prime to set version override-prime: | craftctl default # Locate dcgm .deb file - DEB_FILE=$(ls $HOME/parts/dcgm-exporter/stage_packages/datacenter-gpu-manager_*.deb) + DEB_FILE=$(ls $HOME/parts/dcgm/stage_packages/datacenter-gpu-manager_*.deb) # Extract the version from the .deb file DCGM_VERSION=$(dpkg-deb -f "$DEB_FILE" Version) @@ -87,14 +76,30 @@ parts: # Set the Snap version to the same as dcgm deb file craftctl set version="${DCGM_VERSION#1:}" - cleanup: - after: - - dcgm-exporter - plugin: nil - source: snap/local/scripts - override-prime: | + # Remove dcgmproftesters and related libraries + $CRAFT_PROJECT_DIR/snap/local/scripts/remove_dcgmproftester.sh + + # This is the DCGM exporter + dcgm-exporter: + plugin: go + build-snaps: + - go + source: https://github.com/NVIDIA/dcgm-exporter.git + source-type: git + source-tag: 3.3.7-3.5.0 + # override build to set custom csv file + override-build: | craftctl default - ./scripts/remove_dcgmproftester.sh + mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter + cp etc/default-counters.csv etc/dcp-metrics-included.csv $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter/ + + # wrappers supporting snap options + wrapper: + plugin: dump + source: snap/local/files + organize: + run_nv_hostengine.sh: bin/ + run_dcgm_exporter.sh: bin/ layout: /etc/dcgm-exporter: