From e3473857c1dd9c555b3921d00e520b7903254b73 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 11:29:31 -0400 Subject: [PATCH 01/36] Update structure to comply with bootstack template --- Makefile | 55 +++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ rename.sh | 17 +++++++++++++ tox.ini | 41 +++++++++++++++++++++++++++++++ 4 files changed, 179 insertions(+) create mode 100644 Makefile create mode 100644 pyproject.toml create mode 100755 rename.sh create mode 100644 tox.ini diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3394bab --- /dev/null +++ b/Makefile @@ -0,0 +1,55 @@ +# This is a template `Makefile` file for snaps +# This file is managed by bootstack-charms-spec and should not be modified +# within individual snap repos. https://launchpad.net/bootstack-charms-spec + +PYTHON := /usr/bin/python3 + +PROJECTPATH=$(dir $(realpath ${MAKEFILE_LIST})) +SNAP_NAME=$(shell cat ${PROJECTPATH}/snap/snapcraft.yaml | grep -E '^name:' | awk '{print $$2}') +SNAP_FILE=${PROJECTPATH}/${SNAP_NAME}.snap + +help: + @echo "This project supports the following targets" + @echo "" + @echo " make help - show this text" + @echo " make clean - remove unneeded files" + @echo " make build - build the snap" + @echo " make lint - run lint checkers" + @echo " make reformat - run lint tools to auto format code" + @echo " make unittests - run the tests defined in the unittest subdirectory" + @echo " make functional - run the tests defined in the functional subdirectory" + @echo " make test - run lint, proof, unittests and functional targets" + @echo "" + +lint: + @echo "Running lint checks" + @tox -e lint + +unittests: + @echo "Running unit tests" + @tox -e unit -- ${UNIT_ARGS} + +test: lint unittests functional + @echo "Tests completed for the snap." 
+ +reformat: + @echo "Reformat files with black and isort" + @tox -e reformat + +build: + @echo "Building the snap" + @snapcraft --use-lxd + @bash -c ./rename.sh + +clean: + @echo "Cleaning snap" + @snapcraft clean --use-lxd + @echo "Cleaning existing snap builds" + @rm -rf ${SNAP_FILE} + +functional: build + @echo "Executing functional tests using built snap" + @TEST_SNAP=${SNAP_FILE} tox -e func -- ${FUNC_ARGS} + +# The targets below don't depend on a file +.PHONY: help clean build lint reformat unittests functional test \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8af02ae --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,66 @@ +# This is a template `pyproject.toml` file for snaps +# This file is managed by bootstack-charms-spec and should not be modified +# within individual snap repos. https://launchpad.net/bootstack-charms-spec + +[tool.flake8] +ignore = ["C901", "D100", "D101", "D102", "D103", "W503", "W504"] +exclude = ['.eggs', '.git', '.tox', '.venv', '.build', 'build', 'report'] +max-line-length = 99 +max-complexity = 10 + +[tool.black] +line-length = 99 +exclude = ''' +/( + | .eggs + | .git + | .tox + | .venv + | .build + | build + | report +)/ +''' + +[tool.isort] +profile = "black" +skip_glob = [ + ".eggs", + ".git", + ".tox", + ".venv", + ".build", + "build", + "report" +] + +[tool.pylint] +max-line-length = 99 +ignore = ['.eggs', '.git', '.tox', '.venv', '.build', 'report', 'tests'] + +[tool.mypy] +warn_unused_ignores = true +warn_unused_configs = true +warn_unreachable = true +disallow_untyped_defs = true +exclude = ['.eggs', '.git', '.tox', '.venv', '.build', 'report', 'tests'] + +## Ignore unsupported imports +[[tool.mypy.overrides]] +ignore_missing_imports = true +module = ["setuptools"] + +[tool.coverage.run] +relative_files = true +source = ["."] +omit = ["tests/**", "docs/**", "lib/**", "snap/**", "build/**", "setup.py"] + +[tool.coverage.report] +fail_under = 100 +show_missing = true 
+ +[tool.coverage.html] +directory = "tests/unit/report/html" + +[tool.coverage.xml] +output = "tests/unit/report/coverage.xml" \ No newline at end of file diff --git a/rename.sh b/rename.sh new file mode 100755 index 0000000..3b14d01 --- /dev/null +++ b/rename.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# This is a template `rename.sh` file for snaps +# This file is managed by bootstack-charms-spec and should not be modified +# within individual snap repos. https://launchpad.net/bootstack-charms-spec + +snap=$(grep -E "^name:" snap/snapcraft.yaml | awk '{print $2}') +echo "renaming ${snap}_*.snap to ${snap}.snap" +echo -n "pwd: " +pwd +ls -al +echo "Removing previous snap if it exists" +if [[ -e "${snap}.snap" ]]; +then + rm "${snap}.snap" +fi +echo "Renaming snap here." +mv ${snap}_*.snap ${snap}.snap \ No newline at end of file diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..6827ecb --- /dev/null +++ b/tox.ini @@ -0,0 +1,41 @@ +# This is a template `tox.ini` file for snaps +# This file is managed by bootstack-charms-spec and should not be modified +# within individual snap repos. https://launchpad.net/bootstack-charms-spec + +[tox] +skipsdist=True +envlist = lint, unit, func +skip_missing_interpreters = True + +[testenv] +basepython = python3 +setenv = PYTHONPATH={toxinidir} + +[testenv:lint] +commands = + pflake8 + pylint --recursive=y . + black --check --diff --color . + isort --check --diff --color . +deps = + black + flake8 + pyproject-flake8 + flake8-docstrings + pep8-naming + flake8-colors + colorama + isort + pylint + {[testenv:func]deps} + +[testenv:reformat] +envdir = {toxworkdir}/lint +deps = {[testenv:lint]deps} +commands = + black . + isort . 
+ +[testenv:unit] + +[testenv:func] \ No newline at end of file From 89b10bd31d28e27f0eb1e259f718106812c0dd87 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 11:29:45 -0400 Subject: [PATCH 02/36] Add install instructions to the README --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 08f5469..704667e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# dcgm-snap +# DCGM Snap This is a snap delivering NVIDIA dcgm components. The snap consists of [dcgm](https://developer.nvidia.com/dcgm) and [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter). @@ -8,5 +8,11 @@ The snap consists of [dcgm](https://developer.nvidia.com/dcgm) and [dcgm-exporte You can build the snap locally by using the command: ```shell -snapcraft --use-lxd -``` \ No newline at end of file +make build +``` + +## Install the snap + +```shell +snap install --dangerous ./dcgm.snap +``` From b531437b39d1407d9123116f958bdec382606b16 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 11:30:00 -0400 Subject: [PATCH 03/36] Add test structure and snap setup --- tests/functional/test_init.py | 13 +++++++++++++ tests/functional/test_snap_dcgm.py | 5 +++++ 2 files changed, 18 insertions(+) create mode 100644 tests/functional/test_init.py create mode 100644 tests/functional/test_snap_dcgm.py diff --git a/tests/functional/test_init.py b/tests/functional/test_init.py new file mode 100644 index 0000000..8dce7e9 --- /dev/null +++ b/tests/functional/test_init.py @@ -0,0 +1,13 @@ +import os +import subprocess + +import pytest + + +@pytest.fixture(scope="session", autouse=True) +def install_tempest_snap(): + snap = os.environ["TEST_SNAP"] + process = subprocess.run(["sudo", "snap", "install", "--dangerous", snap]) + assert 0 == process.returncode + yield + subprocess.run(["sudo", "snap", "remove", "--purge", "tempest"]) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py new file mode 
100644 index 0000000..c3215e7 --- /dev/null +++ b/tests/functional/test_snap_dcgm.py @@ -0,0 +1,5 @@ +import subprocess + + +def test_network_interface(): + pass From 74f7c35e7c5ffa2e23de023e1904913ec039ea05 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 15:15:17 -0400 Subject: [PATCH 04/36] Update .gitignore --- .gitignore | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f24fcd3..74a9ea4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,21 @@ # This is a template `.gitignore` file for snaps +# This file is managed by bootstack-charms-spec and should not be modified +# within individual snap repos. https://launchpad.net/bootstack-charms-spec + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class # Tests files and dir +.pytest_cache/ +.coverage +.tox +.venv reports/ **/report/ +htmlcov/ +.mypy_cache # Log files *.log @@ -17,9 +30,14 @@ reports/ # version data repo-info +# Python builds +deb_dist/ +dist/ +*.egg-info/ + # Snaps *.snap # Builds .build/ -build/ +build/ \ No newline at end of file From ab9f50283ec041c6488f008b86b38ee833b0a3a2 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 15:32:27 -0400 Subject: [PATCH 05/36] Make dcgm-exporter into a daemon --- snap/snapcraft.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index 973814a..e3dab38 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -29,6 +29,12 @@ apps: plugs: - network-bind - opengl + daemon: simple + install-mode: disable + restart-condition: on-failure + restart-delay: 2s + environment: + DCGM_HOME_DIR: "${SNAP_COMMON}" dcgmi: command: usr/bin/dcgmi plugs: From 931c25dd4b7a98e6fa6382cc73fdfb22df619344 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 15:32:58 -0400 Subject: [PATCH 06/36] Add simple smoke test for dcgm-exporter endpoint --- tests/functional/conftest.py | 27 +++++++++++++++++++++++++++ 
tests/functional/test_init.py | 13 ------------- tests/functional/test_snap_dcgm.py | 16 ++++++++++++++-- tox.ini | 8 +++++++- 4 files changed, 48 insertions(+), 16 deletions(-) create mode 100644 tests/functional/conftest.py delete mode 100644 tests/functional/test_init.py diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py new file mode 100644 index 0000000..00bd5ea --- /dev/null +++ b/tests/functional/conftest.py @@ -0,0 +1,27 @@ +import os +import subprocess +from time import sleep + +import pytest + + +@pytest.fixture(scope="session", autouse=True) +def install_dcgm_snap(): + """Install the snap and enable dcgm-exporter service for testing.""" + snap = os.environ["TEST_SNAP"] + dcgm_exporter_service = "snap.dcgm.dcgm-exporter.service" + assert 0 == subprocess.run(["sudo", "snap", "install", "--dangerous", snap]).returncode + + subprocess.run(["sudo", "systemctl", "enable", "--now", dcgm_exporter_service]) + sleep(5) # Give some time for the service to start + + assert ( + 0 + == subprocess.run( + ["sudo", "systemctl", "is-active", "--quiet", dcgm_exporter_service] + ).returncode + ) + + yield + + subprocess.run(["sudo", "snap", "remove", "--purge", "dcgm"]) diff --git a/tests/functional/test_init.py b/tests/functional/test_init.py deleted file mode 100644 index 8dce7e9..0000000 --- a/tests/functional/test_init.py +++ /dev/null @@ -1,13 +0,0 @@ -import os -import subprocess - -import pytest - - -@pytest.fixture(scope="session", autouse=True) -def install_tempest_snap(): - snap = os.environ["TEST_SNAP"] - process = subprocess.run(["sudo", "snap", "install", "--dangerous", snap]) - assert 0 == process.returncode - yield - subprocess.run(["sudo", "snap", "remove", "--purge", "tempest"]) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index c3215e7..3bd9e5b 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,5 +1,17 @@ import subprocess +from time import sleep -def 
test_network_interface(): - pass +def test_dcgm_exporter_endpoint(): + """Smoke test of the dcgm-exporter service and its endpoint.""" + dcgm_exporter_endpoint = "localhost:9400/metrics" + subprocess.run(["sudo", "snap", "disconnect", "dcgm:network-bind"]) + + # The Endpoint should be unavailable with the networking plug + assert 0 != subprocess.run(["curl", dcgm_exporter_endpoint]).returncode + + subprocess.run(["sudo", "snap", "connect", "dcgm:network-bind"]) + print("reconnect") + sleep(5) # should be sufficient for the service to restart + + assert 0 == subprocess.run(["curl", dcgm_exporter_endpoint]).returncode diff --git a/tox.ini b/tox.ini index 6827ecb..72eca81 100644 --- a/tox.ini +++ b/tox.ini @@ -38,4 +38,10 @@ commands = [testenv:unit] -[testenv:func] \ No newline at end of file +[testenv:func] +deps = + pytest +passenv = + TEST_* +commands = + pytest {toxinidir}/tests/functional {posargs:-v} \ No newline at end of file From b4f58774dca78449e1e0b11714baa71cba5e56d9 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 15:33:08 -0400 Subject: [PATCH 07/36] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 704667e..15a1f12 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,5 @@ make build ```shell snap install --dangerous ./dcgm.snap +sudo systemctl enable snap.dcgm.dcgm-exporter.service --now # enable and start dcgm-exporter service ``` From 36fd7579d178627b3b0d731ae7b9754a23c717a9 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 15:34:06 -0400 Subject: [PATCH 08/36] Fix format --- tests/functional/test_snap_dcgm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 3bd9e5b..1750cec 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -12,6 +12,6 @@ def test_dcgm_exporter_endpoint(): subprocess.run(["sudo", "snap", "connect", 
"dcgm:network-bind"]) print("reconnect") - sleep(5) # should be sufficient for the service to restart - + sleep(5) # should be sufficient for the service to restart + assert 0 == subprocess.run(["curl", dcgm_exporter_endpoint]).returncode From e257ad711162faeeb78666bde52fe81d5836bd7b Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 6 Sep 2024 16:19:22 -0400 Subject: [PATCH 09/36] Remove unneccesary envar for dcgm-exporter daemon --- snap/snapcraft.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index e3dab38..da5936e 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -33,8 +33,6 @@ apps: install-mode: disable restart-condition: on-failure restart-delay: 2s - environment: - DCGM_HOME_DIR: "${SNAP_COMMON}" dcgmi: command: usr/bin/dcgmi plugs: From 5d5047bf8b87ed26a4f52f8e7906937e8ed578f4 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 10:11:21 -0400 Subject: [PATCH 10/36] Un-omit the test directory for the additional coverage --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8af02ae..049c82c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ module = ["setuptools"] [tool.coverage.run] relative_files = true source = ["."] -omit = ["tests/**", "docs/**", "lib/**", "snap/**", "build/**", "setup.py"] +omit = ["docs/**", "lib/**", "snap/**", "build/**", "setup.py"] [tool.coverage.report] fail_under = 100 @@ -63,4 +63,4 @@ show_missing = true directory = "tests/unit/report/html" [tool.coverage.xml] -output = "tests/unit/report/coverage.xml" \ No newline at end of file +output = "tests/unit/report/coverage.xml" From 5be41618f6bb74ec4db50b1bb33f9829e08c0446 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 10:22:41 -0400 Subject: [PATCH 11/36] Remove redundant unit test commands --- Makefile | 7 +------ tox.ini | 4 +--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/Makefile 
b/Makefile index 3394bab..f5f7cff 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,6 @@ help: @echo " make build - build the snap" @echo " make lint - run lint checkers" @echo " make reformat - run lint tools to auto format code" - @echo " make unittests - run the tests defined in the unittest subdirectory" @echo " make functional - run the tests defined in the functional subdirectory" @echo " make test - run lint, proof, unittests and functional targets" @echo "" @@ -25,10 +24,6 @@ lint: @echo "Running lint checks" @tox -e lint -unittests: - @echo "Running unit tests" - @tox -e unit -- ${UNIT_ARGS} - test: lint unittests functional @echo "Tests completed for the snap." @@ -52,4 +47,4 @@ functional: build @TEST_SNAP=${SNAP_FILE} tox -e func -- ${FUNC_ARGS} # The targets below don't depend on a file -.PHONY: help clean build lint reformat unittests functional test \ No newline at end of file +.PHONY: help clean build lint reformat unittests functional test diff --git a/tox.ini b/tox.ini index 72eca81..a0fd1a7 100644 --- a/tox.ini +++ b/tox.ini @@ -36,12 +36,10 @@ commands = black . isort . -[testenv:unit] - [testenv:func] deps = pytest passenv = TEST_* commands = - pytest {toxinidir}/tests/functional {posargs:-v} \ No newline at end of file + pytest {toxinidir}/tests/functional {posargs:-v} From 349ee7a7c5df640431be848a7d9d48390c7b1239 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 11:50:52 -0400 Subject: [PATCH 12/36] Make dcgm-exporter listen address configurable --- snap/hooks/configure | 8 +++++++- snap/local/run_dcgm_exporter.sh | 14 ++++++++++++++ snap/snapcraft.yaml | 4 ++-- 3 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 snap/local/run_dcgm_exporter.sh diff --git a/snap/hooks/configure b/snap/hooks/configure index 5d9de78..6364787 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -3,4 +3,10 @@ # Register config options if unset, # so users can see available options by running # `sudo snap get dcgm`. 
-[ -z "$(snapctl get nv-hostengine-port)" ] && snapctl set nv-hostengine-port= +if [ -z "$(snapctl get nv-hostengine-port)" ]; then + snapctl set nv-hostengine-port="" +fi + +if [ -z "$(snapctl get dcgm_exporter_listen)" ]; then + snapctl set dcgm_exporter_listen="" +fi diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/run_dcgm_exporter.sh new file mode 100644 index 0000000..ad14d56 --- /dev/null +++ b/snap/local/run_dcgm_exporter.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -euo pipefail + +# Build the argument list for the dcgm-exporter command +args=() + +# Add the dcgm-exporter-port option if it is set. Default: “:9400” +dcgm_exporter_listen="$(snapctl get dcgm_exporter_listen)" + +if [ -n "$dcgm_exporter_listen" ]; then + args+=("-a" "$dcgm_exporter_listen") +fi + +exec "$SNAP/bin/dcgm-exporter" "${args[@]}" diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index da5936e..e5f356e 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -25,14 +25,13 @@ package-repositories: apps: dcgm-exporter: - command: bin/dcgm-exporter + command: run_dcgm_exporter.sh plugs: - network-bind - opengl daemon: simple install-mode: disable restart-condition: on-failure - restart-delay: 2s dcgmi: command: usr/bin/dcgmi plugs: @@ -61,6 +60,7 @@ parts: override-build: | craftctl default chmod +x run_nv_hostengine.sh + chmod +x run_dcgm_exporter.sh dcgm-exporter: plugin: go stage-packages: [datacenter-gpu-manager] From 48a50591d9113f18338c7218d5707402eb3f2cf3 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 12:00:54 -0400 Subject: [PATCH 13/36] Remove unittest from Makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index f5f7cff..6c285f2 100644 --- a/Makefile +++ b/Makefile @@ -17,14 +17,14 @@ help: @echo " make lint - run lint checkers" @echo " make reformat - run lint tools to auto format code" @echo " make functional - run the tests defined in the functional subdirectory" - @echo " make test - 
run lint, proof, unittests and functional targets" + @echo " make test - run lint, proof, and functional targets" @echo "" lint: @echo "Running lint checks" @tox -e lint -test: lint unittests functional +test: lint functional @echo "Tests completed for the snap." reformat: From 818bd7af3ca886edcb7236abf0e1adb9d5e02857 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 16:10:44 -0400 Subject: [PATCH 14/36] Improve dcgm-exporter endpoint test --- snap/hooks/configure | 4 +-- snap/local/run_dcgm_exporter.sh | 2 +- tests/functional/conftest.py | 24 ++++++++++++----- tests/functional/test_snap_dcgm.py | 42 +++++++++++++++++++++++------- 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/snap/hooks/configure b/snap/hooks/configure index 6364787..5a0d5dd 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -7,6 +7,6 @@ if [ -z "$(snapctl get nv-hostengine-port)" ]; then snapctl set nv-hostengine-port="" fi -if [ -z "$(snapctl get dcgm_exporter_listen)" ]; then - snapctl set dcgm_exporter_listen="" +if [ -z "$(snapctl get dcgm-exporter-listen)" ]; then + snapctl set dcgm-exporter-listen="" fi diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/run_dcgm_exporter.sh index ad14d56..10ec838 100644 --- a/snap/local/run_dcgm_exporter.sh +++ b/snap/local/run_dcgm_exporter.sh @@ -5,7 +5,7 @@ set -euo pipefail args=() # Add the dcgm-exporter-port option if it is set. 
Default: “:9400” -dcgm_exporter_listen="$(snapctl get dcgm_exporter_listen)" +dcgm_exporter_listen="$(snapctl get dcgm-exporter-listen)" if [ -n "$dcgm_exporter_listen" ]; then args+=("-a" "$dcgm_exporter_listen") diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 00bd5ea..24e21c0 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -1,6 +1,6 @@ import os import subprocess -from time import sleep +import time import pytest @@ -10,18 +10,28 @@ def install_dcgm_snap(): """Install the snap and enable dcgm-exporter service for testing.""" snap = os.environ["TEST_SNAP"] dcgm_exporter_service = "snap.dcgm.dcgm-exporter.service" - assert 0 == subprocess.run(["sudo", "snap", "install", "--dangerous", snap]).returncode + + assert ( + 0 == subprocess.run(["sudo", "snap", "install", "--dangerous", snap]).returncode + ), f"Failed to install {snap}" subprocess.run(["sudo", "systemctl", "enable", "--now", dcgm_exporter_service]) - sleep(5) # Give some time for the service to start - assert ( - 0 - == subprocess.run( + dcgm_exporter_is_active = ( + lambda: subprocess.call( ["sudo", "systemctl", "is-active", "--quiet", dcgm_exporter_service] - ).returncode + ) + == 0 ) + timeout = 30 # seconds + start_time = time.time() + + while not dcgm_exporter_is_active(): + if time.time() - start_time > timeout: + assert False, f"Failed to start {dcgm_exporter_service} service" + time.sleep(5) + yield subprocess.run(["sudo", "snap", "remove", "--purge", "dcgm"]) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 1750cec..d02d4a8 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,17 +1,39 @@ import subprocess -from time import sleep +import time def test_dcgm_exporter_endpoint(): - """Smoke test of the dcgm-exporter service and its endpoint.""" - dcgm_exporter_endpoint = "localhost:9400/metrics" - subprocess.run(["sudo", "snap", "disconnect", 
"dcgm:network-bind"]) + """Test of the dcgm-exporter service and its endpoint.""" + endpoint = "localhost:9400/metrics" + timeout = 60 # seconds + start_time = time.time() - # The Endpoint should be unavailable with the networking plug - assert 0 != subprocess.run(["curl", dcgm_exporter_endpoint]).returncode + def query_endpoint(): + return subprocess.run(["curl", endpoint], text=True, capture_output=True) - subprocess.run(["sudo", "snap", "connect", "dcgm:network-bind"]) - print("reconnect") - sleep(5) # should be sufficient for the service to restart + while (result := query_endpoint()).returncode != 0 or not result.stdout.strip(): + if time.time() - start_time > timeout: + assert False, f"Failed to reach '{endpoint}' of the dcgm-exporter service" + time.sleep(5) - assert 0 == subprocess.run(["curl", dcgm_exporter_endpoint]).returncode + assert "DCGM_FI_DRIVER_VERSION" in result.stdout, "No dcgm exported metrics found" + + +def test_dcgm_configs(): + """Test snap configuratin.""" + pass + + +def test_dcgm_nv_hostengine(): + """Test of the dcgm-nv-hostengine service and its endpoint.""" + pass + + +def test_dcgmi(): + """Test of the dcgmi command.""" + pass + + +def test_dcgmproftesters(): + """Test of the dcgmproftesters.""" + pass From c201332045e92a079ab14b4473ddf7ae5fdef8ce Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 16:12:53 -0400 Subject: [PATCH 15/36] Add missing trailing whitespace --- .gitignore | 2 +- rename.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 74a9ea4..b6b2573 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,4 @@ dist/ # Builds .build/ -build/ \ No newline at end of file +build/ diff --git a/rename.sh b/rename.sh index 3b14d01..6d645e1 100755 --- a/rename.sh +++ b/rename.sh @@ -14,4 +14,4 @@ then rm "${snap}.snap" fi echo "Renaming snap here." 
-mv ${snap}_*.snap ${snap}.snap \ No newline at end of file +mv ${snap}_*.snap ${snap}.snap From 36a913f5893d00c5743989014f9f6c1330d2c6e8 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 19:52:27 -0400 Subject: [PATCH 16/36] Add simple tests for other components --- tests/functional/test_snap_dcgm.py | 43 ++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index d02d4a8..eaf1f57 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,3 +1,4 @@ +import json import subprocess import time @@ -19,21 +20,47 @@ def query_endpoint(): assert "DCGM_FI_DRIVER_VERSION" in result.stdout, "No dcgm exported metrics found" -def test_dcgm_configs(): - """Test snap configuratin.""" - pass - - def test_dcgm_nv_hostengine(): - """Test of the dcgm-nv-hostengine service and its endpoint.""" - pass + """Check the dcgm-nv-hostengine service.""" + assert 0 == subprocess.call( + ["sudo", "systemctl", "is-active", "--quiet", "snap.dcgm.nv-hostengine.service"] + ), "DCGM NV Hostengine service is not running" def test_dcgmi(): """Test of the dcgmi command.""" - pass + result = subprocess.run(["dcgm.dcgmi", "discovery", "-l"], capture_output=True, text=True) + print(result) + assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" def test_dcgmproftesters(): """Test of the dcgmproftesters.""" pass + + +def test_dcgm_port_configs(): + """Test snap port configuratin.""" + services = ["snap.dcgm.dcgm-exporter.service", "snap.dcgm.nv-hostengine.service"] + configs = ["dcgm-exporter-listen", "nv-hostengine-port"] + new_values = [":9466", "5666"] + + result = subprocess.run( + ["sudo", "snap", "get", "dcgm", "-d"], check=True, capture_output=True, text=True + ) + pairs = json.loads(result.stdout.strip()) + assert all(config in pairs for config in configs), "Missing snap configuration keys" + + for config, new_value in 
zip(configs, new_values): + assert 0 == subprocess.call( + ["sudo", "snap", "set", "dcgm", f"{config}={new_value}"] + ), f"Failed to set snap configuration key '{config}'" + + # restart the service to apply the new configuration + for service in services: + subprocess.run(["sudo", "systemctl", "restart", service]) + + for service, port in zip(services, new_values): + assert 0 == subprocess.call( + ["sudo", "lsof", "-i", f":{port.lstrip(':')}"] + ), f"{service} port is not listening" From 558007e1725b51e819eec121c068c9764cfab3b9 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 11 Sep 2024 20:17:18 -0400 Subject: [PATCH 17/36] Improve dcgmi test --- Makefile | 2 +- tests/functional/test_snap_dcgm.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6c285f2..2b77033 100644 --- a/Makefile +++ b/Makefile @@ -47,4 +47,4 @@ functional: build @TEST_SNAP=${SNAP_FILE} tox -e func -- ${FUNC_ARGS} # The targets below don't depend on a file -.PHONY: help clean build lint reformat unittests functional test +.PHONY: help clean build lint reformat functional test diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index eaf1f57..9f599c0 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -29,8 +29,7 @@ def test_dcgm_nv_hostengine(): def test_dcgmi(): """Test of the dcgmi command.""" - result = subprocess.run(["dcgm.dcgmi", "discovery", "-l"], capture_output=True, text=True) - print(result) + result = subprocess.run(["dcgm.dcgmi", "discovery", "-l"], check=True, capture_output=True, text=True) assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" From 9e2494409aa329226d15675602fb231a1608b0e4 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 11:10:12 -0400 Subject: [PATCH 18/36] Revert README.md --- README.md | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 15a1f12..1e8a975 100644 --- 
a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# DCGM Snap +# dcgm-snap This is a snap delivering NVIDIA dcgm components. The snap consists of [dcgm](https://developer.nvidia.com/dcgm) and [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter). @@ -8,12 +8,5 @@ The snap consists of [dcgm](https://developer.nvidia.com/dcgm) and [dcgm-exporte You can build the snap locally by using the command: ```shell -make build -``` - -## Install the snap - -```shell -snap install --dangerous ./dcgm.snap -sudo systemctl enable snap.dcgm.dcgm-exporter.service --now # enable and start dcgm-exporter service +snapcraft --use-lxd ``` From 2577dfb5fe093c934d183527c1134737ec18bf75 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 11:20:18 -0400 Subject: [PATCH 19/36] Switch to snap services subcommands --- tests/functional/conftest.py | 2 +- tests/functional/test_snap_dcgm.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 24e21c0..9ec8027 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -15,7 +15,7 @@ def install_dcgm_snap(): 0 == subprocess.run(["sudo", "snap", "install", "--dangerous", snap]).returncode ), f"Failed to install {snap}" - subprocess.run(["sudo", "systemctl", "enable", "--now", dcgm_exporter_service]) + subprocess.run(["sudo", "snap", "start", "dcgm.dcgm-exporter"]) dcgm_exporter_is_active = ( lambda: subprocess.call( diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 9f599c0..a8a3d43 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -40,15 +40,15 @@ def test_dcgmproftesters(): def test_dcgm_port_configs(): """Test snap port configuratin.""" - services = ["snap.dcgm.dcgm-exporter.service", "snap.dcgm.nv-hostengine.service"] + services = ["dcgm.dcgm-exporter", "dcgm.nv-hostengine"] configs = ["dcgm-exporter-listen", "nv-hostengine-port"] new_values = 
[":9466", "5666"] result = subprocess.run( ["sudo", "snap", "get", "dcgm", "-d"], check=True, capture_output=True, text=True ) - pairs = json.loads(result.stdout.strip()) - assert all(config in pairs for config in configs), "Missing snap configuration keys" + dcgm_snap_config = json.loads(result.stdout.strip()) + assert all(config in dcgm_snap_config for config in configs), "Missing snap configuration keys" for config, new_value in zip(configs, new_values): assert 0 == subprocess.call( @@ -57,7 +57,7 @@ def test_dcgm_port_configs(): # restart the service to apply the new configuration for service in services: - subprocess.run(["sudo", "systemctl", "restart", service]) + subprocess.run(["sudo", "snap", "restart", service]) for service, port in zip(services, new_values): assert 0 == subprocess.call( From 4dfd25092752413a2143e07fc108f50e70f1aa32 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 11:24:39 -0400 Subject: [PATCH 20/36] Remane dcgm-exporter snap config --- snap/hooks/configure | 4 ++-- snap/local/run_dcgm_exporter.sh | 6 +++--- tests/functional/test_snap_dcgm.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/snap/hooks/configure b/snap/hooks/configure index 5a0d5dd..f05078a 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -7,6 +7,6 @@ if [ -z "$(snapctl get nv-hostengine-port)" ]; then snapctl set nv-hostengine-port="" fi -if [ -z "$(snapctl get dcgm-exporter-listen)" ]; then - snapctl set dcgm-exporter-listen="" +if [ -z "$(snapctl get dcgm-exporter-address)" ]; then + snapctl set dcgm-exporter-address="" fi diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/run_dcgm_exporter.sh index 10ec838..e852dd9 100644 --- a/snap/local/run_dcgm_exporter.sh +++ b/snap/local/run_dcgm_exporter.sh @@ -5,10 +5,10 @@ set -euo pipefail args=() # Add the dcgm-exporter-port option if it is set. 
Default: “:9400” -dcgm_exporter_listen="$(snapctl get dcgm-exporter-listen)" +dcgm_exporter_address="$(snapctl get dcgm-exporter-address)" -if [ -n "$dcgm_exporter_listen" ]; then - args+=("-a" "$dcgm_exporter_listen") +if [ -n "$dcgm_exporter_address" ]; then + args+=("-a" "$dcgm_exporter_address") fi exec "$SNAP/bin/dcgm-exporter" "${args[@]}" diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index a8a3d43..38ee4e9 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -29,7 +29,9 @@ def test_dcgm_nv_hostengine(): def test_dcgmi(): """Test of the dcgmi command.""" - result = subprocess.run(["dcgm.dcgmi", "discovery", "-l"], check=True, capture_output=True, text=True) + result = subprocess.run( + ["dcgm.dcgmi", "discovery", "-l"], check=True, capture_output=True, text=True + ) assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" @@ -41,7 +43,7 @@ def test_dcgmproftesters(): def test_dcgm_port_configs(): """Test snap port configuratin.""" services = ["dcgm.dcgm-exporter", "dcgm.nv-hostengine"] - configs = ["dcgm-exporter-listen", "nv-hostengine-port"] + configs = ["dcgm-exporter-address", "nv-hostengine-port"] new_values = [":9466", "5666"] result = subprocess.run( From bfb452af81a938ca17691f47f35704c4e233dac9 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 12:50:50 -0400 Subject: [PATCH 21/36] Simplify dcgm-exporter test --- tests/functional/requirments.txt | 1 + tests/functional/test_snap_dcgm.py | 25 ++++++------------------- tox.ini | 1 + 3 files changed, 8 insertions(+), 19 deletions(-) create mode 100644 tests/functional/requirments.txt diff --git a/tests/functional/requirments.txt b/tests/functional/requirments.txt new file mode 100644 index 0000000..067f0df --- /dev/null +++ b/tests/functional/requirments.txt @@ -0,0 +1 @@ +tenacity==9.0.0 diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 38ee4e9..ba5e38d 100644 --- 
a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,23 +1,15 @@ import json import subprocess -import time +import urllib.request +from tenacity import retry, stop_after_delay, wait_fixed + +@retry(wait=wait_fixed(5), stop=stop_after_delay(60)) def test_dcgm_exporter_endpoint(): """Test of the dcgm-exporter service and its endpoint.""" - endpoint = "localhost:9400/metrics" - timeout = 60 # seconds - start_time = time.time() - - def query_endpoint(): - return subprocess.run(["curl", endpoint], text=True, capture_output=True) - - while (result := query_endpoint()).returncode != 0 or not result.stdout.strip(): - if time.time() - start_time > timeout: - assert False, f"Failed to reach '{endpoint}' of the dcgm-exporter service" - time.sleep(5) - - assert "DCGM_FI_DRIVER_VERSION" in result.stdout, "No dcgm exported metrics found" + endpoint = "http://localhost:9400/metrics" + urllib.request.urlopen(endpoint) def test_dcgm_nv_hostengine(): @@ -35,11 +27,6 @@ def test_dcgmi(): assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" -def test_dcgmproftesters(): - """Test of the dcgmproftesters.""" - pass - - def test_dcgm_port_configs(): """Test snap port configuratin.""" services = ["dcgm.dcgm-exporter", "dcgm.nv-hostengine"] diff --git a/tox.ini b/tox.ini index a0fd1a7..455178f 100644 --- a/tox.ini +++ b/tox.ini @@ -39,6 +39,7 @@ commands = [testenv:func] deps = pytest + -r {toxinidir}/tests/functional/requirments.txt passenv = TEST_* commands = From 6c32bfc6d6945b2c375b67a8babdbcdb9cb9c3ee Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 12:51:23 -0400 Subject: [PATCH 22/36] Remove makefile and rename.sh --- Makefile | 50 -------------------------------------------------- rename.sh | 17 ----------------- 2 files changed, 67 deletions(-) delete mode 100644 Makefile delete mode 100755 rename.sh diff --git a/Makefile b/Makefile deleted file mode 100644 index 2b77033..0000000 --- a/Makefile +++ /dev/null @@ -1,50 +0,0 @@ 
-# This is a template `Makefile` file for snaps -# This file is managed by bootstack-charms-spec and should not be modified -# within individual snap repos. https://launchpad.net/bootstack-charms-spec - -PYTHON := /usr/bin/python3 - -PROJECTPATH=$(dir $(realpath ${MAKEFILE_LIST})) -SNAP_NAME=$(shell cat ${PROJECTPATH}/snap/snapcraft.yaml | grep -E '^name:' | awk '{print $$2}') -SNAP_FILE=${PROJECTPATH}/${SNAP_NAME}.snap - -help: - @echo "This project supports the following targets" - @echo "" - @echo " make help - show this text" - @echo " make clean - remove unneeded files" - @echo " make build - build the snap" - @echo " make lint - run lint checkers" - @echo " make reformat - run lint tools to auto format code" - @echo " make functional - run the tests defined in the functional subdirectory" - @echo " make test - run lint, proof, and functional targets" - @echo "" - -lint: - @echo "Running lint checks" - @tox -e lint - -test: lint functional - @echo "Tests completed for the snap." - -reformat: - @echo "Reformat files with black and isort" - @tox -e reformat - -build: - @echo "Building the snap" - @snapcraft --use-lxd - @bash -c ./rename.sh - -clean: - @echo "Cleaning snap" - @snapcraft clean --use-lxd - @echo "Cleaning existing snap builds" - @rm -rf ${SNAP_FILE} - -functional: build - @echo "Executing functional tests using built snap" - @TEST_SNAP=${SNAP_FILE} tox -e func -- ${FUNC_ARGS} - -# The targets below don't depend on a file -.PHONY: help clean build lint reformat functional test diff --git a/rename.sh b/rename.sh deleted file mode 100755 index 6d645e1..0000000 --- a/rename.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# This is a template `rename.sh` file for snaps -# This file is managed by bootstack-charms-spec and should not be modified -# within individual snap repos. 
https://launchpad.net/bootstack-charms-spec - -snap=$(grep -E "^name:" snap/snapcraft.yaml | awk '{print $2}') -echo "renaming ${snap}_*.snap to ${snap}.snap" -echo -n "pwd: " -pwd -ls -al -echo "Removing previous snap if it exists" -if [[ -e "${snap}.snap" ]]; -then - rm "${snap}.snap" -fi -echo "Renaming snap here." -mv ${snap}_*.snap ${snap}.snap From 19faa1119418d2d83221295d2538714842f7ee7d Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 15:08:36 -0400 Subject: [PATCH 23/36] Imrpove & simplify tests --- tests/functional/conftest.py | 38 ++++++++--------------- tests/functional/requirments.txt | 1 + tests/functional/test_snap_dcgm.py | 48 ++++++++++++++++++++++-------- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 9ec8027..e6b0c7a 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -1,37 +1,23 @@ -import os import subprocess -import time import pytest +import yaml @pytest.fixture(scope="session", autouse=True) def install_dcgm_snap(): """Install the snap and enable dcgm-exporter service for testing.""" - snap = os.environ["TEST_SNAP"] - dcgm_exporter_service = "snap.dcgm.dcgm-exporter.service" - - assert ( - 0 == subprocess.run(["sudo", "snap", "install", "--dangerous", snap]).returncode - ), f"Failed to install {snap}" - - subprocess.run(["sudo", "snap", "start", "dcgm.dcgm-exporter"]) - - dcgm_exporter_is_active = ( - lambda: subprocess.call( - ["sudo", "systemctl", "is-active", "--quiet", dcgm_exporter_service] + with open("snap/snapcraft.yaml") as f: + snapcraft = yaml.safe_load(f) + snap_build_name = f"{snapcraft['name']}_*_amd64.snap" + + subprocess.run( + f"sudo snap install --devmode {snap_build_name}", + check=True, + capture_output=True, + shell=True, ) - == 0 - ) - - timeout = 30 # seconds - start_time = time.time() - - while not dcgm_exporter_is_active(): - if time.time() - start_time > timeout: - assert False, f"Failed to 
start {dcgm_exporter_service} service" - time.sleep(5) - yield + yield - subprocess.run(["sudo", "snap", "remove", "--purge", "dcgm"]) + subprocess.run("sudo snap remove --purge dcgm".split(), check=True) diff --git a/tests/functional/requirments.txt b/tests/functional/requirments.txt index 067f0df..771e68b 100644 --- a/tests/functional/requirments.txt +++ b/tests/functional/requirments.txt @@ -1 +1,2 @@ tenacity==9.0.0 +pyyaml==6.0.2 diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index ba5e38d..2d30f66 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,28 +1,48 @@ import json import subprocess import urllib.request +from time import sleep from tenacity import retry, stop_after_delay, wait_fixed -@retry(wait=wait_fixed(5), stop=stop_after_delay(60)) -def test_dcgm_exporter_endpoint(): +@retry(wait=wait_fixed(5), stop=stop_after_delay(15)) +def test_dcgm_exporter(): """Test of the dcgm-exporter service and its endpoint.""" endpoint = "http://localhost:9400/metrics" + dcgm_exporter_service = "dcgm.dcgm-exporter" + + subprocess.run(f"sudo snap start {dcgm_exporter_service}".split(), check=True) + + result = subprocess.run( + f"sudo snap services {dcgm_exporter_service}".split(), + check=True, + capture_output=True, + text=True, + ) + assert " active" in result.stdout.strip(), "dcgm-exporter service is not active" + urllib.request.urlopen(endpoint) def test_dcgm_nv_hostengine(): """Check the dcgm-nv-hostengine service.""" - assert 0 == subprocess.call( - ["sudo", "systemctl", "is-active", "--quiet", "snap.dcgm.nv-hostengine.service"] - ), "DCGM NV Hostengine service is not running" + nv_hostengine_service = "dcgm.nv-hostengine" + + service = subprocess.run( + f"snap services {nv_hostengine_service}".split(), + check=True, + capture_output=True, + text=True, + ) + + assert " active" in service.stdout.strip(), "nv-hostengine service is not active" def test_dcgmi(): """Test of the dcgmi 
command.""" result = subprocess.run( - ["dcgm.dcgmi", "discovery", "-l"], check=True, capture_output=True, text=True + "dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True ) assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" @@ -34,21 +54,23 @@ def test_dcgm_port_configs(): new_values = [":9466", "5666"] result = subprocess.run( - ["sudo", "snap", "get", "dcgm", "-d"], check=True, capture_output=True, text=True + "sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True ) dcgm_snap_config = json.loads(result.stdout.strip()) assert all(config in dcgm_snap_config for config in configs), "Missing snap configuration keys" for config, new_value in zip(configs, new_values): - assert 0 == subprocess.call( - ["sudo", "snap", "set", "dcgm", f"{config}={new_value}"] - ), f"Failed to set snap configuration key '{config}'" + subprocess.run( + f"sudo snap set dcgm {config}={new_value}".split(), check=True + ), f"Failed to set {config}" # restart the service to apply the new configuration for service in services: - subprocess.run(["sudo", "snap", "restart", service]) + subprocess.run(f"sudo snap restart {service}".split(), check=True) + + sleep(5) for service, port in zip(services, new_values): - assert 0 == subprocess.call( - ["sudo", "lsof", "-i", f":{port.lstrip(':')}"] + subprocess.run( + f"sudo lsof -i :{port.lstrip(':')}".split(), check=True ), f"{service} port is not listening" From 9774fdeb9d5652d590613c88d45f2deb153ab2b0 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 15:08:45 -0400 Subject: [PATCH 24/36] Add func test to the CI --- .github/workflows/check.yaml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 5bc07fd..729c1d4 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -31,10 +31,13 @@ jobs: run: | sudo apt update sudo apt install -y yamllint + pipx install tox - 
name: Lint yaml files - run: | - yamllint .yamllint snap/snapcraft.yaml + run: yamllint .yamllint snap/snapcraft.yaml + + - name: Lint tests + run: tox -e lint build: runs-on: ubuntu-22.04 @@ -45,3 +48,15 @@ jobs: - name: Verify snap builds successfully uses: snapcore/action-build@v1 + + func: + needs: + - lint + - build + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + + - name: Run tests + run: tox -e func From 8738bc777505fdb1a9497f19bcc52021993235bd Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 15:14:45 -0400 Subject: [PATCH 25/36] Fix check.yaml --- .github/workflows/check.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 729c1d4..ec3c6cb 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -57,6 +57,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install dependencies - + run: | + pipx install tox - name: Run tests run: tox -e func From bea3f7113cdd7f0e9d773209403742cafe9a7181 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 15:26:11 -0400 Subject: [PATCH 26/36] Fix check.yaml --- .github/workflows/check.yaml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index ec3c6cb..f4d2144 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -55,9 +55,17 @@ jobs: - build runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 + - name: Install dependencies - run: | - pipx install tox + run: pipx install tox + + - name: Download snap package(s) + uses: actions/download-artifact@v4 + with: + pattern: ${{ needs.build.outputs.artifact-prefix }}-* + merge-multiple: true + - name: Run tests run: tox -e func From 55874a9766a9823de8c9e6c2ca7dd5de24f5d229 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 
15:45:09 -0400 Subject: [PATCH 27/36] Merge build and func CI jobs --- .github/workflows/check.yaml | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index f4d2144..b1b5d4f 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -39,7 +39,9 @@ jobs: - name: Lint tests run: tox -e lint - build: + func: + needs: + - lint runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 @@ -49,15 +51,6 @@ jobs: - name: Verify snap builds successfully uses: snapcore/action-build@v1 - func: - needs: - - lint - - build - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Install dependencies run: pipx install tox From b740fa01f8b80d666bd21d65e8ed6b24d469cdeb Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 15:53:06 -0400 Subject: [PATCH 28/36] Remove redundant step for test job --- .github/workflows/check.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index b1b5d4f..67f2cb8 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -54,11 +54,5 @@ jobs: - name: Install dependencies run: pipx install tox - - name: Download snap package(s) - uses: actions/download-artifact@v4 - with: - pattern: ${{ needs.build.outputs.artifact-prefix }}-* - merge-multiple: true - - name: Run tests run: tox -e func From 3138f0a8cf3439a30ee2e506b1da7aa761d0f570 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 18:38:33 -0400 Subject: [PATCH 29/36] Revert CI to separate jobs for build and func --- .github/workflows/check.yaml | 32 ++++++++++++++++++++++++++++++-- tests/functional/conftest.py | 2 +- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 67f2cb8..0a6c2bd 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -27,6 +27,11 @@ 
jobs: with: fetch-depth: 0 # Complete git history is required to generate the version from git tags. + - name: Setup Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install dependencies run: | sudo apt update @@ -39,7 +44,7 @@ jobs: - name: Lint tests run: tox -e lint - func: + build: needs: - lint runs-on: ubuntu-22.04 @@ -51,8 +56,31 @@ jobs: - name: Verify snap builds successfully uses: snapcore/action-build@v1 + - name: Upload the built snap + uses: actions/upload-artifact@v4 + with: + name: SNAP_FILE + path: dcgm.snap + + func: + needs: + - build + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + + - name: Download the built snap + uses: actions/download-artifact@v4 + with: + name: SNAP_FILE + + - name: Setup Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install dependencies run: pipx install tox - name: Run tests - run: tox -e func + run: TEST_SNAP=$GITHUB_WORKSPACE/dcgm.snap tox -e func diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index e6b0c7a..cac57c3 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -9,7 +9,7 @@ def install_dcgm_snap(): """Install the snap and enable dcgm-exporter service for testing.""" with open("snap/snapcraft.yaml") as f: snapcraft = yaml.safe_load(f) - snap_build_name = f"{snapcraft['name']}_*_amd64.snap" + snap_build_name = f"{snapcraft['name']}_*.snap" subprocess.run( f"sudo snap install --devmode {snap_build_name}", From c2ea073c004002a8fe459f7515b56b119adaef01 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 18:49:09 -0400 Subject: [PATCH 30/36] Fix artifact path --- .github/workflows/check.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 0a6c2bd..3091d11 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -60,7 +60,7 @@ jobs: uses: 
actions/upload-artifact@v4 with: name: SNAP_FILE - path: dcgm.snap + path: dcgm*.snap func: needs: @@ -83,4 +83,4 @@ jobs: run: pipx install tox - name: Run tests - run: TEST_SNAP=$GITHUB_WORKSPACE/dcgm.snap tox -e func + run: tox -e func From 439376edb56460713c3480d5cdc3b498090c5c30 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 12 Sep 2024 20:42:23 -0400 Subject: [PATCH 31/36] Refine --- tests/functional/conftest.py | 21 +++++++++------------ tests/functional/test_snap_dcgm.py | 1 + 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index cac57c3..56b80a0 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -1,23 +1,20 @@ import subprocess import pytest -import yaml @pytest.fixture(scope="session", autouse=True) def install_dcgm_snap(): """Install the snap and enable dcgm-exporter service for testing.""" - with open("snap/snapcraft.yaml") as f: - snapcraft = yaml.safe_load(f) - snap_build_name = f"{snapcraft['name']}_*.snap" + snap_build_name = "dcgm_*.snap" - subprocess.run( - f"sudo snap install --devmode {snap_build_name}", - check=True, - capture_output=True, - shell=True, - ) + subprocess.run( + f"sudo snap install --dangerous {snap_build_name}", + check=True, + capture_output=True, + shell=True, + ) - yield + yield - subprocess.run("sudo snap remove --purge dcgm".split(), check=True) + subprocess.run("sudo snap remove --purge dcgm".split(), check=True) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 2d30f66..842d6df 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -22,6 +22,7 @@ def test_dcgm_exporter(): ) assert " active" in result.stdout.strip(), "dcgm-exporter service is not active" + # Check the exporter endpoint, will raise an exception if the endpoint is not reachable urllib.request.urlopen(endpoint) From 60bdf8514db11166821e2a1b5f97306fa3427871 Mon Sep 17 00:00:00 
2001 From: Deezzir Date: Fri, 13 Sep 2024 13:46:52 -0400 Subject: [PATCH 32/36] Refinements --- snap/hooks/configure | 2 ++ snap/local/run_dcgm_exporter.sh | 2 +- tests/functional/conftest.py | 17 +++++++++++++++++ tests/functional/requirments.txt | 4 ++-- tests/functional/test_snap_dcgm.py | 17 +++-------------- 5 files changed, 25 insertions(+), 17 deletions(-) diff --git a/snap/hooks/configure b/snap/hooks/configure index f05078a..a55fc8a 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -4,9 +4,11 @@ # so users can see available options by running # `sudo snap get dcgm`. if [ -z "$(snapctl get nv-hostengine-port)" ]; then + # Setting to empty string for the nv-hostengine binary to use the default port. (5555) snapctl set nv-hostengine-port="" fi if [ -z "$(snapctl get dcgm-exporter-address)" ]; then + # Setting to empty string for the dcgm-exporter binary to use the default address. (:9400) snapctl set dcgm-exporter-address="" fi diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/run_dcgm_exporter.sh index e852dd9..a1dd641 100644 --- a/snap/local/run_dcgm_exporter.sh +++ b/snap/local/run_dcgm_exporter.sh @@ -4,7 +4,7 @@ set -euo pipefail # Build the argument list for the dcgm-exporter command args=() -# Add the dcgm-exporter-port option if it is set. Default: “:9400” +# Add the dcgm-exporter-address option if it is set. 
Default: “:9400” dcgm_exporter_address="$(snapctl get dcgm-exporter-address)" if [ -n "$dcgm_exporter_address" ]; then diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 56b80a0..5e3e58a 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -1,6 +1,20 @@ import subprocess import pytest +from tenacity import retry, stop_after_delay, wait_fixed + + +@retry(wait=wait_fixed(5), stop=stop_after_delay(20)) +def check_dcgm_exporter_service(): + dcgm_exporter_service = "dcgm.dcgm-exporter" + + result = subprocess.run( + f"sudo snap services {dcgm_exporter_service}".split(), + check=True, + capture_output=True, + text=True, + ) + assert " active" in result.stdout.strip(), f"{dcgm_exporter_service} service is not active" @pytest.fixture(scope="session", autouse=True) @@ -15,6 +29,9 @@ def install_dcgm_snap(): shell=True, ) + subprocess.run("sudo snap start dcgm.dcgm-exporter".split(), check=True) + check_dcgm_exporter_service() + yield subprocess.run("sudo snap remove --purge dcgm".split(), check=True) diff --git a/tests/functional/requirments.txt b/tests/functional/requirments.txt index 771e68b..bd2d4ef 100644 --- a/tests/functional/requirments.txt +++ b/tests/functional/requirments.txt @@ -1,2 +1,2 @@ -tenacity==9.0.0 -pyyaml==6.0.2 +tenacity +pyyaml diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 842d6df..7d38f56 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -6,21 +6,10 @@ from tenacity import retry, stop_after_delay, wait_fixed -@retry(wait=wait_fixed(5), stop=stop_after_delay(15)) +@retry(wait=wait_fixed(5), stop=stop_after_delay(30)) def test_dcgm_exporter(): """Test of the dcgm-exporter service and its endpoint.""" endpoint = "http://localhost:9400/metrics" - dcgm_exporter_service = "dcgm.dcgm-exporter" - - subprocess.run(f"sudo snap start {dcgm_exporter_service}".split(), check=True) - - result = subprocess.run( - f"sudo snap 
services {dcgm_exporter_service}".split(), - check=True, - capture_output=True, - text=True, - ) - assert " active" in result.stdout.strip(), "dcgm-exporter service is not active" # Check the exporter endpoint, will raise an exception if the endpoint is not reachable urllib.request.urlopen(endpoint) @@ -37,7 +26,7 @@ def test_dcgm_nv_hostengine(): text=True, ) - assert " active" in service.stdout.strip(), "nv-hostengine service is not active" + assert " active" in service.stdout.strip(), f"{nv_hostengine_service} service is not active" def test_dcgmi(): @@ -48,7 +37,7 @@ def test_dcgmi(): assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" -def test_dcgm_port_configs(): +def test_dcgm_bind_configs(): """Test snap port configuratin.""" services = ["dcgm.dcgm-exporter", "dcgm.nv-hostengine"] configs = ["dcgm-exporter-address", "nv-hostengine-port"] From 6987442e8a39ab461a0b7714a50bde47ea4d1904 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 13 Sep 2024 18:52:05 -0400 Subject: [PATCH 33/36] Refine tests and comments --- .gitignore | 2 - pyproject.toml | 4 -- snap/hooks/configure | 4 +- snap/snapcraft.yaml | 2 + tests/functional/conftest.py | 15 ---- .../{requirments.txt => requirements.txt} | 1 - tests/functional/test_snap_dcgm.py | 72 +++++++++++-------- tox.ini | 6 +- 8 files changed, 47 insertions(+), 59 deletions(-) rename tests/functional/{requirments.txt => requirements.txt} (56%) diff --git a/.gitignore b/.gitignore index b6b2573..f43a27a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ # This is a template `.gitignore` file for snaps -# This file is managed by bootstack-charms-spec and should not be modified -# within individual snap repos. 
https://launchpad.net/bootstack-charms-spec # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/pyproject.toml b/pyproject.toml index 049c82c..ec1e0ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,3 @@ -# This is a template `pyproject.toml` file for snaps -# This file is managed by bootstack-charms-spec and should not be modified -# within individual snap repos. https://launchpad.net/bootstack-charms-spec - [tool.flake8] ignore = ["C901", "D100", "D101", "D102", "D103", "W503", "W504"] exclude = ['.eggs', '.git', '.tox', '.venv', '.build', 'build', 'report'] diff --git a/snap/hooks/configure b/snap/hooks/configure index a55fc8a..4478c03 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -5,10 +5,10 @@ # `sudo snap get dcgm`. if [ -z "$(snapctl get nv-hostengine-port)" ]; then # Setting to empty string for the nv-hostengine binary to use the default port. (5555) - snapctl set nv-hostengine-port="" + snapctl set nv-hostengine-port="5555" fi if [ -z "$(snapctl get dcgm-exporter-address)" ]; then # Setting to empty string for the dcgm-exporter binary to use the default address. 
(:9400) - snapctl set dcgm-exporter-address="" + snapctl set dcgm-exporter-address=":9400" fi diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index 401f7f0..31ad3fc 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -26,6 +26,8 @@ apps: - network-bind - opengl daemon: simple + # As this is a dcgm snap, not the dcgm-exporter snap, + # user might not be interested in running dcgm-exporter, so disable it by default install-mode: disable restart-condition: on-failure dcgmi: diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 5e3e58a..5c19053 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -1,20 +1,6 @@ import subprocess import pytest -from tenacity import retry, stop_after_delay, wait_fixed - - -@retry(wait=wait_fixed(5), stop=stop_after_delay(20)) -def check_dcgm_exporter_service(): - dcgm_exporter_service = "dcgm.dcgm-exporter" - - result = subprocess.run( - f"sudo snap services {dcgm_exporter_service}".split(), - check=True, - capture_output=True, - text=True, - ) - assert " active" in result.stdout.strip(), f"{dcgm_exporter_service} service is not active" @pytest.fixture(scope="session", autouse=True) @@ -30,7 +16,6 @@ def install_dcgm_snap(): ) subprocess.run("sudo snap start dcgm.dcgm-exporter".split(), check=True) - check_dcgm_exporter_service() yield diff --git a/tests/functional/requirments.txt b/tests/functional/requirements.txt similarity index 56% rename from tests/functional/requirments.txt rename to tests/functional/requirements.txt index bd2d4ef..48f8e12 100644 --- a/tests/functional/requirments.txt +++ b/tests/functional/requirements.txt @@ -1,2 +1 @@ tenacity -pyyaml diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 7d38f56..ee87c51 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,32 +1,41 @@ import json import subprocess import urllib.request -from time import sleep -from tenacity import 
retry, stop_after_delay, wait_fixed +import pytest +from tenacity import Retrying, retry, stop_after_delay, wait_fixed @retry(wait=wait_fixed(5), stop=stop_after_delay(30)) def test_dcgm_exporter(): """Test of the dcgm-exporter service and its endpoint.""" + dcgm_exporter_service = "snap.dcgm.dcgm-exporter" endpoint = "http://localhost:9400/metrics" + assert 0 == subprocess.call( + f"sudo systemctl is-active --quiet {dcgm_exporter_service}".split() + ), f"{dcgm_exporter_service} is not running" + # Check the exporter endpoint, will raise an exception if the endpoint is not reachable - urllib.request.urlopen(endpoint) + response = urllib.request.urlopen(endpoint) + + # The output of the exporter endpoint is not tested + # as in a virtual environment it will not have any GPU metrics + assert 200 == response.getcode(), "DCGM exporter endpoint returned an error" def test_dcgm_nv_hostengine(): """Check the dcgm-nv-hostengine service.""" - nv_hostengine_service = "dcgm.nv-hostengine" + nv_hostengine_service = "snap.dcgm.nv-hostengine" + nv_hostengine_port = 5555 - service = subprocess.run( - f"snap services {nv_hostengine_service}".split(), - check=True, - capture_output=True, - text=True, - ) + assert 0 == subprocess.call( + f"sudo systemctl is-active --quiet {nv_hostengine_service}".split() + ), f"{nv_hostengine_service} is not running" - assert " active" in service.stdout.strip(), f"{nv_hostengine_service} service is not active" + assert 0 == subprocess.call( + f"nc -z localhost {nv_hostengine_port}".split() + ), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}" def test_dcgmi(): @@ -34,33 +43,36 @@ def test_dcgmi(): result = subprocess.run( "dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True ) - assert "GPU ID" in result.stdout.strip(), "DCGMI is not working" + # Test if the command is working and outputs a table with the GPU ID + # The table will be empty in a virtual environment, but the command should still work + 
assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table" -def test_dcgm_bind_configs(): - """Test snap port configuratin.""" - services = ["dcgm.dcgm-exporter", "dcgm.nv-hostengine"] - configs = ["dcgm-exporter-address", "nv-hostengine-port"] - new_values = [":9466", "5666"] +bind_test_data = [ + ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"), + ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"), +] + + +@pytest.mark.parametrize("service, config, new_value", bind_test_data) +def test_dcgm_bind_config(service: str, config: str, new_value: str): + """Test snap bind configuration.""" result = subprocess.run( "sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True ) dcgm_snap_config = json.loads(result.stdout.strip()) - assert all(config in dcgm_snap_config for config in configs), "Missing snap configuration keys" + assert config in dcgm_snap_config, f"{config} is not in the snap configuration" - for config, new_value in zip(configs, new_values): - subprocess.run( - f"sudo snap set dcgm {config}={new_value}".split(), check=True - ), f"Failed to set {config}" + assert 0 == subprocess.call( + f"sudo snap set dcgm {config}={new_value}".split() + ), f"Failed to set {config} to {new_value}" # restart the service to apply the new configuration - for service in services: - subprocess.run(f"sudo snap restart {service}".split(), check=True) - - sleep(5) + subprocess.run(f"sudo snap restart {service}".split(), check=True) - for service, port in zip(services, new_values): - subprocess.run( - f"sudo lsof -i :{port.lstrip(':')}".split(), check=True - ), f"{service} port is not listening" + for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)): + with attempt: + assert 0 == subprocess.call( + f"nc -z localhost {new_value.lstrip(':')}".split() + ), f"{service} is not listening on {new_value}" diff --git a/tox.ini b/tox.ini index 455178f..fd01577 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,3 @@ -# This is a 
template `tox.ini` file for snaps -# This file is managed by bootstack-charms-spec and should not be modified -# within individual snap repos. https://launchpad.net/bootstack-charms-spec - [tox] skipsdist=True envlist = lint, unit, func @@ -39,7 +35,7 @@ commands = [testenv:func] deps = pytest - -r {toxinidir}/tests/functional/requirments.txt + -r {toxinidir}/tests/functional/requirements.txt passenv = TEST_* commands = From 2ea70fd936a53ab9ac7f9ada386f8912d48d8ba5 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 13 Sep 2024 19:03:34 -0400 Subject: [PATCH 34/36] Align check.yaml with check workflows --- .github/workflows/check.yaml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 3091d11..08d17a4 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -27,16 +27,19 @@ jobs: with: fetch-depth: 0 # Complete git history is required to generate the version from git tags. - - name: Setup Python 3.10 + - name: Set up Python 3.10 uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: "3.10" - name: Install dependencies run: | sudo apt update sudo apt install -y yamllint - pipx install tox + python -m pip install --upgrade pip + # pin tox to the current major version to avoid + # workflows breaking all at once when a new major version is released. + python -m pip install 'tox<5' - name: Lint yaml files run: yamllint .yamllint snap/snapcraft.yaml @@ -68,19 +71,23 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Complete git history is required to generate the version from git tags. 
- name: Download the built snap uses: actions/download-artifact@v4 with: name: SNAP_FILE - - name: Setup Python 3.10 + - name: Set up Python 3.10 uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: "3.10" - name: Install dependencies - run: pipx install tox + run: | + python -m pip install --upgrade pip + python -m pip install 'tox<5' - - name: Run tests + - name: Run unit tests run: tox -e func From 4059a6d37d8988cfc00bd4545c02926f663b2546 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 13 Sep 2024 19:52:04 -0400 Subject: [PATCH 35/36] Update comments for default configs --- snap/hooks/configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snap/hooks/configure b/snap/hooks/configure index 4478c03..9d5b294 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -4,11 +4,11 @@ # so users can see available options by running # `sudo snap get dcgm`. if [ -z "$(snapctl get nv-hostengine-port)" ]; then - # Setting to empty string for the nv-hostengine binary to use the default port. (5555) + # Explicitly use default bind port of nv-hostengine binary snapctl set nv-hostengine-port="5555" fi if [ -z "$(snapctl get dcgm-exporter-address)" ]; then - # Setting to empty string for the dcgm-exporter binary to use the default address. 
(:9400) + # Explicitly use default bind address of dcgm-exporter binary snapctl set dcgm-exporter-address=":9400" fi From bd6f22e2ceaf15765a8f6771c2dad108aaa97523 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Mon, 16 Sep 2024 12:33:07 -0400 Subject: [PATCH 36/36] Revert back the bind configs after the test --- tests/functional/test_snap_dcgm.py | 42 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index ee87c51..e9ee0e4 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -49,13 +49,13 @@ def test_dcgmi(): assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table" -bind_test_data = [ - ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"), - ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"), -] - - -@pytest.mark.parametrize("service, config, new_value", bind_test_data) +@pytest.mark.parametrize( + "service, config, new_value", + [ + ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"), + ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"), + ], +) def test_dcgm_bind_config(service: str, config: str, new_value: str): """Test snap bind configuration.""" result = subprocess.run( @@ -63,16 +63,24 @@ def test_dcgm_bind_config(service: str, config: str, new_value: str): ) dcgm_snap_config = json.loads(result.stdout.strip()) assert config in dcgm_snap_config, f"{config} is not in the snap configuration" + old_value = dcgm_snap_config[config] - assert 0 == subprocess.call( - f"sudo snap set dcgm {config}={new_value}".split() - ), f"Failed to set {config} to {new_value}" + def set_config_and_check(value: str): + assert 0 == subprocess.call( + f"sudo snap set dcgm {config}={value}".split() + ), f"Failed to set {config} to {new_value}" + + # restart the service to apply the configuration + subprocess.run(f"sudo snap restart {service}".split(), check=True) + + for attempt in 
Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)): + with attempt: + assert 0 == subprocess.call( + f"nc -z localhost {value.lstrip(':')}".split() + ), f"{service} is not listening on {value}" - # restart the service to apply the new configuration - subprocess.run(f"sudo snap restart {service}".split(), check=True) + # Check new config + set_config_and_check(new_value) - for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)): - with attempt: - assert 0 == subprocess.call( - f"nc -z localhost {new_value.lstrip(':')}".split() - ), f"{service} is not listening on {new_value}" + # Revert back + set_config_and_check(str(old_value))