Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try to install NVIDIA driver if not present in the machine #328

Merged
merged 14 commits into from
Oct 9, 2024
67 changes: 67 additions & 0 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ def __init__(self, tool: HWTool, path: Path):
self.message = f"Tool: {tool} path: {path} size is zero"


class ResourceInstallationError(Exception):
    """Exception raised when a hardware tool installation fails.

    Attributes:
        tool: The tool whose installation failed.
        message: Human-readable description; also used as the exception text.
    """

    def __init__(self, tool: HWTool):
        """Init.

        Args:
            tool: The hardware tool that could not be installed.
        """
        # Keep the failing tool and the message reachable for callers that catch the error.
        self.tool = tool
        self.message = f"Installation failed for tool: {tool}"
        super().__init__(self.message)


def copy_to_snap_common_bin(source: Path, filename: str) -> None:
"""Copy file to $SNAP_COMMON/bin folder."""
Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True)
Expand Down Expand Up @@ -223,11 +231,70 @@ class DCGMExporterStrategy(SnapStrategy):
"""DCGM strategy class."""

_name = HWTool.DCGM
snap_common: Path = Path("/var/snap/dcgm/common/")

def __init__(self, channel: str) -> None:
    """Store the snap channel on the strategy.

    Args:
        channel: Snap channel name, kept as ``self.channel``.
    """
    self.channel = channel

def install(self) -> None:
    """Install the snap from a channel, then the NVIDIA driver and nvidia-utils."""
    super().install()
    # Keep this order: the utils step reads the package list file that the
    # driver step asks ubuntu-drivers to produce.
    for step in (self._install_nvidia_drivers, self._install_nvidia_utils):
        step()
jneo8 marked this conversation as resolved.
Show resolved Hide resolved

def _install_nvidia_drivers(self) -> None:
    """Install the NVIDIA driver if not present.

    Does nothing when ``/proc/driver/nvidia/version`` exists. Otherwise installs
    ``ubuntu-drivers-common`` and runs ``ubuntu-drivers install --gpgpu``, passing
    ``--package-list`` so the installed packages are listed in
    ``nvidia-installed-pkgs.txt`` under ``snap_common``.

    Raises:
        ResourceInstallationError: If ubuntu-drivers reports that no driver was found.
        subprocess.CalledProcessError: If the ubuntu-drivers command exits non-zero.
    """
    if Path("/proc/driver/nvidia/version").exists():
        logger.info("NVIDIA driver already installed in the machine")
        return

    logger.info("Installing NVIDIA driver")
    apt.add_package("ubuntu-drivers-common", update_cache=True)

    # Recording which packages were installed lets the driver version be looked up later.
    # The argument vector is built explicitly (not by splitting a string) so the
    # package-list path is always passed as a single argument.
    cmd = [
        "ubuntu-drivers",
        "install",
        "--gpgpu",
        "--package-list",
        f"{self.snap_common}/nvidia-installed-pkgs.txt",
    ]
    try:
        # This can be changed to check_call, without relying on the output, once this is fixed:
        # https://github.com/canonical/ubuntu-drivers-common/issues/106
        result = subprocess.check_output(cmd, text=True)
    except subprocess.CalledProcessError as err:
        logger.error("Failed to install the NVIDIA driver: %s", err)
        raise

    if "No drivers found for installation" in result:
        logger.warning(
            "No drivers for the NVIDIA GPU were found. Manual installation is necessary"
        )
        raise ResourceInstallationError(self._name)

    logger.info("NVIDIA driver installed")

def _install_nvidia_utils(self) -> None:
    """Install the nvidia utils to be able to use nvidia-smi.

    The driver version is read from the first line of ``nvidia-installed-pkgs.txt``
    under ``snap_common`` and the matching ``nvidia-utils-<version>-server`` package
    is installed. Nothing is installed when that file is missing, is empty, or its
    first entry does not carry a numeric version as the second-to-last
    dash-separated field.
    """
    nvidia_pkg = Path(self.snap_common / "nvidia-installed-pkgs.txt")
    if not nvidia_pkg.exists():
        logger.debug("nvidia-utils not installed by the charm")
        return

    installed_pkgs = nvidia_pkg.read_text(encoding="utf-8").splitlines()
    if not installed_pkgs:
        # An empty list file gives no driver name to derive the version from.
        logger.warning("%s is empty and nvidia-utils was not installed", nvidia_pkg)
        return

    installed_pkg = installed_pkgs[0]
    logger.debug("installed driver from hardware-observer: %s", installed_pkg)

    # e.g. "nvidia-headless-no-dkms-535-server" -> "535"; names with fewer than two
    # dash-separated fields fall through to the unexpected-format branch below.
    name_parts = installed_pkg.split("-")
    nvidia_version = name_parts[-2] if len(name_parts) >= 2 else ""

    if not nvidia_version.isdigit():
        logger.warning(
            "driver %s is an unexpected format and nvidia-utils was not installed",
            installed_pkg,
        )
        return

    pkg = f"nvidia-utils-{nvidia_version}-server"
    apt.add_package(pkg, update_cache=True)
    logger.info("installed %s", pkg)


class SmartCtlExporterStrategy(SnapStrategy):
"""SmartCtl strategy class."""
Expand Down
23 changes: 23 additions & 0 deletions src/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

import os
import shutil
import subprocess
from abc import ABC, abstractmethod
from logging import getLogger
from pathlib import Path
from subprocess import CalledProcessError, TimeoutExpired
from time import sleep
from typing import Any, Dict, Optional, Set, Tuple

Expand Down Expand Up @@ -435,6 +437,27 @@ def hw_tools() -> Set[HWTool]:
"""Return hardware tools to watch."""
return {HWTool.DCGM}

def validate_exporter_configs(self) -> Tuple[bool, str]:
    """Validate if the DCGM exporter is able to run.

    Runs the parent validation first and returns its result unchanged when it fails.
    Otherwise runs ``nvidia-smi`` (60 second timeout) to check it is working.

    Returns:
        A ``(valid, message)`` tuple: the parent's result when ``nvidia-smi`` succeeds,
        or ``False`` with an explanatory message when it is missing, exits non-zero
        or times out.
    """
    valid, msg = super().validate_exporter_configs()
    if not valid:
        return valid, msg

    try:
        subprocess.check_call("nvidia-smi", timeout=60)
    except (FileNotFoundError, CalledProcessError, TimeoutExpired) as e:
        # TimeoutExpired is raised when the 60 second timeout elapses; treat a hung
        # nvidia-smi like any other failure instead of letting it propagate.
        logger.error(e)
        logger.warning(
            "nvidia-smi is not working. It's necessary to manually remove and install "
            "a different NVIDIA driver until nvidia-smi is working. See the docs for more "
            "details: https://ubuntu.com/server/docs/nvidia-drivers-installation"
        )
        return (
            False,
            "Failed to communicate with NVIDIA driver. Manual intervention is required.",
        )

    return valid, msg


class SmartCtlExporter(SnapExporter):
"""A class representing the smartctl exporter and the metric endpoints."""
Expand Down
127 changes: 127 additions & 0 deletions tests/unit/test_hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@
from config import SNAP_COMMON, TOOLS_DIR, TPR_RESOURCES, HWTool, StorageVendor, SystemVendor
from hw_tools import (
APTStrategyABC,
DCGMExporterStrategy,
HWToolHelper,
IPMIDCMIStrategy,
IPMISELStrategy,
IPMISENSORStrategy,
PercCLIStrategy,
ResourceChecksumError,
ResourceFileSizeZeroError,
ResourceInstallationError,
SAS2IRCUStrategy,
SAS3IRCUStrategy,
SnapStrategy,
Expand Down Expand Up @@ -1143,6 +1145,131 @@ def test_snap_strategy_check(snap_exporter, mock_snap_lib, services, expected):
assert snap_exporter.check() is expected


@pytest.fixture
def dcgm_exporter_strategy(mock_snap_lib):
    """Provide a DCGMExporterStrategy built for the "latest/stable" channel.

    Requests the ``mock_snap_lib`` fixture so it is active while the strategy is used.
    """
    return DCGMExporterStrategy("latest/stable")


@mock.patch("hw_tools.DCGMExporterStrategy._install_nvidia_drivers")
@mock.patch("hw_tools.DCGMExporterStrategy._install_nvidia_utils")
def test_dcgm_exporter_strategy_install(
    mock_install_nvidia_utils, mock_install_nvidia_drivers, dcgm_exporter_strategy
):
    """install() must run the driver step and the utils step exactly once each."""
    # mock.patch decorators are applied bottom-up, so the patch closest to the
    # function (_install_nvidia_utils) supplies the first argument.
    dcgm_exporter_strategy.install()
    mock_install_nvidia_drivers.assert_called_once()
    mock_install_nvidia_utils.assert_called_once()


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.subprocess.check_output")
@mock.patch("hw_tools.Path")
def test_dcgm_install_nvidia_drivers_success(
    mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy
):
    """With no driver version file, ubuntu-drivers-common is added and the command runs."""
    # Any Path(...) built inside hw_tools reports that the file does not exist.
    mock_path.return_value.exists.return_value = False

    dcgm_exporter_strategy._install_nvidia_drivers()

    mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True)
    mock_subprocess.assert_called_once()


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.subprocess.check_output")
@mock.patch("hw_tools.Path")
def test_dcgm_install_nvidia_drivers_already_installed(
    mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy
):
    """With the driver version file present, nothing is installed or executed."""
    # Any Path(...) built inside hw_tools reports that the file exists.
    mock_path.return_value.exists.return_value = True

    dcgm_exporter_strategy._install_nvidia_drivers()

    mock_add_package.assert_not_called()
    mock_subprocess.assert_not_called()


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.subprocess.check_output")
@mock.patch("hw_tools.Path")
def test_dcgm_install_nvidia_drivers_subprocess_exception(
    mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy
):
    """A failing ubuntu-drivers command propagates CalledProcessError to the caller."""
    mock_path.return_value.exists.return_value = False
    mock_subprocess.side_effect = subprocess.CalledProcessError(returncode=1, cmd=[])

    with pytest.raises(subprocess.CalledProcessError):
        dcgm_exporter_strategy._install_nvidia_drivers()

    # The helper package is added before the failing command is attempted.
    mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True)


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.subprocess.check_output")
@mock.patch("hw_tools.Path")
def test_dcgm_install_nvidia_drivers_no_drivers_found(
    mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy
):
    """Output reporting that no drivers were found raises ResourceInstallationError."""
    mock_path.return_value.exists.return_value = False
    # The command "succeeds" but its output says there was nothing to install.
    mock_subprocess.return_value = "No drivers found for installation"

    with pytest.raises(ResourceInstallationError):
        dcgm_exporter_strategy._install_nvidia_drivers()

    mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True)


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.Path")
def test_install_nvidia_utils_driver_installed_from_charm(
    mock_path, mock_add_package, dcgm_exporter_strategy
):
    """The utils package version is derived from the first listed driver package."""
    pkg_list_file = mock_path.return_value
    pkg_list_file.exists.return_value = True
    pkg_list_file.read_text.return_value = (
        "nvidia-headless-no-dkms-535-server\nlibnvidia-cfg1-535-server"
    )

    dcgm_exporter_strategy._install_nvidia_utils()

    mock_add_package.assert_called_with("nvidia-utils-535-server", update_cache=True)


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.Path")
def test_install_nvidia_utils_driver_not_installed_from_charm(
    mock_path, mock_add_package, dcgm_exporter_strategy
):
    """Without a package list file, no utils package is installed."""
    mock_path.return_value.exists.return_value = False

    dcgm_exporter_strategy._install_nvidia_utils()

    mock_add_package.assert_not_called()


@mock.patch("hw_tools.apt.add_package")
@mock.patch("hw_tools.Path")
def test_install_nvidia_utils_driver_unexpected_format(
    mock_path, mock_add_package, dcgm_exporter_strategy
):
    """A driver package name without a numeric version installs nothing."""
    pkg_list_file = mock_path.return_value
    pkg_list_file.exists.return_value = True
    # The second-to-last dash-separated field ("version") is not a number.
    pkg_list_file.read_text.return_value = "nvidia-my-version-server"

    dcgm_exporter_strategy._install_nvidia_utils()

    mock_add_package.assert_not_called()


@mock.patch("hw_tools.Path.unlink")
@mock.patch("hw_tools.Path.exists")
@mock.patch("hw_tools.shutil")
Expand Down
25 changes: 24 additions & 1 deletion tests/unit/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,10 +722,14 @@ def setUp(self) -> None:
"""Set up harness for each test case."""
snap_lib_patcher = mock.patch.object(service, "snap")
shutil_lib_patcher = mock.patch.object(service, "shutil")
subprocess = mock.patch.object(service, "subprocess")

self.mock_snap = snap_lib_patcher.start()
self.mock_shutil = shutil_lib_patcher.start()
self.mock_subprocess = subprocess.start()
self.addCleanup(snap_lib_patcher.stop)
self.addCleanup(shutil_lib_patcher.stop)
self.addCleanup(subprocess.stop)

search_path = pathlib.Path(f"{__file__}/../../..").resolve()
self.mock_config = {
Expand Down Expand Up @@ -775,6 +779,26 @@ def test_install_metrics_copy_fail(self):
self.exporter.snap_client.restart.assert_not_called()
self.assertFalse(exporter_install_ok)

def test_validate_exporter_configs_success(self):
    """Validation succeeds when the mocked nvidia-smi call does not raise."""
    is_valid, message = self.exporter.validate_exporter_configs()

    self.assertTrue(is_valid)
    self.assertEqual(message, "Exporter config is valid.")

def test_validate_exporter_configs_fails(self):
    """A missing nvidia-smi binary makes validation fail with the driver message."""
    self.mock_subprocess.check_call.side_effect = FileNotFoundError

    is_valid, message = self.exporter.validate_exporter_configs()

    self.assertFalse(is_valid)
    self.assertEqual(
        message, "Failed to communicate with NVIDIA driver. Manual intervention is required."
    )

@mock.patch.object(service.BaseExporter, "validate_exporter_configs")
def test_validate_exporter_configs_fails_parent(self, mock_parent_validate):
    """A failure reported by the parent validation is returned unchanged."""
    mock_parent_validate.return_value = (False, "Invalid config: exporter's port")

    is_valid, message = self.exporter.validate_exporter_configs()

    self.assertFalse(is_valid)
    self.assertEqual(message, "Invalid config: exporter's port")


class TestWriteToFile(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -956,7 +980,6 @@ def test_smartctl_exporter_configure(mock_set, mock_install, result, expected_re
"exporter-log-level": "info",
"smartctl-exporter-snap-channel": "latest/stable",
}

mock_set.return_value = result
mock_install.return_value = result
exporter = service.SmartCtlExporter(mock_config)
Expand Down
Loading