Skip to content

Commit

Permalink
Update k3s and nvidia install (#90)
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfaulhaber authored Sep 27, 2024
1 parent 0ae4564 commit 9cbafed
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 125 deletions.
123 changes: 0 additions & 123 deletions deploy/bin/install-k3s-g4.sh

This file was deleted.

85 changes: 85 additions & 0 deletions deploy/bin/install-k3s-nvidia.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/bash

# Install k3s and configure GPU support
# Tested on an AWS EC2 G4 instance using the following AMI:
# Deep Learning OSS Nvidia Driver AMI GPU PyTorch 2.3.0 (Ubuntu 20.04) 20240825

# This guide was more helpful than others fwiw:
# https://support.tools/post/nvidia-gpus-on-k3s/

# Exit on the first failing command (-e) and echo each command before it
# runs (-x). Note that pipefail is not enabled, so only the last stage of a
# pipeline determines whether the pipeline counts as failed.
set -ex

#######################################
# Ensure the NVIDIA driver and the NVIDIA container runtime are present,
# installing whichever one is missing through apt.
#
# Presence is detected by looking for the `nvidia-smi` and
# `nvidia-container-runtime` commands.
#
# Globals:   none (all working variables are local)
# Arguments: none
# Outputs:   progress messages on stdout; a diagnostic on stderr when a
#            download from the NVIDIA repository fails
# Returns:   0 on success, 1 if a download or an apt step fails
#######################################
check_nvidia_drivers_and_container_runtime() {
  local nvidia_version distribution gpg_key repo_list
  local -r repo_url="https://nvidia.github.io/nvidia-docker"

  # Retrieve the major version of the loaded driver module, or default to 525
  nvidia_version=$(modinfo nvidia 2>/dev/null | awk '/^version:/ {split($2, a, "."); print a[1]}')
  nvidia_version=${nvidia_version:-525}

  if ! command -v nvidia-smi &> /dev/null; then
    echo "NVIDIA drivers are not installed (nvidia-smi not found). Installing..."
    # Run the two apt steps separately: in `a && b`, a failure of `a` is
    # ignored by `set -e`, which would silently skip the driver install.
    sudo apt update || return 1
    sudo apt install -y "nvidia-headless-${nvidia_version}-server" || return 1
  else
    echo "NVIDIA drivers for version $nvidia_version are installed."
  fi

  # Check if nvidia container runtime is already installed.
  if ! command -v nvidia-container-runtime &> /dev/null; then
    echo " NVIDIA container runtime is not installed. Installing..."
    # Get distribution information, e.g. "ubuntu20.04"
    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")

    # Add NVIDIA Docker repository
    echo "Adding NVIDIA Docker repository..."
    # Download both files before touching the system. --fail makes curl
    # return non-zero on an HTTP error instead of emitting the error page,
    # so a bad response is never fed into the apt keyring or sources list.
    gpg_key=$(curl -fsSL "$repo_url/gpgkey") || {
      echo "ERROR: failed to download the NVIDIA repository GPG key" >&2
      return 1
    }
    repo_list=$(curl -fsSL "$repo_url/$distribution/nvidia-docker.list") || {
      echo "ERROR: failed to download the NVIDIA repository list for $distribution" >&2
      return 1
    }
    printf '%s\n' "$gpg_key" | sudo apt-key add -
    printf '%s\n' "$repo_list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

    sudo apt update -y || return 1
    sudo apt install -y nvidia-container-runtime || return 1
  else
    echo " NVIDIA container runtime is installed."
  fi
}

# The NVIDIA operator is distributed as a Helm chart, so bootstrap Helm
# first when it is not already on the PATH.
if ! command -v helm > /dev/null 2>&1; then
  echo "Helm not found, installing Helm..."
  curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
fi

# kubectl as bundled with k3s. Kept as a plain string on purpose: it is
# expanded unquoted where it is used so that it splits into the `k3s`
# command and its `kubectl` subcommand.
K="k3s kubectl"
# Directory containing this script, used to locate the sibling install script.
SCRIPT_DIR=$(dirname "$0")

check_nvidia_drivers_and_container_runtime

# Install k3s using our standard script. Quoted so that a checkout path
# containing whitespace does not get split into several words.
"$SCRIPT_DIR/install-k3s.sh"

# Register the NVIDIA GPU Operator chart repository and refresh the local index
helm repo add nvidia https://nvidia.github.io/gpu-operator
helm repo update

# Pick the GPU Operator version to install: the second field on the second
# line of the search output (the first line is presumably the column header).
LATEST_VERSION=$(
  helm search repo nvidia/gpu-operator --devel --versions \
    | awk 'NR == 2 {print $2}'
)

# Deploy the operator chart into its own namespace and wait for it to come up
echo "Installing NVIDIA GPU Operator version $LATEST_VERSION..."
gpu_operator_install_args=(
  --wait
  --generate-name
  -n gpu-operator
  --create-namespace
  --version "$LATEST_VERSION"
)
helm install "${gpu_operator_install_args[@]}" nvidia/gpu-operator

echo "NVIDIA GPU Operator installation completed."

# Verify that we actually added GPU capacity to the node.
# $K is left unquoted on purpose so that it splits into `k3s kubectl`.
capacity=$($K get $($K get nodes -o name) -o=jsonpath='{.status.capacity.nvidia\.com/gpu}')
# Any positive GPU count means success: a node with several GPUs reports a
# capacity greater than 1, which must not be treated as "no GPU".
if [[ "$capacity" =~ ^[1-9][0-9]*$ ]]; then
  echo "GPU capacity successfully added"
else
  echo "WARNING: No GPU capacity on node after install!!"
fi

# In addition, you can check that the NVIDIA device plugin pod is running.
# The GPU Operator is installed into the `gpu-operator` namespace above, so
# look there rather than in `kube-system`:
# k3s kubectl get pods -n gpu-operator
5 changes: 3 additions & 2 deletions deploy/bin/install-k3s.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ sudo apt update && sudo apt upgrade -y

# Install k3s
echo "Installing k3s..."
curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.28.2+k3s1 K3S_KUBECONFIG_MODE="644" sh -
curl -sfL https://get.k3s.io | K3S_KUBECONFIG_MODE="644" sh -s - --disable=traefik

check_k3s_is_running() {
local TIMEOUT=30 # Maximum wait time of 30 seconds
Expand All @@ -37,4 +37,5 @@ fi

# Set up kubeconfig for the current user
mkdir -p ~/.kube
cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
chmod 600 ~/.kube/config

0 comments on commit 9cbafed

Please sign in to comment.