refactor gpu configs into components
strangiato committed May 2, 2024
1 parent 7f4ff51 commit f77d214
Showing 55 changed files with 1,055 additions and 124 deletions.
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

commonAnnotations:
  argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true

resources:
- ../../../operator/overlays/stable
- ../../../instance/overlays/aws
@@ -4,8 +4,6 @@ kind: Kustomization
commonAnnotations:
  argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true

namespace: nvidia-gpu-operator

resources:
- ../../../operator/overlays/stable
- ../../../instance/overlays/default
- ../../../operator/overlays/stable
@@ -1,9 +1,5 @@
# GPU Notes

For more info please review the following:

- [Demo GPUs on OpenShift](https://github.com/redhat-na-ssa/demo-ocp-gpu)

## Instance Types

AWS GPU Types:
@@ -40,12 +36,13 @@ Time-slicing GPU can be any Nvidia type (as documented by Nvidia):
- `g3.8xlarge` - 2 x M60
- `g3.16xlarge` - 4 x M60
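As a rough sketch of how time-slicing is typically configured (following the NVIDIA documentation; the `time-sliced-*` components in this repo presumably apply a variation of this, and the `replicas` value shown is only illustrative), the device plugin configuration looks roughly like:

```
version: v1
sharing:
  timeSlicing:
    resources:
    - name: nvidia.com/gpu
      replicas: 4
```

A config like this would live in a ConfigMap, presumably the `device-plugin-config` ConfigMap defined in the base and referenced by the ClusterPolicy's `devicePlugin.config` section.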


## Links

- [Docs - AWS GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing)
- [Docs - Nvidia GPU Operator on Openshift](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/contents.html)
- [Docs - Nvidia GPU admin dashboard](https://docs.openshift.com/container-platform/4.11/monitoring/nvidia-gpu-admin-dashboard.html)
- [Docs - MIG support in OCP](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/mig-ocp.html)
- [Blog - RH Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift)
- [Blog - Red Hat Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift)
- [Demo - GPU DevSpaces](https://github.com/bkoz/devspaces)
- [GPU Operator default config map](https://gitlab.com/nvidia/kubernetes/gpu-operator/-/blob/v23.6.1/assets/state-mig-manager/0400_configmap.yaml?ref_type=tags)
5 changes: 5 additions & 0 deletions gpu-operator-certified/instance/base/cluster-policy.yaml
@@ -2,6 +2,7 @@ kind: ClusterPolicy
apiVersion: nvidia.com/v1
metadata:
  name: gpu-cluster-policy
  namespace: nvidia-gpu-operator
spec:
  operator:
    defaultRuntime: crio
@@ -50,6 +51,10 @@ spec:
    updateStrategy: RollingUpdate
    rollingUpdate:
      maxUnavailable: '1'
    tolerations:
      - effect: NoSchedule
        key: nvidia-gpu-only
        operator: Exists
  devicePlugin:
    enabled: true
    config:
@@ -2,4 +2,5 @@ apiVersion: v1
kind: ConfigMap
metadata:
  name: device-plugin-config
  namespace: nvidia-gpu-operator
data: {}
5 changes: 3 additions & 2 deletions gpu-operator-certified/instance/base/kustomization.yaml
@@ -1,8 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: nvidia-gpu-operator

resources:
- cluster-policy.yaml
- device-plugin-config.yaml

components:
- ../components/monitoring-dashboard
28 changes: 28 additions & 0 deletions gpu-operator-certified/instance/components/README.md
@@ -0,0 +1,28 @@
# NVIDIA GPU Operator Components

The included components capture common patching patterns applied on top of the default NVIDIA GPU Operator instance to configure additional features. Components are composable patches that can be added in an overlay on top of a base.

This repo currently contains the following components:

* [aws-gpu-machineset](aws-gpu-machineset)
* [mig-mixed](mig-mixed)
* [mig-single](mig-single)
* [monitoring-dashboard](monitoring-dashboard)
* [time-sliced](time-sliced)
* [time-sliced-2](time-sliced-2)
* [time-sliced-4](time-sliced-4)

## Usage

Components can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:

```
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../base
components:
- ../../components/monitoring-dashboard
```
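Multiple components can be combined in a single overlay; Kustomize applies them in the order listed. As an illustrative (not prescriptive) example, an overlay that provisions AWS GPU machines and enables time-slicing could compose two of the components above:

```
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../base
components:
- ../../components/aws-gpu-machineset
- ../../components/time-sliced
```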
@@ -0,0 +1,24 @@
# aws-gpu-machineset

## Purpose

This component is designed to set up a MachineSet with GPUs on an AWS-based OpenShift cluster.

It triggers a job that creates a GPU MachineSet by cloning one of the cluster's existing worker MachineSets.

This component has been tested on AWS-based OpenShift clusters provisioned by demo.redhat.com.

## Usage

This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:

```
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../base
components:
- ../../components/aws-gpu-machineset
```
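After the job runs, the cloned MachineSet (created with 0 replicas) and the MachineAutoscalers it configures can be checked with `oc -n openshift-machine-api get machinesets.machine.openshift.io` and `oc -n openshift-machine-api get machineautoscalers`. Based on the patches applied by `job.sh`, the new MachineSet's template should end up with roughly the following labels, taint, and instance type (a sketch of only the fields the job touches; `g4dn.4xlarge` is the default instance type):

```
spec:
  template:
    spec:
      metadata:
        labels:
          node-role.kubernetes.io/gpu: ""
          cluster-api/accelerator: nvidia-gpu
      taints:
      - key: nvidia-gpu-only
        value: ""
        effect: NoSchedule
      providerSpec:
        value:
          instanceType: g4dn.4xlarge
```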
110 changes: 110 additions & 0 deletions gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh
@@ -0,0 +1,110 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091
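# Creates a GPU MachineSet on AWS clusters by cloning an existing worker
# MachineSet, then configures MachineAutoscalers for the cluster's MachineSets.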

ocp_aws_cluster(){
TARGET_NS=kube-system
OBJ=secret/aws-creds
echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace"
oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1
echo "AWS cluster detected"
}

ocp_aws_create_gpu_machineset(){
# https://aws.amazon.com/ec2/instance-types/g4
# single gpu: g4dn.{2,4,8,16}xlarge
# multi gpu: g4dn.12xlarge
# practical: g4ad.4xlarge
# a100 (MIG): p4d.24xlarge
# h100 (MIG): p5.48xlarge

# https://aws.amazon.com/ec2/instance-types/dl1
# 8 x gaudi: dl1.24xlarge

INSTANCE_TYPE=${1:-g4dn.4xlarge}

ocp_aws_clone_machineset "${INSTANCE_TYPE}"

MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)

echo "Patching: ${MACHINE_SET_TYPE}"

# cosmetic
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'

# taint nodes for gpu-only workloads
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'

# should use the default profile
# oc -n openshift-machine-api \
# patch "${MACHINE_SET_TYPE}" \
# --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'

# should help auto provisioner
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'

oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'

oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
}

ocp_aws_clone_machineset(){
[ -z "${1}" ] && \
echo "
usage: ocp_aws_clone_machineset < instance type, default g4dn.4xlarge >
"

INSTANCE_TYPE=${1:-g4dn.4xlarge}
MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)

# check for an existing instance machine set
if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then
echo "Exists: machineset - ${INSTANCE_TYPE}"
else
echo "Creating: machineset - ${INSTANCE_TYPE}"
oc -n openshift-machine-api \
get "${MACHINE_SET}" -o yaml | \
sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
/name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g
s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
s/replicas.*/replicas: 0/' | \
oc apply -f -
fi
}

ocp_create_machineset_autoscale(){
MACHINE_MIN=${1:-0}
MACHINE_MAX=${2:-4}
MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}

for set in ${MACHINE_SETS}
do
cat << YAML | oc apply -f -
apiVersion: "autoscaling.openshift.io/v1beta1"
kind: "MachineAutoscaler"
metadata:
name: "${set}"
namespace: "openshift-machine-api"
spec:
minReplicas: ${MACHINE_MIN}
maxReplicas: ${MACHINE_MAX}
scaleTargetRef:
apiVersion: machine.openshift.io/v1beta1
kind: MachineSet
name: "${set}"
YAML
done
}

# Only run on AWS clusters; create the GPU MachineSet, then enable autoscaling for the cluster's MachineSets.
ocp_aws_cluster || exit 0
ocp_aws_create_gpu_machineset
ocp_create_machineset_autoscale
@@ -0,0 +1,88 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: job-aws-gpu-machineset
  namespace: nvidia-gpu-operator
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: job-aws-gpu-machineset
rules:
  - apiGroups:
      - machine.openshift.io
    resources:
      - machinesets
    verbs:
      - '*'
  - apiGroups:
      - autoscaling.openshift.io
    resources:
      - machineautoscalers
    verbs:
      - '*'
  - apiGroups:
      - ''
    resources:
      - secrets
    resourceNames:
      - aws-creds
    verbs:
      - get
      - list
  # - nonResourceURLs:
  #     - '*'
  #   verbs:
  #     - '*'
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: job-aws-gpu-machineset
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: job-aws-gpu-machineset
subjects:
  - kind: ServiceAccount
    name: job-aws-gpu-machineset
    namespace: nvidia-gpu-operator
---
apiVersion: batch/v1
kind: Job
metadata:
  generateName: job-aws-gpu-machineset-
  name: job-aws-gpu-machineset
  namespace: nvidia-gpu-operator
  annotations:
    argocd.argoproj.io/hook: Sync
    # argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  template:
    spec:
      containers:
        - name: job-aws-gpu-machineset
          # image: image-registry.openshift-image-registry.svc:5000/openshift/tools:latest
          image: registry.redhat.io/openshift4/ose-cli
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          command:
            - /bin/bash
            - -c
            - /scripts/job.sh
          volumeMounts:
            - name: scripts
              mountPath: /scripts
      volumes:
        - name: scripts
          configMap:
            name: job-aws-gpu-machineset
            defaultMode: 0755
      restartPolicy: Never
      terminationGracePeriodSeconds: 30
      serviceAccount: job-aws-gpu-machineset
      serviceAccountName: job-aws-gpu-machineset
@@ -0,0 +1,15 @@
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

resources:
  # - ../../../../../../scripts/library
  - job.yaml

generatorOptions:
  disableNameSuffixHash: true

configMapGenerator:
  - name: job-aws-gpu-machineset
    namespace: nvidia-gpu-operator
    files:
      - job.sh
23 changes: 23 additions & 0 deletions gpu-operator-certified/instance/components/mig-mixed/README.md
@@ -0,0 +1,23 @@
# mig-mixed

## Purpose

This component is designed to enable MIG in mixed mode. The mixed MIG strategy should be used when not all GPUs on a node have MIG enabled.

To learn more about MIG, please refer to the official [docs](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html).

## Usage

This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:

```
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../base
components:
- ../../components/mig-mixed
```
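With the mixed strategy, MIG-enabled GPUs advertise profile-specific resources (for example `nvidia.com/mig-1g.5gb`) rather than the generic `nvidia.com/gpu`, and the MIG geometry for a node is selected via its `nvidia.com/mig.config` label. As a rough sketch only (the profile name and container image are illustrative and depend on your GPU model and the MIG configuration applied), a pod requesting a single MIG slice might look like:

```
apiVersion: v1
kind: Pod
metadata:
  name: mig-example
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvcr.io/nvidia/cuda:12.2.0-base-ubi8
    command: ["nvidia-smi", "-L"]
    resources:
      limits:
        nvidia.com/mig-1g.5gb: 1
```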
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

patches:
  - path: patch-gpu-cluster-policy.yaml
    target:
      kind: ClusterPolicy
@@ -0,0 +1,7 @@
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  mig:
    strategy: mixed