From 0d9c167a7f11be382c5c7601d020022e1f423271 Mon Sep 17 00:00:00 2001 From: Trevor Royer Date: Thu, 2 May 2024 17:18:56 -0600 Subject: [PATCH] Gpu Refactor (#298) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add chart folders to gitignore * update validation to handle --enable-helm * refactor gpu configs into components * update wordlist * skip shellcheck --------- Co-authored-by: Alexis de Talhouƫt --- .gitignore | 3 +- .wordlist-md | 3 + .../aggregate/overlays/aws/kustomization.yaml | 9 ++ .../overlays/default/kustomization.yaml | 4 +- .../instance/{README.md => INFO.md} | 7 +- .../instance/base/cluster-policy.yaml | 5 + .../instance/base/device-plugin-config.yaml | 1 + .../instance/base/kustomization.yaml | 5 +- .../instance/components/README.md | 28 +++++ .../components/aws-gpu-machineset/README.md | 24 ++++ .../components/aws-gpu-machineset/job.sh | 110 ++++++++++++++++++ .../components/aws-gpu-machineset/job.yaml | 88 ++++++++++++++ .../aws-gpu-machineset/kustomization.yaml | 15 +++ .../instance/components/mig-mixed/README.md | 23 ++++ .../components/mig-mixed/kustomization.yaml | 7 ++ .../mig-mixed/patch-gpu-cluster-policy.yaml | 7 ++ .../instance/components/mig-single/README.md | 23 ++++ .../components/mig-single/kustomization.yaml | 7 ++ .../mig-single/patch-gpu-cluster-policy.yaml | 7 ++ .../components/monitoring-dashboard/README.md | 26 +++++ .../monitoring-dashboard/kustomization.yaml | 15 +++ .../components/time-sliced-2/README.md | 23 ++++ .../time-sliced-2/kustomization.yaml | 11 ++ .../patch-device-plugin-config.yaml | 12 ++ .../components/time-sliced-4/README.md | 22 ++++ .../time-sliced-4/kustomization.yaml | 11 ++ .../patch-device-plugin-config.yaml | 12 ++ .../instance/components/time-sliced/README.md | 27 +++++ .../components/time-sliced/kustomization.yaml | 11 ++ .../patch-device-plugin-config.yaml | 12 ++ .../time-sliced/patch-gpu-cluster-policy.yaml | 11 ++ 
.../aws-time-sliced-2/kustomization.yaml | 9 ++ .../aws-time-sliced-4/kustomization.yaml | 9 ++ .../instance/overlays/aws/kustomization.yaml | 8 ++ .../overlays/mig-mixed/kustomization.yaml | 10 +- .../overlays/mig-single/kustomization.yaml | 10 +- .../overlays/time-sliced-2/kustomization.yaml | 8 ++ .../overlays/time-sliced-4/kustomization.yaml | 8 ++ .../time-slicing-2/kustomization.yaml | 34 ------ .../time-slicing-4/kustomization.yaml | 21 ---- .../time-slicing-8-a100/kustomization.yaml | 42 ------- .../operator/base/kustomization.yaml | 3 + .../operator/base/operator-group.yaml | 2 +- .../operator/components/README.md | 23 ++++ .../components/console-plugin-helm/README.md | 29 +++++ .../console-plugin-helm/console-plugin-job.sh | 30 +++++ .../console-plugin-job.yaml | 65 +++++++++++ .../console-plugin-helm/kustomization.yaml | 20 ++++ .../components/console-plugin/README.md | 27 +++++ .../components/console-plugin/configmap.yaml | 32 +++++ .../console-plugin/console-plugin-job.sh | 30 +++++ .../console-plugin/console-plugin-job.yaml | 68 +++++++++++ .../console-plugin/consoleplugin.yaml | 22 ++++ .../components/console-plugin/deployment.yaml | 58 +++++++++ .../console-plugin/kustomization.yaml | 18 +++ .../components/console-plugin/service.yaml | 27 +++++ .../overlays/v22.9/patch-channel.yaml | 3 + scripts/validate_manifests.sh | 25 ++-- 58 files changed, 1076 insertions(+), 134 deletions(-) create mode 100644 gpu-operator-certified/aggregate/overlays/aws/kustomization.yaml rename gpu-operator-certified/instance/{README.md => INFO.md} (87%) create mode 100644 gpu-operator-certified/instance/components/README.md create mode 100644 gpu-operator-certified/instance/components/aws-gpu-machineset/README.md create mode 100755 gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh create mode 100644 gpu-operator-certified/instance/components/aws-gpu-machineset/job.yaml create mode 100644 
gpu-operator-certified/instance/components/aws-gpu-machineset/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/mig-mixed/README.md create mode 100644 gpu-operator-certified/instance/components/mig-mixed/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/mig-mixed/patch-gpu-cluster-policy.yaml create mode 100644 gpu-operator-certified/instance/components/mig-single/README.md create mode 100644 gpu-operator-certified/instance/components/mig-single/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/mig-single/patch-gpu-cluster-policy.yaml create mode 100644 gpu-operator-certified/instance/components/monitoring-dashboard/README.md create mode 100644 gpu-operator-certified/instance/components/monitoring-dashboard/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced-2/README.md create mode 100644 gpu-operator-certified/instance/components/time-sliced-2/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced-2/patch-device-plugin-config.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced-4/README.md create mode 100644 gpu-operator-certified/instance/components/time-sliced-4/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced-4/patch-device-plugin-config.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced/README.md create mode 100644 gpu-operator-certified/instance/components/time-sliced/kustomization.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml create mode 100644 gpu-operator-certified/instance/components/time-sliced/patch-gpu-cluster-policy.yaml create mode 100644 gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml create mode 100644 gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml create mode 
100644 gpu-operator-certified/instance/overlays/aws/kustomization.yaml create mode 100644 gpu-operator-certified/instance/overlays/time-sliced-2/kustomization.yaml create mode 100644 gpu-operator-certified/instance/overlays/time-sliced-4/kustomization.yaml delete mode 100644 gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml delete mode 100644 gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml delete mode 100644 gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml create mode 100644 gpu-operator-certified/operator/components/README.md create mode 100644 gpu-operator-certified/operator/components/console-plugin-helm/README.md create mode 100755 gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.sh create mode 100644 gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.yaml create mode 100644 gpu-operator-certified/operator/components/console-plugin-helm/kustomization.yaml create mode 100644 gpu-operator-certified/operator/components/console-plugin/README.md create mode 100644 gpu-operator-certified/operator/components/console-plugin/configmap.yaml create mode 100755 gpu-operator-certified/operator/components/console-plugin/console-plugin-job.sh create mode 100644 gpu-operator-certified/operator/components/console-plugin/console-plugin-job.yaml create mode 100644 gpu-operator-certified/operator/components/console-plugin/consoleplugin.yaml create mode 100644 gpu-operator-certified/operator/components/console-plugin/deployment.yaml create mode 100644 gpu-operator-certified/operator/components/console-plugin/kustomization.yaml create mode 100644 gpu-operator-certified/operator/components/console-plugin/service.yaml diff --git a/.gitignore b/.gitignore index fdc4cdf0..c1e2d0a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ venv/ +**/charts/ .DS_Store temp /.idea/ -dictionary.dic \ No newline at end of file +dictionary.dic diff 
--git a/.wordlist-md b/.wordlist-md index 0fd8e84e..758d89bb 100644 --- a/.wordlist-md +++ b/.wordlist-md @@ -55,6 +55,7 @@ Lifecycle Logstash MTA MachineConfig +MachineSet Minio MultiClusterHub NFD @@ -177,6 +178,8 @@ letsencrypt libvirt lifecycle linux +machineset +mig microservices namespace namespaces diff --git a/gpu-operator-certified/aggregate/overlays/aws/kustomization.yaml b/gpu-operator-certified/aggregate/overlays/aws/kustomization.yaml new file mode 100644 index 00000000..432572c7 --- /dev/null +++ b/gpu-operator-certified/aggregate/overlays/aws/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonAnnotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + +resources: + - ../../../operator/overlays/stable + - ../../../instance/overlays/aws diff --git a/gpu-operator-certified/aggregate/overlays/default/kustomization.yaml b/gpu-operator-certified/aggregate/overlays/default/kustomization.yaml index 65fd18e6..713d4b98 100644 --- a/gpu-operator-certified/aggregate/overlays/default/kustomization.yaml +++ b/gpu-operator-certified/aggregate/overlays/default/kustomization.yaml @@ -4,8 +4,6 @@ kind: Kustomization commonAnnotations: argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true -namespace: nvidia-gpu-operator - resources: - - ../../../operator/overlays/stable - ../../../instance/overlays/default + - ../../../operator/overlays/stable diff --git a/gpu-operator-certified/instance/README.md b/gpu-operator-certified/instance/INFO.md similarity index 87% rename from gpu-operator-certified/instance/README.md rename to gpu-operator-certified/instance/INFO.md index e238cc20..bb5fc631 100644 --- a/gpu-operator-certified/instance/README.md +++ b/gpu-operator-certified/instance/INFO.md @@ -1,9 +1,5 @@ # GPU Notes -For more info please review the following: - -- [Demo GPUs on OpenShift](https://github.com/redhat-na-ssa/demo-ocp-gpu) - ## Instance Types AWS GPU Types: @@ -40,12 
+36,13 @@ Time-slicing GPU can be any Nvidia type (as documented by Nvidia): - `g3.8xlarge` - 2 x M60 - `g3.16xlarge` - 4 x M60 + ## Links - [Docs - AWS GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing) - [Docs - Nvidia GPU Operator on Openshift](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/contents.html) - [Docs - Nvidia GPU admin dashboard](https://docs.openshift.com/container-platform/4.11/monitoring/nvidia-gpu-admin-dashboard.html) - [Docs - MIG support in OCP](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/mig-ocp.html) -- [Blog - RH Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift) +- [Blog - Red Hat Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift) - [Demo - GPU DevSpaces](https://github.com/bkoz/devspaces) - [GPU Operator default config map](https://gitlab.com/nvidia/kubernetes/gpu-operator/-/blob/v23.6.1/assets/state-mig-manager/0400_configmap.yaml?ref_type=tags) \ No newline at end of file diff --git a/gpu-operator-certified/instance/base/cluster-policy.yaml b/gpu-operator-certified/instance/base/cluster-policy.yaml index 724b2026..48a61ee3 100644 --- a/gpu-operator-certified/instance/base/cluster-policy.yaml +++ b/gpu-operator-certified/instance/base/cluster-policy.yaml @@ -2,6 +2,7 @@ kind: ClusterPolicy apiVersion: nvidia.com/v1 metadata: name: gpu-cluster-policy + namespace: nvidia-gpu-operator spec: operator: defaultRuntime: crio @@ -50,6 +51,10 @@ spec: updateStrategy: RollingUpdate rollingUpdate: maxUnavailable: '1' + tolerations: + - effect: NoSchedule + key: nvidia-gpu-only + operator: Exists devicePlugin: enabled: true config: diff --git a/gpu-operator-certified/instance/base/device-plugin-config.yaml b/gpu-operator-certified/instance/base/device-plugin-config.yaml index 47fa37a7..3eb97a8a 100644 --- 
a/gpu-operator-certified/instance/base/device-plugin-config.yaml +++ b/gpu-operator-certified/instance/base/device-plugin-config.yaml @@ -2,4 +2,5 @@ apiVersion: v1 kind: ConfigMap metadata: name: device-plugin-config + namespace: nvidia-gpu-operator data: {} diff --git a/gpu-operator-certified/instance/base/kustomization.yaml b/gpu-operator-certified/instance/base/kustomization.yaml index eead45d1..90e9e4d5 100644 --- a/gpu-operator-certified/instance/base/kustomization.yaml +++ b/gpu-operator-certified/instance/base/kustomization.yaml @@ -1,8 +1,9 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: nvidia-gpu-operator - resources: - cluster-policy.yaml - device-plugin-config.yaml + +components: + - ../components/monitoring-dashboard diff --git a/gpu-operator-certified/instance/components/README.md b/gpu-operator-certified/instance/components/README.md new file mode 100644 index 00000000..a0d16ac5 --- /dev/null +++ b/gpu-operator-certified/instance/components/README.md @@ -0,0 +1,28 @@ +# NVIDIA GPU Operator Components + +The included components are intended to be common patching patterns used on top of the default NVIDIA GPU operator instance to configure additional features. Components are composable patches that can be added at the overlays layer on top of a base. 
+ +This repo currently contains the following components: + +* [aws-gpu-machineset](aws-gpu-machineset) +* [mig-mixed](mig-mixed) +* [mig-single](mig-single) +* [monitoring-dashboard](monitoring-dashboard) +* [time-sliced](time-sliced) +* [time-sliced-2](time-sliced-2) +* [time-sliced-4](time-sliced-4) + +## Usage + +Components can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/monitoring-dashboard +``` diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/README.md b/gpu-operator-certified/instance/components/aws-gpu-machineset/README.md new file mode 100644 index 00000000..244364ae --- /dev/null +++ b/gpu-operator-certified/instance/components/aws-gpu-machineset/README.md @@ -0,0 +1,24 @@ +# aws-gpu-machineset + +## Purpose + +This component is designed to set up a MachineSet with GPUs on an AWS-based OpenShift cluster. + +This component triggers a job that creates a MachineSet based on your current MachineSet. + +This component has been tested using AWS-based OpenShift instances provisioned by demo.redhat.com.
+ +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/aws-gpu-machineset +``` diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh b/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh new file mode 100755 index 00000000..2cd17593 --- /dev/null +++ b/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1091,SC2120 + +ocp_aws_cluster(){ + TARGET_NS=kube-system + OBJ=secret/aws-creds + echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace" + oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1 + echo "AWS cluster detected" +} + +ocp_aws_create_gpu_machineset(){ + # https://aws.amazon.com/ec2/instance-types/g4 + # single gpu: g4dn.{2,4,8,16}xlarge + # multi gpu: g4dn.12xlarge + # practical: g4ad.4xlarge + # a100 (MIG): p4d.24xlarge + # h100 (MIG): p5.48xlarge + + # https://aws.amazon.com/ec2/instance-types/dl1 + # 8 x gaudi: dl1.24xlarge + + INSTANCE_TYPE=${1:-g4dn.4xlarge} + + ocp_aws_clone_machineset "${INSTANCE_TYPE}" + + MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1) + + echo "Patching: ${MACHINE_SET_TYPE}" + + # cosmetic + oc -n openshift-machine-api \ + patch "${MACHINE_SET_TYPE}" \ + --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}' + + # taint nodes for gpu-only workloads + oc -n openshift-machine-api \ + patch "${MACHINE_SET_TYPE}" \ + --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}' + + # should use the default profile + # oc -n openshift-machine-api \ + # patch 
"${MACHINE_SET_TYPE}" \ + # --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}' + + # should help auto provisioner + oc -n openshift-machine-api \ + patch "${MACHINE_SET_TYPE}" \ + --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}' + + oc -n openshift-machine-api \ + patch "${MACHINE_SET_TYPE}" \ + --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}' + + oc -n openshift-machine-api \ + patch "${MACHINE_SET_TYPE}" \ + --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}' +} + +ocp_aws_clone_machineset(){ + [ -z "${1}" ] && \ + echo " + usage: ocp_aws_create_gpu_machineset < instance type, default g4dn.4xlarge > + " + + INSTANCE_TYPE=${1:-g4dn.4xlarge} + MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1) + + # check for an existing instance machine set + if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then + echo "Exists: machineset - ${INSTANCE_TYPE}" + else + echo "Creating: machineset - ${INSTANCE_TYPE}" + oc -n openshift-machine-api \ + get "${MACHINE_SET}" -o yaml | \ + sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g + /name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g + s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/ + s/replicas.*/replicas: 0/' | \ + oc apply -f - + fi +} + +ocp_create_machineset_autoscale(){ + MACHINE_MIN=${1:-0} + MACHINE_MAX=${2:-4} + MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )} + + for set in ${MACHINE_SETS} + do +cat << YAML | oc apply -f - +apiVersion: "autoscaling.openshift.io/v1beta1" +kind: "MachineAutoscaler" +metadata: + name: "${set}" + namespace: "openshift-machine-api" +spec: + minReplicas: 
${MACHINE_MIN} + maxReplicas: ${MACHINE_MAX} + scaleTargetRef: + apiVersion: machine.openshift.io/v1beta1 + kind: MachineSet + name: "${set}" +YAML + done +} + +ocp_aws_cluster || exit 0 +ocp_aws_create_gpu_machineset +ocp_create_machineset_autoscale diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/job.yaml b/gpu-operator-certified/instance/components/aws-gpu-machineset/job.yaml new file mode 100644 index 00000000..818e7e56 --- /dev/null +++ b/gpu-operator-certified/instance/components/aws-gpu-machineset/job.yaml @@ -0,0 +1,88 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: job-aws-gpu-machineset + namespace: nvidia-gpu-operator +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: job-aws-gpu-machineset +rules: +- apiGroups: + - machine.openshift.io + resources: + - machinesets + verbs: + - '*' +- apiGroups: + - autoscaling.openshift.io + resources: + - machineautoscalers + verbs: + - '*' +- apiGroups: + - '' + resources: + - secrets + resourceNames: + - aws-creds + verbs: + - get + - list +# - nonResourceURLs: +# - '*' +# verbs: +# - '*' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: job-aws-gpu-machineset +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: job-aws-gpu-machineset +subjects: + - kind: ServiceAccount + name: job-aws-gpu-machineset + namespace: nvidia-gpu-operator +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: job-aws-gpu-machineset- + name: job-aws-gpu-machineset + namespace: nvidia-gpu-operator + annotations: + argocd.argoproj.io/hook: Sync + # argocd.argoproj.io/hook-delete-policy: HookSucceeded +spec: + template: + spec: + containers: + - name: job-aws-gpu-machineset + # image: image-registry.openshift-image-registry.svc:5000/openshift/tools:latest + image: registry.redhat.io/openshift4/ose-cli + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: 
+ - /bin/bash + - -c + - /scripts/job.sh + volumeMounts: + - name: scripts + mountPath: /scripts + volumes: + - name: scripts + configMap: + name: job-aws-gpu-machineset + defaultMode: 0755 + restartPolicy: Never + terminationGracePeriodSeconds: 30 + serviceAccount: job-aws-gpu-machineset + serviceAccountName: job-aws-gpu-machineset diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/kustomization.yaml b/gpu-operator-certified/instance/components/aws-gpu-machineset/kustomization.yaml new file mode 100644 index 00000000..c3535d92 --- /dev/null +++ b/gpu-operator-certified/instance/components/aws-gpu-machineset/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + # - ../../../../../../scripts/library + - job.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: job-aws-gpu-machineset + namespace: nvidia-gpu-operator + files: + - job.sh diff --git a/gpu-operator-certified/instance/components/mig-mixed/README.md b/gpu-operator-certified/instance/components/mig-mixed/README.md new file mode 100644 index 00000000..9b6d35b0 --- /dev/null +++ b/gpu-operator-certified/instance/components/mig-mixed/README.md @@ -0,0 +1,23 @@ +# mig-mixed + +## Purpose + +This component is designed to enable MIG in mixed mode. The mixed MIG strategy should be utilized when not all GPUs on a node have MIG enabled.
+ +To learn more about MIG, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/mig-mixed +``` diff --git a/gpu-operator-certified/instance/components/mig-mixed/kustomization.yaml b/gpu-operator-certified/instance/components/mig-mixed/kustomization.yaml new file mode 100644 index 00000000..94641d43 --- /dev/null +++ b/gpu-operator-certified/instance/components/mig-mixed/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - path: patch-gpu-cluster-policy.yaml + target: + kind: ClusterPolicy diff --git a/gpu-operator-certified/instance/components/mig-mixed/patch-gpu-cluster-policy.yaml b/gpu-operator-certified/instance/components/mig-mixed/patch-gpu-cluster-policy.yaml new file mode 100644 index 00000000..862e2d92 --- /dev/null +++ b/gpu-operator-certified/instance/components/mig-mixed/patch-gpu-cluster-policy.yaml @@ -0,0 +1,7 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + mig: + strategy: mixed diff --git a/gpu-operator-certified/instance/components/mig-single/README.md b/gpu-operator-certified/instance/components/mig-single/README.md new file mode 100644 index 00000000..d49ee7b7 --- /dev/null +++ b/gpu-operator-certified/instance/components/mig-single/README.md @@ -0,0 +1,23 @@ +# mig-single + +## Purpose + +This component is designed to enable MIG in single mode. The single MIG strategy should be utilized when all GPUs on a node have MIG enabled.
+ +To learn more about MIG, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/mig-single +``` diff --git a/gpu-operator-certified/instance/components/mig-single/kustomization.yaml b/gpu-operator-certified/instance/components/mig-single/kustomization.yaml new file mode 100644 index 00000000..94641d43 --- /dev/null +++ b/gpu-operator-certified/instance/components/mig-single/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - path: patch-gpu-cluster-policy.yaml + target: + kind: ClusterPolicy diff --git a/gpu-operator-certified/instance/components/mig-single/patch-gpu-cluster-policy.yaml b/gpu-operator-certified/instance/components/mig-single/patch-gpu-cluster-policy.yaml new file mode 100644 index 00000000..20fd28a7 --- /dev/null +++ b/gpu-operator-certified/instance/components/mig-single/patch-gpu-cluster-policy.yaml @@ -0,0 +1,7 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + mig: + strategy: single diff --git a/gpu-operator-certified/instance/components/monitoring-dashboard/README.md b/gpu-operator-certified/instance/components/monitoring-dashboard/README.md new file mode 100644 index 00000000..2b7b265e --- /dev/null +++ b/gpu-operator-certified/instance/components/monitoring-dashboard/README.md @@ -0,0 +1,26 @@ +# monitoring-dashboard + +[NVIDIA Enable Monitoring Dashboard Docs]( +https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/enable-gpu-monitoring-dashboard.html) + +## Purpose + +This component is designed to enable the GPU monitoring dashboard.
+ +To learn more about the monitoring dashboard, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/enable-gpu-monitoring-dashboard.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/monitoring-dashboard +``` diff --git a/gpu-operator-certified/instance/components/monitoring-dashboard/kustomization.yaml b/gpu-operator-certified/instance/components/monitoring-dashboard/kustomization.yaml new file mode 100644 index 00000000..04d55e63 --- /dev/null +++ b/gpu-operator-certified/instance/components/monitoring-dashboard/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +generatorOptions: + labels: + console.openshift.io/dashboard: "true" + # optional label to enable visibility in developer perspective + console.openshift.io/odc-dashboard: "true" + disableNameSuffixHash: true + +configMapGenerator: + - name: nvidia-dcgm-exporter-dashboard + namespace: openshift-config-managed + files: + - https://github.com/NVIDIA/dcgm-exporter/raw/main/grafana/dcgm-exporter-dashboard.json diff --git a/gpu-operator-certified/instance/components/time-sliced-2/README.md b/gpu-operator-certified/instance/components/time-sliced-2/README.md new file mode 100644 index 00000000..3ef5e2bd --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced-2/README.md @@ -0,0 +1,23 @@ +# time-sliced-2 + +## Purpose + +This component is designed to enable time slicing on GPUs with two replicas of each GPU.
+ +To learn more about GPU time slicing, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/time-slicing-gpus-in-openshift.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/time-sliced-2 +``` diff --git a/gpu-operator-certified/instance/components/time-sliced-2/kustomization.yaml b/gpu-operator-certified/instance/components/time-sliced-2/kustomization.yaml new file mode 100644 index 00000000..e0447cd0 --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced-2/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +components: + - ../time-sliced + +patches: + - path: patch-device-plugin-config.yaml + target: + kind: ConfigMap + name: device-plugin-config diff --git a/gpu-operator-certified/instance/components/time-sliced-2/patch-device-plugin-config.yaml b/gpu-operator-certified/instance/components/time-sliced-2/patch-device-plugin-config.yaml new file mode 100644 index 00000000..7a620a1b --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced-2/patch-device-plugin-config.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: device-plugin-config +data: + time-sliced: |- + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 2 diff --git a/gpu-operator-certified/instance/components/time-sliced-4/README.md b/gpu-operator-certified/instance/components/time-sliced-4/README.md new file mode 100644 index 00000000..1da68aae --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced-4/README.md @@ -0,0 +1,22 @@ +# time-sliced-4 + +## Purpose +This component is designed to enable time slicing on GPUs with four replicas of each GPU.
+ +To learn more about GPU time slicing, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/time-slicing-gpus-in-openshift.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/time-sliced-4 +``` diff --git a/gpu-operator-certified/instance/components/time-sliced-4/kustomization.yaml b/gpu-operator-certified/instance/components/time-sliced-4/kustomization.yaml new file mode 100644 index 00000000..e0447cd0 --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced-4/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +components: + - ../time-sliced + +patches: + - path: patch-device-plugin-config.yaml + target: + kind: ConfigMap + name: device-plugin-config diff --git a/gpu-operator-certified/instance/components/time-sliced-4/patch-device-plugin-config.yaml b/gpu-operator-certified/instance/components/time-sliced-4/patch-device-plugin-config.yaml new file mode 100644 index 00000000..d2a6b7b5 --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced-4/patch-device-plugin-config.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: device-plugin-config +data: + time-sliced: |- + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 4 diff --git a/gpu-operator-certified/instance/components/time-sliced/README.md b/gpu-operator-certified/instance/components/time-sliced/README.md new file mode 100644 index 00000000..bce39de4 --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced/README.md @@ -0,0 +1,27 @@ +# time-sliced + +## Purpose + +This component is designed to enable time slicing on GPUs.
+ +To learn more about GPU time slicing, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/time-slicing-gpus-in-openshift.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/time-sliced +``` + +This component is intended to be used with additional configurations to set the number of replicas. + +Please refer to [time-sliced-2](../time-sliced-2) and [time-sliced-4](../time-sliced-4) for complete implementations of the time slicing configuration. diff --git a/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml b/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml new file mode 100644 index 00000000..6b9a229f --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - path: patch-gpu-cluster-policy.yaml + target: + kind: ClusterPolicy + - path: patch-device-plugin-config.yaml + target: + kind: ConfigMap + name: device-plugin-config diff --git a/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml b/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml new file mode 100644 index 00000000..29ad5128 --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: device-plugin-config +data: + no-time-sliced: |- + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 0 diff --git a/gpu-operator-certified/instance/components/time-sliced/patch-gpu-cluster-policy.yaml
b/gpu-operator-certified/instance/components/time-sliced/patch-gpu-cluster-policy.yaml new file mode 100644 index 00000000..6f0ec4a6 --- /dev/null +++ b/gpu-operator-certified/instance/components/time-sliced/patch-gpu-cluster-policy.yaml @@ -0,0 +1,11 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + devicePlugin: + config: + default: time-sliced + name: device-plugin-config + gfd: + enabled: true diff --git a/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml b/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml new file mode 100644 index 00000000..28f05f9a --- /dev/null +++ b/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/aws-gpu-machineset + - ../../components/time-sliced-2 diff --git a/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml b/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml new file mode 100644 index 00000000..6cbfe9a6 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/aws-gpu-machineset + - ../../components/time-sliced-4 diff --git a/gpu-operator-certified/instance/overlays/aws/kustomization.yaml b/gpu-operator-certified/instance/overlays/aws/kustomization.yaml new file mode 100644 index 00000000..8e6c5176 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/aws/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/aws-gpu-machineset diff --git 
a/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml b/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml index 5abb963b..48f6d6d3 100644 --- a/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml @@ -4,11 +4,5 @@ kind: Kustomization resources: - ../../base -patches: - - target: - kind: ClusterPolicy - name: gpu-cluster-policy - patch: |- - - op: add - path: /spec/mig/strategy - value: mixed +components: + - ../../components/mig-mixed diff --git a/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml b/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml index 87472ae9..4edf4bb2 100644 --- a/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml @@ -4,11 +4,5 @@ kind: Kustomization resources: - ../../base -patches: - - target: - kind: ClusterPolicy - name: gpu-cluster-policy - patch: |- - - op: add - path: /spec/mig/strategy - value: single +components: + - ../../components/mig-single diff --git a/gpu-operator-certified/instance/overlays/time-sliced-2/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-sliced-2/kustomization.yaml new file mode 100644 index 00000000..cbe83742 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/time-sliced-2/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/time-sliced-2 diff --git a/gpu-operator-certified/instance/overlays/time-sliced-4/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-sliced-4/kustomization.yaml new file mode 100644 index 00000000..24e17a20 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/time-sliced-4/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: 
Kustomization + +resources: + - ../../base + +components: + - ../../components/time-sliced-4 diff --git a/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml deleted file mode 100644 index 1ed4b944..00000000 --- a/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../base - -patches: - - target: - kind: ClusterPolicy - name: gpu-cluster-policy - patch: |- - - op: add - path: /spec/devicePlugin/config/name - value: device-plugin-config - - op: add - path: /spec/devicePlugin/config/default - value: Tesla-T4 - - op: replace - path: /spec/gfd/enabled - value: true - - target: - kind: ConfigMap - name: device-plugin-config - patch: |- - - op: add - path: /data - value: - Tesla-T4: |- - version: v1 - sharing: - timeSlicing: - resources: - - name: nvidia.com/gpu - replicas: 2 diff --git a/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml deleted file mode 100644 index 35fe72ba..00000000 --- a/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../time-slicing-2 - -patches: - - target: - kind: ConfigMap - name: device-plugin-config - patch: |- - - op: add - path: /data - value: - Tesla-T4: |- - version: v1 - sharing: - timeSlicing: - resources: - - name: nvidia.com/gpu - replicas: 4 diff --git a/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml deleted file mode 100644 index 9b9570d1..00000000 --- a/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml +++ 
/dev/null @@ -1,42 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../base - -patches: - - target: - kind: ClusterPolicy - name: gpu-cluster-policy - patch: |- - - op: add - path: /spec/devicePlugin/config/name - value: device-plugin-config - - op: add - path: /spec/devicePlugin/config/default - value: A100-SXM4-40GB - - op: replace - path: /spec/gfd/enabled - value: true - - target: - kind: ConfigMap - name: device-plugin-config - patch: |- - - op: add - path: /data - value: - A100-SXM4-40GB: |- - version: v1 - sharing: - timeSlicing: - resources: - - name: nvidia.com/gpu - replicas: 8 - - name: nvidia.com/mig-1g.5gb - replicas: 1 - - name: nvidia.com/mig-2g.10gb - replicas: 2 - - name: nvidia.com/mig-3g.20gb - replicas: 3 - - name: nvidia.com/mig-7g.40gb - replicas: 7 diff --git a/gpu-operator-certified/operator/base/kustomization.yaml b/gpu-operator-certified/operator/base/kustomization.yaml index 1e66bd5f..4b1352ac 100644 --- a/gpu-operator-certified/operator/base/kustomization.yaml +++ b/gpu-operator-certified/operator/base/kustomization.yaml @@ -5,3 +5,6 @@ resources: - namespace.yaml - operator-group.yaml - subscription.yaml + +components: + - ../components/console-plugin diff --git a/gpu-operator-certified/operator/base/operator-group.yaml b/gpu-operator-certified/operator/base/operator-group.yaml index f0ba7fd9..ea78106d 100644 --- a/gpu-operator-certified/operator/base/operator-group.yaml +++ b/gpu-operator-certified/operator/base/operator-group.yaml @@ -1,7 +1,7 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - name: gpu-operator-certified-group + name: gpu-operator-certified namespace: nvidia-gpu-operator spec: targetNamespaces: diff --git a/gpu-operator-certified/operator/components/README.md b/gpu-operator-certified/operator/components/README.md new file mode 100644 index 00000000..c0b27b49 --- /dev/null +++ b/gpu-operator-certified/operator/components/README.md @@ -0,0 +1,23 @@ +# 
NVIDIA GPU Operator Components + +The included components are intended to be common patching patterns used on top of the default NVIDIA GPU operator instance to configure additional features. Components are composable patches that can be added at the overlays layer on top of a base. + +This repo currently contains the following components: + +* [console-plugin](console-plugin) +* [console-plugin-helm](console-plugin-helm) + +## Usage + +Components can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/console-plugin +``` diff --git a/gpu-operator-certified/operator/components/console-plugin-helm/README.md b/gpu-operator-certified/operator/components/console-plugin-helm/README.md new file mode 100644 index 00000000..73d2bd20 --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin-helm/README.md @@ -0,0 +1,29 @@ +# console-plugin-helm + +## Purpose + +This component is designed to enable the NVIDIA GPU Operator console plugin. + +This component renders the upstream helm chart for the console plugin, and runs a job that enables the console-plugin. + +To learn more about the console plugin, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/enable-gpu-op-dashboard.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/console-plugin-helm +``` + +This component requires the `--enable-helm` flag when applying this resource with `kustomize build`. + +If applying this resource with ArgoCD, be sure to configure the `--enable-helm` flag with ArgoCD.
For examples, see [here](https://github.com/redhat-cop/gitops-catalog/tree/main/openshift-gitops-operator/instance/components/kustomize-build-enable-helm). diff --git a/gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.sh b/gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.sh new file mode 100755 index 00000000..e2339bcc --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +enable_console_plugin(){ + [ -z "${PLUGIN_NAME}" ] && return 1 + + echo "Attempting to enable ${PLUGIN_NAME} plugin" + echo "" + + # Create the plugins section on the object if it doesn't exist + if [ -z "$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')" ]; then + echo "Creating plugins object" + oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": [] } }' --type=merge + fi + + INSTALLED_PLUGINS=$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}') + echo "Current plugins:" + echo "${INSTALLED_PLUGINS}" + + if [[ "${INSTALLED_PLUGINS}" == *"${PLUGIN_NAME}"* ]]; then + echo "${PLUGIN_NAME} is already enabled" + else + echo "Enabling plugin: ${PLUGIN_NAME}" + oc patch consoles.operator.openshift.io cluster --type=json --patch '[{"op": "add", "path": "/spec/plugins/-", "value": "'"${PLUGIN_NAME}"'"}]' + fi + + sleep 6 + oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}' +} + +enable_console_plugin diff --git a/gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.yaml b/gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.yaml new file mode 100644 index 00000000..9a940ed5 --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin-helm/console-plugin-job.yaml @@ -0,0 +1,65 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: job-gpu-console-plugin
+--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: job-gpu-console-plugin +rules: + - apiGroups: + - operator.openshift.io + resources: + - consoles + verbs: + - get + - list + - patch + - label +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: job-gpu-console-plugin +subjects: + - kind: ServiceAccount + name: job-gpu-console-plugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: job-gpu-console-plugin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: job-gpu-console-plugin + generateName: job-gpu-console-plugin- + annotations: + argocd.argoproj.io/sync-wave: "10" +spec: + template: + spec: + containers: + - name: minion + image: registry.redhat.io/openshift4/ose-cli + env: + - name: PLUGIN_NAME + value: console-plugin-nvidia-gpu + command: + - /bin/bash + - -c + - /scripts/console-plugin-job.sh + volumeMounts: + - name: scripts + mountPath: /scripts + volumes: + - name: scripts + configMap: + name: job-gpu-console-plugin + defaultMode: 0755 + restartPolicy: Never + serviceAccount: job-gpu-console-plugin + serviceAccountName: job-gpu-console-plugin + backoffLimit: 4 diff --git a/gpu-operator-certified/operator/components/console-plugin-helm/kustomization.yaml b/gpu-operator-certified/operator/components/console-plugin-helm/kustomization.yaml new file mode 100644 index 00000000..b5f2d95d --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin-helm/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +helmCharts: + - name: console-plugin-nvidia-gpu + releaseName: console-plugin-nvidia-gpu + namespace: nvidia-gpu-operator + repo: https://rh-ecosystem-edge.github.io/console-plugin-nvidia-gpu + +resources: + - console-plugin-job.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: job-gpu-console-plugin + namespace: nvidia-gpu-operator + files: + - 
console-plugin-job.sh diff --git a/gpu-operator-certified/operator/components/console-plugin/README.md b/gpu-operator-certified/operator/components/console-plugin/README.md new file mode 100644 index 00000000..229e4bae --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/README.md @@ -0,0 +1,27 @@ +# console-plugin + +## Purpose + +This component is designed to enable the NVIDIA GPU Operator console plugin. + +This component creates the console plugin objects, and runs a job that enables the console-plugin. + +This component is intended as an alternative to the `console-plugin-helm` which directly references the upstream helm chart used to install/configure the console-plugin. This component has pre-rendered the helm chart objects and does not require the usage of the `--enable-helm` flag. + +To learn more about the console plugin, please refer to the official [docs]( +https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/enable-gpu-op-dashboard.html) + +## Usage + +This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file: + +``` +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +components: + - ../../components/console-plugin +``` diff --git a/gpu-operator-certified/operator/components/console-plugin/configmap.yaml b/gpu-operator-certified/operator/components/console-plugin/configmap.yaml new file mode 100644 index 00000000..1afa1c0a --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/configmap.yaml @@ -0,0 +1,32 @@ +--- +# Source: console-plugin-nvidia-gpu/templates/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: console-plugin-nvidia-gpu + namespace: nvidia-gpu-operator + labels: + helm.sh/chart: console-plugin-nvidia-gpu-0.2.4 + app.kubernetes.io/name: console-plugin-nvidia-gpu + app.kubernetes.io/version: "latest" + app.kubernetes.io/managed-by: Helm +
app.kubernetes.io/component: console-plugin-nvidia-gpu + app.kubernetes.io/instance: console-plugin-nvidia-gpu + app.kubernetes.io/part-of: console-plugin-nvidia-gpu +data: + dcgm-metrics.csv: | + # see https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/dcp-metrics-included.csv + DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization. + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization. + DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization. + DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization. + DCGM_FI_DEV_FB_FREE, gauge, mem free. + DCGM_FI_DEV_FB_USED, gauge, mem used. + DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization. + DCGM_FI_DEV_POWER_USAGE, gauge, power usage. + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit. + DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp. + DCGM_FI_DEV_SM_CLOCK, gauge, sm clock. + DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock. + DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock. + DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock. diff --git a/gpu-operator-certified/operator/components/console-plugin/console-plugin-job.sh b/gpu-operator-certified/operator/components/console-plugin/console-plugin-job.sh new file mode 100755 index 00000000..e2339bcc --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/console-plugin-job.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +enable_console_plugin(){ + [ -z "${PLUGIN_NAME}" ] && return 1 + + echo "Attempting to enable ${PLUGIN_NAME} plugin" + echo "" + + # Create the plugins section on the object if it doesn't exist + if [ -z "$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}')" ]; then + echo "Creating plugins object" + oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": [] } }' --type=merge + fi + + INSTALLED_PLUGINS=$(oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}') + echo "Current plugins:" + echo "${INSTALLED_PLUGINS}" + + if [[ "${INSTALLED_PLUGINS}" == *"${PLUGIN_NAME}"* ]]; then + echo "${PLUGIN_NAME} is already 
enabled" + else + echo "Enabling plugin: ${PLUGIN_NAME}" + oc patch consoles.operator.openshift.io cluster --type=json --patch '[{"op": "add", "path": "/spec/plugins/-", "value": "'"${PLUGIN_NAME}"'"}]' + fi + + sleep 6 + oc get consoles.operator.openshift.io cluster -o=jsonpath='{.spec.plugins}' +} + +enable_console_plugin diff --git a/gpu-operator-certified/operator/components/console-plugin/console-plugin-job.yaml b/gpu-operator-certified/operator/components/console-plugin/console-plugin-job.yaml new file mode 100644 index 00000000..bb07ba2b --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/console-plugin-job.yaml @@ -0,0 +1,68 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: job-gpu-console-plugin + namespace: nvidia-gpu-operator +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: job-gpu-console-plugin +rules: + - apiGroups: + - operator.openshift.io + resources: + - consoles + verbs: + - get + - list + - patch + - label +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: job-gpu-console-plugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: job-gpu-console-plugin +subjects: + - kind: ServiceAccount + name: job-gpu-console-plugin + namespace: nvidia-gpu-operator +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: job-gpu-console-plugin + namespace: nvidia-gpu-operator + generateName: job-gpu-console-plugin- + annotations: + argocd.argoproj.io/sync-wave: "10" +spec: + template: + spec: + containers: + - name: minion + image: registry.redhat.io/openshift4/ose-cli + env: + - name: PLUGIN_NAME + value: console-plugin-nvidia-gpu + command: + - /bin/bash + - -c + - /scripts/console-plugin-job.sh + volumeMounts: + - name: scripts + mountPath: /scripts + volumes: + - name: scripts + configMap: + name: job-gpu-console-plugin + defaultMode: 0755 + restartPolicy: Never + serviceAccount: job-gpu-console-plugin + 
serviceAccountName: job-gpu-console-plugin + backoffLimit: 4 diff --git a/gpu-operator-certified/operator/components/console-plugin/consoleplugin.yaml b/gpu-operator-certified/operator/components/console-plugin/consoleplugin.yaml new file mode 100644 index 00000000..0a3e8e29 --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/consoleplugin.yaml @@ -0,0 +1,22 @@ +--- +# Source: console-plugin-nvidia-gpu/templates/consoleplugin.yaml +apiVersion: console.openshift.io/v1alpha1 +kind: ConsolePlugin +metadata: + name: console-plugin-nvidia-gpu + namespace: nvidia-gpu-operator + labels: + helm.sh/chart: console-plugin-nvidia-gpu-0.2.4 + app.kubernetes.io/name: console-plugin-nvidia-gpu + app.kubernetes.io/version: "latest" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: console-plugin-nvidia-gpu + app.kubernetes.io/instance: console-plugin-nvidia-gpu + app.kubernetes.io/part-of: console-plugin-nvidia-gpu +spec: + displayName: 'Console Plugin NVIDIA GPU Template' + service: + name: console-plugin-nvidia-gpu + namespace: nvidia-gpu-operator + port: 9443 + basePath: '/' diff --git a/gpu-operator-certified/operator/components/console-plugin/deployment.yaml b/gpu-operator-certified/operator/components/console-plugin/deployment.yaml new file mode 100644 index 00000000..6c59278f --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/deployment.yaml @@ -0,0 +1,58 @@ +--- +# Source: console-plugin-nvidia-gpu/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: console-plugin-nvidia-gpu + namespace: nvidia-gpu-operator + labels: + helm.sh/chart: console-plugin-nvidia-gpu-0.2.4 + app.kubernetes.io/name: console-plugin-nvidia-gpu + app.kubernetes.io/version: "latest" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: console-plugin-nvidia-gpu + app.kubernetes.io/instance: console-plugin-nvidia-gpu + app.kubernetes.io/part-of: console-plugin-nvidia-gpu + 
app.openshift.io/runtime-namespace: console-plugin-nvidia-gpu +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: console-plugin-nvidia-gpu + template: + metadata: + labels: + app.kubernetes.io/name: console-plugin-nvidia-gpu + spec: + securityContext: + runAsNonRoot: true + containers: + - name: console-plugin-nvidia-gpu + image: "quay.io/edge-infrastructure/console-plugin-nvidia-gpu:latest" + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + ports: + - containerPort: 9443 + protocol: TCP + volumeMounts: + - name: plugin-serving-cert + readOnly: true + mountPath: /var/serving-cert + resources: {} + volumes: + - name: plugin-serving-cert + secret: + secretName: plugin-serving-cert + defaultMode: 420 + - name: nginx-conf + configMap: + name: nginx-conf + defaultMode: 420 + restartPolicy: Always + dnsPolicy: ClusterFirst + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 25% + maxSurge: 25% diff --git a/gpu-operator-certified/operator/components/console-plugin/kustomization.yaml b/gpu-operator-certified/operator/components/console-plugin/kustomization.yaml new file mode 100644 index 00000000..e1b60d10 --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - configmap.yaml + - console-plugin-job.yaml + - consoleplugin.yaml + - deployment.yaml + - service.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: job-gpu-console-plugin + namespace: nvidia-gpu-operator + files: + - console-plugin-job.sh diff --git a/gpu-operator-certified/operator/components/console-plugin/service.yaml b/gpu-operator-certified/operator/components/console-plugin/service.yaml new file mode 100644 index 00000000..dd32e37f --- /dev/null +++ b/gpu-operator-certified/operator/components/console-plugin/service.yaml @@ -0,0 +1,27 @@ +--- +# Source: 
console-plugin-nvidia-gpu/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: console-plugin-nvidia-gpu + namespace: nvidia-gpu-operator + labels: + helm.sh/chart: console-plugin-nvidia-gpu-0.2.4 + app.kubernetes.io/name: console-plugin-nvidia-gpu + app.kubernetes.io/version: "latest" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: console-plugin-nvidia-gpu + app.kubernetes.io/instance: console-plugin-nvidia-gpu + app.kubernetes.io/part-of: console-plugin-nvidia-gpu + annotations: + service.alpha.openshift.io/serving-cert-secret-name: plugin-serving-cert +spec: + ports: + - name: 9443-tcp + protocol: TCP + port: 9443 + targetPort: 9443 + selector: + app.kubernetes.io/name: console-plugin-nvidia-gpu + type: ClusterIP + sessionAffinity: None diff --git a/gpu-operator-certified/operator/overlays/v22.9/patch-channel.yaml b/gpu-operator-certified/operator/overlays/v22.9/patch-channel.yaml index fe4b08c3..0ae1cd3d 100644 --- a/gpu-operator-certified/operator/overlays/v22.9/patch-channel.yaml +++ b/gpu-operator-certified/operator/overlays/v22.9/patch-channel.yaml @@ -1,3 +1,6 @@ - op: replace path: /spec/channel value: v22.9 +# - op: replace +# path: /spec/startingCSV +# value: gpu-operator-certified.v22.9.1 diff --git a/scripts/validate_manifests.sh b/scripts/validate_manifests.sh index 7547cbf2..b314d769 100755 --- a/scripts/validate_manifests.sh +++ b/scripts/validate_manifests.sh @@ -15,6 +15,7 @@ Where: } which kustomize && KUSTOMIZE_CMD="kustomize build" +which helm && GOT_HELM="--enable-helm" KUSTOMIZE_CMD="${KUSTOMIZE_CMD:-oc kustomize}" IGNORE_MISSING_SCHEMAS="--ignore-missing-schemas" @@ -50,16 +51,10 @@ init(){ done } -process_kustomization(){ - - echo "Validating..." 
- - for BUILD in $(find "${KUSTOMIZE_DIRS}" -name "kustomization.yaml" -exec dirname {} \;) - do - echo "${BUILD}" - +kustomization_build(){ + BUILD=${1} + KUSTOMIZE_BUILD_OUTPUT=$(${KUSTOMIZE_CMD} "${BUILD}" "${GOT_HELM}") # echo "$KUSTOMIZE_BUILD_OUTPUT" | kubeval ${IGNORE_MISSING_SCHEMAS} --schema-location="file://${SCHEMA_LOCATION}" --force-color - KUSTOMIZE_BUILD_OUTPUT=$(${KUSTOMIZE_CMD} "${BUILD}") build_response=$? @@ -69,6 +64,18 @@ process_kustomization(){ fi echo "[OK]" +} + +process_kustomization(){ + + echo "Validating..." + + for BUILD in $(find "${KUSTOMIZE_DIRS}" -name "kustomization.yaml" -exec dirname {} \;) + do + echo "${BUILD}" + + kustomization_build "${BUILD}" + done }