From 1e7dc7722afe66ce8444e431e87578bfe210fc06 Mon Sep 17 00:00:00 2001 From: David Grove Date: Fri, 18 Oct 2024 17:56:02 -0400 Subject: [PATCH] setup instructions for RHOAI 2.14 (#89) --- SETUP.md | 5 + setup.RHOAI-v2.10/mlbatch-subscription.yaml | 2 +- setup.RHOAI-v2.13/mlbatch-subscription.yaml | 2 +- setup.RHOAI-v2.14/CLUSTER-SETUP.md | 147 +++++++++ setup.RHOAI-v2.14/TEAM-SETUP.md | 91 ++++++ setup.RHOAI-v2.14/UNINSTALL.md | 23 ++ setup.RHOAI-v2.14/UPGRADE.md | 29 ++ .../coscheduler-priority-patch.yaml | 3 + setup.RHOAI-v2.14/default-flavor.yaml | 4 + setup.RHOAI-v2.14/mlbatch-dsc.yaml | 32 ++ setup.RHOAI-v2.14/mlbatch-dsci.yaml | 14 + setup.RHOAI-v2.14/mlbatch-edit-role.yaml | 141 ++++++++ setup.RHOAI-v2.14/mlbatch-priorities.yaml | 26 ++ setup.RHOAI-v2.14/mlbatch-subscription.yaml | 300 ++++++++++++++++++ setup.tmpl/CLUSTER-SETUP.md.tmpl | 2 + setup.tmpl/Makefile | 2 + setup.tmpl/RHOAI-v2.14.yaml | 6 + 17 files changed, 827 insertions(+), 2 deletions(-) create mode 100644 setup.RHOAI-v2.14/CLUSTER-SETUP.md create mode 100644 setup.RHOAI-v2.14/TEAM-SETUP.md create mode 100644 setup.RHOAI-v2.14/UNINSTALL.md create mode 100644 setup.RHOAI-v2.14/UPGRADE.md create mode 100644 setup.RHOAI-v2.14/coscheduler-priority-patch.yaml create mode 100644 setup.RHOAI-v2.14/default-flavor.yaml create mode 100644 setup.RHOAI-v2.14/mlbatch-dsc.yaml create mode 100644 setup.RHOAI-v2.14/mlbatch-dsci.yaml create mode 100644 setup.RHOAI-v2.14/mlbatch-edit-role.yaml create mode 100644 setup.RHOAI-v2.14/mlbatch-priorities.yaml create mode 100644 setup.RHOAI-v2.14/mlbatch-subscription.yaml create mode 100644 setup.tmpl/RHOAI-v2.14.yaml diff --git a/SETUP.md b/SETUP.md index 055ce41..89abb90 100644 --- a/SETUP.md +++ b/SETUP.md @@ -42,6 +42,11 @@ Instructions are provided for the following Red Hat OpenShift AI ***stable*** re + [RHOAI 2.10 Uninstall](./setup.RHOAI-v2.10/UNINSTALL.md) Instructions are provided for the following Red Hat OpenShift AI ***fast*** releases: ++ Red Hat 
OpenShift AI 2.14 + + [RHOAI 2.14 Cluster Setup](./setup.RHOAI-v2.14/CLUSTER-SETUP.md) + + [RHOAI 2.14 Team Setup](./setup.RHOAI-v2.14/TEAM-SETUP.md) + + [UPGRADING from RHOAI 2.13](./setup.RHOAI-v2.14/UPGRADE.md) + + [RHOAI 2.14 Uninstall](./setup.RHOAI-v2.14/UNINSTALL.md) + Red Hat OpenShift AI 2.11 + [RHOAI 2.11 Cluster Setup](./setup.RHOAI-v2.11/CLUSTER-SETUP.md) + [RHOAI 2.11 Team Setup](./setup.RHOAI-v2.11/TEAM-SETUP.md) diff --git a/setup.RHOAI-v2.10/mlbatch-subscription.yaml b/setup.RHOAI-v2.10/mlbatch-subscription.yaml index 7157ae5..f9cb18e 100644 --- a/setup.RHOAI-v2.10/mlbatch-subscription.yaml +++ b/setup.RHOAI-v2.10/mlbatch-subscription.yaml @@ -245,7 +245,7 @@ metadata: name: rhods-operator namespace: redhat-ods-operator spec: - channel: stable-2.10 + channel: stable installPlanApproval: Manual name: rhods-operator source: redhat-operators diff --git a/setup.RHOAI-v2.13/mlbatch-subscription.yaml b/setup.RHOAI-v2.13/mlbatch-subscription.yaml index 9d76dcd..7b06f5a 100644 --- a/setup.RHOAI-v2.13/mlbatch-subscription.yaml +++ b/setup.RHOAI-v2.13/mlbatch-subscription.yaml @@ -260,7 +260,7 @@ metadata: name: rhods-operator namespace: redhat-ods-operator spec: - channel: fast + channel: stable installPlanApproval: Manual name: rhods-operator source: redhat-operators diff --git a/setup.RHOAI-v2.14/CLUSTER-SETUP.md b/setup.RHOAI-v2.14/CLUSTER-SETUP.md new file mode 100644 index 0000000..e7f8147 --- /dev/null +++ b/setup.RHOAI-v2.14/CLUSTER-SETUP.md @@ -0,0 +1,147 @@ +# Cluster Setup + +The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue, +cluster roles, and priority classes. + +If MLBatch is deployed on a cluster that used to run earlier versions of ODH, +[MCAD](https://github.com/project-codeflare/mcad), Red Hat OpenShift AI, or Coscheduler, +make sure to scrub traces of these installations. In particular, make sure to +delete the following custom resource definitions (CRD) if present on the +cluster. 
Make sure to delete all instances prior to deleting the CRDs: +```sh +# Delete old appwrappers and crd +oc delete appwrappers --all -A +oc delete crd appwrappers.workload.codeflare.dev + +# Delete old noderesourcetopologies and crd +oc delete noderesourcetopologies --all -A +oc delete crd noderesourcetopologies.topology.node.k8s.io +``` + +## Priorities + +Create `default-priority`, `high-priority`, and `low-priority` priority classes: +```sh +oc apply -f setup.RHOAI-v2.14/mlbatch-priorities.yaml +``` + +## Coscheduler + +Install Coscheduler v0.28.9 as a secondary scheduler and configure packing: +```sh +helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ + scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ + --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"}]' +``` +Patch Coscheduler pod priorities: +```sh +oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.14/coscheduler-priority-patch.yaml scheduler-plugins-controller +oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.14/coscheduler-priority-patch.yaml scheduler-plugins-scheduler +``` + +## Red Hat OpenShift AI + +Create the Red Hat OpenShift AI subscription: +```sh +oc apply -f setup.RHOAI-v2.14/mlbatch-subscription.yaml +``` +Identify install plan: +```sh +oc get ip -n redhat-ods-operator +``` +``` +NAMESPACE NAME CSV APPROVAL APPROVED +redhat-ods-operator install-kmh8w rhods-operator.2.14.0 Manual false +``` +Approve install plan replacing the generated plan name below with the actual +value: +```sh +oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w +``` +Create DSC Initialization: +```sh +oc apply -f setup.RHOAI-v2.14/mlbatch-dsci.yaml +``` 
+Create Data Science Cluster: +```sh +oc apply -f setup.RHOAI-v2.14/mlbatch-dsc.yaml +``` +The provided DSCI and DSC are intended to install a minimal set of Red Hat OpenShift +AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The +remaining components such as `dashboard` can be optionally enabled. + +The configuration of the managed components differs from the default Red Hat OpenShift +AI configuration as follows: +- Kubeflow Training Operator: + - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, +- Kueue: + - `manageJobsWithoutQueueName` is enabled, + - `batch/job` integration is disabled, + - `waitForPodsReady` is disabled, + - `LendingLimit` feature gate is enabled, + - `enableClusterQueueResources` metrics is enabled, +- Codeflare operator: + - the AppWrapper controller is enabled and configured as follows: + - `userRBACAdmissionCheck` is disabled, + - `schedulerName` is set to `scheduler-plugins-scheduler`, + - `queueName` is set to `default-queue`, +- pod priorities, resource requests and limits have been adjusted. + + + +## Kueue Configuration + +Create Kueue's default flavor: +```sh +oc apply -f setup.RHOAI-v2.14/default-flavor.yaml +``` + +## Cluster Role + +Create `mlbatch-edit` role: +```sh +oc apply -f setup.RHOAI-v2.14/mlbatch-edit-role.yaml +``` + +## Slack Cluster Queue + +Create the designated slack `ClusterQueue` which will be used to automate +minor adjustments to cluster capacity caused by node failures and +scheduler maintenance. 
+```sh +oc apply -f- << EOF +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: slack-cluster-queue +spec: + namespaceSelector: {} + cohort: default-cohort + preemption: + withinClusterQueue: LowerOrNewerEqualPriority + reclaimWithinCohort: Any + borrowWithinCohort: + policy: Never + resourceGroups: + - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] + flavors: + - name: default-flavor + resources: + - name: "cpu" + nominalQuota: 8000m + - name: "memory" + nominalQuota: 128Gi + - name: "nvidia.com/gpu" + nominalQuota: 8 + - name: "nvidia.com/roce_gdr" + nominalQuota: 1 + - name: "pods" + nominalQuota: 100 +EOF +``` +Edit the above quantities to adjust the quota to the desired +values. Pod counts are optional and can be omitted from the list of +covered resources. The `lendingLimit` for each resource will be +dynamically adjusted by the MLBatch system to reflect reduced cluster +capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a +detailed discussion of the role of the slack `ClusterQueue`. diff --git a/setup.RHOAI-v2.14/TEAM-SETUP.md b/setup.RHOAI-v2.14/TEAM-SETUP.md new file mode 100644 index 0000000..85c9429 --- /dev/null +++ b/setup.RHOAI-v2.14/TEAM-SETUP.md @@ -0,0 +1,91 @@ +# Team Setup + +A *team* in MLBatch is a group of users that share a resource quota. + +Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) +for a discussion of our recommended best practices. + + +Setting up a new team requires the cluster admin to create a project, +a user group, a quota, a queue, and the required role bindings as described below. 
+ +Create project: +```sh +oc new-project team1 +``` +Create user group: +```sh +oc adm groups new team1-edit-group +``` +Add users to the group, for example: +```sh +oc adm groups add-users team1-edit-group user1 +``` +Bind cluster role to group in namespace: +```sh +oc adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 +``` + +Specify the intended quota for the namespace by creating a `ClusterQueue`: +```sh +oc apply -f- << EOF +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: team1-cluster-queue +spec: + namespaceSelector: {} + cohort: default-cohort + preemption: + withinClusterQueue: LowerOrNewerEqualPriority + reclaimWithinCohort: Any + borrowWithinCohort: + policy: Never + resourceGroups: + - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] + flavors: + - name: default-flavor + resources: + - name: "cpu" + nominalQuota: 8000m + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "memory" + nominalQuota: 128Gi + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "nvidia.com/gpu" + nominalQuota: 16 + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "nvidia.com/roce_gdr" + nominalQuota: 4 + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "pods" + nominalQuota: 100 + # borrowingLimit: 0 + # lendingLimit: 0 +EOF +``` +Edit the above quantities to adjust the quota to the desired values. Pod counts +are optional and can be omitted from the list of covered resources. + +Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing +quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other +namespaces from borrowing quota from this namespace. 
+ +Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: +```sh +oc apply -n team1 -f- << EOF +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + name: default-queue +spec: + clusterQueue: team1-cluster-queue +EOF +``` +We recommend naming the local queue `default-queue` as `AppWrappers` will +default to this queue name. + diff --git a/setup.RHOAI-v2.14/UNINSTALL.md b/setup.RHOAI-v2.14/UNINSTALL.md new file mode 100644 index 0000000..776045d --- /dev/null +++ b/setup.RHOAI-v2.14/UNINSTALL.md @@ -0,0 +1,23 @@ +# Uninstall + +***First, remove all team projects and corresponding cluster queues.*** + +Then to uninstall the MLBatch controllers and reclaim the corresponding +namespaces, run: +```sh +# OpenShift AI uninstall +oc delete dsc mlbatch-dsc +oc delete dsci mlbatch-dsci +oc delete subscription -n redhat-ods-operator rhods-operator +oc delete csv -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator +oc delete crd featuretrackers.features.opendatahub.io \ + dscinitializations.dscinitialization.opendatahub.io \ + datascienceclusters.datasciencecluster.opendatahub.io +oc delete operators rhods-operator.redhat-ods-operator +oc delete operatorgroup -n redhat-ods-operator rhods-operator +oc delete namespace redhat-ods-applications redhat-ods-monitoring redhat-ods-operator + +# Coscheduler uninstall +helm uninstall -n scheduler-plugins scheduler-plugins +oc delete namespace scheduler-plugins +``` diff --git a/setup.RHOAI-v2.14/UPGRADE.md b/setup.RHOAI-v2.14/UPGRADE.md new file mode 100644 index 0000000..0f124ba --- /dev/null +++ b/setup.RHOAI-v2.14/UPGRADE.md @@ -0,0 +1,29 @@ +# Upgrading from RHOAI 2.13 + +These instructions assume you installed and configured RHOAI 2.13 following +the MLBatch [install instructions for RHOAI-v2.13](../setup.RHOAI-v2.13/CLUSTER-SETUP.md) +and are subscribed to the fast channel. 
+ +Your subscription will have automatically created an unapproved +install plan to upgrade to RHOAI 2.14. + +Before beginning, verify that the expected install plan exists: +```sh +oc get ip -n redhat-ods-operator +``` +Typical output would be: +```sh +NAME CSV APPROVAL APPROVED +install-kpzzl rhods-operator.2.14.0 Manual false +install-nqrbp rhods-operator.2.13.0 Manual true +``` + +Assuming the install plan exists you can begin the upgrade process. + +There are no MLBatch modifications to the default RHOAI configuration maps +beyond those already made in previous installs. Therefore, you can simply +approve the install plan replacing the example plan name below with the actual +value on your cluster: +```sh +oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl +``` diff --git a/setup.RHOAI-v2.14/coscheduler-priority-patch.yaml b/setup.RHOAI-v2.14/coscheduler-priority-patch.yaml new file mode 100644 index 0000000..278802f --- /dev/null +++ b/setup.RHOAI-v2.14/coscheduler-priority-patch.yaml @@ -0,0 +1,3 @@ +- op: add + path: /spec/template/spec/priorityClassName + value: system-node-critical diff --git a/setup.RHOAI-v2.14/default-flavor.yaml b/setup.RHOAI-v2.14/default-flavor.yaml new file mode 100644 index 0000000..6cbccf3 --- /dev/null +++ b/setup.RHOAI-v2.14/default-flavor.yaml @@ -0,0 +1,4 @@ +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: default-flavor diff --git a/setup.RHOAI-v2.14/mlbatch-dsc.yaml b/setup.RHOAI-v2.14/mlbatch-dsc.yaml new file mode 100644 index 0000000..66336bc --- /dev/null +++ b/setup.RHOAI-v2.14/mlbatch-dsc.yaml @@ -0,0 +1,32 @@ +apiVersion: datasciencecluster.opendatahub.io/v1 +kind: DataScienceCluster +metadata: + name: mlbatch-dsc +spec: + components: + codeflare: + managementState: Managed + dashboard: + managementState: Removed + datasciencepipelines: + managementState: Removed + kserve: + managementState: Removed + serving: + ingressGateway: + certificate: 
+ type: SelfSigned + managementState: Removed + name: knative-serving + kueue: + managementState: Managed + modelmeshserving: + managementState: Removed + ray: + managementState: Managed + trainingoperator: + managementState: Managed + trustyai: + managementState: Removed + workbenches: + managementState: Removed diff --git a/setup.RHOAI-v2.14/mlbatch-dsci.yaml b/setup.RHOAI-v2.14/mlbatch-dsci.yaml new file mode 100644 index 0000000..77785c3 --- /dev/null +++ b/setup.RHOAI-v2.14/mlbatch-dsci.yaml @@ -0,0 +1,14 @@ +apiVersion: dscinitialization.opendatahub.io/v1 +kind: DSCInitialization +metadata: + name: mlbatch-dsci +spec: + applicationsNamespace: redhat-ods-applications + monitoring: + managementState: Managed + namespace: redhat-ods-monitoring + serviceMesh: + managementState: Removed + trustedCABundle: + customCABundle: "" + managementState: Managed diff --git a/setup.RHOAI-v2.14/mlbatch-edit-role.yaml b/setup.RHOAI-v2.14/mlbatch-edit-role.yaml new file mode 100644 index 0000000..74f547d --- /dev/null +++ b/setup.RHOAI-v2.14/mlbatch-edit-role.yaml @@ -0,0 +1,141 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: mlbatch-edit +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - watch +- apiGroups: + - apps + resources: + - deployments + - statefulsets + verbs: + - delete + - get + - list + - watch +- apiGroups: + - "" + resources: + - services + - secrets + - configmaps + - persistentvolumeclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - "*" + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - pytorchjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - cluster.ray.io + resources: + - rayjobs + - rayclusters + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - 
jobs + verbs: + - delete + - get + - list + - watch +- apiGroups: + - workload.codeflare.dev + resources: + - appwrappers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch +- apiGroups: + - scheduling.x-k8s.io + resources: + - podgroups + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - events + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - namespaces + - pods/logs + verbs: + - get +- apiGroups: + - "" + - project.openshift.io + resources: + - projects + verbs: + - get +- apiGroups: + - "" + resources: + - pods/exec + verbs: + - create diff --git a/setup.RHOAI-v2.14/mlbatch-priorities.yaml b/setup.RHOAI-v2.14/mlbatch-priorities.yaml new file mode 100644 index 0000000..77c8f3b --- /dev/null +++ b/setup.RHOAI-v2.14/mlbatch-priorities.yaml @@ -0,0 +1,26 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low-priority +value: 1 +preemptionPolicy: PreemptLowerPriority +globalDefault: false +description: "This is the priority class for all lower priority jobs." +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: default-priority +value: 5 +preemptionPolicy: PreemptLowerPriority +globalDefault: true +description: "This is the priority class for all jobs (default priority)." +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high-priority +value: 10 +preemptionPolicy: PreemptLowerPriority +globalDefault: false +description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." 
diff --git a/setup.RHOAI-v2.14/mlbatch-subscription.yaml b/setup.RHOAI-v2.14/mlbatch-subscription.yaml new file mode 100644 index 0000000..aa352e9 --- /dev/null +++ b/setup.RHOAI-v2.14/mlbatch-subscription.yaml @@ -0,0 +1,300 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: redhat-ods-operator +--- +apiVersion: v1 +kind: Namespace +metadata: + name: redhat-ods-applications +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: rhods-operator + namespace: redhat-ods-operator +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-codeflare + namespace: redhat-ods-operator +data: + manager.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: manager + namespace: system + spec: + selector: + matchLabels: + app.kubernetes.io/name: codeflare-operator + app.kubernetes.io/part-of: codeflare + replicas: 1 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + app.kubernetes.io/name: codeflare-operator + app.kubernetes.io/part-of: codeflare + spec: + priorityClassName: system-node-critical + securityContext: + runAsNonRoot: true + # TODO(user): For common cases that do not require escalating privileges + # it is recommended to ensure that all your Pods/Containers are restrictive. + # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted + # Please uncomment the following code if your project does NOT have to work on old Kubernetes + # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 
+ # seccompProfile: + # type: RuntimeDefault + containers: + - command: + - /manager + image: $(codeflare_operator_controller_image) + imagePullPolicy: Always + name: manager + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - containerPort: 8080 + protocol: TCP + name: metrics + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + serviceAccountName: controller-manager + terminationGracePeriodSeconds: 10 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: codeflare-operator-config + namespace: redhat-ods-applications +data: + config.yaml: | + appwrapper: + enabled: true + Config: + manageJobsWithoutQueueName: true + userRBACAdmissionCheck: false + schedulerName: scheduler-plugins-scheduler + defaultQueueName: default-queue + slackQueueName: slack-cluster-queue + autopilot: + resourceTaints: + nvidia.com/gpu: + - key: autopilot.ibm.com/gpuhealth + value: ERR + effect: NoSchedule + - key: autopilot.ibm.com/gpuhealth + value: TESTING + effect: NoSchedule + - key: autopilot.ibm.com/gpuhealth + value: EVICT + effect: NoExecute +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-kuberay + namespace: redhat-ods-operator +data: + kuberay-operator-image-patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: kuberay-operator + spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: kuberay-operator + image: $(image) +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-kueue + namespace: redhat-ods-operator +data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + 
healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8082 + waitForPodsReady: + enable: false + blockAdmission: false + manageJobsWithoutQueueName: true + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + # - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + externalFrameworks: + - "AppWrapper.v1beta2.workload.codeflare.dev" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + manager_config_patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: controller-manager + namespace: system + spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: manager + image: $(image) + args: + - "--config=/controller_manager_config.yaml" + - "--zap-log-level=2" + - "--feature-gates=LendingLimit=true" + volumeMounts: + - name: manager-config + mountPath: /controller_manager_config.yaml + subPath: controller_manager_config.yaml + volumes: + - name: manager-config + configMap: + name: manager-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-training-operator + namespace: redhat-ods-operator +data: + manager_config_patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: training-operator + spec: + 
template: + spec: + priorityClassName: system-node-critical + containers: + - name: training-operator + image: $(image) + args: + - "--zap-log-level=2" + - "--gang-scheduler-name=scheduler-plugins-scheduler" + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 1000Mi +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: rhods-operator + namespace: redhat-ods-operator +spec: + channel: fast + installPlanApproval: Manual + name: rhods-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + startingCSV: rhods-operator.2.14.0 + config: + env: + - name: "DISABLE_DSC_CONFIG" + volumeMounts: + - name: mlbatch-codeflare + mountPath: /opt/manifests/codeflare/manager/manager.yaml + subPath: manager.yaml + - name: mlbatch-kuberay + mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml + subPath: kuberay-operator-image-patch.yaml + - name: mlbatch-kueue + mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml + subPath: controller_manager_config.yaml + - name: mlbatch-kueue + mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml + subPath: manager_config_patch.yaml + - name: mlbatch-training-operator + mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml + subPath: manager_config_patch.yaml + volumes: + - name: mlbatch-codeflare + configMap: + name: mlbatch-codeflare + - name: mlbatch-kuberay + configMap: + name: mlbatch-kuberay + - name: mlbatch-kueue + configMap: + name: mlbatch-kueue + - name: mlbatch-training-operator + configMap: + name: mlbatch-training-operator diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl index 0fca977..7d9bc61 100644 --- a/setup.tmpl/CLUSTER-SETUP.md.tmpl +++ b/setup.tmpl/CLUSTER-SETUP.md.tmpl @@ -113,6 +113,7 @@ AI configuration as follows: - `queueName` is set to `default-queue`, - pod priorities, resource requests and limits have been adjusted. 
+{{ if or (eq .VERSION "RHOAI-v2.10") (eq .VERSION "RHOAI-v2.11") (eq .VERSION "RHOAI-v2.13") -}} To work around https://issues.redhat.com/browse/RHOAIENG-7887 (a race condition in Red Hat OpenShift AI installation), do a rolling restart of the Kueue manager. ```sh @@ -127,6 +128,7 @@ kueue-controller-manager's log: {"level":"info","ts":"2024-06-25T20:17:25.689743757Z","logger":"setup","caller":"jobframework/setup.go:81","msg":"Set up controller and webhook for job framework","jobFrameworkName":"kubeflow.org/pytorchjob"} ``` +{{- end }} {{- else -}} ## Install Operators diff --git a/setup.tmpl/Makefile b/setup.tmpl/Makefile index 069c25b..0f342e4 100644 --- a/setup.tmpl/Makefile +++ b/setup.tmpl/Makefile @@ -27,6 +27,8 @@ docs: gotmpl ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.11/TEAM-SETUP.md -values RHOAI-v2.11.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.13/CLUSTER-SETUP.md -values RHOAI-v2.13.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.13/TEAM-SETUP.md -values RHOAI-v2.13.yaml + ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.14/CLUSTER-SETUP.md -values RHOAI-v2.14.yaml + ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.14/TEAM-SETUP.md -values RHOAI-v2.14.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.27/CLUSTER-SETUP.md -values Kubernetes-v1.27.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.k8s-v1.27/TEAM-SETUP.md -values Kubernetes-v1.27.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.30/CLUSTER-SETUP.md -values Kubernetes-v1.30.yaml diff --git a/setup.tmpl/RHOAI-v2.14.yaml b/setup.tmpl/RHOAI-v2.14.yaml new file mode 100644 index 0000000..631b9fd --- /dev/null +++ b/setup.tmpl/RHOAI-v2.14.yaml @@ -0,0 +1,6 @@ +# Values for RHOAI 2.14 + +OPENSHIFT: true +VERSION: RHOAI-v2.14 +KUBECTL: oc 
+SLACKCQ: true \ No newline at end of file