diff --git a/deployment/network-operator/README.md b/deployment/network-operator/README.md index af1425033..66bcadb06 100644 --- a/deployment/network-operator/README.md +++ b/deployment/network-operator/README.md @@ -373,23 +373,38 @@ imagePullSecrets: #### Mellanox OFED driver -| Name | Type | Default | description | -| ---- | ---- | ------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container | -| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository | -| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name | -| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version | -| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image | -| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image | -| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name | -| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name | -| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds| -| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay | -| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval | -| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay | -| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval | -| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 
| Mellanox OFED readiness probe initial delay | -| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval | +| Name | Type | Default | description | +| -------------------------------------------------- | -------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container | +| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository | +| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name | +| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version | +| `ofedDriver.initContainer.enable` | bool | `true` | deploy init container | +| `ofedDriver.initContainer.repository` | string | `ghcr.io/mellanox` | init container image repository | +| `ofedDriver.initContainer.image` | string | `network-operator-init-container` | init container image name | +| `ofedDriver.initContainer.version` | string | `v0.0.1` | init container image version | +| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image | +| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image | +| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name | +| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name | +| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds | +| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay | +| 
`ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval | +| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay | +| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval | +| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay | +| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval | +| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `false` | global switch for automatic upgrade feature | +| `ofedDriver.upgradePolicy.maxParallelUpgrades` | int | 1 | how many nodes can be upgraded in parallel, 0 means no limit, all nodes will be upgraded in parallel | +| `ofedDriver.upgradePolicy.safeLoad` | bool | `false` | cordon and drain (if enabled) a node before loading the driver on it | +| `ofedDriver.upgradePolicy.drain.enable` | bool | `true` | drain a node before the driver restart | +| `ofedDriver.upgradePolicy.drain.force` | bool | `false` | use force drain (check `kubectl drain` doc for details) | +| `ofedDriver.upgradePolicy.drain.podSelector` | string | "" | drain only pods matching this selector | +| `ofedDriver.upgradePolicy.drain.timeoutSeconds` | int | 300 | timeout for drain operation | +| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `false` | continue even if there are pods using emptyDir | +| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade | +| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite | + #### RDMA Device Plugin diff --git a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml 
b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml index e4efc5e5b..549b9159f 100644 --- 
a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml +++ b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml @@ -32,6 +32,13 @@ spec: image: {{ .Values.ofedDriver.image }} repository: {{ .Values.ofedDriver.repository }} version: {{ .Values.ofedDriver.version }} + {{- if .Values.ofedDriver.initContainer }} + initContainer: + enable: {{ .Values.ofedDriver.initContainer.enable }} + repository: {{ .Values.ofedDriver.initContainer.repository }} + image: {{ .Values.ofedDriver.initContainer.image }} + version: {{ .Values.ofedDriver.initContainer.version }} + {{- end }} {{- if .Values.ofedDriver.env }} env: {{ toYaml .Values.ofedDriver.env | nindent 6 }} @@ -59,12 +66,20 @@ spec: upgradePolicy: autoUpgrade: {{ .Values.ofedDriver.upgradePolicy.autoUpgrade | default false }} maxParallelUpgrades: {{ .Values.ofedDriver.upgradePolicy.maxParallelUpgrades | default 0 }} + safeLoad: {{ .Values.ofedDriver.upgradePolicy.safeLoad | default false }} + {{- if .Values.ofedDriver.upgradePolicy.drain }} drain: enable: {{ .Values.ofedDriver.upgradePolicy.drain.enable | default true }} force: {{ .Values.ofedDriver.upgradePolicy.drain.force | default false }} podSelector: {{ .Values.ofedDriver.upgradePolicy.drain.podSelector | quote }} timeoutSeconds: {{ .Values.ofedDriver.upgradePolicy.drain.timeoutSeconds }} deleteEmptyDir: {{ .Values.ofedDriver.upgradePolicy.drain.deleteEmptyDir | default false}} + {{- end }} + {{- if .Values.ofedDriver.upgradePolicy.waitForCompletion }} + waitForCompletion: + podSelector: {{ .Values.ofedDriver.upgradePolicy.waitForCompletion.podSelector | default "" | quote }} + timeoutSeconds: {{ .Values.ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds | default 0 }} + {{- end }} {{- end }} {{- end }} {{- if .Values.rdmaSharedDevicePlugin.deploy }} diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index c09722e87..7b574f24b 100644 --- 
a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -154,6 +154,11 @@ ofedDriver: image: mofed repository: nvcr.io/nvidia/mellanox version: 23.07-0.5.0.0 + initContainer: + enable: true + repository: ghcr.io/mellanox + image: network-operator-init-container + version: v0.0.1 # imagePullSecrets: [] # env, if defined will pass environment variables to the OFED container # env: @@ -166,7 +171,6 @@ ofedDriver: # Custom ssl key/certificate configuration certConfig: name: "" - startupProbe: initialDelaySeconds: 10 periodSeconds: 20 @@ -183,6 +187,8 @@ ofedDriver: # how many nodes can be upgraded in parallel (default: 1) # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 1 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # options for node drain (`kubectl drain`) before the driver reload # if auto upgrade is enabled but drain.enable is false, # then driver POD will be reloaded immediately without @@ -194,6 +200,11 @@ ofedDriver: # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries timeoutSeconds: 300 deleteEmptyDir: false + waitForCompletion: + # specifies a label selector for the pods to wait for completion + # podSelector: "app=myapp" + # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite + # timeoutSeconds: 300 rdmaSharedDevicePlugin: deploy: true diff --git a/docs/automatic-ofed-upgrade.md b/docs/automatic-ofed-upgrade.md index b3bec3cd0..52b1cc575 100644 --- a/docs/automatic-ofed-upgrade.md +++ b/docs/automatic-ofed-upgrade.md @@ -7,7 +7,7 @@ It is possible to do a driver upgrade manually by following the [manual upgrade This document describes the automatic upgrade flow for the containerized OFED driver. 
### Upgrade NVIDIA Mellanox OFED automatically -* Enable automatic MOFED upgrade, define UpgradePolicy section for ofedDriver in the [NicClusterPolicy spec: +* Enable automatic MOFED upgrade, define UpgradePolicy section for ofedDriver in the NicClusterPolicy spec: ``` apiVersion: mellanox.com/v1alpha1 kind: NicClusterPolicy @@ -21,11 +21,13 @@ spec: version: 5.6-1.0.3.3 upgradePolicy: # autoUpgrade is a global switch for automatic upgrade feature - # if set to false all other options are ignored + # if set to false all other options are ignored autoUpgrade: true # maxParallelUpgrades indicates how many nodes can be upgraded in parallel - # 0 means no limit, all nodes will be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 0 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # describes the configuration for waiting on job completions waitForCompletion: # specifies a label selector for the pods to wait for completion @@ -49,11 +51,31 @@ spec: ``` * Change ofedDriver version in the NicClusterPolicy * To check if upgrade is finished, query the status of `state-OFED` in the [NicClusterPolicy status](https://github.com/Mellanox/network-operator#nicclusterpolicy-status) -* To track each node's upgrade status separately, run `kubectl describe node | grep nvidia.com/ofed-upgrade-state`. See [Node upgrade states](#node-upgrade-states) section describing each state. +* To track each node's upgrade status separately, run `kubectl describe node | grep nvidia.com/ofed-driver-upgrade-state`. See [Node upgrade states](#node-upgrade-states) section describing each state. + +### Safe driver loading + +This feature is enabled or disabled with the `ofedDriver.upgradePolicy.safeLoad` option. + +On Node startup, the OFED container takes some time to compile and load the driver. +During that time, workloads might get scheduled on that Node. 
+When OFED is loaded, all existing PODs that use NVIDIA NICs will lose their network interfaces. +Some such PODs might silently fail or hang. +To avoid such a situation, before the OFED container is loaded, +the Node should be cordoned and drained to ensure all workloads are rescheduled. +The Node should be un-cordoned when the driver is ready on it. + +The safe driver loading feature is implemented as a part of the upgrade flow, +meaning safe driver loading is a special scenario of the upgrade procedure, +where we upgrade from the inbox driver to the containerized OFED. + +When this feature is enabled, the initial OFED driver rollout on a large cluster can take a long time. +To speed up the rollout, the initial deployment can be done with the safe driver loading feature disabled, +and this feature can be enabled later by updating the NicClusterPolicy CR. ### Details #### Node upgrade states -Each node's upgrade status is reflected in its `nvidia.com/ofed-upgrade-state` label. This label can have the following values: +Each node's upgrade status is reflected in its `nvidia.com/ofed-driver-upgrade-state` label. 
This label can have the following values: * Unknown (empty): node has this state when the upgrade flow is disabled or the node hasn't been processed yet * `upgrade-done` is set when OFED POD is up to date and running on the node, the node is schedulable UpgradeStateDone = "upgrade-done" diff --git a/hack/release.go b/hack/release.go index a524cbfca..258316f15 100644 --- a/hack/release.go +++ b/hack/release.go @@ -31,20 +31,21 @@ import ( ) type Release struct { - NetworkOperator *mellanoxv1alpha1.ImageSpec - SriovNetworkOperator *mellanoxv1alpha1.ImageSpec - SriovConfigDaemon *mellanoxv1alpha1.ImageSpec - SriovCni *mellanoxv1alpha1.ImageSpec - SriovIbCni *mellanoxv1alpha1.ImageSpec - Mofed *mellanoxv1alpha1.ImageSpec - RdmaSharedDevicePlugin *mellanoxv1alpha1.ImageSpec - SriovDevicePlugin *mellanoxv1alpha1.ImageSpec - IbKubernetes *mellanoxv1alpha1.ImageSpec - CniPlugins *mellanoxv1alpha1.ImageSpec - Multus *mellanoxv1alpha1.ImageSpec - Ipoib *mellanoxv1alpha1.ImageSpec - IpamPlugin *mellanoxv1alpha1.ImageSpec - NvIPAM *mellanoxv1alpha1.ImageSpec + NetworkOperator *mellanoxv1alpha1.ImageSpec + NetworkOperatorInitContainer *mellanoxv1alpha1.ImageSpec + SriovNetworkOperator *mellanoxv1alpha1.ImageSpec + SriovConfigDaemon *mellanoxv1alpha1.ImageSpec + SriovCni *mellanoxv1alpha1.ImageSpec + SriovIbCni *mellanoxv1alpha1.ImageSpec + Mofed *mellanoxv1alpha1.ImageSpec + RdmaSharedDevicePlugin *mellanoxv1alpha1.ImageSpec + SriovDevicePlugin *mellanoxv1alpha1.ImageSpec + IbKubernetes *mellanoxv1alpha1.ImageSpec + CniPlugins *mellanoxv1alpha1.ImageSpec + Multus *mellanoxv1alpha1.ImageSpec + Ipoib *mellanoxv1alpha1.ImageSpec + IpamPlugin *mellanoxv1alpha1.ImageSpec + NvIPAM *mellanoxv1alpha1.ImageSpec } func readDefaults(releaseDefaults string) Release { @@ -79,6 +80,7 @@ func initWithEnvVariale(name string, image *mellanoxv1alpha1.ImageSpec) { func readEnvironmentVariables(release *Release) { initWithEnvVariale("NETWORK_OPERATOR", release.NetworkOperator) + 
initWithEnvVariale("NETWORK_OPERATOR_INIT_CONTAINER", release.NetworkOperatorInitContainer) initWithEnvVariale("MOFED", release.Mofed) initWithEnvVariale("RDMA_SHARED_DEVICE_PLUGIN", release.RdmaSharedDevicePlugin) initWithEnvVariale("SRIOV_DEVICE_PLUGIN", release.SriovDevicePlugin) diff --git a/hack/release.yaml b/hack/release.yaml index 5ba4bdab8..a2bb7784f 100644 --- a/hack/release.yaml +++ b/hack/release.yaml @@ -1,6 +1,10 @@ NetworkOperator: image: network-operator repository: nvcr.io/nvidia/cloud-native +NetworkOperatorInitContainer: + image: network-operator-init-container + repository: ghcr.io/mellanox + version: v0.0.1 SriovNetworkOperator: image: sriov-network-operator repository: nvcr.io/nvidia/mellanox diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index 7e0165864..e41d74a67 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -154,6 +154,11 @@ ofedDriver: image: {{ .Mofed.Image }} repository: {{ .Mofed.Repository }} version: {{ .Mofed.Version }} + initContainer: + enable: true + repository: {{ .NetworkOperatorInitContainer.Repository }} + image: {{ .NetworkOperatorInitContainer.Image }} + version: {{ .NetworkOperatorInitContainer.Version }} # imagePullSecrets: [] # env, if defined will pass environment variables to the OFED container # env: @@ -166,7 +171,6 @@ ofedDriver: # Custom ssl key/certificate configuration certConfig: name: "" - startupProbe: initialDelaySeconds: 10 periodSeconds: 20 @@ -183,6 +187,8 @@ ofedDriver: # how many nodes can be upgraded in parallel (default: 1) # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 1 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # options for node drain (`kubectl drain`) before the driver reload # if auto upgrade is enabled but drain.enable is false, # then driver POD will be reloaded immediately without @@ -194,6 +200,11 @@ ofedDriver: # It's 
recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries timeoutSeconds: 300 deleteEmptyDir: false + waitForCompletion: + # specifies a label selector for the pods to wait for completion + # podSelector: "app=myapp" + # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite + # timeoutSeconds: 300 rdmaSharedDevicePlugin: deploy: true