diff --git a/deployment/network-operator/README.md b/deployment/network-operator/README.md index cd58e4d3c..10bf737c4 100644 --- a/deployment/network-operator/README.md +++ b/deployment/network-operator/README.md @@ -128,7 +128,7 @@ $ helm install --set nfd.enabled=false -n network-operator --create-namespace -- > __Note:__ The labels which Network Operator depends on may change between releases. > __Note:__ By default the operator is deployed without an instance of `NicClusterPolicy` and `MacvlanNetwork` -custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator. +> custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator. #### Deploy development version of Network Operator @@ -412,23 +412,37 @@ imagePullSecrets: #### Mellanox OFED driver -| Name | Type | Default | Description | -| ---- | ---- | ------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container | -| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository | -| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name | -| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version | -| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image | -| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image | -| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name | -| 
`ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name | -| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds| -| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay | -| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval | -| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay | -| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval | -| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay | -| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval | +| Name | Type | Default | Description | +|-------------------------------------------------------------|--------|-----------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container | +| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository | +| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name | +| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version | +| `ofedDriver.initContainer.enable` | bool | `true` | deploy init container | +| `ofedDriver.initContainer.repository` | string | `ghcr.io/mellanox` | init container image repository | +| `ofedDriver.initContainer.image` | string | `network-operator-init-container` | init container image name | +| `ofedDriver.initContainer.version` | string | `v0.0.1` | init container image version | +| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the 
Mellanox OFED driver image | +| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image | +| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name | +| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name | +| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds | +| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay | +| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval | +| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay | +| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval | +| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay | +| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval | +| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `false` | global switch for automatic upgrade feature | +| `ofedDriver.upgradePolicy.maxParallelUpgrades` | int | 1 | how many nodes can be upgraded in parallel, 0 means no limit, all nodes will be upgraded in parallel | +| `ofedDriver.upgradePolicy.safeLoad` | bool | `false` | cordon and drain (if enabled) a node before loading the driver on it | +| `ofedDriver.upgradePolicy.drain.enable` | bool | `true` | drain a node before the driver restart | +| `ofedDriver.upgradePolicy.drain.force` | bool | `false` | use force drain (check `kubectl drain` doc for details) | +| `ofedDriver.upgradePolicy.drain.podSelector` | string | "" | drain only pods matching this selector | +| `ofedDriver.upgradePolicy.drain.timeoutSeconds` | int | 300 | timeout for drain operation | 
+| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `false` | continue even if there are pods using emptyDir | +| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade | +| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite | #### RDMA Device Plugin @@ -606,7 +620,7 @@ optionally deployed components: | `nvIpam.enableWebhook` | bool | `false` | Enable deployment of the validataion webhook for IPPool CRD | > __Note__: Supported X.509 certificate management system should be available in the cluster to enable the validation webhook. -Currently supported systems are [certmanager](https://cert-manager.io/) and +> Currently supported systems are [certmanager](https://cert-manager.io/) and [Openshift certificate management](https://docs.openshift.com/container-platform/4.13/security/certificates/service-serving-certificate.html) diff --git a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml index 4da2ac340..1443df094 100644 --- a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml +++ b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml @@ -32,6 +32,13 @@ spec: image: {{ .Values.ofedDriver.image }} repository: {{ .Values.ofedDriver.repository }} version: {{ .Values.ofedDriver.version }} + {{- if .Values.ofedDriver.initContainer }} + initContainer: + enable: {{ .Values.ofedDriver.initContainer.enable }} + repository: {{ .Values.ofedDriver.initContainer.repository }} + image: {{ .Values.ofedDriver.initContainer.image }} + version: {{ .Values.ofedDriver.initContainer.version }} + {{- end }} {{- if .Values.ofedDriver.env }} env: {{ 
toYaml .Values.ofedDriver.env | nindent 6 }} @@ -59,12 +66,20 @@ spec: upgradePolicy: autoUpgrade: {{ .Values.ofedDriver.upgradePolicy.autoUpgrade | default false }} maxParallelUpgrades: {{ .Values.ofedDriver.upgradePolicy.maxParallelUpgrades | default 0 }} + safeLoad: {{ .Values.ofedDriver.upgradePolicy.safeLoad | default false }} + {{- if .Values.ofedDriver.upgradePolicy.drain }} drain: enable: {{ .Values.ofedDriver.upgradePolicy.drain.enable | default true }} force: {{ .Values.ofedDriver.upgradePolicy.drain.force | default false }} podSelector: {{ .Values.ofedDriver.upgradePolicy.drain.podSelector | quote }} timeoutSeconds: {{ .Values.ofedDriver.upgradePolicy.drain.timeoutSeconds }} deleteEmptyDir: {{ .Values.ofedDriver.upgradePolicy.drain.deleteEmptyDir | default false}} + {{- end }} + {{- if .Values.ofedDriver.upgradePolicy.waitForCompletion }} + waitForCompletion: + podSelector: {{ .Values.ofedDriver.upgradePolicy.waitForCompletion.podSelector | default ""}} + timeoutSeconds: {{ .Values.ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds | default 0 }} + {{- end }} {{- end }} {{- end }} {{- if .Values.rdmaSharedDevicePlugin.deploy }} diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index 6c029925f..f51023c1a 100644 --- a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -169,6 +169,11 @@ ofedDriver: image: mofed repository: nvcr.io/nvstaging/mellanox version: 23.10-0.5.5.0 + initContainer: + enable: true + repository: ghcr.io/mellanox + image: network-operator-init-container + version: v0.0.1 # imagePullSecrets: [] # env, if defined will pass environment variables to the OFED container # env: @@ -181,7 +186,6 @@ ofedDriver: # Custom ssl key/certificate configuration certConfig: name: "" - startupProbe: initialDelaySeconds: 10 periodSeconds: 20 @@ -198,6 +202,8 @@ ofedDriver: # how many nodes can be upgraded in parallel (default: 1) # 0 means no limit, all nodes will be 
upgraded in parallel maxParallelUpgrades: 1 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # options for node drain (`kubectl drain`) before the driver reload # if auto upgrade is enabled but drain.enable is false, # then driver POD will be reloaded immediately without @@ -209,6 +215,11 @@ ofedDriver: # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries timeoutSeconds: 300 deleteEmptyDir: false + waitForCompletion: + # specifies a label selector for the pods to wait for completion + # podSelector: "app=myapp" + # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite + # timeoutSeconds: 300 rdmaSharedDevicePlugin: deploy: true diff --git a/docs/automatic-ofed-upgrade.md b/docs/automatic-ofed-upgrade.md index b3bec3cd0..52b1cc575 100644 --- a/docs/automatic-ofed-upgrade.md +++ b/docs/automatic-ofed-upgrade.md @@ -7,7 +7,7 @@ It is possible to do a driver upgrade manually by following the [manual upgrade This document describes the automatic upgrade flow for the containerized OFED driver. 
### Upgrade NVIDIA Mellanox OFED automatically -* Enable automatic MOFED upgrade, define UpgradePolicy section for ofedDriver in the [NicClusterPolicy spec: +* Enable automatic MOFED upgrade, define UpgradePolicy section for ofedDriver in the NicClusterPolicy spec: ``` apiVersion: mellanox.com/v1alpha1 kind: NicClusterPolicy @@ -21,11 +21,13 @@ spec: version: 5.6-1.0.3.3 upgradePolicy: # autoUpgrade is a global switch for automatic upgrade feature - # if set to false all other options are ignored + # if set to false all other options are ignored autoUpgrade: true # maxParallelUpgrades indicates how many nodes can be upgraded in parallel - # 0 means no limit, all nodes will be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 0 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # describes the configuration for waiting on job completions waitForCompletion: # specifies a label selector for the pods to wait for completion @@ -49,11 +51,31 @@ spec: ``` * Change ofedDriver version in the NicClusterPolicy * To check if upgrade is finished, query the status of `state-OFED` in the [NicClusterPolicy status](https://github.com/Mellanox/network-operator#nicclusterpolicy-status) -* To track each node's upgrade status separately, run `kubectl describe node | grep nvidia.com/ofed-upgrade-state`. See [Node upgrade states](#node-upgrade-states) section describing each state. +* To track each node's upgrade status separately, run `kubectl describe node | grep nvidia.com/ofed-driver-upgrade-state`. See [Node upgrade states](#node-upgrade-states) section describing each state. + +### Safe driver loading + +The state of the feature can be controlled with `ofedDriver.upgradePolicy.safeLoad` option. + +On Node startup, the OFED container takes some time to compile and load the driver. +During that time, workloads might get scheduled on that Node. 
+When OFED is loaded, all existing PODs that use NVIDIA NICs will lose their network interfaces. +Some such PODs might silently fail or hang. +To avoid such a situation, before the OFED container is loaded, +the Node should get Cordoned and Drained to ensure all workloads are rescheduled. +The Node should be un-cordoned when the driver is ready on it. + +The safe driver loading feature is implemented as a part of the upgrade flow, +meaning safe driver loading is a special scenario of the upgrade procedure, +where we upgrade from the inbox driver to the containerized OFED. + +When this feature is enabled, the initial OFED driver rollout on a large cluster can take a long time. +To speed up the rollout, the initial deployment can be done with the safe driver loading feature disabled, +and this feature can be enabled later by updating the NicClusterPolicy CR. ### Details #### Node upgrade states -Each node's upgrade status is reflected in its `nvidia.com/ofed-upgrade-state` label. This label can have the following values: +Each node's upgrade status is reflected in its `nvidia.com/ofed-driver-upgrade-state` label.
This label can have the following values: * Unknown (empty): node has this state when the upgrade flow is disabled or the node hasn't been processed yet * `upgrade-done` is set when OFED POD is up to date and running on the node, the node is schedulable UpgradeStateDone = "upgrade-done" diff --git a/hack/release.go b/hack/release.go index 67685ed6b..0b8d1a5f0 100644 --- a/hack/release.go +++ b/hack/release.go @@ -31,21 +31,22 @@ import ( ) type Release struct { - NetworkOperator *mellanoxv1alpha1.ImageSpec - SriovNetworkOperator *mellanoxv1alpha1.ImageSpec - SriovConfigDaemon *mellanoxv1alpha1.ImageSpec - SriovCni *mellanoxv1alpha1.ImageSpec - SriovIbCni *mellanoxv1alpha1.ImageSpec - Mofed *mellanoxv1alpha1.ImageSpec - RdmaSharedDevicePlugin *mellanoxv1alpha1.ImageSpec - SriovDevicePlugin *mellanoxv1alpha1.ImageSpec - IbKubernetes *mellanoxv1alpha1.ImageSpec - CniPlugins *mellanoxv1alpha1.ImageSpec - Multus *mellanoxv1alpha1.ImageSpec - Ipoib *mellanoxv1alpha1.ImageSpec - IpamPlugin *mellanoxv1alpha1.ImageSpec - NvIPAM *mellanoxv1alpha1.ImageSpec - NicFeatureDiscovery *mellanoxv1alpha1.ImageSpec + NetworkOperator *mellanoxv1alpha1.ImageSpec + NetworkOperatorInitContainer *mellanoxv1alpha1.ImageSpec + SriovNetworkOperator *mellanoxv1alpha1.ImageSpec + SriovConfigDaemon *mellanoxv1alpha1.ImageSpec + SriovCni *mellanoxv1alpha1.ImageSpec + SriovIbCni *mellanoxv1alpha1.ImageSpec + Mofed *mellanoxv1alpha1.ImageSpec + RdmaSharedDevicePlugin *mellanoxv1alpha1.ImageSpec + SriovDevicePlugin *mellanoxv1alpha1.ImageSpec + IbKubernetes *mellanoxv1alpha1.ImageSpec + CniPlugins *mellanoxv1alpha1.ImageSpec + Multus *mellanoxv1alpha1.ImageSpec + Ipoib *mellanoxv1alpha1.ImageSpec + IpamPlugin *mellanoxv1alpha1.ImageSpec + NvIPAM *mellanoxv1alpha1.ImageSpec + NicFeatureDiscovery *mellanoxv1alpha1.ImageSpec } func readDefaults(releaseDefaults string) Release { @@ -80,6 +81,7 @@ func initWithEnvVariale(name string, image *mellanoxv1alpha1.ImageSpec) { func 
readEnvironmentVariables(release *Release) { initWithEnvVariale("NETWORK_OPERATOR", release.NetworkOperator) + initWithEnvVariale("NETWORK_OPERATOR_INIT_CONTAINER", release.NetworkOperatorInitContainer) initWithEnvVariale("MOFED", release.Mofed) initWithEnvVariale("RDMA_SHARED_DEVICE_PLUGIN", release.RdmaSharedDevicePlugin) initWithEnvVariale("SRIOV_DEVICE_PLUGIN", release.SriovDevicePlugin) diff --git a/hack/release.yaml b/hack/release.yaml index 9047afe02..02950d56b 100644 --- a/hack/release.yaml +++ b/hack/release.yaml @@ -1,6 +1,10 @@ NetworkOperator: image: network-operator repository: nvcr.io/nvstaging/mellanox +NetworkOperatorInitContainer: + image: network-operator-init-container + repository: ghcr.io/mellanox + version: v0.0.1 SriovNetworkOperator: image: sriov-network-operator repository: nvcr.io/nvstaging/mellanox diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index 7cb716510..4589329a8 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -169,6 +169,11 @@ ofedDriver: image: {{ .Mofed.Image }} repository: {{ .Mofed.Repository }} version: {{ .Mofed.Version }} + initContainer: + enable: true + repository: {{ .NetworkOperatorInitContainer.Repository }} + image: {{ .NetworkOperatorInitContainer.Image }} + version: {{ .NetworkOperatorInitContainer.Version }} # imagePullSecrets: [] # env, if defined will pass environment variables to the OFED container # env: @@ -181,7 +186,6 @@ ofedDriver: # Custom ssl key/certificate configuration certConfig: name: "" - startupProbe: initialDelaySeconds: 10 periodSeconds: 20 @@ -198,6 +202,8 @@ ofedDriver: # how many nodes can be upgraded in parallel (default: 1) # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 1 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # options for node drain (`kubectl drain`) before the driver reload # if auto upgrade is enabled but 
drain.enable is false, # then driver POD will be reloaded immediately without @@ -209,6 +215,11 @@ ofedDriver: # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries timeoutSeconds: 300 deleteEmptyDir: false + waitForCompletion: + # specifies a label selector for the pods to wait for completion + # podSelector: "app=myapp" + # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite + # timeoutSeconds: 300 rdmaSharedDevicePlugin: deploy: true