diff --git a/Makefile b/Makefile index cf10aa6f5..d277cf2a0 100644 --- a/Makefile +++ b/Makefile @@ -308,6 +308,7 @@ release-build: cd hack && $(GO) run release.go --templateDir ./templates/samples/ --outputDir ../config/samples/ cd hack && $(GO) run release.go --templateDir ./templates/crs/ --outputDir ../example/crs cd hack && $(GO) run release.go --templateDir ./templates/values/ --outputDir ../deployment/network-operator/ + cd hack && $(GO) run release.go --templateDir ./templates/config/manager --outputDir ../config/manager/ # go-install-tool will 'go install' any package $2 and install it to $1. define go-install-tool diff --git a/api/v1alpha1/nicclusterpolicy_types.go b/api/v1alpha1/nicclusterpolicy_types.go index f24be2760..f5bbebb3a 100644 --- a/api/v1alpha1/nicclusterpolicy_types.go +++ b/api/v1alpha1/nicclusterpolicy_types.go @@ -108,6 +108,10 @@ type DriverUpgradePolicySpec struct { MaxParallelUpgrades int `json:"maxParallelUpgrades,omitempty"` WaitForCompletion *WaitForCompletionSpec `json:"waitForCompletion,omitempty"` DrainSpec *DrainSpec `json:"drain,omitempty"` + // SafeLoad turn on safe driver loading (cordon and drain the node before loading the driver) + // +optional + // +kubebuilder:default:=false + SafeLoad bool `json:"safeLoad,omitempty"` } // WaitForCompletionSpec describes the configuration for waiting on job completions diff --git a/api/v1alpha1/nicclusterpolicy_webhook.go b/api/v1alpha1/nicclusterpolicy_webhook.go index 714e8c842..7ad4f1b47 100644 --- a/api/v1alpha1/nicclusterpolicy_webhook.go +++ b/api/v1alpha1/nicclusterpolicy_webhook.go @@ -100,7 +100,9 @@ func (w *NicClusterPolicy) ValidateDelete() (admission.Warnings, error) { /* We are validating here NicClusterPolicy: 1. IBKubernetes.pKeyGUIDPoolRangeStart and IBKubernetes.pKeyGUIDPoolRangeEnd must be valid GUID and valid range. - 2. OFEDDriver.version must be a valid ofed version. + 2. OFEDDriver driver configuration + 2.1 version must be a valid ofed version. 
+ 2.2 safeLoad feature can be enabled only when autoUpgrade is enabled 3. RdmaSharedDevicePlugin.Config. 3.1. Configuration is a valid JSON and check its schema. 3.2. resourceName is valid for k8s. @@ -124,7 +126,10 @@ func (w *NicClusterPolicy) validateNicClusterPolicy() error { // Validate OFEDDriverSpec ofedDriver := w.Spec.OFEDDriver if ofedDriver != nil { - allErrs = append(allErrs, ofedDriver.validateVersion(field.NewPath("spec").Child("ofedDriver"))...) + ofedDriverFieldPath := field.NewPath("spec").Child("ofedDriver") + allErrs = append(append(allErrs, + ofedDriver.validateVersion(ofedDriverFieldPath)...), + ofedDriver.validateSafeLoad(ofedDriverFieldPath)...) } // Validate RdmaSharedDevicePlugin rdmaSharedDevicePlugin := w.Spec.RdmaSharedDevicePlugin @@ -364,6 +369,24 @@ func (ofedSpec *OFEDDriverSpec) validateVersion(fldPath *field.Path) field.Error return allErrs } +func (ofedSpec *OFEDDriverSpec) validateSafeLoad(fldPath *field.Path) field.ErrorList { + upgradePolicy := ofedSpec.OfedUpgradePolicy + if upgradePolicy == nil { + return nil + } + if !upgradePolicy.SafeLoad { + return nil + } + allErrs := field.ErrorList{} + upgradePolicyFieldPath := fldPath.Child("upgradePolicy") + if !upgradePolicy.AutoUpgrade { + allErrs = append(allErrs, field.Forbidden(upgradePolicyFieldPath.Child("safeLoad"), + fmt.Sprintf("safeLoad requires %s to be true", + upgradePolicyFieldPath.Child("autoUpgrade").String()))) + } + return allErrs +} + func (w *NicClusterPolicy) validateRepositories(allErrs field.ErrorList) field.ErrorList { fp := field.NewPath("spec") if w.Spec.OFEDDriver != nil { diff --git a/api/v1alpha1/nicclusterpolicy_webhook_test.go b/api/v1alpha1/nicclusterpolicy_webhook_test.go index af9fd265d..6f174de3c 100644 --- a/api/v1alpha1/nicclusterpolicy_webhook_test.go +++ b/api/v1alpha1/nicclusterpolicy_webhook_test.go @@ -118,6 +118,47 @@ var _ = Describe("Validate", func() { _, err := nicClusterPolicy.ValidateCreate() 
Expect(err.Error()).To(ContainSubstring("invalid OFED version")) }) + It("MOFED SafeLoad requires AutoUpgrade to be enabled", func() { + nicClusterPolicy := NicClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: NicClusterPolicySpec{ + OFEDDriver: &OFEDDriverSpec{ + ImageSpec: ImageSpec{ + Image: "mofed", + Repository: "ghcr.io/mellanox", + Version: "23.10-0.2.2.0", + ImagePullSecrets: []string{}, + }, + OfedUpgradePolicy: &DriverUpgradePolicySpec{ + SafeLoad: true, + }, + }, + }, + } + _, err := nicClusterPolicy.ValidateCreate() + Expect(err.Error()).To(ContainSubstring("autoUpgrade")) + }) + It("MOFED valid SafeLoad config", func() { + nicClusterPolicy := NicClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: NicClusterPolicySpec{ + OFEDDriver: &OFEDDriverSpec{ + ImageSpec: ImageSpec{ + Image: "mofed", + Repository: "ghcr.io/mellanox", + Version: "23.10-0.2.2.0", + ImagePullSecrets: []string{}, + }, + OfedUpgradePolicy: &DriverUpgradePolicySpec{ + SafeLoad: true, + AutoUpgrade: true, + }, + }, + }, + } + _, err := nicClusterPolicy.ValidateCreate() + Expect(err).To(BeNil()) + }) It("Valid RDMA config JSON", func() { rdmaConfig := `{ "configList": [{ diff --git a/config/crd/bases/mellanox.com_nicclusterpolicies.yaml b/config/crd/bases/mellanox.com_nicclusterpolicies.yaml index 7f335399d..2dd989a4e 100644 --- a/config/crd/bases/mellanox.com_nicclusterpolicies.yaml +++ b/config/crd/bases/mellanox.com_nicclusterpolicies.yaml @@ -553,6 +553,11 @@ spec: will be upgraded in parallel minimum: 0 type: integer + safeLoad: + default: false + description: SafeLoad turn on safe driver loading (cordon + and drain the node before loading the driver) + type: boolean waitForCompletion: description: WaitForCompletionSpec describes the configuration for waiting on job completions diff --git a/config/manager/init_container_image_name_patch.yaml b/config/manager/init_container_image_name_patch.yaml new file mode 100644 index 000000000..6ec0e1485 --- 
/dev/null +++ b/config/manager/init_container_image_name_patch.yaml @@ -0,0 +1,5 @@ +- op: add + path: "/spec/template/spec/containers/0/env/-" + value: + name: OFED_INIT_CONTAINER_IMAGE + value: "ghcr.io/mellanox/network-operator-init-container:v0.0.2" diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 3003fa20e..3fee003d7 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -7,6 +7,15 @@ commonLabels: generatorOptions: disableNameSuffixHash: true +patches: +- path: init_container_image_name_patch.yaml + target: + group: apps + kind: Deployment + name: controller-manager + namespace: system + version: v1 + kind: Kustomization images: - name: controller diff --git a/deployment/network-operator/README.md b/deployment/network-operator/README.md index 0bef0137e..3812c14b5 100644 --- a/deployment/network-operator/README.md +++ b/deployment/network-operator/README.md @@ -128,7 +128,7 @@ $ helm install --set nfd.enabled=false -n network-operator --create-namespace -- > __Note:__ The labels which Network Operator depends on may change between releases. > __Note:__ By default the operator is deployed without an instance of `NicClusterPolicy` and `MacvlanNetwork` -custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator. +> custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator. 
#### Deploy development version of Network Operator @@ -398,23 +398,37 @@ imagePullSecrets: #### Mellanox OFED driver -| Name | Type | Default | Description | -| ---- | ---- | ------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container | -| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository | -| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name | -| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version | -| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image | -| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image | -| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name | -| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name | -| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds| -| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay | -| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval | -| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay | -| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval | -| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay | -| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval | +| Name | 
Type | Default | Description | +|-------------------------------------------------------------|--------|-----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container | +| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository | +| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name | +| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version | +| `ofedDriver.initContainer.enable` | bool | `true` | deploy init container | +| `ofedDriver.initContainer.repository` | string | `ghcr.io/mellanox` | init container image repository | +| `ofedDriver.initContainer.image` | string | `network-operator-init-container` | init container image name | +| `ofedDriver.initContainer.version` | string | `v0.0.2` | init container image version | +| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image | +| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image | +| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name | +| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name | +| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds | +| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay | +| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval | +| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED 
liveness probe initial delay | +| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval | +| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay | +| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval | +| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `false` | global switch for automatic upgrade feature | +| `ofedDriver.upgradePolicy.maxParallelUpgrades` | int | 1 | how many nodes can be upgraded in parallel, 0 means no limit, all nodes will be upgraded in parallel | +| `ofedDriver.upgradePolicy.safeLoad` | bool | `false` | cordon and drain (if enabled) a node before loading the driver on it, requires `ofedDriver.initContainer` to be enabled and `ofedDriver.upgradePolicy.autoUpgrade` to be true | +| `ofedDriver.upgradePolicy.drain.enable` | bool | `true` | drain a node before the driver restart | +| `ofedDriver.upgradePolicy.drain.force` | bool | `false` | use force drain (check `kubectl drain` doc for details) | +| `ofedDriver.upgradePolicy.drain.podSelector` | string | "" | drain only pods matching this selector | +| `ofedDriver.upgradePolicy.drain.timeoutSeconds` | int | 300 | timeout for drain operation | +| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `false` | continue even if there are pods using emptyDir | +| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade | +| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite | #### RDMA Device Plugin @@ -592,7 +606,7 @@ optionally deployed components: | `nvIpam.enableWebhook` | bool | `false` | Enable deployment of the validataion webhook for IPPool CRD | > __Note__: Supported X.509 certificate management system 
should be available in the cluster to enable the validation webhook. -Currently supported systems are [certmanager](https://cert-manager.io/) and +> Currently supported systems are [certmanager](https://cert-manager.io/) and [Openshift certificate management](https://docs.openshift.com/container-platform/4.13/security/certificates/service-serving-certificate.html) diff --git a/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml b/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml index 7f335399d..2dd989a4e 100644 --- a/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml +++ b/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml @@ -553,6 +553,11 @@ spec: will be upgraded in parallel minimum: 0 type: integer + safeLoad: + default: false + description: SafeLoad turn on safe driver loading (cordon + and drain the node before loading the driver) + type: boolean waitForCompletion: description: WaitForCompletionSpec describes the configuration for waiting on job completions diff --git a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml index 4da2ac340..e431bc271 100644 --- a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml +++ b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml @@ -59,12 +59,20 @@ spec: upgradePolicy: autoUpgrade: {{ .Values.ofedDriver.upgradePolicy.autoUpgrade | default false }} maxParallelUpgrades: {{ .Values.ofedDriver.upgradePolicy.maxParallelUpgrades | default 0 }} + safeLoad: {{ .Values.ofedDriver.upgradePolicy.safeLoad | default false }} + {{- if .Values.ofedDriver.upgradePolicy.drain }} drain: enable: {{ .Values.ofedDriver.upgradePolicy.drain.enable | default true }} force: {{ .Values.ofedDriver.upgradePolicy.drain.force | default false }} podSelector: {{ 
.Values.ofedDriver.upgradePolicy.drain.podSelector | quote }} timeoutSeconds: {{ .Values.ofedDriver.upgradePolicy.drain.timeoutSeconds }} deleteEmptyDir: {{ .Values.ofedDriver.upgradePolicy.drain.deleteEmptyDir | default false}} + {{- end }} + {{- if .Values.ofedDriver.upgradePolicy.waitForCompletion }} + waitForCompletion: + podSelector: {{ .Values.ofedDriver.upgradePolicy.waitForCompletion.podSelector | default ""}} + timeoutSeconds: {{ .Values.ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds | default 0 }} + {{- end }} {{- end }} {{- end }} {{- if .Values.rdmaSharedDevicePlugin.deploy }} diff --git a/deployment/network-operator/templates/operator.yaml b/deployment/network-operator/templates/operator.yaml index 0a7809ff3..07ccb460c 100644 --- a/deployment/network-operator/templates/operator.yaml +++ b/deployment/network-operator/templates/operator.yaml @@ -82,6 +82,12 @@ spec: - name: CNI_BIN_DIR value: "{{ .Values.operator.cniBinDirectory }}" {{- end }} + {{- if and .Values.ofedDriver.initContainer .Values.ofedDriver.initContainer.enable }} + - name: OFED_INIT_CONTAINER_IMAGE + {{- with .Values.ofedDriver.initContainer }} + value: "{{ .repository }}/{{ .image }}:{{ .version }}" + {{- end }} + {{- end }} securityContext: allowPrivilegeEscalation: false livenessProbe: diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index 6c029925f..b137503a9 100644 --- a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -169,6 +169,11 @@ ofedDriver: image: mofed repository: nvcr.io/nvstaging/mellanox version: 23.10-0.5.5.0 + initContainer: + enable: true + repository: ghcr.io/mellanox + image: network-operator-init-container + version: v0.0.2 # imagePullSecrets: [] # env, if defined will pass environment variables to the OFED container # env: @@ -181,7 +186,6 @@ ofedDriver: # Custom ssl key/certificate configuration certConfig: name: "" - startupProbe: initialDelaySeconds: 10 
periodSeconds: 20 @@ -198,6 +202,8 @@ ofedDriver: # how many nodes can be upgraded in parallel (default: 1) # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 1 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # options for node drain (`kubectl drain`) before the driver reload # if auto upgrade is enabled but drain.enable is false, # then driver POD will be reloaded immediately without @@ -209,6 +215,11 @@ ofedDriver: # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries timeoutSeconds: 300 deleteEmptyDir: false + waitForCompletion: + # specifies a label selector for the pods to wait for completion + # podSelector: "app=myapp" + # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite + # timeoutSeconds: 300 rdmaSharedDevicePlugin: deploy: true diff --git a/docs/automatic-ofed-upgrade.md b/docs/automatic-ofed-upgrade.md index b3bec3cd0..52b1cc575 100644 --- a/docs/automatic-ofed-upgrade.md +++ b/docs/automatic-ofed-upgrade.md @@ -7,7 +7,7 @@ It is possible to do a driver upgrade manually by following the [manual upgrade This document describes the automatic upgrade flow for the containerized OFED driver. 
### Upgrade NVIDIA Mellanox OFED automatically -* Enable automatic MOFED upgrade, define UpgradePolicy section for ofedDriver in the [NicClusterPolicy spec: +* Enable automatic MOFED upgrade, define UpgradePolicy section for ofedDriver in the NicClusterPolicy spec: ``` apiVersion: mellanox.com/v1alpha1 kind: NicClusterPolicy @@ -21,11 +21,13 @@ spec: version: 5.6-1.0.3.3 upgradePolicy: # autoUpgrade is a global switch for automatic upgrade feature - # if set to false all other options are ignored + # if set to false all other options are ignored autoUpgrade: true # maxParallelUpgrades indicates how many nodes can be upgraded in parallel - # 0 means no limit, all nodes will be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 0 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # describes the configuration for waiting on job completions waitForCompletion: # specifies a label selector for the pods to wait for completion @@ -49,11 +51,31 @@ spec: ``` * Change ofedDriver version in the NicClusterPolicy * To check if upgrade is finished, query the status of `state-OFED` in the [NicClusterPolicy status](https://github.com/Mellanox/network-operator#nicclusterpolicy-status) -* To track each node's upgrade status separately, run `kubectl describe node | grep nvidia.com/ofed-upgrade-state`. See [Node upgrade states](#node-upgrade-states) section describing each state. +* To track each node's upgrade status separately, run `kubectl describe node | grep nvidia.com/ofed-driver-upgrade-state`. See the [Node upgrade states](#node-upgrade-states) section for a description of each state. + +### Safe driver loading + +The state of the feature can be controlled with the `ofedDriver.upgradePolicy.safeLoad` option. + +On Node startup, the OFED container takes some time to compile and load the driver. +During that time, workloads might get scheduled on that Node.
+When OFED is loaded, all existing PODs that use NVIDIA NICs will lose their network interfaces. +Some of these PODs might silently fail or hang. +To avoid such a situation, before the OFED container is loaded, +the Node should be cordoned and drained to ensure all workloads are rescheduled. +The Node should be un-cordoned when the driver is ready on it. + +The safe driver loading feature is implemented as a part of the upgrade flow, +meaning safe driver loading is a special scenario of the upgrade procedure, +where we upgrade from the inbox driver to the containerized OFED. + +When this feature is enabled, the initial OFED driver rollout on a large cluster can take a long time. +To speed up the rollout, the initial deployment can be done with the safe driver loading feature disabled, +and this feature can be enabled later by updating the NicClusterPolicy CR. ### Details #### Node upgrade states -Each node's upgrade status is reflected in its `nvidia.com/ofed-upgrade-state` label. This label can have the following values: +Each node's upgrade status is reflected in its `nvidia.com/ofed-driver-upgrade-state` label.
This label can have the following values: * Unknown (empty): node has this state when the upgrade flow is disabled or the node hasn't been processed yet * `upgrade-done` is set when OFED POD is up to date and running on the node, the node is schedulable UpgradeStateDone = "upgrade-done" diff --git a/hack/release.go b/hack/release.go index 67685ed6b..0b8d1a5f0 100644 --- a/hack/release.go +++ b/hack/release.go @@ -31,21 +31,22 @@ import ( ) type Release struct { - NetworkOperator *mellanoxv1alpha1.ImageSpec - SriovNetworkOperator *mellanoxv1alpha1.ImageSpec - SriovConfigDaemon *mellanoxv1alpha1.ImageSpec - SriovCni *mellanoxv1alpha1.ImageSpec - SriovIbCni *mellanoxv1alpha1.ImageSpec - Mofed *mellanoxv1alpha1.ImageSpec - RdmaSharedDevicePlugin *mellanoxv1alpha1.ImageSpec - SriovDevicePlugin *mellanoxv1alpha1.ImageSpec - IbKubernetes *mellanoxv1alpha1.ImageSpec - CniPlugins *mellanoxv1alpha1.ImageSpec - Multus *mellanoxv1alpha1.ImageSpec - Ipoib *mellanoxv1alpha1.ImageSpec - IpamPlugin *mellanoxv1alpha1.ImageSpec - NvIPAM *mellanoxv1alpha1.ImageSpec - NicFeatureDiscovery *mellanoxv1alpha1.ImageSpec + NetworkOperator *mellanoxv1alpha1.ImageSpec + NetworkOperatorInitContainer *mellanoxv1alpha1.ImageSpec + SriovNetworkOperator *mellanoxv1alpha1.ImageSpec + SriovConfigDaemon *mellanoxv1alpha1.ImageSpec + SriovCni *mellanoxv1alpha1.ImageSpec + SriovIbCni *mellanoxv1alpha1.ImageSpec + Mofed *mellanoxv1alpha1.ImageSpec + RdmaSharedDevicePlugin *mellanoxv1alpha1.ImageSpec + SriovDevicePlugin *mellanoxv1alpha1.ImageSpec + IbKubernetes *mellanoxv1alpha1.ImageSpec + CniPlugins *mellanoxv1alpha1.ImageSpec + Multus *mellanoxv1alpha1.ImageSpec + Ipoib *mellanoxv1alpha1.ImageSpec + IpamPlugin *mellanoxv1alpha1.ImageSpec + NvIPAM *mellanoxv1alpha1.ImageSpec + NicFeatureDiscovery *mellanoxv1alpha1.ImageSpec } func readDefaults(releaseDefaults string) Release { @@ -80,6 +81,7 @@ func initWithEnvVariale(name string, image *mellanoxv1alpha1.ImageSpec) { func 
readEnvironmentVariables(release *Release) { initWithEnvVariale("NETWORK_OPERATOR", release.NetworkOperator) + initWithEnvVariale("NETWORK_OPERATOR_INIT_CONTAINER", release.NetworkOperatorInitContainer) initWithEnvVariale("MOFED", release.Mofed) initWithEnvVariale("RDMA_SHARED_DEVICE_PLUGIN", release.RdmaSharedDevicePlugin) initWithEnvVariale("SRIOV_DEVICE_PLUGIN", release.SriovDevicePlugin) diff --git a/hack/release.yaml b/hack/release.yaml index 9047afe02..41d295f4d 100644 --- a/hack/release.yaml +++ b/hack/release.yaml @@ -1,6 +1,10 @@ NetworkOperator: image: network-operator repository: nvcr.io/nvstaging/mellanox +NetworkOperatorInitContainer: + image: network-operator-init-container + repository: ghcr.io/mellanox + version: v0.0.2 SriovNetworkOperator: image: sriov-network-operator repository: nvcr.io/nvstaging/mellanox diff --git a/hack/templates/config/manager/init_container_image_name_patch.template b/hack/templates/config/manager/init_container_image_name_patch.template new file mode 100644 index 000000000..efc93309f --- /dev/null +++ b/hack/templates/config/manager/init_container_image_name_patch.template @@ -0,0 +1,5 @@ +- op: add + path: "/spec/template/spec/containers/0/env/-" + value: + name: OFED_INIT_CONTAINER_IMAGE + value: "{{ .NetworkOperatorInitContainer.Repository }}/{{ .NetworkOperatorInitContainer.Image }}:{{ .NetworkOperatorInitContainer.Version }}" diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index 7cb716510..4589329a8 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -169,6 +169,11 @@ ofedDriver: image: {{ .Mofed.Image }} repository: {{ .Mofed.Repository }} version: {{ .Mofed.Version }} + initContainer: + enable: true + repository: {{ .NetworkOperatorInitContainer.Repository }} + image: {{ .NetworkOperatorInitContainer.Image }} + version: {{ .NetworkOperatorInitContainer.Version }} # imagePullSecrets: [] # env, if defined will pass environment 
variables to the OFED container # env: @@ -181,7 +186,6 @@ ofedDriver: # Custom ssl key/certificate configuration certConfig: name: "" - startupProbe: initialDelaySeconds: 10 periodSeconds: 20 @@ -198,6 +202,8 @@ ofedDriver: # how many nodes can be upgraded in parallel (default: 1) # 0 means no limit, all nodes will be upgraded in parallel maxParallelUpgrades: 1 + # cordon and drain (if enabled) a node before loading the driver on it + safeLoad: false # options for node drain (`kubectl drain`) before the driver reload # if auto upgrade is enabled but drain.enable is false, # then driver POD will be reloaded immediately without @@ -209,6 +215,11 @@ ofedDriver: # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries timeoutSeconds: 300 deleteEmptyDir: false + waitForCompletion: + # specifies a label selector for the pods to wait for completion + # podSelector: "app=myapp" + # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite + # timeoutSeconds: 300 rdmaSharedDevicePlugin: deploy: true diff --git a/manifests/state-ofed-driver/0010_service-account.openshift.yaml b/manifests/state-ofed-driver/0010_service-account.openshift.yaml deleted file mode 100644 index 7f15dacc4..000000000 --- a/manifests/state-ofed-driver/0010_service-account.openshift.yaml +++ /dev/null @@ -1,7 +0,0 @@ -{{ if .RuntimeSpec.IsOpenshift }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: ofed-driver - namespace: {{ .RuntimeSpec.Namespace }} -{{end}} diff --git a/manifests/state-ofed-driver/0010_service-account.yaml b/manifests/state-ofed-driver/0010_service-account.yaml new file mode 100644 index 000000000..27599c451 --- /dev/null +++ b/manifests/state-ofed-driver/0010_service-account.yaml @@ -0,0 +1,18 @@ +# 2023 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ofed-driver + namespace: {{ .RuntimeSpec.Namespace }} diff --git a/manifests/state-ofed-driver/0020_cluster_role.yaml b/manifests/state-ofed-driver/0020_cluster_role.yaml new file mode 100644 index 000000000..72d25d549 --- /dev/null +++ b/manifests/state-ofed-driver/0020_cluster_role.yaml @@ -0,0 +1,24 @@ +# 2023 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ofed-driver +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "patch", "watch", "update"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list"] diff --git a/manifests/state-ofed-driver/0030_cluster_role_binding.yaml b/manifests/state-ofed-driver/0030_cluster_role_binding.yaml new file mode 100644 index 000000000..46b5b3d96 --- /dev/null +++ b/manifests/state-ofed-driver/0030_cluster_role_binding.yaml @@ -0,0 +1,25 @@ +# 2023 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ofed-driver +subjects: + - kind: ServiceAccount + name: ofed-driver + namespace: {{ .RuntimeSpec.Namespace }} +roleRef: + kind: ClusterRole + name: ofed-driver + apiGroup: rbac.authorization.k8s.io diff --git a/manifests/state-ofed-driver/0040_ofed_init_container_config-configmap.yaml b/manifests/state-ofed-driver/0040_ofed_init_container_config-configmap.yaml new file mode 100644 index 000000000..3d79b41bb --- /dev/null +++ b/manifests/state-ofed-driver/0040_ofed_init_container_config-configmap.yaml @@ -0,0 +1,28 @@ +# 2023 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +{{- if .RuntimeSpec.InitContainerConfig.InitContainerEnable }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: ofed-init-container-config + namespace: {{ .RuntimeSpec.Namespace }} +data: + config.json: |- + { + "safeDriverLoad": { + "enable": {{ .RuntimeSpec.InitContainerConfig.SafeLoadEnable }}, + "annotation": "{{ .RuntimeSpec.InitContainerConfig.SafeLoadAnnotation }}" + } + } +{{end}} diff --git a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml index eabfbcfbc..4f1da2fd1 100644 --- a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml +++ b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml @@ -1,4 +1,4 @@ -# Copyright 2020 NVIDIA +# Copyright 2023 NVIDIA # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,9 +40,7 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule -{{ if .RuntimeSpec.IsOpenshift }} serviceAccountName: ofed-driver -{{end}} hostNetwork: true {{- if .CrSpec.ImagePullSecrets }} imagePullSecrets: @@ -50,6 +48,26 @@ spec: - name: {{ . 
}} {{- end }} {{- end }} + {{- if .RuntimeSpec.InitContainerConfig.InitContainerEnable }} + initContainers: # present only when InitContainerEnable is true (enclosing if) + - name: network-operator-init-container + imagePullPolicy: IfNotPresent + image: {{ .RuntimeSpec.InitContainerConfig.InitContainerImageName }} + args: + - --node-name + - $(NODE_NAME) # refers to the NODE_NAME env var declared under env below + - --configmap-name + - ofed-init-container-config + - --configmap-namespace + - {{ .RuntimeSpec.Namespace }} + - --configmap-key + - config.json + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName # downward API field: the pod's node name + {{- end }} containers: - image: {{ .RuntimeSpec.MOFEDImageName }} imagePullPolicy: IfNotPresent diff --git a/pkg/config/config.go b/pkg/config/config.go index 73495b531..6f53e4740 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -37,6 +37,7 @@ type OperatorConfig struct { type StateConfig struct { NetworkOperatorResourceNamespace string `env:"POD_NAMESPACE" envDefault:"nvidia-network-operator"` ManifestBaseDir string `env:"STATE_MANIFEST_BASE_DIR" envDefault:"./manifests"` + OFEDState OFEDStateConfig } // Controller related configurations @@ -46,6 +47,14 @@ type ControllerConfig struct { RequeueTimeSeconds uint `env:"CONTROLLER_REQUEST_REQUEUE_SECONDS" envDefault:"5"` } +// OFEDStateConfig contains extra configuration options for the OFED state which +// can't be configured via CRD +type OFEDStateConfig struct { + // InitContainerImage is a full image name (registry, image name, tag) for the OFED init container. + // The init container will not be deployed if this variable is empty/not set.
+ InitContainerImage string `env:"OFED_INIT_CONTAINER_IMAGE"` +} + func FromEnv() *OperatorConfig { once.Do(func() { operatorConfig = &OperatorConfig{} diff --git a/pkg/state/state_ofed.go b/pkg/state/state_ofed.go index fb5167c40..472e0264c 100644 --- a/pkg/state/state_ofed.go +++ b/pkg/state/state_ofed.go @@ -25,6 +25,7 @@ import ( "time" "github.com/Masterminds/semver/v3" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" "github.com/go-logr/logr" osconfigv1 "github.com/openshift/api/config/v1" "github.com/pkg/errors" @@ -137,12 +138,20 @@ type additionalVolumeMounts struct { Volumes []v1.Volume } +type initContainerConfig struct { + InitContainerEnable bool // set when an init container image is configured + InitContainerImageName string // image used for the init container + SafeLoadEnable bool // AutoUpgrade && SafeLoad from the CR's upgrade policy + SafeLoadAnnotation string // key from upgrade.GetUpgradeDriverWaitForSafeLoadAnnotationKey() +} + type ofedRuntimeSpec struct { runtimeSpec - CPUArch string - OSName string - OSVer string - MOFEDImageName string + CPUArch string + OSName string + OSVer string + MOFEDImageName string + InitContainerConfig initContainerConfig // is true if cluster type is Openshift IsOpenshift bool } @@ -448,7 +457,9 @@ func (s *stateOFED) getManifestObjects( OSName: nodeAttr[nodeinfo.AttrTypeOSName], OSVer: nodeAttr[nodeinfo.AttrTypeOSVer], MOFEDImageName: s.getMofedDriverImageName(cr, nodeAttr, reqLogger), - IsOpenshift: clusterInfo.IsOpenshift(), + InitContainerConfig: s.getInitContainerConfig(cr, reqLogger, + config.FromEnv().State.OFEDState.InitContainerImage), + IsOpenshift: clusterInfo.IsOpenshift(), }, Tolerations: cr.Spec.Tolerations, NodeAffinity: cr.Spec.NodeAffinity, @@ -464,6 +475,30 @@ func (s *stateOFED) getManifestObjects( return objs, nil } +// getInitContainerConfig prepares the configuration for the init container; +// the init container is disabled if the image is empty +func (s *stateOFED) getInitContainerConfig( + cr *mellanoxv1alpha1.NicClusterPolicy, reqLogger logr.Logger, image string) initContainerConfig { + var initContCfg initContainerConfig + safeLoadEnable := cr.Spec.OFEDDriver.OfedUpgradePolicy != nil && +
cr.Spec.OFEDDriver.OfedUpgradePolicy.AutoUpgrade && + cr.Spec.OFEDDriver.OfedUpgradePolicy.SafeLoad + if image != "" { // empty image: keep the zero-value config (init container disabled) + initContCfg = initContainerConfig{ + InitContainerEnable: true, + InitContainerImageName: image, + SafeLoadEnable: safeLoadEnable, + SafeLoadAnnotation: upgrade.GetUpgradeDriverWaitForSafeLoadAnnotationKey(), + } + } + + if safeLoadEnable && !initContCfg.InitContainerEnable { // safe load requested but the init container is disabled: report it + reqLogger.Error(nil, "safe driver loading feature is enabled, but init container is "+ + "disabled. It is required to enable init container to use safe driver loading feature.") + } + return initContCfg +} + // getMofedDriverImageName generates MOFED driver image name based on the driver version specified in CR // TODO(adrianc): in Network-Operator v1.5.0, we should just use the new naming scheme func (s *stateOFED) getMofedDriverImageName(cr *mellanoxv1alpha1.NicClusterPolicy, diff --git a/pkg/state/state_ofed_test.go b/pkg/state/state_ofed_test.go index 2209d624f..f2f31bf0f 100644 --- a/pkg/state/state_ofed_test.go +++ b/pkg/state/state_ofed_test.go @@ -78,6 +78,55 @@ var _ = Describe("MOFED state test", func() { }) }) + Context("Init container", func() { + It("getInitContainerConfig", func() { + cr := &v1alpha1.NicClusterPolicy{ + Spec: v1alpha1.NicClusterPolicySpec{ + OFEDDriver: &v1alpha1.OFEDDriverSpec{ + OfedUpgradePolicy: &v1alpha1.DriverUpgradePolicySpec{ + AutoUpgrade: true, + SafeLoad: true, + }, + }, + }, + } + cfg := stateOfed.getInitContainerConfig(cr, testLogger, "repository/image:version") // image set, AutoUpgrade and SafeLoad both true + Expect(cfg.SafeLoadAnnotation).NotTo(BeEmpty()) + Expect(cfg.SafeLoadEnable).To(BeTrue()) + Expect(cfg.InitContainerEnable).To(BeTrue()) + Expect(cfg.InitContainerImageName).To(Equal("repository/image:version")) + }) + It("getInitContainerConfig - no image", func() { + cr := &v1alpha1.NicClusterPolicy{ + Spec: v1alpha1.NicClusterPolicySpec{ + OFEDDriver: &v1alpha1.OFEDDriverSpec{ + OfedUpgradePolicy: &v1alpha1.DriverUpgradePolicySpec{ + AutoUpgrade: true, + SafeLoad:
true, + }, + }, + }, + } + cfg := stateOfed.getInitContainerConfig(cr, testLogger, "") // empty image name + Expect(cfg.SafeLoadEnable).To(BeFalse()) // zero-value config is returned when no image is given + Expect(cfg.InitContainerEnable).To(BeFalse()) + }) + It("getInitContainerConfig - SafeLoad disabled if AutoUpgrade is false ", func() { + cr := &v1alpha1.NicClusterPolicy{ + Spec: v1alpha1.NicClusterPolicySpec{ + OFEDDriver: &v1alpha1.OFEDDriverSpec{ + OfedUpgradePolicy: &v1alpha1.DriverUpgradePolicySpec{ + AutoUpgrade: false, // SafeLoad alone must not enable safe load + SafeLoad: true, + }, + }, + }, + } + cfg := stateOfed.getInitContainerConfig(cr, testLogger, "repository/image:version") + Expect(cfg.SafeLoadEnable).To(BeFalse()) // the image is set, so the init container itself stays enabled + Expect(cfg.InitContainerEnable).To(BeTrue()) + }) + }) Context("Proxy config", func() { It("Set Proxy from Cluster Wide Proxy", func() { cr := &v1alpha1.NicClusterPolicy{