Skip to content

Commit

Permalink
feat: add support for safe driver loading feature (#600)
Browse files Browse the repository at this point in the history
On Node startup, the OFED container takes some time to compile and load
the driver.
During that time, workloads might get scheduled on that Node.
When OFED is loaded, all existing PODs that use NVIDIA NICs will lose
their network interfaces.
Some such PODs might silently fail or hang.
To avoid such a situation, before the OFED container is loaded, 
the Node should get Cordoned and Drained to ensure all workloads are
rescheduled.
The Node should be un-cordoned when the driver is ready on it.

The safe driver loading feature is implemented as a part of the upgrade
flow,
meaning safe driver loading is a special scenario of the upgrade
procedure,
where we upgrade from the inbox driver to the containerized OFED.

depends on: #685
  • Loading branch information
e0ne authored Dec 5, 2023
2 parents c40b096 + b2398da commit f64281f
Show file tree
Hide file tree
Showing 26 changed files with 433 additions and 58 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ release-build:
cd hack && $(GO) run release.go --templateDir ./templates/samples/ --outputDir ../config/samples/
cd hack && $(GO) run release.go --templateDir ./templates/crs/ --outputDir ../example/crs
cd hack && $(GO) run release.go --templateDir ./templates/values/ --outputDir ../deployment/network-operator/
cd hack && $(GO) run release.go --templateDir ./templates/config/manager --outputDir ../config/manager/

# go-install-tool will 'go install' any package $2 and install it to $1.
define go-install-tool
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/nicclusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type DriverUpgradePolicySpec struct {
MaxParallelUpgrades int `json:"maxParallelUpgrades,omitempty"`
WaitForCompletion *WaitForCompletionSpec `json:"waitForCompletion,omitempty"`
DrainSpec *DrainSpec `json:"drain,omitempty"`
// SafeLoad turn on safe driver loading (cordon and drain the node before loading the driver)
// +optional
// +kubebuilder:default:=false
SafeLoad bool `json:"safeLoad,omitempty"`
}

// WaitForCompletionSpec describes the configuration for waiting on job completions
Expand Down
27 changes: 25 additions & 2 deletions api/v1alpha1/nicclusterpolicy_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ func (w *NicClusterPolicy) ValidateDelete() (admission.Warnings, error) {
/*
We are validating here NicClusterPolicy:
1. IBKubernetes.pKeyGUIDPoolRangeStart and IBKubernetes.pKeyGUIDPoolRangeEnd must be valid GUID and valid range.
2. OFEDDriver.version must be a valid ofed version.
2. OFEDDriver driver configuration
2.1 version must be a valid ofed version.
2.2 safeLoad feature can be enabled only when autoUpgrade is enabled
3. RdmaSharedDevicePlugin.Config.
3.1. Configuration is a valid JSON and check its schema.
3.2. resourceName is valid for k8s.
Expand All @@ -124,7 +126,10 @@ func (w *NicClusterPolicy) validateNicClusterPolicy() error {
// Validate OFEDDriverSpec
ofedDriver := w.Spec.OFEDDriver
if ofedDriver != nil {
allErrs = append(allErrs, ofedDriver.validateVersion(field.NewPath("spec").Child("ofedDriver"))...)
ofedDriverFieldPath := field.NewPath("spec").Child("ofedDriver")
allErrs = append(append(allErrs,
ofedDriver.validateVersion(ofedDriverFieldPath)...),
ofedDriver.validateSafeLoad(ofedDriverFieldPath)...)
}
// Validate RdmaSharedDevicePlugin
rdmaSharedDevicePlugin := w.Spec.RdmaSharedDevicePlugin
Expand Down Expand Up @@ -364,6 +369,24 @@ func (ofedSpec *OFEDDriverSpec) validateVersion(fldPath *field.Path) field.Error
return allErrs
}

func (ofedSpec *OFEDDriverSpec) validateSafeLoad(fldPath *field.Path) field.ErrorList {
upgradePolicy := ofedSpec.OfedUpgradePolicy
if upgradePolicy == nil {
return nil
}
if !upgradePolicy.SafeLoad {
return nil
}
allErrs := field.ErrorList{}
upgradePolicyFieldPath := fldPath.Child("upgradePolicy")
if !upgradePolicy.AutoUpgrade {
allErrs = append(allErrs, field.Forbidden(upgradePolicyFieldPath.Child("safeLoad"),
fmt.Sprintf("safeLoad requires %s to be true",
upgradePolicyFieldPath.Child("autoUpgrade").String())))
}
return allErrs
}

func (w *NicClusterPolicy) validateRepositories(allErrs field.ErrorList) field.ErrorList {
fp := field.NewPath("spec")
if w.Spec.OFEDDriver != nil {
Expand Down
41 changes: 41 additions & 0 deletions api/v1alpha1/nicclusterpolicy_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,47 @@ var _ = Describe("Validate", func() {
_, err := nicClusterPolicy.ValidateCreate()
Expect(err.Error()).To(ContainSubstring("invalid OFED version"))
})
It("MOFED SafeLoad requires AutoUpgrade to be enabled", func() {
nicClusterPolicy := NicClusterPolicy{
ObjectMeta: metav1.ObjectMeta{Name: "test"},
Spec: NicClusterPolicySpec{
OFEDDriver: &OFEDDriverSpec{
ImageSpec: ImageSpec{
Image: "mofed",
Repository: "ghcr.io/mellanox",
Version: "23.10-0.2.2.0",
ImagePullSecrets: []string{},
},
OfedUpgradePolicy: &DriverUpgradePolicySpec{
SafeLoad: true,
},
},
},
}
_, err := nicClusterPolicy.ValidateCreate()
Expect(err.Error()).To(ContainSubstring("autoUpgrade"))
})
It("MOFED valid SafeLoad config", func() {
nicClusterPolicy := NicClusterPolicy{
ObjectMeta: metav1.ObjectMeta{Name: "test"},
Spec: NicClusterPolicySpec{
OFEDDriver: &OFEDDriverSpec{
ImageSpec: ImageSpec{
Image: "mofed",
Repository: "ghcr.io/mellanox",
Version: "23.10-0.2.2.0",
ImagePullSecrets: []string{},
},
OfedUpgradePolicy: &DriverUpgradePolicySpec{
SafeLoad: true,
AutoUpgrade: true,
},
},
},
}
_, err := nicClusterPolicy.ValidateCreate()
Expect(err).To(BeNil())
})
It("Valid RDMA config JSON", func() {
rdmaConfig := `{
"configList": [{
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/mellanox.com_nicclusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,11 @@ spec:
will be upgraded in parallel
minimum: 0
type: integer
safeLoad:
default: false
description: SafeLoad turn on safe driver loading (cordon
and drain the node before loading the driver)
type: boolean
waitForCompletion:
description: WaitForCompletionSpec describes the configuration
for waiting on job completions
Expand Down
5 changes: 5 additions & 0 deletions config/manager/init_container_image_name_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- op: add
path: "/spec/template/spec/containers/0/env/-"
value:
name: OFED_INIT_CONTAINER_IMAGE
value: "ghcr.io/mellanox/network-operator-init-container:v0.0.2"
9 changes: 9 additions & 0 deletions config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ commonLabels:
generatorOptions:
disableNameSuffixHash: true

patches:
- path: init_container_image_name_patch.yaml
target:
group: apps
kind: Deployment
name: controller-manager
namespace: system
version: v1

kind: Kustomization
images:
- name: controller
Expand Down
52 changes: 33 additions & 19 deletions deployment/network-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ $ helm install --set nfd.enabled=false -n network-operator --create-namespace --
> __Note:__ The labels which Network Operator depends on may change between releases.
> __Note:__ By default the operator is deployed without an instance of `NicClusterPolicy` and `MacvlanNetwork`
custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator.
> custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator.
#### Deploy development version of Network Operator

Expand Down Expand Up @@ -398,23 +398,37 @@ imagePullSecrets:

#### Mellanox OFED driver

| Name | Type | Default | Description |
| ---- | ---- | ------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container |
| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository |
| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name |
| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version |
| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image |
| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image |
| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name |
| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name |
| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds|
| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay |
| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval |
| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay |
| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval |
| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay |
| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval |
| Name | Type | Default | Description |
|-------------------------------------------------------------|--------|-----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container |
| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository |
| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name |
| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version |
| `ofedDriver.initContainer.enable` | bool | `true` | deploy init container |
| `ofedDriver.initContainer.repository` | string | `ghcr.io/mellanox` | init container image repository |
| `ofedDriver.initContainer.image` | string | `network-operator-init-container` | init container image name |
| `ofedDriver.initContainer.version` | string | `v0.0.2` | init container image version |
| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image |
| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image |
| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name |
| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name |
| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds |
| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay |
| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval |
| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay |
| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval |
| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay |
| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval |
| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `false` | global switch for automatic upgrade feature |
| `ofedDriver.upgradePolicy.maxParallelUpgrades` | int | 1 | how many nodes can be upgraded in parallel, 0 means no limit, all nodes will be upgraded in parallel |
| `ofedDriver.upgradePolicy.safeLoad` | bool | `false` | cordon and drain (if enabled) a node before loading the driver on it, requires `ofedDriver.initContainer` to be enabled and `ofedDriver.upgradePolicy.autoUpgrade` to be true |
| `ofedDriver.upgradePolicy.drain.enable` | bool | `true` | drain a node before the driver restart |
| `ofedDriver.upgradePolicy.drain.force` | bool | `false` | use force drain (check `kubectl drain` doc for details) |
| `ofedDriver.upgradePolicy.drain.podSelector` | string | "" | drain only pods matching this selector |
| `ofedDriver.upgradePolicy.drain.timeoutSeconds` | int | 300 | timeout for drain operation |
| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `false` | continue even if there are pods using emptyDir |
| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade |
| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite |

#### RDMA Device Plugin

Expand Down Expand Up @@ -592,7 +606,7 @@ optionally deployed components:
| `nvIpam.enableWebhook` | bool | `false` | Enable deployment of the validataion webhook for IPPool CRD |

> __Note__: Supported X.509 certificate management system should be available in the cluster to enable the validation webhook.
Currently supported systems are [certmanager](https://cert-manager.io/) and
> Currently supported systems are [certmanager](https://cert-manager.io/) and
[Openshift certificate management](https://docs.openshift.com/container-platform/4.13/security/certificates/service-serving-certificate.html)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,11 @@ spec:
will be upgraded in parallel
minimum: 0
type: integer
safeLoad:
default: false
description: SafeLoad turn on safe driver loading (cordon
and drain the node before loading the driver)
type: boolean
waitForCompletion:
description: WaitForCompletionSpec describes the configuration
for waiting on job completions
Expand Down
Loading

0 comments on commit f64281f

Please sign in to comment.