Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for safe driver loading feature #600

Merged
merged 3 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ release-build:
cd hack && $(GO) run release.go --templateDir ./templates/samples/ --outputDir ../config/samples/
cd hack && $(GO) run release.go --templateDir ./templates/crs/ --outputDir ../example/crs
cd hack && $(GO) run release.go --templateDir ./templates/values/ --outputDir ../deployment/network-operator/
cd hack && $(GO) run release.go --templateDir ./templates/config/manager --outputDir ../config/manager/

# go-install-tool will 'go install' any package $2 and install it to $1.
define go-install-tool
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/nicclusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type DriverUpgradePolicySpec struct {
MaxParallelUpgrades int `json:"maxParallelUpgrades,omitempty"`
WaitForCompletion *WaitForCompletionSpec `json:"waitForCompletion,omitempty"`
DrainSpec *DrainSpec `json:"drain,omitempty"`
// SafeLoad turn on safe driver loading (cordon and drain the node before loading the driver)
// +optional
// +kubebuilder:default:=false
SafeLoad bool `json:"safeLoad,omitempty"`
}

// WaitForCompletionSpec describes the configuration for waiting on job completions
Expand Down
27 changes: 25 additions & 2 deletions api/v1alpha1/nicclusterpolicy_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ func (w *NicClusterPolicy) ValidateDelete() (admission.Warnings, error) {
/*
We are validating here NicClusterPolicy:
1. IBKubernetes.pKeyGUIDPoolRangeStart and IBKubernetes.pKeyGUIDPoolRangeEnd must be valid GUID and valid range.
2. OFEDDriver.version must be a valid ofed version.
2. OFEDDriver driver configuration
2.1 version must be a valid ofed version.
2.2 safeLoad feature can be enabled only when autoUpgrade is enabled
3. RdmaSharedDevicePlugin.Config.
3.1. Configuration is a valid JSON and check its schema.
3.2. resourceName is valid for k8s.
Expand All @@ -124,7 +126,10 @@ func (w *NicClusterPolicy) validateNicClusterPolicy() error {
// Validate OFEDDriverSpec
ofedDriver := w.Spec.OFEDDriver
if ofedDriver != nil {
allErrs = append(allErrs, ofedDriver.validateVersion(field.NewPath("spec").Child("ofedDriver"))...)
ofedDriverFieldPath := field.NewPath("spec").Child("ofedDriver")
allErrs = append(append(allErrs,
ofedDriver.validateVersion(ofedDriverFieldPath)...),
ofedDriver.validateSafeLoad(ofedDriverFieldPath)...)
}
// Validate RdmaSharedDevicePlugin
rdmaSharedDevicePlugin := w.Spec.RdmaSharedDevicePlugin
Expand Down Expand Up @@ -364,6 +369,24 @@ func (ofedSpec *OFEDDriverSpec) validateVersion(fldPath *field.Path) field.Error
return allErrs
}

func (ofedSpec *OFEDDriverSpec) validateSafeLoad(fldPath *field.Path) field.ErrorList {
upgradePolicy := ofedSpec.OfedUpgradePolicy
if upgradePolicy == nil {
return nil
}
if !upgradePolicy.SafeLoad {
return nil
}
allErrs := field.ErrorList{}
upgradePolicyFieldPath := fldPath.Child("upgradePolicy")
if !upgradePolicy.AutoUpgrade {
allErrs = append(allErrs, field.Forbidden(upgradePolicyFieldPath.Child("safeLoad"),
fmt.Sprintf("safeLoad requires %s to be true",
upgradePolicyFieldPath.Child("autoUpgrade").String())))
}
return allErrs
}

func (w *NicClusterPolicy) validateRepositories(allErrs field.ErrorList) field.ErrorList {
fp := field.NewPath("spec")
if w.Spec.OFEDDriver != nil {
Expand Down
41 changes: 41 additions & 0 deletions api/v1alpha1/nicclusterpolicy_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,47 @@ var _ = Describe("Validate", func() {
_, err := nicClusterPolicy.ValidateCreate()
Expect(err.Error()).To(ContainSubstring("invalid OFED version"))
})
It("MOFED SafeLoad requires AutoUpgrade to be enabled", func() {
nicClusterPolicy := NicClusterPolicy{
ObjectMeta: metav1.ObjectMeta{Name: "test"},
Spec: NicClusterPolicySpec{
OFEDDriver: &OFEDDriverSpec{
ImageSpec: ImageSpec{
Image: "mofed",
Repository: "ghcr.io/mellanox",
Version: "23.10-0.2.2.0",
ImagePullSecrets: []string{},
},
OfedUpgradePolicy: &DriverUpgradePolicySpec{
SafeLoad: true,
},
},
},
}
_, err := nicClusterPolicy.ValidateCreate()
Expect(err.Error()).To(ContainSubstring("autoUpgrade"))
})
It("MOFED valid SafeLoad config", func() {
nicClusterPolicy := NicClusterPolicy{
ObjectMeta: metav1.ObjectMeta{Name: "test"},
Spec: NicClusterPolicySpec{
OFEDDriver: &OFEDDriverSpec{
ImageSpec: ImageSpec{
Image: "mofed",
Repository: "ghcr.io/mellanox",
Version: "23.10-0.2.2.0",
ImagePullSecrets: []string{},
},
OfedUpgradePolicy: &DriverUpgradePolicySpec{
SafeLoad: true,
AutoUpgrade: true,
},
},
},
}
_, err := nicClusterPolicy.ValidateCreate()
Expect(err).To(BeNil())
})
It("Valid RDMA config JSON", func() {
rdmaConfig := `{
"configList": [{
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/mellanox.com_nicclusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,11 @@ spec:
will be upgraded in parallel
minimum: 0
type: integer
safeLoad:
default: false
description: SafeLoad turn on safe driver loading (cordon
and drain the node before loading the driver)
type: boolean
waitForCompletion:
description: WaitForCompletionSpec describes the configuration
for waiting on job completions
Expand Down
5 changes: 5 additions & 0 deletions config/manager/init_container_image_name_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- op: add
path: "/spec/template/spec/containers/0/env/-"
value:
name: OFED_INIT_CONTAINER_IMAGE
value: "ghcr.io/mellanox/network-operator-init-container:v0.0.2"
9 changes: 9 additions & 0 deletions config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ commonLabels:
generatorOptions:
disableNameSuffixHash: true

patches:
- path: init_container_image_name_patch.yaml
target:
group: apps
kind: Deployment
name: controller-manager
namespace: system
version: v1

kind: Kustomization
images:
- name: controller
Expand Down
52 changes: 33 additions & 19 deletions deployment/network-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ $ helm install --set nfd.enabled=false -n network-operator --create-namespace --
> __Note:__ The labels which Network Operator depends on may change between releases.

> __Note:__ By default the operator is deployed without an instance of `NicClusterPolicy` and `MacvlanNetwork`
custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator.
> custom resources. The user is required to create it later with configuration matching the cluster or use chart parameters to deploy it together with the operator.

#### Deploy development version of Network Operator

Expand Down Expand Up @@ -398,23 +398,37 @@ imagePullSecrets:

#### Mellanox OFED driver

| Name | Type | Default | Description |
| ---- | ---- | ------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container |
| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository |
| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name |
| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version |
| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image |
| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image |
| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name |
| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name |
| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds|
| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay |
| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval |
| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay |
| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval |
| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay |
| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval |
| Name | Type | Default | Description |
|-------------------------------------------------------------|--------|-----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `ofedDriver.deploy` | bool | `false` | deploy Mellanox OFED driver container |
| `ofedDriver.repository` | string | `mellanox` | Mellanox OFED driver image repository |
| `ofedDriver.image` | string | `mofed` | Mellanox OFED driver image name |
| `ofedDriver.version` | string | `5.9-0.5.6.0` | Mellanox OFED driver version |
| `ofedDriver.initContainer.enable` | bool | `true` | deploy init container |
| `ofedDriver.initContainer.repository` | string | `ghcr.io/mellanox` | init container image repository |
| `ofedDriver.initContainer.image` | string | `network-operator-init-container` | init container image name |
| `ofedDriver.initContainer.version` | string | `v0.0.2` | init container image version |
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
| `ofedDriver.imagePullSecrets` | list | `[]` | An optional list of references to secrets to use for pulling any of the Mellanox OFED driver image |
| `ofedDriver.env` | list | `[]` | An optional list of [environment variables](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) passed to the Mellanox OFED driver image |
| `ofedDriver.repoConfig.name` | string | `` | Private mirror repository configuration configMap name |
| `ofedDriver.certConfig.name` | string | `` | Custom TLS key/certificate configuration configMap name |
| `ofedDriver.terminationGracePeriodSeconds` | int | 300 | Mellanox OFED termination grace periods in seconds |
| `ofedDriver.startupProbe.initialDelaySeconds` | int | 10 | Mellanox OFED startup probe initial delay |
| `ofedDriver.startupProbe.periodSeconds` | int | 20 | Mellanox OFED startup probe interval |
| `ofedDriver.livenessProbe.initialDelaySeconds` | int | 30 | Mellanox OFED liveness probe initial delay |
| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval |
| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay |
| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval |
| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `false` | global switch for automatic upgrade feature |
| `ofedDriver.upgradePolicy.maxParallelUpgrades` | int | 1 | how many nodes can be upgraded in parallel, 0 means no limit, all nodes will be upgraded in parallel |
| `ofedDriver.upgradePolicy.safeLoad` | bool | `false` | cordon and drain (if enabled) a node before loading the driver on it, requires `ofedDriver.initContainer` to be enabled and `ofedDriver.upgradePolicy.autoUpgrade` to be true |
| `ofedDriver.upgradePolicy.drain.enable` | bool | `true` | drain a node before the driver restart |
| `ofedDriver.upgradePolicy.drain.force` | bool | `false` | use force drain (check `kubectl drain` doc for details) |
| `ofedDriver.upgradePolicy.drain.podSelector` | string | "" | drain only pods matching this selector |
| `ofedDriver.upgradePolicy.drain.timeoutSeconds` | int | 300 | timeout for drain operation |
| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `false` | continue even if there are pods using emptyDir |
| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade |
| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite |

#### RDMA Device Plugin

Expand Down Expand Up @@ -592,7 +606,7 @@ optionally deployed components:
| `nvIpam.enableWebhook` | bool | `false` | Enable deployment of the validataion webhook for IPPool CRD |

> __Note__: Supported X.509 certificate management system should be available in the cluster to enable the validation webhook.
Currently supported systems are [certmanager](https://cert-manager.io/) and
> Currently supported systems are [certmanager](https://cert-manager.io/) and
[Openshift certificate management](https://docs.openshift.com/container-platform/4.13/security/certificates/service-serving-certificate.html)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,11 @@ spec:
will be upgraded in parallel
minimum: 0
type: integer
safeLoad:
default: false
description: SafeLoad turn on safe driver loading (cordon
and drain the node before loading the driver)
type: boolean
waitForCompletion:
description: WaitForCompletionSpec describes the configuration
for waiting on job completions
Expand Down
Loading
Loading