From eaa0b6ac60b30c99d2c5e090efda876661c2233f Mon Sep 17 00:00:00 2001 From: Fred Rolland Date: Wed, 22 Nov 2023 18:12:15 +0200 Subject: [PATCH] feat: Support OFED with DTK In Openshift, in order to OFED containter to be able to download and compile the needed Kernel files, it is required to install a cluster-wide entitlement. This requirement is not user friendly. In order to avoid this, a container image with the needed files is available in Openshift distributions. This image is called DriverToolKit aka DTK. By using this container as a side-car to MOFED container, the modules can be compiled without entitlement. Changes: API: - NicClusterPolicy CRD: add bool 'useOcpDriverToolkit' under 'ofedDriver'. Default is 'true' - Helm : add support to 'useOcpDriverToolkit' OFED state: - In case of OCP and 'useOcpDriverToolkit' is true, find DTK image based on NFD label of node. - If available, add to MOFED DS a DTK container, change entrypoint logic. Signed-off-by: Fred Rolland --- api/v1alpha1/nicclusterpolicy_types.go | 3 + .../mellanox.com_nicclusterpolicies.yaml | 6 + config/rbac/role.yaml | 8 ++ controllers/nicclusterpolicy_controller.go | 1 + deployment/network-operator/README.md | 1 + .../crds/mellanox.com_nicclusterpolicies.yaml | 6 + ...anox.com_v1alpha1_nicclusterpolicy_cr.yaml | 1 + .../network-operator/templates/role.yaml | 8 ++ deployment/network-operator/values.yaml | 1 + hack/templates/values/values.template | 1 + main.go | 2 + .../0050_ofed-driver-ds.yaml | 56 +++++++++ pkg/nodeinfo/attributes.go | 4 + pkg/state/state_ofed.go | 50 +++++++- pkg/state/state_ofed_test.go | 107 +++++++++++++++++- 15 files changed, 252 insertions(+), 3 deletions(-) diff --git a/api/v1alpha1/nicclusterpolicy_types.go b/api/v1alpha1/nicclusterpolicy_types.go index e378e608f..5497815a1 100644 --- a/api/v1alpha1/nicclusterpolicy_types.go +++ b/api/v1alpha1/nicclusterpolicy_types.go @@ -91,6 +91,9 @@ type OFEDDriverSpec struct { // +kubebuilder:default:=300 // +kubebuilder:validation:Minimum:=0 TerminationGracePeriodSeconds int64 `json:"terminationGracePeriodSeconds,omitempty"` + // UseOCPDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules + // +kubebuilder:default:=true + UseOCPDriverToolkit bool `json:"useOcpDriverToolkit"` } // DriverUpgradePolicySpec describes policy configuration for automatic upgrades diff --git a/config/crd/bases/mellanox.com_nicclusterpolicies.yaml b/config/crd/bases/mellanox.com_nicclusterpolicies.yaml index c8d659dbc..0a77c5289 100644 --- a/config/crd/bases/mellanox.com_nicclusterpolicies.yaml +++ b/config/crd/bases/mellanox.com_nicclusterpolicies.yaml @@ -576,12 +576,18 @@ spec: type: integer type: object type: object + useOcpDriverToolkit: + default: true + description: UseOCPDriverToolkit indicates if DriverToolkit image + should be used on OpenShift to build and install driver modules + type: boolean version: pattern: '[a-zA-Z0-9\.-]+' type: string required: - image - repository + - useOcpDriverToolkit - version type: object rdmaSharedDevicePlugin: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 18f3447f9..adf73e10d 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -181,6 +181,14 @@ rules: - create - patch - update +- apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch - apiGroups: - k8s.cni.cncf.io resources: diff --git a/controllers/nicclusterpolicy_controller.go b/controllers/nicclusterpolicy_controller.go index dbad39fee..32bfe08d2 100644 --- a/controllers/nicclusterpolicy_controller.go +++ b/controllers/nicclusterpolicy_controller.go @@ -82,6 +82,7 @@ type NicClusterPolicyReconciler struct { // +kubebuilder:rbac:groups=nv-ipam.nvidia.com,resources=ippools/status,verbs=get;update;patch; // +kubebuilder:rbac:groups=cert-manager.io,resources=issuers;certificates,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingwebhookconfigurations,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=image.openshift.io,resources=imagestreams,verbs=get;list;watch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. diff --git a/deployment/network-operator/README.md b/deployment/network-operator/README.md index 3c0afbdda..73e6a6838 100644 --- a/deployment/network-operator/README.md +++ b/deployment/network-operator/README.md @@ -428,6 +428,7 @@ imagePullSecrets: | `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `true` | continue even if there are pods using emptyDir | | `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade | | `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite | +| `ofedDriver.useOcpDriverToolkit` | bool | `true` | In OpenShift, use Driver Toolkit image to compile OFED drivers | #### RDMA Device Plugin diff --git a/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml b/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml index c8d659dbc..0a77c5289 100644 --- a/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml +++ b/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml @@ -576,12 +576,18 @@ spec: type: integer type: object type: object + useOcpDriverToolkit: + default: true + description: UseOCPDriverToolkit indicates if DriverToolkit image + should be used on OpenShift to build and install driver modules + type: boolean version: pattern: '[a-zA-Z0-9\.-]+' type: string required: - image - repository + - useOcpDriverToolkit - version type: object rdmaSharedDevicePlugin: diff --git a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml index 5cda6a02e..4e51a12db 100644 --- a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml +++ b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml @@ -32,6 +32,7 @@ spec: image: {{ .Values.ofedDriver.image }} repository: {{ .Values.ofedDriver.repository }} version: {{ .Values.ofedDriver.version }} + useOcpDriverToolkit: {{ .Values.ofedDriver.useOcpDriverToolkit }} {{- if .Values.ofedDriver.env }} env: {{ toYaml .Values.ofedDriver.env | nindent 6 }} diff --git a/deployment/network-operator/templates/role.yaml b/deployment/network-operator/templates/role.yaml index 0352e5c68..b46053b00 100644 --- a/deployment/network-operator/templates/role.yaml +++ b/deployment/network-operator/templates/role.yaml @@ -195,6 +195,14 @@ rules: - create - patch - update +- apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch - apiGroups: - k8s.cni.cncf.io resources: diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index 49cb7aadd..5c0fb68c5 100644 --- a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -217,6 +217,7 @@ ofedDriver: # podSelector: "app=myapp" # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite # timeoutSeconds: 300 + useOcpDriverToolkit: true rdmaSharedDevicePlugin: deploy: true diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index eb07d8c92..e823b2b4e 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -217,6 +217,7 @@ ofedDriver: # podSelector: "app=myapp" # specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite # timeoutSeconds: 300 + useOcpDriverToolkit: true rdmaSharedDevicePlugin: deploy: true diff --git a/main.go b/main.go index 66e42cdbb..715cc9c7e 100644 --- a/main.go +++ b/main.go @@ -24,6 +24,7 @@ import ( "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" netattdefv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1" osconfigv1 "github.com/openshift/api/config/v1" + imagev1 "github.com/openshift/api/image/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -57,6 +58,7 @@ func init() { utilruntime.Must(mellanoxcomv1alpha1.AddToScheme(scheme)) utilruntime.Must(netattdefv1.AddToScheme(scheme)) utilruntime.Must(osconfigv1.AddToScheme(scheme)) + utilruntime.Must(imagev1.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } diff --git a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml index bfa46405b..fa5f19ba8 100644 --- a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml +++ b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml @@ -80,6 +80,10 @@ spec: - image: {{ .RuntimeSpec.MOFEDImageName }} imagePullPolicy: IfNotPresent name: mofed-container + {{- if .RuntimeSpec.UseDtk }} + command: ["ocp_dtk_entrypoint"] + args: ["nv-fs-ctr-run-with-dtk"] + {{- end }} securityContext: privileged: true seLinuxOptions: @@ -112,6 +116,10 @@ spec: readOnly: {{ .ReadOnly }} {{- end }} {{- end }} + {{- if .RuntimeSpec.UseDtk }} + - name: shared-nvidia-driver-toolkit + mountPath: /mnt/shared-nvidia-driver-toolkit + {{- end}} startupProbe: exec: command: @@ -135,6 +143,47 @@ spec: initialDelaySeconds: {{ .CrSpec.ReadinessProbe.InitialDelaySeconds }} failureThreshold: 1 periodSeconds: {{ .CrSpec.ReadinessProbe.PeriodSeconds }} + {{- if .RuntimeSpec.UseDtk }} + - image: {{ .RuntimeSpec.DtkImageName }} + imagePullPolicy: IfNotPresent + name: openshift-driver-toolkit-ctr + command: [bash, -xc] + args: ["until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ]; do echo Waiting for nvidia-driver-ctr container to prepare the shared directory ...; sleep 10; done; exec /mnt/shared-nvidia-driver-toolkit/ocp_dtk_entrypoint dtk-build-driver"] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + env: + {{- if .CrSpec.Env }} + {{- range .CrSpec.Env }} + {{ . | yaml | nindentPrefix 14 "- " }} + {{- end }} + {{- end }} + volumeMounts: + - name: run-mlnx-ofed + mountPath: /run/mellanox/drivers + mountPropagation: Bidirectional + - name: etc-network + mountPath: /etc/network + - name: host-etc + mountPath: /host/etc + - name: host-usr + mountPath: /host/usr + - name: host-udev + mountPath: /host/lib/udev + - name: host-lib-modules + mountPath: /host/lib/modules + {{- if.AdditionalVolumeMounts.VolumeMounts }} + {{- range .AdditionalVolumeMounts.VolumeMounts }} + - name: {{ .Name }} + mountPath: {{ .MountPath }} + subPath: {{ .SubPath }} + readOnly: {{ .ReadOnly }} + {{- end }} + {{- end }} + - name: shared-nvidia-driver-toolkit + mountPath: /mnt/shared-nvidia-driver-toolkit + {{- end }} # unloading OFED modules can take more time than default terminationGracePeriod (30 sec) terminationGracePeriodSeconds: {{ .CrSpec.TerminationGracePeriodSeconds }} volumes: @@ -165,10 +214,17 @@ spec: {{ . | yaml | nindentPrefix 14 "- " }} {{- end }} {{- end }} + {{- if .RuntimeSpec.UseDtk }} + - name: shared-nvidia-driver-toolkit + emptyDir: {} + {{- end }} nodeSelector: feature.node.kubernetes.io/pci-15b3.present: "true" feature.node.kubernetes.io/system-os_release.ID: {{ .RuntimeSpec.OSName }} feature.node.kubernetes.io/system-os_release.VERSION_ID: "{{ .RuntimeSpec.OSVer }}" + {{- if .RuntimeSpec.UseDtk }} + feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: "{{ .RuntimeSpec.RhcosVersion }}" + {{- end }} {{- if .NodeAffinity }} affinity: nodeAffinity: diff --git a/pkg/nodeinfo/attributes.go b/pkg/nodeinfo/attributes.go index 12890f256..bcb0069c7 100644 --- a/pkg/nodeinfo/attributes.go +++ b/pkg/nodeinfo/attributes.go @@ -37,6 +37,7 @@ const ( NodeLabelNvGPU = "nvidia.com/gpu.present" NodeLabelWaitOFED = "network.nvidia.com/operator.mofed.wait" NodeLabelCudaVersionMajor = "nvidia.com/cuda.driver.major" + NodeLabelOSTreeVersion = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" ) type AttributeType int @@ -50,6 +51,7 @@ const ( AttrTypeOSVer // optional attrs AttrTypeCudaVersionMajor + AttrTypeOSTreeVersion OptionalAttrsStart = AttrTypeCudaVersionMajor ) @@ -65,6 +67,8 @@ var attrToLabel = []string{ NodeLabelOSVer, // AttrTypeCudaVersionMajor NodeLabelCudaVersionMajor, + // AttrTypeOSTreeVersion + NodeLabelOSTreeVersion, } // NodeAttributes provides attributes of a specific node diff --git a/pkg/state/state_ofed.go b/pkg/state/state_ofed.go index 4a2dd440c..ced927ac5 100644 --- a/pkg/state/state_ofed.go +++ b/pkg/state/state_ofed.go @@ -28,6 +28,7 @@ import ( "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" "github.com/go-logr/logr" osconfigv1 "github.com/openshift/api/config/v1" + apiimagev1 "github.com/openshift/api/image/v1" "github.com/pkg/errors" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" @@ -153,7 +154,10 @@ type ofedRuntimeSpec struct { MOFEDImageName string InitContainerConfig initContainerConfig // is true if cluster type is Openshift - IsOpenshift bool + IsOpenshift bool + UseDtk bool + DtkImageName string + RhcosVersion string } type ofedManifestRenderData struct { @@ -397,6 +401,26 @@ func (s *stateOFED) getManifestObjects( } nodeAttr := attrs[0].Attributes + useDtk := clusterInfo.IsOpenshift() && cr.Spec.OFEDDriver.UseOCPDriverToolkit + var dtkImageName string + var rhcosVersion string + if useDtk { + if err := s.checkAttributesExist(attrs[0], + nodeinfo.AttrTypeOSTreeVersion); err != nil { + return nil, err + } + rhcosVersion = nodeAttr[nodeinfo.AttrTypeOSTreeVersion] + rhcosDriverToolkitImages, err := s.ocpDriverToolkitImages(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get OpenShift DTK ImageStream : %v", err) + } + image, ok := rhcosDriverToolkitImages[rhcosVersion] + if !ok { + return nil, fmt.Errorf("failed to find DTK image for RHCOS version: %v", rhcosVersion) + } + dtkImageName = image + } + setProbesDefaults(cr) // Update MOFED Env variables with defaults for the cluster @@ -426,7 +450,10 @@ func (s *stateOFED) getManifestObjects( MOFEDImageName: s.getMofedDriverImageName(cr, nodeAttr, reqLogger), InitContainerConfig: s.getInitContainerConfig(cr, reqLogger, config.FromEnv().State.OFEDState.InitContainerImage), - IsOpenshift: clusterInfo.IsOpenshift(), + IsOpenshift: clusterInfo.IsOpenshift(), + UseDtk: useDtk, + DtkImageName: dtkImageName, + RhcosVersion: rhcosVersion, }, Tolerations: cr.Spec.Tolerations, NodeAffinity: cr.Spec.NodeAffinity, @@ -721,3 +748,22 @@ func (s *stateOFED) handleRepoConfig( } return nil } + +func (s *stateOFED) ocpDriverToolkitImages(ctx context.Context) (map[string]string, error) { + reqLogger := log.FromContext(ctx) + dtkImageStream := &apiimagev1.ImageStream{} + name := "driver-toolkit" + namespace := "openshift" + err := s.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, dtkImageStream) + if err != nil { + reqLogger.Error(err, "Couldn't get the driver-toolkit imagestream") + return nil, err + } + rhcosDriverToolkitImages := make(map[string]string) + reqLogger.Info("ocpDriverToolkitImages: driver-toolkit imagestream found") + for _, tag := range dtkImageStream.Spec.Tags { + rhcosDriverToolkitImages[tag.Name] = tag.From.Name + } + + return rhcosDriverToolkitImages, nil +} diff --git a/pkg/state/state_ofed_test.go b/pkg/state/state_ofed_test.go index 513ddb3be..a593a6de2 100644 --- a/pkg/state/state_ofed_test.go +++ b/pkg/state/state_ofed_test.go @@ -26,11 +26,14 @@ import ( . "github.com/onsi/gomega" osconfigv1 "github.com/openshift/api/config/v1" + apiimagev1 "github.com/openshift/api/image/v1" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/Mellanox/network-operator/api/v1alpha1" + "github.com/Mellanox/network-operator/pkg/clustertype" "github.com/Mellanox/network-operator/pkg/nodeinfo" "github.com/Mellanox/network-operator/pkg/render" "github.com/Mellanox/network-operator/pkg/testing/mocks" @@ -45,8 +48,27 @@ const ( testNicPolicyNoProxy = "no-proxy-policy" osName = "ubuntu" osVer = "22.04" + kernelMajor = "5" + kernelMinorPool1 = "15" + kernelMinorPool2 = "20" + rhcosOsTree = "414.92.202311061957-0" ) +type openShiftClusterProvider struct { +} + +func (d *openShiftClusterProvider) GetClusterType() clustertype.Type { + return clustertype.Kubernetes +} + +func (d *openShiftClusterProvider) IsKubernetes() bool { + return false +} + +func (d *openShiftClusterProvider) IsOpenshift() bool { + return true +} + var _ = Describe("MOFED state test", func() { var stateOfed stateOFED var ctx context.Context @@ -253,7 +275,7 @@ var _ = Describe("MOFED state test", func() { }) objs, err := ofedState.getManifestObjects(ctx, cr, infoProvider, &dummyProvider{}) Expect(err).NotTo(HaveOccurred()) - // Expect 4 objects: DaemonSet, Service Account, Role, RoleBinding + // Expect 4 objects: DaemonSet, Service Account, ClusterRole, ClusterRoleBinding Expect(len(objs)).To(Equal(4)) By("Verify DaemonSet") for _, obj := range objs { @@ -269,6 +291,89 @@ var _ = Describe("MOFED state test", func() { } }) }) + Context("Render Manifests DTK", func() { + It("Should Render DaemonSet with DTK", func() { + dtkImageName := "quay.io/openshift-release-dev/ocp-v4.0-art-dev:414" + dtkImageStream := &apiimagev1.ImageStream{ + TypeMeta: metav1.TypeMeta{ + Kind: "ImageStream", + APIVersion: "image/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "driver-toolkit", + Namespace: "openshift", + }, + Spec: apiimagev1.ImageStreamSpec{ + Tags: []apiimagev1.TagReference{ + { + Name: rhcosOsTree, + From: &v1.ObjectReference{ + Kind: "DockerImage", + Name: dtkImageName, + }, + }, + }, + }, + } + scheme := runtime.NewScheme() + Expect(apiimagev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(dtkImageStream).Build() + manifestBaseDir := "../../manifests/state-ofed-driver" + + files, err := utils.GetFilesWithSuffix(manifestBaseDir, render.ManifestFileSuffix...) + Expect(err).NotTo(HaveOccurred()) + renderer := render.NewRenderer(files) + + ofedState := stateOFED{ + stateSkel: stateSkel{ + name: stateOFEDName, + description: stateOFEDDescription, + client: client, + scheme: scheme, + renderer: renderer, + }, + } + cr := &v1alpha1.NicClusterPolicy{} + cr.Name = "nic-cluster-policy" + cr.Spec.OFEDDriver = &v1alpha1.OFEDDriverSpec{ + UseOCPDriverToolkit: true, + ImageSpec: v1alpha1.ImageSpec{ + Image: "mofed", + Repository: "nvcr.io/mellanox", + Version: "23.10-0.5.5.0", + }, + } + By("Creating NodeProvider with 1 Node with RHCOS OS TREE label") + node := getNode("node1") + node.Labels[nodeinfo.NodeLabelOSTreeVersion] = rhcosOsTree + infoProvider := nodeinfo.NewProvider([]*v1.Node{ + node, + }) + objs, err := ofedState.getManifestObjects(ctx, cr, infoProvider, &openShiftClusterProvider{}) + Expect(err).NotTo(HaveOccurred()) + // Expect 6 object due to OpenShift: DaemonSet, Service Account, ClusterRole, ClusterRoleBinding + // Role, RoleBinding + Expect(len(objs)).To(Equal(6)) + By("Verify DaemonSet with DTK") + for _, obj := range objs { + if obj.GetKind() != "DaemonSet" { + continue + } + ds := appsv1.DaemonSet{} + err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &ds) + Expect(err).NotTo(HaveOccurred()) + By("Verify DaemonSet NodeSelector") + verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector) + rhcosVer, ok := ds.Spec.Template.Spec.NodeSelector["feature.node.kubernetes.io/system-os_release.OSTREE_VERSION"] + Expect(ok).To(BeTrue()) + Expect(rhcosVer).To(Equal(rhcosOsTree)) + By("Verify DTK container image") + Expect(len(ds.Spec.Template.Spec.Containers)).To(Equal(2)) + dtkContainer := ds.Spec.Template.Spec.Containers[1] + Expect(dtkContainer.Image).To(Equal(dtkImageName)) + } + }) + }) }) func verifyPodAntiInfinity(affinity *v1.Affinity) {