From 234ba6e88b57c5d4814e98c4eec602f143411cb9 Mon Sep 17 00:00:00 2001 From: killianmuldoon Date: Thu, 18 Jan 2024 14:11:02 +0000 Subject: [PATCH] feat: Add DOCA Telemetry Service Signed-off-by: killianmuldoon --- api/v1alpha1/nicclusterpolicy_types.go | 36 +++- api/v1alpha1/zz_generated.deepcopy.go | 41 ++++ .../mellanox.com_nicclusterpolicies.yaml | 71 +++++++ .../nicclusterpolicy_controller_test.go | 155 +++++++++++--- .../crds/mellanox.com_nicclusterpolicies.yaml | 71 +++++++ .../network-operator/templates/_helpers.tpl | 17 ++ ...anox.com_v1alpha1_nicclusterpolicy_cr.yaml | 10 + deployment/network-operator/values.yaml | 15 ++ hack/release.go | 2 + hack/release.yaml | 4 + hack/templates/values/values.template | 15 ++ .../0010-doca-telemetry-service.yaml | 139 +++++++++++++ pkg/state/continuity_check_test.go | 1 + pkg/state/factory.go | 8 +- pkg/state/state_doca_telemetry_service.go | 196 ++++++++++++++++++ 15 files changed, 739 insertions(+), 42 deletions(-) create mode 100644 manifests/state-doca-telemetry-service/0010-doca-telemetry-service.yaml create mode 100644 pkg/state/state_doca_telemetry_service.go diff --git a/api/v1alpha1/nicclusterpolicy_types.go b/api/v1alpha1/nicclusterpolicy_types.go index 2a3e04487..60ca7e2a4 100644 --- a/api/v1alpha1/nicclusterpolicy_types.go +++ b/api/v1alpha1/nicclusterpolicy_types.go @@ -246,20 +246,38 @@ type NICFeatureDiscoverySpec struct { ImageSpec `json:""` } +// DOCATelemetryServiceConfig contains configuration for the DOCATelemetryService. +type DOCATelemetryServiceConfig struct { + // FromConfigMap sets the configMap the DOCATelemetryService gets its configuration from. The ConfigMap must be in + // the same namespace as the NICClusterPolicy. + // +optional + FromConfigMap string `json:"fromConfigMap"` +} + +// DOCATelemetryServiceSpec is the configuration for DOCA Telemetry Service. 
+type DOCATelemetryServiceSpec struct { + ImageSpec `json:""` + // +optional + // Config contains custom config for the DOCATelemetryService. + // If set no default config will be deployed. + Config *DOCATelemetryServiceConfig `json:"config"` +} + // NicClusterPolicySpec defines the desired state of NicClusterPolicy type NicClusterPolicySpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster // Important: Run "make" to regenerate code after modifying this file - NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"` - Tolerations []v1.Toleration `json:"tolerations,omitempty"` - OFEDDriver *OFEDDriverSpec `json:"ofedDriver,omitempty"` - RdmaSharedDevicePlugin *DevicePluginSpec `json:"rdmaSharedDevicePlugin,omitempty"` - SriovDevicePlugin *DevicePluginSpec `json:"sriovDevicePlugin,omitempty"` - IBKubernetes *IBKubernetesSpec `json:"ibKubernetes,omitempty"` - SecondaryNetwork *SecondaryNetworkSpec `json:"secondaryNetwork,omitempty"` - NvIpam *NVIPAMSpec `json:"nvIpam,omitempty"` - NicFeatureDiscovery *NICFeatureDiscoverySpec `json:"nicFeatureDiscovery,omitempty"` + NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"` + Tolerations []v1.Toleration `json:"tolerations,omitempty"` + OFEDDriver *OFEDDriverSpec `json:"ofedDriver,omitempty"` + RdmaSharedDevicePlugin *DevicePluginSpec `json:"rdmaSharedDevicePlugin,omitempty"` + SriovDevicePlugin *DevicePluginSpec `json:"sriovDevicePlugin,omitempty"` + IBKubernetes *IBKubernetesSpec `json:"ibKubernetes,omitempty"` + SecondaryNetwork *SecondaryNetworkSpec `json:"secondaryNetwork,omitempty"` + NvIpam *NVIPAMSpec `json:"nvIpam,omitempty"` + NicFeatureDiscovery *NICFeatureDiscoverySpec `json:"nicFeatureDiscovery,omitempty"` + DOCATelemetryService *DOCATelemetryServiceSpec `json:"docaTelemetryService,omitempty"` } // AppliedState defines a finer-grained view of the observed state of NicClusterPolicy diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 
26139f0bc..417d9c3ed 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -55,6 +55,42 @@ func (in *ConfigMapNameReference) DeepCopy() *ConfigMapNameReference { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DOCATelemetryServiceConfig) DeepCopyInto(out *DOCATelemetryServiceConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DOCATelemetryServiceConfig. +func (in *DOCATelemetryServiceConfig) DeepCopy() *DOCATelemetryServiceConfig { + if in == nil { + return nil + } + out := new(DOCATelemetryServiceConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DOCATelemetryServiceSpec) DeepCopyInto(out *DOCATelemetryServiceSpec) { + *out = *in + in.ImageSpec.DeepCopyInto(&out.ImageSpec) + if in.Config != nil { + in, out := &in.Config, &out.Config + *out = new(DOCATelemetryServiceConfig) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DOCATelemetryServiceSpec. +func (in *DOCATelemetryServiceSpec) DeepCopy() *DOCATelemetryServiceSpec { + if in == nil { + return nil + } + out := new(DOCATelemetryServiceSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DevicePluginSpec) DeepCopyInto(out *DevicePluginSpec) { *out = *in @@ -604,6 +640,11 @@ func (in *NicClusterPolicySpec) DeepCopyInto(out *NicClusterPolicySpec) { *out = new(NICFeatureDiscoverySpec) (*in).DeepCopyInto(*out) } + if in.DOCATelemetryService != nil { + in, out := &in.DOCATelemetryService, &out.DOCATelemetryService + *out = new(DOCATelemetryServiceSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NicClusterPolicySpec. diff --git a/config/crd/bases/mellanox.com_nicclusterpolicies.yaml b/config/crd/bases/mellanox.com_nicclusterpolicies.yaml index 9b6a1a153..4496c765e 100644 --- a/config/crd/bases/mellanox.com_nicclusterpolicies.yaml +++ b/config/crd/bases/mellanox.com_nicclusterpolicies.yaml @@ -46,6 +46,77 @@ spec: spec: description: NicClusterPolicySpec defines the desired state of NicClusterPolicy properties: + docaTelemetryService: + description: DOCATelemetryServiceSpec is the configuration for DOCA + Telemetry Service. + properties: + config: + description: |- + Config contains custom config for the DOCATelemetryService. + If set no default config will be deployed. + properties: + fromConfigMap: + description: |- + FromConfigMap sets the configMap the DOCATelemetryService gets its configuration from. The ConfigMap must be in + the same namespace as the NICClusterPolicy. + type: string + type: object + containerResources: + items: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + name: + description: Name of the container the requirements are + set for + type: string + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + required: + - name + type: object + type: array + image: + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullSecrets: + default: [] + items: + type: string + type: array + repository: + pattern: '[a-zA-Z0-9\.\-\/]+' + type: string + version: + pattern: '[a-zA-Z0-9\.-]+' + type: string + required: + - image + - repository + - version + type: object ibKubernetes: description: IBKubernetesSpec describes configuration options for ib-kubernetes diff --git a/controllers/nicclusterpolicy_controller_test.go b/controllers/nicclusterpolicy_controller_test.go index 1a92c8a96..057be3fd7 100644 --- a/controllers/nicclusterpolicy_controller_test.go +++ b/controllers/nicclusterpolicy_controller_test.go @@ -17,16 +17,14 @@ limitations under the License. package controllers //nolint:dupl import ( - goctx "context" + "context" "fmt" - "k8s.io/apimachinery/pkg/types" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" . 
"github.com/onsi/ginkgo/v2" @@ -37,9 +35,7 @@ import ( "github.com/Mellanox/network-operator/pkg/nodeinfo" ) -//nolint:dupl var _ = Describe("NicClusterPolicyReconciler Controller", func() { - Context("When NicClusterPolicy CR is created", func() { It("should create whereabouts and delete it after un-setting CR value", func() { By("Check NicClusterPolicy with whereabouts") @@ -60,17 +56,17 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { }, } - err := k8sClient.Create(goctx.TODO(), &cr) + err := k8sClient.Create(context.TODO(), &cr) Expect(err).NotTo(HaveOccurred()) ncp := &mellanoxv1alpha1.NicClusterPolicy{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp) Expect(err).NotTo(HaveOccurred()) By("Check DS created with state label") Eventually(func() bool { ds := &appsv1.DaemonSet{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, ds) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, ds) if err != nil { return false } @@ -84,7 +80,7 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { By("Check SA created with state label") Eventually(func() bool { ds := &corev1.ServiceAccount{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, ds) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, ds) if err != nil { return false } @@ -97,29 +93,29 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { By("Update CR to remove whereabout") ncp = &mellanoxv1alpha1.NicClusterPolicy{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp) + err = k8sClient.Get(context.TODO(), 
types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp) Expect(err).NotTo(HaveOccurred()) ncp.Spec.SecondaryNetwork = nil - err = k8sClient.Update(goctx.TODO(), ncp) + err = k8sClient.Update(context.TODO(), ncp) Expect(err).NotTo(HaveOccurred()) By("Check DS is deleted") Eventually(func() bool { ds := &appsv1.DaemonSet{} - err := k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, ds) - return errors.IsNotFound(err) + err := k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, ds) + return apierrors.IsNotFound(err) }, timeout*3, interval).Should(BeTrue()) By("Check SA is deleted") Eventually(func() bool { sa := &corev1.ServiceAccount{} - err := k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, sa) - return errors.IsNotFound(err) + err := k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "whereabouts"}, sa) + return apierrors.IsNotFound(err) }, timeout*3, interval).Should(BeTrue()) By("Delete NicClusterPolicy") - err = k8sClient.Delete(goctx.TODO(), &cr) + err = k8sClient.Delete(context.TODO(), &cr) Expect(err).NotTo(HaveOccurred()) }) It("Unsupported name", func() { @@ -129,16 +125,16 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { Namespace: "", }, } - err := k8sClient.Create(goctx.TODO(), &cr) + err := k8sClient.Create(context.TODO(), &cr) Expect(err).NotTo(HaveOccurred()) Eventually(func() string { found := &mellanoxv1alpha1.NicClusterPolicy{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, found) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, found) Expect(err).NotTo(HaveOccurred()) return string(found.Status.State) }, timeout*3, interval).Should(BeEquivalentTo(mellanoxv1alpha1.StateIgnore)) - err = k8sClient.Delete(goctx.TODO(), 
&cr) + err = k8sClient.Delete(context.TODO(), &cr) Expect(err).NotTo(HaveOccurred()) }) }) @@ -152,7 +148,7 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { Annotations: make(map[string]string), }, } - err := k8sClient.Create(goctx.TODO(), node) + err := k8sClient.Create(context.TODO(), node) Expect(err).NotTo(HaveOccurred()) By("Create NicClusterPolicy") cr := mellanoxv1alpha1.NicClusterPolicy{ @@ -171,34 +167,35 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { }, }, } - err = k8sClient.Create(goctx.TODO(), &cr) + + err = k8sClient.Create(context.TODO(), &cr) Expect(err).NotTo(HaveOccurred()) ncp := &mellanoxv1alpha1.NicClusterPolicy{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp) Expect(err).NotTo(HaveOccurred()) By("Wait for NicClusterPolicy state to be populated") Eventually(func() string { found := &mellanoxv1alpha1.NicClusterPolicy{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, found) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, found) Expect(err).NotTo(HaveOccurred()) return string(found.Status.State) }, timeout*3, interval).Should(BeEquivalentTo(mellanoxv1alpha1.StateNotReady)) By("Update Node labels") n := &corev1.Node{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: node.GetNamespace(), Name: node.GetName()}, n) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: node.GetNamespace(), Name: node.GetName()}, n) Expect(err).NotTo(HaveOccurred()) patch := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:"true", %q:"true"}}}`, nodeinfo.NodeLabelWaitOFED, nodeinfo.NodeLabelMlnxNIC)) - err = k8sClient.Patch(goctx.TODO(), n, client.RawPatch(types.StrategicMergePatchType, patch)) + 
err = k8sClient.Patch(context.TODO(), n, client.RawPatch(types.StrategicMergePatchType, patch)) Expect(err).NotTo(HaveOccurred()) Consistently(func() bool { n := &corev1.Node{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: node.GetNamespace(), Name: node.GetName()}, n) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: node.GetNamespace(), Name: node.GetName()}, n) if err != nil { return false } @@ -206,13 +203,13 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { }, timeout, interval).Should(BeTrue()) By("Delete NicClusterPolicy") - err = k8sClient.Delete(goctx.TODO(), &cr) + err = k8sClient.Delete(context.TODO(), &cr) Expect(err).NotTo(HaveOccurred()) By("Verify Mofed Label is false") Eventually(func() bool { n := &corev1.Node{} - err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: node.GetNamespace(), Name: node.GetName()}, n) + err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: node.GetNamespace(), Name: node.GetName()}, n) if err != nil { return false } @@ -220,8 +217,104 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() { }, timeout*3, interval).Should(BeTrue()) By("Delete Node") - err = k8sClient.Delete(goctx.TODO(), node) + err = k8sClient.Delete(context.TODO(), node) Expect(err).NotTo(HaveOccurred()) }) }) + + Context("Create DOCATelemetryService deployment through NICClusterPolicy", func() { + stateLabel := "state-doca-telemetry-service" + imageRepo := "nvcr.io/nvidia/doca" + imageName := "doca-telemetry-service" + imageVersion := "1.15.5-doca2.5.0" + updatedVersion := "v9.9.9-doca2.5.0" + ctx := context.Background() + It("should create, update and delete doca-telemetry-service through NICClusterPolicy", func() { + By("Create doca-telemetry-service through NICClusterPolicy") + cr := &mellanoxv1alpha1.NicClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nic-cluster-policy", + Namespace: "test-namespace", + }, + Spec: 
mellanoxv1alpha1.NicClusterPolicySpec{ + DOCATelemetryService: &mellanoxv1alpha1.DOCATelemetryServiceSpec{ + ImageSpec: mellanoxv1alpha1.ImageSpec{ + Image: imageName, + Repository: imageRepo, + Version: imageVersion, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, cr)).To(Succeed()) + + By("Check DaemonSet is correctly created") + Eventually(func() bool { + ds := &appsv1.DaemonSet{} + err := k8sClient.Get(ctx, types.NamespacedName{Namespace: namespaceName, Name: "doca-telemetry-service"}, ds) + if err != nil { + return false + } + l, ok := ds.Labels[consts.StateLabel] + if !ok { + return false + } + return l == stateLabel + }, timeout*3, interval).Should(BeTrue()) + + By("Check ConfigMap is created with state label") + Eventually(func() bool { + cm := &corev1.ConfigMap{} + err := k8sClient.Get(ctx, types.NamespacedName{Namespace: namespaceName, Name: "doca-telemetry-service"}, cm) + if err != nil { + return false + } + l, ok := cm.Labels[consts.StateLabel] + if !ok { + return false + } + return l == stateLabel + }, timeout*3, interval).Should(BeTrue()) + + By("Update DOCATelemetryService through NICClusterPolicy") + expectedImageName := fmt.Sprintf("%v/%v:%v", imageRepo, imageName, updatedVersion) + + // Patch the NICClusterPolicy with the updated DOCATelemetryService version number. + patch := []byte(fmt.Sprintf(`{"spec": {"docaTelemetryService":{"version": %q}}}`, updatedVersion)) + Expect(k8sClient.Patch(ctx, cr, client.RawPatch(types.MergePatchType, patch))).To(Succeed()) + + // Expect the image name in the Daemonset to be updated with the new version. 
+ Eventually(func() bool { + ds := &appsv1.DaemonSet{} + err := k8sClient.Get(ctx, types.NamespacedName{Namespace: namespaceName, Name: "doca-telemetry-service"}, ds) + if err != nil { + return false + } + return ds.Spec.Template.Spec.Containers[0].Image == expectedImageName + }, timeout*3, interval).Should(BeTrue()) + + By("Delete DOCATelemetryService through NICClusterPolicy") + + // Patch the NICClusterPolicy to drop the DOCATelemetryService. + patch = []byte(`{"spec": {"docaTelemetryService": null}}`) + Expect(k8sClient.Patch(ctx, cr, client.RawPatch(types.MergePatchType, patch))).To(Succeed()) + + // Expect the DaemonSet to be NotFound by the client. + Eventually(func() bool { + ds := &appsv1.DaemonSet{} + err := k8sClient.Get(ctx, types.NamespacedName{Namespace: namespaceName, Name: "doca-telemetry-service"}, ds) + return apierrors.IsNotFound(err) + }, timeout*3, interval).Should(BeTrue()) + + // Expect the ConfigMap to be NotFound by the client. + Eventually(func() bool { + cm := &corev1.ConfigMap{} + err := k8sClient.Get(ctx, types.NamespacedName{Namespace: namespaceName, Name: "doca-telemetry-service"}, cm) + return apierrors.IsNotFound(err) + }, timeout*3, interval).Should(BeTrue()) + + By("Delete NICClusterPolicy") + Expect(k8sClient.Delete(ctx, cr)).To(Succeed()) + }) + }) }) diff --git a/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml b/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml index 9b6a1a153..4496c765e 100644 --- a/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml +++ b/deployment/network-operator/crds/mellanox.com_nicclusterpolicies.yaml @@ -46,6 +46,77 @@ spec: spec: description: NicClusterPolicySpec defines the desired state of NicClusterPolicy properties: + docaTelemetryService: + description: DOCATelemetryServiceSpec is the configuration for DOCA + Telemetry Service. + properties: + config: + description: |- + Config contains custom config for the DOCATelemetryService. 
+ If set no default config will be deployed. + properties: + fromConfigMap: + description: |- + FromConfigMap sets the configMap the DOCATelemetryService gets its configuration from. The ConfigMap must be in + the same namespace as the NICClusterPolicy. + type: string + type: object + containerResources: + items: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + name: + description: Name of the container the requirements are + set for + type: string + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + required: + - name + type: object + type: array + image: + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullSecrets: + default: [] + items: + type: string + type: array + repository: + pattern: '[a-zA-Z0-9\.\-\/]+' + type: string + version: + pattern: '[a-zA-Z0-9\.-]+' + type: string + required: + - image + - repository + - version + type: object ibKubernetes: description: IBKubernetesSpec describes configuration options for ib-kubernetes diff --git a/deployment/network-operator/templates/_helpers.tpl b/deployment/network-operator/templates/_helpers.tpl index 6f2702331..813bd8f78 100644 --- a/deployment/network-operator/templates/_helpers.tpl +++ b/deployment/network-operator/templates/_helpers.tpl @@ -228,3 +228,20 @@ imagePullSecrets helpers {{- end }} {{- $imagePullSecrets | toJson }} {{- end }} + +{{- define "network-operator.docaTelemetryService.imagePullSecrets" }} +{{- $imagePullSecrets := list }} +{{- if .Values.docaTelemetryService.imagePullSecrets }} +{{- range .Values.docaTelemetryService.imagePullSecrets }} +{{- $imagePullSecrets = append $imagePullSecrets . }} +{{- end }} +{{- else }} +{{- if .Values.imagePullSecrets }} +{{- range .Values.imagePullSecrets }} +{{- $imagePullSecrets = append $imagePullSecrets . 
}} +{{- end }} +{{- end }} +{{- end }} +{{- $imagePullSecrets | toJson }} +{{- end }} + diff --git a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml index d47b03e92..1943435ab 100644 --- a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml +++ b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml @@ -227,4 +227,14 @@ spec: containerResources: {{ toYaml .Values.nicFeatureDiscovery.containerResources | nindent 6 }} {{- end }} {{- end }} + {{- if .Values.docaTelemetryService.deploy }} + docaTelemetryService: + image: {{ .Values.docaTelemetryService.image }} + repository: {{ .Values.docaTelemetryService.repository }} + version: {{ .Values.docaTelemetryService.version }} + imagePullSecrets: {{ include "network-operator.docaTelemetryService.imagePullSecrets" . }} + {{- if .Values.docaTelemetryService.containerResources }} + containerResources: {{ toYaml .Values.docaTelemetryService.containerResources | nindent 6 }} + {{- end }} + {{- end }} {{ end }} diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index fcd30df20..b9af8981d 100644 --- a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -441,6 +441,21 @@ nicFeatureDiscovery: # cpu: "300m" # memory: "150Mi" +docaTelemetryService: + deploy: false + image: doca_telemetry + repository: nvcr.io/nvidia/doca + version: 1.15.5-doca2.5.0-host + # imagePullSecrets: [] + # containerResources: + # - name: "doca-telemetry-service" + # requests: + # cpu: "100m" + # memory: "50Mi" + # limits: + # cpu: "300m" + # memory: "150Mi" + # Can be set to nicclusterpolicy and override other ds node affinity, # e.g.
https://github.com/Mellanox/network-operator/blob/master/manifests/state-multus-cni/0050-multus-ds.yml#L26-L36 #nodeAffinity: diff --git a/hack/release.go b/hack/release.go index aa32fb7a6..bb91b549a 100644 --- a/hack/release.go +++ b/hack/release.go @@ -50,6 +50,7 @@ type Release struct { IpamPlugin *mellanoxv1alpha1.ImageSpec NvIPAM *mellanoxv1alpha1.ImageSpec NicFeatureDiscovery *mellanoxv1alpha1.ImageSpec + DOCATelemetryService *mellanoxv1alpha1.ImageSpec } func readDefaults(releaseDefaults string) Release { @@ -95,6 +96,7 @@ func readEnvironmentVariables(release *Release) { initWithEnvVariale("IPAM_PLUGIN", release.Ipoib) initWithEnvVariale("NV_IPAM", release.NvIPAM) initWithEnvVariale("NIC_FEATURE_DISCOVERY", release.NicFeatureDiscovery) + initWithEnvVariale("DOCA_TELEMETRY_SERVICE", release.DOCATelemetryService) } func main() { diff --git a/hack/release.yaml b/hack/release.yaml index a4fdbe0ce..8e413c391 100644 --- a/hack/release.yaml +++ b/hack/release.yaml @@ -65,3 +65,7 @@ nicFeatureDiscovery: image: nic-feature-discovery repository: ghcr.io/mellanox version: v0.0.1 +docaTelemetryService: + image: doca_telemetry + repository: nvcr.io/nvidia/doca + version: 1.15.5-doca2.5.0-host diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index 40f38980b..9ef4376d0 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -441,6 +441,21 @@ nicFeatureDiscovery: # cpu: "300m" # memory: "150Mi" +docaTelemetryService: + deploy: false + image: {{ .DOCATelemetryService.Image }} + repository: {{ .DOCATelemetryService.Repository }} + version: {{ .DOCATelemetryService.Version }} + # imagePullSecrets: [] + # containerResources: + # - name: "doca-telemetry-service" + # requests: + # cpu: "100m" + # memory: "50Mi" + # limits: + # cpu: "300m" + # memory: "150Mi" + # Can be set to nicclusterpolicy and override other ds node affinity, # e.g.
https://github.com/Mellanox/network-operator/blob/master/manifests/state-multus-cni/0050-multus-ds.yml#L26-L36 #nodeAffinity: diff --git a/manifests/state-doca-telemetry-service/0010-doca-telemetry-service.yaml b/manifests/state-doca-telemetry-service/0010-doca-telemetry-service.yaml new file mode 100644 index 000000000..9ec916be2 --- /dev/null +++ b/manifests/state-doca-telemetry-service/0010-doca-telemetry-service.yaml @@ -0,0 +1,139 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: doca-telemetry-service + namespace: {{ .RuntimeSpec.Namespace }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: doca-telemetry + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: doca-telemetry + spec: + # hostNetwork is required to expose sysfs counters folder + hostNetwork: true + {{- if .NodeAffinity }} + affinity: + nodeAffinity: + {{- .NodeAffinity | yaml | nindent 10 }} + {{- end }} + {{- if .CrSpec.ImagePullSecrets }} + imagePullSecrets: + {{- range .CrSpec.ImagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + containers: + - name: doca-telemetry-service + image: {{ .CrSpec.Repository }}/{{ .CrSpec.Image }}:{{ .CrSpec.Version }} + {{- with .RuntimeSpec.ContainerResources }} + {{- with index .
"doca-telemetry-service" }} + resources: + {{- if .Requests }} + requests: + {{ .Requests | yaml | nindent 14}} + {{- end }} + {{- if .Limits }} + limits: + {{ .Limits | yaml | nindent 14 }} + {{- end }} + {{- end }} + {{- end }} + volumeMounts: + - name: doca-telemetry-service-configmap + mountPath: /configmap + - name: pod-device-resources + mountPath: /var/lib/kubelet/pod-resources + - name: cni + mountPath: /var/run/k8s.cni.cncf.io/devinfo/cni + env: + - name: MY_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + - name: PROMETHEUS_XCSET_JOIN_FIELDS + value: "hca" + - name: PROMETHEUS_XCSET_MANDATORY_TYPES + value: "counters,pod_resources_event" + command: [ "/bin/bash", "-c", "rm -rf /config/* && DTS_CONFIG_DIR=host /usr/bin/telemetry-init.sh && /usr/bin/telemetry-run.sh" ] + securityContext: + privileged: true + volumes: + - name: doca-telemetry-service-configmap + configMap: + name: {{ .ConfigMapName }} + - name: pod-device-resources + hostPath: + path: /var/lib/kubelet/pod-resources + - name: cni + hostPath: + path: /var/run/k8s.cni.cncf.io/devinfo/cni/ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .ConfigMapName }} + namespace: {{ .RuntimeSpec.Namespace }} +data: + dts_config_map.ini: | + ############################ GENERAL CONFIGURATION ############################ + # Port for TCP connection with aggregator + port=0 + # Timeout for forced page rotation + sync-time-limit=10000 + + # Sizes of internal buffers used by counter and event providers. + # By the default they set to 1MB, however + # If set to 0 buffers will be rounded up to fit at least a single data payload. + counter-buffer-size=0 + event-buffer-size=0 + + # Verbosity levels: error=3, info=6, debug=7 + verbose=6 + + # Sampling interval for providers in ms + update=1000 + + # Configuration cache folder that is used on restarts.
+ runtime-configuration-folder=/config/clx_last_runtime_conf + idle-time-limit=0 + + ################################### PROVIDERS ################################# + enable-explicitly=true + + ##################################### + # DOCA TELEMETRY SERVICE PROVIDERS: # + ##################################### + + enable-provider=sysfs + enable-provider=pod_resources + enable-provider=ethtool + enable-provider=ifconfig + + ################################ DATA OUTPUTS ################################# + + ################################ Prometheus ################################### + # Set address and port for Prometheus endpoint. + # If not set, the Prometheus endpoint is disabled. + prometheus=http://0.0.0.0:9189 + + # Prometheus can use data field as index to keep several data records with + # different index value. Index fields will be added to Prometheus labels + # Comma-separated counterset description for Prometheus indexing + #prometheus-indexes=idx1,idx2 + + # Comma-separated fieldset description for Prometheus indexing + #prometheus-fset-indexes=idx1,idx2 + prometheus-fset-indexes=device_name,device_id,pod_name,id + + # Comma-separated list of counter names to be ignored by Prometheus exporter + #prometheus-ignore-names=counter_name1,counter_name_2 + + # Comma-separated list of data source tags to be ignored by Prometheus exporter + prometheus-ignore-tags=FI_metrics + diff --git a/pkg/state/continuity_check_test.go b/pkg/state/continuity_check_test.go index 18e04f34b..395c32342 100644 --- a/pkg/state/continuity_check_test.go +++ b/pkg/state/continuity_check_test.go @@ -135,6 +135,7 @@ var _ = Describe("Continuity check", func() { cr.Spec.SecondaryNetwork.IpamPlugin = &imageSpec cr.Spec.SecondaryNetwork.IPoIB = &imageSpec cr.Spec.SecondaryNetwork.Multus = &mellanoxv1alpha1.MultusSpec{ImageSpecWithConfig: imageSpecWithConfig} + cr.Spec.DOCATelemetryService = &mellanoxv1alpha1.DOCATelemetryServiceSpec{ImageSpec: imageSpec} manifestsBaseDir := 
filepath.Join("..", "..", "manifests") envConfig = &config.OperatorConfig{State: config.StateConfig{ManifestBaseDir: manifestsBaseDir}} diff --git a/pkg/state/factory.go b/pkg/state/factory.go index 0334282b8..9c580e4ec 100644 --- a/pkg/state/factory.go +++ b/pkg/state/factory.go @@ -124,11 +124,15 @@ func newNicClusterPolicyStates(k8sAPIClient client.Client) ([]State, error) { if err != nil { return nil, errors.Wrapf(err, "failed to create nic-feature-discovery State") } - + docaTelemetryServiceState, _, err := NewStateDOCATelemetryService( + k8sAPIClient, filepath.Join(manifestBaseDir, "state-doca-telemetry-service")) + if err != nil { + return nil, errors.Wrapf(err, "failed to create doca-telemetry-service State") + } return []State{ multusState, cniPluginsState, ipoibState, whereaboutState, ofedState, sriovDpState, sharedDpState, ibKubernetesState, nvIpamCniState, - nicFeatureDiscoveryState}, nil + nicFeatureDiscoveryState, docaTelemetryServiceState}, nil } // newMacvlanNetworkStates creates states that reconcile MacvlanNetwork CRD diff --git a/pkg/state/state_doca_telemetry_service.go b/pkg/state/state_doca_telemetry_service.go new file mode 100644 index 000000000..fc71db2ff --- /dev/null +++ b/pkg/state/state_doca_telemetry_service.go @@ -0,0 +1,196 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package state + +import ( + "context" + + "github.com/go-logr/logr" + "github.com/pkg/errors" + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + + mellanoxv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" + "github.com/Mellanox/network-operator/pkg/config" + "github.com/Mellanox/network-operator/pkg/consts" + "github.com/Mellanox/network-operator/pkg/render" + "github.com/Mellanox/network-operator/pkg/utils" +) + +type docaTelemetryServiceState struct { + stateSkel +} + +type dtsRuntimeSpec struct { + runtimeSpec + ContainerResources ContainerResourcesMap +} + +const ( + docaTelemetryServiceName = "state-doca-telemetry-service" + docaTelemetryServiceDefaultConfigMapName = "doca-telemetry-service" + docaTelemetryServiceDescription = "DOCA Telemetry Service deployed in the cluster" +) + +// DOCATelemetryServiceManifestRenderData is used to render Kubernetes objects related to DOCA Telemetry Service. +type DOCATelemetryServiceManifestRenderData struct { + CrSpec *mellanoxv1alpha1.DOCATelemetryServiceSpec + ConfigMapName string + RuntimeSpec *dtsRuntimeSpec + Tolerations []v1.Toleration + NodeAffinity *v1.NodeAffinity +} + +// Sync attempt to get the system to match the desired state which State represents. +// a sync operation must be relatively short and must not block the execution thread. 
+func (d docaTelemetryServiceState) Sync( + ctx context.Context, customResource interface{}, infoCatalog InfoCatalog) (SyncState, error) { + cr := customResource.(*mellanoxv1alpha1.NicClusterPolicy) + + reqLogger := log.FromContext(ctx) + reqLogger.WithValues("State:", d.name, "Name:", cr.Name, "Namespace:", cr.Namespace) + log.IntoContext(ctx, reqLogger) + reqLogger.V(consts.LogLevelInfo).Info("Sync Custom resource") + + if cr.Spec.DOCATelemetryService == nil { + // Either this state was not required to run or an update occurred and we need to remove + // the resources that where created. + return d.handleStateObjectsDeletion(ctx) + } + + objs, err := d.GetManifestObjects(ctx, cr, infoCatalog, reqLogger) + if err != nil { + return SyncStateNotReady, errors.Wrap(err, "failed to create k8s objects from manifest") + } + if len(objs) == 0 { + return SyncStateNotReady, nil + } + + // Create objects if they don't exist, Update objects if they do exist + err = d.createOrUpdateObjs(ctx, func(obj *unstructured.Unstructured) error { + if err := controllerutil.SetControllerReference(cr, obj, d.client.Scheme()); err != nil { + return errors.Wrap(err, "failed to set controller reference for object") + } + return nil + }, objs) + if err != nil { + return SyncStateNotReady, errors.Wrap(err, "failed to create/update objects") + } + waitForStaleObjectsRemoval, err := d.handleStaleStateObjects(ctx, objs) + if err != nil { + return SyncStateNotReady, errors.Wrap(err, "failed to handle state stale objects") + } + if waitForStaleObjectsRemoval { + return SyncStateNotReady, nil + } + // Check objects status + syncState, err := d.getSyncState(ctx, objs) + if err != nil { + return SyncStateNotReady, errors.Wrap(err, "failed to get sync state") + } + return syncState, nil +} + +// GetManifestObjects returns the Unstructured objects to deploy for DOCA Telemetry Service. 
+func (d docaTelemetryServiceState) GetManifestObjects( + _ context.Context, cr *mellanoxv1alpha1.NicClusterPolicy, + _ InfoCatalog, reqLogger logr.Logger) ([]*unstructured.Unstructured, error) { + if cr == nil || cr.Spec.DOCATelemetryService == nil { + return nil, errors.New("failed to render objects: state spec is nil") + } + dts := cr.Spec.DOCATelemetryService + + configMapName := docaTelemetryServiceDefaultConfigMapName + if dts.Config != nil { + configMapName = dts.Config.FromConfigMap + } + renderData := &DOCATelemetryServiceManifestRenderData{ + CrSpec: dts, + ConfigMapName: configMapName, + Tolerations: cr.Spec.Tolerations, + NodeAffinity: cr.Spec.NodeAffinity, + RuntimeSpec: &dtsRuntimeSpec{ + runtimeSpec: runtimeSpec{config.FromEnv().State.NetworkOperatorResourceNamespace}, + ContainerResources: createContainerResourcesMap(cr.Spec.DOCATelemetryService.ContainerResources), + }, + } + + // Render objects related to the DOCATelemetryService + reqLogger.V(consts.LogLevelDebug).Info("Rendering objects", "data:", renderData) + renderedObjects, err := d.renderer.RenderObjects(&render.TemplatingData{Data: renderData}) + if err != nil { + return nil, errors.Wrap(err, "failed to render objects") + } + + // Remove the configMap from the deployment if an explicit config is set. + if !shouldDeployConfigMap(cr.Spec.DOCATelemetryService) { + for i := range renderedObjects { + if renderedObjects[i].GetKind() == "ConfigMap" { + reqLogger.V(consts.LogLevelDebug).Info("Not rendering ConfigMap for DocaTelemetryService") + renderedObjects = append(renderedObjects[:i], renderedObjects[i+1:]...) + continue + } + } + } + reqLogger.V(consts.LogLevelDebug).Info("Rendered", "objects:", renderedObjects) + return renderedObjects, nil +} + +// Name returns the name of the DOCA Telemetry Service. +func (d docaTelemetryServiceState) Name() string { + return docaTelemetryServiceName +} + +// Description returns the description of the DOCA Telemetry Service. 
+func (d docaTelemetryServiceState) Description() string { + return docaTelemetryServiceDescription +} + +// GetWatchSources returns the objects that should be watched to trigger events for the DOCA Telemetry Service state. +func (docaTelemetryServiceState) GetWatchSources() map[string]client.Object { + wr := make(map[string]client.Object) + wr["DaemonSet"] = &appsv1.DaemonSet{} + return wr +} + +// NewStateDOCATelemetryService creates a new state for DOCA Telemetry Service. +func NewStateDOCATelemetryService( + c client.Client, manifestDir string) (State, ManifestRenderer, error) { + files, err := utils.GetFilesWithSuffix(manifestDir, render.ManifestFileSuffix...) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to get files from manifest dir") + } + + renderer := render.NewRenderer(files) + state := &docaTelemetryServiceState{ + stateSkel: stateSkel{ + name: docaTelemetryServiceName, + description: docaTelemetryServiceDescription, + client: c, + renderer: renderer, + }} + return state, state, nil +} + +// If the DOCATelemetryService defines a configuration we should not deploy the configMap. +func shouldDeployConfigMap(dts *mellanoxv1alpha1.DOCATelemetryServiceSpec) bool { + return dts.Config == nil +}