diff --git a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml index c71d9f75..4da2ac34 100644 --- a/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml +++ b/deployment/network-operator/templates/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml @@ -18,8 +18,6 @@ apiVersion: mellanox.com/v1alpha1 kind: NicClusterPolicy metadata: name: nic-cluster-policy - annotations: - helm.sh/hook: post-install,post-upgrade spec: {{- if .Values.nodeAffinity }} nodeAffinity: diff --git a/deployment/network-operator/templates/delete-cr-hook.yaml b/deployment/network-operator/templates/scale-down-hook.yaml similarity index 66% rename from deployment/network-operator/templates/delete-cr-hook.yaml rename to deployment/network-operator/templates/scale-down-hook.yaml index 3103bf1a..d124e77c 100644 --- a/deployment/network-operator/templates/delete-cr-hook.yaml +++ b/deployment/network-operator/templates/scale-down-hook.yaml @@ -1,54 +1,59 @@ -{{- if .Values.deployCR }} --- apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "network-operator.fullname" . }}-hooks-sa + name: {{ include "network-operator.fullname" . }}-hooks-scale-sa annotations: - helm.sh/hook: post-delete + helm.sh/hook: pre-install,pre-upgrade helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation helm.sh/hook-weight: "0" --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: {{ include "network-operator.fullname" . }}-hooks-role + name: {{ include "network-operator.fullname" . }}-hooks-scale-role annotations: - helm.sh/hook: post-delete + helm.sh/hook: pre-install,pre-upgrade helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation helm.sh/hook-weight: "0" rules: - apiGroups: - - mellanox.com + - apps resources: - - nicclusterpolicies + - deployments/scale verbs: - - delete + - "*" + - apiGroups: + - "" + resources: + - pods + verbs: + - "*" --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "network-operator.fullname" . }}-hooks-binding + name: {{ include "network-operator.fullname" . }}-hooks-scale-binding annotations: - helm.sh/hook: post-delete + helm.sh/hook: pre-install,pre-upgrade helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation helm.sh/hook-weight: "0" subjects: - kind: ServiceAccount - name: {{ include "network-operator.fullname" . }}-hooks-sa + name: {{ include "network-operator.fullname" . }}-hooks-scale-sa namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: {{ include "network-operator.fullname" . }}-hooks-role + name: {{ include "network-operator.fullname" . }}-hooks-scale-role apiGroup: rbac.authorization.k8s.io --- apiVersion: batch/v1 kind: Job metadata: - name: network-operator-delete-cr + name: network-operator-scale-down namespace: {{ .Release.Namespace }} annotations: - "helm.sh/hook": post-delete + "helm.sh/hook": pre-install,pre-upgrade "helm.sh/hook-weight": "1" "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation labels: @@ -57,7 +62,7 @@ metadata: spec: template: metadata: - name: network-operator-delete-cr + name: network-operator-scale-down labels: {{- include "network-operator.labels" . | nindent 8 }} app.kubernetes.io/component: "network-operator" @@ -74,16 +79,16 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} - serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-sa + serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-scale-sa imagePullSecrets: {{ include "network-operator.operator.imagePullSecrets" . }} containers: - - name: delete-cr + - name: scale-down image: "{{ .Values.operator.repository }}/{{ .Values.operator.image }}:{{ .Values.operator.tag | default .Chart.AppVersion }}" imagePullPolicy: IfNotPresent command: - /bin/sh - -c - > - kubectl delete nicclusterpolicies.mellanox.com nic-cluster-policy; + kubectl scale -n {{ .Release.Namespace }} deployment {{ include "network-operator.fullname" . }} --replicas=0 || true; + kubectl wait -n {{ .Release.Namespace }} pod -l control-plane={{ .Release.Name }}-controller --for=delete --timeout=30s; restartPolicy: OnFailure -{{- end }} diff --git a/pkg/migrate/migrate.go b/pkg/migrate/migrate.go index 8a862e49..6419f55b 100644 --- a/pkg/migrate/migrate.go +++ b/pkg/migrate/migrate.go @@ -18,11 +18,14 @@ package migrate import ( "context" + "fmt" "github.com/go-logr/logr" v1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" apiErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/Mellanox/network-operator/pkg/config" @@ -46,6 +49,11 @@ func migrate(ctx context.Context, log logr.Logger, c client.Client) error { // not critical for the operator operation, safer to ignore log.V(consts.LogLevelWarning).Info("ignore error during whereabouts CronJob removal") } + if err := removeStateLabelFromNVIpamConfigMap(ctx, log, c); err != nil { + // critical for the operator operation, will fail NVIPAM migration + log.V(consts.LogLevelError).Error(err, "error trying to remove state label on NV IPAM configmap") + return err + } return nil } @@ -68,3 +76,41 @@ func removeWhereaboutsIPReconcileCronJob(ctx context.Context, log logr.Logger, c } return nil } + +// reason: remove state label on NV IPAM config map if exists, to allow migration to IPPool CR +// If the state label is present, the config map will be removed by the NCP Controller as a stale object +func removeStateLabelFromNVIpamConfigMap(ctx context.Context, log logr.Logger, c client.Client) error { + namespace := config.FromEnv().State.NetworkOperatorResourceNamespace + cmName := "nvidia-k8s-ipam-config" + cfg := &corev1.ConfigMap{} + key := types.NamespacedName{ + Name: cmName, + Namespace: namespace, + } + err := c.Get(ctx, key, cfg) + if apiErrors.IsNotFound(err) { + log.V(consts.LogLevelDebug).Info("NVIPAM config map not found, skip remove state label") + return nil + } else if err != nil { + log.V(consts.LogLevelError).Error(err, "fail to get NVIPAM configmap") + return err + } + _, ok := cfg.Labels[consts.StateLabel] + if ok { + log.V(consts.LogLevelDebug).Info("clear State label from NVIPAM configmap") + patch := []byte(fmt.Sprintf("[{\"op\": \"remove\", \"path\": \"/metadata/labels/%s\"}]", consts.StateLabel)) + err = c.Patch(ctx, &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: namespace, + }, + }, client.RawPatch(types.JSONPatchType, patch)) + if err != nil { + log.V(consts.LogLevelError).Error(err, "fail to remove State label from NVIPAM configmap") + return err + } + } else { + log.V(consts.LogLevelDebug).Info("state label not set on NVIPAM configmap") + } + return nil +}