Skip to content

Commit

Permalink
Update k8s-operator-libs version
Browse files Browse the repository at this point in the history
+ use BuildState function from the lib
to build upgrade state for a cluster

Signed-off-by: Yury Kulazhenkov <[email protected]>
  • Loading branch information
ykulazhenkov committed Nov 13, 2023
1 parent 4750fea commit 58cf4df
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 201 deletions.
21 changes: 20 additions & 1 deletion api/v1alpha1/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@ limitations under the License.

package v1alpha1

import upgradeApi "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1"
import (
"fmt"

upgradeApi "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1"

"github.com/Mellanox/network-operator/pkg/consts"
)

func GetDriverUpgradePolicy(
ofedUpgradePolicy *DriverUpgradePolicySpec) *upgradeApi.DriverUpgradePolicySpec {
Expand All @@ -33,6 +39,19 @@ func GetDriverUpgradePolicy(
driverUpgradePolicy.WaitForCompletion = getWaitForCompletionSpec(ofedUpgradePolicy.WaitForCompletion)
driverUpgradePolicy.DrainSpec = getDrainSpec(ofedUpgradePolicy.DrainSpec)

if driverUpgradePolicy.DrainSpec != nil && driverUpgradePolicy.DrainSpec.Enable {
// We want to skip operator itself during the drain because the upgrade process might hang
// if the operator is evicted and can't be rescheduled to any other node, e.g. in a single-node cluster.
// It's safe to do because the goal of the node draining during the upgrade is to
// evict pods that might use driver and operator doesn't use in its own pod.
if driverUpgradePolicy.DrainSpec.PodSelector == "" {
driverUpgradePolicy.DrainSpec.PodSelector = consts.OfedDriverSkipDrainLabel
} else {
driverUpgradePolicy.DrainSpec.PodSelector =
fmt.Sprintf("%s,%s", driverUpgradePolicy.DrainSpec.PodSelector, consts.OfedDriverSkipDrainLabel)
}
}

return &driverUpgradePolicy
}

Expand Down
2 changes: 1 addition & 1 deletion api/v1alpha1/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ var _ = Describe("API utils tests", func() {
result := getDrainSpec(input)
Expect(result.Enable).To(Equal(input.Enable))
Expect(result.Force).To(Equal(input.Force))
Expect(result.PodSelector).To(Equal(input.PodSelector))
Expect(result.PodSelector).To(ContainSubstring(input.PodSelector))
Expect(result.TimeoutSecond).To(Equal(input.TimeoutSecond))
Expect(result.DeleteEmptyDir).To(Equal(input.DeleteEmptyDir))
})
Expand Down
130 changes: 5 additions & 125 deletions controllers/upgrade_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package controllers

import (
"context"
"fmt"
"time"

corev1 "k8s.io/api/core/v1"
Expand All @@ -35,7 +34,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"

"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
"github.com/go-logr/logr"
appsv1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
Expand All @@ -49,9 +47,8 @@ import (
// UpgradeReconciler reconciles OFED Daemon Sets for upgrade
type UpgradeReconciler struct {
client.Client
Scheme *runtime.Scheme
StateManager *upgrade.ClusterUpgradeStateManager
NodeUpgradeStateProvider upgrade.NodeUpgradeStateProvider
Scheme *runtime.Scheme
StateManager upgrade.ClusterUpgradeStateManager
}

const plannedRequeueInterval = time.Minute * 2
Expand Down Expand Up @@ -102,7 +99,9 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl

upgradePolicy := nicClusterPolicy.Spec.OFEDDriver.OfedUpgradePolicy

state, err := r.BuildState(ctx)
state, err := r.StateManager.BuildState(ctx,
config.FromEnv().State.NetworkOperatorResourceNamespace,
map[string]string{consts.OfedDriverLabel: ""})
if err != nil {
reqLogger.V(consts.LogLevelError).Error(err, "Failed to build cluster upgrade state")
return ctrl.Result{}, err
Expand Down Expand Up @@ -184,125 +183,6 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateAnnotations(ctx context.Contex
return nil
}

// BuildState creates a snapshot of the current OFED upgrade state in the cluster (upgrade.ClusterUpgradeState)
// It creates mappings between nodes and their upgrade state
// Nodes are grouped together with the driver POD running on them and the daemon set, controlling this pod
// This state is then used as an input for the upgrade.ClusterUpgradeStateManager
func (r *UpgradeReconciler) BuildState(ctx context.Context) (*upgrade.ClusterUpgradeState, error) {
reqLogger := log.FromContext(ctx)
reqLogger.V(consts.LogLevelInfo).Info("Building state")

upgradeState := upgrade.NewClusterUpgradeState()

daemonSets, err := r.getDriverDaemonSets(ctx)
if err != nil {
reqLogger.V(consts.LogLevelError).Error(err, "Failed to get driver daemon set list")
return nil, err
}

reqLogger.V(consts.LogLevelDebug).Info("Got driver daemon sets", "length", len(daemonSets))

// Get list of driver pods
podList := &corev1.PodList{}

err = r.List(ctx, podList,
client.InNamespace(config.FromEnv().State.NetworkOperatorResourceNamespace),
client.MatchingLabels{consts.OfedDriverLabel: ""})
if err != nil {
return nil, err
}

filteredPodList := []corev1.Pod{}
for _, ds := range daemonSets {
dsPods := r.getPodsOwnedbyDs(ds, podList.Items, reqLogger)
if int(ds.Status.DesiredNumberScheduled) != len(dsPods) {
reqLogger.V(consts.LogLevelInfo).Info("Driver daemon set has Unscheduled pods", "name", ds.Name)
return nil, fmt.Errorf("DS should not have Unscheduled pods")
}
filteredPodList = append(filteredPodList, dsPods...)
}

upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()

for i := range filteredPodList {
pod := &filteredPodList[i]
ownerDaemonSet := daemonSets[pod.OwnerReferences[0].UID]
nodeState, err := r.buildNodeUpgradeState(ctx, pod, ownerDaemonSet)
if err != nil {
reqLogger.V(consts.LogLevelError).Error(err, "Failed to build node upgrade state for pod", "pod", pod)
return nil, err
}
nodeStateLabel := nodeState.Node.Labels[upgradeStateLabel]
upgradeState.NodeStates[nodeStateLabel] = append(
upgradeState.NodeStates[nodeStateLabel], nodeState)
}

return &upgradeState, nil
}

// buildNodeUpgradeState creates a mapping between a node,
// the driver POD running on them and the daemon set, controlling this pod
func (r *UpgradeReconciler) buildNodeUpgradeState(
ctx context.Context, pod *corev1.Pod, ds *appsv1.DaemonSet) (*upgrade.NodeUpgradeState, error) {
reqLogger := log.FromContext(ctx)
node, err := r.NodeUpgradeStateProvider.GetNode(ctx, pod.Spec.NodeName)
if err != nil {
reqLogger.V(consts.LogLevelError).Error(err, "Failed to get node", "node", pod.Spec.NodeName)
return nil, err
}

upgradeStateLabel := upgrade.GetUpgradeStateLabelKey()
reqLogger.V(consts.LogLevelInfo).Info("Node hosting a driver pod",
"node", node.Name, "state", node.Labels[upgradeStateLabel])

return &upgrade.NodeUpgradeState{Node: node, DriverPod: pod, DriverDaemonSet: ds}, nil
}

// getDriverDaemonSets retrieves DaemonSets labeled with OfedDriverLabel and returns UID->DaemonSet map
func (r *UpgradeReconciler) getDriverDaemonSets(ctx context.Context) (map[types.UID]*appsv1.DaemonSet, error) {
reqLogger := log.FromContext(ctx)
// Get list of driver pods
daemonSetList := &appsv1.DaemonSetList{}

err := r.List(ctx, daemonSetList,
client.InNamespace(config.FromEnv().State.NetworkOperatorResourceNamespace),
client.MatchingLabels{consts.OfedDriverLabel: ""})
if err != nil {
reqLogger.V(consts.LogLevelError).Error(err, "Failed to get daemon set list")
return nil, err
}

daemonSetMap := make(map[types.UID]*appsv1.DaemonSet)
for i := range daemonSetList.Items {
daemonSet := &daemonSetList.Items[i]
daemonSetMap[daemonSet.UID] = daemonSet
}

return daemonSetMap, nil
}

// getPodsOwnedbyDs gets a list of pods return a list of the pods owned by the specified DaemonSet
func (r *UpgradeReconciler) getPodsOwnedbyDs(
ds *appsv1.DaemonSet, pods []corev1.Pod, reqLogger logr.Logger) []corev1.Pod {
dsPodList := []corev1.Pod{}
for i := range pods {
pod := &pods[i]
if pod.OwnerReferences == nil || len(pod.OwnerReferences) < 1 {
reqLogger.V(consts.LogLevelWarning).Info("OFED Driver Pod has no owner DaemonSet", "pod", pod)
continue
}
reqLogger.V(consts.LogLevelDebug).Info("Pod", "pod", pod.Name, "owner", pod.OwnerReferences[0].Name)

if ds.UID == pod.OwnerReferences[0].UID {
dsPodList = append(dsPodList, *pod)
} else {
reqLogger.V(consts.LogLevelWarning).Info("OFED Driver Pod is not owned by an OFED Driver DaemonSet",
"pod", pod, "actual owner", pod.OwnerReferences[0])
}
}
return dsPodList
}

// SetupWithManager sets up the controller with the Manager.
//
//nolint:dupl
Expand Down
64 changes: 0 additions & 64 deletions controllers/upgrade_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,10 @@ import (
"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"

mellanoxv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1"
"github.com/Mellanox/network-operator/pkg/consts"
Expand Down Expand Up @@ -97,68 +95,6 @@ var _ = Describe("Upgrade Controller", func() {
Expect(present).To(Equal(false))
}
})

It("should only retrieve pods owned by a given daemonset", func() {
upgrade.SetDriverName("ofed")

ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-daemonset",
Namespace: "default",
UID: "ds-uid",
},
}

// Create a Pod owned by the DaemonSet
dsPod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "ds-pod",
Namespace: "default",
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: "apps/v1",
Kind: "DaemonSet",
Name: ds.Name,
UID: ds.UID,
},
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Image: "test", Name: "test"}},
},
}
err := k8sClient.Create(goctx.TODO(), dsPod)
Expect(err).NotTo(HaveOccurred())

// Create a Pod NOT owned by the DaemonSet
nonDsPod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "non-ds-pod",
Namespace: "default",
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Image: "test", Name: "test"}},
},
}
err = k8sClient.Create(goctx.TODO(), nonDsPod)
Expect(err).NotTo(HaveOccurred())

podList := &corev1.PodList{}
err = k8sClient.List(goctx.TODO(), podList)
Expect(podList.Items).To(HaveLen(2))
Expect(err).NotTo(HaveOccurred())

upgradeReconciler := &UpgradeReconciler{
Client: k8sClient,
Scheme: k8sClient.Scheme(),
}

pods := upgradeReconciler.getPodsOwnedbyDs(ds, podList.Items, log.Log.WithName("test-log"))

// Verify that the returned Pods are owned by the DaemonSet
Expect(pods).To(HaveLen(1))
Expect(pods[0].Name).To(Equal(dsPod.Name))
})
})
})

Expand Down
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,7 @@ require (
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
)

replace github.com/emicklei/go-restful => github.com/emicklei/go-restful v2.16.0+incompatible
replace (
github.com/NVIDIA/k8s-operator-libs => gitlab.com/nvidia/cloud-native/k8s-operator-libs v0.0.0-20230913095933-ed4c9d6bb9e4
github.com/emicklei/go-restful => github.com/emicklei/go-restful v2.16.0+incompatible
)
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ
github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE=
github.com/Masterminds/semver/v3 v3.1.1 h1:hLg3sBzpNErnxhQtUy/mmLR2I9foDujNK030IGemrRc=
github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20230504161004-dd428b908f5a h1:k8HalV8rdGshw0hCPxVLt8FRzpd9hnXfjggc/ePAn3A=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20230504161004-dd428b908f5a/go.mod h1:o7UeOk3V0xlOX87NnSaGNDje23VKCKPdfN6aSIZ9BwM=
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
Expand Down Expand Up @@ -385,6 +383,8 @@ github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
gitlab.com/nvidia/cloud-native/k8s-operator-libs v0.0.0-20230913095933-ed4c9d6bb9e4 h1:RzPeXJ+YHt30+kPIFrlwiATo3D8vFgvB7rxEFtjOu5I=
gitlab.com/nvidia/cloud-native/k8s-operator-libs v0.0.0-20230913095933-ed4c9d6bb9e4/go.mod h1:o7UeOk3V0xlOX87NnSaGNDje23VKCKPdfN6aSIZ9BwM=
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
Expand Down
10 changes: 3 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,6 @@ func main() {

upgradeLogger := ctrl.Log.WithName("controllers").WithName("Upgrade")

nodeUpgradeStateProvider := upgrade.NewNodeUpgradeStateProvider(
mgr.GetClient(), upgradeLogger.WithName("nodeUpgradeStateProvider"), nil)

clusterUpdateStateManager, err := upgrade.NewClusterUpgradeStateManager(
upgradeLogger.WithName("clusterUpgradeManager"), config.GetConfigOrDie(), nil)

Expand All @@ -182,10 +179,9 @@ func main() {
}

if err = (&controllers.UpgradeReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
StateManager: clusterUpdateStateManager,
NodeUpgradeStateProvider: nodeUpgradeStateProvider,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
StateManager: clusterUpdateStateManager,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Upgrade")
os.Exit(1)
Expand Down
3 changes: 3 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,7 @@ const (
NicClusterPolicyResourceName = "nic-cluster-policy"
OfedDriverLabel = "nvidia.com/ofed-driver"
StateLabel = "nvidia.network-operator.state"
// OfedDriverSkipDrainLabel contains name of the label which is used to indicate
// that OFED pod should be skipped during the drain operation which is executed by the upgrade controller.
OfedDriverSkipDrainLabel = "nvidia.com/ofed-driver-upgrade-drain.skip"
)

0 comments on commit 58cf4df

Please sign in to comment.