Skip to content

Commit

Permalink
redesign device plugin
Browse files Browse the repository at this point in the history
always deploy sriov network device plugin and use a label to enable or
disable it on the nodes

Signed-off-by: Sebastian Sch <[email protected]>
  • Loading branch information
SchSeba committed Nov 6, 2024
1 parent 823b4d4 commit 39ef9f0
Show file tree
Hide file tree
Showing 12 changed files with 280 additions and 106 deletions.
11 changes: 0 additions & 11 deletions controllers/drain_controller_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,6 @@ func (dr *DrainReconcile) handleNodeDrainOrReboot(ctx context.Context,
return reconcile.Result{RequeueAfter: 5 * time.Second}, nil
}

reqLogger.Info("remove Device plugin from node")
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, dr.Client)
if err != nil {
reqLogger.Error(err, "failed to label node for device plugin label",
"labelKey",
constants.SriovDevicePluginLabel,
"labelValue",
constants.SriovDevicePluginLabelDisabled)
return reconcile.Result{}, err
}

// if we manage to drain we label the node state with drain completed and finish
err = utils.AnnotateObject(ctx, nodeNetworkState, constants.NodeStateDrainAnnotationCurrent, constants.DrainComplete, dr.Client)
if err != nil {
Expand Down
50 changes: 41 additions & 9 deletions controllers/sriovnetworknodepolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
constants "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
)

Expand Down Expand Up @@ -133,10 +134,6 @@ func (r *SriovNetworkNodePolicyReconciler) Reconcile(ctx context.Context, req ct
if err = r.syncDevicePluginConfigMap(ctx, defaultOpConf, policyList, nodeList); err != nil {
return reconcile.Result{}, err
}
// Render and sync Daemon objects
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultOpConf, policyList); err != nil {
return reconcile.Result{}, err
}

// All was successful. Request that this be re-triggered after ResyncPeriod,
// so we can reconcile state again.
Expand Down Expand Up @@ -182,6 +179,12 @@ func (r *SriovNetworkNodePolicyReconciler) SetupWithManager(mgr ctrl.Manager) er
Info("Enqueuing sync for create event", "resource", e.Object.GetName())
qHandler(q)
},
UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
reflect.DeepEqual(e.ObjectOld.GetLabels(), e.ObjectNew.GetLabels())
log.Log.WithName("SriovNetworkNodePolicy").
Info("Enqueuing sync for create event", "resource", e.ObjectNew.GetName())
qHandler(q)
},
DeleteFunc: func(ctx context.Context, e event.DeleteEvent, q workqueue.RateLimitingInterface) {
log.Log.WithName("SriovNetworkNodePolicy").
Info("Enqueuing sync for delete event", "resource", e.Object.GetName())
Expand Down Expand Up @@ -219,6 +222,30 @@ func (r *SriovNetworkNodePolicyReconciler) syncDevicePluginConfigMap(ctx context
return err
}
configData[node.Name] = string(config)

if data.ResourceList == nil || len(data.ResourceList) == 0 {
// if we don't have policies we should add the disabled label for the device plugin
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, r.Client)
if err != nil {
logger.Error(err, "failed to label node for device plugin label",
"labelKey",
constants.SriovDevicePluginLabel,
"labelValue",
constants.SriovDevicePluginLabelDisabled)
return err
}
} else {
// if we have policies we should add the enabled label for the device plugin
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelEnabled, r.Client)
if err != nil {
logger.Error(err, "failed to label node for device plugin label",
"labelKey",
constants.SriovDevicePluginLabel,
"labelValue",
constants.SriovDevicePluginLabelEnabled)
return err
}
}
}

cm := &corev1.ConfigMap{
Expand Down Expand Up @@ -296,8 +323,15 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
}
}
if !found {
// remove device plugin labels
logger.Info("removing device plugin label from node as SriovNetworkNodeState doesn't exist", "nodeStateName", ns.Name)
err = utils.RemoveLabelFromNode(ctx, ns.Name, constants.SriovDevicePluginLabel, r.Client)
if err != nil {
logger.Error(err, "Fail to remove device plugin label from node", "node", ns.Name)
return err
}
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
err := r.Delete(ctx, &ns, &client.DeleteOptions{})
err = r.Delete(ctx, &ns, &client.DeleteOptions{})
if err != nil {
logger.Error(err, "Fail to Delete", "SriovNetworkNodeState CR:", ns.GetName())
return err
Expand Down Expand Up @@ -415,13 +449,13 @@ func (r *SriovNetworkNodePolicyReconciler) renderDevicePluginConfigData(ctx cont
found, i := resourceNameInList(p.Spec.ResourceName, &rcl)

if found {
err := updateDevicePluginResource(ctx, &rcl.ResourceList[i], &p, nodeState)
err := updateDevicePluginResource(&rcl.ResourceList[i], &p, nodeState)
if err != nil {
return rcl, err
}
logger.V(1).Info("Update resource", "Resource", rcl.ResourceList[i])
} else {
rc, err := createDevicePluginResource(ctx, &p, nodeState)
rc, err := createDevicePluginResource(&p, nodeState)
if err != nil {
return rcl, err
}
Expand All @@ -442,7 +476,6 @@ func resourceNameInList(name string, rcl *dptypes.ResourceConfList) (bool, int)
}

func createDevicePluginResource(
ctx context.Context,
p *sriovnetworkv1.SriovNetworkNodePolicy,
nodeState *sriovnetworkv1.SriovNetworkNodeState) (*dptypes.ResourceConfig, error) {
netDeviceSelectors := dptypes.NetDeviceSelectors{}
Expand Down Expand Up @@ -516,7 +549,6 @@ func createDevicePluginResource(
}

func updateDevicePluginResource(
ctx context.Context,
rc *dptypes.ResourceConfig,
p *sriovnetworkv1.SriovNetworkNodePolicy,
nodeState *sriovnetworkv1.SriovNetworkNodeState) error {
Expand Down
137 changes: 136 additions & 1 deletion controllers/sriovnetworknodepolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@ package controllers
import (
"context"
"encoding/json"
"sync"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"

k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types"
Expand Down Expand Up @@ -126,3 +132,132 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
})
}
}

var _ = Describe("SriovnetworkNodePolicy controller", Ordered, func() {
var cancel context.CancelFunc
var ctx context.Context

BeforeAll(func() {
By("Create SriovOperatorConfig controller k8s objs")
config := makeDefaultSriovOpConfig()
Expect(k8sClient.Create(context.Background(), config)).Should(Succeed())
DeferCleanup(func() {
err := k8sClient.Delete(context.Background(), config)
Expect(err).ToNot(HaveOccurred())
})

// setup controller manager
By("Setup controller manager")
k8sManager, err := setupK8sManagerForTest()
Expect(err).ToNot(HaveOccurred())

err = (&SriovNetworkNodePolicyReconciler{
Client: k8sManager.GetClient(),
Scheme: k8sManager.GetScheme(),
FeatureGate: featuregate.New(),
}).SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

ctx, cancel = context.WithCancel(context.Background())

wg := sync.WaitGroup{}
wg.Add(1)
go func() {
defer wg.Done()
defer GinkgoRecover()
By("Start controller manager")
err := k8sManager.Start(ctx)
Expect(err).ToNot(HaveOccurred())
}()

DeferCleanup(func() {
By("Shut down manager")
cancel()
wg.Wait()
})
})
AfterEach(func() {
err := k8sClient.DeleteAllOf(context.Background(), &corev1.Node{})
Expect(err).ToNot(HaveOccurred())

err = k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodePolicy{}, k8sclient.InNamespace(vars.Namespace))
Expect(err).ToNot(HaveOccurred())

err = k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodeState{}, k8sclient.InNamespace(vars.Namespace))
Expect(err).ToNot(HaveOccurred())
})
Context("device plugin labels", func() {
It("Should add the right labels to the nodes", func() {
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
Name: "node0",
Labels: map[string]string{"kubernetes.io/os": "linux",
"node-role.kubernetes.io/worker": ""},
}}
Expect(k8sClient.Create(ctx, node)).To(Succeed())

nodeState := &sriovnetworkv1.SriovNetworkNodeState{}
Eventually(func(g Gomega) {
err := k8sClient.Get(context.TODO(), k8sclient.ObjectKey{Name: "node0", Namespace: testNamespace}, nodeState)
g.Expect(err).ToNot(HaveOccurred())
}, time.Minute, time.Second).Should(Succeed())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
g.Expect(err).ToNot(HaveOccurred())
value, exist := node.Labels[consts.SriovDevicePluginLabel]
g.Expect(exist).To(BeTrue())
g.Expect(value).To(Equal(consts.SriovDevicePluginLabelDisabled))
}, time.Minute, time.Second).Should(Succeed())

nodeState.Status.Interfaces = sriovnetworkv1.InterfaceExts{
sriovnetworkv1.InterfaceExt{
Vendor: "8086",
Driver: "i40e",
Mtu: 1500,
Name: "ens803f0",
PciAddress: "0000:86:00.0",
NumVfs: 0,
TotalVfs: 64,
},
}
err := k8sClient.Status().Update(context.Background(), nodeState)
Expect(err).ToNot(HaveOccurred())

somePolicy := &sriovnetworkv1.SriovNetworkNodePolicy{}
somePolicy.SetNamespace(testNamespace)
somePolicy.SetName("some-policy")
somePolicy.Spec = sriovnetworkv1.SriovNetworkNodePolicySpec{
NumVfs: 5,
NodeSelector: map[string]string{"node-role.kubernetes.io/worker": ""},
NicSelector: sriovnetworkv1.SriovNetworkNicSelector{Vendor: "8086"},
Priority: 20,
}
Expect(k8sClient.Create(context.Background(), somePolicy)).ToNot(HaveOccurred())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
g.Expect(err).ToNot(HaveOccurred())
value, exist := node.Labels[consts.SriovDevicePluginLabel]
g.Expect(exist).To(BeTrue())
g.Expect(value).To(Equal(consts.SriovDevicePluginLabelEnabled))
}, time.Minute, time.Second).Should(Succeed())

delete(node.Labels, "node-role.kubernetes.io/worker")
err = k8sClient.Update(context.Background(), node)
Expect(err).ToNot(HaveOccurred())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
g.Expect(err).ToNot(HaveOccurred())
_, exist := node.Labels[consts.SriovDevicePluginLabel]
g.Expect(exist).To(BeFalse())
}, time.Minute, time.Second).Should(Succeed())

Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name, Namespace: testNamespace}, nodeState)
Expect(err).To(HaveOccurred())
Expect(errors.IsNotFound(err)).To(BeTrue())
}, time.Minute, time.Second).Should(Succeed())
})
})
})
8 changes: 4 additions & 4 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ import (
machinev1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
apply "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
consts "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
render "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
)

Expand Down Expand Up @@ -140,7 +140,7 @@ func (r *SriovOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.
return reconcile.Result{}, err
}

if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultConfig, policyList); err != nil {
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultConfig); err != nil {
return reconcile.Result{}, err
}

Expand Down
52 changes: 2 additions & 50 deletions controllers/sriovoperatorconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package controllers

import (
"context"
"fmt"
"os"
"strings"
"sync"
Expand Down Expand Up @@ -30,7 +29,7 @@ import (
mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/openshift"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
util "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util"
)

var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
Expand Down Expand Up @@ -427,7 +426,7 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
metricsDaemonset := appsv1.DaemonSet{}
err := util.WaitForNamespacedObject(&metricsDaemonset, k8sClient, testNamespace, "sriov-network-metrics-exporter", util.RetryInterval, util.APITimeout)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(metricsDaemonset.Spec.Template.Spec.NodeSelector).To((Equal(nodeSelector)))
g.Expect(metricsDaemonset.Spec.Template.Spec.NodeSelector).To(Equal(nodeSelector))
}).Should(Succeed())
})

Expand Down Expand Up @@ -521,53 +520,6 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
g.Expect(injectorCfg.Webhooks[0].ClientConfig.CABundle).To(Equal([]byte("ca-bundle-2\n")))
}, "1s").Should(Succeed())
})

It("should reconcile to a converging state when multiple node policies are set", func() {
By("Creating a consistent number of node policies")
for i := 0; i < 30; i++ {
p := &sriovnetworkv1.SriovNetworkNodePolicy{
ObjectMeta: metav1.ObjectMeta{Namespace: testNamespace, Name: fmt.Sprintf("p%d", i)},
Spec: sriovnetworkv1.SriovNetworkNodePolicySpec{
Priority: 99,
NodeSelector: map[string]string{"foo": fmt.Sprintf("v%d", i)},
},
}
err := k8sClient.Create(context.Background(), p)
Expect(err).NotTo(HaveOccurred())
}

By("Triggering a the reconcile loop")
config := &sriovnetworkv1.SriovOperatorConfig{}
err := k8sClient.Get(context.Background(), types.NamespacedName{Name: "default", Namespace: testNamespace}, config)
Expect(err).NotTo(HaveOccurred())
if config.ObjectMeta.Labels == nil {
config.ObjectMeta.Labels = make(map[string]string)
}
config.ObjectMeta.Labels["trigger-test"] = "test-reconcile-daemonset"
err = k8sClient.Update(context.Background(), config)
Expect(err).NotTo(HaveOccurred())

By("Wait until device-plugin Daemonset's affinity has been calculated")
var expectedAffinity *corev1.Affinity

Eventually(func(g Gomega) {
daemonSet := &appsv1.DaemonSet{}
err = k8sClient.Get(context.Background(), types.NamespacedName{Name: "sriov-device-plugin", Namespace: testNamespace}, daemonSet)
g.Expect(err).NotTo(HaveOccurred())
// Wait until the last policy (with NodeSelector foo=v29) has been considered at least one time
g.Expect(daemonSet.Spec.Template.Spec.Affinity.String()).To(ContainSubstring("v29"))
expectedAffinity = daemonSet.Spec.Template.Spec.Affinity
}, "3s", "1s").Should(Succeed())

By("Verify device-plugin Daemonset's affinity doesn't change over time")
Consistently(func(g Gomega) {
daemonSet := &appsv1.DaemonSet{}
err = k8sClient.Get(context.Background(), types.NamespacedName{Name: "sriov-device-plugin", Namespace: testNamespace}, daemonSet)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(daemonSet.Spec.Template.Spec.Affinity).
To(Equal(expectedAffinity))
}, "3s", "1s").Should(Succeed())
})
})
})

Expand Down
Loading

0 comments on commit 39ef9f0

Please sign in to comment.