Skip to content

Commit

Permalink
Merge pull request #75 from DataDog/dom.bozzuto/scheduler-fix
Browse files Browse the repository at this point in the history
Revert scheduler GetEligibleNodes optimization from issue kubernetes#125336
  • Loading branch information
domenicbozzuto authored Sep 6, 2024
2 parents 85cf9e0 + 38c6f3c commit 40c5b2b
Show file tree
Hide file tree
Showing 10 changed files with 8 additions and 659 deletions.
64 changes: 1 addition & 63 deletions pkg/scheduler/framework/plugins/volumebinding/binder.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ import (
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
"k8s.io/kubernetes/pkg/volume/util"
)

// ConflictReason is used for the special strings which explain why
Expand Down Expand Up @@ -127,8 +126,6 @@ type InTreeToCSITranslator interface {
// 1. The scheduler takes a Pod off the scheduler queue and processes it serially:
// a. Invokes all pre-filter plugins for the pod. GetPodVolumeClaims() is invoked
// here, pod volume information will be saved in current scheduling cycle state for later use.
// If pod has bound immediate PVCs, GetEligibleNodes() is invoked to potentially reduce
// down the list of eligible nodes based on the bound PV's NodeAffinity (if any).
// b. Invokes all filter plugins, parallelized across nodes. FindPodVolumes() is invoked here.
// c. Invokes all score plugins. Future/TBD
// d. Selects the best node for the Pod.
Expand All @@ -151,14 +148,6 @@ type SchedulerVolumeBinder interface {
// unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding.
GetPodVolumeClaims(pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error)

// GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be
// potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used
// in subsequent scheduling stages.
//
// If eligibleNodes is 'nil', then it indicates that such eligible node reduction cannot be made
// and all nodes should be considered.
GetEligibleNodes(boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string])

// FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the
// node and returns pod's volumes information.
//
Expand Down Expand Up @@ -230,8 +219,6 @@ type volumeBinder struct {
csiStorageCapacityLister storagelisters.CSIStorageCapacityLister
}

var _ SchedulerVolumeBinder = &volumeBinder{}

// CapacityCheck contains additional parameters for NewVolumeBinder that
// are only needed when checking volume sizes against available storage
// capacity is desired.
Expand Down Expand Up @@ -272,7 +259,7 @@ func NewVolumeBinder(
}

// FindPodVolumes finds the matching PVs for PVCs and nodes to provision PVs
// for the given pod and node. If the node does not fit, conflict reasons are
// for the given pod and node. If the node does not fit, confilict reasons are
// returned.
func (b *volumeBinder) FindPodVolumes(pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
podVolumes = &PodVolumes{}
Expand Down Expand Up @@ -380,55 +367,6 @@ func (b *volumeBinder) FindPodVolumes(pod *v1.Pod, podVolumeClaims *PodVolumeCla
return
}

// GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be
// potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used
// in subsequent scheduling stages.
//
// Returning 'nil' for eligibleNodes indicates that such eligible node reduction cannot be made and all nodes
// should be considered.
func (b *volumeBinder) GetEligibleNodes(boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string]) {
if len(boundClaims) == 0 {
return
}

var errs []error
for _, pvc := range boundClaims {
pvName := pvc.Spec.VolumeName
pv, err := b.pvCache.GetPV(pvName)
if err != nil {
errs = append(errs, err)
continue
}

// if the PersistentVolume is local and has node affinity matching specific node(s),
// add them to the eligible nodes
nodeNames := util.GetLocalPersistentVolumeNodeNames(pv)
if len(nodeNames) != 0 {
// on the first found list of eligible nodes for the local PersistentVolume,
// insert to the eligible node set.
if eligibleNodes == nil {
eligibleNodes = sets.New(nodeNames...)
} else {
// for subsequent finding of eligible nodes for the local PersistentVolume,
// take the intersection of the nodes with the existing eligible nodes
// for cases if PV1 has node affinity to node1 and PV2 has node affinity to node2,
// then the eligible node list should be empty.
eligibleNodes = eligibleNodes.Intersection(sets.New(nodeNames...))
}
}
}

if len(errs) > 0 {
klog.V(4).InfoS("GetEligibleNodes: one or more error occurred finding eligible nodes", "error", errs)
return nil
}

if eligibleNodes != nil {
klog.V(4).InfoS("GetEligibleNodes: reduced down eligible nodes", "nodes", eligibleNodes)
}
return
}

// AssumePodVolumes will take the matching PVs and PVCs to provision in pod's
// volume information for the chosen node, and:
// 1. Update the pvCache with the new prebound PV.
Expand Down
140 changes: 0 additions & 140 deletions pkg/scheduler/framework/plugins/volumebinding/binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"fmt"
"os"
"reflect"
"sort"
"testing"
"time"
Expand All @@ -32,7 +31,6 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/informers"
Expand Down Expand Up @@ -61,9 +59,6 @@ var (
boundPVCNode1a = makeTestPVC("unbound-pvc", "1G", "", pvcBound, "pv-node1a", "1", &waitClass)
immediateUnboundPVC = makeTestPVC("immediate-unbound-pvc", "1G", "", pvcUnbound, "", "1", &immediateClass)
immediateBoundPVC = makeTestPVC("immediate-bound-pvc", "1G", "", pvcBound, "pv-bound-immediate", "1", &immediateClass)
localPreboundPVC1a = makeTestPVC("local-prebound-pvc-1a", "1G", "", pvcPrebound, "local-pv-node1a", "1", &waitClass)
localPreboundPVC1b = makeTestPVC("local-prebound-pvc-1b", "1G", "", pvcPrebound, "local-pv-node1b", "1", &waitClass)
localPreboundPVC2a = makeTestPVC("local-prebound-pvc-2a", "1G", "", pvcPrebound, "local-pv-node2a", "1", &waitClass)

// PVCs for dynamic provisioning
provisionedPVC = makeTestPVC("provisioned-pvc", "1Gi", "", pvcUnbound, "", "1", &waitClassWithProvisioner)
Expand Down Expand Up @@ -95,9 +90,6 @@ var (
pvNode1bBoundHigherVersion = makeTestPV("pv-node1b", "node1", "10G", "2", unboundPVC2, waitClass)
pvBoundImmediate = makeTestPV("pv-bound-immediate", "node1", "1G", "1", immediateBoundPVC, immediateClass)
pvBoundImmediateNode2 = makeTestPV("pv-bound-immediate", "node2", "1G", "1", immediateBoundPVC, immediateClass)
localPVNode1a = makeLocalPV("local-pv-node1a", "node1", "5G", "1", nil, waitClass)
localPVNode1b = makeLocalPV("local-pv-node1b", "node1", "10G", "1", nil, waitClass)
localPVNode2a = makeLocalPV("local-pv-node2a", "node2", "5G", "1", nil, waitClass)

// PVs for CSI migration
migrationPVBound = makeTestPVForCSIMigration(zone1Labels, boundMigrationPVC, true)
Expand Down Expand Up @@ -728,12 +720,6 @@ func makeTestPVForCSIMigration(labels map[string]string, pvc *v1.PersistentVolum
return pv
}

func makeLocalPV(name, node, capacity, version string, boundToPVC *v1.PersistentVolumeClaim, className string) *v1.PersistentVolume {
pv := makeTestPV(name, node, capacity, version, boundToPVC, className)
pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions[0].Key = v1.LabelHostname
return pv
}

func pvcSetSelectedNode(pvc *v1.PersistentVolumeClaim, node string) *v1.PersistentVolumeClaim {
newPVC := pvc.DeepCopy()
metav1.SetMetaDataAnnotation(&newPVC.ObjectMeta, volume.AnnSelectedNode, node)
Expand Down Expand Up @@ -2334,129 +2320,3 @@ func TestCapacity(t *testing.T) {
})
}
}

func TestGetEligibleNodes(t *testing.T) {
type scenarioType struct {
// Inputs
pvcs []*v1.PersistentVolumeClaim
pvs []*v1.PersistentVolume
nodes []*v1.Node

// Expected return values
eligibleNodes sets.Set[string]
}

scenarios := map[string]scenarioType{
"no-bound-claims": {},
"no-nodes-found": {
pvcs: []*v1.PersistentVolumeClaim{
preboundPVC,
preboundPVCNode1a,
},
},
"pv-not-found": {
pvcs: []*v1.PersistentVolumeClaim{
preboundPVC,
preboundPVCNode1a,
},
nodes: []*v1.Node{
node1,
},
},
"node-affinity-mismatch": {
pvcs: []*v1.PersistentVolumeClaim{
preboundPVC,
preboundPVCNode1a,
},
pvs: []*v1.PersistentVolume{
pvNode1a,
},
nodes: []*v1.Node{
node1,
node2,
},
},
"local-pv-with-node-affinity": {
pvcs: []*v1.PersistentVolumeClaim{
localPreboundPVC1a,
localPreboundPVC1b,
},
pvs: []*v1.PersistentVolume{
localPVNode1a,
localPVNode1b,
},
nodes: []*v1.Node{
node1,
node2,
},
eligibleNodes: sets.New("node1"),
},
"multi-local-pv-with-different-nodes": {
pvcs: []*v1.PersistentVolumeClaim{
localPreboundPVC1a,
localPreboundPVC1b,
localPreboundPVC2a,
},
pvs: []*v1.PersistentVolume{
localPVNode1a,
localPVNode1b,
localPVNode2a,
},
nodes: []*v1.Node{
node1,
node2,
},
eligibleNodes: sets.New[string](),
},
"local-and-non-local-pv": {
pvcs: []*v1.PersistentVolumeClaim{
localPreboundPVC1a,
localPreboundPVC1b,
preboundPVC,
immediateBoundPVC,
},
pvs: []*v1.PersistentVolume{
localPVNode1a,
localPVNode1b,
pvNode1a,
pvBoundImmediate,
pvBoundImmediateNode2,
},
nodes: []*v1.Node{
node1,
node2,
},
eligibleNodes: sets.New("node1"),
},
}

run := func(t *testing.T, scenario scenarioType) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

// Setup
testEnv := newTestBinder(t, ctx.Done())
testEnv.initVolumes(scenario.pvs, scenario.pvs)

testEnv.initNodes(scenario.nodes)
testEnv.initClaims(scenario.pvcs, scenario.pvcs)

// Execute
eligibleNodes := testEnv.binder.GetEligibleNodes(scenario.pvcs)

// Validate
if reflect.DeepEqual(scenario.eligibleNodes, eligibleNodes) {
fmt.Println("foo")
}

if compDiff := cmp.Diff(scenario.eligibleNodes, eligibleNodes, cmp.Comparer(func(a, b sets.Set[string]) bool {
return reflect.DeepEqual(a, b)
})); compDiff != "" {
t.Errorf("Unexpected eligible nodes (-want +got):\n%s", compDiff)
}
}

for name, scenario := range scenarios {
t.Run(name, func(t *testing.T) { run(t, scenario) })
}
}
8 changes: 0 additions & 8 deletions pkg/scheduler/framework/plugins/volumebinding/fake_binder.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
)

// FakeVolumeBinderConfig holds configurations for fake volume binder.
Expand All @@ -47,18 +46,11 @@ type FakeVolumeBinder struct {
BindCalled bool
}

var _ SchedulerVolumeBinder = &FakeVolumeBinder{}

// GetPodVolumeClaims implements SchedulerVolumeBinder.GetPodVolumes.
func (b *FakeVolumeBinder) GetPodVolumeClaims(pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) {
return &PodVolumeClaims{}, nil
}

// GetEligibleNodes implements SchedulerVolumeBinder.GetEligibleNodes.
func (b *FakeVolumeBinder) GetEligibleNodes(boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string]) {
return nil
}

// FindPodVolumes implements SchedulerVolumeBinder.FindPodVolumes.
func (b *FakeVolumeBinder) FindPodVolumes(pod *v1.Pod, _ *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
return nil, b.config.FindReasons, b.config.FindErr
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,6 @@ func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleSt
status.AppendReason("pod has unbound immediate PersistentVolumeClaims")
return nil, status
}
// Attempt to reduce down the number of nodes to consider in subsequent scheduling stages if pod has bound claims.
var result *framework.PreFilterResult
if eligibleNodes := pl.Binder.GetEligibleNodes(podVolumeClaims.boundClaims); eligibleNodes != nil {
result = &framework.PreFilterResult{
NodeNames: eligibleNodes,
}
}

state.Write(stateKey, &stateData{
podVolumesByNode: make(map[string]*PodVolumes),
Expand All @@ -196,7 +189,7 @@ func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleSt
unboundVolumesDelayBinding: podVolumeClaims.unboundVolumesDelayBinding,
},
})
return result, nil
return nil, nil
}

// PreFilterExtensions returns prefilter extensions, pod add and remove.
Expand Down
Loading

0 comments on commit 40c5b2b

Please sign in to comment.