Instance Group Manager does not remove nodes with empty string in the zone field #2658

Open · wants to merge 1 commit into base: master
17 changes: 16 additions & 1 deletion pkg/instancegroups/manager.go
@@ -310,7 +310,17 @@ func (m *manager) Sync(nodes []string, logger klog.Logger) (err error) {
// https://github.com/kubernetes/cloud-provider-gcp/blob/fca628cb3bf9267def0abb509eaae87d2d4040f3/providers/gce/gce_loadbalancer_internal.go#L606C1-L675C1
// the m.maxIGSize should be set to 1000 as is in the cloud-provider-gcp.
zonedNodes := m.splitNodesByZone(nodes, iglogger)
iglogger.Info(fmt.Sprintf("Syncing nodes: %d nodes over %d zones", len(nodes), len(zonedNodes)))

emptyZoneNodesNames := sets.NewString(zonedNodes[zonegetter.EmptyZone]...)
if len(emptyZoneNodesNames) > 0 {
iglogger.Info(fmt.Sprintf("%d nodes have empty zone: %v. They will not be removed from instance group as long as zone is missing", len(emptyZoneNodesNames), emptyZoneNodesNames))
}

for zone, kubeNodesFromZone := range zonedNodes {
if zone == zonegetter.EmptyZone {
continue // skip ensuring instance group for empty zone
}
igName := m.namer.InstanceGroup()
if len(kubeNodesFromZone) > m.maxIGSize {
sortedKubeNodesFromZone := sets.NewString(kubeNodesFromZone...).List()
@@ -335,7 +345,12 @@ func (m *manager) Sync(nodes []string, logger klog.Logger) (err error) {
gceNodes.Insert(instance)
}

removeNodes := gceNodes.Difference(kubeNodes).List()
removalCandidates := gceNodes.Difference(kubeNodes)
iglogger.V(2).Info("Nodes that are removal candidates", "removalCandidates", events.TruncatedStringList(removalCandidates.List()))

removeNodes := removalCandidates.Difference(emptyZoneNodesNames).List() // Do not remove nodes whose zone label still needs to be assigned
iglogger.V(2).Info("Removing nodes (after ignoring nodes without zone assigned)", "removeNodes", events.TruncatedStringList(removeNodes))

addNodes := kubeNodes.Difference(gceNodes).List()

iglogger.V(2).Info("Removing nodes", "removeNodes", events.TruncatedStringList(removeNodes))
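A minimal standalone sketch (not part of this diff) of the removal logic the hunk above implements, using k8s.io/apimachinery's string sets: nodes whose zone is still unset are subtracted from the removal candidates, so they stay in the instance group until their zone label is assigned.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	// Nodes currently registered in the GCE instance group.
	gceNodes := sets.NewString("n1", "n2", "n3")
	// Nodes Kubernetes reports for this zone (they already carry a zone label).
	kubeNodes := sets.NewString("n3")
	// Nodes whose zone is still empty; they were skipped when splitting by zone.
	emptyZoneNodes := sets.NewString("n1")

	// Candidates are instance-group members no longer reported for this zone...
	removalCandidates := gceNodes.Difference(kubeNodes)
	// ...but nodes without a zone are kept until their zone is known.
	removeNodes := removalCandidates.Difference(emptyZoneNodes).List()

	fmt.Println(removeNodes) // [n2]
}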
133 changes: 133 additions & 0 deletions pkg/instancegroups/manager_test.go
@@ -59,6 +59,139 @@ func newNodePool(f Provider, maxIGSize int) Manager {
return pool
}

func getNodeNames(nodes map[string]string) []string {
names := make([]string, 0)
for name := range nodes {
names = append(names, name)
}
return names
}

func TestNodePoolSyncWithEmptyZone(t *testing.T) {

testCases := []struct {
desc string
gceNodes []string // all in defaultTestZone
kubeNodesEachStep []map[string]string // map of node:zone during each "ensure instance group"
wantedNodes []string // Nodes that should be there at the end of the updates
}{
{
desc: "Both nodes without zone during second update do not get deleted",
gceNodes: []string{"n1", "n2"},
kubeNodesEachStep: []map[string]string{
{
"n1": defaultTestZone,
"n2": defaultTestZone,
},
{
"n1": "",
"n2": "",
},
},
wantedNodes: []string{"n1", "n2"},
},
{
desc: "Create node when zone ready",
gceNodes: []string{"n1"},
kubeNodesEachStep: []map[string]string{
{
"n1": "",
"n2": "",
},
{
"n1": "",
"n2": defaultTestZone,
},
},
wantedNodes: []string{"n1", "n2"},
},
{
desc: "Do not delete nodes if zone is empty but delete if node not there",
gceNodes: []string{"n1", "n2", "n3"},
kubeNodesEachStep: []map[string]string{
{
"n1": defaultTestZone,
"n2": defaultTestZone,
"n3": defaultTestZone,
},
{
"n2": "",
"n3": defaultTestZone,
},
},
wantedNodes: []string{"n2", "n3"},
},
{
desc: "Do not create one Node without zone assigned",
gceNodes: []string{"n1"},
kubeNodesEachStep: []map[string]string{
{
"n1": defaultTestZone,
"n2": "",
},
},
wantedNodes: []string{"n1"},
},
{
desc: "Delete one Node",
gceNodes: []string{"n1", "n2"},
kubeNodesEachStep: []map[string]string{
{
"n1": defaultTestZone,
},
},
wantedNodes: []string{"n1"},
},
}

for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
// create fake gce node pool with existing gceNodes
ig := &compute.InstanceGroup{Name: defaultNamer.InstanceGroup()}
zonesToIGs := map[string]IGsToInstances{
defaultTestZone: {
ig: sets.NewString(tc.gceNodes...),
},
}
fakeGCEInstanceGroups := NewFakeInstanceGroups(zonesToIGs, 10)

// run required syncs
for _, nodeMap := range tc.kubeNodesEachStep {
pool := newNodePool(fakeGCEInstanceGroups, 10)
for name, zone := range nodeMap {
manager := pool.(*manager)
zonegetter.AddFakeNodes(manager.ZoneGetter, zone, name)
}

// run sync step
nodeNames := getNodeNames(nodeMap)
err := pool.Sync(nodeNames, klog.TODO())
if err != nil {
t.Fatalf("pool.Sync(%v) returned error %v, want nil", nodeNames, err)
}
}

instancesList, err := fakeGCEInstanceGroups.ListInstancesInInstanceGroup(ig.Name, defaultTestZone, allInstances)
if err != nil {
t.Fatalf("fakeGCEInstanceGroups.ListInstancesInInstanceGroup(%s, %s, %s) returned error %v, want nil", ig.Name, defaultTestZone, allInstances, err)
}
instances, err := test.InstancesListToNameSet(instancesList)
if err != nil {
t.Fatalf("test.InstancesListToNameSet(%v) returned error %v, want nil", ig, err)
}
expectedInstancesSize := len(tc.wantedNodes)
if instances.Len() != expectedInstancesSize {
t.Errorf("instances.Len() = %d not equal expectedInstancesSize = %d", instances.Len(), expectedInstancesSize)
}

kubeNodeSet := sets.NewString(tc.wantedNodes...)
if !kubeNodeSet.Equal(instances) {
t.Errorf("Expected kubeNodeSet = %v is not equal to instance set = %v", kubeNodeSet, instances)
}
})
}
}

func TestNodePoolSync(t *testing.T) {
maxIGSize := 1000

4 changes: 2 additions & 2 deletions pkg/neg/syncers/endpoints_calculator.go
@@ -103,7 +103,7 @@ func (l *LocalL4ILBEndpointsCalculator) CalculateEndpoints(eds []types.Endpoints
continue
}
zone, err := l.zoneGetter.ZoneForNode(node.Name, l.logger)
if err != nil {
if err != nil || zone == zonegetter.EmptyZone {
l.logger.Error(err, "Unable to find zone for node, skipping", "nodeName", node.Name)
metrics.PublishNegControllerErrorCountMetrics(err, true)
continue
@@ -178,7 +178,7 @@ func (l *ClusterL4ILBEndpointsCalculator) CalculateEndpoints(_ []types.Endpoints
continue
}
zone, err := l.zoneGetter.ZoneForNode(node.Name, l.logger)
if err != nil {
if err != nil || zone == zonegetter.EmptyZone {
l.logger.Error(err, "Unable to find zone for node skipping", "nodeName", node.Name)
metrics.PublishNegControllerErrorCountMetrics(err, true)
continue
14 changes: 8 additions & 6 deletions pkg/neg/syncers/utils.go
@@ -466,12 +466,14 @@ func toZoneNetworkEndpointMapDegradedMode(eds []negtypes.EndpointsData, zoneGett
continue
}
zone, getZoneErr := zoneGetter.ZoneForNode(nodeName, logger)
if getZoneErr != nil {
metrics.PublishNegControllerErrorCountMetrics(getZoneErr, true)
if enableMultiSubnetCluster && errors.Is(getZoneErr, zonegetter.ErrNodeNotInDefaultSubnet) {
epLogger.Error(getZoneErr, "Detected endpoint not from default subnet. Skipping", "nodeName", nodeName)
localEPCount[negtypes.NodeInNonDefaultSubnet]++
continue
if getZoneErr != nil || zone == zonegetter.EmptyZone {
if getZoneErr != nil {
metrics.PublishNegControllerErrorCountMetrics(getZoneErr, true)
if enableMultiSubnetCluster && errors.Is(getZoneErr, zonegetter.ErrNodeNotInDefaultSubnet) {
epLogger.Error(getZoneErr, "Detected endpoint not from default subnet. Skipping", "nodeName", nodeName)
localEPCount[negtypes.NodeInNonDefaultSubnet]++
continue
}
}
epLogger.Error(getZoneErr, "Endpoint's corresponding node does not have valid zone information, skipping", "nodeName", nodeName)
localEPCount[negtypes.NodeNotFound]++
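The NEG endpoint calculators and the degraded-mode path above all apply the same guard: an empty zone is treated like a zone-lookup failure and the node or endpoint is skipped for now. A minimal standalone sketch of that pattern (illustrative only; usableZone and lookupZone are hypothetical stand-ins for zoneGetter.ZoneForNode):

package main

import (
	"errors"
	"fmt"
)

const emptyZone = "" // mirrors zonegetter.EmptyZone

// usableZone treats an empty zone the same as a lookup error, so callers
// skip the node instead of acting on incomplete zone information.
func usableZone(nodeName string, lookupZone func(string) (string, error)) (string, bool) {
	zone, err := lookupZone(nodeName)
	if err != nil || zone == emptyZone {
		return "", false
	}
	return zone, true
}

func main() {
	lookup := func(name string) (string, error) {
		switch name {
		case "n1":
			return "us-central1-a", nil
		case "n2":
			return "", nil // zone label not populated yet
		default:
			return "", errors.New("node not found")
		}
	}
	for _, n := range []string{"n1", "n2", "n3"} {
		zone, ok := usableZone(n, lookup)
		fmt.Printf("%s zone=%q usable=%v\n", n, zone, ok)
	}
	// Output:
	// n1 zone="us-central1-a" usable=true
	// n2 zone="" usable=false
	// n3 zone="" usable=false
}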
8 changes: 3 additions & 5 deletions pkg/utils/zonegetter/zone_getter.go
@@ -48,6 +48,7 @@ const (
AllNodesFilter = Filter("AllNodesFilter")
CandidateNodesFilter = Filter("CandidateNodesFilter")
CandidateAndUnreadyNodesFilter = Filter("CandidateAndUnreadyNodesFilter")
EmptyZone = ""
)

var ErrProviderIDNotFound = errors.New("providerID not found")
@@ -58,7 +59,7 @@ var ErrNodePodCIDRNotSet = errors.New("Node does not have PodCIDR set")

// providerIDRE is the regex to process providerID.
// A providerID is built out of '${ProviderName}://${project-id}/${zone}/${instance-name}'
var providerIDRE = regexp.MustCompile(`^` + "gce" + `://([^/]+)/([^/]+)/([^/]+)$`)
var providerIDRE = regexp.MustCompile(`^` + "gce" + `://([^/]+)/([^/]*)/([^/]+)$`)

// ZoneGetter manages lookups for GCE instances to zones.
type ZoneGetter struct {
@@ -161,7 +162,7 @@ func (z *ZoneGetter) ListZones(filter Filter, logger klog.Logger) ([]string, err
zones := sets.String{}
for _, n := range nodes {
zone, err := getZone(n)
if err != nil {
if err != nil || zone == EmptyZone {
filterLogger.Error(err, "Failed to get zone from providerID", "nodeName", n.Name)
continue
}
@@ -325,9 +326,6 @@ func getZone(node *api_v1.Node) (string, error) {
if len(matches) != 4 {
return "", fmt.Errorf("%w: providerID %q of node %s is not valid", ErrSplitProviderID, node.Spec.ProviderID, node.Name)
}
if matches[2] == "" {
return "", fmt.Errorf("%w: node %s has an empty zone", ErrSplitProviderID, node.Name)
}
return matches[2], nil
}

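A quick standalone check (not part of this diff) of what the providerID regexp change does: the zone capture group loosens from [^/]+ to [^/]*, so a providerID with an empty zone segment now parses and getZone returns the empty zone instead of an error.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Old pattern: the zone segment required at least one character.
	oldRE := regexp.MustCompile(`^gce://([^/]+)/([^/]+)/([^/]+)$`)
	// New pattern: the zone segment may be empty.
	newRE := regexp.MustCompile(`^gce://([^/]+)/([^/]*)/([^/]+)$`)

	providerID := "gce://foo-project//bar-node" // zone not assigned yet

	fmt.Println(oldRE.FindStringSubmatch(providerID)) // [] (no match)
	fmt.Println(newRE.FindStringSubmatch(providerID)) // [gce://foo-project//bar-node foo-project  bar-node]
}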
12 changes: 6 additions & 6 deletions pkg/utils/zonegetter/zone_getter_test.go
@@ -67,7 +67,7 @@ func TestListZones(t *testing.T) {
t.Errorf("For test case %q with onlyIncludeDefaultSubnetNodes = %v, got %d zones, want %d zones", tc.desc, enableMultiSubnetCluster, len(zones), tc.expectLen)
}
for _, zone := range zones {
if zone == "" {
if zone == EmptyZone {
t.Errorf("For test case %q with onlyIncludeDefaultSubnetNodes = %v, got an empty zone,", tc.desc, enableMultiSubnetCluster)
}
}
@@ -112,7 +112,7 @@ func TestListZonesMultipleSubnets(t *testing.T) {
t.Errorf("For test case %q with multi subnet cluster enabled, got %d zones, want %d zones", tc.desc, len(zones), tc.expectLen)
}
for _, zone := range zones {
if zone == "" {
if zone == EmptyZone {
t.Errorf("For test case %q with multi subnet cluster enabled, got an empty zone,", tc.desc)
}
}
@@ -239,8 +239,8 @@ func TestZoneForNode(t *testing.T) {
{
desc: "Node with empty zone in providerID",
nodeName: "instance-empty-zone-providerID",
expectZone: "",
expectErr: ErrSplitProviderID,
expectZone: EmptyZone,
expectErr: nil,
},
}
for _, tc := range testCases {
@@ -355,8 +355,8 @@ func TestGetZone(t *testing.T) {
ProviderID: "gce://foo-project//bar-node",
},
},
expectZone: "",
expectErr: ErrSplitProviderID,
expectZone: EmptyZone,
expectErr: nil,
},
} {
zone, err := getZone(&tc.node)