From 4cf3e47c47d83703c5250a9962fd30a007adddc3 Mon Sep 17 00:00:00 2001 From: rambohe-ch Date: Thu, 31 Aug 2023 17:06:34 +0800 Subject: [PATCH] 1. add nodelifecycle controller 2. don't mark pods not ready when node has pod binding annotation --- .../yurt-manager-auto-generated.yaml | 24 +- cmd/yurt-manager/app/manager.go | 35 +- cmd/yurt-manager/app/options/generic.go | 3 +- .../app/options/nodelifecyclecontroller.go | 87 + cmd/yurt-manager/app/options/options.go | 8 +- cmd/yurt-manager/names/controller_names.go | 2 + go.mod | 5 +- go.sum | 3 + pkg/util/taints/taints.go | 289 ++ pkg/util/taints/taints_test.go | 984 +++++ .../controller/apis/config/types.go | 4 + pkg/yurtmanager/controller/controller.go | 9 +- .../csrapprover/csrapprover_controller.go | 2 +- .../daemon_pod_updater_controller.go | 2 +- .../internal/controller/controller.go | 222 ++ .../controller/nodelifecycle/metrics.go | 107 + .../node_lifecycle_controller.go | 1361 +++++++ .../node_lifecycle_controller_test.go | 3467 +++++++++++++++++ .../scheduler/rate_limited_queue.go | 308 ++ .../scheduler/rate_limited_queue_test.go | 333 ++ .../nodelifecycle/scheduler/taint_manager.go | 497 +++ .../scheduler/taint_manager_test.go | 1047 +++++ .../nodelifecycle/scheduler/timed_workers.go | 154 + .../scheduler/timed_workers_test.go | 153 + .../nodepool/nodepool_controller.go | 2 +- .../platformadmin/platformadmin_controller.go | 2 +- .../controller/raven/dns/dns_controller.go | 2 +- .../gateway_internal_service_controller.go | 2 +- .../gateway_pickup_controller.go | 2 +- .../gateway_public_service_controller.go | 2 +- .../endpoints/endpoints_controller.go | 2 +- .../endpointslice/endpointslice_controller.go | 2 +- .../controller/testutil/test_utils.go | 869 +++++ .../controller/util/helper/helpers.go | 75 + .../controller/util/helper/helpers_test.go | 46 + .../controller/util/node/controller_utils.go | 423 ++ pkg/yurtmanager/controller/util/tools.go | 46 + .../yurtappdaemon/yurtappdaemon_controller.go | 2 +- 
.../yurtappoverrider_controller.go | 2 +- .../yurtappset/yurtappset_controller.go | 2 +- .../cert/yurtcoordinatorcert_controller.go | 2 +- .../delegatelease/delegatelease_controller.go | 2 +- .../podbinding/podbinding_controller.go | 53 +- .../podbinding/podbinding_controller_test.go | 5 +- .../yurtstaticset/yurtstaticset_controller.go | 2 +- .../webhook/node/v1/node_handler.go | 4 +- pkg/yurtmanager/webhook/pod/v1/pod_handler.go | 57 - .../webhook/pod/v1/pod_validation.go | 149 - pkg/yurtmanager/webhook/server.go | 2 - 49 files changed, 10564 insertions(+), 299 deletions(-) create mode 100644 cmd/yurt-manager/app/options/nodelifecyclecontroller.go create mode 100644 pkg/util/taints/taints.go create mode 100644 pkg/util/taints/taints_test.go create mode 100644 pkg/yurtmanager/controller/internal/controller/controller.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/metrics.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller_test.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue_test.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager_test.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers.go create mode 100644 pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers_test.go create mode 100644 pkg/yurtmanager/controller/testutil/test_utils.go create mode 100644 pkg/yurtmanager/controller/util/helper/helpers.go create mode 100644 pkg/yurtmanager/controller/util/helper/helpers_test.go delete mode 100644 pkg/yurtmanager/webhook/pod/v1/pod_handler.go delete mode 100644 pkg/yurtmanager/webhook/pod/v1/pod_validation.go diff --git 
a/charts/yurt-manager/templates/yurt-manager-auto-generated.yaml b/charts/yurt-manager/templates/yurt-manager-auto-generated.yaml index 1b452b66bcb..ff6b9bbab04 100644 --- a/charts/yurt-manager/templates/yurt-manager-auto-generated.yaml +++ b/charts/yurt-manager/templates/yurt-manager-auto-generated.yaml @@ -572,7 +572,7 @@ webhooks: name: yurt-manager-webhook-service namespace: {{ .Release.Namespace }} path: /mutate-core-openyurt-io-v1-node - failurePolicy: Fail + failurePolicy: Ignore name: mutate.core.v1.node.openyurt.io rules: - apiGroups: @@ -743,7 +743,7 @@ webhooks: name: yurt-manager-webhook-service namespace: {{ .Release.Namespace }} path: /validate-core-openyurt-io-v1-node - failurePolicy: Fail + failurePolicy: Ignore name: validate.core.v1.node.openyurt.io rules: - apiGroups: @@ -797,26 +797,6 @@ webhooks: resources: - platformadmins sideEffects: None -- admissionReviewVersions: - - v1 - - v1beta1 - clientConfig: - service: - name: yurt-manager-webhook-service - namespace: {{ .Release.Namespace }} - path: /validate-core-openyurt-io-v1-pod - failurePolicy: Fail - name: validate.core.v1.pod.openyurt.io - rules: - - apiGroups: - - "" - apiVersions: - - v1 - operations: - - DELETE - resources: - - pods - sideEffects: None - admissionReviewVersions: - v1 - v1beta1 diff --git a/cmd/yurt-manager/app/manager.go b/cmd/yurt-manager/app/manager.go index f6e3b2bea9c..54523b88b57 100644 --- a/cmd/yurt-manager/app/manager.go +++ b/cmd/yurt-manager/app/manager.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" cliflag "k8s.io/component-base/cli/flag" "k8s.io/component-base/cli/globalflag" "k8s.io/component-base/term" @@ -146,11 +147,17 @@ func PrintFlags(flags *pflag.FlagSet) { // Run runs the KubeControllerManagerOptions. This should never exit. 
func Run(c *config.CompletedConfig, stopCh <-chan struct{}) error { - ctrl.SetLogger(klogr.New()) - ctx := ctrl.SetupSignalHandler() cfg := ctrl.GetConfigOrDie() + if len(c.ComponentConfig.Generic.Kubeconfig) != 0 { + config, err := clientcmd.BuildConfigFromFlags("", c.ComponentConfig.Generic.Kubeconfig) + if err != nil { + klog.Infof("could not build rest config, %v", err) + return err + } + cfg = config + } setRestConfig(cfg, c) mgr, err := ctrl.NewManager(cfg, ctrl.Options{ @@ -173,7 +180,7 @@ func Run(c *config.CompletedConfig, stopCh <-chan struct{}) error { } setupLog.Info("setup controllers") - if err = controller.SetupWithManager(c, mgr); err != nil { + if err = controller.SetupWithManager(ctx, c, mgr); err != nil { setupLog.Error(err, "unable to setup controllers") os.Exit(1) } @@ -184,16 +191,20 @@ func Run(c *config.CompletedConfig, stopCh <-chan struct{}) error { os.Exit(1) } - // +kubebuilder:scaffold:builder - setupLog.Info("initialize webhook") - if err := webhook.Initialize(ctx, c, mgr.GetConfig()); err != nil { - setupLog.Error(err, "unable to initialize webhook") - os.Exit(1) - } + if len(webhook.WebhookHandlerPath) != 0 { + // +kubebuilder:scaffold:builder + setupLog.Info("initialize webhook") + if err := webhook.Initialize(ctx, c, mgr.GetConfig()); err != nil { + setupLog.Error(err, "unable to initialize webhook") + os.Exit(1) + } - if err := mgr.AddReadyzCheck("webhook-ready", mgr.GetWebhookServer().StartedChecker()); err != nil { - setupLog.Error(err, "unable to add readyz check") - os.Exit(1) + if err := mgr.AddReadyzCheck("webhook-ready", mgr.GetWebhookServer().StartedChecker()); err != nil { + setupLog.Error(err, "unable to add readyz check") + os.Exit(1) + } + } else { + klog.Infof("no webhook is registered, so skip webhook setup") } if err := mgr.AddHealthzCheck("health", healthz.Ping); err != nil { diff --git a/cmd/yurt-manager/app/options/generic.go b/cmd/yurt-manager/app/options/generic.go index dfa862d3f05..c3099d95321 100644 --- 
a/cmd/yurt-manager/app/options/generic.go +++ b/cmd/yurt-manager/app/options/generic.go @@ -98,6 +98,7 @@ func (o *GenericOptions) ApplyTo(cfg *config.GenericConfiguration, controllerAli cfg.RestConfigQPS = o.RestConfigQPS cfg.RestConfigBurst = o.RestConfigBurst cfg.WorkingNamespace = o.WorkingNamespace + cfg.Kubeconfig = o.Kubeconfig cfg.Controllers = make([]string, len(o.Controllers)) for i, initialName := range o.Controllers { @@ -135,6 +136,6 @@ func (o *GenericOptions) AddFlags(fs *pflag.FlagSet, allControllers, disabledByD strings.Join(allControllers, ", "), strings.Join(disabledByDefaultControllers, ", "))) fs.StringSliceVar(&o.DisabledWebhooks, "disable-independent-webhooks", o.DisabledWebhooks, "A list of webhooks to disable. "+ "'*' disables all independent webhooks, 'foo' disables the independent webhook named 'foo'.") - + fs.StringVar(&o.Kubeconfig, "kubeconfig", o.Kubeconfig, "Path to kubeconfig file with authorization and master location information") features.DefaultMutableFeatureGate.AddFlag(fs) } diff --git a/cmd/yurt-manager/app/options/nodelifecyclecontroller.go b/cmd/yurt-manager/app/options/nodelifecyclecontroller.go new file mode 100644 index 00000000000..1a6f9473a9b --- /dev/null +++ b/cmd/yurt-manager/app/options/nodelifecyclecontroller.go @@ -0,0 +1,87 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package options + +import ( + "fmt" + "time" + + "github.com/spf13/pflag" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kube-controller-manager/config/v1alpha1" + + "github.com/openyurtio/openyurt/cmd/yurt-manager/names" +) + +// NodeLifecycleControllerOptions holds the NodeLifecycleController options. +type NodeLifecycleControllerOptions struct { + *v1alpha1.NodeLifecycleControllerConfiguration +} + +func NewNodeLifecycleControllerOptions() *NodeLifecycleControllerOptions { + return &NodeLifecycleControllerOptions{ + NodeLifecycleControllerConfiguration: &v1alpha1.NodeLifecycleControllerConfiguration{ + PodEvictionTimeout: metav1.Duration{Duration: 5 * time.Minute}, + NodeMonitorGracePeriod: metav1.Duration{Duration: 40 * time.Second}, + NodeStartupGracePeriod: metav1.Duration{Duration: 60 * time.Second}, + }, + } +} + +// AddFlags adds flags related to NodeLifecycleController for controller manager to the specified FlagSet. +func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) { + if o == nil { + return + } + + fs.DurationVar(&o.NodeStartupGracePeriod.Duration, "node-startup-grace-period", o.NodeStartupGracePeriod.Duration, + "Amount of time which we allow starting Node to be unresponsive before marking it unhealthy.") + fs.DurationVar(&o.NodeMonitorGracePeriod.Duration, "node-monitor-grace-period", o.NodeMonitorGracePeriod.Duration, + "Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+ + "Must be N times more than kubelet's nodeStatusUpdateFrequency, "+ + "where N means number of retries allowed for kubelet to post node status.") + fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). 
Zone refers to entire cluster in non-multizone clusters.") + fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.") + fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, fmt.Sprintf("Number of nodes from which %s treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.", names.NodeLifeCycleController)) + fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ") +} + +// ApplyTo fills up NodeLifecycleController config with options. +func (o *NodeLifecycleControllerOptions) ApplyTo(cfg *v1alpha1.NodeLifecycleControllerConfiguration) error { + if o == nil { + return nil + } + + cfg.NodeStartupGracePeriod = o.NodeStartupGracePeriod + cfg.NodeMonitorGracePeriod = o.NodeMonitorGracePeriod + cfg.NodeEvictionRate = o.NodeEvictionRate + cfg.SecondaryNodeEvictionRate = o.SecondaryNodeEvictionRate + cfg.LargeClusterSizeThreshold = o.LargeClusterSizeThreshold + cfg.UnhealthyZoneThreshold = o.UnhealthyZoneThreshold + + return nil +} + +// Validate checks validation of NodeLifecycleControllerOptions. 
+func (o *NodeLifecycleControllerOptions) Validate() []error { + if o == nil { + return nil + } + + errs := []error{} + return errs +} diff --git a/cmd/yurt-manager/app/options/options.go b/cmd/yurt-manager/app/options/options.go index a0f8f161f11..54c9c1d199c 100644 --- a/cmd/yurt-manager/app/options/options.go +++ b/cmd/yurt-manager/app/options/options.go @@ -33,6 +33,7 @@ type YurtManagerOptions struct { YurtAppDaemonController *YurtAppDaemonControllerOptions PlatformAdminController *PlatformAdminControllerOptions YurtAppOverriderController *YurtAppOverriderControllerOptions + NodeLifeCycleController *NodeLifecycleControllerOptions } // NewYurtManagerOptions creates a new YurtManagerOptions with a default config. @@ -47,6 +48,7 @@ func NewYurtManagerOptions() (*YurtManagerOptions, error) { YurtAppDaemonController: NewYurtAppDaemonControllerOptions(), PlatformAdminController: NewPlatformAdminControllerOptions(), YurtAppOverriderController: NewYurtAppOverriderControllerOptions(), + NodeLifeCycleController: NewNodeLifecycleControllerOptions(), } return &s, nil @@ -61,7 +63,7 @@ func (y *YurtManagerOptions) Flags(allControllers, disabledByDefaultControllers y.YurtAppDaemonController.AddFlags(fss.FlagSet("yurtappdaemon controller")) y.PlatformAdminController.AddFlags(fss.FlagSet("iot controller")) y.YurtAppOverriderController.AddFlags(fss.FlagSet("yurtappoverrider controller")) - // Please Add Other controller flags @kadisi + y.NodeLifeCycleController.AddFlags(fss.FlagSet("nodelifecycle controller")) return fss } @@ -76,6 +78,7 @@ func (y *YurtManagerOptions) Validate(allControllers []string, controllerAliases errs = append(errs, y.YurtAppDaemonController.Validate()...) errs = append(errs, y.PlatformAdminController.Validate()...) errs = append(errs, y.YurtAppOverriderController.Validate()...) + errs = append(errs, y.NodeLifeCycleController.Validate()...) 
return utilerrors.NewAggregate(errs) } @@ -102,6 +105,9 @@ func (y *YurtManagerOptions) ApplyTo(c *config.Config, controllerAliases map[str if err := y.GatewayPickupController.ApplyTo(&c.ComponentConfig.GatewayPickupController); err != nil { return err } + if err := y.NodeLifeCycleController.ApplyTo(&c.ComponentConfig.NodeLifeCycleController); err != nil { + return err + } return nil } diff --git a/cmd/yurt-manager/names/controller_names.go b/cmd/yurt-manager/names/controller_names.go index 9749256c90f..a334c2696dd 100644 --- a/cmd/yurt-manager/names/controller_names.go +++ b/cmd/yurt-manager/names/controller_names.go @@ -34,6 +34,7 @@ const ( GatewayInternalServiceController = "gateway-internal-service-controller" GatewayPublicServiceController = "gateway-public-service" GatewayDNSController = "gateway-dns-controller" + NodeLifeCycleController = "node-life-cycle-controller" ) func YurtManagerControllerAliases() map[string]string { @@ -56,5 +57,6 @@ func YurtManagerControllerAliases() map[string]string { "gatewayinternalservice": GatewayInternalServiceController, "gatewaypublicservice": GatewayPublicServiceController, "gatewaydns": GatewayDNSController, + "nodelifecycle": NodeLifeCycleController, } } diff --git a/go.mod b/go.mod index 2a68ce5c0eb..9099ece3452 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/evanphx/json-patch v5.6.0+incompatible github.com/go-resty/resty/v2 v2.7.0 github.com/golang-jwt/jwt v3.2.2+incompatible + github.com/google/go-cmp v0.5.9 github.com/google/uuid v1.3.0 github.com/gorilla/mux v1.8.0 github.com/hashicorp/go-version v1.6.0 @@ -45,7 +46,9 @@ require ( k8s.io/component-helpers v0.22.3 k8s.io/controller-manager v0.22.3 k8s.io/klog/v2 v2.9.0 + k8s.io/kube-controller-manager v0.0.0 k8s.io/kubectl v0.22.3 + k8s.io/kubelet v0.0.0 k8s.io/kubernetes v1.22.3 k8s.io/utils v0.0.0-20210930125809-cb0fa318a74b sigs.k8s.io/apiserver-network-proxy v0.0.15 @@ -103,7 +106,6 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect 
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect - github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/googleapis/gnostic v0.5.5 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect @@ -166,6 +168,7 @@ require ( gopkg.in/ini.v1 v1.66.2 // indirect gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect + k8s.io/cloud-provider v0.22.3 // indirect k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.22 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect diff --git a/go.sum b/go.sum index be6b8320827..2331cb6f80b 100644 --- a/go.sum +++ b/go.sum @@ -1165,6 +1165,7 @@ k8s.io/cli-runtime v0.22.3 h1:AeOgaDpb/k36amWsjyyIU+FLpLzzdmoLD5gn38c5fio= k8s.io/cli-runtime v0.22.3/go.mod h1:um6JvCxV9Hrhq0zCUxcqYoY7/wF64g6IYgOViI8sg6Q= k8s.io/client-go v0.22.3 h1:6onkOSc+YNdwq5zXE0wFXicq64rrym+mXwHu/CPVGO4= k8s.io/client-go v0.22.3/go.mod h1:ElDjYf8gvZsKDYexmsmnMQ0DYO8W9RwBjfQ1PI53yow= +k8s.io/cloud-provider v0.22.3 h1:ZsWdB0WmyjKlE901EM14BuSvnN+QPGrCGjcfDc+b5NI= k8s.io/cloud-provider v0.22.3/go.mod h1:GsKMR5EnNH4zcfkEvOxBPEZVuRvadVRkZvGqYxxBvO4= k8s.io/cluster-bootstrap v0.22.3 h1:uTrzquwoXsstQ6PCea0dYbKWcPCetMp4MZEkZbT+Ei0= k8s.io/cluster-bootstrap v0.22.3/go.mod h1:FVBAeGJ/T6QbNgGb7DX98FCjExJnNLsRXtGRMjEQ26I= @@ -1183,6 +1184,7 @@ k8s.io/gengo v0.0.0-20210813121822-485abfe95c7c/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAE k8s.io/klog/v2 v2.9.0 h1:D7HV+n1V57XeZ0m6tdRkfknthUaM06VFbWldOFh8kzM= k8s.io/klog/v2 v2.9.0/go.mod h1:hy9LJ/NvuK+iVyP4Ehqva4HxZG/oXyIS3n3Jmire4Ec= k8s.io/kube-aggregator v0.22.3/go.mod h1:TIpLq1HvR/S4y75i3y+4q9ik3ZvgyaDz72CBfDS0A6E= +k8s.io/kube-controller-manager v0.22.3 h1:DatYcgMKAn28e2A7MiMULoRoft3SaCV/qVk+FoGTUw0= k8s.io/kube-controller-manager v0.22.3/go.mod h1:7biFk6Azf7xD+pzTScw7X9M5vGScqYp4J4wOT61QL1s= 
k8s.io/kube-openapi v0.0.0-20210421082810-95288971da7e/go.mod h1:vHXdDvt9+2spS2Rx9ql3I8tycm3H9FDfdUoIuKCefvw= k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 h1:E3J9oCLlaobFUqsjG9DfKbP2BmgwBL2p7pn0A3dG9W4= @@ -1191,6 +1193,7 @@ k8s.io/kube-proxy v0.22.3/go.mod h1:9ta1U8GKKo6by981sN/L6MhFJzPWxMdfh7plVPH1I2s= k8s.io/kube-scheduler v0.22.3/go.mod h1:jVLHSttd8cSejBLOeiWE+g8etA6XdOBGiR8tI577OhU= k8s.io/kubectl v0.22.3 h1:xziSHHyFHg2nt9vE6A0XqW5dOePNSlzxG8z3z+IY63E= k8s.io/kubectl v0.22.3/go.mod h1:gcpQHPOx+Jke9Og6Li7YxR/ZuaOtFUeJw7xHH617tHs= +k8s.io/kubelet v0.22.3 h1:C21Kg66Zzvc21uJITEPg4stGMcSZsR1JB+7+6Uwm8zs= k8s.io/kubelet v0.22.3/go.mod h1:9nUZNGUigU2uAIm7kgf8BsvYDI9KjIE5nt9+yI1+p7w= k8s.io/legacy-cloud-providers v0.22.3/go.mod h1:eEOOaRtP2PuCVkjZvuTPa6ZgyPpzJkCVqpE3YtuArLQ= k8s.io/metrics v0.22.3/go.mod h1:HbLFLRKtXzoC/6tHLQAlO9AeOBXZp2eB6SsgkbujoNI= diff --git a/pkg/util/taints/taints.go b/pkg/util/taints/taints.go new file mode 100644 index 00000000000..1a1e0d1de76 --- /dev/null +++ b/pkg/util/taints/taints.go @@ -0,0 +1,289 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package taints implements utilities for working with taints. +package taints + +import ( + "fmt" + "strings" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/validation" + "k8s.io/kubernetes/pkg/apis/core/helper" +) + +const ( + MODIFIED = "modified" + TAINTED = "tainted" + UNTAINTED = "untainted" +) + +// parseTaint parses a taint from a string, whose form must be either +// '<key>=<value>:<effect>', '<key>:<effect>', or '<key>'. +func parseTaint(st string) (v1.Taint, error) { + var taint v1.Taint + + var key string + var value string + var effect v1.TaintEffect + + parts := strings.Split(st, ":") + switch len(parts) { + case 1: + key = parts[0] + case 2: + effect = v1.TaintEffect(parts[1]) + if err := validateTaintEffect(effect); err != nil { + return taint, err + } + + partsKV := strings.Split(parts[0], "=") + if len(partsKV) > 2 { + return taint, fmt.Errorf("invalid taint spec: %v", st) + } + key = partsKV[0] + if len(partsKV) == 2 { + value = partsKV[1] + if errs := validation.IsValidLabelValue(value); len(errs) > 0 { + return taint, fmt.Errorf("invalid taint spec: %v, %s", st, strings.Join(errs, "; ")) + } + } + default: + return taint, fmt.Errorf("invalid taint spec: %v", st) + } + + if errs := validation.IsQualifiedName(key); len(errs) > 0 { + return taint, fmt.Errorf("invalid taint spec: %v, %s", st, strings.Join(errs, "; ")) + } + + taint.Key = key + taint.Value = value + taint.Effect = effect + + return taint, nil +} + +func validateTaintEffect(effect v1.TaintEffect) error { + if effect != v1.TaintEffectNoSchedule && effect != v1.TaintEffectPreferNoSchedule && effect != v1.TaintEffectNoExecute { + return fmt.Errorf("invalid taint effect: %v, unsupported taint effect", effect) + } + + return nil +} + +// ParseTaints takes a spec which is an array and creates slices for new taints to be added, taints to be deleted. +// It also validates the spec. For example, the form `<key>` (written `<key>-`) may be used to remove a taint, but not to add one. 
+func ParseTaints(spec []string) ([]v1.Taint, []v1.Taint, error) { + var taints, taintsToRemove []v1.Taint + uniqueTaints := map[v1.TaintEffect]sets.String{} + + for _, taintSpec := range spec { + if strings.HasSuffix(taintSpec, "-") { + taintToRemove, err := parseTaint(strings.TrimSuffix(taintSpec, "-")) + if err != nil { + return nil, nil, err + } + taintsToRemove = append(taintsToRemove, v1.Taint{Key: taintToRemove.Key, Effect: taintToRemove.Effect}) + } else { + newTaint, err := parseTaint(taintSpec) + if err != nil { + return nil, nil, err + } + // validate that the taint has an effect, which is required to add the taint + if len(newTaint.Effect) == 0 { + return nil, nil, fmt.Errorf("invalid taint spec: %v", taintSpec) + } + // validate if taint is unique by <key, effect> + if len(uniqueTaints[newTaint.Effect]) > 0 && uniqueTaints[newTaint.Effect].Has(newTaint.Key) { + return nil, nil, fmt.Errorf("duplicated taints with the same key and effect: %v", newTaint) + } + // add taint to uniqueTaints for uniqueness check + if len(uniqueTaints[newTaint.Effect]) == 0 { + uniqueTaints[newTaint.Effect] = sets.String{} + } + uniqueTaints[newTaint.Effect].Insert(newTaint.Key) + + taints = append(taints, newTaint) + } + } + return taints, taintsToRemove, nil +} + +// CheckIfTaintsAlreadyExists checks if the node already has taints that we want to add and returns a string with taint keys. 
+func CheckIfTaintsAlreadyExists(oldTaints []v1.Taint, taints []v1.Taint) string { + var existingTaintList = make([]string, 0) + for _, taint := range taints { + for _, oldTaint := range oldTaints { + if taint.Key == oldTaint.Key && taint.Effect == oldTaint.Effect { + existingTaintList = append(existingTaintList, taint.Key) + } + } + } + return strings.Join(existingTaintList, ",") +} + +// DeleteTaintsByKey removes all the taints that have the same key as the given taintKey. +func DeleteTaintsByKey(taints []v1.Taint, taintKey string) ([]v1.Taint, bool) { + newTaints := []v1.Taint{} + deleted := false + for i := range taints { + if taintKey == taints[i].Key { + deleted = true + continue + } + newTaints = append(newTaints, taints[i]) + } + return newTaints, deleted +} + +// DeleteTaint removes all the taints that have the same key and effect as the given taintToDelete. +func DeleteTaint(taints []v1.Taint, taintToDelete *v1.Taint) ([]v1.Taint, bool) { + newTaints := []v1.Taint{} + deleted := false + for i := range taints { + if taintToDelete.MatchTaint(&taints[i]) { + deleted = true + continue + } + newTaints = append(newTaints, taints[i]) + } + return newTaints, deleted +} + +// RemoveTaint tries to remove a taint from the node's Spec.Taints list. Returns a new copy of the updated Node and true if something was updated, +// false otherwise. +func RemoveTaint(node *v1.Node, taint *v1.Taint) (*v1.Node, bool, error) { + newNode := node.DeepCopy() + nodeTaints := newNode.Spec.Taints + if len(nodeTaints) == 0 { + return newNode, false, nil + } + + if !TaintExists(nodeTaints, taint) { + return newNode, false, nil + } + + newTaints, _ := DeleteTaint(nodeTaints, taint) + newNode.Spec.Taints = newTaints + return newNode, true, nil +} + +// AddOrUpdateTaint tries to add a taint to the node's Spec.Taints list. Returns a new copy of the updated Node and true if something was updated, +// false otherwise. 
+func AddOrUpdateTaint(node *v1.Node, taint *v1.Taint) (*v1.Node, bool, error) { + newNode := node.DeepCopy() + nodeTaints := newNode.Spec.Taints + + var newTaints []v1.Taint + updated := false + for i := range nodeTaints { + if taint.MatchTaint(&nodeTaints[i]) { + if helper.Semantic.DeepEqual(*taint, nodeTaints[i]) { + return newNode, false, nil + } + newTaints = append(newTaints, *taint) + updated = true + continue + } + + newTaints = append(newTaints, nodeTaints[i]) + } + + if !updated { + newTaints = append(newTaints, *taint) + } + + newNode.Spec.Taints = newTaints + return newNode, true, nil +} + +// TaintExists checks if the given taint exists in list of taints. Returns true if exists false otherwise. +func TaintExists(taints []v1.Taint, taintToFind *v1.Taint) bool { + for _, taint := range taints { + if taint.MatchTaint(taintToFind) { + return true + } + } + return false +} + +// TaintKeyExists checks if the given taint key exists in list of taints. Returns true if exists false otherwise. +func TaintKeyExists(taints []v1.Taint, taintKeyToMatch string) bool { + for _, taint := range taints { + if taint.Key == taintKeyToMatch { + return true + } + } + return false +} + +// TaintSetDiff finds the difference between two taint slices and +// returns all new and removed elements of the new slice relative to the old slice. +// for example: +// input: taintsNew=[a b] taintsOld=[a c] +// output: taintsToAdd=[b] taintsToRemove=[c] +func TaintSetDiff(taintsNew, taintsOld []v1.Taint) (taintsToAdd []*v1.Taint, taintsToRemove []*v1.Taint) { + for _, taint := range taintsNew { + if !TaintExists(taintsOld, &taint) { + t := taint + taintsToAdd = append(taintsToAdd, &t) + } + } + + for _, taint := range taintsOld { + if !TaintExists(taintsNew, &taint) { + t := taint + taintsToRemove = append(taintsToRemove, &t) + } + } + + return +} + +// TaintSetFilter filters from the taint slice according to the passed fn function to get the filtered taint slice. 
+func TaintSetFilter(taints []v1.Taint, fn func(*v1.Taint) bool) []v1.Taint { + res := []v1.Taint{} + + for _, taint := range taints { + if fn(&taint) { + res = append(res, taint) + } + } + + return res +} + +// CheckTaintValidation checks if the given taint is valid. +// Returns error if the given taint is invalid. +func CheckTaintValidation(taint v1.Taint) error { + if errs := validation.IsQualifiedName(taint.Key); len(errs) > 0 { + return fmt.Errorf("invalid taint key: %s", strings.Join(errs, "; ")) + } + if taint.Value != "" { + if errs := validation.IsValidLabelValue(taint.Value); len(errs) > 0 { + return fmt.Errorf("invalid taint value: %s", strings.Join(errs, "; ")) + } + } + if taint.Effect != "" { + if err := validateTaintEffect(taint.Effect); err != nil { + return err + } + } + + return nil +} diff --git a/pkg/util/taints/taints_test.go b/pkg/util/taints/taints_test.go new file mode 100644 index 00000000000..3d6816647d2 --- /dev/null +++ b/pkg/util/taints/taints_test.go @@ -0,0 +1,984 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package taints + +import ( + "reflect" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + v1 "k8s.io/api/core/v1" +) + +func TestAddOrUpdateTaint(t *testing.T) { + taint := v1.Taint{ + Key: "foo", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + } + + taintNew := v1.Taint{ + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoSchedule, + } + + taintUpdateValue := taint + taintUpdateValue.Value = "bar_1" + + testcases := []struct { + name string + node *v1.Node + taint *v1.Taint + expectedUpdate bool + expectedTaints []v1.Taint + }{ + { + name: "add a new taint", + node: &v1.Node{}, + taint: &taint, + expectedUpdate: true, + expectedTaints: []v1.Taint{taint}, + }, + { + name: "add a unique taint", + node: &v1.Node{ + Spec: v1.NodeSpec{Taints: []v1.Taint{taint}}, + }, + taint: &taintNew, + expectedUpdate: true, + expectedTaints: []v1.Taint{taint, taintNew}, + }, + { + name: "add duplicate taint", + node: &v1.Node{ + Spec: v1.NodeSpec{Taints: []v1.Taint{taint}}, + }, + taint: &taint, + expectedUpdate: false, + expectedTaints: []v1.Taint{taint}, + }, + { + name: "update taint value", + node: &v1.Node{ + Spec: v1.NodeSpec{Taints: []v1.Taint{taint}}, + }, + taint: &taintUpdateValue, + expectedUpdate: true, + expectedTaints: []v1.Taint{taintUpdateValue}, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + newNode, updated, err := AddOrUpdateTaint(tc.node, tc.taint) + if err != nil { + t.Errorf("[%s] should not raise error but got %v", tc.name, err) + } + if updated != tc.expectedUpdate { + t.Errorf("[%s] expected taints to not be updated", tc.name) + } + if diff := cmp.Diff(newNode.Spec.Taints, tc.expectedTaints); diff != "" { + t.Errorf("Unexpected result (-want, +got):\n%s", diff) + } + }) + } +} + +func TestTaintExists(t *testing.T) { + testingTaints := []v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: 
v1.TaintEffectNoSchedule, + }, + } + + cases := []struct { + name string + taintToFind *v1.Taint + expectedResult bool + }{ + { + name: "taint exists", + taintToFind: &v1.Taint{Key: "foo_1", Value: "bar_1", Effect: v1.TaintEffectNoExecute}, + expectedResult: true, + }, + { + name: "different key", + taintToFind: &v1.Taint{Key: "no_such_key", Value: "bar_1", Effect: v1.TaintEffectNoExecute}, + expectedResult: false, + }, + { + name: "different effect", + taintToFind: &v1.Taint{Key: "foo_1", Value: "bar_1", Effect: v1.TaintEffectNoSchedule}, + expectedResult: false, + }, + } + + for _, c := range cases { + result := TaintExists(testingTaints, c.taintToFind) + + if result != c.expectedResult { + t.Errorf("[%s] unexpected results: %v", c.name, result) + continue + } + } +} + +func TestTaintKeyExists(t *testing.T) { + testingTaints := []v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + } + + cases := []struct { + name string + taintKeyToMatch string + expectedResult bool + }{ + { + name: "taint key exists", + taintKeyToMatch: "foo_1", + expectedResult: true, + }, + { + name: "taint key does not exist", + taintKeyToMatch: "foo_3", + expectedResult: false, + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + result := TaintKeyExists(testingTaints, c.taintKeyToMatch) + + if result != c.expectedResult { + t.Errorf("[%s] unexpected results: %v", c.name, result) + } + }) + } +} + +func TestTaintSetFilter(t *testing.T) { + testTaint1 := v1.Taint{ + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + } + testTaint2 := v1.Taint{ + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + } + + testTaint3 := v1.Taint{ + Key: "foo_3", + Value: "bar_3", + Effect: v1.TaintEffectNoSchedule, + } + testTaints := []v1.Taint{testTaint1, testTaint2, testTaint3} + + testcases := []struct { + name string + fn func(t 
*v1.Taint) bool + expectedTaints []v1.Taint + }{ + { + name: "Filter out nothing", + fn: func(t *v1.Taint) bool { + if t.Key == v1.TaintNodeUnschedulable { + return true + } + return false + }, + expectedTaints: []v1.Taint{}, + }, + { + name: "Filter out a subset", + fn: func(t *v1.Taint) bool { + if t.Effect == v1.TaintEffectNoExecute { + return true + } + return false + }, + expectedTaints: []v1.Taint{testTaint1}, + }, + { + name: "Filter out everything", + fn: func(t *v1.Taint) bool { return true }, + expectedTaints: []v1.Taint{testTaint1, testTaint2, testTaint3}, + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + taintsAfterFilter := TaintSetFilter(testTaints, tc.fn) + if diff := cmp.Diff(tc.expectedTaints, taintsAfterFilter); diff != "" { + t.Errorf("Unexpected postFilterResult (-want, +got):\n%s", diff) + } + }) + } +} + +func TestRemoveTaint(t *testing.T) { + cases := []struct { + name string + node *v1.Node + taintToRemove *v1.Taint + expectedTaints []v1.Taint + expectedResult bool + }{ + { + name: "remove taint unsuccessfully", + node: &v1.Node{ + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }, + taintToRemove: &v1.Taint{ + Key: "foo_1", + Effect: v1.TaintEffectNoSchedule, + }, + expectedTaints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedResult: false, + }, + { + name: "remove taint successfully", + node: &v1.Node{ + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }, + taintToRemove: &v1.Taint{ + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + expectedTaints: []v1.Taint{}, + expectedResult: true, + }, + { + name: "remove taint from node with no taint", + node: &v1.Node{ + Spec: v1.NodeSpec{ + Taints: []v1.Taint{}, + }, + }, + taintToRemove: &v1.Taint{ + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + expectedTaints: []v1.Taint{}, + 
expectedResult: false, + }, + } + + for _, c := range cases { + newNode, result, err := RemoveTaint(c.node, c.taintToRemove) + if err != nil { + t.Errorf("[%s] should not raise error but got: %v", c.name, err) + } + if result != c.expectedResult { + t.Errorf("[%s] should return %t, but got: %t", c.name, c.expectedResult, result) + } + if !reflect.DeepEqual(newNode.Spec.Taints, c.expectedTaints) { + t.Errorf("[%s] the new node object should have taints %v, but got: %v", c.name, c.expectedTaints, newNode.Spec.Taints) + } + } +} + +func TestDeleteTaint(t *testing.T) { + cases := []struct { + name string + taints []v1.Taint + taintToDelete *v1.Taint + expectedTaints []v1.Taint + expectedResult bool + }{ + { + name: "delete taint with different name", + taints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + taintToDelete: &v1.Taint{Key: "foo_1", Effect: v1.TaintEffectNoSchedule}, + expectedTaints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedResult: false, + }, + { + name: "delete taint with different effect", + taints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + taintToDelete: &v1.Taint{Key: "foo", Effect: v1.TaintEffectNoExecute}, + expectedTaints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedResult: false, + }, + { + name: "delete taint successfully", + taints: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + }, + taintToDelete: &v1.Taint{Key: "foo", Effect: v1.TaintEffectNoSchedule}, + expectedTaints: []v1.Taint{}, + expectedResult: true, + }, + { + name: "delete taint from empty taint array", + taints: []v1.Taint{}, + taintToDelete: &v1.Taint{Key: "foo", Effect: v1.TaintEffectNoSchedule}, + expectedTaints: []v1.Taint{}, + expectedResult: false, + }, + } + + for _, c := range cases { + taints, result := DeleteTaint(c.taints, c.taintToDelete) + if result != c.expectedResult { + t.Errorf("[%s] should 
return %t, but got: %t", c.name, c.expectedResult, result) + } + if !reflect.DeepEqual(taints, c.expectedTaints) { + t.Errorf("[%s] the result taints should be %v, but got: %v", c.name, c.expectedTaints, taints) + } + } +} + +func TestDeleteTaintByKey(t *testing.T) { + cases := []struct { + name string + taints []v1.Taint + taintKey string + expectedTaints []v1.Taint + expectedResult bool + }{ + { + name: "delete taint unsuccessfully", + taints: []v1.Taint{ + { + Key: "foo", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + }, + taintKey: "foo_1", + expectedTaints: []v1.Taint{ + { + Key: "foo", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedResult: false, + }, + { + name: "delete taint successfully", + taints: []v1.Taint{ + { + Key: "foo", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + }, + taintKey: "foo", + expectedTaints: []v1.Taint{}, + expectedResult: true, + }, + { + name: "delete taint from empty taint array", + taints: []v1.Taint{}, + taintKey: "foo", + expectedTaints: []v1.Taint{}, + expectedResult: false, + }, + } + + for _, c := range cases { + taints, result := DeleteTaintsByKey(c.taints, c.taintKey) + if result != c.expectedResult { + t.Errorf("[%s] should return %t, but got: %t", c.name, c.expectedResult, result) + } + if !reflect.DeepEqual(c.expectedTaints, taints) { + t.Errorf("[%s] the result taints should be %v, but got: %v", c.name, c.expectedTaints, taints) + } + } +} + +func TestCheckIfTaintsAlreadyExists(t *testing.T) { + oldTaints := []v1.Taint{ + { + Key: "foo_1", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "foo_2", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "foo_3", + Value: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + } + + cases := []struct { + name string + taintsToCheck []v1.Taint + expectedResult string + }{ + { + name: "empty array", + taintsToCheck: []v1.Taint{}, + expectedResult: "", + }, + { + name: "no match", + taintsToCheck: 
[]v1.Taint{ + { + Key: "foo_1", + Effect: v1.TaintEffectNoExecute, + }, + }, + expectedResult: "", + }, + { + name: "match one taint", + taintsToCheck: []v1.Taint{ + { + Key: "foo_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedResult: "foo_2", + }, + { + name: "match two taints", + taintsToCheck: []v1.Taint{ + { + Key: "foo_2", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "foo_3", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedResult: "foo_2,foo_3", + }, + } + + for _, c := range cases { + result := CheckIfTaintsAlreadyExists(oldTaints, c.taintsToCheck) + if result != c.expectedResult { + t.Errorf("[%s] should return '%s', but got: '%s'", c.name, c.expectedResult, result) + } + } +} + +func TestParseTaints(t *testing.T) { + cases := []struct { + name string + spec []string + expectedTaints []v1.Taint + expectedTaintsToRemove []v1.Taint + expectedErr bool + }{ + { + name: "invalid empty spec format", + spec: []string{""}, + expectedErr: true, + }, + // taint spec format without the suffix '-' must be either '<key>=<value>:<effect>', '<key>:<effect>', or '<key>' + { + name: "invalid spec format without effect", + spec: []string{"foo=abc"}, + expectedErr: true, + }, + { + name: "invalid spec format with multiple '=' separators", + spec: []string{"foo=abc=xyz:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec format with multiple ':' separators", + spec: []string{"foo=abc:xyz:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value without separator", + spec: []string{"foo"}, + expectedErr: true, + }, + // taint spec must consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character. 
+ { + name: "invalid spec taint value with special chars '%^@'", + spec: []string{"foo=nospecialchars%^@:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value with non-alphanumeric characters", + spec: []string{"foo=Tama-nui-te-rā.is.Māori.sun:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value with special chars '\\'", + spec: []string{"foo=\\backslashes\\are\\bad:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value with start with an non-alphanumeric character '-'", + spec: []string{"foo=-starts-with-dash:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value with end with an non-alphanumeric character '-'", + spec: []string{"foo=ends-with-dash-:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value with start with an non-alphanumeric character '.'", + spec: []string{"foo=.starts.with.dot:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value with end with an non-alphanumeric character '.'", + spec: []string{"foo=ends.with.dot.:NoSchedule"}, + expectedErr: true, + }, + // The value range of taint effect is "NoSchedule", "PreferNoSchedule", "NoExecute" + { + name: "invalid spec effect for adding taint", + spec: []string{"foo=abc:invalid_effect"}, + expectedErr: true, + }, + { + name: "invalid spec effect for deleting taint", + spec: []string{"foo:invalid_effect-"}, + expectedErr: true, + }, + { + name: "duplicated taints with the same key and effect", + spec: []string{"foo=abc:NoSchedule", "foo=abc:NoSchedule"}, + expectedErr: true, + }, + { + name: "invalid spec taint value exceeding the limit", + spec: []string{strings.Repeat("a", 64)}, + expectedErr: true, + }, + { + name: "add new taints with no special chars", + spec: []string{"foo=abc:NoSchedule", "bar=abc:NoSchedule", "baz:NoSchedule", "qux:NoSchedule", "foobar=:NoSchedule"}, + expectedTaints: []v1.Taint{ + { + Key: "foo", + Value: "abc", + Effect: 
v1.TaintEffectNoSchedule, + }, + { + Key: "bar", + Value: "abc", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "baz", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "qux", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "foobar", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedErr: false, + }, + { + name: "delete taints with no special chars", + spec: []string{"foo:NoSchedule-", "bar:NoSchedule-", "qux=:NoSchedule-", "dedicated-"}, + expectedTaintsToRemove: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "qux", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "dedicated", + }, + }, + expectedErr: false, + }, + { + name: "add taints and delete taints with no special chars", + spec: []string{"foo=abc:NoSchedule", "bar=abc:NoSchedule", "baz:NoSchedule", "qux:NoSchedule", "foobar=:NoSchedule", "foo:NoSchedule-", "bar:NoSchedule-", "baz=:NoSchedule-"}, + expectedTaints: []v1.Taint{ + { + Key: "foo", + Value: "abc", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "bar", + Value: "abc", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "baz", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "qux", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "foobar", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedTaintsToRemove: []v1.Taint{ + { + Key: "foo", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "bar", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "baz", + Value: "", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedErr: false, + }, + } + + for _, c := range cases { + taints, taintsToRemove, err := ParseTaints(c.spec) + if c.expectedErr && err == nil { + t.Errorf("[%s] expected error for spec %s, but got nothing", c.name, c.spec) + } + if !c.expectedErr && err != nil { + t.Errorf("[%s] expected no error for spec %s, but got: %v", c.name, 
c.spec, err) + } + if !reflect.DeepEqual(c.expectedTaints, taints) { + t.Errorf("[%s] expected returen taints as %v, but got: %v", c.name, c.expectedTaints, taints) + } + if !reflect.DeepEqual(c.expectedTaintsToRemove, taintsToRemove) { + t.Errorf("[%s] expected return taints to be removed as %v, but got: %v", c.name, c.expectedTaintsToRemove, taintsToRemove) + } + } +} + +func TestValidateTaint(t *testing.T) { + cases := []struct { + name string + taintsToCheck v1.Taint + expectedErr bool + }{ + { + name: "taint invalid key", + taintsToCheck: v1.Taint{Key: "", Value: "bar_1", Effect: v1.TaintEffectNoExecute}, + expectedErr: true, + }, + { + name: "taint invalid value", + taintsToCheck: v1.Taint{Key: "foo_1", Value: strings.Repeat("a", 64), Effect: v1.TaintEffectNoExecute}, + expectedErr: true, + }, + { + name: "taint invalid effect", + taintsToCheck: v1.Taint{Key: "foo_2", Value: "bar_2", Effect: "no_such_effect"}, + expectedErr: true, + }, + { + name: "valid taint", + taintsToCheck: v1.Taint{Key: "foo_3", Value: "bar_3", Effect: v1.TaintEffectNoExecute}, + expectedErr: false, + }, + { + name: "valid taint", + taintsToCheck: v1.Taint{Key: "foo_4", Effect: v1.TaintEffectNoExecute}, + expectedErr: false, + }, + { + name: "valid taint", + taintsToCheck: v1.Taint{Key: "foo_5", Value: "bar_5"}, + expectedErr: false, + }, + } + + for _, c := range cases { + err := CheckTaintValidation(c.taintsToCheck) + + if c.expectedErr && err == nil { + t.Errorf("[%s] expected error for spec %+v, but got nothing", c.name, c.taintsToCheck) + } + } +} + +func TestTaintSetDiff(t *testing.T) { + cases := []struct { + name string + t1 []v1.Taint + t2 []v1.Taint + expectedTaintsToAdd []*v1.Taint + expectedTaintsToRemove []*v1.Taint + }{ + { + name: "two_taints_are_nil", + expectedTaintsToAdd: nil, + expectedTaintsToRemove: nil, + }, + { + name: "one_taint_is_nil_and_the_other_is_not_nil", + t1: []v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + 
Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedTaintsToAdd: []*v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedTaintsToRemove: nil, + }, + { + name: "shared_taints_with_the_same_key_value_effect", + t1: []v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + t2: []v1.Taint{ + { + Key: "foo_3", + Value: "bar_3", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedTaintsToAdd: []*v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + }, + expectedTaintsToRemove: []*v1.Taint{ + { + Key: "foo_3", + Value: "bar_3", + Effect: v1.TaintEffectNoExecute, + }, + }, + }, + { + name: "shared_taints_with_the_same_key_effect_different_value", + t1: []v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "different-value", + Effect: v1.TaintEffectNoSchedule, + }, + }, + t2: []v1.Taint{ + { + Key: "foo_3", + Value: "bar_3", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedTaintsToAdd: []*v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + }, + expectedTaintsToRemove: []*v1.Taint{ + { + Key: "foo_3", + Value: "bar_3", + Effect: v1.TaintEffectNoExecute, + }, + }, + }, + { + name: "shared_taints_with_the_same_key_different_value_effect", + t1: []v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "different-value", + Effect: v1.TaintEffectNoExecute, + }, + }, + t2: []v1.Taint{ + { + Key: "foo_3", + Value: "bar_3", + Effect: 
v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + expectedTaintsToAdd: []*v1.Taint{ + { + Key: "foo_1", + Value: "bar_1", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "different-value", + Effect: v1.TaintEffectNoExecute, + }, + }, + expectedTaintsToRemove: []*v1.Taint{ + { + Key: "foo_3", + Value: "bar_3", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "foo_2", + Value: "bar_2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + add, remove := TaintSetDiff(tt.t1, tt.t2) + if !reflect.DeepEqual(add, tt.expectedTaintsToAdd) { + t.Errorf("taintsToAdd: %v should equal %v, but get unexpected results", add, tt.expectedTaintsToAdd) + } + if !reflect.DeepEqual(remove, tt.expectedTaintsToRemove) { + t.Errorf("taintsToRemove: %v should equal %v, but get unexpected results", remove, tt.expectedTaintsToRemove) + } + }) + } +} diff --git a/pkg/yurtmanager/controller/apis/config/types.go b/pkg/yurtmanager/controller/apis/config/types.go index d5df32f337c..af132f76b57 100644 --- a/pkg/yurtmanager/controller/apis/config/types.go +++ b/pkg/yurtmanager/controller/apis/config/types.go @@ -19,6 +19,7 @@ package config import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" componentbaseconfig "k8s.io/component-base/config" + "k8s.io/kube-controller-manager/config/v1alpha1" nodepoolconfig "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/nodepool/config" platformadminconfig "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/platformadmin/config" @@ -53,6 +54,8 @@ type YurtManagerConfiguration struct { // YurtAppOverriderControllerConfiguration holds configuration for YurtAppOverriderController related features. 
YurtAppOverriderController yurtappoverriderconfig.YurtAppOverriderControllerConfiguration + + NodeLifeCycleController v1alpha1.NodeLifecycleControllerConfiguration } type GenericConfiguration struct { @@ -64,6 +67,7 @@ type GenericConfiguration struct { RestConfigQPS int RestConfigBurst int WorkingNamespace string + Kubeconfig string // Controllers is the list of controllers to enable or disable // '*' means "all enabled by default controllers" // 'foo' means "enable 'foo'" diff --git a/pkg/yurtmanager/controller/controller.go b/pkg/yurtmanager/controller/controller.go index aaf66932148..1fe4dcf873d 100644 --- a/pkg/yurtmanager/controller/controller.go +++ b/pkg/yurtmanager/controller/controller.go @@ -17,6 +17,7 @@ limitations under the License. package controller import ( + "context" "fmt" "k8s.io/apimachinery/pkg/api/meta" @@ -44,9 +45,10 @@ import ( "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/yurtcoordinator/delegatelease" "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/yurtcoordinator/podbinding" "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/yurtstaticset" + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/nodelifecycle" ) -type InitFunc func(*config.CompletedConfig, manager.Manager) error +type InitFunc func(context.Context, *config.CompletedConfig, manager.Manager) error type ControllerInitializersFunc func() (initializers map[string]InitFunc) @@ -90,6 +92,7 @@ func NewControllerInitializers() map[string]InitFunc { register(names.GatewayDNSController, dns.Add) register(names.GatewayInternalServiceController, gatewayinternalservice.Add) register(names.GatewayPublicServiceController, gatewaypublicservice.Add) + register(names.NodeLifeCycleController, nodelifecycle.Add) return controllers } @@ -99,14 +102,14 @@ func NewControllerInitializers() map[string]InitFunc { // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;update;patch;delete // 
+kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;update;patch;delete -func SetupWithManager(c *config.CompletedConfig, m manager.Manager) error { +func SetupWithManager(ctx context.Context, c *config.CompletedConfig, m manager.Manager) error { for controllerName, fn := range NewControllerInitializers() { if !app.IsControllerEnabled(controllerName, ControllersDisabledByDefault, c.ComponentConfig.Generic.Controllers) { klog.Warningf("Controller %v is disabled", controllerName) continue } - if err := fn(c, m); err != nil { + if err := fn(ctx, c, m); err != nil { if kindMatchErr, ok := err.(*meta.NoKindMatchError); ok { klog.Infof("CRD %v is not installed, its controller will perform noops!", kindMatchErr.GroupKind) continue diff --git a/pkg/yurtmanager/controller/csrapprover/csrapprover_controller.go b/pkg/yurtmanager/controller/csrapprover/csrapprover_controller.go index ab70a65bef2..7e9f3d28419 100644 --- a/pkg/yurtmanager/controller/csrapprover/csrapprover_controller.go +++ b/pkg/yurtmanager/controller/csrapprover/csrapprover_controller.go @@ -101,7 +101,7 @@ type csrRecognizer struct { // Add creates a new CsrApprover Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(_ *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, _ *appconfig.CompletedConfig, mgr manager.Manager) error { r := &ReconcileCsrApprover{} // Create a new controller c, err := controller.New(names.CsrApproverController, mgr, controller.Options{ diff --git a/pkg/yurtmanager/controller/daemonpodupdater/daemon_pod_updater_controller.go b/pkg/yurtmanager/controller/daemonpodupdater/daemon_pod_updater_controller.go index 7e9cc55c7ba..014ecbd6b96 100644 --- a/pkg/yurtmanager/controller/daemonpodupdater/daemon_pod_updater_controller.go +++ b/pkg/yurtmanager/controller/daemonpodupdater/daemon_pod_updater_controller.go @@ -100,7 +100,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new Daemonpodupdater Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { klog.Infof("daemonupdater-controller add controller %s", controllerKind.String()) return add(mgr, newReconciler(c, mgr)) } diff --git a/pkg/yurtmanager/controller/internal/controller/controller.go b/pkg/yurtmanager/controller/internal/controller/controller.go new file mode 100644 index 00000000000..31157852b2d --- /dev/null +++ b/pkg/yurtmanager/controller/internal/controller/controller.go @@ -0,0 +1,222 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/runtime/inject" + "sigs.k8s.io/controller-runtime/pkg/source" +) + +var _ inject.Injector = &Controller{} + +// Controller implements controller.Controller. +type Controller struct { + // Name is used to uniquely identify a Controller in tracing, logging and monitoring. Name is required. + Name string + + // MakeQueue constructs the queue for this controller once the controller is ready to start. + // This exists because the standard Kubernetes workqueues start themselves immediately, which + // leads to goroutine leaks if something calls controller.New repeatedly. + MakeQueue func() workqueue.RateLimitingInterface + + // Queue is a listeningQueue that listens for events from Informers and adds object keys to + // the Queue for processing + Queue workqueue.RateLimitingInterface + + // SetFields is used to inject dependencies into other objects such as Sources, EventHandlers and Predicates + // Deprecated: the caller should handle injected fields itself. + SetFields func(i interface{}) error + + // mu is used to synchronize Controller setup + mu sync.Mutex + + // Started is true if the Controller has been Started + Started bool + + // ctx is the context that was passed to Start() and used when starting watches. 
+ // + // According to the docs, contexts should not be stored in a struct: https://golang.org/pkg/context, + // while we usually always strive to follow best practices, we consider this a legacy case and it should + // undergo a major refactoring and redesign to allow for context to not be stored in a struct. + ctx context.Context + + // CacheSyncTimeout refers to the time limit set on waiting for cache to sync + // Defaults to 2 minutes if not set. + CacheSyncTimeout time.Duration + + // startWatches maintains a list of sources, handlers, and predicates to start when the controller is started. + startWatches []watchDescription + + // RecoverPanic indicates whether the panic caused by reconcile should be recovered. + RecoverPanic bool +} + +// watchDescription contains all the information necessary to start a watch. +type watchDescription struct { + src source.Source + handler handler.EventHandler + predicates []predicate.Predicate +} + +// Watch implements controller.Controller. +func (c *Controller) Watch(src source.Source, evthdler handler.EventHandler, prct ...predicate.Predicate) error { + c.mu.Lock() + defer c.mu.Unlock() + + // Inject Cache into arguments + if err := c.SetFields(src); err != nil { + return err + } + if err := c.SetFields(evthdler); err != nil { + return err + } + for _, pr := range prct { + if err := c.SetFields(pr); err != nil { + return err + } + } + + // Controller hasn't started yet, store the watches locally and return. + // + // These watches are going to be held on the controller struct until the manager or user calls Start(...). + if !c.Started { + c.startWatches = append(c.startWatches, watchDescription{src: src, handler: evthdler, predicates: prct}) + return nil + } + + klog.V(2).InfoS("Starting EventSource", "source", src) + return src.Start(c.ctx, evthdler, c.Queue, prct...) +} + +// Start implements controller.Controller. 
+func (c *Controller) Start(ctx context.Context) error { + // use an IIFE to get proper lock handling + // but lock outside to get proper handling of the queue shutdown + c.mu.Lock() + if c.Started { + return errors.New("controller was started more than once. This is likely to be caused by being added to a manager multiple times") + } + + // Set the internal context. + c.ctx = ctx + + c.Queue = c.MakeQueue() + go func() { + <-ctx.Done() + c.Queue.ShutDown() + }() + + err := func() error { + defer c.mu.Unlock() + + // TODO(pwittrock): Reconsider HandleCrash + defer utilruntime.HandleCrash() + + // NB(directxman12): launch the sources *before* trying to wait for the + // caches to sync so that they have a chance to register their intended + // caches. + for _, watch := range c.startWatches { + klog.V(2).InfoS("Starting EventSource", "source", fmt.Sprintf("%s", watch.src), "controller", c.Name) + + if err := watch.src.Start(ctx, watch.handler, c.Queue, watch.predicates...); err != nil { + return err + } + } + + // Start the SharedIndexInformer factories to begin populating the SharedIndexInformer caches + klog.V(2).InfoS("Starting Controller WatchSource", "controller", c.Name) + + for _, watch := range c.startWatches { + syncingSource, ok := watch.src.(source.SyncingSource) + if !ok { + continue + } + + if err := func() error { + // use a context with timeout for launching sources and syncing caches. + sourceStartCtx, cancel := context.WithTimeout(ctx, c.CacheSyncTimeout) + defer cancel() + + // WaitForSync waits for a definitive timeout, and returns if there + // is an error or a timeout + if err := syncingSource.WaitForSync(sourceStartCtx); err != nil { + err := fmt.Errorf("failed to wait for %s caches to sync: %w", c.Name, err) + klog.ErrorS(err, "Could not wait for Cache to sync") + return err + } + + return nil + }(); err != nil { + return err + } + } + + // All the watches have been started, we can reset the local slice. 
+ // + // We should never hold watches more than necessary, each watch source can hold a backing cache, + // which won't be garbage collected if we hold a reference to it. + c.startWatches = nil + + klog.V(2).InfoS("No workers need to be started", "controller", c.Name) + c.Started = true + return nil + }() + if err != nil { + return err + } + + <-ctx.Done() + klog.V(2).InfoS("No Reconcile controller finished", "controller", c.Name) + return nil +} + +// InjectFunc implement SetFields.Injector. +func (c *Controller) InjectFunc(f inject.Func) error { + c.SetFields = f + return nil +} + +func (c *Controller) WaitForStarted(ctx context.Context) bool { + err := wait.PollImmediateUntil(200*time.Millisecond, func() (bool, error) { + c.mu.Lock() + started := c.Started + c.mu.Unlock() + if !started { + return false, nil + } + return true, nil + }, ctx.Done()) + if err != nil { + klog.V(2).InfoS("failed to start %s controller , %v", c.Name, err) + return false + } + + klog.V(2).InfoS("%s controller started", c.Name) + return true +} diff --git a/pkg/yurtmanager/controller/nodelifecycle/metrics.go b/pkg/yurtmanager/controller/nodelifecycle/metrics.go new file mode 100644 index 00000000000..fcd5e08cfab --- /dev/null +++ b/pkg/yurtmanager/controller/nodelifecycle/metrics.go @@ -0,0 +1,107 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package nodelifecycle + +import ( + "sync" + + "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +const ( + nodeControllerSubsystem = "node_collector" + zoneHealthStatisticKey = "zone_health" + zoneSizeKey = "zone_size" + zoneNoUnhealthyNodesKey = "unhealthy_nodes_in_zone" + evictionsTotalKey = "evictions_total" + + updateNodeHealthKey = "update_node_health_duration_seconds" + updateAllNodesHealthKey = "update_all_nodes_health_duration_seconds" +) + +var ( + zoneHealth = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: nodeControllerSubsystem, + Name: zoneHealthStatisticKey, + Help: "Gauge measuring percentage of healthy nodes per zone.", + StabilityLevel: metrics.ALPHA, + }, + []string{"zone"}, + ) + zoneSize = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: nodeControllerSubsystem, + Name: zoneSizeKey, + Help: "Gauge measuring number of registered Nodes per zones.", + StabilityLevel: metrics.ALPHA, + }, + []string{"zone"}, + ) + unhealthyNodes = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: nodeControllerSubsystem, + Name: zoneNoUnhealthyNodesKey, + Help: "Gauge measuring number of not Ready Nodes per zones.", + StabilityLevel: metrics.ALPHA, + }, + []string{"zone"}, + ) + evictionsTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: nodeControllerSubsystem, + Name: evictionsTotalKey, + Help: "Number of Node evictions that happened since current instance of NodeController started.", + StabilityLevel: metrics.STABLE, + }, + []string{"zone"}, + ) + + updateNodeHealthDuration = metrics.NewHistogram( + &metrics.HistogramOpts{ + Subsystem: nodeControllerSubsystem, + Name: updateNodeHealthKey, + Help: "Duration in seconds for NodeController to update the health of a single node.", + Buckets: metrics.ExponentialBuckets(0.001, 4, 8), // 1ms -> ~15s + StabilityLevel: metrics.ALPHA, + }, + ) + updateAllNodesHealthDuration = metrics.NewHistogram( + &metrics.HistogramOpts{ + Subsystem: 
nodeControllerSubsystem, + Name: updateAllNodesHealthKey, + Help: "Duration in seconds for NodeController to update the health of all nodes.", + Buckets: metrics.ExponentialBuckets(0.01, 4, 8), // 10ms -> ~3m + StabilityLevel: metrics.ALPHA, + }, + ) +) + +var registerMetrics sync.Once + +// Register the metrics that are to be monitored. +func Register() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(zoneHealth) + legacyregistry.MustRegister(zoneSize) + legacyregistry.MustRegister(unhealthyNodes) + legacyregistry.MustRegister(evictionsTotal) + legacyregistry.MustRegister(updateNodeHealthDuration) + legacyregistry.MustRegister(updateAllNodesHealthDuration) + }) +} diff --git a/pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller.go new file mode 100644 index 00000000000..9a7dcb444bb --- /dev/null +++ b/pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller.go @@ -0,0 +1,1361 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// The Controller sets tainted annotations on nodes. +// Tainted nodes should not be used for new work loads and +// some effort should be given to getting existing work +// loads off of tainted nodes. 
+ +package nodelifecycle + +import ( + "context" + "fmt" + "sync" + "time" + + apps "k8s.io/api/apps/v1" + coordinationv1 "k8s.io/api/coordination/v1" + coordv1 "k8s.io/api/coordination/v1" + v1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/flowcontrol" + "k8s.io/client-go/util/workqueue" + nodetopology "k8s.io/component-helpers/node/topology" + "k8s.io/klog/v2" + kubeletapis "k8s.io/kubelet/pkg/apis" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/source" + + appconfig "github.com/openyurtio/openyurt/cmd/yurt-manager/app/config" + "github.com/openyurtio/openyurt/cmd/yurt-manager/names" + taintutils "github.com/openyurtio/openyurt/pkg/util/taints" + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/nodelifecycle/scheduler" + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util" + controllerutil "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/node" + nodeutil "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/node" +) + +func init() { + // Register prometheus metrics + Register() +} + +var ( + // UnreachableTaintTemplate is the taint for when a node becomes unreachable. 
+ UnreachableTaintTemplate = &v1.Taint{ + Key: v1.TaintNodeUnreachable, + Effect: v1.TaintEffectNoExecute, + } + + // NotReadyTaintTemplate is the taint for when a node is not ready for + // executing pods + NotReadyTaintTemplate = &v1.Taint{ + Key: v1.TaintNodeNotReady, + Effect: v1.TaintEffectNoExecute, + } + + // map {NodeConditionType: {ConditionStatus: TaintKey}} + // represents which NodeConditionType under which ConditionStatus should be + // tainted with which TaintKey + // for certain NodeConditionType, there are multiple {ConditionStatus,TaintKey} pairs + nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{ + v1.NodeReady: { + v1.ConditionFalse: v1.TaintNodeNotReady, + v1.ConditionUnknown: v1.TaintNodeUnreachable, + }, + v1.NodeMemoryPressure: { + v1.ConditionTrue: v1.TaintNodeMemoryPressure, + }, + v1.NodeDiskPressure: { + v1.ConditionTrue: v1.TaintNodeDiskPressure, + }, + v1.NodeNetworkUnavailable: { + v1.ConditionTrue: v1.TaintNodeNetworkUnavailable, + }, + v1.NodePIDPressure: { + v1.ConditionTrue: v1.TaintNodePIDPressure, + }, + } + + taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{ + v1.TaintNodeNotReady: v1.NodeReady, + v1.TaintNodeUnreachable: v1.NodeReady, + v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable, + v1.TaintNodeMemoryPressure: v1.NodeMemoryPressure, + v1.TaintNodeDiskPressure: v1.NodeDiskPressure, + v1.TaintNodePIDPressure: v1.NodePIDPressure, + } +) + +// ZoneState is the state of a given zone. 
+type ZoneState string + +const ( + stateInitial = ZoneState("Initial") + stateNormal = ZoneState("Normal") + stateFullDisruption = ZoneState("FullDisruption") + statePartialDisruption = ZoneState("PartialDisruption") +) + +const ( + // The amount of time the nodecontroller should sleep between retrying node health updates + retrySleepTime = 20 * time.Millisecond + nodeNameKeyIndex = "spec.nodeName" + // podUpdateWorkerSizes assumes that in most cases pod will be handled by monitorNodeHealth pass. + // Pod update workers will only handle lagging cache pods. 4 workers should be enough. + podUpdateWorkerSize = 4 +) + +// labelReconcileInfo lists Node labels to reconcile, and how to reconcile them. +// primaryKey and secondaryKey are keys of labels to reconcile. +// - If both keys exist, but their values don't match. Use the value from the +// primaryKey as the source of truth to reconcile. +// - If ensureSecondaryExists is true, and the secondaryKey does not +// exist, secondaryKey will be added with the value of the primaryKey. +var labelReconcileInfo = []struct { + primaryKey string + secondaryKey string + ensureSecondaryExists bool +}{ + { + // Reconcile the beta and the stable OS label using the stable label as the source of truth. + // TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels + primaryKey: v1.LabelOSStable, + secondaryKey: kubeletapis.LabelOS, + ensureSecondaryExists: true, + }, + { + // Reconcile the beta and the stable arch label using the stable label as the source of truth. 
+ // TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels + primaryKey: v1.LabelArchStable, + secondaryKey: kubeletapis.LabelArch, + ensureSecondaryExists: true, + }, +} + +type nodeHealthData struct { + probeTimestamp metav1.Time + readyTransitionTimestamp metav1.Time + status *v1.NodeStatus + lease *coordv1.Lease +} + +func (n *nodeHealthData) deepCopy() *nodeHealthData { + if n == nil { + return nil + } + return &nodeHealthData{ + probeTimestamp: n.probeTimestamp, + readyTransitionTimestamp: n.readyTransitionTimestamp, + status: n.status.DeepCopy(), + lease: n.lease.DeepCopy(), + } +} + +type nodeHealthMap struct { + lock sync.RWMutex + nodeHealths map[string]*nodeHealthData +} + +func newNodeHealthMap() *nodeHealthMap { + return &nodeHealthMap{ + nodeHealths: make(map[string]*nodeHealthData), + } +} + +// getDeepCopy - returns copy of node health data. +// It prevents data being changed after retrieving it from the map. +func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData { + n.lock.RLock() + defer n.lock.RUnlock() + return n.nodeHealths[name].deepCopy() +} + +func (n *nodeHealthMap) set(name string, data *nodeHealthData) { + n.lock.Lock() + defer n.lock.Unlock() + n.nodeHealths[name] = data +} + +type podUpdateItem struct { + namespace string + name string +} + +// ReconcileNodeLifeCycle is the controller that manages node's life cycle. +type ReconcileNodeLifeCycle struct { + client.Client + taintManager *scheduler.NoExecuteTaintManager + kubeClient clientset.Interface + + // This timestamp is to be used instead of LastProbeTime stored in Condition. We do this + // to avoid the problem with time skew across the cluster. 
+ now func() metav1.Time + + enterPartialDisruptionFunc func(nodeNum int) float32 + enterFullDisruptionFunc func(nodeNum int) float32 + computeZoneStateFunc func(nodeConditions []*v1.NodeCondition) (int, ZoneState) + + knownNodeSet map[string]*v1.Node + // per Node map storing last observed health together with a local time when it was observed. + nodeHealthMap *nodeHealthMap + + // evictorLock protects zoneNoExecuteTainter. + evictorLock sync.Mutex + // workers that are responsible for tainting nodes. + zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue + + nodesToRetry sync.Map + + zoneStates map[string]ZoneState + + getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error) + + recorder record.EventRecorder + + // Value controlling Controller monitoring period, i.e. how often does Controller + // check node health signal posted from kubelet. This value should be lower than + // nodeMonitorGracePeriod. + // TODO: Change node health monitor to watch based. + nodeMonitorPeriod time.Duration + + // When node is just created, e.g. cluster bootstrap or node creation, we give + // a longer grace period. + nodeStartupGracePeriod time.Duration + + // Controller will not proactively sync node health, but will monitor node + // health signal updated from kubelet. There are 2 kinds of node healthiness + // signals: NodeStatus and NodeLease. If it doesn't receive update for this amount + // of time, it will start posting "NodeReady==ConditionUnknown". The amount of + // time before which Controller start evicting pods is controlled via flag + // 'pod-eviction-timeout'. + // Note: be cautious when changing the constant, it must work with + // nodeStatusUpdateFrequency in kubelet and renewInterval in NodeLease + // controller. The node health signal update frequency is the minimal of the + // two. + // There are several constraints: + // 1.
nodeMonitorGracePeriod must be N times more than the node health signal + // update frequency, where N means number of retries allowed for kubelet to + // post node status/lease. It is pointless to make nodeMonitorGracePeriod + // be less than the node health signal update frequency, since there will + // only be fresh values from Kubelet at an interval of node health signal + // update frequency. + // 2. nodeMonitorGracePeriod can't be too large for user experience - larger + // value takes longer for user to see up-to-date node health. + nodeMonitorGracePeriod time.Duration + + // Number of workers Controller uses to process node monitor health updates. + // Defaults to scheduler.UpdateWorkerSize. + nodeUpdateWorkerSize int + + evictionLimiterQPS float32 + secondaryEvictionLimiterQPS float32 + largeClusterThreshold int32 + unhealthyZoneThreshold float32 + + nodeUpdateQueue workqueue.Interface + podUpdateQueue workqueue.RateLimitingInterface +} + +// Add creates a new NodeLifeCycle Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller +// and Start it when the Manager is Started.
+func Add(ctx context.Context, cfg *appconfig.CompletedConfig, mgr manager.Manager) error { + nc := newReconciler(cfg, mgr) + // Create a new controller + c, err := util.NewNoReconcileController(names.NodeLifeCycleController, mgr, controller.Options{}) + if err != nil { + return err + } + + podsPredicate := predicate.Funcs{ + CreateFunc: func(evt event.CreateEvent) bool { + pod := evt.Object.(*v1.Pod) + nc.podUpdated(nil, pod) + if nc.taintManager != nil { + nc.taintManager.PodUpdated(nil, pod) + } + return false + }, + UpdateFunc: func(evt event.UpdateEvent) bool { + prevPod := evt.ObjectOld.(*v1.Pod) + newPod := evt.ObjectNew.(*v1.Pod) + nc.podUpdated(prevPod, newPod) + if nc.taintManager != nil { + nc.taintManager.PodUpdated(prevPod, newPod) + } + return false + }, + DeleteFunc: func(evt event.DeleteEvent) bool { + pod := evt.Object.(*v1.Pod) + nc.podUpdated(pod, nil) + if nc.taintManager != nil { + nc.taintManager.PodUpdated(pod, nil) + } + return false + }, + GenericFunc: func(evt event.GenericEvent) bool { + return false + }, + } + c.Watch(&source.Kind{Type: &v1.Pod{}}, &handler.Funcs{}, podsPredicate) + if err := mgr.GetFieldIndexer().IndexField(context.TODO(), &v1.Pod{}, nodeNameKeyIndex, func(rawObj client.Object) []string { + pod, ok := rawObj.(*v1.Pod) + if !ok { + return []string{} + } + if len(pod.Spec.NodeName) == 0 { + return []string{} + } + return []string{pod.Spec.NodeName} + }); err != nil { + klog.Errorf("could not register spec.NodeName field indexers for nodelifecycle controller, %v", err) + return err + } + + nc.taintManager = scheduler.NewNoExecuteTaintManager(nc.recorder, nc.Client, nc.getPodsAssignedToNode) + nodesTaintManagerPredicate := predicate.Funcs{ + CreateFunc: func(evt event.CreateEvent) bool { + node := evt.Object.(*v1.Node).DeepCopy() + nc.taintManager.NodeUpdated(nil, node) + return false + }, + UpdateFunc: func(evt event.UpdateEvent) bool { + oldNode := evt.ObjectOld.(*v1.Node).DeepCopy() + newNode := 
evt.ObjectNew.(*v1.Node).DeepCopy() + nc.taintManager.NodeUpdated(oldNode, newNode) + return false + }, + DeleteFunc: func(evt event.DeleteEvent) bool { + node := evt.Object.(*v1.Node).DeepCopy() + nc.taintManager.NodeUpdated(node, nil) + return false + }, + GenericFunc: func(evt event.GenericEvent) bool { + return false + }, + } + c.Watch(&source.Kind{Type: &v1.Node{}}, &handler.Funcs{}, nodesTaintManagerPredicate) + + nodesUpdateQueuePredicate := predicate.Funcs{ + CreateFunc: func(evt event.CreateEvent) bool { + node := evt.Object.(*v1.Node) + nc.nodeUpdateQueue.Add(node.Name) + return false + }, + UpdateFunc: func(evt event.UpdateEvent) bool { + newNode := evt.ObjectNew.(*v1.Node) + nc.nodeUpdateQueue.Add(newNode.Name) + return false + }, + DeleteFunc: func(evt event.DeleteEvent) bool { + node := evt.Object.(*v1.Node) + nc.nodesToRetry.Delete(node.Name) + return false + }, + GenericFunc: func(evt event.GenericEvent) bool { + return false + }, + } + c.Watch(&source.Kind{Type: &v1.Node{}}, &handler.Funcs{}, nodesUpdateQueuePredicate) + c.Watch(&source.Kind{Type: &apps.DaemonSet{}}, &handler.Funcs{}) + c.Watch(&source.Kind{Type: &coordinationv1.Lease{}}, &handler.Funcs{}) + + go nc.Run(ctx, c.WaitForStarted) + return nil +} + +func GenGetPodsAssignedToNode(c client.Client) func(string) ([]*v1.Pod, error) { + return func(name string) ([]*v1.Pod, error) { + listOptions := &client.ListOptions{ + FieldSelector: fields.SelectorFromSet(fields.Set{ + nodeNameKeyIndex: name, + }), + } + + podList := &v1.PodList{} + err := c.List(context.TODO(), podList, listOptions) + if err != nil { + klog.Errorf("could not get podList for node(%s), %v", name, err) + return nil, err + } + + pods := make([]*v1.Pod, len(podList.Items)) + for i := range podList.Items { + pods[i] = &podList.Items[i] + } + return pods, nil + } +} + +func (nc *ReconcileNodeLifeCycle) InjectConfig(cfg *rest.Config) error { + kubeClient, err := clientset.NewForConfig(cfg) + if err != nil { + klog.Errorf("failed 
to create kube client, %v", err) + return err + } + nc.kubeClient = kubeClient + return nil +} + +// newReconciler returns a new reconcile.Reconciler +func newReconciler(cfg *appconfig.CompletedConfig, mgr manager.Manager) *ReconcileNodeLifeCycle { + nc := &ReconcileNodeLifeCycle{ + Client: mgr.GetClient(), + recorder: mgr.GetEventRecorderFor(names.NodeLifeCycleController), + now: metav1.Now, + knownNodeSet: make(map[string]*v1.Node), + nodeHealthMap: newNodeHealthMap(), + nodeUpdateWorkerSize: scheduler.UpdateWorkerSize, + zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), + nodesToRetry: sync.Map{}, + zoneStates: make(map[string]ZoneState), + nodeMonitorPeriod: metav1.Duration{Duration: 5 * time.Second}.Duration, + nodeStartupGracePeriod: cfg.ComponentConfig.NodeLifeCycleController.NodeStartupGracePeriod.Duration, + nodeMonitorGracePeriod: cfg.ComponentConfig.NodeLifeCycleController.NodeMonitorGracePeriod.Duration, + evictionLimiterQPS: cfg.ComponentConfig.NodeLifeCycleController.NodeEvictionRate, + secondaryEvictionLimiterQPS: cfg.ComponentConfig.NodeLifeCycleController.SecondaryNodeEvictionRate, + largeClusterThreshold: cfg.ComponentConfig.NodeLifeCycleController.LargeClusterSizeThreshold, + unhealthyZoneThreshold: cfg.ComponentConfig.NodeLifeCycleController.UnhealthyZoneThreshold, + nodeUpdateQueue: workqueue.NewNamed("node_lifecycle_controller"), + podUpdateQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"), + } + nc.getPodsAssignedToNode = GenGetPodsAssignedToNode(nc.Client) + nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc + nc.enterFullDisruptionFunc = nc.HealthyQPSFunc + nc.computeZoneStateFunc = nc.ComputeZoneState + + return nc +} + +// Run starts an asynchronous loop that monitors the status of cluster nodes. 
+func (nc *ReconcileNodeLifeCycle) Run(ctx context.Context, waitForControllerStarted func(ctx context.Context) bool) { + defer utilruntime.HandleCrash() + + // Close node update queue to cleanup go routine. + defer nc.nodeUpdateQueue.ShutDown() + defer nc.podUpdateQueue.ShutDown() + + klog.Info("Starting node controller") + defer klog.Info("Shutting down node controller") + + if !waitForControllerStarted(ctx) { + return + } + + go nc.taintManager.Run(ctx) + + // Start workers to reconcile labels and/or update NoSchedule taint for nodes. + for i := 0; i < scheduler.UpdateWorkerSize; i++ { + // Thanks to "workqueue", each worker just need to get item from queue, because + // the item is flagged when got from queue: if new event come, the new item will + // be re-queued until "Done", so no more than one worker handle the same item and + // no event missed. + go wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second) + } + + for i := 0; i < podUpdateWorkerSize; i++ { + go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second) + } + + // Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated + // taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints. + go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod) + + // Incorporate the results of node health signal pushed from kubelet to master. + go wait.UntilWithContext(ctx, func(ctx context.Context) { + if err := nc.monitorNodeHealth(ctx); err != nil { + klog.Errorf("Error monitoring node health, %v", err) + } + }, nc.nodeMonitorPeriod) + + <-ctx.Done() +} + +func (nc *ReconcileNodeLifeCycle) doNodeProcessingPassWorker(ctx context.Context) { + for { + obj, shutdown := nc.nodeUpdateQueue.Get() + // "nodeUpdateQueue" will be shutdown when "stopCh" closed; + // we do not need to re-check "stopCh" again. 
+ if shutdown { + return + } + nodeName := obj.(string) + if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil { + klog.ErrorS(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName)) + // TODO(k82cn): Add nodeName back to the queue + } + // TODO: re-evaluate whether there are any labels that need to be + // reconcile in 1.19. Remove this function if it's no longer necessary. + if err := nc.reconcileNodeLabels(ctx, nodeName); err != nil { + klog.ErrorS(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName)) + // TODO(yujuhong): Add nodeName back to the queue + } + nc.nodeUpdateQueue.Done(nodeName) + } +} + +func (nc *ReconcileNodeLifeCycle) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error { + //node, err := nc.nodeLister.Get(nodeName) + node := new(v1.Node) + err := nc.Client.Get(ctx, types.NamespacedName{Name: nodeName}, node) + if err != nil { + // If node not found, just ignore it. + if apierrors.IsNotFound(err) { + return nil + } + return err + } + + // Map node's condition to Taints. + var taints []v1.Taint + for _, condition := range node.Status.Conditions { + if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found { + if taintKey, found := taintMap[condition.Status]; found { + taints = append(taints, v1.Taint{ + Key: taintKey, + Effect: v1.TaintEffectNoSchedule, + }) + } + } + } + if node.Spec.Unschedulable { + // If unschedulable, append related taint. + taints = append(taints, v1.Taint{ + Key: v1.TaintNodeUnschedulable, + Effect: v1.TaintEffectNoSchedule, + }) + } + + // Get exist taints of node. + nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool { + // only NoSchedule taints are candidates to be compared with "taints" later + if t.Effect != v1.TaintEffectNoSchedule { + return false + } + // Find unschedulable taint of node. 
+ if t.Key == v1.TaintNodeUnschedulable { + return true + } + // Find node condition taints of node. + _, found := taintKeyToNodeConditionMap[t.Key] + return found + }) + taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints) + // If nothing to add or delete, return nil directly. + if len(taintsToAdd) == 0 && len(taintsToDel) == 0 { + return nil + } + if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) { + return fmt.Errorf("failed to swap taints of node %+v", node) + } + return nil +} + +func (nc *ReconcileNodeLifeCycle) doNoExecuteTaintingPass(ctx context.Context) { + // Extract out the keys of the map in order to not hold + // the evictorLock for the entire function and hold it + // only when necessary. + var zoneNoExecuteTainterKeys []string + func() { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + + zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter)) + for k := range nc.zoneNoExecuteTainter { + zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k) + } + }() + for _, k := range zoneNoExecuteTainterKeys { + var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue + func() { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + // Extracting the value without checking if the key + // exists or not is safe to do here since zones do + // not get removed, and consequently pod evictors for + // these zones also do not get removed, only added. + zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k] + }() + // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
+ zoneNoExecuteTainterWorker.Try(func(value scheduler.TimedValue) (bool, time.Duration) { + //node, err := nc.nodeLister.Get(value.Value) + node := new(v1.Node) + err := nc.Client.Get(ctx, types.NamespacedName{Name: value.Value}, node) + if apierrors.IsNotFound(err) { + klog.InfoS("Node no longer present in nodeLister", "node", klog.KRef("", value.Value)) + return true, 0 + } else if err != nil { + klog.InfoS("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err) + // retry in 50 millisecond + return false, 50 * time.Millisecond + } + _, condition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady) + // Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive. + taintToAdd := v1.Taint{} + oppositeTaint := v1.Taint{} + switch condition.Status { + case v1.ConditionFalse: + taintToAdd = *NotReadyTaintTemplate + oppositeTaint = *UnreachableTaintTemplate + case v1.ConditionUnknown: + taintToAdd = *UnreachableTaintTemplate + oppositeTaint = *NotReadyTaintTemplate + default: + // It seems that the Node is ready again, so there's no need to taint it. + klog.V(4).InfoS("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value)) + return true, 0 + } + result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node) + if result { + // Count the number of evictions. + zone := nodetopology.GetZoneKey(node) + evictionsTotal.WithLabelValues(zone).Inc() + } + + return result, 0 + }) + } +} + +// monitorNodeHealth verifies node health are constantly updated by kubelet, and if not, post "NodeReady==ConditionUnknown". 
+// This function will +// - add nodes which are not ready or not reachable for a long period of time to a rate-limited +// queue so that NoExecute taints can be added by the goroutine running the doNoExecuteTaintingPass function, +// - update the PodReady condition Pods according to the state of the Node Ready condition. +func (nc *ReconcileNodeLifeCycle) monitorNodeHealth(ctx context.Context) error { + start := nc.now() + defer func() { + updateAllNodesHealthDuration.Observe(time.Since(start.Time).Seconds()) + }() + + // We are listing nodes from local cache as we can tolerate some small delays + // comparing to state from etcd and there is eventual consistency anyway. + //nodes, err := nc.nodeLister.List(labels.Everything()) + nodeList := new(v1.NodeList) + err := nc.Client.List(ctx, nodeList, &client.ListOptions{}) + if err != nil { + return err + } + newNodeSlice := make([]*v1.Node, len(nodeList.Items), len(nodeList.Items)) + for i := range nodeList.Items { + newNodeSlice[i] = &nodeList.Items[i] + } + + // xiyuan added: for cdn , the vc nodes always keep ready and do not send heartbeat + // nodes of this type need to skip monitoring + var nodes []*v1.Node + for j := range newNodeSlice { + if newNodeSlice[j].Annotations != nil && newNodeSlice[j].Annotations[scheduler.AnnotationKeyVirtualClusterNode] == "true" { + klog.V(5).Infof("Skip monitoring node: %v in virtual cluster", newNodeSlice[j].Name) + } else { + nodes = append(nodes, newNodeSlice[j]) + } + } + + added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes) + for i := range newZoneRepresentatives { + nc.addPodEvictorForNewZone(newZoneRepresentatives[i]) + } + + for i := range added { + klog.V(1).InfoS("Controller observed a new Node", "node", klog.KRef("", added[i].Name)) + controllerutil.RecordNodeEvent(ctx, nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name)) + nc.knownNodeSet[added[i].Name] = 
added[i] + nc.addPodEvictorForNewZone(added[i]) + nc.markNodeAsReachable(ctx, added[i]) + } + + for i := range deleted { + klog.V(1).InfoS("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name)) + controllerutil.RecordNodeEvent(ctx, nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name)) + delete(nc.knownNodeSet, deleted[i].Name) + } + + var zoneToNodeConditionsLock sync.Mutex + zoneToNodeConditions := map[string][]*v1.NodeCondition{} + updateNodeFunc := func(piece int) { + start := nc.now() + defer func() { + updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds()) + }() + + var observedReadyCondition v1.NodeCondition + var currentReadyCondition *v1.NodeCondition + node := nodes[piece].DeepCopy() + + if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) { + var err error + _, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node) + if err == nil { + return true, nil + } + name := node.Name + node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Failed while getting a Node to retry updating node health. 
Probably Node was deleted", "node", klog.KRef("", name)) + return false, err + } + return false, nil + }); err != nil { + klog.ErrorS(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node)) + return + } + + // Some nodes may be excluded from disruption checking + if !isNodeExcludedFromDisruptionChecks(node) { + zoneToNodeConditionsLock.Lock() + zoneToNodeConditions[nodetopology.GetZoneKey(node)] = append(zoneToNodeConditions[nodetopology.GetZoneKey(node)], currentReadyCondition) + zoneToNodeConditionsLock.Unlock() + } + + if currentReadyCondition != nil { + pods, err := nc.getPodsAssignedToNode(node.Name) + if err != nil { + utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %v", node.Name, err)) + if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue { + // If error happened during node status transition (Ready -> NotReady) + // we need to mark node for retry to force MarkPodsNotReady execution + // in the next iteration. + nc.nodesToRetry.Store(node.Name, struct{}{}) + } + return + } + nc.processTaintBaseEviction(ctx, node, &observedReadyCondition) + + _, needsRetry := nc.nodesToRetry.Load(node.Name) + switch { + case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue: + // Report node event only once when status changed. + controllerutil.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady") + fallthrough + case needsRetry && observedReadyCondition.Status != v1.ConditionTrue: + // Do not mark the pods NotReady when the node has the pod binding annotation.
+ if nodeutil.IsPodBoundenToNode(node) { + return + } + + if err = controllerutil.MarkPodsNotReady(ctx, nc.Client, nc.recorder, pods, node.Name); err != nil { + utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err)) + nc.nodesToRetry.Store(node.Name, struct{}{}) + return + } + } + } + nc.nodesToRetry.Delete(node.Name) + } + + // Marking the pods not ready on a node requires looping over them and + // updating each pod's status one at a time. This is performed serially, and + // can take a while if we're processing each node serially as well. So we + // process them with bounded concurrency instead, since most of the time is + // spent waiting on io. + workqueue.ParallelizeUntil(ctx, nc.nodeUpdateWorkerSize, len(nodes), updateNodeFunc) + + nc.handleDisruption(ctx, zoneToNodeConditions, nodes) + + return nil +} + +func (nc *ReconcileNodeLifeCycle) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) { + decisionTimestamp := nc.now() + // Check eviction timeout against decisionTimestamp + switch observedReadyCondition.Status { + case v1.ConditionFalse: + // We want to update the taint straight away if Node is already tainted with the UnreachableTaint + if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { + taintToAdd := *NotReadyTaintTemplate + if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) { + klog.ErrorS(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle") + } + } else if nc.markNodeForTainting(node, v1.ConditionFalse) { + klog.V(2).InfoS("Node is NotReady. 
Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp) + } + case v1.ConditionUnknown: + // We want to update the taint straight away if Node is already tainted with the NotReadyTaint + if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { + taintToAdd := *UnreachableTaintTemplate + if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) { + klog.ErrorS(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle") + } + } else if nc.markNodeForTainting(node, v1.ConditionUnknown) { + klog.V(2).InfoS("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp) + } + case v1.ConditionTrue: + removed, err := nc.markNodeAsReachable(ctx, node) + if err != nil { + klog.ErrorS(err, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node)) + } + if removed { + klog.V(2).InfoS("Node is healthy again, removing all taints", "node", klog.KObj(node)) + } + } +} + +// labelNodeDisruptionExclusion is a label on nodes that controls whether they are +// excluded from being considered for disruption checks by the node controller. +const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption" + +func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool { + if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok { + return true + } + return false +} + +// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to +// which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred.
+func (nc *ReconcileNodeLifeCycle) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) { + nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name) + defer func() { + nc.nodeHealthMap.set(node.Name, nodeHealth) + }() + + var gracePeriod time.Duration + var observedReadyCondition v1.NodeCondition + _, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady) + if currentReadyCondition == nil { + // If ready condition is nil, then kubelet (or nodecontroller) never posted node status. + // A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime is set + // to node.CreationTimestamp to avoid handle the corner case. + observedReadyCondition = v1.NodeCondition{ + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: node.CreationTimestamp, + LastTransitionTime: node.CreationTimestamp, + } + gracePeriod = nc.nodeStartupGracePeriod + if nodeHealth != nil { + nodeHealth.status = &node.Status + } else { + nodeHealth = &nodeHealthData{ + status: &node.Status, + probeTimestamp: node.CreationTimestamp, + readyTransitionTimestamp: node.CreationTimestamp, + } + } + } else { + // If ready condition is not nil, make a copy of it, since we may modify it in place later. 
+ observedReadyCondition = *currentReadyCondition + gracePeriod = nc.nodeMonitorGracePeriod + } + // There are following cases to check: + // - both saved and new status have no Ready Condition set - we leave everything as it is, + // - saved status have no Ready Condition, but current one does - Controller was restarted with Node data already present in etcd, + // - saved status have some Ready Condition, but current one does not - it's an error, but we fill it up because that's probably a good thing to do, + // - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be + // unresponsive, so we leave it as it is, + // - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State - + // everything's in order, no transition occurred, we update only probeTimestamp, + // - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition State - + // Ready Condition changed it state since we last seen it, so we update both probeTimestamp and readyTransitionTimestamp. + // TODO: things to consider: + // - if 'LastProbeTime' have gone back in time its probably an error, currently we ignore it, + // - currently only correct Ready State transition outside of Node Controller is marking it ready by Kubelet, we don't check + // if that's the case, but it does not seem necessary. + var savedCondition *v1.NodeCondition + var savedLease *coordv1.Lease + if nodeHealth != nil { + _, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady) + savedLease = nodeHealth.lease + } + + if nodeHealth == nil { + klog.InfoS("Missing timestamp for Node. 
Assuming now as a timestamp", "node", klog.KObj(node)) + nodeHealth = &nodeHealthData{ + status: &node.Status, + probeTimestamp: nc.now(), + readyTransitionTimestamp: nc.now(), + } + } else if savedCondition == nil && currentReadyCondition != nil { + klog.V(1).InfoS("Creating timestamp entry for newly observed Node", "node", klog.KObj(node)) + nodeHealth = &nodeHealthData{ + status: &node.Status, + probeTimestamp: nc.now(), + readyTransitionTimestamp: nc.now(), + } + } else if savedCondition != nil && currentReadyCondition == nil { + klog.ErrorS(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node)) + // TODO: figure out what to do in this case. For now we do the same thing as above. + nodeHealth = &nodeHealthData{ + status: &node.Status, + probeTimestamp: nc.now(), + readyTransitionTimestamp: nc.now(), + } + } else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime { + var transitionTime metav1.Time + // If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now", + // otherwise we leave it as it is. + if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime { + klog.V(3).InfoS("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition) + transitionTime = nc.now() + } else { + transitionTime = nodeHealth.readyTransitionTimestamp + } + if loggerV := klog.V(5); loggerV.Enabled() { + loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status) + } else { + klog.V(3).InfoS("Node ReadyCondition updated. 
Updating timestamp", "node", klog.KObj(node)) + } + nodeHealth = &nodeHealthData{ + status: &node.Status, + probeTimestamp: nc.now(), + readyTransitionTimestamp: transitionTime, + } + } + // Always update the probe time if node lease is renewed. + // Note: If kubelet never posted the node status, but continues renewing the + // heartbeat leases, the node controller will assume the node is healthy and + // take no action. + //observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name) + observedLease := new(coordinationv1.Lease) + err := nc.Get(ctx, types.NamespacedName{Namespace: v1.NamespaceNodeLease, Name: node.Name}, observedLease) + if err == nil && observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) { + nodeHealth.lease = observedLease + nodeHealth.probeTimestamp = nc.now() + } + + if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) { + // NodeReady condition or lease was last set longer ago than gracePeriod, so + // update it to Unknown (regardless of its current value) in the master. + + nodeConditionTypes := []v1.NodeConditionType{ + v1.NodeReady, + v1.NodeMemoryPressure, + v1.NodeDiskPressure, + v1.NodePIDPressure, + // We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level. 
+ // v1.NodeNetworkUnavailable, + } + + nowTimestamp := nc.now() + for _, nodeConditionType := range nodeConditionTypes { + _, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType) + if currentCondition == nil { + klog.V(2).InfoS("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node)) + node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{ + Type: nodeConditionType, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: node.CreationTimestamp, + LastTransitionTime: nowTimestamp, + }) + } else { + klog.V(2).InfoS("Node hasn't been updated", + "node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition) + if currentCondition.Status != v1.ConditionUnknown { + currentCondition.Status = v1.ConditionUnknown + currentCondition.Reason = "NodeStatusUnknown" + currentCondition.Message = "Kubelet stopped posting node status." + currentCondition.LastTransitionTime = nowTimestamp + } + } + } + // We need to update currentReadyCondition due to its value potentially changed. 
+ _, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady) + + if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) { + //if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil { + if err := nc.Status().Update(ctx, node, &client.UpdateOptions{}); err != nil { + klog.ErrorS(err, "Error updating node", "node", klog.KObj(node)) + return gracePeriod, observedReadyCondition, currentReadyCondition, err + } + nodeHealth = &nodeHealthData{ + status: &node.Status, + probeTimestamp: nodeHealth.probeTimestamp, + readyTransitionTimestamp: nc.now(), + lease: observedLease, + } + return gracePeriod, observedReadyCondition, currentReadyCondition, nil + } + } + + return gracePeriod, observedReadyCondition, currentReadyCondition, nil +} + +func (nc *ReconcileNodeLifeCycle) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) { + newZoneStates := map[string]ZoneState{} + allAreFullyDisrupted := true + for k, v := range zoneToNodeConditions { + zoneSize.WithLabelValues(k).Set(float64(len(v))) + unhealthy, newState := nc.computeZoneStateFunc(v) + zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v))) + unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy)) + if newState != stateFullDisruption { + allAreFullyDisrupted = false + } + newZoneStates[k] = newState + if _, had := nc.zoneStates[k]; !had { + klog.ErrorS(nil, "Setting initial state for unseen zone", "zone", k) + nc.zoneStates[k] = stateInitial + } + } + + allWasFullyDisrupted := true + for k, v := range nc.zoneStates { + if _, have := zoneToNodeConditions[k]; !have { + zoneSize.WithLabelValues(k).Set(0) + zoneHealth.WithLabelValues(k).Set(100) + unhealthyNodes.WithLabelValues(k).Set(0) + delete(nc.zoneStates, k) + continue + } + if v != stateFullDisruption { + allWasFullyDisrupted = false + break + } + } + + // At least one node was 
responding in previous pass or in the current pass. Semantics is as follows: + // - if the new state is "partialDisruption" we call a user defined function that returns a new limiter to use, + // - if the new state is "normal" we resume normal operation (go back to default limiter settings), + // - if new state is "fullDisruption" we restore normal eviction rate, + // - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions. + if !allAreFullyDisrupted || !allWasFullyDisrupted { + // We're switching to full disruption mode + if allAreFullyDisrupted { + klog.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode") + for i := range nodes { + _, err := nc.markNodeAsReachable(ctx, nodes[i]) + if err != nil { + klog.ErrorS(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i])) + } + } + // We stop all evictions. + for k := range nc.zoneStates { + nc.zoneNoExecuteTainter[k].SwapLimiter(0) + } + for k := range nc.zoneStates { + nc.zoneStates[k] = stateFullDisruption + } + // All rate limiters are updated, so we can return early here. + return + } + // We're exiting full disruption mode + if allWasFullyDisrupted { + klog.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode") + // When exiting disruption mode update probe timestamps on all Nodes. + now := nc.now() + for i := range nodes { + v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name) + v.probeTimestamp = now + v.readyTransitionTimestamp = now + nc.nodeHealthMap.set(nodes[i].Name, v) + } + // We reset all rate limiters to settings appropriate for the given state. 
+ for k := range nc.zoneStates { + nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k]) + nc.zoneStates[k] = newZoneStates[k] + } + return + } + // We know that there's at least one not-fully disrupted so, + // we can use default behavior for rate limiters + for k, v := range nc.zoneStates { + newState := newZoneStates[k] + if v == newState { + continue + } + klog.InfoS("Controller detected that zone is now in new state", "zone", k, "newState", newState) + nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState) + nc.zoneStates[k] = newState + } + } +} + +func (nc *ReconcileNodeLifeCycle) podUpdated(oldPod, newPod *v1.Pod) { + if newPod == nil { + return + } + if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) { + podItem := podUpdateItem{newPod.Namespace, newPod.Name} + nc.podUpdateQueue.Add(podItem) + } +} + +func (nc *ReconcileNodeLifeCycle) doPodProcessingWorker(ctx context.Context) { + for { + obj, shutdown := nc.podUpdateQueue.Get() + // "podUpdateQueue" will be shutdown when "stopCh" closed; + // we do not need to re-check "stopCh" again. + if shutdown { + return + } + + podItem := obj.(podUpdateItem) + nc.processPod(ctx, podItem) + } +} + +// processPod is processing events of assigning pods to nodes. In particular: +// 1. for NodeReady=true node, taint eviction for this pod will be cancelled +// 2. for NodeReady=false or unknown node, taint eviction of pod will happen and pod will be marked as not ready +// 3. if node doesn't exist in cache, it will be skipped. 
+func (nc *ReconcileNodeLifeCycle) processPod(ctx context.Context, podItem podUpdateItem) { + defer nc.podUpdateQueue.Done(podItem) + //pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name) + pod := new(v1.Pod) + err := nc.Client.Get(ctx, types.NamespacedName{Namespace: podItem.namespace, Name: podItem.name}, pod) + if err != nil { + if apierrors.IsNotFound(err) { + // If the pod was deleted, there is no need to requeue. + return + } + klog.InfoS("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err) + nc.podUpdateQueue.AddRateLimited(podItem) + return + } + + nodeName := pod.Spec.NodeName + + nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName) + if nodeHealth == nil { + // Node data is not gathered yet or node has been removed in the meantime. + return + } + + node := new(v1.Node) + //_, err = nc.nodeLister.Get(nodeName) + err = nc.Client.Get(ctx, types.NamespacedName{Name: nodeName}, node) + if err != nil { + klog.InfoS("Failed to read node", "node", klog.KRef("", nodeName), "err", err) + nc.podUpdateQueue.AddRateLimited(podItem) + return + } + + // Ignore mark the pods NotReady if the node has bounded to node. + if nodeutil.IsPodBoundenToNode(node) { + return + } + + _, currentReadyCondition := controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady) + if currentReadyCondition == nil { + // Lack of NodeReady condition may only happen after node addition (or if it will be maliciously deleted). + // In both cases, the pod will be handled correctly (evicted if needed) during processing + // of the next node update event. 
+ return + } + + pods := []*v1.Pod{pod} + if currentReadyCondition.Status != v1.ConditionTrue { + if err := controllerutil.MarkPodsNotReady(ctx, nc.Client, nc.recorder, pods, nodeName); err != nil { + klog.InfoS("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err) + nc.podUpdateQueue.AddRateLimited(podItem) + } + } +} + +func (nc *ReconcileNodeLifeCycle) setLimiterInZone(zone string, zoneSize int, state ZoneState) { + switch state { + case stateNormal: + nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS) + case statePartialDisruption: + nc.zoneNoExecuteTainter[zone].SwapLimiter( + nc.enterPartialDisruptionFunc(zoneSize)) + case stateFullDisruption: + nc.zoneNoExecuteTainter[zone].SwapLimiter( + nc.enterFullDisruptionFunc(zoneSize)) + } +} + +// classifyNodes classifies the allNodes to three categories: +// 1. added: the nodes that in 'allNodes', but not in 'knownNodeSet' +// 2. deleted: the nodes that in 'knownNodeSet', but not in 'allNodes' +// 3. newZoneRepresentatives: the nodes that in both 'knownNodeSet' and 'allNodes', but no zone states +func (nc *ReconcileNodeLifeCycle) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) { + for i := range allNodes { + if _, has := nc.knownNodeSet[allNodes[i].Name]; !has { + added = append(added, allNodes[i]) + } else { + // Currently, we only consider new zone as updated. + zone := nodetopology.GetZoneKey(allNodes[i]) + if _, found := nc.zoneStates[zone]; !found { + newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i]) + } + } + } + + // If there's a difference between lengths of known Nodes and observed nodes + // we must have removed some Node. 
+ if len(nc.knownNodeSet)+len(added) != len(allNodes) { + knowSetCopy := map[string]*v1.Node{} + for k, v := range nc.knownNodeSet { + knowSetCopy[k] = v + } + for i := range allNodes { + delete(knowSetCopy, allNodes[i].Name) + } + for i := range knowSetCopy { + deleted = append(deleted, knowSetCopy[i]) + } + } + return +} + +// HealthyQPSFunc returns the default value for cluster eviction rate - we take +// nodeNum for consistency with ReducedQPSFunc. +func (nc *ReconcileNodeLifeCycle) HealthyQPSFunc(_ int) float32 { + return nc.evictionLimiterQPS +} + +// ReducedQPSFunc returns the QPS for when a the cluster is large make +// evictions slower, if they're small stop evictions altogether. +func (nc *ReconcileNodeLifeCycle) ReducedQPSFunc(nodeNum int) float32 { + if int32(nodeNum) > nc.largeClusterThreshold { + return nc.secondaryEvictionLimiterQPS + } + return 0 +} + +// addPodEvictorForNewZone checks if new zone appeared, and if so add new evictor. +func (nc *ReconcileNodeLifeCycle) addPodEvictorForNewZone(node *v1.Node) { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + zone := nodetopology.GetZoneKey(node) + if _, found := nc.zoneStates[zone]; !found { + nc.zoneStates[zone] = stateInitial + nc.zoneNoExecuteTainter[zone] = + scheduler.NewRateLimitedTimedQueue( + flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) + // Init the metric for the new zone. 
+ klog.InfoS("Initializing eviction metric for zone", "zone", zone) + evictionsTotal.WithLabelValues(zone).Add(0) + } +} + +func (nc *ReconcileNodeLifeCycle) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + if status == v1.ConditionFalse { + if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { + nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name) + } + } + + if status == v1.ConditionUnknown { + if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { + nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name) + } + } + + return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID)) +} + +func (nc *ReconcileNodeLifeCycle) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) { + err := controllerutil.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate) + if err != nil { + klog.ErrorS(err, "Failed to remove taint from node", "node", klog.KObj(node)) + return false, err + } + err = controllerutil.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate) + if err != nil { + klog.ErrorS(err, "Failed to remove taint from node", "node", klog.KObj(node)) + return false, err + } + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + + return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name), nil +} + +// ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone. 
+// The zone is considered: +// - fullyDisrupted if there're no Ready Nodes, +// - partiallyDisrupted if at least than nc.unhealthyZoneThreshold percent of Nodes are not Ready, +// - normal otherwise +func (nc *ReconcileNodeLifeCycle) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) { + readyNodes := 0 + notReadyNodes := 0 + for i := range nodeReadyConditions { + if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue { + readyNodes++ + } else { + notReadyNodes++ + } + } + switch { + case readyNodes == 0 && notReadyNodes > 0: + return notReadyNodes, stateFullDisruption + case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold: + return notReadyNodes, statePartialDisruption + default: + return notReadyNodes, stateNormal + } +} + +// reconcileNodeLabels reconciles node labels. +func (nc *ReconcileNodeLifeCycle) reconcileNodeLabels(ctx context.Context, nodeName string) error { + //node, err := nc.nodeLister.Get(nodeName) + node := new(v1.Node) + err := nc.Client.Get(ctx, types.NamespacedName{Name: nodeName}, node) + if err != nil { + // If node not found, just ignore it. + if apierrors.IsNotFound(err) { + return nil + } + return err + } + + if node.Labels == nil { + // Nothing to reconcile. + return nil + } + + labelsToUpdate := map[string]string{} + for _, r := range labelReconcileInfo { + primaryValue, primaryExists := node.Labels[r.primaryKey] + secondaryValue, secondaryExists := node.Labels[r.secondaryKey] + + if !primaryExists { + // The primary label key does not exist. This should not happen + // within our supported version skew range, when no external + // components/factors modifying the node object. Ignore this case. + continue + } + if secondaryExists && primaryValue != secondaryValue { + // Secondary label exists, but not consistent with the primary + // label. Need to reconcile. 
+ labelsToUpdate[r.secondaryKey] = primaryValue + + } else if !secondaryExists && r.ensureSecondaryExists { + // Apply secondary label based on primary label. + labelsToUpdate[r.secondaryKey] = primaryValue + } + } + + if len(labelsToUpdate) == 0 { + return nil + } + if !controllerutil.AddOrUpdateLabelsOnNode(ctx, nc.kubeClient, labelsToUpdate, node) { + return fmt.Errorf("failed update labels for node %+v", node) + } + return nil +} diff --git a/pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller_test.go new file mode 100644 index 00000000000..84e9fd39d5b --- /dev/null +++ b/pkg/yurtmanager/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -0,0 +1,3467 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package nodelifecycle + +import ( + "context" + "fmt" + goruntime "runtime" + "strings" + "sync" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + coordv1 "k8s.io/api/coordination/v1" + v1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + testcore "k8s.io/client-go/testing" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + kubeletapis "k8s.io/kubelet/pkg/apis" + "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" + + taintutils "github.com/openyurtio/openyurt/pkg/util/taints" + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/nodelifecycle/scheduler" + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/testutil" + nodeutil "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/node" +) + +const ( + testNodeMonitorGracePeriod = 40 * time.Second + testNodeStartupGracePeriod = 60 * time.Second + testNodeMonitorPeriod = 5 * time.Second + testRateLimiterQPS = float32(100000) + testLargeClusterThreshold = 20 + testUnhealthyThreshold = float32(0.55) +) + +func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease { + return &coordv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Namespace: v1.NamespaceNodeLease, + }, + Spec: coordv1.LeaseSpec{ + HolderIdentity: pointer.String(nodeName), + RenewTime: &renewTime, + }, + } +} + +func newNodeLifecycleControllerFromClient( + ctx context.Context, + handler *testutil.ImprovedFakeNodeHandler, + evictionLimiterQPS float32, + secondaryEvictionLimiterQPS float32, + largeClusterThreshold int32, + unhealthyZoneThreshold float32, + nodeMonitorGracePeriod time.Duration, + nodeStartupGracePeriod time.Duration, + nodeMonitorPeriod time.Duration, +) (*ReconcileNodeLifeCycle, error) { + + nc := &ReconcileNodeLifeCycle{ + Client: handler, + kubeClient: handler, + recorder: testutil.NewFakeRecorder(), + now: metav1.Now, + 
knownNodeSet: make(map[string]*v1.Node), + nodeHealthMap: newNodeHealthMap(), + nodeUpdateWorkerSize: scheduler.UpdateWorkerSize, + zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), + nodesToRetry: sync.Map{}, + zoneStates: make(map[string]ZoneState), + nodeMonitorPeriod: nodeMonitorPeriod, + nodeStartupGracePeriod: nodeStartupGracePeriod, + nodeMonitorGracePeriod: nodeMonitorGracePeriod, + evictionLimiterQPS: evictionLimiterQPS, + secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, + largeClusterThreshold: largeClusterThreshold, + unhealthyZoneThreshold: unhealthyZoneThreshold, + nodeUpdateQueue: workqueue.NewNamed("node_lifecycle_controller"), + podUpdateQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"), + } + nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc + nc.enterFullDisruptionFunc = nc.HealthyQPSFunc + nc.computeZoneStateFunc = nc.ComputeZoneState + nc.getPodsAssignedToNode = GenGetPodsAssignedToNode(handler) + + return nc, nil +} + +func TestMonitorNodeHealth(t *testing.T) { + fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) + timeToPass := 60 * time.Minute + healthyNodeNewStatus := v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.NewTime(fakeNow.Add(timeToPass)), + LastTransitionTime: fakeNow, + }, + }, + } + unhealthyNodeNewStatus := v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + // Node status was updated by nodecontroller timeToPass ago + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + } + + tests := map[string]struct { + nodeList []*v1.Node + updatedNodeStatuses map[string]v1.NodeStatus + expectedInitialStates map[string]ZoneState + expectedFollowingStates map[string]ZoneState + }{ + "No Disruption: Node created recently without failure domain labels (happens only at cluster 
startup)": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": healthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + "": stateNormal, + }, + expectedFollowingStates: map[string]ZoneState{ + "": stateNormal, + }, + }, + "No Disruption: Initially both zones down, one comes back": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone2", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone2", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node1": healthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): 
stateFullDisruption, + testutil.CreateZoneID("region1", "zone2"): stateFullDisruption, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + testutil.CreateZoneID("region1", "zone2"): stateNormal, + }, + }, + "Partial Disruption: Nodes created recently without status conditions (happens only at cluster startup)": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node3", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node1": unhealthyNodeNewStatus, + "node2": unhealthyNodeNewStatus, + "node3": healthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + // we've not received any status for the nodes yet + // so the controller assumes the zones is fully disrupted + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + }, + 
expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, + }, + }, + "Partial Disruption: one Node failed leading to the number of healthy Nodes to exceed the configured threshold": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node3", + 
CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node4", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node1": unhealthyNodeNewStatus, + "node2": unhealthyNodeNewStatus, + "node3": healthyNodeNewStatus, + "node4": healthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateNormal, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, + }, + }, + "Full Disruption: the zone has less than 2 Nodes down, the last healthy Node has failed": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: 
v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node1": unhealthyNodeNewStatus, + "node2": unhealthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + // if a zone has a number of unhealthy nodes less or equal to 2 + // the controller will consider it normal regardless on + // the ration of healthy vs unhealthy nodes + testutil.CreateZoneID("region1", "zone1"): stateNormal, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + }, + }, + "Full Disruption: all the Nodes in one zone are down": { + nodeList: []*v1.Node{ + { + ObjectMeta: 
metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone2", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone2", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node1": healthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + testutil.CreateZoneID("region1", "zone2"): stateNormal, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + testutil.CreateZoneID("region1", "zone2"): stateNormal, + }, + }, + "Full Disruption: all the Nodes in both the zones are down": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: 
[]v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region2", + v1.LabelTopologyZone: "zone2", + v1.LabelFailureDomainBetaRegion: "region2", + v1.LabelFailureDomainBetaZone: "zone2", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node1": unhealthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, + }, + }, + "Full Disruption: Ready condition removed from the Node": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + + updatedNodeStatuses: map[string]v1.NodeStatus{ + 
"node0": { + Conditions: []v1.NodeCondition{}, + }, + }, + expectedInitialStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateNormal, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + }, + }, + "Full Disruption: the only available Node has the node.kubernetes.io/exclude-disruption label": { + nodeList: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-master", + CreationTimestamp: fakeNow, + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + labelNodeDisruptionExclusion: "", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + updatedNodeStatuses: map[string]v1.NodeStatus{ + "node0": unhealthyNodeNewStatus, + "node-master": healthyNodeNewStatus, + }, + expectedInitialStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + }, + expectedFollowingStates: map[string]ZoneState{ + testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, + }, + }, + } + + for testName, tt := range tests { + 
t.Run(testName, func(t *testing.T) { + fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(tt.nodeList, nil) + nodeController, _ := newNodeLifecycleControllerFromClient( + context.TODO(), + fakeNodeHandler, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod) + nodeController.recorder = testutil.NewFakeRecorder() + nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 { + return testRateLimiterQPS + } + nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 { + return testRateLimiterQPS + } + + syncAndDiffZoneState := func(wanted map[string]ZoneState) { + //if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { + // t.Errorf("unexpected error: %v", err) + //} + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Errorf("unexpected error: %v", err) + } + if diff := cmp.Diff(wanted, nodeController.zoneStates); diff != "" { + t.Errorf("unexpected zone state (-want +got):\n%s", diff) + } + } + + // initial zone state + nodeController.now = func() metav1.Time { return fakeNow } + syncAndDiffZoneState(tt.expectedInitialStates) + + // following zone state + nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } + //for i := range tt.updatedNodeStatuses { + // fakeNodeHandler.Existing[i].Status = tt.updatedNodeStatuses[i] + //} + if err := fakeNodeHandler.UpdateNodeStatuses(tt.updatedNodeStatuses); err != nil { + t.Errorf("failed to update node status, %v", err) + } + syncAndDiffZoneState(tt.expectedFollowingStates) + }) + } +} + +func TestPodStatusChange(t *testing.T) { + fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) + + // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady + // we need second healthy node in tests. 
Because of how the tests are written we need to update + // the status of this Node. + healthyNodeNewStatus := v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + // Node status has just been updated, and is NotReady for 10min. + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + } + + // Node created long time ago, node controller posted Unknown for a long period of time. + table := []struct { + nodes []*v1.Node + pods *v1.PodList + timeToPass time.Duration + newNodeStatus v1.NodeStatus + secondNodeNewStatus v1.NodeStatus + expectedPodUpdate bool + expectedReason string + description string + }{ + { + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + timeToPass: 60 * 
time.Minute, + newNodeStatus: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + // Node status was updated by nodecontroller 1hr ago + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + secondNodeNewStatus: healthyNodeNewStatus, + expectedPodUpdate: true, + expectedReason: nodeutil.NodeUnreachablePodReason, + description: "Node created long time ago, node controller posted Unknown for a " + + "long period of time, the pod status must include reason for termination.", + }, + } + + for _, item := range table { + fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(item.nodes, item.pods) + nodeController, _ := newNodeLifecycleControllerFromClient( + context.TODO(), + fakeNodeHandler, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + ) + nodeController.now = func() metav1.Time { return fakeNow } + nodeController.recorder = testutil.NewFakeRecorder() + //nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) + //if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { + // t.Errorf("unexpected error: %v", err) + //} + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Errorf("unexpected error: %v", err) + } + if item.timeToPass > 0 { + nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } + //item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus + //item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus + fakeNodeHandler.UpdateNodeStatuses(map[string]v1.NodeStatus{ + "node0": item.newNodeStatus, + "node1": item.secondNodeNewStatus, + }) + } + //if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { + // 
t.Errorf("unexpected error: %v", err) + //} + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Errorf("unexpected error: %v", err) + } + zones := testutil.GetZones(fakeNodeHandler) + for _, zone := range zones { + nodeController.zoneNoExecuteTainter[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { + nodeUID, _ := value.UID.(string) + pods, err := nodeController.getPodsAssignedToNode(value.Value) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + nodeutil.DeletePods(context.TODO(), fakeNodeHandler, pods, nodeController.recorder, value.Value, nodeUID) + return true, 0 + }) + } + + podReasonUpdate := false + for _, action := range fakeNodeHandler.Actions() { + if action.GetVerb() == "update" && action.GetResource().Resource == "pods" { + updateReason := action.(testcore.UpdateActionImpl).GetObject().(*v1.Pod).Status.Reason + podReasonUpdate = true + if updateReason != item.expectedReason { + t.Errorf("expected pod status reason: %+v, got %+v for %+v", item.expectedReason, updateReason, item.description) + } + } + } + + if podReasonUpdate != item.expectedPodUpdate { + t.Errorf("expected pod update: %+v, got %+v for %+v", item.expectedPodUpdate, podReasonUpdate, item.description) + } + } +} + +func TestMonitorNodeHealthUpdateStatus(t *testing.T) { + fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) + table := []struct { + nodes []*v1.Node + pods *v1.PodList + timeToPass time.Duration + newNodeStatus v1.NodeStatus + expectedRequestCount int + expectedNodes []*v1.Node + expectedPodStatusUpdate bool + }{ + // Node created long time ago, without status: + // Expect Unknown status posted from node controller. 
+ { + + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + expectedRequestCount: 2, // List+Update + expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeMemoryPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodePIDPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + expectedPodStatusUpdate: false, // Pod was never scheduled + }, + // Node created recently, without status. + // Expect no action from node controller (within startup grace period). 
+ { + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + expectedRequestCount: 1, // List + expectedNodes: nil, + expectedPodStatusUpdate: false, + }, + // Node created long time ago, with status updated by kubelet exceeds grace period. + // Expect Unknown status posted from node controller. + { + + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + // Node status hasn't been updated for 1hr. + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + expectedRequestCount: 3, // (List+)List+Update + timeToPass: time.Hour, + newNodeStatus: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + // Node status hasn't been updated for 1hr. 
+ LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + Reason: "NodeStatusUnknown", + Message: "Kubelet stopped posting node status.", + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, + }, + { + Type: v1.NodeMemoryPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated + LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated + LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, + }, + { + Type: v1.NodePIDPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated + 
LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + expectedPodStatusUpdate: true, + }, + // Node created long time ago, with status updated recently. + // Expect no action from node controller (within monitor grace period). + { + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + // Node status has just been updated. + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + expectedRequestCount: 1, // List + expectedNodes: nil, + expectedPodStatusUpdate: false, + }, + } + for i, item := range table { + fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(item.nodes, item.pods) + nodeController, _ := newNodeLifecycleControllerFromClient( + context.TODO(), + fakeNodeHandler, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + ) + nodeController.now = func() metav1.Time { return fakeNow } + nodeController.recorder = testutil.NewFakeRecorder() + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Errorf("unexpected error: %v", err) + } + if item.timeToPass > 0 { + nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } + //item.fakeNodeHandler.Existing[0].Status = 
item.newNodeStatus + //if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { + // t.Errorf("unexpected error: %v", err) + //} + fakeNodeHandler.UpdateNodeStatuses(map[string]v1.NodeStatus{ + "node0": item.newNodeStatus, + }) + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Errorf("unexpected error: %v", err) + } + } + if item.expectedRequestCount != fakeNodeHandler.RequestCount { + t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, fakeNodeHandler.RequestCount) + } + + if len(fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, fakeNodeHandler.UpdatedNodes) { + t.Errorf("Case[%d] unexpected nodes, expected nodes: %#+v\n, got nodes: %#+v", i, item.expectedNodes[0], fakeNodeHandler.UpdatedNodes[0]) + } + + if len(fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, fakeNodeHandler.UpdatedNodeStatuses) { + t.Errorf("Case[%d] unexpected node status: expected %#+v\n, but got: %#+v", i, item.expectedNodes[0], fakeNodeHandler.UpdatedNodeStatuses[0]) + } + + podStatusUpdated := false + for _, action := range fakeNodeHandler.Actions() { + if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { + podStatusUpdated = true + } + } + if podStatusUpdated != item.expectedPodStatusUpdate { + t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated) + } + } +} + +func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) { + nodeCreationTime := metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC) + fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) + testcases := []struct { + description string + //fakeNodeHandler *testutil.FakeNodeHandler + nodes []*v1.Node + pods *v1.PodList + lease *coordv1.Lease + timeToPass time.Duration + newNodeStatus map[string]v1.NodeStatus + newLease *coordv1.Lease + 
expectedRequestCount int + expectedNodes []*v1.Node + expectedPodStatusUpdate bool + }{ + // Node created recently, without status. Node lease is missing. + // Expect no action from node controller (within startup grace period). + { + description: "Node created recently, without status. Node lease is missing.", + + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + expectedRequestCount: 1, // List + expectedNodes: nil, + expectedPodStatusUpdate: false, + }, + // Node created recently, without status. Node lease is renewed recently. + // Expect no action from node controller (within startup grace period). + { + description: "Node created recently, without status. Node lease is renewed recently.", + + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), + expectedRequestCount: 1, // List + expectedNodes: nil, + expectedPodStatusUpdate: false, + }, + // Node created long time ago, without status. Node lease is missing. + // Expect Unknown status posted from node controller. + { + description: "Node created long time ago, without status. 
Node lease is missing.", + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + expectedRequestCount: 2, // List+Update + expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeMemoryPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodePIDPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + expectedPodStatusUpdate: false, // Pod was never scheduled because the node was never ready. + }, + // Node created long time ago, without status. Node lease is renewed recently. + // Expect no action from node controller (within monitor grace period). + { + description: "Node created long time ago, without status. 
Node lease is renewed recently.", + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), + timeToPass: time.Hour, + newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour. + expectedRequestCount: 2, // List+List + expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + }, + }, + expectedPodStatusUpdate: false, + }, + // Node created long time ago, without status. Node lease is expired. + // Expect Unknown status posted from node controller. + { + description: "Node created long time ago, without status. Node lease is expired.", + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), + timeToPass: time.Hour, + newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. 
+ expectedRequestCount: 3, // List+List+Update + expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + { + Type: v1.NodeMemoryPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + { + Type: v1.NodePIDPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + }, + }, + }, + }, + expectedPodStatusUpdate: false, + }, + // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed. + // Expect no action from node controller (within monitor grace period). + { + description: "Node created long time ago, with status updated by kubelet exceeds grace period. 
Node lease is renewed.", + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionFalse, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), + expectedRequestCount: 2, // List+List + timeToPass: time.Hour, + newNodeStatus: map[string]v1.NodeStatus{ + // Node status hasn't been updated for 1 hour. + "node0": { + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionFalse, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour. 
+ expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionFalse, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + expectedPodStatusUpdate: false, + }, + // Node created long time ago, with status updated by kubelet recently. Node lease is expired. + // Expect no action from node controller (within monitor grace period). + { + description: "Node created long time ago, with status updated by kubelet recently. Node lease is expired.", + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionFalse, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), + expectedRequestCount: 2, // List+List + timeToPass: time.Hour, + newNodeStatus: map[string]v1.NodeStatus{ + // Node status is updated after 1 hour. 
+ "node0": { + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionFalse, + LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. + expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + LastTransitionTime: fakeNow, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionFalse, + LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + expectedPodStatusUpdate: false, + }, + // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired. + // Expect Unknown status posted from node controller. + { + description: "Node created long time ago, with status updated by kubelet exceeds grace period. 
Node lease is also expired.", + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + pods: &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}, + lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), + expectedRequestCount: 3, // List+List+Update + timeToPass: time.Hour, + newNodeStatus: map[string]v1.NodeStatus{ + // Node status hasn't been updated for 1 hour. + "node0": { + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. 
+ expectedNodes: []*v1.Node{ + { + TypeMeta: metav1.TypeMeta{ + Kind: "Node", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: nodeCreationTime, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + Reason: "NodeStatusUnknown", + Message: "Kubelet stopped posting node status.", + LastHeartbeatTime: fakeNow, + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + { + Type: v1.NodeMemoryPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + { + Type: v1.NodeDiskPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + { + Type: v1.NodePIDPressure, + Status: v1.ConditionUnknown, + Reason: "NodeStatusNeverUpdated", + Message: "Kubelet never posted node status.", + LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated + LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, + }, + }, + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + expectedPodStatusUpdate: true, + }, + } + + for i, item := range testcases { + t.Run(item.description, func(t *testing.T) { + fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(item.nodes, item.pods) + nodeController, _ := newNodeLifecycleControllerFromClient( + context.TODO(), + fakeNodeHandler, + testRateLimiterQPS, + 
testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + ) + nodeController.now = func() metav1.Time { return fakeNow } + nodeController.recorder = testutil.NewFakeRecorder() + //nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) + //if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { + // t.Fatalf("unexpected error: %v", err) + //} + //if err := nodeController.syncLeaseStore(item.lease); err != nil { + if err := fakeNodeHandler.UpdateLease(item.lease); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if item.timeToPass > 0 { + nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } + //item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus + //if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { + if err := fakeNodeHandler.UpdateNodeStatuses(item.newNodeStatus); err != nil { + t.Fatalf("unexpected error: %v", err) + } + //if err := nodeController.syncLeaseStore(item.newLease); err != nil { + if err := fakeNodeHandler.UpdateLease(item.newLease); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + if item.expectedRequestCount != fakeNodeHandler.RequestCount { + t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, fakeNodeHandler.RequestCount) + } + + if len(fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, fakeNodeHandler.UpdatedNodes) { + t.Errorf("case[%d] expected nodes: %#+v\n, got %#+v", i, item.expectedNodes[0], fakeNodeHandler.UpdatedNodes[0]) + } + + if len(fakeNodeHandler.UpdatedNodeStatuses) > 0 && 
!apiequality.Semantic.DeepEqual(item.expectedNodes, fakeNodeHandler.UpdatedNodeStatuses) {
				t.Errorf("case[%d]: expected nodes: %#+v\n, got %#+v", i, item.expectedNodes[0], fakeNodeHandler.UpdatedNodeStatuses[0])
			}

			podStatusUpdated := false
			for _, action := range fakeNodeHandler.Actions() {
				if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
					podStatusUpdated = true
				}
			}
			if podStatusUpdated != item.expectedPodStatusUpdate {
				t.Errorf("expect pod status updated to be %v, but got %v", item.expectedPodStatusUpdate, podStatusUpdated)
			}
		})
	}
}

// TestMonitorNodeHealthMarkPodsNotReady runs monitorNodeHealth against a single
// node0 (and, when timeToPass is set, a second time after the clock is advanced
// and newNodeStatus is applied) and checks whether the fake handler recorded an
// update of the pods/status subresource.
func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) {
	fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
	longAgo := metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC)

	// readyAtFakeNow returns a fresh Ready=True status whose heartbeat and
	// transition timestamps both equal fakeNow.
	readyAtFakeNow := func() v1.NodeStatus {
		return v1.NodeStatus{
			Conditions: []v1.NodeCondition{
				{
					Type:               v1.NodeReady,
					Status:             v1.ConditionTrue,
					LastHeartbeatTime:  fakeNow,
					LastTransitionTime: fakeNow,
				},
			},
			Capacity: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("10"),
				v1.ResourceMemory: resource.MustParse("10G"),
			},
		}
	}
	// singleNode0 wraps one node named "node0" in the slice shape the fake handler expects.
	singleNode0 := func(created metav1.Time, status v1.NodeStatus) []*v1.Node {
		return []*v1.Node{
			{
				ObjectMeta: metav1.ObjectMeta{
					Name:              "node0",
					CreationTimestamp: created,
				},
				Status: status,
			},
		}
	}
	// pod0OnNode0 returns a fresh list holding the single pod bound to node0.
	pod0OnNode0 := func() *v1.PodList {
		return &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
	}

	table := []struct {
		nodes                   []*v1.Node
		pods                    *v1.PodList
		timeToPass              time.Duration
		newNodeStatus           map[string]v1.NodeStatus
		expectedPodStatusUpdate bool
	}{
		// Node created recently, without status.
		// Expect no action from node controller (within startup grace period).
		{
			nodes:                   singleNode0(fakeNow, v1.NodeStatus{}),
			pods:                    pod0OnNode0(),
			expectedPodStatusUpdate: false,
		},
		// Node created long time ago, with status updated recently.
		// Expect no action from node controller (within monitor grace period).
		{
			nodes:                   singleNode0(longAgo, readyAtFakeNow()),
			pods:                    pod0OnNode0(),
			expectedPodStatusUpdate: false,
		},
		// Node created long time ago, with a heartbeat that is not refreshed
		// while timeToPass elapses.
		// Expect pods status updated and Unknown node status posted from node controller.
		{
			nodes:      singleNode0(longAgo, readyAtFakeNow()),
			pods:       pod0OnNode0(),
			timeToPass: 1 * time.Minute,
			newNodeStatus: map[string]v1.NodeStatus{
				"node0": readyAtFakeNow(),
			},
			expectedPodStatusUpdate: true,
		},
	}

	ctx := context.TODO()
	for i, item := range table {
		fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(item.nodes, item.pods)
		nodeController, _ := newNodeLifecycleControllerFromClient(
			ctx,
			fakeNodeHandler,
			testRateLimiterQPS,
			testRateLimiterQPS,
			testLargeClusterThreshold,
			testUnhealthyThreshold,
			testNodeMonitorGracePeriod,
			testNodeStartupGracePeriod,
			testNodeMonitorPeriod,
		)
		nodeController.now = func() metav1.Time { return fakeNow }
		nodeController.recorder = testutil.NewFakeRecorder()

		// First pass at fakeNow.
		if err := nodeController.monitorNodeHealth(ctx); err != nil {
			t.Errorf("Case[%d] unexpected error: %v", i, err)
		}

		// Optional second pass with the clock moved forward by timeToPass.
		if item.timeToPass > 0 {
			nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} }
			if err := fakeNodeHandler.UpdateNodeStatuses(item.newNodeStatus); err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if err := nodeController.monitorNodeHealth(ctx); err != nil {
				t.Errorf("Case[%d] unexpected error: %v", i, err)
			}
		}

		podStatusUpdated := false
		for _, action := range fakeNodeHandler.Actions() {
			if action.GetVerb() != "update" {
				continue
			}
			if action.GetResource().Resource != "pods" || action.GetSubresource() != "status" {
				continue
			}
			podStatusUpdated = true
		}
		if podStatusUpdated != item.expectedPodStatusUpdate {
			t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated)
		}
	}
}

// TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize tests the happy path of
// TestMonitorNodeHealthMarkPodsNotReady with a large number of nodes/pods and
// varying numbers of workers.
func TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize(t *testing.T) {
	const (
		numNodes    = 50
		podsPerNode = 100
	)
	fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)

	// staleReadyStatus returns a fresh Ready=True status whose heartbeat stays
	// pinned at fakeNow, i.e. it is never refreshed between the two passes.
	staleReadyStatus := func() v1.NodeStatus {
		return v1.NodeStatus{
			Conditions: []v1.NodeCondition{
				{
					Type:               v1.NodeReady,
					Status:             v1.ConditionTrue,
					LastHeartbeatTime:  fakeNow,
					LastTransitionTime: fakeNow,
				},
			},
			Capacity: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("10"),
				v1.ResourceMemory: resource.MustParse("10G"),
			},
		}
	}
	// makeNodes builds node0..node<numNodes-1>, all created long ago.
	makeNodes := func() []*v1.Node {
		nodes := make([]*v1.Node, numNodes)
		for n := range nodes {
			nodes[n] = &v1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name:              fmt.Sprintf("node%d", n),
					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
				},
				Status: staleReadyStatus(),
			}
		}
		return nodes
	}
	// makePods spreads numNodes*podsPerNode pods round-robin over the nodes.
	makePods := func() []v1.Pod {
		pods := make([]v1.Pod, numNodes*podsPerNode)
		for p := range pods {
			pods[p] = *testutil.NewPod(fmt.Sprintf("pod%d", p), fmt.Sprintf("node%d", p%numNodes))
		}
		return pods
	}

	table := []struct {
		workers int
	}{
		{workers: 0}, // will default to scheduler.UpdateWorkerSize
		{workers: 1},
	}

	ctx := context.TODO()
	for i, item := range table {
		nodes := makeNodes()
		pods := &v1.PodList{Items: makePods()}
		fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
		nodeController, _ := newNodeLifecycleControllerFromClient(
			ctx,
			fakeNodeHandler,
			testRateLimiterQPS,
			testRateLimiterQPS,
			testLargeClusterThreshold,
			testUnhealthyThreshold,
			testNodeMonitorGracePeriod,
			testNodeStartupGracePeriod,
			testNodeMonitorPeriod)
		nodeController.now = func() metav1.Time { return fakeNow }
		nodeController.recorder = testutil.NewFakeRecorder()
		// A zero worker count means "leave the controller's own setting alone".
		if item.workers != 0 {
			nodeController.nodeUpdateWorkerSize = item.workers
		}

		if err := nodeController.monitorNodeHealth(ctx); err != nil {
			t.Errorf("Case[%d] unexpected error: %v", i, err)
		}

		// Advance the clock by one minute and re-apply the unchanged statuses.
		nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(1 * time.Minute)} }
		nodeStatuses := make(map[string]v1.NodeStatus, numNodes)
		for n := 0; n < numNodes; n++ {
			nodeStatuses[fmt.Sprintf("node%d", n)] = staleReadyStatus()
		}
		if err := fakeNodeHandler.UpdateNodeStatuses(nodeStatuses); err != nil {
			t.Errorf("unexpected error: %v", err)
		}
		if err := nodeController.monitorNodeHealth(ctx); err != nil {
			t.Errorf("Case[%d] unexpected error: %v", i, err)
		}

		podStatusUpdates := 0
		for _, action := range fakeNodeHandler.Actions() {
			if action.GetVerb() != "update" {
				continue
			}
			if action.GetResource().Resource != "pods" || action.GetSubresource() != "status" {
				continue
			}
			podStatusUpdates++
		}
		const expectedPodStatusUpdates = numNodes * podsPerNode
		if podStatusUpdates != expectedPodStatusUpdates {
			t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, expectedPodStatusUpdates, podStatusUpdates)
		}
	}
}

// TestMonitorNodeHealthMarkPodsNotReadyRetry feeds node0 through three
// successive node updates, running monitorNodeHealth after each one, and
// counts the pods/status updates recorded by the fake handler. The cases
// inject a failure into the first pod update or the first pod listing.
func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) {
	type nodeIteration struct {
		timeToPass time.Duration
		newNodes   []*v1.Node
	}
	timeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
	// NOTE(review): despite its name this timestamp is two SECONDS after
	// timeNow (12:00:02). The value is kept as-is; confirm the intent before
	// changing it.
	timePlusTwoMinutes := metav1.Date(2015, 1, 1, 12, 0, 2, 0, time.UTC)

	makeNodes := func(status v1.ConditionStatus, lastHeartbeatTime, lastTransitionTime metav1.Time) []*v1.Node {
		readyCondition := v1.NodeCondition{
			Type:               v1.NodeReady,
			Status:             status,
			LastHeartbeatTime:  lastHeartbeatTime,
			LastTransitionTime: lastTransitionTime,
		}
		return []*v1.Node{
			{
				ObjectMeta: metav1.ObjectMeta{
					Name:              "node0",
					CreationTimestamp: timeNow,
				},
				Status: v1.NodeStatus{
					Conditions: []v1.NodeCondition{readyCondition},
				},
			},
		}
	}
	// newHandler returns a fake handler that starts without nodes and holds pod0 on node0.
	newHandler := func() *testutil.ImprovedFakeNodeHandler {
		return testutil.NewImprovedFakeNodeHandler([]*v1.Node{}, &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}})
	}
	// iterations is the node history shared by every case: Ready, Ready
	// (unchanged) one minute later, then NotReady.
	iterations := func() []nodeIteration {
		return []nodeIteration{
			{
				timeToPass: 0,
				newNodes:   makeNodes(v1.ConditionTrue, timeNow, timeNow),
			},
			{
				timeToPass: 1 * time.Minute,
				newNodes:   makeNodes(v1.ConditionTrue, timeNow, timeNow),
			},
			{
				timeToPass: 1 * time.Minute,
				newNodes:   makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes),
			},
		}
	}
	// failFirstCall returns a reactor that errors on its first invocation only.
	failFirstCall := func() func() error {
		calls := 0
		return func() error {
			calls++
			if calls == 1 {
				return fmt.Errorf("fake error")
			}
			return nil
		}
	}

	table := []struct {
		desc                      string
		fakeNodeHandler           *testutil.ImprovedFakeNodeHandler
		updateReactor             func() error
		fakeGetPodsAssignedToNode func(c client.Client) func(string) ([]*v1.Pod, error)
		nodeIterations            []nodeIteration
		expectedPodStatusUpdates  int
	}{
		// Node created long time ago, with status updated by kubelet exceeds grace period.
		// First monitorNodeHealth check will update pod status to NotReady.
		// Second monitorNodeHealth check will do no updates (no retry).
		{
			desc:                      "successful pod status update, no retry required",
			fakeNodeHandler:           newHandler(),
			fakeGetPodsAssignedToNode: GenGetPodsAssignedToNode,
			nodeIterations:            iterations(),
			expectedPodStatusUpdates:  1,
		},
		// Node created long time ago, with status updated by kubelet exceeds grace period.
		// First monitorNodeHealth check will fail to update pod status to NotReady.
		// Second monitorNodeHealth check will update pod status to NotReady (retry).
		{
			desc:                      "unsuccessful pod status update, retry required",
			fakeNodeHandler:           newHandler(),
			updateReactor:             failFirstCall(),
			fakeGetPodsAssignedToNode: GenGetPodsAssignedToNode,
			nodeIterations:            iterations(),
			expectedPodStatusUpdates:  2, // One failed and one retry.
		},
		// Node created long time ago, with status updated by kubelet exceeds grace period.
		// First monitorNodeHealth check will fail to list pods.
		// Second monitorNodeHealth check will update pod status to NotReady (retry).
		{
			desc:            "unsuccessful pod list, retry required",
			fakeNodeHandler: newHandler(),
			fakeGetPodsAssignedToNode: func(c client.Client) func(string) ([]*v1.Pod, error) {
				calls := 0
				list := GenGetPodsAssignedToNode(c)
				return func(nodeName string) ([]*v1.Pod, error) {
					calls++
					if calls == 1 {
						return nil, fmt.Errorf("fake error")
					}
					return list(nodeName)
				}
			},
			nodeIterations:           iterations(),
			expectedPodStatusUpdates: 1,
		},
	}

	for i, item := range table {
		t.Run(item.desc, func(t *testing.T) {
			ctx := context.TODO()
			nodeController, _ := newNodeLifecycleControllerFromClient(
				ctx,
				item.fakeNodeHandler,
				testRateLimiterQPS,
				testRateLimiterQPS,
				testLargeClusterThreshold,
				testUnhealthyThreshold,
				testNodeMonitorGracePeriod,
				testNodeStartupGracePeriod,
				testNodeMonitorPeriod,
			)
			if item.updateReactor != nil {
				item.fakeNodeHandler.PodUpdateReactor = item.updateReactor
			}
			nodeController.now = func() metav1.Time { return timeNow }
			nodeController.recorder = testutil.NewFakeRecorder()
			nodeController.getPodsAssignedToNode = item.fakeGetPodsAssignedToNode(item.fakeNodeHandler.ClientWrapper)

			for _, iteration := range item.nodeIterations {
				// The offset is applied to timeNow each time; it is not cumulative.
				nodeController.now = func() metav1.Time { return metav1.Time{Time: timeNow.Add(iteration.timeToPass)} }
				if err := item.fakeNodeHandler.UpdateNodes(iteration.newNodes); err != nil {
					t.Errorf("failed to update node, %v", err)
				}
				if err := nodeController.monitorNodeHealth(ctx); err != nil {
					t.Errorf("unexpected error: %v", err)
				}
			}

			podStatusUpdates := 0
			for _, action := range item.fakeNodeHandler.Actions() {
				if action.GetVerb() != "update" {
					continue
				}
				if action.GetResource().Resource != "pods" || action.GetSubresource() != "status" {
					continue
				}
				podStatusUpdates++
			}
			if podStatusUpdates != item.expectedPodStatusUpdates {
				t.Errorf("case[%d], expect pod status updated to happen %d times, but got %d", i, item.expectedPodStatusUpdates, podStatusUpdates)
			}
		})
	}
}

// TestApplyNoExecuteTaints ensures we just have a NoExecute taint applied to node.
// NodeController is just responsible for enqueuing the node to tainting queue from which taint manager picks up
// and evicts the pods on the node.
//
// The test asserts that, after monitorNodeHealth and doNoExecuteTaintingPass,
// node0 (Ready=Unknown) carries UnreachableTaintTemplate and node2
// (Ready=False) carries NotReadyTaintTemplate, and that node2 has no taints
// left once its status is reported Ready again and both passes are re-run.
func TestApplyNoExecuteTaints(t *testing.T) {
	// TODO: Remove skip once https://github.com/kubernetes/kubernetes/pull/114607 merges.
	if goruntime.GOOS == "windows" {
		t.Skip("Skipping test on Windows.")
	}
	fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
	staleTime := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)

	// zonedNode builds a node in region1/zone1 whose Ready condition has the
	// given status, with heartbeat and transition both set to stamp. Every
	// call allocates its own label map so the nodes share no state.
	zonedNode := func(name string, ready v1.ConditionStatus, stamp metav1.Time) *v1.Node {
		return &v1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name:              name,
				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
				Labels: map[string]string{
					v1.LabelTopologyRegion:          "region1",
					v1.LabelTopologyZone:            "zone1",
					v1.LabelFailureDomainBetaRegion: "region1",
					v1.LabelFailureDomainBetaZone:   "zone1",
				},
			},
			Status: v1.NodeStatus{
				Conditions: []v1.NodeCondition{
					{
						Type:               v1.NodeReady,
						Status:             ready,
						LastHeartbeatTime:  stamp,
						LastTransitionTime: stamp,
					},
				},
			},
		}
	}

	nodes := []*v1.Node{
		// Unreachable Taint with effect 'NoExecute' should be applied to this node.
		zonedNode("node0", v1.ConditionUnknown, staleTime),
		// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
		// we need second healthy node in tests.
		zonedNode("node1", v1.ConditionTrue, fakeNow),
		// NotReady Taint with NoExecute effect should be applied to this node.
		zonedNode("node2", v1.ConditionFalse, staleTime),
	}
	pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
	healthyNodeNewStatus := v1.NodeStatus{
		Conditions: []v1.NodeCondition{
			{
				Type:               v1.NodeReady,
				Status:             v1.ConditionTrue,
				LastHeartbeatTime:  metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
				LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
			},
		},
	}

	ctx := context.TODO()
	fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
	nodeController, _ := newNodeLifecycleControllerFromClient(
		ctx,
		fakeNodeHandler,
		testRateLimiterQPS,
		testRateLimiterQPS,
		testLargeClusterThreshold,
		testUnhealthyThreshold,
		testNodeMonitorGracePeriod,
		testNodeStartupGracePeriod,
		testNodeMonitorPeriod,
	)
	nodeController.now = func() metav1.Time { return fakeNow }
	nodeController.recorder = testutil.NewFakeRecorder()

	// First pass: both unhealthy nodes should end up tainted.
	if err := nodeController.monitorNodeHealth(ctx); err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	nodeController.doNoExecuteTaintingPass(ctx)

	node0, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node0", metav1.GetOptions{})
	if err != nil {
		t.Errorf("Can't get current node0...")
		return
	}
	if !taintutils.TaintExists(node0.Spec.Taints, UnreachableTaintTemplate) {
		t.Errorf("Can't find taint %v in %v", UnreachableTaintTemplate, node0.Spec.Taints)
	}
	klog.Infof("node0 test ended")

	node2, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node2", metav1.GetOptions{})
	if err != nil {
		t.Errorf("Can't get current node2...")
		return
	}
	if !taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) {
		t.Errorf("Can't find taint %v in %v", NotReadyTaintTemplate, node2.Spec.Taints)
	}
	klog.Infof("node2 init test ended")

	// Make node2 healthy again.
	node2.Status = healthyNodeNewStatus
	if _, err = fakeNodeHandler.DelegateNodeHandler.UpdateStatus(ctx, node2, metav1.UpdateOptions{}); err != nil {
		// t.Error rather than t.Errorf(err.Error()): the error text must not
		// be interpreted as a format string.
		t.Error(err)
		return
	}

	// Second pass: the taint on the recovered node should be removed.
	if err := nodeController.monitorNodeHealth(ctx); err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	nodeController.doNoExecuteTaintingPass(ctx)

	node2, err = fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node2", metav1.GetOptions{})
	if err != nil {
		t.Errorf("Can't get current node2...")
		return
	}
	// We should not see any taint on the node(especially the Not-Ready taint with NoExecute effect).
	if taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) || len(node2.Spec.Taints) > 0 {
		t.Errorf("Found taint %v in %v, which should not be present", NotReadyTaintTemplate, node2.Spec.Taints)
	}
}

// TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice
func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) {
	// TODO: Remove skip once https://github.com/kubernetes/kubernetes/pull/114607 merges.
	if goruntime.GOOS == "windows" {
		t.Skip("Skipping test on Windows.")
	}
	fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)

	//fakeNodeHandler := &testutil.FakeNodeHandler{
	//	Existing: []*v1.Node{
	nodes := []*v1.Node{
		// Unreachable Taint with effect 'NoExecute' should be applied to this node.
		{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "node0",
				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
				Labels: map[string]string{
					v1.LabelTopologyRegion:          "region1",
					v1.LabelTopologyZone:            "zone1",
					v1.LabelFailureDomainBetaRegion: "region1",
					v1.LabelFailureDomainBetaZone:   "zone1",
				},
			},
			Status: v1.NodeStatus{
				Conditions: []v1.NodeCondition{
					{
						Type:               v1.NodeReady,
						Status:             v1.ConditionUnknown,
						LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
						LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
					},
				},
			},
		},
		// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
		// we need second healthy node in tests.
+ { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // NotReady Taint with NoExecute effect should be applied to this node. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionFalse, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + } + //Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), + pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}} + //} + healthyNodeNewStatus := v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + } + ctx := context.TODO() + fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods) + nodeController, _ := newNodeLifecycleControllerFromClient( + ctx, + fakeNodeHandler, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + 
testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + ) + nodeController.now = func() metav1.Time { return fakeNow } + nodeController.recorder = testutil.NewFakeRecorder() + //nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) + //if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { + // t.Errorf("unexpected error: %v", err) + //} + // 1. monitor node health twice, add untainted node once + if err := nodeController.monitorNodeHealth(ctx); err != nil { + t.Errorf("unexpected error: %v", err) + } + if err := nodeController.monitorNodeHealth(ctx); err != nil { + t.Errorf("unexpected error: %v", err) + } + + // 2. mark node0 healthy + node0, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node0", metav1.GetOptions{}) + if err != nil { + t.Errorf("Can't get current node0...") + return + } + node0.Status = healthyNodeNewStatus + _, err = fakeNodeHandler.DelegateNodeHandler.UpdateStatus(ctx, node0, metav1.UpdateOptions{}) + if err != nil { + t.Errorf(err.Error()) + return + } + + // add other notReady nodes + //fakeNodeHandler.Existing = append(fakeNodeHandler.Existing, []*v1.Node{ + if err = fakeNodeHandler.UpdateNodes([]*v1.Node{ + // Unreachable Taint with effect 'NoExecute' should be applied to this node. 
+ { + ObjectMeta: metav1.ObjectMeta{ + Name: "node3", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady + // we need second healthy node in tests. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node4", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // NotReady Taint with NoExecute effect should be applied to this node. 
+ { + ObjectMeta: metav1.ObjectMeta{ + Name: "node5", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionFalse, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }); err != nil { + t.Errorf("unexpected error: %v", err) + } + //if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { + // t.Errorf("unexpected error: %v", err) + //} + // 3. start monitor node health again, add untainted node twice, construct UniqueQueue with duplicated node cache + if err := nodeController.monitorNodeHealth(ctx); err != nil { + t.Errorf("unexpected error: %v", err) + } + + // 4. do NoExecute taint pass + // when processing with node0, condition.Status is NodeReady, and return true with default case + // then remove the set value and queue value both, the taint job never stuck + nodeController.doNoExecuteTaintingPass(ctx) + + // 5. 
get node3 and node5, see if it has ready got NoExecute taint + node3, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node3", metav1.GetOptions{}) + if err != nil { + t.Errorf("Can't get current node3...") + return + } + if !taintutils.TaintExists(node3.Spec.Taints, UnreachableTaintTemplate) || len(node3.Spec.Taints) == 0 { + t.Errorf("Not found taint %v in %v, which should be present in %s", UnreachableTaintTemplate, node3.Spec.Taints, node3.Name) + } + node5, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node5", metav1.GetOptions{}) + if err != nil { + t.Errorf("Can't get current node5...") + return + } + if !taintutils.TaintExists(node5.Spec.Taints, NotReadyTaintTemplate) || len(node5.Spec.Taints) == 0 { + t.Errorf("Not found taint %v in %v, which should be present in %s", NotReadyTaintTemplate, node5.Spec.Taints, node5.Name) + } +} + +func TestSwapUnreachableNotReadyTaints(t *testing.T) { + fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) + + //fakeNodeHandler := &testutil.FakeNodeHandler{ + // Existing: []*v1.Node{ + nodes := []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady + // we need second healthy node in tests. Because of how the tests are written we need to update + // the status of this Node. 
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "node1",
+				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				Labels: map[string]string{
+					v1.LabelTopologyRegion:          "region1",
+					v1.LabelTopologyZone:            "zone1",
+					v1.LabelFailureDomainBetaRegion: "region1",
+					v1.LabelFailureDomainBetaZone:   "zone1",
+				},
+			},
+			Status: v1.NodeStatus{
+				Conditions: []v1.NodeCondition{
+					{
+						Type:               v1.NodeReady,
+						Status:             v1.ConditionTrue,
+						LastHeartbeatTime:  metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
+						LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
+					},
+				},
+			},
+		},
+	}
+	//Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
+	pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
+	//}
+	newNodeStatus := v1.NodeStatus{
+		Conditions: []v1.NodeCondition{
+			{
+				Type:   v1.NodeReady,
+				Status: v1.ConditionFalse,
+				// Node status has just been updated, and is NotReady for 10min.
+				LastHeartbeatTime:  metav1.Date(2017, 1, 1, 12, 9, 0, 0, time.UTC),
+				LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
+			},
+		},
+	}
+	healthyNodeNewStatus := v1.NodeStatus{
+		Conditions: []v1.NodeCondition{
+			{
+				Type:               v1.NodeReady,
+				Status:             v1.ConditionTrue,
+				LastHeartbeatTime:  metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
+				LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
+			},
+		},
+	}
+	// Taint expected while node0 is Unknown, and taint expected once it is False.
+	originalTaint := UnreachableTaintTemplate
+	updatedTaint := NotReadyTaintTemplate
+
+	ctx := context.TODO()
+	fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
+	nodeController, _ := newNodeLifecycleControllerFromClient(
+		ctx,
+		fakeNodeHandler,
+		testRateLimiterQPS,
+		testRateLimiterQPS,
+		testLargeClusterThreshold,
+		testUnhealthyThreshold,
+		testNodeMonitorGracePeriod,
+		testNodeStartupGracePeriod,
+		testNodeMonitorPeriod,
+	)
+	nodeController.now = func() metav1.Time { return fakeNow }
+	nodeController.recorder = testutil.NewFakeRecorder()
+	//nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
+	//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+	//	t.Errorf("unexpected error: %v", err)
+	//}
+	if err := nodeController.monitorNodeHealth(ctx); err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	nodeController.doNoExecuteTaintingPass(ctx)
+
+	node0, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node0", metav1.GetOptions{})
+	if err != nil {
+		t.Errorf("Can't get current node0...")
+		return
+	}
+	node1, err := fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node1", metav1.GetOptions{})
+	if err != nil {
+		t.Errorf("Can't get current node1...")
+		return
+	}
+
+	if originalTaint != nil && !taintutils.TaintExists(node0.Spec.Taints, originalTaint) {
+		t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints)
+	}
+
+	nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Time} }
+
+	node0.Status = newNodeStatus
+	node1.Status = healthyNodeNewStatus
+	_, err = fakeNodeHandler.DelegateNodeHandler.UpdateStatus(ctx, node0, metav1.UpdateOptions{})
+	if err != nil {
+		// Explicit format: the error text must not be used as a format string.
+		t.Errorf("unexpected error: %v", err)
+		return
+	}
+	_, err = fakeNodeHandler.DelegateNodeHandler.UpdateStatus(ctx, node1, metav1.UpdateOptions{})
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+		return
+	}
+
+	//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+	//	t.Errorf("unexpected error: %v", err)
+	//}
+	if err := nodeController.monitorNodeHealth(ctx); err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	nodeController.doNoExecuteTaintingPass(ctx)
+
+	node0, err = fakeNodeHandler.DelegateNodeHandler.Get(ctx, "node0", metav1.GetOptions{})
+	if err != nil {
+		t.Errorf("Can't get current node0...")
+		return
+	}
+	if updatedTaint != nil {
+		if !taintutils.TaintExists(node0.Spec.Taints, updatedTaint) {
+			t.Errorf("Can't find taint %v in %v", updatedTaint, node0.Spec.Taints)
+		}
+	}
+}
+
+// TestTaintsNodeByCondition updates node0 with each table entry, runs
+// doNoScheduleTaintingPass for it, and checks that node0 ends up with exactly
+// the expected NoSchedule taints.
+func TestTaintsNodeByCondition(t *testing.T) {
+	fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
+
+	//fakeNodeHandler := &testutil.FakeNodeHandler{
+	//	Existing: []*v1.Node{
+	nodes := []*v1.Node{
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "node0",
+				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				Labels: map[string]string{
+					v1.LabelTopologyRegion:          "region1",
+					v1.LabelTopologyZone:            "zone1",
+					v1.LabelFailureDomainBetaRegion: "region1",
+					v1.LabelFailureDomainBetaZone:   "zone1",
+				},
+			},
+			Status: v1.NodeStatus{
+				Conditions: []v1.NodeCondition{
+					{
+						Type:               v1.NodeReady,
+						Status:             v1.ConditionTrue,
+						LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+					},
+				},
+			},
+		},
+	}
+	//Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
+	pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
+	//}
+
+	ctx := context.TODO()
+	fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
+	nodeController, _ := newNodeLifecycleControllerFromClient(
+		ctx,
+		fakeNodeHandler,
+		testRateLimiterQPS,
+		testRateLimiterQPS,
+		testLargeClusterThreshold,
+		testUnhealthyThreshold,
+		testNodeMonitorGracePeriod,
+		testNodeStartupGracePeriod,
+		testNodeMonitorPeriod,
+	)
+	nodeController.now = func() metav1.Time { return fakeNow }
+	nodeController.recorder = testutil.NewFakeRecorder()
+	//nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
+
+	networkUnavailableTaint := &v1.Taint{
+		Key:    v1.TaintNodeNetworkUnavailable,
+		Effect: v1.TaintEffectNoSchedule,
+	}
+	notReadyTaint := &v1.Taint{
+		Key:    v1.TaintNodeNotReady,
+		Effect: v1.TaintEffectNoSchedule,
+	}
+	unreachableTaint := &v1.Taint{
+		Key:    v1.TaintNodeUnreachable,
+		Effect: v1.TaintEffectNoSchedule,
+	}
+
+	tests := []struct {
+		Name           string
+		Node           *v1.Node
+		ExpectedTaints []*v1.Taint
+	}{
+		{
+			Name: "NetworkUnavailable is true",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						v1.LabelTopologyRegion:          "region1",
+						v1.LabelTopologyZone:            "zone1",
+						v1.LabelFailureDomainBetaRegion: "region1",
+						v1.LabelFailureDomainBetaZone:   "zone1",
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+						{
+							Type:               v1.NodeNetworkUnavailable,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+			ExpectedTaints: []*v1.Taint{networkUnavailableTaint},
+		},
+		// NOTE(review): this entry is identical to the previous one (same name,
+		// node and expectation); confirm whether a different case was intended.
+		{
+			Name: "NetworkUnavailable is true",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						v1.LabelTopologyRegion:          "region1",
+						v1.LabelTopologyZone:            "zone1",
+						v1.LabelFailureDomainBetaRegion: "region1",
+						v1.LabelFailureDomainBetaZone:   "zone1",
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+						{
+							Type:               v1.NodeNetworkUnavailable,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+			ExpectedTaints: []*v1.Taint{networkUnavailableTaint},
+		},
+		{
+			Name: "Ready is false",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						v1.LabelTopologyRegion:          "region1",
+						v1.LabelTopologyZone:            "zone1",
+						v1.LabelFailureDomainBetaRegion: "region1",
+						v1.LabelFailureDomainBetaZone:   "zone1",
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+			ExpectedTaints: []*v1.Taint{notReadyTaint},
+		},
+		{
+			Name: "Ready is unknown",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						v1.LabelTopologyRegion:          "region1",
+						v1.LabelTopologyZone:            "zone1",
+						v1.LabelFailureDomainBetaRegion: "region1",
+						v1.LabelFailureDomainBetaZone:   "zone1",
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+			ExpectedTaints: []*v1.Taint{unreachableTaint},
+		},
+	}
+
+	for _, test := range tests {
+		fakeNodeHandler.DelegateNodeHandler.Update(ctx, test.Node, metav1.UpdateOptions{})
+		//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+		//	t.Errorf("unexpected error: %v", err)
+		//}
+		nodeController.doNoScheduleTaintingPass(ctx, test.Node.Name)
+		//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+		//	t.Errorf("unexpected error: %v", err)
+		//}
+		node0 := new(v1.Node)
+		err := nodeController.Get(ctx, client.ObjectKey{Name: "node0"}, node0)
+		if err != nil {
+			t.Errorf("Can't get current node0...")
+			return
+		}
+		if len(node0.Spec.Taints) != len(test.ExpectedTaints) {
+			t.Errorf("%s: Unexpected number of taints: expected %d, got %d",
+				test.Name, len(test.ExpectedTaints), len(node0.Spec.Taints))
+		}
+		for _, taint := range test.ExpectedTaints {
+			if !taintutils.TaintExists(node0.Spec.Taints, taint) {
+				t.Errorf("%s: Can't find taint %v in %v", test.Name, taint, node0.Spec.Taints)
+			}
+		}
+	}
+}
+
+// TestNodeEventGeneration runs one monitorNodeHealth pass over a single node
+// and expects exactly one recorded event, with reason "RegisteredNode" and an
+// involved object carrying the node's UID.
+func TestNodeEventGeneration(t *testing.T) {
+	fakeNow := metav1.Date(2016, 9, 10, 12, 0, 0, 0, time.UTC)
+	//fakeNodeHandler := &testutil.FakeNodeHandler{
+	//	Existing: []*v1.Node{
+	nodes := []*v1.Node{
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "node0",
+				UID:               "1234567890",
+				CreationTimestamp: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
+			},
+			Status: v1.NodeStatus{
+				Conditions: []v1.NodeCondition{
+					{
+						Type:               v1.NodeReady,
+						Status:             v1.ConditionUnknown,
+						LastHeartbeatTime:  metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
+						LastTransitionTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC),
+					},
+				},
+			},
+		},
+	}
+	//Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
+	pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
+	//}
+
+	ctx := context.TODO()
+	fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
+	nodeController, _ := newNodeLifecycleControllerFromClient(
+		ctx,
+		fakeNodeHandler,
+		testRateLimiterQPS,
+		testRateLimiterQPS,
+		testLargeClusterThreshold,
+		testUnhealthyThreshold,
+		testNodeMonitorGracePeriod,
+		testNodeStartupGracePeriod,
+		testNodeMonitorPeriod,
+	)
+	nodeController.now = func() metav1.Time { return fakeNow }
+	// Keep a handle on the recorder so the emitted events can be inspected.
+	fakeRecorder := testutil.NewFakeRecorder()
+	nodeController.recorder = fakeRecorder
+	//nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
+
+	//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+	//	t.Errorf("unexpected error: %v", err)
+	//}
+	if err := nodeController.monitorNodeHealth(ctx); err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if len(fakeRecorder.Events) != 1 {
+		t.Fatalf("unexpected events, got %v, expected %v: %+v", len(fakeRecorder.Events), 1, fakeRecorder.Events)
+	}
+	if fakeRecorder.Events[0].Reason != "RegisteredNode" {
+		var reasons []string
+		for _, event := range fakeRecorder.Events {
+			reasons = append(reasons, event.Reason)
+		}
+		t.Fatalf("unexpected events generation: %v", strings.Join(reasons, ","))
+	}
+	for _, event := range fakeRecorder.Events {
+		involvedObject := event.InvolvedObject
+		actualUID := string(involvedObject.UID)
+		if actualUID != "1234567890" {
+			t.Fatalf("unexpected event uid: %v", actualUID)
+		}
+	}
+}
+
+// TestReconcileNodeLabels updates node0 with each table entry, runs
+// reconcileNodeLabels for it, and checks that node0 ends up with exactly the
+// expected label set.
+func TestReconcileNodeLabels(t *testing.T) {
+	fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
+
+	//fakeNodeHandler := &testutil.FakeNodeHandler{
+	//	Existing: []*v1.Node{
+	nodes := []*v1.Node{
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "node0",
+				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				Labels: map[string]string{
+					v1.LabelTopologyRegion:          "region1",
+					v1.LabelTopologyZone:            "zone1",
+					v1.LabelFailureDomainBetaRegion: "region1",
+					v1.LabelFailureDomainBetaZone:   "zone1",
+				},
+			},
+			Status: v1.NodeStatus{
+				Conditions: []v1.NodeCondition{
+					{
+						Type:               v1.NodeReady,
+						Status:             v1.ConditionTrue,
+						LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+					},
+				},
+			},
+		},
+	}
+	//Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
+	pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
+	//}
+
+	ctx := context.TODO()
+	fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
+	nodeController, _ := newNodeLifecycleControllerFromClient(
+		ctx,
+		fakeNodeHandler,
+		testRateLimiterQPS,
+		testRateLimiterQPS,
+		testLargeClusterThreshold,
+		testUnhealthyThreshold,
+		testNodeMonitorGracePeriod,
+		testNodeStartupGracePeriod,
+		testNodeMonitorPeriod,
+	)
+	nodeController.now = func() metav1.Time { return fakeNow }
+	nodeController.recorder = testutil.NewFakeRecorder()
+	//nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
+
+	tests := []struct {
+		Name           string
+		Node           *v1.Node
+		ExpectedLabels map[string]string
+	}{
+		{
+			Name: "No-op if node has no labels",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+			},
+			ExpectedLabels: nil,
+		},
+		{
+			Name: "No-op if no target labels present",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						v1.LabelTopologyRegion: "region1",
+					},
+				},
+			},
+			ExpectedLabels: map[string]string{
+				v1.LabelTopologyRegion: "region1",
+			},
+		},
+		{
+			Name: "Create OS/arch beta labels when they don't exist",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						v1.LabelOSStable:   "linux",
+						v1.LabelArchStable: "amd64",
+					},
+				},
+			},
+			ExpectedLabels: map[string]string{
+				kubeletapis.LabelOS:   "linux",
+				kubeletapis.LabelArch: "amd64",
+				v1.LabelOSStable:      "linux",
+				v1.LabelArchStable:    "amd64",
+			},
+		},
+		{
+			Name: "Reconcile OS/arch beta labels to match stable labels",
+			Node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+					Labels: map[string]string{
+						kubeletapis.LabelOS:   "windows",
+						kubeletapis.LabelArch: "arm",
+						v1.LabelOSStable:      "linux",
+						v1.LabelArchStable:    "amd64",
+					},
+				},
+			},
+			ExpectedLabels: map[string]string{
+				kubeletapis.LabelOS:   "linux",
+				kubeletapis.LabelArch: "amd64",
+				v1.LabelOSStable:      "linux",
+				v1.LabelArchStable:    "amd64",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		fakeNodeHandler.DelegateNodeHandler.Update(ctx, test.Node, metav1.UpdateOptions{})
+		//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+		//	t.Fatalf("unexpected error: %v", err)
+		//}
+		nodeController.reconcileNodeLabels(ctx, test.Node.Name)
+		//if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
+		//	t.Fatalf("unexpected error: %v", err)
+		//}
+		node0 := new(v1.Node)
+		err := nodeController.Get(ctx, client.ObjectKey{Name: "node0"}, node0)
+		if err != nil {
+			t.Fatalf("Can't get current node0...")
+		}
+		// The count compared here is the number of labels, so the message says so.
+		if len(node0.Labels) != len(test.ExpectedLabels) {
+			t.Errorf("%s: Unexpected number of labels: expected %d, got %d",
+				test.Name, len(test.ExpectedLabels), len(node0.Labels))
+		}
+		for key, expectedValue := range test.ExpectedLabels {
+			actualValue, ok := node0.Labels[key]
+			if !ok {
+				t.Errorf("%s: Can't find label %v in %v", test.Name, key, node0.Labels)
+			}
+			if actualValue != expectedValue {
+				t.Errorf("%s: label %q: expected value %q, got value %q", test.Name, key, expectedValue, actualValue)
+			}
+		}
+	}
+}
+
+// TestTryUpdateNodeHealth seeds nodeHealthMap with each table node, calls
+// tryUpdateNodeHealth, and checks that the returned Ready condition status
+// matches the one saved in nodeHealthMap afterwards.
+func TestTryUpdateNodeHealth(t *testing.T) {
+	fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
+	fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC)
+
+	//fakeNodeHandler := &testutil.FakeNodeHandler{
+	//	Existing: []*v1.Node{
+	nodes := []*v1.Node{
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "node0",
+				CreationTimestamp: fakeNow,
+			},
+			Status: v1.NodeStatus{
+				Conditions: []v1.NodeCondition{
+					{
+						Type:               v1.NodeReady,
+						Status:             v1.ConditionTrue,
+						LastHeartbeatTime:  fakeNow,
+						LastTransitionTime: fakeNow,
+					},
+				},
+			},
+		},
+	}
+	//Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
+	pods := &v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}
+	//}
+
+	ctx := context.TODO()
+	fakeNodeHandler := testutil.NewImprovedFakeNodeHandler(nodes, pods)
+	nodeController, _ := newNodeLifecycleControllerFromClient(
+		ctx,
+		fakeNodeHandler,
+		testRateLimiterQPS,
+		testRateLimiterQPS,
+		testLargeClusterThreshold,
+		testUnhealthyThreshold,
+		testNodeMonitorGracePeriod,
+		testNodeStartupGracePeriod,
+		testNodeMonitorPeriod,
+	)
+	nodeController.now = func() metav1.Time { return fakeNow }
+	nodeController.recorder = testutil.NewFakeRecorder()
+	//nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
+
+	// getStatus returns a pointer to the condition's status, or nil when the
+	// condition itself is absent, so absent conditions compare equal.
+	getStatus := func(cond *v1.NodeCondition) *v1.ConditionStatus {
+		if cond == nil {
+			return nil
+		}
+		return &cond.Status
+	}
+
+	tests := []struct {
+		name string
+		node *v1.Node
+	}{
+		{
+			name: "Status true",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeNow,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  fakeNow,
+							LastTransitionTime: fakeNow,
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Status false",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeNow,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  fakeNow,
+							LastTransitionTime: fakeNow,
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Status unknown",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeNow,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  fakeNow,
+							LastTransitionTime: fakeNow,
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Status nil",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeNow,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{},
+				},
+			},
+		},
+		{
+			name: "Status true - after grace period",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeOld,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  fakeOld,
+							LastTransitionTime: fakeOld,
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Status false - after grace period",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeOld,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  fakeOld,
+							LastTransitionTime: fakeOld,
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Status unknown - after grace period",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeOld,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  fakeOld,
+							LastTransitionTime: fakeOld,
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Status nil - after grace period",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: fakeOld,
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{},
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			nodeController.nodeHealthMap.set(test.node.Name, &nodeHealthData{
+				status:                   &test.node.Status,
+				probeTimestamp:           test.node.CreationTimestamp,
+				readyTransitionTimestamp: test.node.CreationTimestamp,
+			})
+			_, _, currentReadyCondition, err := nodeController.tryUpdateNodeHealth(ctx, test.node)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			_, savedReadyCondition := nodeutil.GetNodeCondition(nodeController.nodeHealthMap.getDeepCopy(test.node.Name).status, v1.NodeReady)
+			savedStatus := getStatus(savedReadyCondition)
+			currentStatus := getStatus(currentReadyCondition)
+			if !apiequality.Semantic.DeepEqual(currentStatus, savedStatus) {
+				t.Errorf("expected %v, got %v", savedStatus, currentStatus)
+			}
+		})
+	}
+}
+
+// Test_isNodeExcludedFromDisruptionChecks checks that, of the three table
+// nodes, only the one labelled with labelNodeDisruptionExclusion is reported
+// as excluded.
+func Test_isNodeExcludedFromDisruptionChecks(t *testing.T) {
+	validNodeStatus := v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: "Test"}}}
+	tests := []struct {
+		name string
+
+		input *v1.Node
+		want  bool
+	}{
+		{want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{}}}},
+		{want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
+		{want: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if result := isNodeExcludedFromDisruptionChecks(tt.input); result != tt.want {
+				t.Errorf("isNodeExcludedFromDisruptionChecks() = %v, want %v", result, tt.want)
+			}
+		})
+	}
+}
diff --git a/pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue.go b/pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue.go
new file mode 100644
index 00000000000..868890dfa4f
--- /dev/null
+++ b/pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue.go
@@ -0,0 +1,308 @@
+/*
+Copyright 2015 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduler
+
+import (
+	"container/heap"
+	"sync"
+	"time"
+
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/client-go/util/flowcontrol"
+	"k8s.io/klog/v2"
+)
+
+const (
+	// NodeHealthUpdateRetry controls the number of retries of writing
+	// node health update.
+	NodeHealthUpdateRetry = 5
+	// NodeEvictionPeriod controls how often NodeController will try to
+	// evict Pods from non-responsive Nodes.
+ NodeEvictionPeriod = 100 * time.Millisecond + // EvictionRateLimiterBurst is the burst value for all eviction rate + // limiters + EvictionRateLimiterBurst = 1 +) + +// TimedValue is a value that should be processed at a designated time. +type TimedValue struct { + Value string + // UID could be anything that helps identify the value + UID interface{} + AddedAt time.Time + ProcessAt time.Time +} + +// now is used to test time +var now = time.Now + +// TimedQueue is a priority heap where the lowest ProcessAt is at the front of the queue +type TimedQueue []*TimedValue + +// Len is the length of the queue. +func (h TimedQueue) Len() int { return len(h) } + +// Less returns true if queue[i] < queue[j]. +func (h TimedQueue) Less(i, j int) bool { return h[i].ProcessAt.Before(h[j].ProcessAt) } + +// Swap swaps index i and j. +func (h TimedQueue) Swap(i, j int) { h[i], h[j] = h[j], h[i] } + +// Push a new TimedValue on to the queue. +func (h *TimedQueue) Push(x interface{}) { + *h = append(*h, x.(*TimedValue)) +} + +// Pop the lowest ProcessAt item. +func (h *TimedQueue) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// UniqueQueue is a FIFO queue which additionally guarantees that any +// element can be added only once until it is removed. +type UniqueQueue struct { + lock sync.Mutex + queue TimedQueue + set sets.String +} + +// Add a new value to the queue if it wasn't added before, or was +// explicitly removed by the Remove call. Returns true if new value +// was added. +func (q *UniqueQueue) Add(value TimedValue) bool { + q.lock.Lock() + defer q.lock.Unlock() + + if q.set.Has(value.Value) { + return false + } + heap.Push(&q.queue, &value) + q.set.Insert(value.Value) + return true +} + +// Replace replaces an existing value in the queue if it already +// exists, otherwise it does nothing. Returns true if the item was +// found. 
+func (q *UniqueQueue) Replace(value TimedValue) bool { + q.lock.Lock() + defer q.lock.Unlock() + + for i := range q.queue { + if q.queue[i].Value != value.Value { + continue + } + heap.Remove(&q.queue, i) + heap.Push(&q.queue, &value) + return true + } + return false +} + +// RemoveFromQueue the value from the queue, but keeps it in the set, +// so it won't be added second time. Returns true if something was +// removed. +func (q *UniqueQueue) RemoveFromQueue(value string) bool { + q.lock.Lock() + defer q.lock.Unlock() + + if !q.set.Has(value) { + return false + } + for i, val := range q.queue { + if val.Value == value { + heap.Remove(&q.queue, i) + return true + } + } + return false +} + +// Remove the value from the queue, so Get() call won't return it, and +// allow subsequent addition of the given value. If the value is not +// present does nothing and returns false. +func (q *UniqueQueue) Remove(value string) bool { + q.lock.Lock() + defer q.lock.Unlock() + + if !q.set.Has(value) { + return false + } + q.set.Delete(value) + for i, val := range q.queue { + if val.Value == value { + heap.Remove(&q.queue, i) + return true + } + } + return true +} + +// Get returns the oldest added value that wasn't returned yet. +func (q *UniqueQueue) Get() (TimedValue, bool) { + q.lock.Lock() + defer q.lock.Unlock() + if len(q.queue) == 0 { + return TimedValue{}, false + } + result := heap.Pop(&q.queue).(*TimedValue) + q.set.Delete(result.Value) + return *result, true +} + +// Head returns the oldest added value that wasn't returned yet +// without removing it. +func (q *UniqueQueue) Head() (TimedValue, bool) { + q.lock.Lock() + defer q.lock.Unlock() + if len(q.queue) == 0 { + return TimedValue{}, false + } + result := q.queue[0] + return *result, true +} + +// Clear removes all items from the queue and duplication preventing +// set. 
+func (q *UniqueQueue) Clear() { + q.lock.Lock() + defer q.lock.Unlock() + if q.queue.Len() > 0 { + q.queue = make(TimedQueue, 0) + } + if len(q.set) > 0 { + q.set = sets.NewString() + } +} + +// RateLimitedTimedQueue is a unique item priority queue ordered by +// the expected next time of execution. It is also rate limited. +type RateLimitedTimedQueue struct { + queue UniqueQueue + limiterLock sync.Mutex + limiter flowcontrol.RateLimiter +} + +// NewRateLimitedTimedQueue creates new queue which will use given +// RateLimiter to oversee execution. +func NewRateLimitedTimedQueue(limiter flowcontrol.RateLimiter) *RateLimitedTimedQueue { + return &RateLimitedTimedQueue{ + queue: UniqueQueue{ + queue: TimedQueue{}, + set: sets.NewString(), + }, + limiter: limiter, + } +} + +// ActionFunc takes a timed value and returns false if the item must +// be retried, with an optional time.Duration if some minimum wait +// interval should be used. +type ActionFunc func(TimedValue) (bool, time.Duration) + +// Try processes the queue.Ends prematurely if RateLimiter forbids an +// action and leak is true. Otherwise, requeues the item to be +// processed. Each value is processed once if fn returns true, +// otherwise it is added back to the queue. The returned remaining is +// used to identify the minimum time to execute the next item in the +// queue. The same value is processed only once unless Remove is +// explicitly called on it (it's done by the cancelPodEviction +// function in NodeController when Node becomes Ready again) TODO: +// figure out a good way to do garbage collection for all Nodes that +// were removed from the cluster. 
+func (q *RateLimitedTimedQueue) Try(fn ActionFunc) { + val, ok := q.queue.Head() + q.limiterLock.Lock() + defer q.limiterLock.Unlock() + for ok { + // rate limit the queue checking + if !q.limiter.TryAccept() { + klog.V(10).InfoS("Try rate limited", "value", val) + // Try again later + break + } + + now := now() + if now.Before(val.ProcessAt) { + break + } + + if ok, wait := fn(val); !ok { + val.ProcessAt = now.Add(wait + 1) + q.queue.Replace(val) + } else { + q.queue.RemoveFromQueue(val.Value) + } + val, ok = q.queue.Head() + } +} + +// Add value to the queue to be processed. Won't add the same +// value(comparison by value) a second time if it was already added +// and not removed. +func (q *RateLimitedTimedQueue) Add(value string, uid interface{}) bool { + now := now() + return q.queue.Add(TimedValue{ + Value: value, + UID: uid, + AddedAt: now, + ProcessAt: now, + }) +} + +// Remove Node from the Evictor. The Node won't be processed until +// added again. +func (q *RateLimitedTimedQueue) Remove(value string) bool { + return q.queue.Remove(value) +} + +// Clear removes all items from the queue +func (q *RateLimitedTimedQueue) Clear() { + q.queue.Clear() +} + +// SwapLimiter safely swaps current limiter for this queue with the +// passed one if capacities or qps's differ. 
+func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) { + q.limiterLock.Lock() + defer q.limiterLock.Unlock() + if q.limiter.QPS() == newQPS { + return + } + var newLimiter flowcontrol.RateLimiter + if newQPS <= 0 { + newLimiter = flowcontrol.NewFakeNeverRateLimiter() + } else { + newLimiter = flowcontrol.NewTokenBucketRateLimiter(newQPS, EvictionRateLimiterBurst) + + // If we're currently waiting on limiter, we drain the new one - this is a good approach when Burst value is 1 + // TODO: figure out if we need to support higher Burst values and decide on the drain logic, should we keep: + // - saturation (percentage of used tokens) + // - number of used tokens + // - number of available tokens + // - something else + if q.limiter.TryAccept() == false { + newLimiter.TryAccept() + } + } + q.limiter.Stop() + q.limiter = newLimiter +} diff --git a/pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue_test.go b/pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue_test.go new file mode 100644 index 00000000000..cd0a14b45dc --- /dev/null +++ b/pkg/yurtmanager/controller/nodelifecycle/scheduler/rate_limited_queue_test.go @@ -0,0 +1,333 @@ +/* +Copyright 2015 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "reflect" + "testing" + "time" + + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/util/flowcontrol" +) + +func CheckQueueEq(lhs []string, rhs TimedQueue) bool { + for i := 0; i < len(lhs); i++ { + if rhs[i].Value != lhs[i] { + return false + } + } + return true +} + +func CheckSetEq(lhs, rhs sets.String) bool { + return lhs.IsSuperset(rhs) && rhs.IsSuperset(lhs) +} + +func TestAddNode(t *testing.T) { + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + + queuePattern := []string{"first", "second", "third"} + if len(evictor.queue.queue) != len(queuePattern) { + t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) + } + if !CheckQueueEq(queuePattern, evictor.queue.queue) { + t.Errorf("Invalid queue. Got %v, expected %v", evictor.queue.queue, queuePattern) + } + + setPattern := sets.NewString("first", "second", "third") + if len(evictor.queue.set) != len(setPattern) { + t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) + } + if !CheckSetEq(setPattern, evictor.queue.set) { + t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) + } +} + +func TestDelNode(t *testing.T) { + defer func() { now = time.Now }() + var tick int64 + now = func() time.Time { + t := time.Unix(tick, 0) + tick++ + return t + } + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + evictor.Remove("first") + + queuePattern := []string{"second", "third"} + if len(evictor.queue.queue) != len(queuePattern) { + t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) + } + if !CheckQueueEq(queuePattern, evictor.queue.queue) { + t.Errorf("Invalid queue. 
Got %v, expected %v", evictor.queue.queue, queuePattern) + } + + setPattern := sets.NewString("second", "third") + if len(evictor.queue.set) != len(setPattern) { + t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) + } + if !CheckSetEq(setPattern, evictor.queue.set) { + t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) + } + + evictor = NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + evictor.Remove("second") + + queuePattern = []string{"first", "third"} + if len(evictor.queue.queue) != len(queuePattern) { + t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) + } + if !CheckQueueEq(queuePattern, evictor.queue.queue) { + t.Errorf("Invalid queue. Got %v, expected %v", evictor.queue.queue, queuePattern) + } + + setPattern = sets.NewString("first", "third") + if len(evictor.queue.set) != len(setPattern) { + t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) + } + if !CheckSetEq(setPattern, evictor.queue.set) { + t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) + } + + evictor = NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + evictor.Remove("third") + + queuePattern = []string{"first", "second"} + if len(evictor.queue.queue) != len(queuePattern) { + t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) + } + if !CheckQueueEq(queuePattern, evictor.queue.queue) { + t.Errorf("Invalid queue. 
Got %v, expected %v", evictor.queue.queue, queuePattern) + } + + setPattern = sets.NewString("first", "second") + if len(evictor.queue.set) != len(setPattern) { + t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) + } + if !CheckSetEq(setPattern, evictor.queue.set) { + t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) + } +} + +func TestTry(t *testing.T) { + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + evictor.Remove("second") + + deletedMap := sets.NewString() + evictor.Try(func(value TimedValue) (bool, time.Duration) { + deletedMap.Insert(value.Value) + return true, 0 + }) + + setPattern := sets.NewString("first", "third") + if len(deletedMap) != len(setPattern) { + t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) + } + if !CheckSetEq(setPattern, deletedMap) { + t.Errorf("Invalid map. 
Got %v, expected %v", deletedMap, setPattern) + } +} + +func TestTryOrdering(t *testing.T) { + defer func() { now = time.Now }() + current := time.Unix(0, 0) + delay := 0 + // the current time is incremented by 1ms every time now is invoked + now = func() time.Time { + if delay > 0 { + delay-- + } else { + current = current.Add(time.Millisecond) + } + t.Logf("time %d", current.UnixNano()) + return current + } + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + + order := []string{} + count := 0 + hasQueued := false + evictor.Try(func(value TimedValue) (bool, time.Duration) { + count++ + t.Logf("eviction %d", count) + if value.ProcessAt.IsZero() { + t.Fatalf("processAt should not be zero") + } + switch value.Value { + case "first": + if !value.AddedAt.Equal(time.Unix(0, time.Millisecond.Nanoseconds())) { + t.Fatalf("added time for %s is %v", value.Value, value.AddedAt) + } + + case "second": + if !value.AddedAt.Equal(time.Unix(0, 2*time.Millisecond.Nanoseconds())) { + t.Fatalf("added time for %s is %v", value.Value, value.AddedAt) + } + if hasQueued { + if !value.ProcessAt.Equal(time.Unix(0, 6*time.Millisecond.Nanoseconds())) { + t.Fatalf("process time for %s is %v", value.Value, value.ProcessAt) + } + break + } + hasQueued = true + delay = 1 + t.Logf("going to delay") + return false, 2 * time.Millisecond + + case "third": + if !value.AddedAt.Equal(time.Unix(0, 3*time.Millisecond.Nanoseconds())) { + t.Fatalf("added time for %s is %v", value.Value, value.AddedAt) + } + } + order = append(order, value.Value) + return true, 0 + }) + if !reflect.DeepEqual(order, []string{"first", "third"}) { + t.Fatalf("order was wrong: %v", order) + } + if count != 3 { + t.Fatalf("unexpected iterations: %d", count) + } +} + +func TestTryRemovingWhileTry(t *testing.T) { + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + 
evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + + processing := make(chan struct{}) + wait := make(chan struct{}) + order := []string{} + count := 0 + queued := false + + // while the Try function is processing "second", remove it from the queue + // we should not see "second" retried. + go func() { + <-processing + evictor.Remove("second") + close(wait) + }() + evictor.Try(func(value TimedValue) (bool, time.Duration) { + count++ + if value.AddedAt.IsZero() { + t.Fatalf("added should not be zero") + } + if value.ProcessAt.IsZero() { + t.Fatalf("next should not be zero") + } + if !queued && value.Value == "second" { + queued = true + close(processing) + <-wait + return false, time.Millisecond + } + order = append(order, value.Value) + return true, 0 + }) + + if !reflect.DeepEqual(order, []string{"first", "third"}) { + t.Fatalf("order was wrong: %v", order) + } + if count != 3 { + t.Fatalf("unexpected iterations: %d", count) + } +} + +func TestClear(t *testing.T) { + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + + evictor.Clear() + + if len(evictor.queue.queue) != 0 { + t.Fatalf("Clear should remove all elements from the queue.") + } +} + +func TestSwapLimiter(t *testing.T) { + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + fakeAlways := flowcontrol.NewFakeAlwaysRateLimiter() + qps := evictor.limiter.QPS() + if qps != fakeAlways.QPS() { + t.Fatalf("QPS does not match create one: %v instead of %v", qps, fakeAlways.QPS()) + } + + evictor.SwapLimiter(0) + qps = evictor.limiter.QPS() + fakeNever := flowcontrol.NewFakeNeverRateLimiter() + if qps != fakeNever.QPS() { + t.Fatalf("QPS does not match create one: %v instead of %v", qps, fakeNever.QPS()) + } + + createdQPS := float32(5.5) + evictor.SwapLimiter(createdQPS) + qps = evictor.limiter.QPS() + if qps != 
createdQPS { + t.Fatalf("QPS does not match create one: %v instead of %v", qps, createdQPS) + } +} + +func TestAddAfterTry(t *testing.T) { + evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) + evictor.Add("first", "11111") + evictor.Add("second", "22222") + evictor.Add("third", "33333") + evictor.Remove("second") + + deletedMap := sets.NewString() + evictor.Try(func(value TimedValue) (bool, time.Duration) { + deletedMap.Insert(value.Value) + return true, 0 + }) + + setPattern := sets.NewString("first", "third") + if len(deletedMap) != len(setPattern) { + t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) + } + if !CheckSetEq(setPattern, deletedMap) { + t.Errorf("Invalid map. Got %v, expected %v", deletedMap, setPattern) + } + + evictor.Add("first", "11111") + evictor.Try(func(value TimedValue) (bool, time.Duration) { + t.Errorf("We shouldn't process the same value if the explicit remove wasn't called.") + return true, 0 + }) +} diff --git a/pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager.go b/pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager.go new file mode 100644 index 00000000000..37e099d3e9c --- /dev/null +++ b/pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager.go @@ -0,0 +1,497 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "context" + "fmt" + "hash/fnv" + "io" + "math" + "sync" + "time" + + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/helper" +) + +const ( + // TODO (k82cn): Figure out a reasonable number of workers/channels and propagate + // the number of workers up making it a parameter of Run() function. + + // NodeUpdateChannelSize defines the size of channel for node update events. + NodeUpdateChannelSize = 10 + // UpdateWorkerSize defines the size of workers for node update or/and pod update. + UpdateWorkerSize = 8 + podUpdateChannelSize = 1 + retries = 5 + + // LabelExcludeEviction is a label on nodes that controls whether they can be evicted. + LabelExcludeEviction = "alibabacloud.com/exclude-eviction" + + // AnnotationKeyVirtualClusterNode is an internal key for esk, value is 'true' of 'false' + AnnotationKeyVirtualClusterNode = "node.beta.alibabacloud.com/is-vc-node" +) + +type nodeUpdateItem struct { + nodeName string +} + +type podUpdateItem struct { + podName string + podNamespace string + nodeName string +} + +func hash(val string, max int) int { + hasher := fnv.New32a() + io.WriteString(hasher, val) + return int(hasher.Sum32() % uint32(max)) +} + +// GetPodsByNodeNameFunc returns the list of pods assigned to the specified node. +type GetPodsByNodeNameFunc func(nodeName string) ([]*v1.Pod, error) + +// NoExecuteTaintManager listens to Taint/Toleration changes and is responsible for removing Pods +// from Nodes tainted with NoExecute Taints. 
+type NoExecuteTaintManager struct { + cacheClient client.Client + recorder record.EventRecorder + getPodsAssignedToNode GetPodsByNodeNameFunc + + taintEvictionQueue *TimedWorkerQueue + // keeps a map from nodeName to all noExecute taints on that Node + taintedNodesLock sync.Mutex + taintedNodes map[string][]v1.Taint + + nodeUpdateChannels []chan nodeUpdateItem + podUpdateChannels []chan podUpdateItem + + nodeUpdateQueue workqueue.Interface + podUpdateQueue workqueue.Interface +} + +func deletePodHandler(c client.Client, emitEventFunc func(types.NamespacedName)) func(ctx context.Context, args *WorkArgs) error { + return func(ctx context.Context, args *WorkArgs) error { + ns := args.NamespacedName.Namespace + name := args.NamespacedName.Name + klog.Infof("NoExecuteTaintManager is deleting pod %s", args.NamespacedName.String()) + if emitEventFunc != nil { + emitEventFunc(args.NamespacedName) + } + var err error + for i := 0; i < retries; i++ { + //err = c.CoreV1().Pods(ns).Delete(ctx, name, metav1.DeleteOptions{}) + err = c.Delete(ctx, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: name, + }, + }) + if err == nil { + break + } + time.Sleep(10 * time.Millisecond) + } + return err + } +} + +func getNoExecuteTaints(taints []v1.Taint) []v1.Taint { + result := []v1.Taint{} + for i := range taints { + if taints[i].Effect == v1.TaintEffectNoExecute { + result = append(result, taints[i]) + } + } + return result +} + +// getMinTolerationTime returns minimal toleration time from the given slice, or -1 if it's infinite. 
+func getMinTolerationTime(tolerations []v1.Toleration) time.Duration { + minTolerationTime := int64(math.MaxInt64) + if len(tolerations) == 0 { + return 0 + } + + for i := range tolerations { + if tolerations[i].TolerationSeconds != nil { + tolerationSeconds := *(tolerations[i].TolerationSeconds) + if tolerationSeconds <= 0 { + return 0 + } else if tolerationSeconds < minTolerationTime { + minTolerationTime = tolerationSeconds + } + } + } + + if minTolerationTime == int64(math.MaxInt64) { + return -1 + } + return time.Duration(minTolerationTime) * time.Second +} + +// NewNoExecuteTaintManager creates a new NoExecuteTaintManager that will use passed clientset to +// communicate with the API server. +func NewNoExecuteTaintManager(recorder record.EventRecorder, cacheClient client.Client, getPodsAssignedToNode GetPodsByNodeNameFunc) *NoExecuteTaintManager { + tm := &NoExecuteTaintManager{ + cacheClient: cacheClient, + recorder: recorder, + getPodsAssignedToNode: getPodsAssignedToNode, + taintedNodes: make(map[string][]v1.Taint), + + nodeUpdateQueue: workqueue.NewNamed("noexec_taint_node"), + podUpdateQueue: workqueue.NewNamed("noexec_taint_pod"), + } + tm.taintEvictionQueue = CreateWorkerQueue(deletePodHandler(cacheClient, tm.emitPodDeletionEvent)) + + return tm +} + +// Run starts NoExecuteTaintManager which will run in loop until `stopCh` is closed. +func (tc *NoExecuteTaintManager) Run(ctx context.Context) { + defer utilruntime.HandleCrash() + klog.Info("Starting NoExecuteTaintManager") + + defer tc.nodeUpdateQueue.ShutDown() + defer tc.podUpdateQueue.ShutDown() + + for i := 0; i < UpdateWorkerSize; i++ { + tc.nodeUpdateChannels = append(tc.nodeUpdateChannels, make(chan nodeUpdateItem, NodeUpdateChannelSize)) + tc.podUpdateChannels = append(tc.podUpdateChannels, make(chan podUpdateItem, podUpdateChannelSize)) + } + + // Functions that are responsible for taking work items out of the workqueues and putting them + // into channels. 
+ go func(stopCh <-chan struct{}) { + for { + item, shutdown := tc.nodeUpdateQueue.Get() + if shutdown { + break + } + nodeUpdate := item.(nodeUpdateItem) + hash := hash(nodeUpdate.nodeName, UpdateWorkerSize) + select { + case <-stopCh: + tc.nodeUpdateQueue.Done(item) + return + case tc.nodeUpdateChannels[hash] <- nodeUpdate: + // tc.nodeUpdateQueue.Done is called by the nodeUpdateChannels worker + } + } + }(ctx.Done()) + + go func(stopCh <-chan struct{}) { + for { + item, shutdown := tc.podUpdateQueue.Get() + if shutdown { + break + } + // The fact that pods are processed by the same worker as nodes is used to avoid races + // between node worker setting tc.taintedNodes and pod worker reading this to decide + // whether to delete pod. + // It's possible that even without this assumption this code is still correct. + podUpdate := item.(podUpdateItem) + hash := hash(podUpdate.nodeName, UpdateWorkerSize) + select { + case <-stopCh: + tc.podUpdateQueue.Done(item) + return + case tc.podUpdateChannels[hash] <- podUpdate: + // tc.podUpdateQueue.Done is called by the podUpdateChannels worker + } + } + }(ctx.Done()) + + wg := sync.WaitGroup{} + wg.Add(UpdateWorkerSize) + for i := 0; i < UpdateWorkerSize; i++ { + go tc.worker(ctx, i, wg.Done, ctx.Done()) + } + wg.Wait() +} + +func (tc *NoExecuteTaintManager) worker(ctx context.Context, worker int, done func(), stopCh <-chan struct{}) { + defer done() + + // When processing events we want to prioritize Node updates over Pod updates, + // as NodeUpdates that interest NoExecuteTaintManager should be handled as soon as possible - + // we don't want user (or system) to wait until PodUpdate queue is drained before it can + // start evicting Pods from tainted Nodes. 
+ for { + select { + case <-stopCh: + return + case nodeUpdate := <-tc.nodeUpdateChannels[worker]: + tc.handleNodeUpdate(ctx, nodeUpdate) + tc.nodeUpdateQueue.Done(nodeUpdate) + case podUpdate := <-tc.podUpdateChannels[worker]: + // If we found a Pod update we need to empty Node queue first. + priority: + for { + select { + case nodeUpdate := <-tc.nodeUpdateChannels[worker]: + tc.handleNodeUpdate(ctx, nodeUpdate) + tc.nodeUpdateQueue.Done(nodeUpdate) + default: + break priority + } + } + // After Node queue is emptied we process podUpdate. + tc.handlePodUpdate(ctx, podUpdate) + tc.podUpdateQueue.Done(podUpdate) + } + } +} + +// PodUpdated is used to notify NoExecuteTaintManager about Pod changes. +func (tc *NoExecuteTaintManager) PodUpdated(oldPod *v1.Pod, newPod *v1.Pod) { + podName := "" + podNamespace := "" + nodeName := "" + oldTolerations := []v1.Toleration{} + if oldPod != nil { + podName = oldPod.Name + podNamespace = oldPod.Namespace + nodeName = oldPod.Spec.NodeName + oldTolerations = oldPod.Spec.Tolerations + } + newTolerations := []v1.Toleration{} + if newPod != nil { + podName = newPod.Name + podNamespace = newPod.Namespace + nodeName = newPod.Spec.NodeName + newTolerations = newPod.Spec.Tolerations + } + + if oldPod != nil && newPod != nil && helper.Semantic.DeepEqual(oldTolerations, newTolerations) && oldPod.Spec.NodeName == newPod.Spec.NodeName { + return + } + updateItem := podUpdateItem{ + podName: podName, + podNamespace: podNamespace, + nodeName: nodeName, + } + + tc.podUpdateQueue.Add(updateItem) +} + +// NodeUpdated is used to notify NoExecuteTaintManager about Node changes. 
+func (tc *NoExecuteTaintManager) NodeUpdated(oldNode *v1.Node, newNode *v1.Node) { + nodeName := "" + oldTaints := []v1.Taint{} + if oldNode != nil { + nodeName = oldNode.Name + oldTaints = getNoExecuteTaints(oldNode.Spec.Taints) + } + + newTaints := []v1.Taint{} + if newNode != nil { + nodeName = newNode.Name + newTaints = getNoExecuteTaints(newNode.Spec.Taints) + } + + if oldNode != nil && newNode != nil && helper.Semantic.DeepEqual(oldTaints, newTaints) { + return + } + updateItem := nodeUpdateItem{ + nodeName: nodeName, + } + + tc.nodeUpdateQueue.Add(updateItem) +} + +func (tc *NoExecuteTaintManager) cancelWorkWithEvent(nsName types.NamespacedName) { + if tc.taintEvictionQueue.CancelWork(nsName.String()) { + tc.emitCancelPodDeletionEvent(nsName) + } +} + +func (tc *NoExecuteTaintManager) processPodOnNode( + ctx context.Context, + podNamespacedName types.NamespacedName, + nodeName string, + tolerations []v1.Toleration, + taints []v1.Taint, + now time.Time, +) { + if len(taints) == 0 { + tc.cancelWorkWithEvent(podNamespacedName) + } + allTolerated, usedTolerations := helper.GetMatchingTolerations(taints, tolerations) + if !allTolerated { + klog.V(2).Infof("Not all taints are tolerated after update for pod(%s) on node(%s)", podNamespacedName.String(), nodeName) + // We're canceling scheduled work (if any), as we're going to delete the Pod right away. + tc.cancelWorkWithEvent(podNamespacedName) + tc.taintEvictionQueue.AddWork(ctx, NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), time.Now(), time.Now()) + return + } + minTolerationTime := getMinTolerationTime(usedTolerations) + // getMinTolerationTime returns negative value to denote infinite toleration. 
+ if minTolerationTime < 0 { + klog.V(4).Infof("Current tolerations for pod(%s) tolerate forever, cancelling any scheduled deletion", podNamespacedName.String()) + tc.cancelWorkWithEvent(podNamespacedName) + return + } + + startTime := now + triggerTime := startTime.Add(minTolerationTime) + scheduledEviction := tc.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) + if scheduledEviction != nil { + startTime = scheduledEviction.CreatedAt + if startTime.Add(minTolerationTime).Before(triggerTime) { + return + } + tc.cancelWorkWithEvent(podNamespacedName) + } + tc.taintEvictionQueue.AddWork(ctx, NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), startTime, triggerTime) +} + +func (tc *NoExecuteTaintManager) handlePodUpdate(ctx context.Context, podUpdate podUpdateItem) { + //pod, err := tc.podLister.Pods(podUpdate.podNamespace).Get(podUpdate.podName) + var pod v1.Pod + err := tc.cacheClient.Get(ctx, types.NamespacedName{Namespace: podUpdate.podNamespace, Name: podUpdate.podName}, &pod) + if err != nil { + if apierrors.IsNotFound(err) { + // Delete + podNamespacedName := types.NamespacedName{Namespace: podUpdate.podNamespace, Name: podUpdate.podName} + klog.V(4).Infof("Noticed pod(%s) deletion", podNamespacedName.String()) + tc.cancelWorkWithEvent(podNamespacedName) + return + } + utilruntime.HandleError(fmt.Errorf("could not get pod %s/%s: %v", podUpdate.podName, podUpdate.podNamespace, err)) + return + } + + // We key the workqueue and shard workers by nodeName. If we don't match the current state we should not be the one processing the current object. 
+ if pod.Spec.NodeName != podUpdate.nodeName { + return + } + + // Create or Update + podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} + klog.V(4).Infof("Noticed pod(%s) update", podNamespacedName.String()) + nodeName := pod.Spec.NodeName + if nodeName == "" { + return + } + taints, ok := func() ([]v1.Taint, bool) { + tc.taintedNodesLock.Lock() + defer tc.taintedNodesLock.Unlock() + taints, ok := tc.taintedNodes[nodeName] + return taints, ok + }() + // It's possible that Node was deleted, or Taints were removed before, which triggered + // eviction cancelling if it was needed. + if !ok { + return + } + tc.processPodOnNode(ctx, podNamespacedName, nodeName, pod.Spec.Tolerations, taints, time.Now()) +} + +func (tc *NoExecuteTaintManager) handleNodeUpdate(ctx context.Context, nodeUpdate nodeUpdateItem) { + //node, err := tc.nodeLister.Get(nodeUpdate.nodeName) + node := new(v1.Node) + err := tc.cacheClient.Get(ctx, types.NamespacedName{Name: nodeUpdate.nodeName}, node) + if err != nil { + if apierrors.IsNotFound(err) { + // Delete + klog.V(4).Infof("Noticed node(%s) deletion", nodeUpdate.nodeName) + tc.taintedNodesLock.Lock() + defer tc.taintedNodesLock.Unlock() + delete(tc.taintedNodes, nodeUpdate.nodeName) + return + } + utilruntime.HandleError(fmt.Errorf("cannot get node %s: %v", nodeUpdate.nodeName, err)) + return + } + + // Create or Update + klog.V(4).Infof("Noticed node(%s) update", node.Name) + taints := getNoExecuteTaints(node.Spec.Taints) + func() { + tc.taintedNodesLock.Lock() + defer tc.taintedNodesLock.Unlock() + klog.V(4).Infof("Updating known taints on node(%s), %v", node.Name, taints) + if len(taints) == 0 { + delete(tc.taintedNodes, node.Name) + } else { + tc.taintedNodes[node.Name] = taints + } + }() + + // This is critical that we update tc.taintedNodes before we call getPodsAssignedToNode: + // getPodsAssignedToNode can be delayed as long as all future updates to pods will call + // tc.PodUpdated which will use 
tc.taintedNodes to potentially delete delayed pods. + pods, err := tc.getPodsAssignedToNode(node.Name) + if err != nil { + klog.Errorf("Failed to get pods assigned to node(%s), %v", node.Name, err) + return + } + if len(pods) == 0 { + return + } + // Short circuit, to make this controller a bit faster. + if len(taints) == 0 { + klog.V(4).Infof("All taints were removed from the node(%s). Cancelling all evictions...", node.Name) + for i := range pods { + tc.cancelWorkWithEvent(types.NamespacedName{Namespace: pods[i].Namespace, Name: pods[i].Name}) + } + return + } + + now := time.Now() + for _, pod := range pods { + podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} + tc.processPodOnNode(ctx, podNamespacedName, node.Name, pod.Spec.Tolerations, taints, now) + } +} + +func (tc *NoExecuteTaintManager) emitPodDeletionEvent(nsName types.NamespacedName) { + if tc.recorder == nil { + return + } + ref := &v1.ObjectReference{ + APIVersion: "v1", + Kind: "Pod", + Name: nsName.Name, + Namespace: nsName.Namespace, + } + tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Marking for deletion Pod %s", nsName.String()) +} + +func (tc *NoExecuteTaintManager) emitCancelPodDeletionEvent(nsName types.NamespacedName) { + if tc.recorder == nil { + return + } + ref := &v1.ObjectReference{ + APIVersion: "v1", + Kind: "Pod", + Name: nsName.Name, + Namespace: nsName.Namespace, + } + tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Cancelling deletion of Pod %s", nsName.String()) +} diff --git a/pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager_test.go b/pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager_test.go new file mode 100644 index 00000000000..ef148351b56 --- /dev/null +++ b/pkg/yurtmanager/controller/nodelifecycle/scheduler/taint_manager_test.go @@ -0,0 +1,1047 @@ +/* +Copyright 2017 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "context" + "fmt" + "sort" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + flag "github.com/spf13/pflag" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + clienttesting "k8s.io/client-go/testing" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/testutil" +) + +var timeForControllerToProgressForSanityCheck = 20 * time.Millisecond + +func init() { + klog.InitFlags(nil) + flag.Set("v", "5") + flag.Parse() +} +func getPodsAssignedToNode(ctx context.Context, c client.Client) GetPodsByNodeNameFunc { + return func(nodeName string) ([]*v1.Pod, error) { + var pods v1.PodList + err := c.List(ctx, &pods, &client.ListOptions{}) + if err != nil { + return []*v1.Pod{}, fmt.Errorf("failed to get Pods assigned to node %v", nodeName) + } + rPods := make([]*v1.Pod, 0) + for i := range pods.Items { + if pods.Items[i].Spec.NodeName == nodeName { + rPods = append(rPods, &pods.Items[i]) + } + } + return rPods, nil + } +} + +func createNoExecuteTaint(index int) v1.Taint { + now := metav1.Now() + return v1.Taint{ + Key: "testTaint" + fmt.Sprintf("%v", 
index), + Value: "test" + fmt.Sprintf("%v", index), + Effect: v1.TaintEffectNoExecute, + TimeAdded: &now, + } +} + +func addToleration(pod *v1.Pod, index int, duration int64) *v1.Pod { + if pod.Annotations == nil { + pod.Annotations = map[string]string{} + } + if duration < 0 { + pod.Spec.Tolerations = []v1.Toleration{{Key: "testTaint" + fmt.Sprintf("%v", index), Value: "test" + fmt.Sprintf("%v", index), Effect: v1.TaintEffectNoExecute}} + + } else { + pod.Spec.Tolerations = []v1.Toleration{{Key: "testTaint" + fmt.Sprintf("%v", index), Value: "test" + fmt.Sprintf("%v", index), Effect: v1.TaintEffectNoExecute, TolerationSeconds: &duration}} + } + return pod +} + +func addTaintsToNode(node *v1.Node, key, value string, indices []int) *v1.Node { + taints := []v1.Taint{} + for _, index := range indices { + taints = append(taints, createNoExecuteTaint(index)) + } + node.Spec.Taints = taints + return node +} + +type timestampedPod struct { + names []string + timestamp time.Duration +} + +type durationSlice []timestampedPod + +func (a durationSlice) Len() int { return len(a) } +func (a durationSlice) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a durationSlice) Less(i, j int) bool { return a[i].timestamp < a[j].timestamp } + +func TestFilterNoExecuteTaints(t *testing.T) { + taints := []v1.Taint{ + { + Key: "one", + Value: "one", + Effect: v1.TaintEffectNoExecute, + }, + { + Key: "two", + Value: "two", + Effect: v1.TaintEffectNoSchedule, + }, + } + taints = getNoExecuteTaints(taints) + if len(taints) != 1 || taints[0].Key != "one" { + t.Errorf("Filtering doesn't work. 
Got %v", taints) + } +} + +func TestCreatePod(t *testing.T) { + testCases := []struct { + description string + pod *v1.Pod + taintedNodes map[string][]v1.Taint + expectPatch bool + expectDelete bool + enablePodDisruptionConditions bool + }{ + { + description: "not scheduled - ignore", + pod: NewPod("pod1", ""), + taintedNodes: map[string][]v1.Taint{}, + expectDelete: false, + }, + { + description: "scheduled on untainted Node", + pod: NewPod("pod1", "node1"), + taintedNodes: map[string][]v1.Taint{}, + expectDelete: false, + }, + { + description: "schedule on tainted Node", + pod: NewPod("pod1", "node1"), + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: true, + }, + //{ + // description: "schedule on tainted Node; PodDisruptionConditions enabled", + // pod: NewPod("pod1", "node1"), + // taintedNodes: map[string][]v1.Taint{ + // "node1": {createNoExecuteTaint(1)}, + // }, + // expectPatch: true, + // expectDelete: true, + // enablePodDisruptionConditions: true, + //}, + { + description: "schedule on tainted Node with finite toleration", + pod: addToleration(NewPod("pod1", "node1"), 1, 100), + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: false, + }, + { + description: "schedule on tainted Node with infinite toleration", + pod: addToleration(NewPod("pod1", "node1"), 1, -1), + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: false, + }, + { + description: "schedule on tainted Node with infinite invalid toleration", + pod: addToleration(NewPod("pod1", "node1"), 2, -1), + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: true, + }, + } + + for _, item := range testCases { + t.Run(item.description, func(t *testing.T) { + //defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, item.enablePodDisruptionConditions)() + ctx, 
cancel := context.WithCancel(context.Background()) + //fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*item.pod}}) + //controller, podIndexer, _ := setupNewNoExecuteTaintManager(ctx, fakeActionClient) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(item.pod).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + go controller.Run(ctx) + controller.taintedNodes = item.taintedNodes + + //podIndexer.Add(item.pod) + controller.PodUpdated(nil, item.pod) + + verifyPodActions(t, item.description, fakeClientset, item.expectPatch, item.expectDelete) + + cancel() + }) + } +} + +func TestDeletePod(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + //fakeClientset := fake.NewSimpleClientset() + //controller, _, _ := setupNewNoExecuteTaintManager(ctx, fakeClientset) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + go controller.Run(ctx) + controller.taintedNodes = map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + } + controller.PodUpdated(NewPod("pod1", "node1"), nil) + // wait a bit to see if nothing will panic + time.Sleep(timeForControllerToProgressForSanityCheck) +} + +func TestUpdatePod(t *testing.T) { + testCases := []struct { + description string + prevPod *v1.Pod + awaitForScheduledEviction bool + newPod *v1.Pod + taintedNodes map[string][]v1.Taint + expectPatch 
bool + expectDelete bool + enablePodDisruptionConditions bool + }{ + //{ + // description: "scheduling onto tainted Node results in patch and delete when PodDisruptionConditions enabled", + // prevPod: NewPod("pod1", ""), + // newPod: NewPod("pod1", "node1"), + // taintedNodes: map[string][]v1.Taint{ + // "node1": {createNoExecuteTaint(1)}, + // }, + // expectPatch: true, + // expectDelete: true, + // enablePodDisruptionConditions: true, + //}, + { + description: "scheduling onto tainted Node", + prevPod: NewPod("pod1", ""), + newPod: NewPod("pod1", "node1"), + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: true, + }, + { + description: "scheduling onto tainted Node with toleration", + prevPod: addToleration(NewPod("pod1", ""), 1, -1), + newPod: addToleration(NewPod("pod1", "node1"), 1, -1), + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: false, + }, + { + description: "removing toleration", + prevPod: addToleration(NewPod("pod1", "node1"), 1, 100), + newPod: NewPod("pod1", "node1"), + awaitForScheduledEviction: true, + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: true, + }, + { + description: "lengthening toleration shouldn't work", + prevPod: addToleration(NewPod("pod1", "node1"), 1, 1), + newPod: addToleration(NewPod("pod1", "node1"), 1, 100), + awaitForScheduledEviction: true, + taintedNodes: map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + }, + expectDelete: true, + }, + } + + for _, item := range testCases { + t.Run(item.description, func(t *testing.T) { + //defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, item.enablePodDisruptionConditions)() + ctx, cancel := context.WithCancel(context.Background()) + //fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*item.prevPod}}) + //controller, podIndexer, _ := 
setupNewNoExecuteTaintManager(context.TODO(), fakeClientset) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithLists(&v1.PodList{Items: []v1.Pod{*item.prevPod}}).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + controller.taintedNodes = item.taintedNodes + go controller.Run(ctx) + + //podIndexer.Add(item.prevPod) + controller.PodUpdated(nil, item.prevPod) + + if item.awaitForScheduledEviction { + nsName := types.NamespacedName{Namespace: item.prevPod.Namespace, Name: item.prevPod.Name} + err := wait.PollImmediate(time.Millisecond*10, time.Second, func() (bool, error) { + scheduledEviction := controller.taintEvictionQueue.GetWorkerUnsafe(nsName.String()) + return scheduledEviction != nil, nil + }) + if err != nil { + t.Fatalf("Failed to await for scheduled eviction: %q", err) + } + } + + //podIndexer.Update(item.newPod) + fakeClientset.Update(ctx, item.newPod, &client.UpdateOptions{}) + controller.PodUpdated(item.prevPod, item.newPod) + + verifyPodActions(t, item.description, fakeClientset, item.expectPatch, item.expectDelete) + cancel() + }) + } +} + +func TestCreateNode(t *testing.T) { + testCases := []struct { + description string + pods []v1.Pod + node *v1.Node + expectPatch bool + expectDelete bool + }{ + { + description: "Creating Node matching already assigned Pod", + pods: []v1.Pod{ + *NewPod("pod1", "node1"), + }, + node: NewNode("node1"), + expectPatch: false, + expectDelete: false, + }, + { + description: "Creating tainted Node matching already assigned Pod", + pods: []v1.Pod{ + *NewPod("pod1", "node1"), + }, + node: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + //expectPatch: true, + expectDelete: true, + }, + { + description: "Creating 
tainted Node matching already assigned tolerating Pod", + pods: []v1.Pod{ + *addToleration(NewPod("pod1", "node1"), 1, -1), + }, + node: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + expectPatch: false, + expectDelete: false, + }, + } + + for _, item := range testCases { + ctx, cancel := context.WithCancel(context.Background()) + //fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) + //controller, _, nodeIndexer := setupNewNoExecuteTaintManager(ctx, fakeClientset) + //nodeIndexer.Add(item.node) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithLists(&v1.PodList{Items: item.pods}).WithObjects(item.node).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + go controller.Run(ctx) + controller.NodeUpdated(nil, item.node) + + verifyPodActions(t, item.description, fakeClientset, item.expectPatch, item.expectDelete) + + cancel() + } +} + +func TestDeleteNode(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + //fakeClientset := fake.NewSimpleClientset() + //controller, _, _ := setupNewNoExecuteTaintManager(ctx, fakeClientset) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + controller.taintedNodes = map[string][]v1.Taint{ + "node1": {createNoExecuteTaint(1)}, + } + go controller.Run(ctx) + controller.NodeUpdated(NewNode("node1"), nil) + + // await until controller.taintedNodes is 
empty + err := wait.PollImmediate(10*time.Millisecond, time.Second, func() (bool, error) { + controller.taintedNodesLock.Lock() + defer controller.taintedNodesLock.Unlock() + _, ok := controller.taintedNodes["node1"] + return !ok, nil + }) + if err != nil { + t.Errorf("Failed to await for processing node deleted: %q", err) + } + cancel() +} + +func TestUpdateNode(t *testing.T) { + testCases := []struct { + description string + pods []v1.Pod + oldNode *v1.Node + newNode *v1.Node + expectPatch bool + expectDelete bool + additionalSleep time.Duration + enablePodDisruptionConditions bool + }{ + //{ + // description: "Added taint, expect node patched and deleted when PodDisruptionConditions is enabled", + // pods: []v1.Pod{ + // *NewPod("pod1", "node1"), + // }, + // oldNode: NewNode("node1"), + // newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + // expectPatch: true, + // expectDelete: true, + // enablePodDisruptionConditions: true, + //}, + { + description: "Added taint", + pods: []v1.Pod{ + *NewPod("pod1", "node1"), + }, + oldNode: NewNode("node1"), + newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + expectDelete: true, + }, + { + description: "Added tolerated taint", + pods: []v1.Pod{ + *addToleration(NewPod("pod1", "node1"), 1, 100), + }, + oldNode: NewNode("node1"), + newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + expectDelete: false, + }, + { + description: "Only one added taint tolerated", + pods: []v1.Pod{ + *addToleration(NewPod("pod1", "node1"), 1, 100), + }, + oldNode: NewNode("node1"), + newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1, 2}), + expectDelete: true, + }, + { + description: "Taint removed", + pods: []v1.Pod{ + *addToleration(NewPod("pod1", "node1"), 1, 1), + }, + oldNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + newNode: NewNode("node1"), + expectDelete: false, + additionalSleep: 1500 * 
time.Millisecond, + }, + { + description: "Pod with multiple tolerations are evicted when first one runs out", + pods: []v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: "pod1", + }, + Spec: v1.PodSpec{ + NodeName: "node1", + Tolerations: []v1.Toleration{ + {Key: "testTaint1", Value: "test1", Effect: v1.TaintEffectNoExecute, TolerationSeconds: &[]int64{1}[0]}, + {Key: "testTaint2", Value: "test2", Effect: v1.TaintEffectNoExecute, TolerationSeconds: &[]int64{100}[0]}, + }, + }, + Status: v1.PodStatus{ + Conditions: []v1.PodCondition{ + { + Type: v1.PodReady, + Status: v1.ConditionTrue, + }, + }, + }, + }, + }, + oldNode: NewNode("node1"), + newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1, 2}), + expectDelete: true, + }, + } + + for _, item := range testCases { + t.Run(item.description, func(t *testing.T) { + //defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, item.enablePodDisruptionConditions)() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + //fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) + //controller, _, nodeIndexer := setupNewNoExecuteTaintManager(ctx, fakeClientset) + //nodeIndexer.Add(item.newNode) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithLists(&v1.PodList{Items: item.pods}).WithObjects(item.newNode).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + go controller.Run(ctx) + controller.NodeUpdated(item.oldNode, item.newNode) + + if item.additionalSleep > 0 { + time.Sleep(item.additionalSleep) + } + + verifyPodActions(t, item.description, fakeClientset, item.expectPatch, item.expectDelete) 
+ }) + } +} + +func TestUpdateNodeWithMultipleTaints(t *testing.T) { + taint1 := createNoExecuteTaint(1) + taint2 := createNoExecuteTaint(2) + + minute := int64(60) + pod := NewPod("pod1", "node1") + pod.Spec.Tolerations = []v1.Toleration{ + {Key: taint1.Key, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoExecute}, + {Key: taint2.Key, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoExecute, TolerationSeconds: &minute}, + } + podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} + + untaintedNode := NewNode("node1") + + doubleTaintedNode := NewNode("node1") + doubleTaintedNode.Spec.Taints = []v1.Taint{taint1, taint2} + + singleTaintedNode := NewNode("node1") + singleTaintedNode.Spec.Taints = []v1.Taint{taint1} + + ctx, cancel := context.WithCancel(context.TODO()) + //fakeClientset := fake.NewSimpleClientset(pod) + //controller, _, nodeIndexer := setupNewNoExecuteTaintManager(ctx, fakeClientset) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(pod).WithObjects(untaintedNode).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + go controller.Run(ctx) + + // no taint + //nodeIndexer.Add(untaintedNode) + controller.handleNodeUpdate(ctx, nodeUpdateItem{"node1"}) + // verify pod is not queued for deletion + if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) != nil { + t.Fatalf("pod queued for deletion with no taints") + } + + // no taint -> infinitely tolerated taint + //nodeIndexer.Update(singleTaintedNode) + nodeCopy := singleTaintedNode.DeepCopy() + fakeClientset.Update(ctx, nodeCopy, &client.UpdateOptions{}) + controller.handleNodeUpdate(ctx, nodeUpdateItem{"node1"}) + // verify pod is not queued for 
deletion + if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) != nil { + t.Fatalf("pod queued for deletion with permanently tolerated taint") + } + + // infinitely tolerated taint -> temporarily tolerated taint + //nodeIndexer.Update(doubleTaintedNode) + fakeClientset.Update(ctx, doubleTaintedNode, &client.UpdateOptions{}) + controller.handleNodeUpdate(ctx, nodeUpdateItem{"node1"}) + // verify pod is queued for deletion + if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) == nil { + t.Fatalf("pod not queued for deletion after addition of temporarily tolerated taint") + } + + // temporarily tolerated taint -> infinitely tolerated taint + //nodeIndexer.Update(singleTaintedNode) + if err := fakeClientset.Update(ctx, singleTaintedNode, &client.UpdateOptions{}); err != nil { + t.Errorf("failed to update singleTaintedNode, %v", err) + } + controller.handleNodeUpdate(ctx, nodeUpdateItem{"node1"}) + // verify pod is not queued for deletion + if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) != nil { + t.Fatalf("pod queued for deletion after removal of temporarily tolerated taint") + } + + // verify pod is not deleted + for _, action := range fakeClientset.Actions() { + if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { + t.Error("Unexpected deletion") + } + } + cancel() +} + +func TestUpdateNodeWithMultiplePods(t *testing.T) { + testCases := []struct { + description string + pods []v1.Pod + oldNode *v1.Node + newNode *v1.Node + expectedDeleteTimes durationSlice + }{ + { + description: "Pods with different toleration times are evicted appropriately", + pods: []v1.Pod{ + *NewPod("pod1", "node1"), + *addToleration(NewPod("pod2", "node1"), 1, 1), + *addToleration(NewPod("pod3", "node1"), 1, -1), + }, + oldNode: NewNode("node1"), + newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}), + expectedDeleteTimes: durationSlice{ + {[]string{"pod1"}, 0}, + 
{[]string{"pod2"}, time.Second}, + }, + }, + { + description: "Evict all pods not matching all taints instantly", + pods: []v1.Pod{ + *NewPod("pod1", "node1"), + *addToleration(NewPod("pod2", "node1"), 1, 1), + *addToleration(NewPod("pod3", "node1"), 1, -1), + }, + oldNode: NewNode("node1"), + newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1, 2}), + expectedDeleteTimes: durationSlice{ + {[]string{"pod1", "pod2", "pod3"}, 0}, + }, + }, + } + + for _, item := range testCases { + t.Run(item.description, func(t *testing.T) { + t.Logf("Starting testcase %q", item.description) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + //fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) + sort.Sort(item.expectedDeleteTimes) + //controller, _, nodeIndexer := setupNewNoExecuteTaintManager(ctx, fakeClientset) + //nodeIndexer.Add(item.newNode) + //controller.recorder = testutil.NewFakeRecorder(0) + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithLists(&v1.PodList{Items: item.pods}).WithObjects(item.newNode).Build() + fakeClientset := testutil.NewClientWrapper(fakeClient, scheme) + controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset)) + go controller.Run(ctx) + controller.NodeUpdated(item.oldNode, item.newNode) + + startedAt := time.Now() + for i := range item.expectedDeleteTimes { + if i == 0 || item.expectedDeleteTimes[i-1].timestamp != item.expectedDeleteTimes[i].timestamp { + // compute a grace duration to give controller time to process updates. Choose big + // enough intervals in the test cases above to avoid flakes. 
+ var increment time.Duration + if i == len(item.expectedDeleteTimes)-1 || item.expectedDeleteTimes[i+1].timestamp == item.expectedDeleteTimes[i].timestamp { + increment = 500 * time.Millisecond + } else { + increment = ((item.expectedDeleteTimes[i+1].timestamp - item.expectedDeleteTimes[i].timestamp) / time.Duration(2)) + } + + sleepTime := item.expectedDeleteTimes[i].timestamp - time.Since(startedAt) + increment + if sleepTime < 0 { + sleepTime = 0 + } + t.Logf("Sleeping for %v", sleepTime) + time.Sleep(sleepTime) + } + + for delay, podName := range item.expectedDeleteTimes[i].names { + deleted := false + for _, action := range fakeClientset.Actions() { + deleteAction, ok := action.(clienttesting.DeleteActionImpl) + if !ok { + t.Logf("Found not-delete action with verb %v. Ignoring.", action.GetVerb()) + continue + } + if deleteAction.GetResource().Resource != "pods" { + continue + } + if podName == deleteAction.GetName() { + deleted = true + } + } + if !deleted { + t.Errorf("Failed to deleted pod %v after %v", podName, delay) + } + } + for _, action := range fakeClientset.Actions() { + deleteAction, ok := action.(clienttesting.DeleteActionImpl) + if !ok { + t.Logf("Found not-delete action with verb %v. 
Ignoring.", action.GetVerb()) + continue + } + if deleteAction.GetResource().Resource != "pods" { + continue + } + deletedPodName := deleteAction.GetName() + expected := false + for _, podName := range item.expectedDeleteTimes[i].names { + if podName == deletedPodName { + expected = true + } + } + if !expected { + t.Errorf("Pod %v was deleted even though it shouldn't have", deletedPodName) + } + } + fakeClientset.ClearActions() + } + }) + } +} + +func TestGetMinTolerationTime(t *testing.T) { + one := int64(1) + two := int64(2) + oneSec := 1 * time.Second + + tests := []struct { + tolerations []v1.Toleration + expected time.Duration + }{ + { + tolerations: []v1.Toleration{}, + expected: 0, + }, + { + tolerations: []v1.Toleration{ + { + TolerationSeconds: nil, + }, + }, + expected: -1, + }, + { + tolerations: []v1.Toleration{ + { + TolerationSeconds: &one, + }, + { + TolerationSeconds: &two, + }, + }, + expected: oneSec, + }, + + { + tolerations: []v1.Toleration{ + { + TolerationSeconds: &one, + }, + { + TolerationSeconds: nil, + }, + }, + expected: oneSec, + }, + { + tolerations: []v1.Toleration{ + { + TolerationSeconds: nil, + }, + { + TolerationSeconds: &one, + }, + }, + expected: oneSec, + }, + } + + for _, test := range tests { + got := getMinTolerationTime(test.tolerations) + if got != test.expected { + t.Errorf("Incorrect min toleration time: got %v, expected %v", got, test.expected) + } + } +} + +// TestEventualConsistency verifies if getPodsAssignedToNode returns incomplete data +// (e.g. due to watch latency), it will reconcile the remaining pods eventually. +// This scenario is partially covered by TestUpdatePods, but given this is an important +// property of TaintManager, it's better to have explicit test for this. 
+func TestEventualConsistency(t *testing.T) {
+	// Each case seeds the fake client with pod1 on node1, then feeds the taint
+	// manager a node update (oldNode -> newNode) followed by a late pod update
+	// for pod2 (prevPod -> newPod; prevPod is nil when pod2 is newly created).
+	testCases := []struct {
+		description  string
+		pods         []v1.Pod
+		prevPod      *v1.Pod
+		newPod       *v1.Pod
+		oldNode      *v1.Node
+		newNode      *v1.Node
+		expectPatch  bool
+		expectDelete bool
+	}{
+		{
+			description: "existing pod2 scheduled onto tainted Node",
+			pods: []v1.Pod{
+				*NewPod("pod1", "node1"),
+			},
+			prevPod: NewPod("pod2", ""),
+			newPod:  NewPod("pod2", "node1"),
+			oldNode: NewNode("node1"),
+			newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}),
+			//expectPatch:  true,
+			expectDelete: true,
+		},
+		{
+			description: "existing pod2 with taint toleration scheduled onto tainted Node",
+			pods: []v1.Pod{
+				*NewPod("pod1", "node1"),
+			},
+			prevPod: addToleration(NewPod("pod2", ""), 1, 100),
+			newPod:  addToleration(NewPod("pod2", "node1"), 1, 100),
+			oldNode: NewNode("node1"),
+			newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}),
+			//expectPatch:  true,
+			expectDelete: true,
+		},
+		{
+			description: "new pod2 created on tainted Node",
+			pods: []v1.Pod{
+				*NewPod("pod1", "node1"),
+			},
+			prevPod: nil,
+			newPod:  NewPod("pod2", "node1"),
+			oldNode: NewNode("node1"),
+			newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}),
+			//expectPatch:  true,
+			expectDelete: true,
+		},
+		{
+			description: "new pod2 with tait toleration created on tainted Node",
+			pods: []v1.Pod{
+				*NewPod("pod1", "node1"),
+			},
+			prevPod: nil,
+			newPod:  addToleration(NewPod("pod2", "node1"), 1, 100),
+			oldNode: NewNode("node1"),
+			newNode: addTaintsToNode(NewNode("node1"), "testTaint1", "taint1", []int{1}),
+			//expectPatch:  true,
+			expectDelete: true,
+		},
+	}
+
+	for _, item := range testCases {
+		t.Run(item.description, func(t *testing.T) {
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+
+			//fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods})
+			//controller, podIndexer, nodeIndexer := setupNewNoExecuteTaintManager(ctx, fakeClientset)
+			//nodeIndexer.Add(item.newNode)
+			//controller.recorder = testutil.NewFakeRecorder(0)
+			scheme := runtime.NewScheme()
+			clientgoscheme.AddToScheme(scheme)
+			fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithLists(&v1.PodList{Items: item.pods}).WithObjects(item.newNode).Build()
+			fakeClientset := testutil.NewClientWrapper(fakeClient, scheme)
+			controller := NewNoExecuteTaintManager(testutil.NewFakeRecorder(), fakeClientset, getPodsAssignedToNode(ctx, fakeClientset))
+			go controller.Run(ctx)
+
+			if item.prevPod != nil {
+				//podIndexer.Add(item.prevPod)
+				// NOTE(review): the error returned by Update is discarded; the fake
+				// client is only seeded with item.pods, so confirm this call succeeds.
+				fakeClientset.Update(ctx, item.prevPod, &client.UpdateOptions{})
+				controller.PodUpdated(nil, item.prevPod)
+			}
+
+			// First we simulate NodeUpdate that should delete 'pod1'. It doesn't know about 'pod2' yet.
+			controller.NodeUpdated(item.oldNode, item.newNode)
+
+			verifyPodActions(t, item.description, fakeClientset, item.expectPatch, item.expectDelete)
+			fakeClientset.ClearActions()
+
+			// And now the delayed update of 'pod2' comes to the TaintManager. We should delete it as well.
+			//podIndexer.Update(item.newPod)
+			// NOTE(review): the error returned by Update is discarded here as well.
+			fakeClientset.Update(ctx, item.newPod, &client.UpdateOptions{})
+			controller.PodUpdated(item.prevPod, item.newPod)
+			// wait a bit
+			time.Sleep(timeForControllerToProgressForSanityCheck)
+		})
+	}
+}
+
+// verifyPodActions polls the actions recorded by fakeClientset (every 10ms, for
+// up to 5s) until the presence of a "patch" and of a "delete" action on "pods"
+// matches expectPatch and expectDelete, and reports a test error otherwise.
+// description is only used to label the failure messages.
+func verifyPodActions(t *testing.T, description string, fakeClientset *testutil.ClientWrapper, expectPatch, expectDelete bool) {
+	t.Helper()
+	podPatched := false
+	podDeleted := false
+	// use Poll instead of PollImmediate to give some processing time to the controller that the expected
+	// actions are likely to be already sent
+	err := wait.Poll(10*time.Millisecond, 5*time.Second, func() (bool, error) {
+		for _, action := range fakeClientset.Actions() {
+			if action.GetVerb() == "patch" && action.GetResource().Resource == "pods" {
+				podPatched = true
+			}
+			if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" {
+				podDeleted = true
+			}
+		}
+		return podPatched == expectPatch && podDeleted == expectDelete, nil
+	})
+	if err != nil {
+		t.Errorf("Failed waiting for the expected actions: %q", err)
+	}
+	if podPatched != expectPatch {
+		t.Errorf("[%v]Unexpected test result. Expected patch %v, got %v", description, expectPatch, podPatched)
+	}
+	if podDeleted != expectDelete {
+		t.Errorf("[%v]Unexpected test result. Expected delete %v, got %v", description, expectDelete, podDeleted)
+	}
+}
+
+// TestPodDeletionEvent Verify that the output events are as expected
+func TestPodDeletionEvent(t *testing.T) {
+	// f selects the event fields that are excluded from the comparison.
+	f := func(path cmp.Path) bool {
+		switch path.String() {
+		// These fields change at runtime, so ignore it
+		case "LastTimestamp", "FirstTimestamp", "ObjectMeta.Name":
+			return true
+		}
+		return false
+	}
+
+	t.Run("emitPodDeletionEvent", func(t *testing.T) {
+		controller := &NoExecuteTaintManager{}
+		recorder := testutil.NewFakeRecorder()
+		controller.recorder = recorder
+		controller.emitPodDeletionEvent(types.NamespacedName{
+			Name:      "test",
+			Namespace: "test",
+		})
+		want := []*v1.Event{
+			{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: "test",
+				},
+				InvolvedObject: v1.ObjectReference{
+					Kind:       "Pod",
+					APIVersion: "v1",
+					Namespace:  "test",
+					Name:       "test",
+				},
+				Reason:  "TaintManagerEviction",
+				Type:    "Normal",
+				Count:   1,
+				Message: "Marking for deletion Pod test/test",
+				Source:  v1.EventSource{Component: "nodeControllerTest"},
+			},
+		}
+		if diff := cmp.Diff(want, recorder.Events, cmp.FilterPath(f, cmp.Ignore())); len(diff) > 0 {
+			t.Errorf("emitPodDeletionEvent() returned data (-want,+got):\n%s", diff)
+		}
+	})
+
+	t.Run("emitCancelPodDeletionEvent", func(t *testing.T) {
+		controller := &NoExecuteTaintManager{}
+		recorder := testutil.NewFakeRecorder()
+		controller.recorder = recorder
+		controller.emitCancelPodDeletionEvent(types.NamespacedName{
+			Name:      "test",
+			Namespace: "test",
+		})
+		want := []*v1.Event{
+			{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: "test",
+				},
+				InvolvedObject: v1.ObjectReference{
+					Kind:       "Pod",
+					APIVersion: "v1",
+					Namespace:  "test",
+					Name:       "test",
+				},
+				Reason:  "TaintManagerEviction",
+				Type:    "Normal",
+				Count:   1,
+				Message: "Cancelling deletion of Pod test/test",
+				Source:  v1.EventSource{Component: "nodeControllerTest"},
+			},
+		}
+		if diff := cmp.Diff(want, recorder.Events, cmp.FilterPath(f, cmp.Ignore())); len(diff) > 0 {
+			// Name the function actually under test so a failure is attributed correctly.
+			t.Errorf("emitCancelPodDeletionEvent() returned data (-want,+got):\n%s", diff)
+		}
+	})
+}
+
+// NewNode is a helper function for creating Nodes for testing.
+func NewNode(name string) *v1.Node {
+	return &v1.Node{
+		ObjectMeta: metav1.ObjectMeta{Name: name},
+		Status: v1.NodeStatus{
+			Capacity: v1.ResourceList{
+				v1.ResourceName(v1.ResourceCPU):    resource.MustParse("10"),
+				v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
+			},
+		},
+	}
+}
+
+// NewPod is a helper function for creating Pods for testing.
+func NewPod(name, host string) *v1.Pod {
+	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: "default",
+			Name:      name,
+		},
+		Spec: v1.PodSpec{
+			NodeName: host,
+		},
+		Status: v1.PodStatus{
+			Conditions: []v1.PodCondition{
+				{
+					Type:   v1.PodReady,
+					Status: v1.ConditionTrue,
+				},
+			},
+		},
+	}
+
+	return pod
+}
diff --git a/pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers.go b/pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers.go
new file mode 100644
index 00000000000..3fd50beb111
--- /dev/null
+++ b/pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers.go
@@ -0,0 +1,154 @@
+/*
+Copyright 2015 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduler
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/klog/v2"
+	"k8s.io/utils/clock"
+)
+
+// WorkArgs keeps arguments that will be passed to the function executed by the worker.
+type WorkArgs struct {
+	NamespacedName types.NamespacedName
+}
+
+// KeyFromWorkArgs returns the string form of the wrapped NamespacedName.
+func (w *WorkArgs) KeyFromWorkArgs() string {
+	return w.NamespacedName.String()
+}
+
+// NewWorkArgs builds a `WorkArgs` from the given name and namespace.
+func NewWorkArgs(name, namespace string) *WorkArgs {
+	nn := types.NamespacedName{Namespace: namespace, Name: name}
+	return &WorkArgs{NamespacedName: nn}
+}
+
+// TimedWorker holds a piece of work together with the timer that will run it
+// once FireAt has been reached.
+type TimedWorker struct {
+	WorkItem  *WorkArgs
+	CreatedAt time.Time
+	FireAt    time.Time
+	Timer     clock.Timer
+}
+
+// createWorker arranges for `f` to be called with `args` no earlier than `fireAt`.
+// When `fireAt` is not after `createdAt`, `f` is started right away on its own
+// goroutine and nil is returned; otherwise a TimedWorker wrapping the pending
+// timer is returned. An error returned by `f` is only logged.
+func createWorker(ctx context.Context, args *WorkArgs, createdAt time.Time, fireAt time.Time, f func(ctx context.Context, args *WorkArgs) error, clock clock.WithDelayedExecution) *TimedWorker {
+	run := func() {
+		if err := f(ctx, args); err != nil {
+			klog.Errorf("NodeLifecycle: timed worker failed, %v", err)
+		}
+	}
+
+	wait := fireAt.Sub(createdAt)
+	if wait > 0 {
+		return &TimedWorker{
+			WorkItem:  args,
+			CreatedAt: createdAt,
+			FireAt:    fireAt,
+			Timer:     clock.AfterFunc(wait, run),
+		}
+	}
+
+	// Nothing to wait for: fire immediately, there is no timer to hand back.
+	go run()
+	return nil
+}
+
+// Cancel stops the timer of the `TimedWorker`; calling it on a nil worker is a no-op.
+func (w *TimedWorker) Cancel() {
+	if w == nil {
+		return
+	}
+	w.Timer.Stop()
+}
+
+// TimedWorkerQueue tracks the TimedWorkers that have been added, keyed by work item.
+type TimedWorkerQueue struct {
+	sync.Mutex
+	// workers is keyed by the string that 'KeyFromWorkArgs' returns for the worker's WorkArgs.
+	workers  map[string]*TimedWorker
+	workFunc func(ctx context.Context, args *WorkArgs) error
+	clock    clock.WithDelayedExecution
+}
+
+// CreateWorkerQueue creates a new TimedWorkerQueue for workers that will execute
+// given function `f`.
+func CreateWorkerQueue(f func(ctx context.Context, args *WorkArgs) error) *TimedWorkerQueue {
+	return &TimedWorkerQueue{
+		workers:  make(map[string]*TimedWorker),
+		workFunc: f,
+		clock:    clock.RealClock{},
+	}
+}
+
+// getWrappedWorkerFunc returns the function a worker for `key` actually runs: it
+// calls the queue's workFunc and then, under the queue lock, updates the entry
+// for `key` according to the outcome. The workFunc error is returned unchanged.
+func (q *TimedWorkerQueue) getWrappedWorkerFunc(key string) func(ctx context.Context, args *WorkArgs) error {
+	return func(ctx context.Context, args *WorkArgs) error {
+		err := q.workFunc(ctx, args)
+		q.Lock()
+		defer q.Unlock()
+		if err == nil {
+			// To avoid duplicated calls we keep the key in the queue, to prevent
+			// subsequent additions.
+			q.workers[key] = nil
+		} else {
+			// The work failed: forget the key so that it can be added again.
+			delete(q.workers, key)
+		}
+		return err
+	}
+}
+
+// AddWork adds a work to the WorkerQueue which will be executed not earlier than `fireAt`.
+// If an entry for the same key already exists in the queue the call is a no-op.
+func (q *TimedWorkerQueue) AddWork(ctx context.Context, args *WorkArgs, createdAt time.Time, fireAt time.Time) {
+	key := args.KeyFromWorkArgs()
+	klog.V(4).Infof("Adding TimedWorkerQueue item=%s at createTime=%v and to be fired at firedTime=%v", key, createdAt, fireAt)
+
+	q.Lock()
+	defer q.Unlock()
+	if _, exists := q.workers[key]; exists {
+		// Infof (not Info) is required for the %v verb to be expanded.
+		klog.Infof("Trying to add already existing work(%v), skipping", args)
+		return
+	}
+	// createWorker returns nil when the work fires immediately; the nil entry
+	// still marks the key as present.
+	worker := createWorker(ctx, args, createdAt, fireAt, q.getWrappedWorkerFunc(key), q.clock)
+	q.workers[key] = worker
+}
+
+// CancelWork removes scheduled function execution from the queue. Returns true if work was cancelled.
+// An entry whose worker is nil is removed as well, but false is returned for it.
+func (q *TimedWorkerQueue) CancelWork(key string) bool {
+	q.Lock()
+	defer q.Unlock()
+	worker, found := q.workers[key]
+	result := false
+	if found {
+		klog.V(4).Infof("Cancelling TimedWorkerQueue item=%s, time=%v", key, time.Now())
+		if worker != nil {
+			result = true
+			worker.Cancel()
+		}
+		delete(q.workers, key)
+	}
+	return result
+}
+
+// GetWorkerUnsafe returns a TimedWorker corresponding to the given key.
+// Unsafe method - workers have attached goroutines which can fire after this function is called.
+func (q *TimedWorkerQueue) GetWorkerUnsafe(key string) *TimedWorker {
+	q.Lock()
+	defer q.Unlock()
+	// A missing key and a key stored with a nil worker both yield nil here.
+	return q.workers[key]
+}
diff --git a/pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers_test.go b/pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers_test.go
new file mode 100644
index 00000000000..63371757429
--- /dev/null
+++ b/pkg/yurtmanager/controller/nodelifecycle/scheduler/timed_workers_test.go
@@ -0,0 +1,153 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduler
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	testingclock "k8s.io/utils/clock/testing"
+)
+
+// TestExecute adds five work items whose fireAt equals createdAt, adds the same
+// five keys a second time, and expects the work function to have run exactly
+// five times.
+func TestExecute(t *testing.T) {
+	testVal := int32(0)
+	wg := sync.WaitGroup{}
+	wg.Add(5)
+	queue := CreateWorkerQueue(func(ctx context.Context, args *WorkArgs) error {
+		atomic.AddInt32(&testVal, 1)
+		wg.Done()
+		return nil
+	})
+	now := time.Now()
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, now)
+	// Adding the same thing second time should be no-op
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, now)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, now)
+	wg.Wait()
+	lastVal := atomic.LoadInt32(&testVal)
+	if lastVal != 5 {
+		t.Errorf("Expected testVal = 5, got %v", lastVal)
+	}
+}
+
+// TestExecuteDelayed schedules five work items 10 seconds ahead on a fake clock,
+// adds the same keys again, steps the clock by 11 seconds and expects exactly
+// five executions.
+func TestExecuteDelayed(t *testing.T) {
+	testVal := int32(0)
+	wg := sync.WaitGroup{}
+	wg.Add(5)
+	queue := CreateWorkerQueue(func(ctx context.Context, args *WorkArgs) error {
+		atomic.AddInt32(&testVal, 1)
+		wg.Done()
+		return nil
+	})
+	now := time.Now()
+	then := now.Add(10 * time.Second)
+	fakeClock := testingclock.NewFakeClock(now)
+	// Swap in the fake clock so the timers only fire when the test steps it.
+	queue.clock = fakeClock
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then)
+	fakeClock.Step(11 * time.Second)
+	wg.Wait()
+	lastVal := atomic.LoadInt32(&testVal)
+	if lastVal != 5 {
+		t.Errorf("Expected testVal = 5, got %v", lastVal)
+	}
+}
+
+// TestCancel schedules five delayed work items, cancels the ones keyed "2" and
+// "4" before stepping the fake clock, and expects exactly three executions.
+func TestCancel(t *testing.T) {
+	testVal := int32(0)
+	wg := sync.WaitGroup{}
+	wg.Add(3)
+	queue := CreateWorkerQueue(func(ctx context.Context, args *WorkArgs) error {
+		atomic.AddInt32(&testVal, 1)
+		wg.Done()
+		return nil
+	})
+	now := time.Now()
+	then := now.Add(10 * time.Second)
+	fakeClock := testingclock.NewFakeClock(now)
+	queue.clock = fakeClock
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then)
+	queue.CancelWork(NewWorkArgs("2", "2").KeyFromWorkArgs())
+	queue.CancelWork(NewWorkArgs("4", "4").KeyFromWorkArgs())
+	fakeClock.Step(11 * time.Second)
+	wg.Wait()
+	lastVal := atomic.LoadInt32(&testVal)
+	if lastVal != 3 {
+		t.Errorf("Expected testVal = 3, got %v", lastVal)
+	}
+}
+
+// TestCancelAndReadd cancels the work items keyed "2" and "4", adds "2" back
+// before stepping the fake clock, and expects exactly four executions.
+func TestCancelAndReadd(t *testing.T) {
+	testVal := int32(0)
+	wg := sync.WaitGroup{}
+	wg.Add(4)
+	queue := CreateWorkerQueue(func(ctx context.Context, args *WorkArgs) error {
+		atomic.AddInt32(&testVal, 1)
+		wg.Done()
+		return nil
+	})
+	now := time.Now()
+	then := now.Add(10 * time.Second)
+	fakeClock := testingclock.NewFakeClock(now)
+	queue.clock = fakeClock
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("1", "1"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then)
+	queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then)
+	queue.CancelWork(NewWorkArgs("2", "2").KeyFromWorkArgs())
+	queue.CancelWork(NewWorkArgs("4", "4").KeyFromWorkArgs())
+	queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then)
+	fakeClock.Step(11 * time.Second)
+	wg.Wait()
+	lastVal := atomic.LoadInt32(&testVal)
+	if lastVal != 4 {
+		t.Errorf("Expected testVal = 4, got %v", lastVal)
+	}
+}
diff --git a/pkg/yurtmanager/controller/nodepool/nodepool_controller.go b/pkg/yurtmanager/controller/nodepool/nodepool_controller.go
index 913fb2e9ba3..9ea4b1565a9 100644
--- a/pkg/yurtmanager/controller/nodepool/nodepool_controller.go
+++ b/pkg/yurtmanager/controller/nodepool/nodepool_controller.go
@@ -71,7 +71,7 @@ var _ reconcile.Reconciler = &ReconcileNodePool{}
 
 // Add creates a new NodePool Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
 // and Start it when the Manager is Started.
-func Add(c *config.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *config.CompletedConfig, mgr manager.Manager) error { klog.Infof("nodepool-controller add controller %s", controllerResource.String()) r := &ReconcileNodePool{ cfg: c.ComponentConfig.NodePoolController, diff --git a/pkg/yurtmanager/controller/platformadmin/platformadmin_controller.go b/pkg/yurtmanager/controller/platformadmin/platformadmin_controller.go index a2e8b932a8d..1004794bcc0 100644 --- a/pkg/yurtmanager/controller/platformadmin/platformadmin_controller.go +++ b/pkg/yurtmanager/controller/platformadmin/platformadmin_controller.go @@ -119,7 +119,7 @@ var _ reconcile.Reconciler = &ReconcilePlatformAdmin{} // Add creates a new PlatformAdmin Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { if _, err := mgr.GetRESTMapper().KindFor(controllerResource); err != nil { klog.Infof("resource %s doesn't exist", controllerResource.String()) return err diff --git a/pkg/yurtmanager/controller/raven/dns/dns_controller.go b/pkg/yurtmanager/controller/raven/dns/dns_controller.go index 17eb950e7c7..38347ba643c 100644 --- a/pkg/yurtmanager/controller/raven/dns/dns_controller.go +++ b/pkg/yurtmanager/controller/raven/dns/dns_controller.go @@ -52,7 +52,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new Ravendns Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { return add(mgr, newReconciler(mgr)) } diff --git a/pkg/yurtmanager/controller/raven/gatewayinternalservice/gateway_internal_service_controller.go b/pkg/yurtmanager/controller/raven/gatewayinternalservice/gateway_internal_service_controller.go index 15277e96218..112713c41c0 100644 --- a/pkg/yurtmanager/controller/raven/gatewayinternalservice/gateway_internal_service_controller.go +++ b/pkg/yurtmanager/controller/raven/gatewayinternalservice/gateway_internal_service_controller.go @@ -61,7 +61,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new Service Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { return add(mgr, newReconciler(c, mgr)) } diff --git a/pkg/yurtmanager/controller/raven/gatewaypickup/gateway_pickup_controller.go b/pkg/yurtmanager/controller/raven/gatewaypickup/gateway_pickup_controller.go index 055841e6297..49239234832 100644 --- a/pkg/yurtmanager/controller/raven/gatewaypickup/gateway_pickup_controller.go +++ b/pkg/yurtmanager/controller/raven/gatewaypickup/gateway_pickup_controller.go @@ -67,7 +67,7 @@ const ( // Add creates a new Gateway Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { if _, err := mgr.GetRESTMapper().KindFor(controllerResource); err != nil { klog.Infof("resource %s doesn't exist", controllerResource.String()) return err diff --git a/pkg/yurtmanager/controller/raven/gatewaypublicservice/gateway_public_service_controller.go b/pkg/yurtmanager/controller/raven/gatewaypublicservice/gateway_public_service_controller.go index ad43608c58a..9d6e42e34d9 100644 --- a/pkg/yurtmanager/controller/raven/gatewaypublicservice/gateway_public_service_controller.go +++ b/pkg/yurtmanager/controller/raven/gatewaypublicservice/gateway_public_service_controller.go @@ -59,7 +59,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new Service Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { return add(mgr, newReconciler(mgr)) } diff --git a/pkg/yurtmanager/controller/servicetopology/endpoints/endpoints_controller.go b/pkg/yurtmanager/controller/servicetopology/endpoints/endpoints_controller.go index bb0d3407016..acb0a208524 100644 --- a/pkg/yurtmanager/controller/servicetopology/endpoints/endpoints_controller.go +++ b/pkg/yurtmanager/controller/servicetopology/endpoints/endpoints_controller.go @@ -51,7 +51,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new Servicetopology endpoints Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { klog.Infof("servicetopology-endpoints-controller add controller %s", controllerKind.String()) return add(mgr, newReconciler(c, mgr)) } diff --git a/pkg/yurtmanager/controller/servicetopology/endpointslice/endpointslice_controller.go b/pkg/yurtmanager/controller/servicetopology/endpointslice/endpointslice_controller.go index f5541322571..a680cc5d0cf 100644 --- a/pkg/yurtmanager/controller/servicetopology/endpointslice/endpointslice_controller.go +++ b/pkg/yurtmanager/controller/servicetopology/endpointslice/endpointslice_controller.go @@ -54,7 +54,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new Servicetopology endpointslice Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(_ *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, _ *appconfig.CompletedConfig, mgr manager.Manager) error { r := &ReconcileServiceTopologyEndpointSlice{} c, err := controller.New(names.ServiceTopologyEndpointSliceController, mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: concurrentReconciles}) if err != nil { diff --git a/pkg/yurtmanager/controller/testutil/test_utils.go b/pkg/yurtmanager/controller/testutil/test_utils.go new file mode 100644 index 00000000000..8b64b0b4906 --- /dev/null +++ b/pkg/yurtmanager/controller/testutil/test_utils.go @@ -0,0 +1,869 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package testutil
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	coordv1 "k8s.io/api/coordination/v1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/meta"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apimachinery/pkg/watch"
+	v1apply "k8s.io/client-go/applyconfigurations/core/v1"
+	"k8s.io/client-go/kubernetes/fake"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
+	clienttesting "k8s.io/client-go/testing"
+	ref "k8s.io/client-go/tools/reference"
+	utilnode "k8s.io/component-helpers/node/topology"
+	"k8s.io/klog/v2"
+	"k8s.io/utils/clock"
+	testingclock "k8s.io/utils/clock/testing"
+	ctlclient "sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
+	fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+// ImprovedFakeNodeHandler bundles a ClientWrapper around a controller-runtime
+// fake client, a FakeNodeHandler backed by that same fake client, and a
+// client-go fake Clientset. Nodes reported through SyncNode are collected in
+// UpdatedNodes and UpdatedNodeStatuses.
+type ImprovedFakeNodeHandler struct {
+	sync.Mutex
+	*ClientWrapper
+	// baseClient is the unwrapped fake client; calls made through it are not
+	// recorded as actions by the ClientWrapper.
+	baseClient          ctlclient.Client
+	DelegateNodeHandler *FakeNodeHandler
+	*fake.Clientset
+	UpdatedNodes        []*v1.Node
+	UpdatedNodeStatuses []*v1.Node
+}
+
+// NewImprovedFakeNodeHandler builds an ImprovedFakeNodeHandler whose fake client
+// is seeded with the given nodes and, when pods is non-nil, with that pod list.
+// The same objects are also added to the tracker of the embedded fake Clientset.
+// SyncNode is installed as the NodeUpdateReactor of both the ClientWrapper and
+// the DelegateNodeHandler.
+func NewImprovedFakeNodeHandler(nodes []*v1.Node, pods *v1.PodList) *ImprovedFakeNodeHandler {
+	scheme := runtime.NewScheme()
+	clientgoscheme.AddToScheme(scheme)
+	clientBuilder := fakeclient.NewClientBuilder().WithScheme(scheme)
+	for i := range nodes {
+		clientBuilder.WithObjects(nodes[i])
+	}
+	if pods != nil {
+		clientBuilder.WithLists(pods)
+	}
+	delegateClient := clientBuilder.Build()
+	clientWrapper := NewClientWrapper(delegateClient, scheme)
+	m := &ImprovedFakeNodeHandler{
+		ClientWrapper: clientWrapper,
+		baseClient:    delegateClient,
+		DelegateNodeHandler: &FakeNodeHandler{
+			runtimeClient: delegateClient,
+		},
+		Clientset:           fake.NewSimpleClientset(),
+		UpdatedNodes:        make([]*v1.Node, 0),
+		UpdatedNodeStatuses: make([]*v1.Node, 0),
+	}
+	m.ClientWrapper.NodeUpdateReactor = m.SyncNode
+	m.DelegateNodeHandler.NodeUpdateReactor = m.SyncNode
+
+	// NOTE(review): errors returned by Tracker().Add are discarded below.
+	for i := range nodes {
+		m.Clientset.Tracker().Add(nodes[i])
+	}
+	if pods != nil {
+		m.Clientset.Tracker().Add(pods)
+	}
+
+	return m
+}
+
+// UpdateLease replaces the given lease in the base client by deleting it and
+// creating it again. A nil lease is a no-op. The result of the delete is
+// ignored; only a failure of the create is returned.
+func (m *ImprovedFakeNodeHandler) UpdateLease(lease *coordv1.Lease) error {
+	if lease == nil {
+		return nil
+	}
+	m.baseClient.Delete(context.TODO(), lease)
+	if err := m.baseClient.Create(context.TODO(), lease); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// UpdateNodeStatuses lists all nodes from the base client and, for every node
+// whose name is a key of updatedNodeStatuses, writes the mapped status through
+// the status writer of the base client. The first error encountered is returned.
+func (m *ImprovedFakeNodeHandler) UpdateNodeStatuses(updatedNodeStatuses map[string]v1.NodeStatus) error {
+	nodeList := new(v1.NodeList)
+	err := m.baseClient.List(context.TODO(), nodeList, &ctlclient.ListOptions{})
+	if err != nil {
+		return err
+	}
+
+	for _, node := range nodeList.Items {
+		if status, ok := updatedNodeStatuses[node.Name]; ok {
+			node.Status = status
+			if err = m.baseClient.Status().Update(context.TODO(), &node, &ctlclient.UpdateOptions{}); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// UpdateNodes stores each given node in the base client: the node is created
+// when fetching it by name fails, and updated otherwise. The first error from
+// a create or update is returned.
+func (m *ImprovedFakeNodeHandler) UpdateNodes(updatedNodes []*v1.Node) error {
+	for i := range updatedNodes {
+		oldNode := new(v1.Node)
+		if err := m.baseClient.Get(context.TODO(), ctlclient.ObjectKey{Name: updatedNodes[i].Name}, oldNode); err != nil {
+			// Any Get error is treated as "node does not exist yet".
+			if err = m.baseClient.Create(context.TODO(), updatedNodes[i]); err != nil {
+				return err
+			}
+		} else {
+			if err = m.baseClient.Update(context.TODO(), updatedNodes[i], &ctlclient.UpdateOptions{}); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// SyncNode records node in UpdatedNodes, replacing an existing entry with the
+// same name or appending a new one, and additionally appends it to
+// UpdatedNodeStatuses when syncStatus is true. The node's ResourceVersion is
+// cleared before it is stored.
+func (m *ImprovedFakeNodeHandler) SyncNode(node *v1.Node, syncStatus bool) {
+	m.Lock()
+	defer m.Unlock()
+	found := false
+	node.ResourceVersion = ""
+	for i := range m.UpdatedNodes {
+		if m.UpdatedNodes[i].Name == node.Name {
+			m.UpdatedNodes[i] = node
+			found = true
+			break
+		}
+	}
+
+	if !found {
+		m.UpdatedNodes = append(m.UpdatedNodes, node)
+	}
+
+	if syncStatus {
+		m.UpdatedNodeStatuses = append(m.UpdatedNodeStatuses, node)
+	}
+}
+
+// FakeLegacyHandler is a fake implementation of CoreV1Interface.
+type FakeLegacyHandler struct {
+	v1core.CoreV1Interface
+	n *FakeNodeHandler
+}
+
+// Core returns fake CoreInterface.
+func (m *ImprovedFakeNodeHandler) Core() v1core.CoreV1Interface {
+	return &FakeLegacyHandler{m.Clientset.CoreV1(), m.DelegateNodeHandler}
+}
+
+// CoreV1 returns fake CoreV1Interface
+func (m *ImprovedFakeNodeHandler) CoreV1() v1core.CoreV1Interface {
+	return &FakeLegacyHandler{m.Clientset.CoreV1(), m.DelegateNodeHandler}
+}
+
+// Nodes return fake NodeInterfaces.
+func (m *FakeLegacyHandler) Nodes() v1core.NodeInterface {
+	return m.n
+}
+
+// ClientWrapper wraps a controller-runtime client for tests. It records the
+// calls made through it as client-go testing actions, counts requests whose
+// resource name contains "node" in RequestCount, and invokes the optional
+// reactors. The embedded mutex guards actions and RequestCount.
+type ClientWrapper struct {
+	sync.RWMutex
+	RequestCount      int
+	delegateClient    ctlclient.Client
+	scheme            *runtime.Scheme
+	actions           []clienttesting.Action
+	PodUpdateReactor  func() error
+	NodeUpdateReactor func(node *v1.Node, syncStatus bool)
+}
+
+// NewClientWrapper returns a ClientWrapper that delegates to client and uses
+// scheme to resolve the GroupVersionKind of the objects passed to it.
+func NewClientWrapper(client ctlclient.Client, scheme *runtime.Scheme) *ClientWrapper {
+	return &ClientWrapper{
+		delegateClient: client,
+		scheme:         scheme,
+		actions:        make([]clienttesting.Action, 0),
+	}
+}
+
+// Get records a "get" action for obj and delegates to the wrapped client.
+func (m *ClientWrapper) Get(ctx context.Context, key ctlclient.ObjectKey, obj ctlclient.Object) error {
+	m.Lock()
+	defer m.Unlock()
+	gvk, err := apiutil.GVKForObject(obj, m.scheme)
+	if err != nil {
+		klog.Infof("failed to get gvk for obj %v, %v", obj, err)
+		return err
+	}
+
+	gvr, _ := meta.UnsafeGuessKindToResource(gvk)
+	m.actions = append(m.actions, clienttesting.NewGetAction(gvr, key.Namespace, key.Name))
+	defer func() {
+		if strings.Contains(gvr.Resource, "node") {
+			m.RequestCount++
+		}
+	}()
+	return m.delegateClient.Get(ctx, key, obj)
+}
+
+// List records a "list" action for the item kind of list and delegates to the
+// wrapped client.
+func (m *ClientWrapper) List(ctx context.Context, list ctlclient.ObjectList, opts ...ctlclient.ListOption) error {
+	m.Lock()
+	defer m.Unlock()
+	gvk, err := apiutil.GVKForObject(list, m.scheme)
+	if err != nil {
+		return err
+	}
+	// List kinds are spelled "<Kind>List"; strip the suffix so the recorded
+	// action refers to the item resource (e.g. "pods") rather than "podlists".
+	gvk.Kind = strings.TrimSuffix(gvk.Kind, "List")
+
+	gvr, _ := meta.UnsafeGuessKindToResource(gvk)
+	m.actions = append(m.actions, clienttesting.NewListAction(gvr, gvk, "", metav1.ListOptions{}))
+	defer func() {
+		if strings.Contains(gvr.Resource, "node") {
+			m.RequestCount++
+		}
+	}()
+	return m.delegateClient.List(ctx, list, opts...)
+}
+
+// Create delegates to the wrapped client; no action is recorded for it.
+func (m *ClientWrapper) Create(ctx context.Context, obj ctlclient.Object, opts ...ctlclient.CreateOption) error {
+	m.Lock()
+	defer m.Unlock()
+	return m.delegateClient.Create(ctx, obj, opts...)
+}
+
+// Update records an "update" action for obj and delegates to the wrapped
+// client. After a successful update of a node resource the stored node is read
+// back and passed to NodeUpdateReactor (with syncStatus=false) when one is set.
+func (m *ClientWrapper) Update(ctx context.Context, obj ctlclient.Object, opts ...ctlclient.UpdateOption) error {
+	m.Lock()
+	defer m.Unlock()
+	gvk, err := apiutil.GVKForObject(obj, m.scheme)
+	if err != nil {
+		return err
+	}
+
+	gvr, _ := meta.UnsafeGuessKindToResource(gvk)
+	m.actions = append(m.actions, clienttesting.NewUpdateAction(gvr, obj.GetNamespace(), obj))
+	defer func() {
+		if strings.Contains(gvr.Resource, "node") {
+			m.RequestCount++
+		}
+	}()
+
+	if err := m.delegateClient.Update(ctx, obj, opts...); err != nil {
+		return err
+	}
+
+	if strings.Contains(gvr.Resource, "node") && m.NodeUpdateReactor != nil {
+		newNode := new(v1.Node)
+		if err := m.delegateClient.Get(ctx, ctlclient.ObjectKey{Name: obj.GetName()}, newNode); err != nil {
+			return err
+		}
+		m.NodeUpdateReactor(newNode, false)
+	}
+	return nil
+}
+
+// Delete deletes the given obj from Kubernetes cluster.
+// A "delete" action is recorded before delegating to the wrapped client.
+func (m *ClientWrapper) Delete(ctx context.Context, obj ctlclient.Object, opts ...ctlclient.DeleteOption) error {
+	m.Lock()
+	defer m.Unlock()
+	gvk, err := apiutil.GVKForObject(obj, m.scheme)
+	if err != nil {
+		return err
+	}
+
+	gvr, _ := meta.UnsafeGuessKindToResource(gvk)
+	m.actions = append(m.actions, clienttesting.NewDeleteAction(gvr, obj.GetNamespace(), obj.GetName()))
+	defer func() {
+		if strings.Contains(gvr.Resource, "node") {
+			m.RequestCount++
+		}
+	}()
+	return m.delegateClient.Delete(ctx, obj, opts...)
+}
+
+// DeleteAllOf delegates straight to the wrapped client; it neither records an
+// action nor takes the wrapper's lock.
+func (m *ClientWrapper) DeleteAllOf(ctx context.Context, obj ctlclient.Object, opts ...ctlclient.DeleteAllOfOption) error {
+	return m.delegateClient.DeleteAllOf(ctx, obj, opts...)
+}
+
+// Patch patches the given obj in the Kubernetes cluster. obj must be a
+// struct pointer so that obj can be updated with the content returned by the Server.
+func (m *ClientWrapper) Patch(ctx context.Context, obj ctlclient.Object, patch ctlclient.Patch, opts ...ctlclient.PatchOption) error { + m.Lock() + defer m.Unlock() + gvk, err := apiutil.GVKForObject(obj, m.scheme) + if err != nil { + return err + } + + gvr, _ := meta.UnsafeGuessKindToResource(gvk) + data, _ := patch.Data(obj) + m.actions = append(m.actions, clienttesting.NewPatchAction(gvr, obj.GetNamespace(), obj.GetName(), patch.Type(), data)) + defer func() { + if strings.Contains(gvr.Resource, "node") { + m.RequestCount++ + } + }() + if err = m.delegateClient.Patch(ctx, obj, patch, opts...); err != nil { + return err + } + + if strings.Contains(gvr.Resource, "node") && m.NodeUpdateReactor != nil { + newNode := new(v1.Node) + if err := m.delegateClient.Get(ctx, ctlclient.ObjectKey{Name: obj.GetName()}, newNode); err != nil { + return err + } + m.NodeUpdateReactor(newNode, false) + } + return nil +} + +type fakeStatusWriter struct { + client *ClientWrapper + statusWriter ctlclient.StatusWriter +} + +func (sw *fakeStatusWriter) Update(ctx context.Context, obj ctlclient.Object, opts ...ctlclient.UpdateOption) error { + // TODO(droot): This results in full update of the obj (spec + status). Need + // a way to update status field only. 
+ gvk, err := apiutil.GVKForObject(obj, sw.client.scheme) + if err != nil { + return err + } + + gvr, _ := meta.UnsafeGuessKindToResource(gvk) + sw.client.Lock() + sw.client.actions = append(sw.client.actions, clienttesting.NewUpdateSubresourceAction(gvr, "status", obj.GetNamespace(), obj)) + sw.client.Unlock() + + defer func() { + if strings.Contains(gvr.Resource, "node") { + sw.client.RequestCount++ + } + }() + + if sw.client.PodUpdateReactor != nil { + if gvr.Resource == "pods" { + if err := sw.client.PodUpdateReactor(); err != nil { + return err + } + } + } + if err = sw.statusWriter.Update(ctx, obj, opts...); err != nil { + return err + } + + if strings.Contains(gvr.Resource, "node") && sw.client.NodeUpdateReactor != nil { + newNode := new(v1.Node) + if err := sw.client.delegateClient.Get(ctx, ctlclient.ObjectKey{Name: obj.GetName()}, newNode); err != nil { + return err + } + sw.client.NodeUpdateReactor(newNode, true) + } + return nil +} + +func (sw *fakeStatusWriter) Patch(ctx context.Context, obj ctlclient.Object, patch ctlclient.Patch, opts ...ctlclient.PatchOption) error { + // TODO(droot): This results in full update of the obj (spec + status). Need + // a way to update status field only. 
+ patchData, err := patch.Data(obj) + if err != nil { + return err + } + + gvk, err := apiutil.GVKForObject(obj, sw.client.scheme) + if err != nil { + return err + } + + gvr, _ := meta.UnsafeGuessKindToResource(gvk) + sw.client.Lock() + sw.client.actions = append(sw.client.actions, clienttesting.NewPatchSubresourceAction(gvr, obj.GetNamespace(), obj.GetName(), patch.Type(), patchData, "status")) + sw.client.Unlock() + + defer func() { + if strings.Contains(gvr.Resource, "node") { + sw.client.RequestCount++ + } + }() + + if err = sw.statusWriter.Patch(ctx, obj, patch, opts...); err != nil { + return err + } + + if strings.Contains(gvr.Resource, "node") && sw.client.NodeUpdateReactor != nil { + newNode := new(v1.Node) + if err := sw.client.delegateClient.Get(ctx, ctlclient.ObjectKey{Name: obj.GetName()}, newNode); err != nil { + return err + } + sw.client.NodeUpdateReactor(newNode, false) + } + return nil +} + +func (m *ClientWrapper) Status() ctlclient.StatusWriter { + return &fakeStatusWriter{ + client: m, + statusWriter: m.delegateClient.Status(), + } +} + +func (m *ClientWrapper) Scheme() *runtime.Scheme { + return m.delegateClient.Scheme() +} + +// RESTMapper returns the rest this client is using. +func (m *ClientWrapper) RESTMapper() meta.RESTMapper { + return m.delegateClient.RESTMapper() +} + +func (m *ClientWrapper) ClearActions() { + m.Lock() + defer m.Unlock() + + m.actions = make([]clienttesting.Action, 0) +} + +func (m *ClientWrapper) Actions() []clienttesting.Action { + m.RLock() + defer m.RUnlock() + fa := make([]clienttesting.Action, len(m.actions)) + copy(fa, m.actions) + return fa +} + +// FakeNodeHandler is a fake implementation of NodesInterface and NodeInterface. It +// allows test cases to have fine-grained control over mock behaviors. We also need +// PodsInterface and PodInterface to test list & delete pods, which is implemented in +// the embedded client.Fake field. 
+type FakeNodeHandler struct { + RequestCount int + + // Synchronization + lock sync.Mutex + DeleteWaitChan chan struct{} + PatchWaitChan chan struct{} + + runtimeClient ctlclient.Client + NodeUpdateReactor func(node *v1.Node, syncStatus bool) +} + +// GetUpdatedNodesCopy returns a slice of Nodes with updates applied. +func (m *FakeNodeHandler) GetUpdatedNodesCopy() []*v1.Node { + nodeList, err := m.List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return []*v1.Node{} + } + updatedNodesCopy := make([]*v1.Node, len(nodeList.Items), len(nodeList.Items)) + for i := range nodeList.Items { + updatedNodesCopy[i] = &nodeList.Items[i] + } + return updatedNodesCopy +} + +// GetUpdatedNodeStatusesCopy returns a slice of Nodes status with updates applied. +func (m *FakeNodeHandler) GetUpdatedNodeStatusesCopy() []*v1.NodeStatus { + nodes := m.GetUpdatedNodesCopy() + statuses := make([]*v1.NodeStatus, len(nodes), len(nodes)) + for i := range nodes { + statuses[i] = &nodes[i].Status + } + + return statuses +} + +// Create adds a new Node to the fake store. +func (m *FakeNodeHandler) Create(ctx context.Context, node *v1.Node, _ metav1.CreateOptions) (*v1.Node, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + m.lock.Unlock() + }() + + if err := m.runtimeClient.Create(ctx, node, &ctlclient.CreateOptions{}); err != nil { + return nil, err + } + + newNode := new(v1.Node) + if err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: node.Name}, newNode); err != nil { + return nil, err + } + + return newNode, nil +} + +// Get returns a Node from the fake store. 
+func (m *FakeNodeHandler) Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.Node, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + m.lock.Unlock() + }() + + newNode := new(v1.Node) + if err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: name}, newNode); err != nil { + klog.Errorf("failed to get node(%s), %v", name, err) + return nil, err + } + + return newNode, nil +} + +// List returns a list of Nodes from the fake store. +func (m *FakeNodeHandler) List(ctx context.Context, opts metav1.ListOptions) (*v1.NodeList, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + m.lock.Unlock() + }() + + clientOpts, err := convertListOptions(opts) + if err != nil { + return nil, err + } + + nodeList := &v1.NodeList{} + if err := m.runtimeClient.List(ctx, nodeList, &clientOpts); err != nil { + return nil, err + } + + return nodeList, nil +} + +func convertListOptions(opts metav1.ListOptions) (ctlclient.ListOptions, error) { + var clientOpts ctlclient.ListOptions + if opts.LabelSelector != "" { + if selector, err := labels.Parse(opts.LabelSelector); err != nil { + return clientOpts, err + } else { + clientOpts.LabelSelector = selector + } + } + + if opts.FieldSelector != "" { + if selector, err := fields.ParseSelector(opts.FieldSelector); err != nil { + return clientOpts, err + } else { + clientOpts.FieldSelector = selector + } + } + + return clientOpts, nil +} + +// Delete deletes a Node from the fake store. +func (m *FakeNodeHandler) Delete(ctx context.Context, id string, opt metav1.DeleteOptions) error { + m.lock.Lock() + defer func() { + m.RequestCount++ + if m.DeleteWaitChan != nil { + m.DeleteWaitChan <- struct{}{} + } + m.lock.Unlock() + }() + + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: id, + }, + } + return m.runtimeClient.Delete(ctx, node, &ctlclient.DeleteOptions{}) +} + +// DeleteCollection deletes a collection of Nodes from the fake store. 
+func (m *FakeNodeHandler) DeleteCollection(_ context.Context, opt metav1.DeleteOptions, listOpts metav1.ListOptions) error { + return nil +} + +// Update updates a Node in the fake store. +func (m *FakeNodeHandler) Update(ctx context.Context, node *v1.Node, _ metav1.UpdateOptions) (*v1.Node, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + m.lock.Unlock() + }() + + if err := m.runtimeClient.Update(ctx, node, &ctlclient.UpdateOptions{}); err != nil { + return node, err + } + + newNode := new(v1.Node) + if err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: node.Name}, newNode); err != nil { + return newNode, err + } + + if m.NodeUpdateReactor != nil { + m.NodeUpdateReactor(newNode, false) + } + return newNode, nil +} + +// UpdateStatus updates a status of a Node in the fake store. +func (m *FakeNodeHandler) UpdateStatus(ctx context.Context, node *v1.Node, _ metav1.UpdateOptions) (*v1.Node, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + m.lock.Unlock() + }() + + if err := m.runtimeClient.Status().Update(ctx, node, &ctlclient.UpdateOptions{}); err != nil { + return node, err + } + + newNode := new(v1.Node) + if err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: node.Name}, newNode); err != nil { + return newNode, err + } + + if m.NodeUpdateReactor != nil { + m.NodeUpdateReactor(newNode, true) + } + + return newNode, nil +} + +// PatchStatus patches a status of a Node in the fake store. 
+func (m *FakeNodeHandler) PatchStatus(ctx context.Context, nodeName string, data []byte) (*v1.Node, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + m.lock.Unlock() + }() + + node := &v1.Node{} + err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: nodeName}, node) + if err != nil { + return nil, err + } + + patch := ctlclient.RawPatch(types.StrategicMergePatchType, data) + if err := m.runtimeClient.Status().Patch(ctx, node, patch); err != nil { + return node, err + } + + newNode := new(v1.Node) + if err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: node.Name}, newNode); err != nil { + return newNode, err + } + + if m.NodeUpdateReactor != nil { + m.NodeUpdateReactor(newNode, false) + } + + return newNode, nil +} + +// Watch watches Nodes in a fake store. +func (m *FakeNodeHandler) Watch(_ context.Context, opts metav1.ListOptions) (watch.Interface, error) { + return watch.NewFake(), nil +} + +// Patch patches a Node in the fake store. +func (m *FakeNodeHandler) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, _ metav1.PatchOptions, subresources ...string) (*v1.Node, error) { + m.lock.Lock() + defer func() { + m.RequestCount++ + if m.PatchWaitChan != nil { + m.PatchWaitChan <- struct{}{} + } + m.lock.Unlock() + }() + + node := new(v1.Node) + err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: name}, node) + if err != nil { + return nil, err + } + + patch := ctlclient.RawPatch(pt, data) + if err := m.runtimeClient.Patch(ctx, node, patch); err != nil { + return node, err + } + + newNode := new(v1.Node) + if err := m.runtimeClient.Get(ctx, ctlclient.ObjectKey{Name: node.Name}, newNode); err != nil { + return newNode, err + } + + if m.NodeUpdateReactor != nil { + m.NodeUpdateReactor(newNode, false) + } + + return newNode, nil +} + +// Apply applies a NodeApplyConfiguration to a Node in the fake store. 
+func (m *FakeNodeHandler) Apply(ctx context.Context, node *v1apply.NodeApplyConfiguration, opts metav1.ApplyOptions) (*v1.Node, error) { + patchOpts := opts.ToPatchOptions() + data, err := json.Marshal(node) + if err != nil { + return nil, err + } + name := node.Name + if name == nil { + return nil, fmt.Errorf("deployment.Name must be provided to Apply") + } + + return m.Patch(ctx, *name, types.ApplyPatchType, data, patchOpts) +} + +// ApplyStatus applies a status of a Node in the fake store. +func (m *FakeNodeHandler) ApplyStatus(ctx context.Context, node *v1apply.NodeApplyConfiguration, opts metav1.ApplyOptions) (*v1.Node, error) { + patchOpts := opts.ToPatchOptions() + data, err := json.Marshal(node) + if err != nil { + return nil, err + } + name := node.Name + if name == nil { + return nil, fmt.Errorf("deployment.Name must be provided to Apply") + } + + return m.Patch(ctx, *name, types.ApplyPatchType, data, patchOpts, "status") +} + +// FakeRecorder is used as a fake during testing. +type FakeRecorder struct { + sync.Mutex + source v1.EventSource + Events []*v1.Event + clock clock.Clock +} + +// Event emits a fake event to the fake recorder +func (f *FakeRecorder) Event(obj runtime.Object, eventtype, reason, message string) { + f.generateEvent(obj, metav1.Now(), eventtype, reason, message) +} + +// Eventf emits a fake formatted event to the fake recorder +func (f *FakeRecorder) Eventf(obj runtime.Object, eventtype, reason, messageFmt string, args ...interface{}) { + f.Event(obj, eventtype, reason, fmt.Sprintf(messageFmt, args...)) +} + +// AnnotatedEventf emits a fake formatted event to the fake recorder +func (f *FakeRecorder) AnnotatedEventf(obj runtime.Object, annotations map[string]string, eventtype, reason, messageFmt string, args ...interface{}) { + f.Eventf(obj, eventtype, reason, messageFmt, args...) 
+} + +func (f *FakeRecorder) generateEvent(obj runtime.Object, timestamp metav1.Time, eventtype, reason, message string) { + scheme := runtime.NewScheme() + clientgoscheme.AddToScheme(scheme) + f.Lock() + defer f.Unlock() + ref, err := ref.GetReference(scheme, obj) + if err != nil { + klog.ErrorS(err, "Encountered error while getting reference") + return + } + event := f.makeEvent(ref, eventtype, reason, message) + event.Source = f.source + if f.Events != nil { + f.Events = append(f.Events, event) + } +} + +func (f *FakeRecorder) makeEvent(ref *v1.ObjectReference, eventtype, reason, message string) *v1.Event { + t := metav1.Time{Time: f.clock.Now()} + namespace := ref.Namespace + if namespace == "" { + namespace = metav1.NamespaceDefault + } + + clientref := v1.ObjectReference{ + Kind: ref.Kind, + Namespace: ref.Namespace, + Name: ref.Name, + UID: ref.UID, + APIVersion: ref.APIVersion, + ResourceVersion: ref.ResourceVersion, + FieldPath: ref.FieldPath, + } + + return &v1.Event{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%v.%x", ref.Name, t.UnixNano()), + Namespace: namespace, + }, + InvolvedObject: clientref, + Reason: reason, + Message: message, + FirstTimestamp: t, + LastTimestamp: t, + Count: 1, + Type: eventtype, + } +} + +// NewFakeRecorder returns a pointer to a newly constructed FakeRecorder. +func NewFakeRecorder() *FakeRecorder { + return &FakeRecorder{ + source: v1.EventSource{Component: "nodeControllerTest"}, + Events: []*v1.Event{}, + clock: testingclock.NewFakeClock(time.Now()), + } +} + +// NewNode is a helper function for creating Nodes for testing. +func NewNode(name string) *v1.Node { + return &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), + v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), + }, + }, + } +} + +// NewPod is a helper function for creating Pods for testing. 
+func NewPod(name, host string) *v1.Pod { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: name, + }, + Spec: v1.PodSpec{ + NodeName: host, + }, + Status: v1.PodStatus{ + Conditions: []v1.PodCondition{ + { + Type: v1.PodReady, + Status: v1.ConditionTrue, + }, + }, + }, + } + + return pod +} + +// GetZones returns list of zones for all Nodes stored in FakeNodeHandler +func GetZones(nodeHandler *ImprovedFakeNodeHandler) []string { + nodes, _ := nodeHandler.DelegateNodeHandler.List(context.TODO(), metav1.ListOptions{}) + zones := sets.NewString() + for _, node := range nodes.Items { + zones.Insert(utilnode.GetZoneKey(&node)) + } + return zones.List() +} + +// CreateZoneID returns a single zoneID for a given region and zone. +func CreateZoneID(region, zone string) string { + return region + ":\x00:" + zone +} diff --git a/pkg/yurtmanager/controller/util/helper/helpers.go b/pkg/yurtmanager/controller/util/helper/helpers.go new file mode 100644 index 00000000000..bbd6dda2c9b --- /dev/null +++ b/pkg/yurtmanager/controller/util/helper/helpers.go @@ -0,0 +1,75 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package helper + +import ( + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/conversion" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/labels" +) + +// Semantic can do semantic deep equality checks for core objects. +// Example: apiequality.Semantic.DeepEqual(aPod, aPodWithNonNilButEmptyMaps) == true +var Semantic = conversion.EqualitiesOrDie( + func(a, b resource.Quantity) bool { + // Ignore formatting, only care that numeric value stayed the same. + // TODO: if we decide it's important, it should be safe to start comparing the format. + // + // Uninitialized quantities are equivalent to 0 quantities. + return a.Cmp(b) == 0 + }, + func(a, b metav1.MicroTime) bool { + return a.UTC() == b.UTC() + }, + func(a, b metav1.Time) bool { + return a.UTC() == b.UTC() + }, + func(a, b labels.Selector) bool { + return a.String() == b.String() + }, + func(a, b fields.Selector) bool { + return a.String() == b.String() + }, +) + +// GetMatchingTolerations returns true and list of Tolerations matching all Taints if all are tolerated, or false otherwise. 
+func GetMatchingTolerations(taints []v1.Taint, tolerations []v1.Toleration) (bool, []v1.Toleration) { + if len(taints) == 0 { + return true, []v1.Toleration{} + } + if len(tolerations) == 0 && len(taints) > 0 { + return false, []v1.Toleration{} + } + result := []v1.Toleration{} + for i := range taints { + tolerated := false + for j := range tolerations { + if tolerations[j].ToleratesTaint(&taints[i]) { + result = append(result, tolerations[j]) + tolerated = true + break + } + } + if !tolerated { + return false, []v1.Toleration{} + } + } + return true, result +} diff --git a/pkg/yurtmanager/controller/util/helper/helpers_test.go b/pkg/yurtmanager/controller/util/helper/helpers_test.go new file mode 100644 index 00000000000..b71ac46244e --- /dev/null +++ b/pkg/yurtmanager/controller/util/helper/helpers_test.go @@ -0,0 +1,46 @@ +/* +Copyright 2015 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package helper + +import ( + "testing" + + "k8s.io/apimachinery/pkg/api/resource" +) + +func TestSemantic(t *testing.T) { + table := []struct { + a, b interface{} + shouldEqual bool + }{ + {resource.MustParse("0"), resource.Quantity{}, true}, + {resource.Quantity{}, resource.MustParse("0"), true}, + {resource.Quantity{}, resource.MustParse("1m"), false}, + { + resource.NewQuantity(5, resource.BinarySI), + resource.NewQuantity(5, resource.DecimalSI), + true, + }, + {resource.MustParse("2m"), resource.MustParse("1m"), false}, + } + + for index, item := range table { + if e, a := item.shouldEqual, Semantic.DeepEqual(item.a, item.b); e != a { + t.Errorf("case[%d], expected %v, got %v.", index, e, a) + } + } +} diff --git a/pkg/yurtmanager/controller/util/node/controller_utils.go b/pkg/yurtmanager/controller/util/node/controller_utils.go index b96981d06ad..03bcb70a295 100644 --- a/pkg/yurtmanager/controller/util/node/controller_utils.go +++ b/pkg/yurtmanager/controller/util/node/controller_utils.go @@ -18,9 +18,250 @@ limitations under the License. 
package node import ( + "context" + "encoding/json" + "fmt" + "time" + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/strategicpatch" + "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" + clientretry "k8s.io/client-go/util/retry" + "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/kubelet/util/format" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/openyurtio/openyurt/pkg/projectinfo" + taintutils "github.com/openyurtio/openyurt/pkg/util/taints" + utilpod "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/pod" ) +const ( + // NodeUnreachablePodReason is the reason on a pod when its state cannot be confirmed as kubelet is unresponsive + // on the node it is (was) running. + NodeUnreachablePodReason = "NodeLost" + // NodeUnreachablePodMessage is the message on a pod when its state cannot be confirmed as kubelet is unresponsive + // on the node it is (was) running. + NodeUnreachablePodMessage = "Node %v which was running pod %v is unresponsive" + + // PodBindingAnnotation can be added into pod annotation, which indicates that this pod will be bound to the node that it is scheduled to. + PodBindingAnnotation = "apps.openyurt.io/binding" +) + +var UpdateTaintBackoff = wait.Backoff{ + Steps: 5, + Duration: 100 * time.Millisecond, + Jitter: 1.0, +} + +var UpdateLabelBackoff = wait.Backoff{ + Steps: 5, + Duration: 100 * time.Millisecond, + Jitter: 1.0, +} + +// DeletePods will delete all pods from master running on given node, +// and return true if any pods were deleted, or were found pending +// deletion. 
+func DeletePods(ctx context.Context, c client.Client, pods []*v1.Pod, recorder record.EventRecorder, nodeName, nodeUID string) (bool, error) { + remaining := false + var updateErrList []error + + if len(pods) > 0 { + RecordNodeEvent(ctx, recorder, nodeName, nodeUID, v1.EventTypeNormal, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeName)) + } + + for i := range pods { + // Defensive check, also needed for tests. + if pods[i].Spec.NodeName != nodeName { + continue + } + + // Pod will be modified, so making copy is required. + pod := pods[i].DeepCopy() + // Set reason and message in the pod object. + if _, err := SetPodTerminationReason(ctx, c, pod, nodeName); err != nil { + if apierrors.IsConflict(err) { + updateErrList = append(updateErrList, + fmt.Errorf("update status failed for pod %q: %v", format.Pod(pod), err)) + continue + } + } + // if the pod has already been marked for deletion, we still return true that there are remaining pods. + if pod.DeletionGracePeriodSeconds != nil { + remaining = true + continue + } + + klog.InfoS("Starting deletion of pod", "pod", klog.KObj(pod)) + recorder.Eventf(pod, v1.EventTypeNormal, "NodeControllerEviction", "Marking for deletion Pod %s from Node %s", pod.Name, nodeName) + //if err := kubeClient.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { + if err := c.Delete(ctx, pod); err != nil { + if apierrors.IsNotFound(err) { + // NotFound error means that pod was already deleted. + // There is nothing left to do with this pod. + continue + } + return false, err + } + remaining = true + } + + if len(updateErrList) > 0 { + return false, utilerrors.NewAggregate(updateErrList) + } + return remaining, nil +} + +// SetPodTerminationReason attempts to set a reason and message in the +// pod status, updates it in the apiserver, and returns an error if it +// encounters one. 
+func SetPodTerminationReason(ctx context.Context, c client.Client, pod *v1.Pod, nodeName string) (*v1.Pod, error) { + if pod.Status.Reason == NodeUnreachablePodReason { + return pod, nil + } + + pod.Status.Reason = NodeUnreachablePodReason + pod.Status.Message = fmt.Sprintf(NodeUnreachablePodMessage, nodeName, pod.Name) + + var err error + if err = c.Status().Update(ctx, pod); err != nil { + return nil, err + } + + return pod, nil +} + +// MarkPodsNotReady updates ready status of given pods running on +// given node from master return true if success +func MarkPodsNotReady(ctx context.Context, c client.Client, recorder record.EventRecorder, pods []*v1.Pod, nodeName string) error { + klog.V(2).InfoS("Update ready status of pods on node", "node", klog.KRef("", nodeName)) + + errs := []error{} + for i := range pods { + // Defensive check, also needed for tests. + if pods[i].Spec.NodeName != nodeName { + continue + } + + // Pod will be modified, so making copy is required. + pod := pods[i].DeepCopy() + for _, cond := range pod.Status.Conditions { + if cond.Type != v1.PodReady { + continue + } + + cond.Status = v1.ConditionFalse + if !utilpod.UpdatePodCondition(&pod.Status, &cond) { + break + } + + klog.V(2).InfoS("Updating ready status of pod to false", "pod", klog.KObj(pod)) + //if _, err := kubeClient.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil { + if err := c.Status().Update(ctx, pod, &client.UpdateOptions{}); err != nil { + if apierrors.IsNotFound(err) { + // NotFound error means that pod was already deleted. + // There is nothing left to do with this pod. 
+ continue + } + klog.InfoS("Failed to update status for pod", "pod", klog.KObj(pod), "err", err) + errs = append(errs, err) + } + // record NodeNotReady event after updateStatus to make sure pod still exists + recorder.Event(pod, v1.EventTypeWarning, "NodeNotReady", "Node is not ready") + break + } + } + + return utilerrors.NewAggregate(errs) +} + +// RecordNodeEvent records a event related to a node. +func RecordNodeEvent(ctx context.Context, recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) { + ref := &v1.ObjectReference{ + APIVersion: "v1", + Kind: "Node", + Name: nodeName, + UID: types.UID(nodeUID), + Namespace: "", + } + klog.V(2).InfoS("Recording event message for node", "event", event, "node", klog.KRef("", nodeName)) + recorder.Eventf(ref, eventtype, reason, "Node %s event: %s", nodeName, event) +} + +// RecordNodeStatusChange records a event related to a node status change. (Common to lifecycle and ipam) +func RecordNodeStatusChange(recorder record.EventRecorder, node *v1.Node, newStatus string) { + ref := &v1.ObjectReference{ + APIVersion: "v1", + Kind: "Node", + Name: node.Name, + UID: node.UID, + Namespace: "", + } + klog.V(2).InfoS("Recording status change event message for node", "status", newStatus, "node", node.Name) + // TODO: This requires a transaction, either both node status is updated + // and event is recorded or neither should happen, see issue #6055. + recorder.Eventf(ref, v1.EventTypeNormal, newStatus, "Node %s status is now: %s", node.Name, newStatus) +} + +// SwapNodeControllerTaint returns true in case of success and false +// otherwise. +func SwapNodeControllerTaint(ctx context.Context, kubeClient clientset.Interface, taintsToAdd, taintsToRemove []*v1.Taint, node *v1.Node) bool { + for _, taintToAdd := range taintsToAdd { + now := metav1.Now() + taintToAdd.TimeAdded = &now + } + + err := AddOrUpdateTaintOnNode(ctx, kubeClient, node.Name, taintsToAdd...) 
+ if err != nil { + utilruntime.HandleError( + fmt.Errorf( + "unable to taint %+v unresponsive Node %q: %v", + taintsToAdd, + node.Name, + err)) + return false + } + klog.V(4).InfoS("Added taint to node", "taint", taintsToAdd, "node", klog.KRef("", node.Name)) + + err = RemoveTaintOffNode(ctx, kubeClient, node.Name, node, taintsToRemove...) + if err != nil { + utilruntime.HandleError( + fmt.Errorf( + "unable to remove %+v unneeded taint from unresponsive Node %q: %v", + taintsToRemove, + node.Name, + err)) + return false + } + klog.V(4).InfoS("Made sure that node has no taint", "node", klog.KRef("", node.Name), "taint", taintsToRemove) + + return true +} + +// AddOrUpdateLabelsOnNode updates the labels on the node and returns true on +// success and false on failure. +func AddOrUpdateLabelsOnNode(ctx context.Context, kubeClient clientset.Interface, labelsToUpdate map[string]string, node *v1.Node) bool { + if err := addOrUpdateLabelsOnNode(kubeClient, node.Name, labelsToUpdate); err != nil { + utilruntime.HandleError( + fmt.Errorf( + "unable to update labels %+v for Node %q: %v", + labelsToUpdate, + node.Name, + err)) + return false + } + klog.V(4).InfoS("Updated labels to node", "label", labelsToUpdate, "node", klog.KRef("", node.Name)) + return true +} + // GetNodeCondition extracts the provided condition from the given status and returns that. // Returns nil and -1 if the condition is not present, and the index of the located condition. func GetNodeCondition(status *v1.NodeStatus, conditionType v1.NodeConditionType) (int, *v1.NodeCondition) { @@ -34,3 +275,185 @@ func GetNodeCondition(status *v1.NodeStatus, conditionType v1.NodeConditionType) } return -1, nil } + +// AddOrUpdateTaintOnNode add taints to the node. If taint was added into node, it'll issue API calls +// to update nodes; otherwise, no API calls. Return error if any. 
+func AddOrUpdateTaintOnNode(ctx context.Context, c clientset.Interface, nodeName string, taints ...*v1.Taint) error { + if len(taints) == 0 { + return nil + } + firstTry := true + return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error { + var err error + var oldNode *v1.Node + // First we try getting node from the API server cache, as it's cheaper. If it fails + // we get it from etcd to be sure to have fresh data. + option := metav1.GetOptions{} + if firstTry { + option.ResourceVersion = "0" + firstTry = false + } + oldNode, err = c.CoreV1().Nodes().Get(ctx, nodeName, option) + if err != nil { + return err + } + + var newNode *v1.Node + oldNodeCopy := oldNode + updated := false + for _, taint := range taints { + curNewNode, ok, err := taintutils.AddOrUpdateTaint(oldNodeCopy, taint) + if err != nil { + return fmt.Errorf("failed to update taint of node") + } + updated = updated || ok + newNode = curNewNode + oldNodeCopy = curNewNode + } + if !updated { + return nil + } + return PatchNodeTaints(ctx, c, nodeName, oldNode, newNode) + }) +} + +// RemoveTaintOffNode is for cleaning up taints temporarily added to node, +// won't fail if target taint doesn't exist or has been removed. +// If passed a node it'll check if there's anything to be done, if taint is not present it won't issue +// any API calls. +func RemoveTaintOffNode(ctx context.Context, c clientset.Interface, nodeName string, node *v1.Node, taints ...*v1.Taint) error { + if len(taints) == 0 { + return nil + } + // Short circuit for limiting amount of API calls. + if node != nil { + match := false + for _, taint := range taints { + if taintutils.TaintExists(node.Spec.Taints, taint) { + match = true + break + } + } + if !match { + return nil + } + } + + firstTry := true + return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error { + var err error + var oldNode *v1.Node + // First we try getting node from the API server cache, as it's cheaper. 
If it fails + // we get it from etcd to be sure to have fresh data. + option := metav1.GetOptions{} + if firstTry { + option.ResourceVersion = "0" + firstTry = false + } + oldNode, err = c.CoreV1().Nodes().Get(ctx, nodeName, option) + if err != nil { + return err + } + + var newNode *v1.Node + oldNodeCopy := oldNode + updated := false + for _, taint := range taints { + curNewNode, ok, err := taintutils.RemoveTaint(oldNodeCopy, taint) + if err != nil { + return fmt.Errorf("failed to remove taint of node") + } + updated = updated || ok + newNode = curNewNode + oldNodeCopy = curNewNode + } + if !updated { + return nil + } + return PatchNodeTaints(ctx, c, nodeName, oldNode, newNode) + }) +} + +// PatchNodeTaints patches node's taints. +func PatchNodeTaints(ctx context.Context, c clientset.Interface, nodeName string, oldNode *v1.Node, newNode *v1.Node) error { + // Strip base diff node from RV to ensure that our Patch request will set RV to check for conflicts over .spec.taints. + // This is needed because .spec.taints does not specify patchMergeKey and patchStrategy and adding them is no longer an option for compatibility reasons. + // Using other Patch strategy works for adding new taints, however will not resolve problem with taint removal. 
+ oldNodeNoRV := oldNode.DeepCopy() + oldNodeNoRV.ResourceVersion = "" + oldDataNoRV, err := json.Marshal(&oldNodeNoRV) + if err != nil { + return fmt.Errorf("failed to marshal old node %#v for node %q: %v", oldNodeNoRV, nodeName, err) + } + + newTaints := newNode.Spec.Taints + newNodeClone := oldNode.DeepCopy() + newNodeClone.Spec.Taints = newTaints + newData, err := json.Marshal(newNodeClone) + if err != nil { + return fmt.Errorf("failed to marshal new node %#v for node %q: %v", newNodeClone, nodeName, err) + } + + patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldDataNoRV, newData, v1.Node{}) + if err != nil { + return fmt.Errorf("failed to create patch for node %q: %v", nodeName, err) + } + + _, err = c.CoreV1().Nodes().Patch(ctx, nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}) + return err +} + +func addOrUpdateLabelsOnNode(kubeClient clientset.Interface, nodeName string, labelsToUpdate map[string]string) error { + firstTry := true + return clientretry.RetryOnConflict(UpdateLabelBackoff, func() error { + var err error + var node *v1.Node + // First we try getting node from the API server cache, as it's cheaper. If it fails + // we get it from etcd to be sure to have fresh data. + option := metav1.GetOptions{} + if firstTry { + option.ResourceVersion = "0" + firstTry = false + } + node, err = kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, option) + if err != nil { + return err + } + + // Make a copy of the node and update the labels. 
+ newNode := node.DeepCopy() + if newNode.Labels == nil { + newNode.Labels = make(map[string]string) + } + for key, value := range labelsToUpdate { + newNode.Labels[key] = value + } + + oldData, err := json.Marshal(node) + if err != nil { + return fmt.Errorf("failed to marshal the existing node %#v: %v", node, err) + } + newData, err := json.Marshal(newNode) + if err != nil { + return fmt.Errorf("failed to marshal the new node %#v: %v", newNode, err) + } + patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Node{}) + if err != nil { + return fmt.Errorf("failed to create a two-way merge patch: %v", err) + } + if _, err := kubeClient.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("failed to patch the node: %v", err) + } + return nil + }) +} + +func IsPodBoundenToNode(node *v1.Node) bool { + if node.Annotations != nil && + (node.Annotations[projectinfo.GetAutonomyAnnotation()] == "true" || + node.Annotations[PodBindingAnnotation] == "true") { + return true + } + + return false +} diff --git a/pkg/yurtmanager/controller/util/tools.go b/pkg/yurtmanager/controller/util/tools.go index 0a199cee4e4..2f3d9302a00 100644 --- a/pkg/yurtmanager/controller/util/tools.go +++ b/pkg/yurtmanager/controller/util/tools.go @@ -19,9 +19,16 @@ limitations under the License. 
package util import ( + "fmt" "sync" + "time" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/integer" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/manager" + + controllerimpl "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/internal/controller" ) // SlowStartBatch tries to call the provided function a total of 'count' times, @@ -62,3 +69,42 @@ func SlowStartBatch(count int, initialBatchSize int, fn func(index int) error) ( } return successes, nil } + +func NewNoReconcileController(name string, mgr manager.Manager, options controller.Options) (*controllerimpl.Controller, error) { + if len(name) == 0 { + return nil, fmt.Errorf("must specify Name for Controller") + } + + if options.Log == nil { + options.Log = mgr.GetLogger() + } + + if options.CacheSyncTimeout == 0 { + options.CacheSyncTimeout = 2 * time.Minute + } + + if options.RateLimiter == nil { + options.RateLimiter = workqueue.DefaultControllerRateLimiter() + } + + // Inject dependencies into Reconciler + if err := mgr.SetFields(options.Reconciler); err != nil { + return nil, err + } + + // Create controller with dependencies set + c := &controllerimpl.Controller{ + MakeQueue: func() workqueue.RateLimitingInterface { + return workqueue.NewNamedRateLimitingQueue(options.RateLimiter, name) + }, + CacheSyncTimeout: options.CacheSyncTimeout, + SetFields: mgr.SetFields, + Name: name, + RecoverPanic: options.RecoverPanic, + } + + if err := mgr.Add(c); err != nil { + return c, err + } + return c, nil +} diff --git a/pkg/yurtmanager/controller/yurtappdaemon/yurtappdaemon_controller.go b/pkg/yurtmanager/controller/yurtappdaemon/yurtappdaemon_controller.go index d36bb905cb5..9214a114f47 100644 --- a/pkg/yurtmanager/controller/yurtappdaemon/yurtappdaemon_controller.go +++ b/pkg/yurtmanager/controller/yurtappdaemon/yurtappdaemon_controller.go @@ -72,7 +72,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new YurtAppDaemon Controller and 
adds it to the Manager with default RBAC. // The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *config.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *config.CompletedConfig, mgr manager.Manager) error { if _, err := mgr.GetRESTMapper().KindFor(controllerResource); err != nil { klog.Infof("resource %s doesn't exist", controllerResource.String()) return err diff --git a/pkg/yurtmanager/controller/yurtappoverrider/yurtappoverrider_controller.go b/pkg/yurtmanager/controller/yurtappoverrider/yurtappoverrider_controller.go index 7528790202a..0a5962b81bd 100644 --- a/pkg/yurtmanager/controller/yurtappoverrider/yurtappoverrider_controller.go +++ b/pkg/yurtmanager/controller/yurtappoverrider/yurtappoverrider_controller.go @@ -61,7 +61,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new YurtAppOverrider Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { if _, err := mgr.GetRESTMapper().KindFor(controllerResource); err != nil { klog.Infof("resource %s doesn't exist", controllerResource.String()) return err diff --git a/pkg/yurtmanager/controller/yurtappset/yurtappset_controller.go b/pkg/yurtmanager/controller/yurtappset/yurtappset_controller.go index 8180aad3db7..f3ba0145ff0 100644 --- a/pkg/yurtmanager/controller/yurtappset/yurtappset_controller.go +++ b/pkg/yurtmanager/controller/yurtappset/yurtappset_controller.go @@ -72,7 +72,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new YurtAppSet Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(c *config.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *config.CompletedConfig, mgr manager.Manager) error { if _, err := mgr.GetRESTMapper().KindFor(controllerResource); err != nil { klog.Infof("resource %s doesn't exist", controllerResource.String()) return err diff --git a/pkg/yurtmanager/controller/yurtcoordinator/cert/yurtcoordinatorcert_controller.go b/pkg/yurtmanager/controller/yurtcoordinator/cert/yurtcoordinatorcert_controller.go index dc127095225..b3cd790f603 100644 --- a/pkg/yurtmanager/controller/yurtcoordinator/cert/yurtcoordinatorcert_controller.go +++ b/pkg/yurtmanager/controller/yurtcoordinator/cert/yurtcoordinatorcert_controller.go @@ -203,7 +203,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new YurtCoordinatorcert Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(cfg *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, cfg *appconfig.CompletedConfig, mgr manager.Manager) error { r := &ReconcileYurtCoordinatorCert{} // Create a new controller diff --git a/pkg/yurtmanager/controller/yurtcoordinator/delegatelease/delegatelease_controller.go b/pkg/yurtmanager/controller/yurtcoordinator/delegatelease/delegatelease_controller.go index d06ff113d71..d994ef63b4c 100644 --- a/pkg/yurtmanager/controller/yurtcoordinator/delegatelease/delegatelease_controller.go +++ b/pkg/yurtmanager/controller/yurtcoordinator/delegatelease/delegatelease_controller.go @@ -59,7 +59,7 @@ type ReconcileDelegateLease struct { // Add creates a delegatelease controller and add it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(_ *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, _ *appconfig.CompletedConfig, mgr manager.Manager) error { r := &ReconcileDelegateLease{ ldc: utils.NewLeaseDelegatedCounter(), delLdc: utils.NewLeaseDelegatedCounter(), diff --git a/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller.go b/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller.go index bcea5dbabd7..2b13609c3c7 100644 --- a/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller.go +++ b/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller.go @@ -34,8 +34,7 @@ import ( appconfig "github.com/openyurtio/openyurt/cmd/yurt-manager/app/config" "github.com/openyurtio/openyurt/cmd/yurt-manager/names" - "github.com/openyurtio/openyurt/pkg/projectinfo" - "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/yurtcoordinator/constant" + nodeutil "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/node" ) func init() { @@ -71,7 +70,7 @@ type ReconcilePodBinding struct { // Add creates a PodBingding controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
-func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { klog.Infof(Format("podbinding-controller add controller %s", controllerKind.String())) return add(mgr, newReconciler(c, mgr)) } @@ -90,23 +89,25 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { return err } - err = c.Watch(&source.Kind{Type: &corev1.Node{}}, &handler.EnqueueRequestForObject{}) - if err != nil { - return err - } - - klog.V(4).Info(Format("registering the field indexers of podbinding controller")) - err = mgr.GetFieldIndexer().IndexField(context.TODO(), &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string { - pod, ok := rawObj.(*corev1.Pod) - if ok { - return []string{pod.Spec.NodeName} - } - return []string{} - }) - if err != nil { - klog.Errorf(Format("failed to register field indexers for podbinding controller, %v", err)) - } - return err + return c.Watch(&source.Kind{Type: &corev1.Node{}}, &handler.EnqueueRequestForObject{}) + //err = c.Watch(&source.Kind{Type: &corev1.Node{}}, &handler.EnqueueRequestForObject{}) + //if err != nil { + // return err + //} + // + //klog.V(4).Info(Format("registering the field indexers of podbinding controller")) + // IndexField for spec.nodeName is registered in NodeLifeCycle, so we remove it here. 
+ //err = mgr.GetFieldIndexer().IndexField(context.TODO(), &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string { + // pod, ok := rawObj.(*corev1.Pod) + // if ok { + // return []string{pod.Spec.NodeName} + // } + // return []string{} + //}) + //if err != nil { + // klog.Errorf(Format("failed to register field indexers for podbinding controller, %v", err)) + //} + //return err } func (r *ReconcilePodBinding) InjectClient(c client.Client) error { @@ -152,7 +153,7 @@ func (r *ReconcilePodBinding) processNode(node *corev1.Node) error { } // pod binding takes precedence against node autonomy - if isPodBoundenToNode(node) { + if nodeutil.IsPodBoundenToNode(node) { if err := r.configureTolerationForPod(pod, nil); err != nil { klog.Errorf(Format("failed to configure toleration of pod, %v", err)) } @@ -205,16 +206,6 @@ func (r *ReconcilePodBinding) configureTolerationForPod(pod *corev1.Pod, tolerat return nil } -func isPodBoundenToNode(node *corev1.Node) bool { - if node.Annotations != nil && - (node.Annotations[projectinfo.GetAutonomyAnnotation()] == "true" || - node.Annotations[constant.PodBindingAnnotation] == "true") { - return true - } - - return false -} - func isDaemonSetPodOrStaticPod(pod *corev1.Pod) bool { if pod != nil { for i := range pod.OwnerReferences { diff --git a/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller_test.go b/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller_test.go index fe08e7bfc42..7a18b7722c3 100644 --- a/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller_test.go +++ b/pkg/yurtmanager/controller/yurtcoordinator/podbinding/podbinding_controller_test.go @@ -32,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" appconfig "github.com/openyurtio/openyurt/cmd/yurt-manager/app/config" + nodeutil "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/util/node" ) var ( @@ -398,8 +399,8 @@ func TestIsPodBoundenToNode(t *testing.T) { } for 
_, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := isPodBoundenToNode(tt.node); got != tt.want { - t.Errorf("isPodBoundenToNode() = %v, want %v", got, tt.want) + if got := nodeutil.IsPodBoundenToNode(tt.node); got != tt.want { + t.Errorf("IsPodBoundenToNode() = %v, want %v", got, tt.want) } }) } diff --git a/pkg/yurtmanager/controller/yurtstaticset/yurtstaticset_controller.go b/pkg/yurtmanager/controller/yurtstaticset/yurtstaticset_controller.go index 4fdd470ca8b..232762cce99 100644 --- a/pkg/yurtmanager/controller/yurtstaticset/yurtstaticset_controller.go +++ b/pkg/yurtmanager/controller/yurtstaticset/yurtstaticset_controller.go @@ -126,7 +126,7 @@ func Format(format string, args ...interface{}) string { // Add creates a new YurtStaticSet Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. -func Add(c *appconfig.CompletedConfig, mgr manager.Manager) error { +func Add(ctx context.Context, c *appconfig.CompletedConfig, mgr manager.Manager) error { if _, err := mgr.GetRESTMapper().KindFor(controllerResource); err != nil { klog.Infof("resource %s doesn't exist", controllerResource.String()) return err diff --git a/pkg/yurtmanager/webhook/node/v1/node_handler.go b/pkg/yurtmanager/webhook/node/v1/node_handler.go index e63bc1730a3..707b7de1278 100644 --- a/pkg/yurtmanager/webhook/node/v1/node_handler.go +++ b/pkg/yurtmanager/webhook/node/v1/node_handler.go @@ -48,8 +48,8 @@ func (webhook *NodeHandler) SetupWebhookWithManager(mgr ctrl.Manager) (string, s Complete() } -// +kubebuilder:webhook:path=/validate-core-openyurt-io-v1-node,mutating=false,failurePolicy=fail,sideEffects=None,admissionReviewVersions=v1,groups="",resources=nodes,verbs=update,versions=v1,name=validate.core.v1.node.openyurt.io -// 
+kubebuilder:webhook:path=/mutate-core-openyurt-io-v1-node,mutating=true,failurePolicy=fail,sideEffects=None,admissionReviewVersions=v1,groups="",resources=nodes,verbs=create;update,versions=v1,name=mutate.core.v1.node.openyurt.io +// +kubebuilder:webhook:path=/validate-core-openyurt-io-v1-node,mutating=false,failurePolicy=ignore,sideEffects=None,admissionReviewVersions=v1,groups="",resources=nodes,verbs=update,versions=v1,name=validate.core.v1.node.openyurt.io +// +kubebuilder:webhook:path=/mutate-core-openyurt-io-v1-node,mutating=true,failurePolicy=ignore,sideEffects=None,admissionReviewVersions=v1,groups="",resources=nodes,verbs=create;update,versions=v1,name=mutate.core.v1.node.openyurt.io // NodeHandler implements a validating and defaulting webhook for Cluster. type NodeHandler struct { diff --git a/pkg/yurtmanager/webhook/pod/v1/pod_handler.go b/pkg/yurtmanager/webhook/pod/v1/pod_handler.go deleted file mode 100644 index cd7df17daea..00000000000 --- a/pkg/yurtmanager/webhook/pod/v1/pod_handler.go +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright 2023 The OpenYurt Authors. - -Licensed under the Apache License, Version 2.0 (the License); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an AS IS BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1 - -import ( - v1 "k8s.io/api/core/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/apiutil" - - "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/builder" - "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/util" -) - -const ( - WebhookName = "interceptpodeviction" -) - -// SetupWebhookWithManager sets up Cluster webhooks. mutate path, validatepath, error -func (webhook *PodHandler) SetupWebhookWithManager(mgr ctrl.Manager) (string, string, error) { - // init - webhook.Client = mgr.GetClient() - - gvk, err := apiutil.GVKForObject(&v1.Pod{}, mgr.GetScheme()) - if err != nil { - return "", "", err - } - return util.GenerateMutatePath(gvk), - util.GenerateValidatePath(gvk), - builder.WebhookManagedBy(mgr). - For(&v1.Pod{}). - WithValidator(webhook). - Complete() -} - -// +kubebuilder:webhook:path=/validate-core-openyurt-io-v1-pod,mutating=false,failurePolicy=fail,sideEffects=None,admissionReviewVersions=v1;v1beta1,groups="",resources=pods,verbs=delete,versions=v1,name=validate.core.v1.pod.openyurt.io - -// Cluster implements a validating and defaulting webhook for PodHandler. -type PodHandler struct { - Client client.Client -} - -var _ builder.CustomValidator = &PodHandler{} diff --git a/pkg/yurtmanager/webhook/pod/v1/pod_validation.go b/pkg/yurtmanager/webhook/pod/v1/pod_validation.go deleted file mode 100644 index 249429d12e5..00000000000 --- a/pkg/yurtmanager/webhook/pod/v1/pod_validation.go +++ /dev/null @@ -1,149 +0,0 @@ -/* -Copyright 2023 The OpenYurt Authors. - -Licensed under the Apache License, Version 2.0 (the License); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an AS IS BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1 - -import ( - "context" - "fmt" - "strings" - "time" - - leasev1 "k8s.io/api/coordination/v1" - v1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/validation/field" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/webhook/admission" - - "github.com/openyurtio/openyurt/pkg/apis/apps" - appsv1alpha1 "github.com/openyurtio/openyurt/pkg/apis/apps/v1alpha1" - "github.com/openyurtio/openyurt/pkg/yurtmanager/controller/yurtcoordinator/constant" -) - -const ( - UserNodeController = "system:serviceaccount:kube-system:node-controller" - - NodeLeaseDurationSeconds = 40 - DefaultPoolReadyNodeNumberRatioThreshold = 0.35 -) - -// ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type. -func (webhook *PodHandler) ValidateCreate(ctx context.Context, obj runtime.Object, req admission.Request) error { - return nil -} - -// ValidateUpdate implements webhook.CustomValidator so a webhook will be registered for the type. -func (webhook *PodHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object, req admission.Request) error { - return nil -} - -// ValidateDelete implements webhook.CustomValidator so a webhook will be registered for the type. 
-func (webhook *PodHandler) ValidateDelete(_ context.Context, obj runtime.Object, req admission.Request) error { - po, ok := obj.(*v1.Pod) - if !ok { - return apierrors.NewBadRequest(fmt.Sprintf("expected a Pod but got a %T", obj)) - } - - if allErrs := validatePodDeletion(webhook.Client, po, req); len(allErrs) > 0 { - return apierrors.NewInvalid(v1.SchemeGroupVersion.WithKind("Pod").GroupKind(), po.Name, allErrs) - } - return nil -} - -func validatePodDeletion(cli client.Client, pod *v1.Pod, req admission.Request) field.ErrorList { - if !userIsNodeController(req) { - return nil - } - - nodeName := pod.Spec.NodeName - if nodeName == "" { - return nil - } - - node := v1.Node{} - if err := cli.Get(context.TODO(), client.ObjectKey{Name: nodeName}, &node); err != nil { - return nil - } - - // only validate pod which in nodePool - var nodePoolName string - if node.Labels != nil { - if name, ok := node.Labels[apps.NodePoolLabel]; ok { - nodePoolName = name - } - } - if nodePoolName == "" { - return nil - } - - // check number of ready nodes in node pool - nodePool := appsv1alpha1.NodePool{} - if err := cli.Get(context.TODO(), client.ObjectKey{Name: nodePoolName}, &nodePool); err != nil { - return nil - } - nodeNumber := len(nodePool.Status.Nodes) - if nodeNumber == 0 { - return nil - } - readyNumber := countAliveNode(cli, nodePool.Status.Nodes) - - // When number of ready nodes in node pool is below a configurable parameter, - // we don't allow pods to move within the pool any more. - // This threshold defaults to one third of the number of pool's nodes. 
- if float64(readyNumber)/float64(nodeNumber) < DefaultPoolReadyNodeNumberRatioThreshold { - return field.ErrorList([]*field.Error{ - field.Invalid(field.NewPath("metadata").Child("name"), pod.Name, "nodePool has too few ready nodes")}) - } - return nil -} - -func userIsNodeController(req admission.Request) bool { - // only user is node-controller can validate pod delete/evict - return strings.Contains(req.UserInfo.Username, UserNodeController) -} - -// countAliveNode return number of node alive -func countAliveNode(cli client.Client, nodes []string) int { - cnt := 0 - for _, n := range nodes { - if nodeIsAlive(cli, n) { - cnt++ - } - } - return cnt -} - -// nodeIsAlive return true if node is alive, otherwise is false -func nodeIsAlive(cli client.Client, nodeName string) bool { - lease := leasev1.Lease{} - if err := cli.Get(context.TODO(), client.ObjectKey{Namespace: v1.NamespaceNodeLease, Name: nodeName}, &lease); err != nil { - return false - } - - // check lease update time - diff := time.Now().Sub(lease.Spec.RenewTime.Time) - if diff.Seconds() > NodeLeaseDurationSeconds { - return false - } - - // check lease if delegate or not - if lease.Annotations != nil && lease.Annotations[constant.DelegateHeartBeat] == "true" { - return false - } - return true -} diff --git a/pkg/yurtmanager/webhook/server.go b/pkg/yurtmanager/webhook/server.go index 7b43a79c414..24a69678dc5 100644 --- a/pkg/yurtmanager/webhook/server.go +++ b/pkg/yurtmanager/webhook/server.go @@ -36,7 +36,6 @@ import ( v1beta1nodepool "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/nodepool/v1beta1" v1alpha1platformadmin "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/platformadmin/v1alpha1" v1alpha2platformadmin "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/platformadmin/v1alpha2" - v1pod "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/pod/v1" "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/util" webhookcontroller 
"github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/util/controller" v1alpha1yurtappdaemon "github.com/openyurtio/openyurt/pkg/yurtmanager/webhook/yurtappdaemon/v1alpha1" @@ -81,7 +80,6 @@ func init() { addControllerWebhook(names.YurtAppOverriderController, &v1alpha1yurtappoverrider.YurtAppOverriderHandler{}) addControllerWebhook(names.YurtAppOverriderController, &v1alpha1deploymentrender.DeploymentRenderHandler{}) - independentWebhooks[v1pod.WebhookName] = &v1pod.PodHandler{} independentWebhooks[v1node.WebhookName] = &v1node.NodeHandler{} }