From 66423629a134910370748b1bc9875a4da853db32 Mon Sep 17 00:00:00 2001 From: Eric Weber Date: Mon, 11 Dec 2023 15:15:33 -0600 Subject: [PATCH] Allow kubelet to be down for 10 seconds before responding Longhorn 7302 Signed-off-by: Eric Weber --- controller/node_controller.go | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/controller/node_controller.go b/controller/node_controller.go index acf94bbebe..bf1e543c16 100644 --- a/controller/node_controller.go +++ b/controller/node_controller.go @@ -34,6 +34,7 @@ import ( var ( nodeControllerResyncPeriod = 30 * time.Second + ignoreKubeletNotReadyTime = 10 * time.Second unknownFsid = "UNKNOWN_FSID" @@ -430,11 +431,18 @@ func (nc *NodeController) syncNode(key string) (err error) { switch con.Type { case v1.NodeReady: if con.Status != v1.ConditionTrue { - node.Status.Conditions = types.SetConditionAndRecord(node.Status.Conditions, - longhorn.NodeConditionTypeReady, longhorn.ConditionStatusFalse, - string(longhorn.NodeConditionReasonKubernetesNodeNotReady), - fmt.Sprintf("Kubernetes node %v not ready: %v", node.Name, con.Reason), - nc.eventRecorder, node, v1.EventTypeWarning) + if con.Status == v1.ConditionFalse && time.Since(con.LastTransitionTime.Time) < ignoreKubeletNotReadyTime { + // When kubelet restarts, it briefly reports Ready == False. Responding too quickly can cause + // undesirable churn. See https://github.com/longhorn/longhorn/issues/7302 for an example. + nc.logger.Warnf("Ignoring %v == %v condition due to %v until %v", v1.NodeReady, con.Status, + con.Reason, con.LastTransitionTime.Add(ignoreKubeletNotReadyTime)) + } else { + node.Status.Conditions = types.SetConditionAndRecord(node.Status.Conditions, + longhorn.NodeConditionTypeReady, longhorn.ConditionStatusFalse, + string(longhorn.NodeConditionReasonKubernetesNodeNotReady), + fmt.Sprintf("Kubernetes node %v not ready: %v", node.Name, con.Reason), + nc.eventRecorder, node, v1.EventTypeWarning) + } } case v1.NodeDiskPressure, v1.NodePIDPressure,