Skip to content

Commit

Permalink
Add event generation to eviction requests and cancellation
Browse files Browse the repository at this point in the history
Signed-off-by: Eric Weber <[email protected]>
  • Loading branch information
ejweber committed Nov 1, 2023
1 parent 321e9f9 commit ec04802
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 30 deletions.
77 changes: 47 additions & 30 deletions controller/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1446,11 +1446,16 @@ func (nc *NodeController) createSnapshotMonitor() (mon monitor.Monitor, err erro
func (nc *NodeController) syncReplicaEvictionRequested(node *longhorn.Node, kubeNode *corev1.Node) error {
log := getLoggerForNode(nc.logger, node)
node.Status.AutoEvicting = false
replicasToSync := []*longhorn.Replica{}
nodeDrainPolicy, err := nc.ds.GetSettingValueExisted(types.SettingNameNodeDrainPolicy)
if err != nil {
return err
return errors.Wrapf(err, "failed to get %v setting", types.SettingNameNodeDrainPolicy)
}

type replicaToSync struct {
*longhorn.Replica
syncReason string
}
replicasToSync := []replicaToSync{}

for diskName, diskSpec := range node.Spec.Disks {
diskStatus := node.Status.DiskStatus[diskName]
Expand All @@ -1459,62 +1464,74 @@ func (nc *NodeController) syncReplicaEvictionRequested(node *longhorn.Node, kube
if err != nil {
return err
}
shouldEvictReplica, err := nc.shouldEvictReplica(node, kubeNode, &diskSpec, replica, nodeDrainPolicy)
shouldEvictReplica, reason, err := nc.shouldEvictReplica(node, kubeNode, &diskSpec, replica,
nodeDrainPolicy)
if err != nil {
replicaLog := log.WithField("replica", replica.Name)
replicaLog.Warn("Failed to check if replica should be evicted, will enqueue then resync node")
return err
}
if replica.Spec.EvictionRequested != shouldEvictReplica {
replica.Spec.EvictionRequested = shouldEvictReplica
replicasToSync = append(replicasToSync, replica)
replicasToSync = append(replicasToSync, replicaToSync{replica, reason})
}

if replica.Spec.EvictionRequested && (nodeDrainPolicy == string(types.NodeDrainPolicyBlockForEviction) ||
nodeDrainPolicy == string(types.NodeDrainPolicyBlockForEvictionIfContainsLastReplica)) {
if replica.Spec.EvictionRequested && !node.Spec.EvictionRequested && !diskSpec.EvictionRequested {
// We don't consider the node to be auto evicting if eviction was manually requested.
node.Status.AutoEvicting = true
}
}
}

for _, replica := range replicasToSync {
replicaLog := log.WithField("replica", replica.Name)
log.Infof("Updating evictionRequested to %t", replica.Spec.EvictionRequested)
if _, err := nc.ds.UpdateReplica(replica); err != nil {
replicaLog.Warn("Failed to update evictionRequested, will enqueue then resync node")
nc.enqueueNode(node)
continue
for _, replicaToSync := range replicasToSync {
replicaLog := log.WithField("replica", replicaToSync.Name).WithField("disk", replicaToSync.Spec.DiskID)
if replicaToSync.Spec.EvictionRequested {
log.Infof("Requesting replica eviction")
if _, err := nc.ds.UpdateReplica(replicaToSync.Replica); err != nil {
replicaLog.Warn("Failed to request replica eviction, will enqueue then resync node")
nc.enqueueNode(node)
continue
}
nc.eventRecorder.Eventf(replicaToSync, corev1.EventTypeNormal, replicaToSync.syncReason, "Evicting replica %v from node %v and disk %v", replicaToSync.Name, node, replicaToSync.Spec.DiskID)
} else {
log.Infof("Cancelling replica eviction")
if _, err := nc.ds.UpdateReplica(replicaToSync.Replica); err != nil {
replicaLog.Warn("Failed to cancel replica eviction, will enqueue then resync node")
nc.enqueueNode(node)
continue
}
nc.eventRecorder.Eventf(replicaToSync, corev1.EventTypeNormal, replicaToSync.syncReason, "Cancelling replica %v eviction from node %v and disk %v", replicaToSync.Name, node, replicaToSync.Spec.DiskID)
}
}

return nil
}

func (nc *NodeController) shouldEvictReplica(node *longhorn.Node, kubeNode *corev1.Node, diskSpec *longhorn.DiskSpec,
replica *longhorn.Replica, nodeDrainPolicy string) (bool, error) {
// TODO: The replica controller previously cancelled evictions when a node was down or deleted. Is there something
// we need to do here?
// if isDownOrDeleted, err := nc.ds.IsNodeDownOrDeleted(node.Spec.Name); err != nil {
// return false, err
// } else if isDownOrDeleted {
// return false, nil
// }
replica *longhorn.Replica, nodeDrainPolicy string) (bool, string, error) {
// Replica eviction was cancelled on down or deleted nodes in previous implementations. It seems safest to continue
// this behavior unless we find a reason to change it.
if isDownOrDeleted, err := nc.ds.IsNodeDownOrDeleted(node.Spec.Name); err != nil {
return false, "", err
} else if isDownOrDeleted {
return false, longhorn.NodeConditionReasonKubernetesNodeNotReady, nil
}

if node.Spec.EvictionRequested || diskSpec.EvictionRequested {
return true, nil
return true, longhorn.ReplicaEvictionReasonUserRequested, nil
}
if !kubeNode.Spec.Unschedulable {
return false, nil // Node drain policy only takes effect on cordoned nodes.
// Node drain policy only takes effect on cordoned nodes.
return false, longhorn.ReplicaEvictionCancellationReasonNormal, nil
}
if nodeDrainPolicy == string(types.NodeDrainPolicyBlockForEviction) {
return true, nil
return true, longhorn.ReplicaEvictionReasonAutomatic, nil
}
if nodeDrainPolicy != string(types.NodeDrainPolicyBlockForEvictionIfContainsLastReplica) {
return false, nil
return false, longhorn.ReplicaEvictionCancellationReasonNormal, nil
}

pdbProtectedHealthyReplicas, err := nc.ds.ListVolumePDBProtectedHealthyReplicas(replica.Spec.VolumeName)
if err != nil {
return false, err
return false, "", err
}
hasPDBOnAnotherNode := false
for _, pdbProtectedHealthyReplica := range pdbProtectedHealthyReplicas {
Expand All @@ -1524,8 +1541,8 @@ func (nc *NodeController) shouldEvictReplica(node *longhorn.Node, kubeNode *core
}
}
if !hasPDBOnAnotherNode {
return true, nil
return true, longhorn.ReplicaEvictionReasonAutomatic, nil
}

return false, nil
return false, longhorn.ReplicaEvictionCancellationReasonNormal, nil
}
4 changes: 4 additions & 0 deletions k8s/pkg/apis/longhorn/v1beta2/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ const (

ReplicaConditionReasonRebuildFailedDisconnection = "Disconnection"
ReplicaConditionReasonRebuildFailedGeneral = "General"

ReplicaEvictionReasonAutomatic = "ReplicaEvictionAutomatic"
ReplicaEvictionReasonUserRequested = "ReplicaEvictionUserRequested"
ReplicaEvictionCancellationReasonNormal = "ReplicaEvictionNotRequired"
)

// ReplicaSpec defines the desired state of the Longhorn replica
Expand Down

0 comments on commit ec04802

Please sign in to comment.