Skip to content

Commit

Permalink
Use two fields for replica.spec.evictionRequested (aren't cancelling …
Browse files Browse the repository at this point in the history
…auto evictions)

Signed-off-by: Eric Weber <[email protected]>
  • Loading branch information
ejweber committed Oct 23, 2023
1 parent b993c20 commit 9c6d9f9
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 52 deletions.
48 changes: 27 additions & 21 deletions controller/instance_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -690,8 +690,6 @@ func (imc *InstanceManagerController) deleteInstanceManagerPDB(im *longhorn.Inst
}

func (imc *InstanceManagerController) canDeleteInstanceManagerPDB(im *longhorn.InstanceManager) (bool, error) {
log := getLoggerForInstanceManager(imc.logger, im)

// If there is no engine instance process inside the engine instance manager,
// it means that all volumes are detached.
// We can delete the PodDisruptionBudget for the engine instance manager.
Expand Down Expand Up @@ -734,9 +732,15 @@ func (imc *InstanceManagerController) canDeleteInstanceManagerPDB(im *longhorn.I
return false, err
}

node, err := imc.ds.GetNode(imc.controllerID)
if err != nil {
return false, err
}
if nodeDrainingPolicy == string(types.NodeDrainPolicyBlockForEviction) && len(replicasOnCurrentNode) > 0 {
// We must wait for ALL replicas to be evicted before removing the PDB. The node controller will handle eviction
// requests.
if node.Status.AutoEvicting {
// With this drain policy, we should try to evict all replicas.
imc.autoEvictReplicas(im, replicasOnCurrentNode)
}
return false, nil
}

Expand All @@ -753,42 +757,44 @@ func (imc *InstanceManagerController) canDeleteInstanceManagerPDB(im *longhorn.I

// For each replica in the target replica list, find out whether there is a PDB protected healthy replica of the
// same volume on another schedulable node.
unprotectedReplicas := []*longhorn.Replica{}
unprotectedReplicas := map[string]*longhorn.Replica{}
for _, replica := range targetReplicas {
replicaIsProtected, err := imc.replicaIsProtected(im, replica)
if err != nil {
return false, err
}
if !replicaIsProtected {
unprotectedReplicas = append(unprotectedReplicas, replica)
unprotectedReplicas[replica.Name] = replica
}
}

if len(unprotectedReplicas) == 0 {
return true, nil
}

node, err := imc.ds.GetNode(imc.controllerID)
if err != nil {
return false, err
}
if node.Status.AutoEvicting {
if nodeDrainingPolicy == string(types.NodeDrainPolicyBlockForEvictionIfContainsLastReplica) &&
node.Status.AutoEvicting {
// With this drain policy, we should try to evict unprotected replicas.
for _, replica := range unprotectedReplicas {
if replica.Spec.EvictionRequested == "" {
replicaLog := log.WithField("replica", replica.Name)
replicaLog.Infof("Requesting replica eviction")
replica.Spec.EvictionRequested = longhorn.ReplicaEvictionRequestedAuto
if _, err = imc.ds.UpdateReplica(replica); err != nil {
replicaLog.Errorf("Failed to request replica eviction, will requeue then resync instance manager: %v", err)
}
}
}
imc.autoEvictReplicas(im, unprotectedReplicas)
}

return false, nil
}

func (imc *InstanceManagerController) autoEvictReplicas(im *longhorn.InstanceManager, replicas map[string]*longhorn.Replica) {
log := getLoggerForInstanceManager(imc.logger, im)
for _, replica := range replicas {
if !replica.Spec.EvictionRequestedAuto {
replicaLog := log.WithField("replica", replica.Name)
replicaLog.Infof("Requesting automatic replica eviction")
replica.Spec.EvictionRequestedAuto = true
if _, err := imc.ds.UpdateReplica(replica); err != nil {
replicaLog.Errorf("Failed to request replica automatic eviction, will requeue then resync instance manager: %v", err)
}
}
}
}

func (imc *InstanceManagerController) replicaIsProtected(im *longhorn.InstanceManager,
replica *longhorn.Replica) (bool, error) {
vol, err := imc.ds.GetVolume(replica.Spec.VolumeName)
Expand Down
16 changes: 5 additions & 11 deletions controller/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1487,22 +1487,18 @@ func (nc *NodeController) syncReplicaEvictionRequested(node *longhorn.Node) erro
return err
}
shouldEvictReplica := nc.shouldEvictReplica(node, &diskSpec, replica)
if shouldEvictReplica && replica.Spec.EvictionRequested != longhorn.ReplicaEvictionRequestedManual {
replica.Spec.EvictionRequested = longhorn.ReplicaEvictionRequestedManual
replicasToSync = append(replicasToSync, replica)
}
if !shouldEvictReplica && replica.Spec.EvictionRequested == longhorn.ReplicaEvictionRequestedManual {
replica.Spec.EvictionRequested = ""
if replica.Spec.EvictionRequestedManual != shouldEvictReplica {
replica.Spec.EvictionRequestedManual = shouldEvictReplica
replicasToSync = append(replicasToSync, replica)
}
}
}

for _, replica := range replicasToSync {
log := getLoggerForNode(nc.logger, node).WithField("replica", replica.Name)
log.Infof("Updating evictionRequested to %t", replica.Spec.EvictionRequested)
log.Infof("Updating evictionRequestedManual to %t", replica.Spec.EvictionRequestedManual)
if _, err := nc.ds.UpdateReplica(replica); err != nil {
log.Warn("Failed to update evictionRequested, will enqueue then resync node")
log.Warn("Failed to update evictionRequestedManual, will enqueue then resync node")
nc.enqueueNode(node)
continue
}
Expand All @@ -1519,9 +1515,7 @@ func (nc *NodeController) shouldEvictReplica(node *longhorn.Node, diskSpec *long
cond.Reason == string(longhorn.NodeConditionReasonKubernetesNodeNotReady) {
return false
}

// Check if node has requested eviction or is attempting to auto-evict replicas.
if node.Spec.EvictionRequested || node.Status.AutoEvicting {
if node.Spec.EvictionRequested {
return true
}
return diskSpec.EvictionRequested
Expand Down
2 changes: 1 addition & 1 deletion controller/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (

func hasReplicaEvictionRequested(rs map[string]*longhorn.Replica) bool {
for _, r := range rs {
if r.Spec.EvictionRequested {
if r.Spec.EvictionRequestedManual || r.Spec.EvictionRequestedAuto {
return true
}
}
Expand Down
8 changes: 4 additions & 4 deletions controller/volume_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ func (c *VolumeController) EvictReplicas(v *longhorn.Volume,
hasNewReplica := false
healthyNonEvictingCount := healthyCount
for _, replica := range rs {
if replica.Spec.EvictionRequested &&
if types.IsReplicaEvictionRequested(replica) &&
e.Status.ReplicaModeMap[replica.Name] == longhorn.ReplicaModeRW {
healthyNonEvictingCount--
}
Expand Down Expand Up @@ -1030,15 +1030,15 @@ func (c *VolumeController) cleanupEvictionRequestedReplicas(v *longhorn.Volume,
if !datastore.IsAvailableHealthyReplica(r) {
continue
}
if !r.Spec.EvictionRequested {
if !types.IsReplicaEvictionRequested(r) {
hasNonEvictingHealthyReplica = true
break
}
evictingHealthyReplica = r.Name
}

for _, r := range rs {
if !r.Spec.EvictionRequested {
if !types.IsReplicaEvictionRequested(r) {
continue
}
if !hasNonEvictingHealthyReplica && r.Name == evictingHealthyReplica {
Expand Down Expand Up @@ -2474,7 +2474,7 @@ func (c *VolumeController) getReplenishReplicasCount(v *longhorn.Volume, rs map[
continue
}
// Skip the replica has been requested eviction.
if r.Spec.FailedAt == "" && (!r.Spec.EvictionRequested) && r.Spec.Active {
if r.Spec.FailedAt == "" && !types.IsReplicaEvictionRequested(r) && r.Spec.Active {
usableCount++
}
}
Expand Down
9 changes: 4 additions & 5 deletions k8s/pkg/apis/longhorn/v1beta2/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ const (

ReplicaConditionReasonRebuildFailedDisconnection = "Disconnection"
ReplicaConditionReasonRebuildFailedGeneral = "General"

ReplicaEvictionRequestedManual = ReplicaEvictionRequested("manual")
ReplicaEvictionRequestedAuto = ReplicaEvictionRequested("auto")
)

// ReplicaSpec defines the desired state of the Longhorn replica
Expand Down Expand Up @@ -52,13 +49,15 @@ type ReplicaSpec struct {
// +optional
RebuildRetryCount int `json:"rebuildRetryCount"`
// +optional
EvictionRequested ReplicaEvictionRequested `json:"evictionRequested"`
EvictionRequestedAuto bool `json:"evictionRequestedAuto"`
// +optional
EvictionRequestedManual bool `json:"evictionRequestedManual"`
}

// ReplicaStatus defines the observed state of the Longhorn replica
type ReplicaStatus struct {
InstanceStatus `json:""`
// Deprecated: Replaced by field `spec.evictionRequested`.
// Deprecated: Replaced by field `spec.evictionRequestedAuto` `spec.evictionRequestedManual`.
// +optional
EvictionRequested bool `json:"evictionRequested"`
}
Expand Down
2 changes: 1 addition & 1 deletion manager/volume.go
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ func (m *VolumeManager) DeleteReplica(volumeName, replicaName string) error {
if !datastore.IsAvailableHealthyReplica(r) {
continue
}
if r.Spec.EvictionRequested {
if types.IsReplicaEvictionRequested(r) {
continue
}
healthyReplica = r.Name
Expand Down
12 changes: 8 additions & 4 deletions scheduler/replica_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ func (rcs *ReplicaScheduler) getNodeCandidates(nodesInfo map[string]*longhorn.No
func getNodesWithEvictingReplicas(replicas map[string]*longhorn.Replica, nodeInfo map[string]*longhorn.Node) map[string]*longhorn.Node {
nodesWithEvictingReplicas := map[string]*longhorn.Node{}
for _, r := range replicas {
if r.Spec.EvictionRequested {
if types.IsReplicaEvictionRequested(r) {
if node, ok := nodeInfo[r.Spec.NodeID]; ok {
nodesWithEvictingReplicas[r.Spec.NodeID] = node
}
Expand Down Expand Up @@ -584,7 +584,7 @@ func (rcs *ReplicaScheduler) isFailedReplicaReusable(r *longhorn.Replica, v *lon
if r.Spec.RebuildRetryCount >= FailedReplicaMaxRetryCount {
return false, nil
}
if r.Spec.EvictionRequested {
if types.IsReplicaEvictionRequested(r) {
return false, nil
}
if hardNodeAffinity != "" && r.Spec.NodeID != hardNodeAffinity {
Expand Down Expand Up @@ -668,7 +668,7 @@ func IsPotentiallyReusableReplica(r *longhorn.Replica, hardNodeAffinity string)
if r.Spec.RebuildRetryCount >= FailedReplicaMaxRetryCount {
return false
}
if r.Spec.EvictionRequested {
if types.IsReplicaEvictionRequested(r) {
return false
}
if hardNodeAffinity != "" && r.Spec.NodeID != hardNodeAffinity {
Expand Down Expand Up @@ -802,7 +802,7 @@ func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map
for _, r := range replicas {
if r.Spec.NodeID != "" && r.DeletionTimestamp == nil && r.Spec.FailedAt == "" {
if node, ok := nodeInfo[r.Spec.NodeID]; ok {
if r.Spec.EvictionRequested {
if types.IsReplicaEvictionRequested(r) {
if _, ok := usedNodes[r.Spec.NodeID]; !ok {
// This is an evicting replica on a thus far unused node. We won't change this again unless we
// find a non-evicting replica on this node.
Expand All @@ -828,3 +828,7 @@ func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map

return usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones
}

func isEvictionRequsted(r *longhorn.Replica) {

}
10 changes: 5 additions & 5 deletions scheduler/replica_scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
}
alreadyScheduledReplica := newReplicaForVolume(tc.volume)
alreadyScheduledReplica.Spec.NodeID = TestNode1
alreadyScheduledReplica.Spec.EvictionRequested = true
alreadyScheduledReplica.Spec.EvictionRequestedManual = true
tc.allReplicas[alreadyScheduledReplica.Name] = alreadyScheduledReplica
node1.Status.DiskStatus = map[string]*longhorn.DiskStatus{
getDiskID(TestNode1, "1"): {
Expand Down Expand Up @@ -1025,7 +1025,7 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
}
alreadyScheduledReplica = newReplicaForVolume(tc.volume)
alreadyScheduledReplica.Spec.NodeID = TestNode1
alreadyScheduledReplica.Spec.EvictionRequested = true
alreadyScheduledReplica.Spec.EvictionRequestedManual = true
tc.allReplicas[alreadyScheduledReplica.Name] = alreadyScheduledReplica
node1.Status.DiskStatus = map[string]*longhorn.DiskStatus{
getDiskID(TestNode1, "1"): {
Expand All @@ -1048,7 +1048,7 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
}
alreadyScheduledReplica = newReplicaForVolume(tc.volume)
alreadyScheduledReplica.Spec.NodeID = TestNode2
alreadyScheduledReplica.Spec.EvictionRequested = false
alreadyScheduledReplica.Spec.EvictionRequestedManual = false
tc.allReplicas[alreadyScheduledReplica.Name] = alreadyScheduledReplica
node2.Status.DiskStatus = map[string]*longhorn.DiskStatus{
getDiskID(TestNode2, "1"): {
Expand All @@ -1071,7 +1071,7 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
}
alreadyScheduledReplica = newReplicaForVolume(tc.volume)
alreadyScheduledReplica.Spec.NodeID = TestNode3
alreadyScheduledReplica.Spec.EvictionRequested = false
alreadyScheduledReplica.Spec.EvictionRequestedManual = false
tc.allReplicas[alreadyScheduledReplica.Name] = alreadyScheduledReplica
node3.Status.DiskStatus = map[string]*longhorn.DiskStatus{
getDiskID(TestNode3, "1"): {
Expand Down Expand Up @@ -1363,7 +1363,7 @@ func (s *TestSuite) TestGetCurrentNodesAndZones(c *C) {
InstanceSpec: longhorn.InstanceSpec{
NodeID: nodeName,
},
EvictionRequested: evictionRequested,
EvictionRequestedManual: evictionRequested,
},
}
}
Expand Down
4 changes: 4 additions & 0 deletions types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1066,3 +1066,7 @@ func GetBackupTargetSchemeFromURL(backupTargetURL string) string {
return ValueUnknown
}
}

func IsReplicaEvictionRequested(replica *longhorn.Replica) bool {
return replica.Spec.EvictionRequestedAuto || replica.Spec.EvictionRequestedManual
}

0 comments on commit 9c6d9f9

Please sign in to comment.