[release/v1.37] Waiting for volumeAttachments deletion (#1257)
* Waiting for volumeAttachments deletion (#1190)

* Waiting for volumeAttachments deletion

Signed-off-by: Mattia Lavacca <[email protected]>

* volumeAttachments check only for vSphere

Signed-off-by: Mattia Lavacca <[email protected]>

* ClusterRole updated

Signed-off-by: Mattia Lavacca <[email protected]>

* yaml linter fixed

Signed-off-by: Mattia Lavacca <[email protected]>

* VolumeAttachments correctly handled

Signed-off-by: Mattia Lavacca <[email protected]>

* Code factorized

Signed-off-by: Mattia Lavacca <[email protected]>

* renaming

Signed-off-by: Mattia Lavacca <[email protected]>

* fix yamllint

Signed-off-by: Mattia Lavacca <[email protected]>

* Logic applied only to vSphere

Signed-off-by: Mattia Lavacca <[email protected]>

* disable vSphere tests (#1172)

Signed-off-by: Moath Qasim <[email protected]>

* enable vSphere tests (#1180)

* enable vSphere tests

Signed-off-by: Moath Qasim <[email protected]>

# Conflicts:
#	go.sum

* refactor vSphere datastore cluster

Signed-off-by: Moath Qasim <[email protected]>

* refactor vSphere tests

Signed-off-by: Moath Qasim <[email protected]>

* enable vsphere test

Signed-off-by: Moath Qasim <[email protected]>

* debug vsphere datastore test

Signed-off-by: Moath Qasim <[email protected]>

* debug vsphere datastore test

Signed-off-by: Moath Qasim <[email protected]>

Co-authored-by: Mattia Lavacca <[email protected]>
Co-authored-by: Moath Qasim <[email protected]>
3 people authored Apr 21, 2022
1 parent 07e3ca3 commit 763877f
Showing 9 changed files with 463 additions and 130 deletions.
.prow.yaml (2 changes: 1 addition & 1 deletion)
@@ -789,7 +789,7 @@ presubmits:
cpu: 500m

- name: pull-machine-controller-e2e-vsphere-resource-pool
always_run: true
always_run: false
decorate: true
error_on_eviction: true
clone_uri: "ssh://[email protected]/kubermatic/machine-controller.git"
examples/machine-controller.yaml (9 changes: 9 additions & 0 deletions)
@@ -536,6 +536,15 @@ rules:
- "list"
- "get"
- "watch"
# volumeAttachments permissions are needed by vSphere clusters
- apiGroups:
- "storage.k8s.io"
resources:
- "volumeattachments"
verbs:
- "list"
- "get"
- "watch"
- apiGroups:
- ""
resources:
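The rule added above is what allows the machine-controller to observe vSphere volume attachments while a node is being deleted. As a rough illustration of what the list/get/watch verbs permit, the helper below shows how the VolumeAttachments still referencing a node could be listed with a standard client-go clientset; the helper itself is not part of this commit.

package volumecheck // hypothetical helper, not part of this commit

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// AttachmentsForNode returns the names of the VolumeAttachments that still
// reference nodeName; the "list" verb granted above is what makes this call possible.
func AttachmentsForNode(ctx context.Context, client kubernetes.Interface, nodeName string) ([]string, error) {
	attachments, err := client.StorageV1().VolumeAttachments().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	var names []string
	for _, va := range attachments.Items {
		if va.Spec.NodeName == nodeName {
			names = append(names, va.Name)
		}
	}
	return names, nil
}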
pkg/controller/machine/machine_controller.go (141 changes: 94 additions & 47 deletions)
@@ -39,6 +39,7 @@ import (
"github.com/kubermatic/machine-controller/pkg/containerruntime"
kuberneteshelper "github.com/kubermatic/machine-controller/pkg/kubernetes"
"github.com/kubermatic/machine-controller/pkg/node/eviction"
"github.com/kubermatic/machine-controller/pkg/node/poddeletion"
"github.com/kubermatic/machine-controller/pkg/providerconfig"
providerconfigtypes "github.com/kubermatic/machine-controller/pkg/providerconfig/types"
"github.com/kubermatic/machine-controller/pkg/rhsm"
@@ -408,7 +409,7 @@ func (r *Reconciler) reconcile(ctx context.Context, machine *clusterv1alpha1.Mac

// step 2: check if a user requested to delete the machine
if machine.DeletionTimestamp != nil {
return r.deleteMachine(ctx, prov, machine)
return r.deleteMachine(ctx, prov, providerConfig.CloudProvider, machine)
}

// Step 3: Essentially creates an instance for the given machine.
@@ -462,6 +463,30 @@ func (r *Reconciler) ensureMachineHasNodeReadyCondition(machine *clusterv1alpha1
})
}

func (r *Reconciler) shouldCleanupVolumes(ctx context.Context, machine *clusterv1alpha1.Machine, providerName providerconfigtypes.CloudProvider) (bool, error) {
// We only need to wait for volumeAttachments cleanup on vSphere.
if providerName != providerconfigtypes.CloudProviderVsphere {
return false, nil
}

// No node, so there are no volumeAttachments to collect.
if machine.Status.NodeRef == nil {
klog.V(4).Infof("Skipping volumeAttachments cleanup for machine %q since it does not have a node", machine.Name)
return false, nil
}

node := &corev1.Node{}
if err := r.client.Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name}, node); err != nil {
// The node no longer exists, so there are no volumeAttachments to collect.
if kerrors.IsNotFound(err) {
klog.V(4).Infof("Skipping volumeAttachments cleanup for machine %q since its node no longer exists", machine.Name)
return false, nil
}
return false, fmt.Errorf("failed to get node %q: %v", machine.Status.NodeRef.Name, err)
}
return true, nil
}

// shouldEvict checks if the machine has a node that needs to be evicted
func (r *Reconciler) shouldEvict(ctx context.Context, machine *clusterv1alpha1.Machine) (bool, error) {
// If the deletion got triggered a few hours ago, skip eviction.
Expand Down Expand Up @@ -521,22 +546,35 @@ func (r *Reconciler) shouldEvict(ctx context.Context, machine *clusterv1alpha1.M
}

// deleteMachine makes sure that an instance has gone in a series of steps.
func (r *Reconciler) deleteMachine(ctx context.Context, prov cloudprovidertypes.Provider, machine *clusterv1alpha1.Machine) (*reconcile.Result, error) {
func (r *Reconciler) deleteMachine(ctx context.Context, prov cloudprovidertypes.Provider, providerName providerconfigtypes.CloudProvider, machine *clusterv1alpha1.Machine) (*reconcile.Result, error) {
shouldEvict, err := r.shouldEvict(ctx, machine)
if err != nil {
return nil, err
}
shouldCleanUpVolumes, err := r.shouldCleanupVolumes(ctx, machine, providerName)
if err != nil {
return nil, err
}

var evictedSomething, deletedSomething bool
var volumesFree = true
if shouldEvict {
evictedSomething, err := eviction.New(ctx, machine.Status.NodeRef.Name, r.client, r.kubeClient).Run()
evictedSomething, err = eviction.New(ctx, machine.Status.NodeRef.Name, r.client, r.kubeClient).Run()
if err != nil {
return nil, fmt.Errorf("failed to evict node %s: %v", machine.Status.NodeRef.Name, err)
}
if evictedSomething {
return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
}
if shouldCleanUpVolumes {
deletedSomething, volumesFree, err = poddeletion.New(ctx, machine.Status.NodeRef.Name, r.client, r.kubeClient).Run()
if err != nil {
return nil, fmt.Errorf("failed to delete pods bound to volumes running on node %s: %v", machine.Status.NodeRef.Name, err)
}
}

if evictedSomething || deletedSomething || !volumesFree {
return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
}

if result, err := r.deleteCloudProviderInstance(prov, machine); result != nil || err != nil {
return result, err
}
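The volumesFree flag above comes from the new poddeletion package, which is among the files not expanded on this page. The sketch below illustrates the condition being waited on, and is an assumption rather than the commit's actual implementation: the machine is only handed to deleteCloudProviderInstance once no VolumeAttachment references its node anymore, so vSphere can detach disks before the VM disappears; until then the reconcile is requeued. It assumes storagev1 "k8s.io/api/storage/v1" and the controller-runtime client imported as ctrlruntimeclient.

// volumesFreeForNode is an illustrative sketch, not the commit's code: it
// reports whether any VolumeAttachment still references the given node.
func volumesFreeForNode(ctx context.Context, c ctrlruntimeclient.Client, nodeName string) (bool, error) {
	attachments := &storagev1.VolumeAttachmentList{}
	if err := c.List(ctx, attachments); err != nil {
		return false, fmt.Errorf("failed to list volumeAttachments: %v", err)
	}
	for _, va := range attachments.Items {
		if va.Spec.NodeName == nodeName {
			// A volume is still attached to this node, so it is not yet safe
			// to delete the cloud-provider instance.
			return false, nil
		}
	}
	return true, nil
}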
@@ -550,7 +588,52 @@ func (r *Reconciler) deleteMachine(ctx context.Context, prov cloudprovidertypes.
return nil, nil
}

return nil, r.deleteNodeForMachine(ctx, machine)
nodes, err := r.retrieveNodesRelatedToMachine(ctx, machine)
if err != nil {
return nil, err
}

return nil, r.deleteNodeForMachine(ctx, nodes, machine)
}

func (r *Reconciler) retrieveNodesRelatedToMachine(ctx context.Context, machine *clusterv1alpha1.Machine) ([]*corev1.Node, error) {
nodes := make([]*corev1.Node, 0)

// If there's a NodeRef on the Machine object, retrieve the node by using the
// value of the NodeRef. If there's no NodeRef, try to find the Node by
// listing nodes using the NodeOwner label selector.
if machine.Status.NodeRef != nil {
objKey := ctrlruntimeclient.ObjectKey{Name: machine.Status.NodeRef.Name}
node := &corev1.Node{}
if err := r.client.Get(ctx, objKey, node); err != nil {
if !kerrors.IsNotFound(err) {
return nil, fmt.Errorf("failed to get node %s: %v", machine.Status.NodeRef.Name, err)
}
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
} else {
nodes = append(nodes, node)
}
} else {
selector, err := labels.Parse(NodeOwnerLabelName + "=" + string(machine.UID))
if err != nil {
return nil, fmt.Errorf("failed to parse label selector: %v", err)
}
listOpts := &ctrlruntimeclient.ListOptions{LabelSelector: selector}
nodeList := &corev1.NodeList{}
if err := r.client.List(ctx, nodeList, listOpts); err != nil {
return nil, fmt.Errorf("failed to list nodes: %v", err)
}
if len(nodeList.Items) == 0 {
// Just log that no node was found for this machine.
klog.V(3).Infof("No node found for the machine %s", machine.Spec.Name)
}

for i := range nodeList.Items {
// Take the address of the slice element rather than of the range variable,
// so each appended pointer refers to a distinct Node.
nodes = append(nodes, &nodeList.Items[i])
}
}

return nodes, nil
}

func (r *Reconciler) deleteCloudProviderInstance(prov cloudprovidertypes.Provider, machine *clusterv1alpha1.Machine) (*reconcile.Result, error) {
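A side note on the indexed append in retrieveNodesRelatedToMachine above: before Go 1.22 a range loop reuses a single loop variable, so appending &node on every iteration would fill the slice with pointers to the same (last) element. A minimal, self-contained illustration of the pitfall the indexed form avoids; this program is not part of the commit.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	items := []corev1.Node{
		{ObjectMeta: metav1.ObjectMeta{Name: "node-a"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "node-b"}},
	}
	var ptrs []*corev1.Node
	for _, n := range items {
		ptrs = append(ptrs, &n) // &n is the address of the loop variable, not of the slice element
	}
	// Prints "node-b node-b" on Go < 1.22 (and "node-a node-b" on newer
	// toolchains), which is why the controller indexes nodeList.Items instead.
	fmt.Println(ptrs[0].Name, ptrs[1].Name)
}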
@@ -623,50 +706,14 @@ func (r *Reconciler) deleteCloudProviderInstance(prov cloudprovidertypes.Provide
})
}

func (r *Reconciler) deleteNodeForMachine(ctx context.Context, machine *clusterv1alpha1.Machine) error {
// If there's NodeRef on the Machine object, remove the Node by using the
// value of the NodeRef. If there's no NodeRef, try to find the Node by
// listing nodes using the NodeOwner label selector.
if machine.Status.NodeRef != nil {
objKey := ctrlruntimeclient.ObjectKey{Name: machine.Status.NodeRef.Name}
node := &corev1.Node{}
nodeFound := true
if err := r.client.Get(ctx, objKey, node); err != nil {
func (r *Reconciler) deleteNodeForMachine(ctx context.Context, nodes []*corev1.Node, machine *clusterv1alpha1.Machine) error {
// Delete every node related to the machine, then remove the machine's node-deletion finalizer.
for _, node := range nodes {
if err := r.client.Delete(ctx, node); err != nil {
if !kerrors.IsNotFound(err) {
return fmt.Errorf("failed to get node %s: %v", machine.Status.NodeRef.Name, err)
}
nodeFound = false
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
}

if nodeFound {
if err := r.client.Delete(ctx, node); err != nil {
if !kerrors.IsNotFound(err) {
return err
}
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
}
}
} else {
selector, err := labels.Parse(NodeOwnerLabelName + "=" + string(machine.UID))
if err != nil {
return fmt.Errorf("failed to parse label selector: %v", err)
}
listOpts := &ctrlruntimeclient.ListOptions{LabelSelector: selector}
nodes := &corev1.NodeList{}
if err := r.client.List(ctx, nodes, listOpts); err != nil {
return fmt.Errorf("failed to list nodes: %v", err)
}
if len(nodes.Items) == 0 {
// We just want log that we didn't found the node. We don't want to
// return here, as we want to remove finalizers at the end.
klog.V(3).Infof("No node found for the machine %s", machine.Spec.Name)
}

for _, node := range nodes.Items {
if err := r.client.Delete(ctx, &node); err != nil {
return err
}
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
}
}

pkg/controller/machine/machine_test.go (53 changes: 33 additions & 20 deletions)
@@ -474,7 +474,7 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
tests := []struct {
name string
machine *clusterv1alpha1.Machine
nodes []runtime.Object
nodes []*corev1.Node
err error
shouldDeleteNode string
}{
@@ -489,13 +489,17 @@
NodeRef: &corev1.ObjectReference{Name: "node-1"},
},
},
nodes: []runtime.Object{&corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-0",
}}, &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
}},
nodes: []*corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-0",
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
},
},
},
err: nil,
shouldDeleteNode: "node-1",
@@ -510,16 +514,16 @@
},
Status: clusterv1alpha1.MachineStatus{},
},
nodes: []runtime.Object{
&corev1.Node{
nodes: []*corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-0",
Labels: map[string]string{
NodeOwnerLabelName: string(machineUID),
},
},
},
&corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
},
@@ -538,13 +542,13 @@
},
Status: clusterv1alpha1.MachineStatus{},
},
nodes: []runtime.Object{
&corev1.Node{
nodes: []*corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-0",
},
},
&corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
},
@@ -564,10 +568,12 @@
NodeRef: &corev1.ObjectReference{Name: "node-1"},
},
},
nodes: []runtime.Object{&corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-0",
}},
nodes: []*corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-0",
},
},
},
err: nil,
shouldDeleteNode: "",
@@ -579,7 +585,9 @@
ctx := context.Background()

objects := []runtime.Object{test.machine}
objects = append(objects, test.nodes...)
for _, n := range test.nodes {
objects = append(objects, n)
}

client := ctrlruntimefake.NewFakeClient(objects...)

@@ -595,7 +603,12 @@
providerData: providerData,
}

err := reconciler.deleteNodeForMachine(ctx, test.machine)
nodes, err := reconciler.retrieveNodesRelatedToMachine(ctx, test.machine)
if err != nil {
t.Fatalf("failed to retrieve nodes related to machine: %v", err)
}

err = reconciler.deleteNodeForMachine(ctx, nodes, test.machine)
if diff := deep.Equal(err, test.err); diff != nil {
t.Errorf("expected to get %v instead got: %v", test.err, err)
}
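The table above only exercises node deletion. Below is a rough sketch of how the new vSphere-only check could be covered with the same fake-client helpers. shouldCleanupVolumes, the Reconciler fields, and the provider constants are taken from the diff above; the test itself, the use of CloudProviderAWS as a representative non-vSphere provider, and the package-level scheme registration the existing tests rely on are assumptions.

func TestShouldCleanupVolumesOnlyForVsphere(t *testing.T) {
	ctx := context.Background()

	machine := &clusterv1alpha1.Machine{
		ObjectMeta: metav1.ObjectMeta{Name: "machine-1", Namespace: "kube-system"},
		Status: clusterv1alpha1.MachineStatus{
			NodeRef: &corev1.ObjectReference{Name: "node-1"},
		},
	}
	node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-1"}}

	// shouldCleanupVolumes only reads from the controller-runtime client,
	// so a fake client with the node is enough for this sketch.
	reconciler := &Reconciler{client: ctrlruntimefake.NewFakeClient(machine, node)}

	// Non-vSphere providers skip the volumeAttachments wait entirely.
	cleanup, err := reconciler.shouldCleanupVolumes(ctx, machine, providerconfigtypes.CloudProviderAWS)
	if err != nil || cleanup {
		t.Errorf("expected no volume cleanup for a non-vSphere provider, got cleanup=%v err=%v", cleanup, err)
	}

	// A vSphere machine whose node still exists should wait for volumeAttachments.
	cleanup, err = reconciler.shouldCleanupVolumes(ctx, machine, providerconfigtypes.CloudProviderVsphere)
	if err != nil || !cleanup {
		t.Errorf("expected volume cleanup for vSphere, got cleanup=%v err=%v", cleanup, err)
	}
}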