Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NR-238698: Issue fix - GcpMachinePoolMachine Status not populating correct ProvisioningState and LatestModelApplied fields #20

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cloud/scope/machinepool.go
Original file line number Diff line number Diff line change
Expand Up @@ -689,3 +689,11 @@ func (m *MachinePoolScope) SetReplicas(replicas int32) {
func (m *MachinePoolScope) ConditionSetter() conditions.Setter {
return m.GCPMachinePool
}

// GetInstanceImage picks an image from the machine configuration, or uses a default one.
func (m *MachinePoolScope) GetInstanceImage() (*string, error) {
if m.GCPMachinePool.Spec.Image == nil {
return nil, fmt.Errorf("no image specified in machine pool spec %s\n", m.MachinePool.Name)
}
return m.GCPMachinePool.Spec.Image, nil
}
74 changes: 64 additions & 10 deletions cloud/scope/machinepoolmachine.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"fmt"
"net/url"
"os"
"path"
"strings"
"time"

Expand Down Expand Up @@ -70,9 +69,16 @@ type (
MachinePool *clusterv1exp.MachinePool
GCPMachinePool *infrav1exp.GCPMachinePool
GCPMachinePoolMachine *infrav1exp.GCPMachinePoolMachine
MachinePoolScope *MachinePoolScope
instance *compute.Instance
}
)

// SetReady sets the GCPMachinePoolMachine Ready Status to true.
func (m *MachinePoolMachineScope) SetMIGInstance(instance *compute.Instance) {
m.instance = instance
}

// PatchObject persists the machine pool configuration and status.
func (m *MachinePoolMachineScope) PatchObject(ctx context.Context) error {
return m.PatchHelper.Patch(ctx, m.GCPMachinePoolMachine)
Expand Down Expand Up @@ -121,6 +127,17 @@ func NewMachinePoolMachineScope(params MachinePoolMachineScopeParams) (*MachineP
return nil, errors.New("gcp machine pool machine is required when creating a MachinePoolScope")
}

mpScope, err := NewMachinePoolScope(MachinePoolScopeParams{
Client: params.Client,
MachinePool: params.MachinePool,
GCPMachinePool: params.GCPMachinePool,
ClusterGetter: params.ClusterGetter,
})

if err != nil {
return nil, errors.Wrap(err, "failed to build machine pool scope")
}

helper, err := patch.NewHelper(params.GCPMachinePoolMachine, params.Client)
if err != nil {
return nil, errors.Wrapf(err, "failed to init patch helper for %s %s/%s", params.GCPMachinePoolMachine.GroupVersionKind(), params.GCPMachinePoolMachine.Namespace, params.GCPMachinePoolMachine.Name)
Expand All @@ -139,11 +156,12 @@ func NewMachinePoolMachineScope(params MachinePoolMachineScopeParams) (*MachineP
GCPMachinePoolMachine: params.GCPMachinePoolMachine,
PatchHelper: helper,
CapiMachinePoolPatchHelper: capiMachinePoolPatchHelper,
MachinePoolScope: mpScope,
}, nil
}

// UpdateNodeStatus updates the GCPMachinePoolMachine conditions and ready status. It will also update the node ref and the Kubernetes version.
func (m *MachinePoolMachineScope) UpdateNodeStatus(ctx context.Context) (bool, error) {
func (m *MachinePoolMachineScope) UpdateNodeStatus(ctx context.Context, disk *compute.Disk) (bool, error) {
var node *corev1.Node
nodeRef := m.GCPMachinePoolMachine.Status.NodeRef

Expand Down Expand Up @@ -184,6 +202,17 @@ func (m *MachinePoolMachineScope) UpdateNodeStatus(ctx context.Context) (bool, e
}

m.GCPMachinePoolMachine.Status.Version = node.Status.NodeInfo.KubeletVersion

if m.instance != nil {
m.GCPMachinePoolMachine.Status.InstanceName = m.instance.Name
m.SetStatusProvisioningState()

hasLatestModel, err := m.HasLatestModelApplied(ctx, disk)
if err != nil {
return false, errors.Wrap(err, "failed to determine if the MIG instance has the latest model")
}
m.GCPMachinePoolMachine.Status.LatestModelApplied = hasLatestModel
}
}

return true, nil
Expand Down Expand Up @@ -283,8 +312,35 @@ func (m *MachinePoolMachineScope) ProviderID() string {
return fmt.Sprintf("gce://%s/%s/%s", m.Project(), m.GCPMachinePool.Spec.Zone, m.Name())
}

// SetStatusProvisioningState sets the provisioning state for the GCPMachinePoolMachine from the GCP instance status.
func (m *MachinePoolMachineScope) SetStatusProvisioningState() {
// Ref - https://cloud.google.com/compute/docs/instances/instance-life-cycle
if m.instance != nil {
var state infrav1exp.ProvisioningState
switch m.instance.Status {
case "PROVISIONING":
state = infrav1exp.Provisioning
case "DEPROVISIONING":
state = infrav1exp.Deprovisioning
case "STAGING":
state = infrav1exp.Complete
case "RUNNING", "REPAIRING", "SUSPENDED", "SUSPENDING", "STOPPED":
state = infrav1exp.Succeeded
case "STOPPING", "TERMINATED":
if m.instance.LastStartTimestamp == "" {
state = infrav1exp.Failed
} else {
state = infrav1exp.Succeeded
}
default:
state = infrav1exp.Failed
}
m.GCPMachinePoolMachine.Status.ProvisioningState = state
}
}

// HasLatestModelApplied checks if the latest model is applied to the GCPMachinePoolMachine.
func (m *MachinePoolMachineScope) HasLatestModelApplied(_ context.Context, instance *compute.Disk) (bool, error) {
func (m *MachinePoolMachineScope) HasLatestModelApplied(ctx context.Context, disk *compute.Disk) (bool, error) {
image := ""

if m.GCPMachinePool.Spec.Image == nil {
Expand All @@ -294,22 +350,20 @@ func (m *MachinePoolMachineScope) HasLatestModelApplied(_ context.Context, insta
}
image = cloud.ClusterAPIImagePrefix + strings.ReplaceAll(semver.MajorMinor(version), ".", "-")
} else {
// Ex: projects/test-lemon-peel/global/images/cluster-api-ubuntu-2204-v1-25-16-1709613458
image = *m.GCPMachinePool.Spec.Image
}

// Get the image from the disk URL path to compare with the latest image name
diskImage, err := url.Parse(instance.SourceImage)
// Ex: https://www.googleapis.com/compute/v1/projects/test-lemon-peel/global/images/cluster-api-ubuntu-2204-v1-25-16-1709613458
diskImage, err := url.Parse(disk.SourceImage)
if err != nil {
return false, err
}
instanceImage := path.Base(diskImage.Path)

// Check if the image is the latest
if image == instanceImage {
return true, nil
}

return false, nil
log.FromContext(ctx).Info(fmt.Sprintf("Comparing machine pool %s spec image %s with instance disk image %s\n", m.MachinePool.Name, image, diskImage))
return strings.Contains(diskImage.Path, image), nil
}

// CordonAndDrainNode cordon and drain the node for the GCPMachinePoolMachine.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ func getMachinesWithoutLatestModel(machinesByProviderID map[string]infrav1exp.GC
func getDeletingMachines(machinesByProviderID map[string]infrav1exp.GCPMachinePoolMachine) []infrav1exp.GCPMachinePoolMachine {
var machines []infrav1exp.GCPMachinePoolMachine
for _, v := range machinesByProviderID {
if v.Status.ProvisioningState == infrav1exp.Deleting &&
if v.Status.ProvisioningState == infrav1exp.Deprovisioning &&
// Ensure that the machine has not already been marked for deletion
v.DeletionTimestamp.IsZero() {
machines = append(machines, v)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,33 +62,25 @@ func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) {
return ctrl.Result{}, err
}

// Fetch the instances disk.
// Set the instance in scope
s.scope.SetMIGInstance(instance)

// Fetch the instance disk.
disk, err := s.GetDisk(ctx, s.scope.Project(), s.scope.Zone(), s.scope.Name())
if err != nil {
return ctrl.Result{}, err
}

// Update the GCPMachinePoolMachine status.
s.scope.GCPMachinePoolMachine.Status.InstanceName = instance.Name

// Update Node status with the instance information. If the node is not found, requeue.
if nodeFound, err := s.scope.UpdateNodeStatus(ctx); err != nil {
if nodeFound, err := s.scope.UpdateNodeStatus(ctx, disk); err != nil {
log.Error(err, "Failed to update Node status")
return ctrl.Result{}, err
} else if !nodeFound {
log.Info("Node not found, requeueing")
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}

// Update hasLatestModelApplied status.
latestModel, err := s.scope.HasLatestModelApplied(ctx, disk)
if err != nil {
log.Error(err, "Failed to check if the latest model is applied")
return ctrl.Result{}, err
}

// Update the GCPMachinePoolMachine status.
s.scope.GCPMachinePoolMachine.Status.LatestModelApplied = latestModel
s.scope.SetReady()

return ctrl.Result{}, nil
Expand All @@ -99,15 +91,15 @@ func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) {
log := log.FromContext(ctx)
log.Info("Deleting Instance Group Instances")

if s.scope.GCPMachinePoolMachine.Status.ProvisioningState != v1beta1.Deleting {
if s.scope.GCPMachinePoolMachine.Status.ProvisioningState != v1beta1.Deprovisioning {
log.Info("Deleting instance", "instance", s.scope.Name())
// Cordon and drain the node before deleting the instance.
if err := s.scope.CordonAndDrainNode(ctx); err != nil {
return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, err
}

// Delete the instance group instance
_, err := s.DeleteInstanceGroupInstances(ctx, s.scope.Project(), s.scope.Zone(), s.scope.GCPMachinePool.Name, &compute.InstanceGroupManagersDeleteInstancesRequest{
op, err := s.DeleteInstanceGroupInstances(ctx, s.scope.Project(), s.scope.Zone(), s.scope.GCPMachinePool.Name, &compute.InstanceGroupManagersDeleteInstancesRequest{
Instances: []string{fmt.Sprintf("zones/%s/instances/%s", s.scope.Zone(), s.scope.Name())},
})
if err != nil {
Expand All @@ -116,7 +108,7 @@ func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) {
}

// Update the GCPMachinePoolMachine status.
s.scope.GCPMachinePoolMachine.Status.ProvisioningState = v1beta1.Deleting
s.scope.GCPMachinePoolMachine.Status.ProvisioningState = v1beta1.ProvisioningState(op.Status)

// Wait for the instance to be deleted before proceeding.
return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil
Expand Down
21 changes: 10 additions & 11 deletions exp/api/v1beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,20 @@ type Taint struct {
}

// ProvisioningState describes the provisioning state of an GCP resource.
// Ref - https://cloud.google.com/compute/docs/instances/instance-life-cycle
type ProvisioningState string

const (
// Creating ...
Creating ProvisioningState = "Creating"
// Deleting ...
Deleting ProvisioningState = "Deleting"
// Failed ...
Failed ProvisioningState = "Failed"
// Succeeded ...
// Provisioning - resources are allocated for the VM. The VM is not running yet
Provisioning ProvisioningState = "Provisioning"
// Deprovisioning - instance is halted. Tear down tasks like network deprogramming, releasing quota, IP, disks etc
Deprovisioning ProvisioningState = "Deprovisioning"
// Complete - resources are acquired, and the VM is preparing for first boot.
Complete ProvisioningState = "Complete"
// Succeeded - the VM is running or stopped or suspended
Succeeded ProvisioningState = "Succeeded"
// Updating ...
Updating ProvisioningState = "Updating"
// Deleted represents a deleted resource.
Deleted ProvisioningState = "Deleted"
// Failed - the VM encountered an error and will be deleted
Failed ProvisioningState = "Failed"
)

// Taints is an array of Taints.
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,8 @@ k8s.io/apiserver v0.29.5 h1:223C+JkRnGmudEU00GfpX6quDSrzwwP0DuXOYTyUYb0=
k8s.io/apiserver v0.29.5/go.mod h1:zN9xdatz5g7XwL1Xoz9hD4QQON1GN0c+1kV5e/NHejM=
k8s.io/cli-runtime v0.29.3 h1:r68rephmmytoywkw2MyJ+CxjpasJDQY7AGc3XY2iv1k=
k8s.io/cli-runtime v0.29.3/go.mod h1:aqVUsk86/RhaGJwDhHXH0jcdqBrgdF3bZWk4Z9D4mkM=
k8s.io/cli-runtime v0.29.3 h1:r68rephmmytoywkw2MyJ+CxjpasJDQY7AGc3XY2iv1k=
k8s.io/cli-runtime v0.29.3/go.mod h1:aqVUsk86/RhaGJwDhHXH0jcdqBrgdF3bZWk4Z9D4mkM=
k8s.io/client-go v0.29.5 h1:nlASXmPQy190qTteaVP31g3c/wi2kycznkTP7Sv1zPc=
k8s.io/client-go v0.29.5/go.mod h1:aY5CnqUUvXYccJhm47XHoPcRyX6vouHdIBHaKZGTbK4=
k8s.io/cluster-bootstrap v0.29.5 h1:l///rP7Y2a8czgBcITsa8N/yHen/gjWFUz4JQ/Q5LC4=
Expand Down