diff --git a/api/v1alpha1/workflow_types.go b/api/v1alpha1/workflow_types.go index 2e0a3c80e..7f1ac530a 100644 --- a/api/v1alpha1/workflow_types.go +++ b/api/v1alpha1/workflow_types.go @@ -13,6 +13,8 @@ const ( WorkflowStateTimeout = WorkflowState("STATE_TIMEOUT") WorkflowStateSuccess = WorkflowState("STATE_SUCCESS") WorkflowStatePreparing = WorkflowState("STATE_PREPARING") + StatusSuccess = "success" + StatusFailure = "failure" ) // WorkflowSpec defines the desired state of Workflow. @@ -64,7 +66,12 @@ type WorkflowStatus struct { ToggleHardware *Status `json:"toggleHardware,omitempty"` // OneTimeNetboot indicates whether the controller has successfully netbooted the associated hardware. - OneTimeNetboot *Status `json:"oneTimeNetboot,omitempty"` + OneTimeNetboot OneTimeNetbootStatus `json:"oneTimeNetboot,omitempty"` +} + +type OneTimeNetbootStatus struct { + CreationStatus *Status `json:"creationStatus,omitempty"` + DeletionStatus *Status `json:"deletionStatus,omitempty"` } // Wanted to use metav1.Status but kubebuilder errors with, "must apply listType to an array, found". @@ -92,6 +99,13 @@ type Status struct { // Details *metav1.StatusDetails `json:"details,omitempty" protobuf:"bytes,5,opt,name=details"` } +func (s *Status) IsSuccess() bool { + if s == nil { + return false + } + return s.Status == StatusSuccess +} + // Task represents a series of actions to be completed by a worker. type Task struct { Name string `json:"name"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index edf34e2a7..e2e00303c 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -716,6 +716,31 @@ func (in *OSIE) DeepCopy() *OSIE { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OneTimeNetbootStatus) DeepCopyInto(out *OneTimeNetbootStatus) { + *out = *in + if in.CreationStatus != nil { + in, out := &in.CreationStatus, &out.CreationStatus + *out = new(Status) + **out = **in + } + if in.DeletionStatus != nil { + in, out := &in.DeletionStatus, &out.DeletionStatus + *out = new(Status) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OneTimeNetbootStatus. +func (in *OneTimeNetbootStatus) DeepCopy() *OneTimeNetbootStatus { + if in == nil { + return nil + } + out := new(OneTimeNetbootStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Status) DeepCopyInto(out *Status) { *out = *in @@ -956,11 +981,7 @@ func (in *WorkflowStatus) DeepCopyInto(out *WorkflowStatus) { *out = new(Status) **out = **in } - if in.OneTimeNetboot != nil { - in, out := &in.OneTimeNetboot, &out.OneTimeNetboot - *out = new(Status) - **out = **in - } + in.OneTimeNetboot.DeepCopyInto(&out.OneTimeNetboot) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkflowStatus. diff --git a/config/crd/bases/tinkerbell.org_workflows.yaml b/config/crd/bases/tinkerbell.org_workflows.yaml index cde1ded64..358ce398a 100644 --- a/config/crd/bases/tinkerbell.org_workflows.yaml +++ b/config/crd/bases/tinkerbell.org_workflows.yaml @@ -87,22 +87,46 @@ spec: oneTimeNetboot: description: OneTimeNetboot indicates whether the controller has successfully netbooted the associated hardware. properties: - message: - description: A human-readable description of the status of this operation. - type: string - reason: - description: |- - A machine-readable description of why this operation is in the - "Failure" status. If this value is empty there - is no information available. A Reason clarifies an HTTP status - code but does not override it. - type: string - status: - description: |- - Status of the operation. - One of: "Success" or "Failure". - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: string + creationStatus: + description: Wanted to use metav1.Status but kubebuilder errors with, "must apply listType to an array, found". + properties: + message: + description: A human-readable description of the status of this operation. + type: string + reason: + description: |- + A machine-readable description of why this operation is in the + "Failure" status. If this value is empty there + is no information available. A Reason clarifies an HTTP status + code but does not override it. + type: string + status: + description: |- + Status of the operation. + One of: "Success" or "Failure". + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + type: string + type: object + deletionStatus: + description: Wanted to use metav1.Status but kubebuilder errors with, "must apply listType to an array, found". + properties: + message: + description: A human-readable description of the status of this operation. + type: string + reason: + description: |- + A machine-readable description of why this operation is in the + "Failure" status. If this value is empty there + is no information available. A Reason clarifies an HTTP status + code but does not override it. + type: string + status: + description: |- + Status of the operation. + One of: "Success" or "Failure". + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + type: string + type: object type: object state: description: State is the state of the workflow in Tinkerbell. diff --git a/internal/deprecated/workflow/reconciler.go b/internal/deprecated/workflow/reconciler.go index a266be146..06a7f615d 100644 --- a/internal/deprecated/workflow/reconciler.go +++ b/internal/deprecated/workflow/reconciler.go @@ -10,6 +10,7 @@ import ( "github.com/tinkerbell/tink/api/v1alpha1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "knative.dev/pkg/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -18,6 +19,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) +const ( + bmcJobName = "tink-controller-%s-one-time-netboot" +) + // Reconciler is a type for managing Workflows. type Reconciler struct { client ctrlclient.Client @@ -62,18 +67,21 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco case v1alpha1.WorkflowStateRunning: resp = r.processRunningWorkflow(ctx, wflow) case v1alpha1.WorkflowStatePreparing: - logger.Info("debugging 1") + if wflow.Status.OneTimeNetboot.CreationStatus == nil { + return reconcile.Result{Requeue: true}, nil + } + if !wflow.Status.OneTimeNetboot.CreationStatus.IsSuccess() { + return reconcile.Result{Requeue: true}, nil + } existingJob := &rufio.Job{} - jobName := fmt.Sprintf("tink-controller-%s-one-time-netboot", wflow.Spec.HardwareRef) + jobName := fmt.Sprintf(bmcJobName, wflow.Spec.HardwareRef) if err := r.client.Get(ctx, ctrlclient.ObjectKey{Name: jobName, Namespace: wflow.Namespace}, existingJob); err != nil { return reconcile.Result{}, fmt.Errorf("error getting one time netboot job: %w", err) } - logger.Info("debugging 2", "job conditions", existingJob.Status.Conditions) if existingJob.HasCondition(rufio.JobFailed, rufio.ConditionTrue) { return reconcile.Result{}, fmt.Errorf("one time netboot job failed") } if existingJob.HasCondition(rufio.JobCompleted, rufio.ConditionTrue) { - logger.Info("debugging 3") wflow.Status.State = v1alpha1.WorkflowStatePending } else { return reconcile.Result{Requeue: true}, nil @@ -109,12 +117,12 @@ func (r *Reconciler) handleHardwareAllowPXE(ctx context.Context, stored *v1alpha // before workflow case if stored.Status.ToggleHardware == nil || (stored.Status.ToggleHardware != nil && stored.Status.ToggleHardware.Status == "" && stored.Status.State == "" || stored.Status.State == v1alpha1.WorkflowStatePending) { - status := &v1alpha1.Status{Status: "success", Message: "allowPXE set to true"} + status := &v1alpha1.Status{Status: v1alpha1.StatusSuccess, Message: "allowPXE set to true"} for _, iface := range hardware.Spec.Interfaces { iface.Netboot.AllowPXE = ptr.Bool(true) } if err := r.client.Update(ctx, hardware); err != nil { - status.Status = "failed" + status.Status = v1alpha1.StatusFailure stored.Status.ToggleHardware = status return err } @@ -123,12 +131,12 @@ func (r *Reconciler) handleHardwareAllowPXE(ctx context.Context, stored *v1alpha // after workflow case if stored.Status.State == v1alpha1.WorkflowStateSuccess { - status := &v1alpha1.Status{Status: "success", Message: "allowPXE set to false"} + status := &v1alpha1.Status{Status: v1alpha1.StatusSuccess, Message: "allowPXE set to false"} for _, iface := range hardware.Spec.Interfaces { iface.Netboot.AllowPXE = ptr.Bool(false) } if err := r.client.Update(ctx, hardware); err != nil { - status.Status = "failed" + status.Status = v1alpha1.StatusFailure stored.Status.ToggleHardware = status return err } @@ -205,117 +213,85 @@ func (r *Reconciler) processNewWorkflow(ctx context.Context, logger logr.Logger, } // check if an existing job.bmc.tinkerbell.org object exists, if so delete it. - existingJob := &rufio.Job{} - jobName := fmt.Sprintf("tink-controller-%s-one-time-netboot", hardware.Name) - if err := r.client.Get(ctx, ctrlclient.ObjectKey{Name: jobName, Namespace: hardware.Namespace}, existingJob); err == nil { - logger.Info("debugging in delete", "stored.status", stored.Status, "job conditions", existingJob.Status.Conditions) - if existingJob.HasCondition(rufio.JobCompleted, rufio.ConditionFalse) || len(existingJob.Status.Conditions) != 2 { - logger.Info("one time netboot job is still running", "job", jobName, "stored.status", stored.Status, "conditions", existingJob.Status.Conditions) + if !stored.Status.OneTimeNetboot.DeletionStatus.IsSuccess() && !stored.Status.OneTimeNetboot.CreationStatus.IsSuccess() { + existingJob := &rufio.Job{} + jobName := fmt.Sprintf(bmcJobName, hardware.Name) + if err := r.client.Get(ctx, ctrlclient.ObjectKey{Name: jobName, Namespace: hardware.Namespace}, existingJob); err == nil { + opts := []ctrlclient.DeleteOption{ + ctrlclient.GracePeriodSeconds(0), + ctrlclient.PropagationPolicy(metav1.DeletePropagationForeground), + } + if err := r.client.Delete(ctx, existingJob, opts...); err != nil { + return reconcile.Result{}, fmt.Errorf("error deleting existing job.bmc.tinkerbell.org object for netbooting machine: %w", err) + } + stored.Status.OneTimeNetboot.DeletionStatus = &v1alpha1.Status{Status: v1alpha1.StatusSuccess, Message: "previous existing one time netboot job deleted"} + return reconcile.Result{Requeue: true}, nil + } else { + if apierrors.IsNotFound(err) { + stored.Status.OneTimeNetboot.DeletionStatus = &v1alpha1.Status{Status: v1alpha1.StatusSuccess, Message: "no existing one time netboot job found"} + } else { + return reconcile.Result{Requeue: true}, err + } } - opts := []ctrlclient.DeleteOption{ - ctrlclient.GracePeriodSeconds(0), - ctrlclient.PropagationPolicy(metav1.DeletePropagationForeground), - } - logger.Info("deleting existing one time netboot job", "job", jobName, "conditions", existingJob.Status.Conditions) - if err := r.client.Delete(ctx, existingJob, opts...); err != nil { - return reconcile.Result{}, fmt.Errorf("error deleting existing job.bmc.tinkerbell.org object for netbooting machine: %w", err) - } - - return reconcile.Result{Requeue: true}, nil } - // create a job.bmc.tinkerbell.org object to netboot the hardware - name := fmt.Sprintf("tink-controller-%s-one-time-netboot", hardware.Name) - ns := hardware.Namespace - efiBoot := func() bool { - for _, iface := range hardware.Spec.Interfaces { - if iface.DHCP != nil && iface.DHCP.UEFI { - return true + if !stored.Status.OneTimeNetboot.CreationStatus.IsSuccess() && stored.Status.OneTimeNetboot.DeletionStatus.IsSuccess() { + // create a job.bmc.tinkerbell.org object to netboot the hardware + name := fmt.Sprintf(bmcJobName, hardware.Name) + ns := hardware.Namespace + efiBoot := func() bool { + for _, iface := range hardware.Spec.Interfaces { + if iface.DHCP != nil && iface.DHCP.UEFI { + return true + } } - } - return false - }() - job := &rufio.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: ns, - Annotations: map[string]string{ - "tink-controller-auto-created": "true", - }, - Labels: map[string]string{ - "tink-controller-auto-created": "true", - }, - }, - Spec: rufio.JobSpec{ - MachineRef: rufio.MachineRef{ - Name: hardware.Spec.BMCRef.Name, + return false + }() + job := &rufio.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, Namespace: ns, + Annotations: map[string]string{ + "tink-controller-auto-created": "true", + }, + Labels: map[string]string{ + "tink-controller-auto-created": "true", + }, }, - Tasks: []rufio.Action{ - { - PowerAction: rufio.PowerHardOff.Ptr(), + Spec: rufio.JobSpec{ + MachineRef: rufio.MachineRef{ + Name: hardware.Spec.BMCRef.Name, + Namespace: ns, }, - { - OneTimeBootDeviceAction: &rufio.OneTimeBootDeviceAction{ - Devices: []rufio.BootDevice{ - rufio.PXE, + Tasks: []rufio.Action{ + { + PowerAction: rufio.PowerHardOff.Ptr(), + }, + { + OneTimeBootDeviceAction: &rufio.OneTimeBootDeviceAction{ + Devices: []rufio.BootDevice{ + rufio.PXE, + }, + EFIBoot: efiBoot, }, - EFIBoot: efiBoot, }, - }, - { - PowerAction: rufio.PowerOn.Ptr(), + { + PowerAction: rufio.PowerOn.Ptr(), + }, }, }, - }, - } - logger.Info("creating one time netboot job", "job", job.Name) - if err := r.client.Create(ctx, job); err != nil { - return reconcile.Result{}, fmt.Errorf("error creating job.bmc.tinkerbell.org object for netbooting machine: %w", err) - } - stored.Status.OneTimeNetboot = &v1alpha1.Status{Status: "success", Message: "one time netboot job created"} - // block until the job completes. This is needed as there can be a race condition if the Hardware is already running - // a Tink Worker. - stored.Status.State = v1alpha1.WorkflowStatePreparing - return reconcile.Result{Requeue: true}, nil - /* - logger.Info("debugging 2") - waitJob := &rufio.Job{} - if err := r.client.Get(ctx, ctrlclient.ObjectKey{Name: job.Name, Namespace: job.Namespace}, waitJob); err != nil { - return reconcile.Result{}, fmt.Errorf("error getting one time netboot job: %w", err) } - if waitJob.HasCondition(rufio.JobFailed, rufio.ConditionTrue) { - return reconcile.Result{}, fmt.Errorf("one time netboot job failed") + if err := r.client.Create(ctx, job); err != nil { + return reconcile.Result{}, fmt.Errorf("error creating job.bmc.tinkerbell.org object for netbooting machine: %w", err) } - if waitJob.HasCondition(rufio.JobCompleted, rufio.ConditionTrue) { - stored.Status.State = v1alpha1.WorkflowStatePending - return reconcile.Result{}, nil - } - */ - /* - timeout, cancel := context.WithTimeout(ctx, time.Duration(5*float64(time.Minute))) - defer cancel() - for { - select { - case <-timeout.Done(): - return reconcile.Result{}, fmt.Errorf("timeout waiting for one time netboot job to complete") - default: - time.Sleep(2 * time.Second) - waitJob := &rufio.Job{} - if err := r.client.Get(ctx, ctrlclient.ObjectKey{Name: job.Name, Namespace: job.Namespace}, waitJob); err != nil { - continue - } - if waitJob.HasCondition(rufio.JobCompleted, rufio.ConditionTrue) { - goto DONE - } - if waitJob.HasCondition(rufio.JobFailed, rufio.ConditionTrue) { - return reconcile.Result{}, fmt.Errorf("one time netboot job failed") - } - } - } - DONE: - */ + stored.Status.OneTimeNetboot.CreationStatus = &v1alpha1.Status{Status: v1alpha1.StatusSuccess, Message: "one time netboot job created"} + // block until the job completes. This is needed as there can be a race condition if the Hardware is already running + // a Tink Worker. + stored.Status.State = v1alpha1.WorkflowStatePreparing + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{Requeue: true}, nil } stored.Status.State = v1alpha1.WorkflowStatePending diff --git a/internal/server/kubernetes_api_workflow.go b/internal/server/kubernetes_api_workflow.go index e22ceb15f..f81538591 100644 --- a/internal/server/kubernetes_api_workflow.go +++ b/internal/server/kubernetes_api_workflow.go @@ -80,7 +80,7 @@ func (s *KubernetesBackedServer) GetWorkflowContexts(req *proto.WorkflowContextR if wf.Spec.BootOpts.ToggleHardware && wf.Status.ToggleHardware != nil && wf.Status.ToggleHardware.Status == "" && wf.Status.State == v1alpha1.WorkflowStatePreparing { continue } - if wf.Spec.BootOpts.OneTimeNetboot && wf.Status.ToggleHardware != nil && wf.Status.OneTimeNetboot.Status == "" && wf.Status.State == v1alpha1.WorkflowStatePreparing { + if wf.Spec.BootOpts.OneTimeNetboot && wf.Status.State == v1alpha1.WorkflowStatePreparing { continue } if err := stream.Send(getWorkflowContext(wf)); err != nil {