From dfb857912e6a4d5db043805666eb4ab82671bed5 Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 26 Nov 2024 17:08:40 -0500 Subject: [PATCH] add initial grace period before triggering a failure due to missing components --- .../appwrapper/appwrapper_controller.go | 25 +++++++++++-------- test/e2e/appwrapper_test.go | 9 +++++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/internal/controller/appwrapper/appwrapper_controller.go b/internal/controller/appwrapper/appwrapper_controller.go index 4f487a2..8fcbff7 100644 --- a/internal/controller/appwrapper/appwrapper_controller.go +++ b/internal/controller/appwrapper/appwrapper_controller.go @@ -268,21 +268,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Detect externally deleted components and transition to Failed with no GracePeriod or retry - detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) if compStatus.deployed != compStatus.expected { - meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ - Type: string(workloadv1beta2.Unhealthy), - Status: metav1.ConditionTrue, - Reason: "MissingComponent", - Message: detailMsg, - }) - r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) - return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + // There may be a lag before created resources become visible in the cache; don't react too quickly. + whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime + graceDuration := r.admissionGraceDuration(ctx, aw) + if time.Now().After(whenDeployed.Add(graceDuration)) { + detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) + meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ + Type: string(workloadv1beta2.Unhealthy), + Status: metav1.ConditionTrue, + Reason: "MissingComponent", + Message: detailMsg, + }) + r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) + return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + } } // If a component's controller has put it into a failed state, we do not need // to allow a grace period. The situation will not self-correct. - detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed) + detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed) if compStatus.failed > 0 { meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ Type: string(workloadv1beta2.Unhealthy), diff --git a/test/e2e/appwrapper_test.go b/test/e2e/appwrapper_test.go index 362f507..6b808a4 100644 --- a/test/e2e/appwrapper_test.go +++ b/test/e2e/appwrapper_test.go @@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() { Expect(aw.Status.Retries).Should(Equal(int32(2))) }) - It("Deleting a Running Component yields a failed AppWrapper", func() { - aw := createAppWrapper(ctx, pytorchjob(2, 500)) + It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() { + aw := toAppWrapper(pytorchjob(2, 500)) + if aw.Annotations == nil { + aw.Annotations = make(map[string]string) + } + aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s" + Expect(getClient(ctx).Create(ctx, aw)).To(Succeed()) appwrappers = append(appwrappers, aw) Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning)) aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})