From f8f798911fdfadb8306f738c8d853557a437f2be Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Sun, 22 Dec 2024 15:03:36 +0800 Subject: [PATCH 1/3] feat: Add a new event type FailedToDeleteWorkerPodCollection --- ray-operator/controllers/ray/utils/constant.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index ebb90c994e..cd6310e54d 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -246,10 +246,11 @@ const ( FailedToDeleteHeadPod K8sEventType = "FailedToDeleteHeadPod" // Worker Pod event list - CreatedWorkerPod K8sEventType = "CreatedWorkerPod" - FailedToCreateWorkerPod K8sEventType = "FailedToCreateWorkerPod" - DeletedWorkerPod K8sEventType = "DeletedWorkerPod" - FailedToDeleteWorkerPod K8sEventType = "FailedToDeleteWorkerPod" + CreatedWorkerPod K8sEventType = "CreatedWorkerPod" + FailedToCreateWorkerPod K8sEventType = "FailedToCreateWorkerPod" + DeletedWorkerPod K8sEventType = "DeletedWorkerPod" + FailedToDeleteWorkerPod K8sEventType = "FailedToDeleteWorkerPod" + FailedToDeleteWorkerPodCollection K8sEventType = "FailedToDeleteWorkerPodCollection" // Redis Cleanup Job event list CreatedRedisCleanupJob K8sEventType = "CreatedRedisCleanupJob" From e7d7f94687f5844ddc3dcc15485bf6d680ebaed0 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Mon, 23 Dec 2024 00:27:45 +0800 Subject: [PATCH 2/3] chore: add FailedToDeletePodCollection and change err msg related to r.deleteAllPods --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- ray-operator/controllers/ray/utils/constant.go | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 1bfa402537..1341a4ce9b 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ 
b/ray-operator/controllers/ray/raycluster_controller.go @@ -668,7 +668,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv if suspendStatus == rayv1.RayClusterSuspending || (!statusConditionGateEnabled && instance.Spec.Suspend != nil && *instance.Spec.Suspend) { if _, err := r.deleteAllPods(ctx, common.RayClusterAllPodsAssociationOptions(instance)); err != nil { - r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeletePod), + r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeletePodCollection), "Failed deleting Pods due to suspension for RayCluster %s/%s, %v", instance.Namespace, instance.Name, err) return errstd.Join(utils.ErrFailedDeleteAllPods, err) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index cd6310e54d..3c97ca47a1 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -272,8 +272,9 @@ const ( InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec" // Generic Pod event list - DeletedPod K8sEventType = "DeletedPod" - FailedToDeletePod K8sEventType = "FailedToDeletePod" + DeletedPod K8sEventType = "DeletedPod" + FailedToDeletePod K8sEventType = "FailedToDeletePod" + FailedToDeletePodCollection K8sEventType = "FailedToDeletePodCollection" // Ingress event list CreatedIngress K8sEventType = "CreatedIngress" From debd1e12383583205ea21bf79dd99f980f7a1d68 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Sat, 28 Dec 2024 13:23:55 +0800 Subject: [PATCH 3/3] fix: change FailedToDeleteWorkerPod to FailedToDeleteWorkerPodCollection --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 1341a4ce9b..fcdad71a02 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ 
b/ray-operator/controllers/ray/raycluster_controller.go @@ -779,7 +779,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv // Delete all workers if worker group is suspended and skip reconcile if worker.Suspend != nil && *worker.Suspend { if _, err := r.deleteAllPods(ctx, common.RayClusterGroupPodsAssociationOptions(instance, worker.GroupName)); err != nil { - r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeleteWorkerPod), + r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeleteWorkerPodCollection), "Failed deleting worker Pods for suspended group %s in RayCluster %s/%s, %v", worker.GroupName, instance.Namespace, instance.Name, err) return errstd.Join(utils.ErrFailedDeleteWorkerPod, err) }