From 64a50f9fdd91b700e860aadbce83e9eb664a0305 Mon Sep 17 00:00:00 2001
From: Abhishek Malvankar
Date: Tue, 5 Nov 2024 07:32:59 -0500
Subject: [PATCH] fix slice delete issue (#218)

* fix timing issue

* fix lint

* retry if cm exists
---
Review notes (text between the "---" line and the diffstat is ignored by
`git am`):

- cleanUpCiAndGi: when the MIG walk finds no matching slice, the pod's
  ConfigMap is looked up and an error is returned if it still exists, so the
  caller requeues. The lookup now passes allocation.Namespace as the
  namespace; the original passed allocation.Resourceidentifier for both name
  and namespace.
- The "deleted ci and gi for" and "mig walking did not discover any slice for"
  log calls now pass well-formed key/value pairs ("pod", "migInfos"); the
  originals had a dangling value and interleaved keys/values.
- Only existing "+" lines were amended, so hunk line counts and the diffstat
  are unchanged; the post-image blob id on the "index" line predates these
  amendments.

 internal/controller/instaslice_daemonset.go | 53 +++++++++++++++++++--
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/internal/controller/instaslice_daemonset.go b/internal/controller/instaslice_daemonset.go
index 7d7e1ab1..6758ba0f 100644
--- a/internal/controller/instaslice_daemonset.go
+++ b/internal/controller/instaslice_daemonset.go
@@ -133,7 +133,6 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
 					log.Error(err, "error cleaning up ci and gi retrying")
 					return ctrl.Result{RequeueAfter: requeue2sDelay}, nil
 				}
-				log.Info("done deleting ci and gi for ", "pod", allocations.PodName)
 			}
 
 			err = r.deleteConfigMap(ctx, allocations.Resourceidentifier, allocations.Namespace)
@@ -300,7 +299,6 @@ func (r *InstaSliceDaemonsetReconciler) cleanUpCiAndGi(ctx context.Context, allo
 	if err != nil {
 		return fmt.Errorf("unable walk migs %v", err)
 	}
-
 	for _, migdevice := range migInfos {
 		if migdevice.uuid == allocation.GPUUUID && migdevice.start == allocation.Start {
 			gi, ret := parent.GetGpuInstanceById(int(migdevice.giInfo.Id))
@@ -319,19 +317,30 @@ func (r *InstaSliceDaemonsetReconciler) cleanUpCiAndGi(ctx context.Context, allo
 			}
 			ret = ci.Destroy()
 			if ret != nvml.SUCCESS {
+				log.Error(ret, "failed to destroy Compute Instance", "ComputeInstanceId", migdevice.ciInfo.Id, "PodUUID", allocation.PodUUID)
 				return fmt.Errorf("unable to destroy ci %v for %v", ret, allocation.PodName)
 			}
-
+			log.Info("successfully destroyed Compute Instance", "ComputeInstanceId", migdevice.ciInfo.Id)
 			ret = gi.Destroy()
 			if ret != nvml.SUCCESS {
+				log.Error(ret, "failed to destroy GPU Instance", "GpuInstanceId", migdevice.giInfo.Id, "PodUUID", allocation.PodUUID)
 				return fmt.Errorf("unable to destroy gi %v for %v", ret, allocation.PodName)
 			}
-			log.Info("deleted ci and gi for", "pod", allocation.PodName)
+			log.Info("successfully destroyed GPU Instance", "GpuInstanceId", migdevice.giInfo.Id)
+
+			log.Info("deleted ci and gi for", "pod", allocation.PodName, "migInfos", logMigInfosSingleLine(migInfos))
 			return nil
 		}
 	}
 
-	log.Info("mig walking did not discover any slice for ", "pod", allocation.PodName)
+	exists, err := r.checkConfigMapExists(ctx, allocation.Resourceidentifier, allocation.Namespace)
+	if err != nil {
+		return err
+	}
+	if exists {
+		log.Error(nil, "mig walking did not discover any slice for ", "pod", allocation.PodName, "migInfos", logMigInfosSingleLine(migInfos))
+		return fmt.Errorf("MIG slice not found for GPUUUID %v and Start %v", allocation.GPUUUID, allocation.Start)
+	}
 	return nil
 }
 
@@ -933,3 +942,37 @@ func (r *InstaSliceDaemonsetReconciler) createSliceAndPopulateMigInfos(ctx conte
 
 	return migInfos, nil
 }
+
+func logMigInfosSingleLine(migInfos map[string]*MigDeviceInfo) string {
+	var result string
+	for key, info := range migInfos {
+		giInfoStr := "nil"
+		ciInfoStr := "nil"
+		if info.giInfo != nil {
+			giInfoStr = fmt.Sprintf("GpuInstanceId: %d", info.giInfo.Id)
+		}
+		if info.ciInfo != nil {
+			ciInfoStr = fmt.Sprintf("ComputeInstanceId: %d", info.ciInfo.Id)
+		}
+		result += fmt.Sprintf("[Key: %s, UUID: %s, GI Info: %s, CI Info: %s, Start: %d, Size: %d] ",
+			key, info.uuid, giInfoStr, ciInfoStr, info.start, info.size)
+	}
+
+	return result
+}
+
+func (r *InstaSliceDaemonsetReconciler) checkConfigMapExists(ctx context.Context, name, namespace string) (bool, error) {
+	log := logr.FromContext(ctx)
+	configMap := &v1.ConfigMap{}
+	err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, configMap)
+	if err != nil {
+		if errors.IsNotFound(err) {
+			log.Info("ConfigMap not found", "name", name, "namespace", namespace)
+			return false, nil
+		}
+		log.Error(err, "Error checking ConfigMap", "name", name, "namespace", namespace)
+		return false, err
+	}
+	log.Info("ConfigMap exists", "name", name, "namespace", namespace)
+	return true, nil
+}