Skip to content

Commit

Permalink
fix slice delete issue (#218)
Browse files Browse the repository at this point in the history
* fix timing issue

* fix lint

* retry if cm exists
  • Loading branch information
asm582 authored Nov 5, 2024
1 parent eb16006 commit 64a50f9
Showing 1 changed file with 48 additions and 5 deletions.
53 changes: 48 additions & 5 deletions internal/controller/instaslice_daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ func (r *InstaSliceDaemonsetReconciler) Reconcile(ctx context.Context, req ctrl.
log.Error(err, "error cleaning up ci and gi retrying")
return ctrl.Result{RequeueAfter: requeue2sDelay}, nil
}
log.Info("done deleting ci and gi for ", "pod", allocations.PodName)
}

err = r.deleteConfigMap(ctx, allocations.Resourceidentifier, allocations.Namespace)
Expand Down Expand Up @@ -300,7 +299,6 @@ func (r *InstaSliceDaemonsetReconciler) cleanUpCiAndGi(ctx context.Context, allo
if err != nil {
return fmt.Errorf("unable walk migs %v", err)
}

for _, migdevice := range migInfos {
if migdevice.uuid == allocation.GPUUUID && migdevice.start == allocation.Start {
gi, ret := parent.GetGpuInstanceById(int(migdevice.giInfo.Id))
Expand All @@ -319,19 +317,30 @@ func (r *InstaSliceDaemonsetReconciler) cleanUpCiAndGi(ctx context.Context, allo
}
ret = ci.Destroy()
if ret != nvml.SUCCESS {
log.Error(ret, "failed to destroy Compute Instance", "ComputeInstanceId", migdevice.ciInfo.Id, "PodUUID", allocation.PodUUID)
return fmt.Errorf("unable to destroy ci %v for %v", ret, allocation.PodName)
}

log.Info("successfully destroyed Compute Instance", "ComputeInstanceId", migdevice.ciInfo.Id)
ret = gi.Destroy()
if ret != nvml.SUCCESS {
log.Error(ret, "failed to destroy GPU Instance", "GpuInstanceId", migdevice.giInfo.Id, "PodUUID", allocation.PodUUID)
return fmt.Errorf("unable to destroy gi %v for %v", ret, allocation.PodName)
}
log.Info("deleted ci and gi for", "pod", allocation.PodName)
log.Info("successfully destroyed GPU Instance", "GpuInstanceId", migdevice.giInfo.Id)

log.Info("deleted ci and gi for", "pod", allocation.PodName, logMigInfosSingleLine(migInfos))
return nil

}
}
log.Info("mig walking did not discover any slice for ", "pod", allocation.PodName)
exists, err := r.checkConfigMapExists(ctx, allocation.Resourceidentifier, allocation.Resourceidentifier)
if err != nil {
return err
}
if exists {
log.Error(nil, "mig walking did not discover any slice for ", "pod", "migInfos", allocation.PodName, migInfos, logMigInfosSingleLine(migInfos))
return fmt.Errorf("MIG slice not found for GPUUUID %v and Start %v", allocation.GPUUUID, allocation.Start)
}
return nil
}

Expand Down Expand Up @@ -933,3 +942,37 @@ func (r *InstaSliceDaemonsetReconciler) createSliceAndPopulateMigInfos(ctx conte

return migInfos, nil
}

// logMigInfosSingleLine renders every entry of migInfos as one bracketed
// segment and concatenates the segments into a single string, so the whole
// map can be attached to one log record.
//
// A segment looks like
//
//	[Key: k, UUID: u, GI Info: GpuInstanceId: 1, CI Info: ComputeInstanceId: 0, Start: 0, Size: 4]
//
// followed by a single space. A missing giInfo or ciInfo is printed as "nil".
// A nil map entry is printed as "[Key: k, nil] " rather than being
// dereferenced, so this diagnostic helper cannot panic on a malformed map.
//
// The result is "" for a nil or empty map. Segments appear in map iteration
// order, which Go leaves unspecified, so the order may differ between calls.
func logMigInfosSingleLine(migInfos map[string]*MigDeviceInfo) string {
	var result string
	for key, info := range migInfos {
		if info == nil {
			// Keep the key visible so the bad entry can still be identified.
			result += fmt.Sprintf("[Key: %s, nil] ", key)
			continue
		}

		giInfoStr := "nil"
		if info.giInfo != nil {
			giInfoStr = fmt.Sprintf("GpuInstanceId: %d", info.giInfo.Id)
		}

		ciInfoStr := "nil"
		if info.ciInfo != nil {
			ciInfoStr = fmt.Sprintf("ComputeInstanceId: %d", info.ciInfo.Id)
		}

		result += fmt.Sprintf("[Key: %s, UUID: %s, GI Info: %s, CI Info: %s, Start: %d, Size: %d] ",
			key, info.uuid, giInfoStr, ciInfoStr, info.start, info.size)
	}

	return result
}

// checkConfigMapExists looks up the ConfigMap identified by name and
// namespace through the reconciler's Get method and reports whether it was
// found.
//
// Returns:
//   - (true, nil) when the lookup succeeds;
//   - (false, nil) when the lookup fails with a not-found error;
//   - (false, err) for any other lookup error, with err returned unchanged.
//
// Each outcome is also logged through the logger taken from ctx.
func (r *InstaSliceDaemonsetReconciler) checkConfigMapExists(ctx context.Context, name, namespace string) (bool, error) {
	logger := logr.FromContext(ctx)
	key := types.NamespacedName{Name: name, Namespace: namespace}

	var cm v1.ConfigMap
	switch err := r.Get(ctx, key, &cm); {
	case err == nil:
		logger.Info("ConfigMap exists", "name", name, "namespace", namespace)
		return true, nil
	case errors.IsNotFound(err):
		// Absence is an expected answer here, not a failure.
		logger.Info("ConfigMap not found", "name", name, "namespace", namespace)
		return false, nil
	default:
		logger.Error(err, "Error checking ConfigMap", "name", name, "namespace", namespace)
		return false, err
	}
}

0 comments on commit 64a50f9

Please sign in to comment.