From 09735b433c9780d080af1ed4e0343f92c40a28b9 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Tue, 10 Oct 2023 11:26:50 -0400 Subject: [PATCH 1/4] fix for some AWs that do not complete --- pkg/controller/queuejob/queuejob_controller_ex.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index ae70b837..58038dc7 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -1625,7 +1625,10 @@ func (cc *XController) addQueueJob(obj interface{}) { } if latestAw.Status.State != arbv1.AppWrapperStateActive && latestAw.Status.State != arbv1.AppWrapperStateEnqueued && latestAw.Status.State != arbv1.AppWrapperStateRunningHoldCompletion { klog.V(2).Infof("[Informer-addQJ] Stopping requeue for AW %s/%s with status %s", latestAw.Namespace, latestAw.Name, latestAw.Status.State) - break // Exit the loop + AwinEtcd, err := cc.arbclients.WorkloadV1beta1().AppWrappers(latestAw.Namespace).Get(context.Background(), latestAw.Name, metav1.GetOptions{}) + if err == nil && AwinEtcd.Status.State == latestAw.Status.State { + break // Exit the loop only once the API server copy agrees with the cached state + } } // Enqueue the latest copy of the AW. 
if (qj.Status.State != arbv1.AppWrapperStateCompleted && qj.Status.State != arbv1.AppWrapperStateFailed) && hasCompletionStatus { From 897e1bad7943facdaa1f597967b337f428fcbdae Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Tue, 10 Oct 2023 11:34:45 -0400 Subject: [PATCH 2/4] fix AW to make resources consistent --- test/perf-test/preempt-exp.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/perf-test/preempt-exp.yaml b/test/perf-test/preempt-exp.yaml index 35063c5d..13d9e003 100644 --- a/test/perf-test/preempt-exp.yaml +++ b/test/perf-test/preempt-exp.yaml @@ -11,15 +11,14 @@ spec: growthType: "exponential" priority: 9 resources: - Items: [] GenericItems: - replicas: 1 completionstatus: Complete custompodresources: - replicas: 1 requests: - cpu: 10m - memory: 10M + cpu: 500m + memory: 128M nvidia.com/gpu: 0 limits: cpu: 500m @@ -31,8 +30,8 @@ spec: metadata: namespace: default name: defaultaw-schd-spec-with-timeout-1 - # labels: - # appwrapper.mcad.ibm.com: defaultaw-schd-spec-with-timeout-1 + labels: + appwrapper.mcad.ibm.com: defaultaw-schd-spec-with-timeout-1 spec: parallelism: 1 completions: 1 @@ -49,8 +48,8 @@ spec: args: [ "sleep 10" ] resources: requests: - memory: "10Mi" - cpu: "10m" + memory: "128Mi" + cpu: "500m" limits: memory: "128Mi" cpu: "500m" From 2fc46ba8cb866b4ca997562af12e17016f650db8 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Tue, 10 Oct 2023 11:35:32 -0400 Subject: [PATCH 3/4] check AW status instead of jobs --- test/perf-test/perf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/perf-test/perf.sh b/test/perf-test/perf.sh index 7450d7e2..73a8c73e 100755 --- a/test/perf-test/perf.sh +++ b/test/perf-test/perf.sh @@ -70,7 +70,7 @@ check_kubectl_login_status # Track whether you have the MCAD controller installed echo "Checking MCAD Controller installation status" echo -check_mcad_installed_status +#check_mcad_installed_status echo read -p "How many appwrapper 
jobs do you want?" jobs @@ -114,7 +114,7 @@ done esac # Check for all jobs to report complete -jobstatus=`kubectl get jobs -n default --no-headers --field-selector status.successful=1 |wc -l` +jobstatus=`kubectl get appwrappers -n default --no-headers --field-selector status.successful=1 |wc -l` while [ $jobstatus -lt $jobs ] do From e66a7eb79d08aed16609d941226b0fae04c7035b Mon Sep 17 00:00:00 2001 From: James Busche Date: Tue, 10 Oct 2023 17:22:49 -0700 Subject: [PATCH 4/4] changing jobstatus from jobs to appwrappers Signed-off-by: James Busche --- test/perf-test/perf.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/perf-test/perf.sh b/test/perf-test/perf.sh index 73a8c73e..a9d47801 100755 --- a/test/perf-test/perf.sh +++ b/test/perf-test/perf.sh @@ -78,10 +78,10 @@ read -p "How many appwrapper jobs do you want?" jobs # Start the timer now SECONDS=0 -echo "jobs number is $jobs" +echo "Appwrapper number is $jobs" export STARTTIME=`date +"%T"` echo " " -echo "Jobs started at: $STARTTIME" |tee job-$STARTTIME.log +echo "Appwrappers started at: $STARTTIME" |tee job-$STARTTIME.log echo " " # This fixes the number of jobs to be one less so the for loop gets the right amount @@ -113,19 +113,19 @@ done sed -i "s/defaultaw-schd-spec-with-timeout-$next_num/defaultaw-schd-spec-with-timeout-1/g" ${SCRIPT_DIR}/preempt-exp.yaml ;; esac -# Check for all jobs to report complete -jobstatus=`kubectl get appwrappers -n default --no-headers --field-selector status.successful=1 |wc -l` +# Check for all appwrappers to report complete +jobstatus=`kubectl get appwrappers -o=custom-columns=SUCCESS:.status.Succeeded -n default |grep 1 |wc -l` while [ $jobstatus -lt $jobs ] do - echo "Number of completed jobs is: " $jobstatus " and the goal is: " $jobs + echo "Number of completed appwrappers is: " $jobstatus " and the goal is: " $jobs sleep 10 - jobstatus=`kubectl get jobs -n default --no-headers --field-selector status.successful=1 |wc -l` + 
jobstatus=`kubectl get appwrappers -o=custom-columns=SUCCESS:.status.Succeeded -n default |grep 1 |wc -l` done echo " " export FINISHTIME=`date +"%T"` -echo "All $jobstatus jobs finished: $FINISHTIME" |tee -a job-$STARTTIME.log +echo "All $jobstatus appwrappers finished: $FINISHTIME" |tee -a job-$STARTTIME.log echo "Total amount of time for $jobs appwrappers is: $SECONDS seconds" |tee -a ${SCRIPT_DIR}/job-$STARTTIME.log echo " " echo "Test results are stored in this file: ${SCRIPT_DIR}/job-$next_num-$STARTTIME.log"