Patch summary (text before the first "diff --git" line is ignored by git apply):
  * pkg/controller/queuejob/queuejob_controller_ex.go: in addQueueJob, fetch the
    AppWrapper again through the clientset and leave the requeue loop only when
    that lookup succeeded and reports the same state as latestAw. A failed lookup
    is logged and the loop keeps going.
  * test/perf-test/perf.sh: comment out the check_mcad_installed_status call and
    count AppWrappers (custom column .status.Succeeded) instead of Jobs.
  * test/perf-test/preempt-exp.yaml: drop "Items: []", raise the resource
    requests, and enable the appwrapper.mcad.ibm.com label on the Job.
NOTE(review): leading whitespace and blank context lines were reconstructed from
a flattened copy of this patch; apply with "git apply --ignore-whitespace" or
regenerate the patch from the working tree. The post-image blob id on the
controller's index line predates the error-handling change and is informational.

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index ae70b837..58038dc7 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -1625,7 +1625,15 @@ func (cc *XController) addQueueJob(obj interface{}) {
 				}
 				if latestAw.Status.State != arbv1.AppWrapperStateActive && latestAw.Status.State != arbv1.AppWrapperStateEnqueued && latestAw.Status.State != arbv1.AppWrapperStateRunningHoldCompletion {
 					klog.V(2).Infof("[Informer-addQJ] Stopping requeue for AW %s/%s with status %s", latestAw.Namespace, latestAw.Name, latestAw.Status.State)
-					break // Exit the loop
+					// Re-read the AppWrapper through the clientset and stop requeuing only
+					// when that freshly fetched copy reports the same state as latestAw.
+					awInEtcd, err := cc.arbclients.WorkloadV1beta1().AppWrappers(latestAw.Namespace).Get(context.Background(), latestAw.Name, metav1.GetOptions{})
+					if err != nil {
+						// The fetched object is not meaningful on error: do not read it, keep requeuing.
+						klog.Warningf("[Informer-addQJ] Unable to confirm status of AW %s/%s: %v", latestAw.Namespace, latestAw.Name, err)
+					} else if awInEtcd.Status.State == latestAw.Status.State {
+						break // Exit the loop
+					}
 				}
 				// Enqueue the latest copy of the AW.
 				if (qj.Status.State != arbv1.AppWrapperStateCompleted && qj.Status.State != arbv1.AppWrapperStateFailed) && hasCompletionStatus {
diff --git a/test/perf-test/perf.sh b/test/perf-test/perf.sh
index 7450d7e2..a9d47801 100755
--- a/test/perf-test/perf.sh
+++ b/test/perf-test/perf.sh
@@ -70,7 +70,7 @@ check_kubectl_login_status
 # Track whether you have the MCAD controller installed
 echo "Checking MCAD Controller installation status"
 echo
-check_mcad_installed_status
+#check_mcad_installed_status
 
 echo
 read -p "How many appwrapper jobs do you want?" jobs
@@ -78,10 +78,10 @@ read -p "How many appwrapper jobs do you want?" jobs
 
 # Start the timer now
 SECONDS=0
-echo "jobs number is $jobs"
+echo "Appwrapper number is $jobs"
 export STARTTIME=`date +"%T"`
 echo " "
-echo "Jobs started at: $STARTTIME" |tee job-$STARTTIME.log
+echo "Appwrappers started at: $STARTTIME" |tee job-$STARTTIME.log
 echo " "
 
 # This fixes the number of jobs to be one less so the for loop gets the right amount
@@ -113,19 +113,19 @@ done
     sed -i "s/defaultaw-schd-spec-with-timeout-$next_num/defaultaw-schd-spec-with-timeout-1/g" ${SCRIPT_DIR}/preempt-exp.yaml
     ;;
 esac
-# Check for all jobs to report complete
-jobstatus=`kubectl get jobs -n default --no-headers --field-selector status.successful=1 |wc -l`
+# Check for all appwrappers to report complete
+jobstatus=`kubectl get appwrappers -o=custom-columns=SUCCESS:.status.Succeeded -n default |grep 1 |wc -l`
 
 while [ $jobstatus -lt $jobs ]
 do
-   echo "Number of completed jobs is: " $jobstatus " and the goal is: " $jobs
+   echo "Number of completed appwrappers is: " $jobstatus " and the goal is: " $jobs
    sleep 10
-   jobstatus=`kubectl get jobs -n default --no-headers --field-selector status.successful=1 |wc -l`
+   jobstatus=`kubectl get appwrappers -o=custom-columns=SUCCESS:.status.Succeeded -n default |grep 1 |wc -l`
 done
 
 echo " "
 export FINISHTIME=`date +"%T"`
-echo "All $jobstatus jobs finished: $FINISHTIME" |tee -a job-$STARTTIME.log
+echo "All $jobstatus appwrappers finished: $FINISHTIME" |tee -a job-$STARTTIME.log
 echo "Total amount of time for $jobs appwrappers is: $SECONDS seconds" |tee -a ${SCRIPT_DIR}/job-$STARTTIME.log
 echo " "
 echo "Test results are stored in this file: ${SCRIPT_DIR}/job-$next_num-$STARTTIME.log"
diff --git a/test/perf-test/preempt-exp.yaml b/test/perf-test/preempt-exp.yaml
index 35063c5d..13d9e003 100644
--- a/test/perf-test/preempt-exp.yaml
+++ b/test/perf-test/preempt-exp.yaml
@@ -11,15 +11,14 @@ spec:
       growthType: "exponential"
   priority: 9
   resources:
-    Items: []
     GenericItems:
     - replicas: 1
       completionstatus: Complete
       custompodresources:
       - replicas: 1
         requests:
-          cpu: 10m
-          memory: 10M
+          cpu: 500m
+          memory: 128M
           nvidia.com/gpu: 0
         limits:
           cpu: 500m
@@ -31,8 +30,8 @@ spec:
         metadata:
           namespace: default
           name: defaultaw-schd-spec-with-timeout-1
-          # labels:
-          #   appwrapper.mcad.ibm.com: defaultaw-schd-spec-with-timeout-1
+          labels:
+            appwrapper.mcad.ibm.com: defaultaw-schd-spec-with-timeout-1
         spec:
           parallelism: 1
           completions: 1
@@ -49,8 +48,8 @@ spec:
                 args: [ "sleep 10" ]
                 resources:
                   requests:
-                    memory: "10Mi"
-                    cpu: "10m"
+                    memory: "128Mi"
+                    cpu: "500m"
                   limits:
                     memory: "128Mi"
                     cpu: "500m"