diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl index d4aa42e..f4e4fd4 100644 --- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl +++ b/tools/pytorchjob-generator/chart/templates/_helpers.tpl @@ -29,7 +29,7 @@ annotations: {{- define "mlbatch.schedulingSpec" }} -{{- if .Values.terminationGracePeriodSeconds }} +{{- if ne .Values.terminationGracePeriodSeconds nil }} terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} {{- end }} {{- if .Values.bypassCoscheduler }} diff --git a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml index 91420fc..6b8543c 100644 --- a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml +++ b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml @@ -67,7 +67,7 @@ metadata: {{- if .Values.retryPausePeriodDuration }} workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "{{ .Values.retryPausePeriodDuration }}" {{- end }} - {{- if .Values.retryLimit }} + {{- if ne .Values.retryLimit nil }} workload.codeflare.dev.appwrapper/retryLimit: "{{ .Values.retryLimit }}" {{- end }} {{- if .Values.forcefulDeletionGracePeriodDuration }} diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml index 976afbf..89ae562 100644 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml @@ -164,6 +164,22 @@ tests: workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "19s" workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "2s" +- it: Setting integer fault tolerance annotation to 0 + set: + retryLimit: 0 + terminationGracePeriodSeconds: 0 + asserts: + - isSubset: + path: metadata.annotations + content: + workload.codeflare.dev.appwrapper/retryLimit: "0" + - equal: + path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.terminationGracePeriodSeconds + value: 0 + - equal: + path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.terminationGracePeriodSeconds + value: 0 + - it: Setting just one tolerance annotation set: deletionOnFailureGracePeriodDuration: "6h"