From 956e6821f9260e54da0ace14931a4c81b1f93b9b Mon Sep 17 00:00:00 2001 From: David Grove Date: Thu, 19 Dec 2024 12:58:11 -0500 Subject: [PATCH] pytorchjob-generator: make namespace an optional value (#115) --- tools/pytorchjob-generator/README.md | 3 +- tools/pytorchjob-generator/chart/README.md | 4 +-- .../chart/templates/_helpers.tpl | 28 ++++++++------- .../chart/templates/appwrapper.yaml | 11 +++--- .../__snapshot__/helloworld_test.yaml.snap | 36 ------------------- .../chart/tests/helloworld_test.yaml | 8 +++++ .../chart/values.schema.json | 6 ++-- tools/pytorchjob-generator/chart/values.yaml | 11 +++--- .../examples/helloworld.settings.yaml | 1 - 9 files changed, 40 insertions(+), 68 deletions(-) diff --git a/tools/pytorchjob-generator/README.md b/tools/pytorchjob-generator/README.md index e42cec4..b2d3313 100644 --- a/tools/pytorchjob-generator/README.md +++ b/tools/pytorchjob-generator/README.md @@ -28,7 +28,6 @@ mlbatch/pytorchjob-generator 1.1.5 v1beta2 An AppWrapper generator f Create a `settings.yaml` file with the settings for the PyTorch job, for example: ```yaml -namespace: my-namespace # namespace to deploy to (required) jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) queueName: default-queue # local queue to submit to (default: default-queue) @@ -69,5 +68,5 @@ helm template -f settings.yaml mlbatch/pytorchjob-generator | tee generated.yaml To remove the PyTorch job from the cluster, delete the generated `AppWrapper` object: ```sh -oc delete appwrapper -n my-namespace my-job +oc delete appwrapper my-job ``` diff --git a/tools/pytorchjob-generator/chart/README.md b/tools/pytorchjob-generator/chart/README.md index c84e549..e1a960e 100644 --- a/tools/pytorchjob-generator/chart/README.md +++ b/tools/pytorchjob-generator/chart/README.md @@ -15,10 +15,10 @@ customize the Jobs generated by the tool. | Key | Type | Default | Description | |-----|------|---------|-------------| -| namespace | string | must be provided by user | The Kubernetes namespace in which the Job will run. | | jobName | string | must be provided by user | Name of the Job. Will be the name of the AppWrapper and the PyTorchJob. | +| namespace | string | `nil` | Namespace in which to run the Job. If unspecified, the namespace will be inferred using normal Helm/Kubernetes mechanisms when the Job is submitted. | | queueName | string | `"default-queue"` | Name of the local queue to which the Job will be submitted. | -| priority | string | `"default-priority"` | Type of priority for the job (choose from: "default-priority", "low-priority" or "high-priority"). WARNING: "high-priority" jobs need to be approved (We're watching you...)! | +| priority | string | `"default-priority"` | Type of priority for the job (choose from: "default-priority", "low-priority" or "high-priority"). | | customLabels | array | `nil` | Optional array of custom labels to add to all the resources created by the Job (the PyTorchJob, the PodGroup, and the AppWrapper). | | containerImage | string | must be provided by the user | Image used for creating the Job's containers (needs to have all the applications your job may need) | | imagePullSecrets | array | `nil` | List of image-pull-secrets to be used for pulling containerImages | diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl index 34f751a..f5dc4f2 100644 --- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl +++ b/tools/pytorchjob-generator/chart/templates/_helpers.tpl @@ -10,20 +10,22 @@ {{- define "mlbatch.container.metadata" }} -namespace: {{ .Values.namespace }} -{{- if or .Values.customLabels .Values.autopilotHealthChecks }} -labels: - {{- include "mlbatch.customLabels" . | indent 4 }} - {{- if .Values.autopilotHealthChecks }} - autopilot: "" - {{- range $healthcheck := .Values.autopilotHealthChecks }} - {{ $healthcheck }}: "" - {{- end }} +{{- if or .Values.customLabels .Values.autopilotHealthChecks .Values.multiNicNetworkName }} +metadata: + {{- if or .Values.customLabels .Values.autopilotHealthChecks }} + labels: + {{- include "mlbatch.customLabels" . | indent 8 }} + {{- if .Values.autopilotHealthChecks }} + autopilot: "" + {{- range $healthcheck := .Values.autopilotHealthChecks }} + {{ $healthcheck }}: "" + {{- end }} + {{- end }} + {{- end }} + {{- if .Values.multiNicNetworkName }} + annotations: + k8s.v1.cni.cncf.io/networks: {{ .Values.multiNicNetworkName }} {{- end }} -{{- end }} -{{- if .Values.multiNicNetworkName }} -annotations: - k8s.v1.cni.cncf.io/networks: {{ .Values.multiNicNetworkName }} {{- end }} {{- end -}} diff --git a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml index 6b8543c..46b065c 100644 --- a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml +++ b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml @@ -52,7 +52,9 @@ apiVersion: workload.codeflare.dev/v1beta2 kind: AppWrapper metadata: name: {{ .Values.jobName }} - namespace: {{ required "Please specify a 'namespace' in the user file" .Values.namespace }} + {{- if .Values.namespace }} + namespace: {{ .Values.namespace }} + {{- end }} annotations: workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: "{{ .Chart.Version }}" {{- if .Values.admissionGracePeriodDuration }} @@ -90,7 +92,6 @@ spec: kind: "PyTorchJob" metadata: name: {{ .Values.jobName }} - namespace: {{ .Values.namespace }} {{- if .Values.customLabels }} labels: {{- include "mlbatch.customLabels" . | indent 26 }} @@ -101,8 +102,7 @@ spec: replicas: 1 restartPolicy: {{ .Values.restartPolicy | default "Never" }} template: - metadata: - {{- include "mlbatch.container.metadata" . | indent 38 }} + {{- include "mlbatch.container.metadata" . | indent 34 }} spec: {{- if .Values.serviceAccountName }} serviceAccountName: {{ .Values.serviceAccountName }} @@ -125,8 +125,7 @@ spec: replicas: {{ sub .Values.numPods 1 }} restartPolicy: {{ .Values.restartPolicy | default "Never" }} template: - metadata: - {{- include "mlbatch.container.metadata" . | indent 38 }} + {{- include "mlbatch.container.metadata" . | indent 34 }} spec: {{- if .Values.serviceAccountName }} serviceAccountName: {{ .Values.serviceAccountName }} diff --git a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap index 9121467..babee19 100644 --- a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap +++ b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap @@ -16,15 +16,12 @@ Adding Volume Mounts: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -93,8 +90,6 @@ Adding Volume Mounts: replicas: 3 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -177,15 +172,12 @@ Adding initContainers: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -257,8 +249,6 @@ Adding initContainers: replicas: 3 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -344,15 +334,12 @@ AppWrapper metadata should match snapshot: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -411,8 +398,6 @@ AppWrapper metadata should match snapshot: replicas: 3 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -485,15 +470,12 @@ AppWrapper spec should match snapshot: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -552,8 +534,6 @@ AppWrapper spec should match snapshot: replicas: 3 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -626,15 +606,12 @@ Enabling NVMe: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -708,8 +685,6 @@ Enabling NVMe: replicas: 3 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -797,7 +772,6 @@ Enabling RoCE GDR: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: @@ -807,7 +781,6 @@ Enabling RoCE GDR: metadata: annotations: k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - namespace: my-namespace spec: affinity: nodeAffinity: @@ -883,7 +856,6 @@ Enabling RoCE GDR: metadata: annotations: k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - namespace: my-namespace spec: affinity: nodeAffinity: @@ -970,7 +942,6 @@ Enabling all advanced features at once: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: @@ -980,7 +951,6 @@ Enabling all advanced features at once: metadata: annotations: k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - namespace: my-namespace spec: affinity: nodeAffinity: @@ -1108,7 +1078,6 @@ Enabling all advanced features at once: metadata: annotations: k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - namespace: my-namespace spec: affinity: nodeAffinity: @@ -1247,15 +1216,12 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: kind: PyTorchJob metadata: name: my-job - namespace: my-namespace spec: pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: @@ -1328,8 +1294,6 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: replicas: 3 restartPolicy: Never template: - metadata: - namespace: my-namespace spec: affinity: nodeAffinity: diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml index 89ae562..01b7fb5 100644 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml @@ -78,6 +78,14 @@ tests: - notExists: path: metadata.labels +- it: namespace can be set + set: + namespace: testing-ns + asserts: + - equal: + path: metadata.namespace + value: testing-ns + - it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts set: sshGitCloneConfig.secretName: my-git-secret diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json index d56f91f..241ac06 100644 --- a/tools/pytorchjob-generator/chart/values.schema.json +++ b/tools/pytorchjob-generator/chart/values.schema.json @@ -2,14 +2,16 @@ "$schema": "https://json-schema.org/draft/2020-12/schema#", "type": "object", "required": [ - "namespace", "jobName", "containerImage" ], "additionalProperties": false, "properties": { - "namespace": { "$ref": "#/$defs/rfc1123Label" }, "jobName": { "type": "string" }, + "namespace": { "oneOf": [ + { "type": "null" }, + { "$ref": "#/$defs/rfc1123Label" } + ]}, "queueName": { "oneOf": [ { "type": "null" }, { "$ref": "#/$defs/rfc1123Label" } diff --git a/tools/pytorchjob-generator/chart/values.yaml b/tools/pytorchjob-generator/chart/values.yaml index 0fec5ce..d5aca09 100644 --- a/tools/pytorchjob-generator/chart/values.yaml +++ b/tools/pytorchjob-generator/chart/values.yaml @@ -2,21 +2,20 @@ # Job Metadata #################### -# -- (string) The Kubernetes namespace in which the Job will run. -# @default -- must be provided by user -# @section -- Job Metadata -namespace: - # -- (string) Name of the Job. Will be the name of the AppWrapper and the PyTorchJob. # @default -- must be provided by user # @section -- Job Metadata jobName: +# -- (string) Namespace in which to run the Job. If unspecified, the namespace will be inferred using normal Helm/Kubernetes mechanisms when the Job is submitted. +# @section -- Job Metadata +namespace: + # -- (string) Name of the local queue to which the Job will be submitted. # @section -- Job Metadata queueName: "default-queue" -# -- (string) Type of priority for the job (choose from: "default-priority", "low-priority" or "high-priority"). WARNING: "high-priority" jobs need to be approved (We're watching you...)! +# -- (string) Type of priority for the job (choose from: "default-priority", "low-priority" or "high-priority"). # @section -- Job Metadata priority: "default-priority" diff --git a/tools/pytorchjob-generator/examples/helloworld.settings.yaml b/tools/pytorchjob-generator/examples/helloworld.settings.yaml index 06540af..a027d91 100644 --- a/tools/pytorchjob-generator/examples/helloworld.settings.yaml +++ b/tools/pytorchjob-generator/examples/helloworld.settings.yaml @@ -1,4 +1,3 @@ -namespace: my-namespace # namespace to deploy to (required) jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) queueName: default-queue # local queue to submit to (default: default-queue)