From e9db365666179dd65b82de18a117464093517a1d Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 17 Sep 2024 11:20:03 -0400 Subject: [PATCH] Enforce that numRoceGdr must be 0 unless numPods > 1 (#63) --- tools/pytorchjob-generator/chart/README.md | 2 +- tools/pytorchjob-generator/chart/values.schema.json | 13 ++++++++++++- tools/pytorchjob-generator/chart/values.yaml | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/pytorchjob-generator/chart/README.md b/tools/pytorchjob-generator/chart/README.md index ee63844..cb78bcd 100644 --- a/tools/pytorchjob-generator/chart/README.md +++ b/tools/pytorchjob-generator/chart/README.md @@ -50,7 +50,7 @@ customize the Jobs generated by the tool. | Key | Type | Default | Description | |-----|------|---------|-------------| | roceGdrResName | string | nvidia.com/roce_gdr | RoCE GDR resource name (can vary by cluster configuration) | -| numRoceGdr | integer | `0` | number of nvidia.com/roce_grd resources (0 means disabled; >0 means enable GDR over RoCE). | +| numRoceGdr | integer | `0` | number of nvidia.com/roce_grd resources (0 means disabled; >0 means enable GDR over RoCE). Must be 0 unless numPods > 1. | | topologyFileConfigMap | string | `nil` | Name of configmap containining /var/run/nvidia-topologyd/virtualTopology.xml for the system e.g. nvidia-topo-gdr | | ncclGdrEnvConfigMap | string | `nil` | Name of configmap containing NCCL networking environment variables for the system e.g. nccl-netwk-env-vars | | multiNicNetworkName | string | `nil` | Name of multi-NIC network, if one is available. Note: when GDR over RoCE is used/available, the RoCE multi-nic network instance should be specified here instead of the TCP multi-nic network instance. Existing instance names can be listed with `oc get multinicnetwork`. 
| diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json index f92dc35..6da4018 100644 --- a/tools/pytorchjob-generator/chart/values.schema.json +++ b/tools/pytorchjob-generator/chart/values.schema.json @@ -79,7 +79,7 @@ { "type": "null" }, { "type": "string" } ]}, - "numRoceGdr": { "type": "integer" }, + "numRoceGdr": { "type": "integer", "minimum": 0, "maximum": 2 }, "topologyFileConfigMap": { "oneOf": [ { "type": "null" }, { "$ref": "#/$defs/rfc1123Label" } @@ -134,6 +134,17 @@ "deletionOnFailureGracePeriodDuration" : { "$ref": "#/$defs/duration" } }, + "if": { + "properties": { + "numPods": { "const": 1 } + } + }, + "then": { + "properties": { + "numRoceGdr": { "const": 0 } + } + }, + "$defs": { "rfc1123Label": { "type": "string", diff --git a/tools/pytorchjob-generator/chart/values.yaml b/tools/pytorchjob-generator/chart/values.yaml index 22e1694..f10dc3e 100644 --- a/tools/pytorchjob-generator/chart/values.yaml +++ b/tools/pytorchjob-generator/chart/values.yaml @@ -157,7 +157,7 @@ volumes: # @section -- Advanced Options roceGdrResName: # -# -- (integer) number of nvidia.com/roce_grd resources (0 means disabled; >0 means enable GDR over RoCE). +# -- (integer) number of nvidia.com/roce_grd resources (0 means disabled; >0 means enable GDR over RoCE). Must be 0 unless numPods > 1. # @section -- Advanced Options numRoceGdr: 0