Skip to content

Commit

Permalink
Sakkara support (#118)
Browse files Browse the repository at this point in the history
* Add support for Sakkara

Co-authored-by: Lixiang "Eric" Luo <lgl@users.noreply.github.com>

* drop bypassCoscheduler; add test for setting schedulerName

---------

Co-authored-by: Lixiang "Eric" Luo <lgl@users.noreply.github.com>
  • Loading branch information
dgrove-oss and Lixiang "Eric" Luo authored Dec 19, 2024
1 parent 20dd9eb commit 7840fa1
Show file tree
Hide file tree
Showing 7 changed files with 205 additions and 9 deletions.
2 changes: 1 addition & 1 deletion tools/pytorchjob-generator/chart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
| initContainers | array | `nil` | List of "(name, image, command[])" specifying init containers to be run before the main job. The 'command' field is a list of commands to run in the container, see the Kubernetes entry on initContainers for reference. |
| autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
| hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
| serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

### Fault Tolerance
Expand Down
16 changes: 13 additions & 3 deletions tools/pytorchjob-generator/chart/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ metadata:
{{- if ne .Values.terminationGracePeriodSeconds nil }}
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
{{- end }}
{{- if .Values.bypassCoscheduler }}
schedulerName: default-scheduler
{{- if .Values.schedulerName }}
schedulerName: {{ .Values.schedulerName }}
{{- end }}
priorityClassName: {{ .Values.priority }}
affinity:
Expand Down Expand Up @@ -81,8 +81,14 @@ envFrom:
- configMapRef:
name: {{ .Values.ncclGdrEnvConfigMap }}
{{- end }}
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }}
env:
{{- if eq .Values.schedulerName "sakkara" }}
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
{{- end }}
{{- if .Values.topologyFileConfigMap }}
- name: NCCL_TOPO_FILE
value: /var/run/nvidia-topologyd/virtualTopology.xml
Expand Down Expand Up @@ -146,6 +152,10 @@ command:
#
# User commands
#
{{- if eq .Values.schedulerName "sakkara" }}
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
{{- end }}
{{- range $command := .Values.setupCommands }}
{{ $command }}
{{- end }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
- emptyDir:
medium: Memory
name: dshm
scheduler can be set:
1: |
apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
annotations:
workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
labels:
kueue.x-k8s.io/queue-name: default-queue
name: my-job
namespace: my-namespace
spec:
components:
- template:
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
name: my-job
spec:
pytorchReplicaSpecs:
Master:
replicas: 1
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
Worker:
replicas: 3
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
7 changes: 7 additions & 0 deletions tools/pytorchjob-generator/chart/tests/helloworld_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ tests:
path: metadata.namespace
value: testing-ns

- it: scheduler can be set
set:
schedulerName: sakkara
asserts:
- matchSnapshot:
path: spec.components[0].template

- it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
set:
sshGitCloneConfig.secretName: my-git-secret
Expand Down
5 changes: 4 additions & 1 deletion tools/pytorchjob-generator/chart/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@
{ "type": "null" },
{ "type": "array" }
]},
"bypassCoscheduler": { "type": "boolean" },
"schedulerName": { "oneOf": [
{ "type": "null" },
{ "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
]},
"serviceAccountName": { "oneOf" : [
{ "type": "null" },
{ "$ref": "#/$defs/rfc1123Label" }
Expand Down
8 changes: 4 additions & 4 deletions tools/pytorchjob-generator/chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,11 @@ hostIgnoreList:
# - a100-large-drlfv-worker-3-with-secondary-nw5qh
# - a100-large-drlfv-worker-3-with-secondary-lb7ch

# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
# to true when explicitly directed to do so by a cluster admin!***
# -- (string) If non-nil, use the specified Kubernetes scheduler.
# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
# @section -- Advanced Options
bypassCoscheduler: false
schedulerName:

# -- (string) Service account to be used for running the Job
# @section -- Advanced Options
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
queueName: default-queue # local queue to submit to (default: default-queue)

schedulerName: sakkara
# If additional constraints are used, specify the configmap here:
#customLabels:
# - key: sakkara.group.name
# value: my-topogrp-0

numPods: 4 # total pod count including master and worker pods (default: 1)
numCpusPerPod: 500m # requested number of cpus per pod (default: 1)
numGpusPerPod: 8 # requested number of gpus per pod (default: 0)
totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi)

priority: default-priority # default-priority (default), low-priority, or high-priority

# container image for the pods (required)
containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126

# setup commands to run in each pod (optional)
setupCommands:
- git clone https://github.com/dbarnett/python-helloworld
- cd python-helloworld

# main program to invoke via torchrun (optional)
mainProgram: helloworld.py

0 comments on commit 7840fa1

Please sign in to comment.