Skip to content

Commit

Permalink
Support setting shared memory for training job. (#1104)
Browse files Browse the repository at this point in the history
Signed-off-by: Syulin7 <[email protected]>
  • Loading branch information
Syulin7 authored Jul 11, 2024
1 parent a3a348c commit 5e8b6dd
Show file tree
Hide file tree
Showing 12 changed files with 88 additions and 17 deletions.
6 changes: 3 additions & 3 deletions Dockerfile.install
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ COPY . .

RUN make

RUN wget https://get.helm.sh/helm-v2.14.1-linux-amd64.tar.gz && \
tar -xvf helm-v2.14.1-linux-amd64.tar.gz && \
RUN wget https://get.helm.sh/helm-v3.13.3-linux-amd64.tar.gz && \
tar -xvf helm-v3.13.3-linux-amd64.tar.gz && \
mv linux-amd64/helm /usr/local/bin/helm && \
chmod u+x /usr/local/bin/helm

ENV K8S_VERSION v1.13.6
ENV K8S_VERSION v1.28.4
RUN curl -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl


Expand Down
6 changes: 3 additions & 3 deletions Dockerfile.notebook.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ COPY . .

RUN make

RUN wget https://get.helm.sh/helm-v2.14.1-linux-amd64.tar.gz && \
tar -xvf helm-v2.14.1-linux-amd64.tar.gz && \
RUN wget https://get.helm.sh/helm-v3.13.3-linux-amd64.tar.gz && \
tar -xvf helm-v3.13.3-linux-amd64.tar.gz && \
mv linux-amd64/helm /usr/local/bin/helm && \
chmod u+x /usr/local/bin/helm

ENV K8S_VERSION v1.13.6
ENV K8S_VERSION v1.28.4
RUN curl -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl

FROM $BASE_IMAGE
Expand Down
6 changes: 3 additions & 3 deletions Dockerfile.notebook.kubeflow
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ COPY . .

RUN make

RUN wget https://get.helm.sh/helm-v2.14.1-linux-amd64.tar.gz && \
tar -xvf helm-v2.14.1-linux-amd64.tar.gz && \
RUN wget https://get.helm.sh/helm-v3.13.3-linux-amd64.tar.gz && \
tar -xvf helm-v3.13.3-linux-amd64.tar.gz && \
mv linux-amd64/helm /usr/local/bin/helm && \
chmod u+x /usr/local/bin/helm

ENV K8S_VERSION v1.13.6
ENV K8S_VERSION v1.28.4
RUN curl -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl

FROM $BASE_IMAGE
Expand Down
12 changes: 6 additions & 6 deletions charts/pytorchjob/templates/pytorchjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,11 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shmSize }}
{{- if .Values.shareMemory }}
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.shmSize }}
sizeLimit: {{ .Values.shareMemory }}
{{- end }}
{{- if .Values.syncMode }}
initContainers:
Expand Down Expand Up @@ -324,7 +324,7 @@ spec:
mountPath: "{{ $destPath }}"
{{- end }}
{{- end }}
{{- if .Values.shmSize }}
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
name: dshm
{{- end }}
Expand Down Expand Up @@ -452,11 +452,11 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shmSize }}
{{- if .Values.shareMemory }}
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.shmSize }}
sizeLimit: {{ .Values.shareMemory }}
{{- end }}
{{- if .Values.syncMode }}
initContainers:
Expand Down Expand Up @@ -616,7 +616,7 @@ spec:
mountPath: "{{ $destPath }}"
{{- end }}
{{- end }}
{{- if .Values.shmSize }}
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
name: dshm
{{- end }}
Expand Down
1 change: 0 additions & 1 deletion charts/pytorchjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ rsyncImage: registry.cn-zhangjiakou.aliyuncs.com/acs/rsync:v3.1.0-aliyun
# git sync image
gitImage: registry.cn-zhangjiakou.aliyuncs.com/acs/git-sync:v3.3.5

shmSize: 2Gi
privileged: false

useTensorboard: false
Expand Down
41 changes: 40 additions & 1 deletion charts/tfjob/templates/tfjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,12 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.shareMemory }}
{{- end }}
{{- if .Values.syncMode }}
initContainers:
- name: init-code
Expand Down Expand Up @@ -357,6 +363,10 @@ spec:
mountPath: "{{ $destPath }}"
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
name: dshm
{{- end }}
{{- if $dataDirs }}
{{- range $dataDirs }}
- mountPath: {{ .containerPath }}
Expand Down Expand Up @@ -530,6 +540,12 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.shareMemory }}
{{- end }}
{{- if .Values.syncMode }}
initContainers:
- name: init-code
Expand Down Expand Up @@ -709,7 +725,10 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}

{{- if .Values.shareMemory }}
- mountPath: /dev/shm
name: dshm
{{- end }}
{{- end }}
{{- if .Values.chief }}
{{ .Values.chiefName | indent 4}}:
Expand Down Expand Up @@ -873,6 +892,12 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.shareMemory }}
{{- end }}
{{- if .Values.syncMode }}
initContainers:
- name: init-code
Expand Down Expand Up @@ -1050,6 +1075,10 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
name: dshm
{{- end }}
{{- end }}
{{- if .Values.evaluator }}
Evaluator:
Expand Down Expand Up @@ -1175,6 +1204,12 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.shareMemory }}
{{- end }}
{{- if .Values.syncMode }}
initContainers:
- name: init-code
Expand Down Expand Up @@ -1330,4 +1365,8 @@ spec:
name: {{ .name }}
{{- end }}
{{- end }}
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
name: dshm
{{- end }}
{{- end }}
7 changes: 7 additions & 0 deletions pkg/apis/training/pytorchjob_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,13 @@ func (b *PytorchJobBuilder) TTLSecondsAfterFinished(ttl int32) *PytorchJobBuilde
return b
}

// ShareMemory records the shared memory size to request for the job.
// An empty shm leaves any previously configured value untouched.
// It returns the builder so that calls can be chained.
func (b *PytorchJobBuilder) ShareMemory(shm string) *PytorchJobBuilder {
	// Nothing to record: keep whatever value the args already hold.
	if shm == "" {
		return b
	}
	b.args.ShareMemory = shm
	return b
}

// Build is used to build the job
func (b *PytorchJobBuilder) Build() (*Job, error) {
for key, value := range b.argValues {
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/training/tfjob_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,13 @@ func (b *TFJobBuilder) TTLSecondsAfterFinished(ttl int32) *TFJobBuilder {
return b
}

// ShareMemory records the shared memory size to request for the job.
// An empty shm leaves any previously configured value untouched.
// It returns the builder so that calls can be chained.
func (b *TFJobBuilder) ShareMemory(shm string) *TFJobBuilder {
	// Nothing to record: keep whatever value the args already hold.
	if shm == "" {
		return b
	}
	b.args.ShareMemory = shm
	return b
}

func (b *TFJobBuilder) Build() (*Job, error) {
for key, value := range b.argValues {
b.AddArgValue(key, value)
Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/types/submit_pytorchjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,7 @@ type SubmitPyTorchJobArgs struct {

// TrainingOperatorCRD compatible with training-operator crd.
TrainingOperatorCRD bool `yaml:"trainingOperatorCRD,omitempty"`

// ShareMemory specifies the size of the shared memory volume mounted at /dev/shm, as a Kubernetes quantity (e.g. 2Gi).
ShareMemory string `yaml:"shareMemory"`
}
2 changes: 2 additions & 0 deletions pkg/apis/types/submit_tfjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ type SubmitTFJobArgs struct {
StartingDeadlineSeconds int64 `yaml:"startingDeadlineSeconds,omitempty"`
// Defines the TTL for cleaning up finished TFJobs. Defaults to infinite.
TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"`
// ShareMemory specifies the size of the shared memory volume mounted at /dev/shm, as a Kubernetes quantity (e.g. 2Gi).
ShareMemory string `yaml:"shareMemory"`
// for common args
CommonSubmitArgs `yaml:",inline"`

Expand Down
7 changes: 7 additions & 0 deletions pkg/argsbuilder/submit_pytorchjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func (s *SubmitPytorchJobArgsBuilder) AddCommandFlags(command *cobra.Command) {
command.Flags().StringVar(&s.args.Memory, "memory", "", "the memory resource to use for the training, like 1Gi.")
command.Flags().DurationVar(&runningTimeout, "running-timeout", runningTimeout, "Specifies the duration since startTime during which the job can remain active before it is terminated(e.g. '5s', '1m', '2h22m').")
command.Flags().DurationVar(&ttlAfterFinished, "ttl-after-finished", ttlAfterFinished, "Defines the TTL for cleaning up finished PytorchJobs(e.g. '5s', '1m', '2h22m'). Defaults to infinite.")
command.Flags().StringVar(&s.args.ShareMemory, "share-memory", "2Gi", "the shared memory of each replica to run the job, default 2Gi.")

s.AddArgValue("running-timeout", &runningTimeout).
AddArgValue("ttl-after-finished", &ttlAfterFinished)
Expand Down Expand Up @@ -163,6 +164,12 @@ func (s *SubmitPytorchJobArgsBuilder) check() error {
if s.args.TTLSecondsAfterFinished < 0 {
return fmt.Errorf("--ttl-after-finished is invalid")
}
if s.args.ShareMemory != "" {
_, err := resource.ParseQuantity(s.args.ShareMemory)
if err != nil {
return fmt.Errorf("--share-memory is invalid")
}
}
return nil
}

Expand Down
7 changes: 7 additions & 0 deletions pkg/argsbuilder/submit_tfjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ func (s *SubmitTFJobArgsBuilder) AddCommandFlags(command *cobra.Command) {
command.Flags().StringArrayVar(&evaluatorSelectors, "evaluator-selector", []string{}, `assigning jobs with "Evaluator" role to some k8s particular nodes(this option would cover --selector), usage: "--evaluator-selector=key=value"`)
command.Flags().StringArrayVar(&psSelectors, "ps-selector", []string{}, `assigning jobs with "PS" role to some k8s particular nodes(this option would cover --selector), usage: "--ps-selector=key=value"`)
command.Flags().StringVar(&roleSequence, "role-sequence", "", `specify the tfjob role sequence,like: "Worker,PS,Chief,Evaluator" or "w,p,c,e"`)
command.Flags().StringVar(&s.args.ShareMemory, "share-memory", "2Gi", "the shared memory of each replica to run the job, default 2Gi.")

s.AddArgValue("worker-selector", &workerSelectors).
AddArgValue("chief-selector", &chiefSelectors).
Expand Down Expand Up @@ -342,6 +343,12 @@ func (s *SubmitTFJobArgsBuilder) check() error {
if s.args.TTLSecondsAfterFinished < 0 {
return fmt.Errorf("--ttl-after-finished is invalid")
}
if s.args.ShareMemory != "" {
_, err := resource.ParseQuantity(s.args.ShareMemory)
if err != nil {
return fmt.Errorf("--share-memory is invalid")
}
}
return nil
}

Expand Down

0 comments on commit 5e8b6dd

Please sign in to comment.