From 4cdd684a4dacdf0278f01b336e1c47e0265b349e Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 19 Nov 2024 16:36:18 -0500 Subject: [PATCH] when serviceAccountName is gdr, add the IPC_LOCK capability (#100) This is a kludge to sidestep file system permission problems in shared PVs for namespaces where multi-pod jobs run using roce_gdr (thus as root). In these namespaces, using the gdr serviceAccount will now always imply running as root. --- tools/pytorchjob-generator/chart/templates/_helpers.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl index f4e4fd4..c836ddc 100644 --- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl +++ b/tools/pytorchjob-generator/chart/templates/_helpers.tpl @@ -277,7 +277,7 @@ imagePullSecrets: [] {{- define "mlbatch.securityContext" }} -{{- if gt ( int .Values.numRoceGdr ) 0 }} +{{- if or (gt ( int .Values.numRoceGdr ) 0) (eq .Values.serviceAccountName "gdr") }} securityContext: capabilities: add: