diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index 5a48f17f..6b0feab7 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -30,6 +30,7 @@ import ( appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" platform "github.com/NVIDIA/k8s-nim-operator/internal/controller/platform" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/nimparser" "github.com/NVIDIA/k8s-nim-operator/internal/render" "github.com/NVIDIA/k8s-nim-operator/internal/shared" @@ -83,11 +84,12 @@ const ( // NIMCacheReconciler reconciles a NIMCache object type NIMCacheReconciler struct { client.Client - scheme *runtime.Scheme - log logr.Logger - Platform platform.Platform - updater conditions.Updater - recorder record.EventRecorder + scheme *runtime.Scheme + log logr.Logger + Platform platform.Platform + orchestratorType k8sutil.OrchestratorType + updater conditions.Updater + recorder record.EventRecorder } // Ensure NIMCacheReconciler implements the Reconciler interface @@ -174,6 +176,13 @@ func (r *NIMCacheReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, nil } } + + // Fetch container orchestrator type + _, err = r.GetOrchestratorType() + if err != nil { + return ctrl.Result{}, fmt.Errorf("Unable to get container orchestrator type, %v", err) + } + // Handle nim-cache reconciliation result, err = r.reconcileNIMCache(ctx, nimCache) if err != nil { @@ -221,6 +230,19 @@ func (r *NIMCacheReconciler) GetEventRecorder() record.EventRecorder { return r.recorder } +// GetOrchestratorType returns the container platform type +func (r *NIMCacheReconciler) GetOrchestratorType() (k8sutil.OrchestratorType, error) { + if r.orchestratorType == "" { + orchestratorType, err := k8sutil.GetOrchestratorType(r.GetClient()) + if err != nil { + return k8sutil.Unknown, fmt.Errorf("Unable to get container orchestrator type, %v", err) + } + r.orchestratorType = orchestratorType + r.GetLogger().Info("Container orchestrator is successfully set", "type", orchestratorType) + } + return r.orchestratorType, nil +} + // SetupWithManager sets up the controller with the Manager. func (r *NIMCacheReconciler) SetupWithManager(mgr ctrl.Manager) error { r.recorder = mgr.GetEventRecorderFor("nimcache-controller") @@ -564,7 +586,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach // Create a configmap by extracting the model manifest // Create a temporary pod for parsing model manifest - pod := constructPodSpec(nimCache) + pod := constructPodSpec(nimCache, r.orchestratorType) // Add nimCache as owner for watching on status change if err := controllerutil.SetControllerReference(nimCache, pod, r.GetScheme()); err != nil { return false, err @@ -683,7 +705,7 @@ func (r *NIMCacheReconciler) reconcileJob(ctx context.Context, nimCache *appsv1a // If Job does not exist and caching is not complete, create a new one if err != nil && nimCache.Status.State != appsv1alpha1.NimCacheStatusReady { - job, err := r.constructJob(ctx, nimCache) + job, err := r.constructJob(ctx, nimCache, r.orchestratorType) if err != nil { logger.Error(err, "Failed to construct job") return err @@ -886,15 +908,19 @@ func getManifestConfigName(nimCache *appsv1alpha1.NIMCache) string { } // constructPodSpec constructs a Pod specification -func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod { +func constructPodSpec(nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType) *corev1.Pod { labels := map[string]string{ "app": "k8s-nim-operator", "app.kubernetes.io/name": nimCache.Name, "app.kubernetes.io/managed-by": "k8s-nim-operator", } - annotations := map[string]string{ - "openshift.io/scc": "nonroot", + annotations := make(map[string]string) + + if platformType == k8sutil.OpenShift { + annotations = map[string]string{ + "openshift.io/scc": "nonroot", + } } pod := &corev1.Pod{ @@ -925,9 +951,6 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod { RunAsUser: nimCache.GetUserID(), FSGroup: nimCache.GetGroupID(), RunAsNonRoot: ptr.To[bool](true), - SeccompProfile: &corev1.SeccompProfile{ - Type: corev1.SeccompProfileTypeRuntimeDefault, - }, }, ServiceAccountName: NIMCacheServiceAccount, Tolerations: nimCache.GetTolerations(), @@ -935,6 +958,13 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod { }, } + // SeccompProfile must be set for TKGS + if platformType == k8sutil.TKGS { + pod.Spec.Containers[0].SecurityContext.SeccompProfile = &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + } + } + pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ { Name: nimCache.Spec.Source.NGC.PullSecret, @@ -971,7 +1001,7 @@ func (r *NIMCacheReconciler) getPodLogs(ctx context.Context, pod *corev1.Pod) (s return buf.String(), nil } -func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) { +func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType) (*batchv1.Job, error) { logger := r.GetLogger() pvcName := getPvcName(nimCache, nimCache.Spec.Storage.PVC) labels := map[string]string{ @@ -981,10 +1011,13 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a } annotations := map[string]string{ - "openshift.io/scc": "nonroot", "sidecar.istio.io/inject": "false", } + if platformType == k8sutil.OpenShift { + annotations["openshift.io/scc"] = "nonroot" + } + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: nimCache.Name + "-job", @@ -1001,9 +1034,6 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a RunAsUser: nimCache.GetUserID(), FSGroup: nimCache.GetGroupID(), RunAsNonRoot: ptr.To[bool](true), - SeccompProfile: &corev1.SeccompProfile{ - Type: corev1.SeccompProfileTypeRuntimeDefault, - }, }, Containers: []corev1.Container{}, RestartPolicy: corev1.RestartPolicyNever, @@ -1027,6 +1057,14 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a TTLSecondsAfterFinished: ptr.To[int32](600), // cleanup automatically after job finishes }, } + + // SeccompProfile must be set for TKGS + if platformType == k8sutil.TKGS { + job.Spec.Template.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + } + } + if nimCache.Spec.Source.DataStore != nil { outputPath := "/output" if nimCache.Spec.Storage.HostPath != nil { diff --git a/internal/controller/nimcache_controller_test.go b/internal/controller/nimcache_controller_test.go index dfdf2a84..079b0a33 100644 --- a/internal/controller/nimcache_controller_test.go +++ b/internal/controller/nimcache_controller_test.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/interceptor" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/nimparser" ) @@ -338,7 +339,7 @@ var _ = Describe("NIMCache Controller", func() { Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret"}}, }, } - pod := constructPodSpec(nimCache) + pod := constructPodSpec(nimCache, k8sutil.K8s) Expect(pod.Name).To(Equal(getPodName(nimCache))) Expect(pod.Spec.Containers[0].Image).To(Equal("nvcr.io/nim:test")) Expect(pod.Spec.ImagePullSecrets[0].Name).To(Equal("my-secret")) @@ -359,7 +360,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - pod := constructPodSpec(nimCache) + pod := constructPodSpec(nimCache, k8sutil.K8s) err := cli.Create(context.TODO(), pod) Expect(err).ToNot(HaveOccurred()) @@ -387,7 +388,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := reconciler.constructJob(context.TODO(), nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s) Expect(err).ToNot(HaveOccurred()) Expect(job.Name).To(Equal(getJobName(nimCache))) @@ -418,7 +419,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := reconciler.constructJob(context.TODO(), nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s) Expect(err).ToNot(HaveOccurred()) Expect(job.Name).To(Equal(getJobName(nimCache))) @@ -445,7 +446,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := reconciler.constructJob(context.TODO(), nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s) Expect(err).ToNot(HaveOccurred()) Expect(job.Name).To(Equal(getJobName(nimCache))) @@ -481,7 +482,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := reconciler.constructJob(context.TODO(), nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s) Expect(err).ToNot(HaveOccurred()) err = cli.Create(context.TODO(), job) @@ -544,7 +545,7 @@ var _ = Describe("NIMCache Controller", func() { err := reconciler.Create(context.TODO(), configMap) Expect(err).ToNot(HaveOccurred()) - job, err := reconciler.constructJob(context.TODO(), nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s) Expect(err).ToNot(HaveOccurred()) err = cli.Create(context.TODO(), job) diff --git a/internal/controller/nimservice_controller.go b/internal/controller/nimservice_controller.go index 9f67244d..5a77e711 100644 --- a/internal/controller/nimservice_controller.go +++ b/internal/controller/nimservice_controller.go @@ -18,11 +18,13 @@ package controller import ( "context" + "fmt" "reflect" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" platform "github.com/NVIDIA/k8s-nim-operator/internal/controller/platform" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/render" "github.com/NVIDIA/k8s-nim-operator/internal/shared" "github.com/go-logr/logr" @@ -48,13 +50,14 @@ const NIMServiceFinalizer = "finalizer.nimservice.apps.nvidia.com" // NIMServiceReconciler reconciles a NIMService object type NIMServiceReconciler struct { client.Client - scheme *runtime.Scheme - log logr.Logger - updater conditions.Updater - renderer render.Renderer - Config *rest.Config - Platform platform.Platform - recorder record.EventRecorder + scheme *runtime.Scheme + log logr.Logger + updater conditions.Updater + renderer render.Renderer + Config *rest.Config + Platform platform.Platform + orchestratorType k8sutil.OrchestratorType + recorder record.EventRecorder } // Ensure NIMServiceReconciler implements the Reconciler interface @@ -145,6 +148,12 @@ func (r *NIMServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) } } + // Fetch container orchestrator type + _, err := r.GetOrchestratorType() + if err != nil { + return ctrl.Result{}, fmt.Errorf("Unable to get container orchestrator type, %v", err) + } + // Handle platform-specific reconciliation if result, err := r.Platform.Sync(ctx, r, nimService); err != nil { logger.Error(err, "error reconciling NIMService", "name", nimService.Name) @@ -189,6 +198,19 @@ func (r *NIMServiceReconciler) GetEventRecorder() record.EventRecorder { return r.recorder } +// GetOrchestratorType returns the container platform type +func (r *NIMServiceReconciler) GetOrchestratorType() (k8sutil.OrchestratorType, error) { + if r.orchestratorType == "" { + orchestratorType, err := k8sutil.GetOrchestratorType(r.GetClient()) + if err != nil { + return k8sutil.Unknown, fmt.Errorf("Unable to get container orchestrator type, %v", err) + } + r.orchestratorType = orchestratorType + r.GetLogger().Info("Container orchestrator is successfully set", "type", orchestratorType) + } + return r.orchestratorType, nil +} + // SetupWithManager sets up the controller with the Manager. func (r *NIMServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { r.recorder = mgr.GetEventRecorderFor("nimservice-controller") diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index 201b0cc9..9510b9de 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -22,6 +22,7 @@ import ( appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/render" rendertypes "github.com/NVIDIA/k8s-nim-operator/internal/render/types" "github.com/NVIDIA/k8s-nim-operator/internal/shared" @@ -75,6 +76,11 @@ func (r *NIMServiceReconciler) GetEventRecorder() record.EventRecorder { return r.recorder } +// GetOrchestratorType returns the container platform type +func (r *NIMServiceReconciler) GetOrchestratorType() k8sutil.OrchestratorType { + return r.orchestratorType +} + func (r *NIMServiceReconciler) cleanupNIMService(ctx context.Context, nimService *appsv1alpha1.NIMService) error { // All dependent (owned) objects will be automatically garbage collected. // TODO: Handle any custom cleanup logic for the NIM microservice @@ -173,6 +179,8 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi var modelPVC *appsv1alpha1.PersistentVolumeClaim modelProfile := "" + deploymentParams.OrchestratorType = string(r.GetOrchestratorType()) + // Select PVC for model store if nimService.GetNIMCacheName() != "" { // Fetch PVC for the associated NIMCache instance and mount it diff --git a/internal/controller/platform/standalone/standalone.go b/internal/controller/platform/standalone/standalone.go index 7db9160b..b7148386 100644 --- a/internal/controller/platform/standalone/standalone.go +++ b/internal/controller/platform/standalone/standalone.go @@ -21,6 +21,7 @@ import ( appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/render" "github.com/NVIDIA/k8s-nim-operator/internal/shared" "github.com/go-logr/logr" @@ -54,11 +55,12 @@ type NIMCacheReconciler struct { // NIMServiceReconciler represents the NIMService reconciler instance for standalone mode type NIMServiceReconciler struct { client.Client - scheme *runtime.Scheme - log logr.Logger - updater conditions.Updater - renderer render.Renderer - recorder record.EventRecorder + scheme *runtime.Scheme + log logr.Logger + updater conditions.Updater + renderer render.Renderer + recorder record.EventRecorder + orchestratorType k8sutil.OrchestratorType } // NewNIMCacheReconciler returns NIMCacheReconciler for standalone mode @@ -74,12 +76,15 @@ func NewNIMCacheReconciler(r shared.Reconciler) *NIMCacheReconciler { // NewNIMServiceReconciler returns NIMServiceReconciler for standalone mode func NewNIMServiceReconciler(r shared.Reconciler) *NIMServiceReconciler { + orchestratorType, _ := r.GetOrchestratorType() + return &NIMServiceReconciler{ - Client: r.GetClient(), - scheme: r.GetScheme(), - log: r.GetLogger(), - updater: r.GetUpdater(), - recorder: r.GetEventRecorder(), + Client: r.GetClient(), + scheme: r.GetScheme(), + log: r.GetLogger(), + updater: r.GetUpdater(), + recorder: r.GetEventRecorder(), + orchestratorType: orchestratorType, } } diff --git a/internal/k8sutil/k8sutil.go b/internal/k8sutil/k8sutil.go index ab8cf16d..bb143491 100644 --- a/internal/k8sutil/k8sutil.go +++ b/internal/k8sutil/k8sutil.go @@ -24,35 +24,35 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// PlatformType is the underlying container orchestrator type -type PlatformType string +// OrchestratorType is the underlying container orchestrator type +type OrchestratorType string const ( // TKGS is the VMware Tanzu Kubernetes Grid Service - TKGS PlatformType = "TKGS" + TKGS OrchestratorType = "TKGS" // OpenShift is the RedHat Openshift Container Platform - OpenShift PlatformType = "OpenShift" + OpenShift OrchestratorType = "OpenShift" // GKE is the Google Kubernetes Engine Service - GKE PlatformType = "GKE" + GKE OrchestratorType = "GKE" // EKS is the Amazon Elastic Kubernetes Service - EKS PlatformType = "EKS" + EKS OrchestratorType = "EKS" // AKS is the Azure Kubernetes Service - AKS PlatformType = "AKS" + AKS OrchestratorType = "AKS" // OKE is the Oracle Kubernetes Service - OKE PlatformType = "OKE" + OKE OrchestratorType = "OKE" // Ezmeral is the HPE Ezmeral Data Fabric - Ezmeral PlatformType = "Ezmeral" + Ezmeral OrchestratorType = "Ezmeral" // RKE is the Rancker Kubernetes Engine - RKE PlatformType = "RKE" + RKE OrchestratorType = "RKE" // K8s is the upstream Kubernetes Distribution - K8s PlatformType = "Kubernetes" + K8s OrchestratorType = "Kubernetes" // Unknown distribution type - Unknown PlatformType = "Unknown" + Unknown OrchestratorType = "Unknown" ) -// GetContainerPlatform checks the platform by looking for specific node labels that identify +// GetOrchestratorType checks the container orchestrator by looking for specific node labels that identify // TKGS, OpenShift, or CSP-specific Kubernetes distributions. -func GetContainerPlatform(k8sClient client.Client) (PlatformType, error) { +func GetOrchestratorType(k8sClient client.Client) (OrchestratorType, error) { nodes := &corev1.NodeList{} err := k8sClient.List(context.TODO(), nodes) if err != nil { diff --git a/internal/k8sutil/k8sutil_test.go b/internal/k8sutil/k8sutil_test.go index 4e129167..8395c66b 100644 --- a/internal/k8sutil/k8sutil_test.go +++ b/internal/k8sutil/k8sutil_test.go @@ -28,7 +28,7 @@ func TestDetectPlatform(t *testing.T) { tests := []struct { name string labels map[string]string - expected PlatformType + expected OrchestratorType }{ {"TKGS platform detected", map[string]string{"node.vmware.com/tkg": "true"}, TKGS}, {"OpenShift platform detected", map[string]string{"node.openshift.io/os_id": "rhcos"}, OpenShift}, @@ -53,7 +53,7 @@ func TestDetectPlatform(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(node).Build() // Detect container platform - platform, err := GetContainerPlatform(fakeClient) + platform, err := GetOrchestratorType(fakeClient) if err != nil { t.Fatalf("GetContainerPlatform failed: %v", err) } diff --git a/internal/render/types/types.go b/internal/render/types/types.go index 04e24f89..d764a409 100644 --- a/internal/render/types/types.go +++ b/internal/render/types/types.go @@ -80,6 +80,7 @@ type DeploymentParams struct { UserID *int64 GroupID *int64 RuntimeClassName string + OrchestratorType string } // StatefulSetParams holds the parameters for rendering a StatefulSet template @@ -110,6 +111,7 @@ type StatefulSetParams struct { StartupProbe *corev1.Probe NIMCachePVC string RuntimeClassName string + OrchestratorType string } // ServiceParams holds the parameters for rendering a Service template diff --git a/internal/shared/reconciler.go b/internal/shared/reconciler.go index b921b503..96ff7a76 100644 --- a/internal/shared/reconciler.go +++ b/internal/shared/reconciler.go @@ -18,6 +18,7 @@ package shared import ( "github.com/NVIDIA/k8s-nim-operator/internal/conditions" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/render" "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/runtime" @@ -34,4 +35,5 @@ type Reconciler interface { GetUpdater() conditions.Updater GetRenderer() render.Renderer GetEventRecorder() record.EventRecorder + GetOrchestratorType() (k8sutil.OrchestratorType, error) } diff --git a/manifests/deployment.yaml b/manifests/deployment.yaml index 9f9ddffe..b4c7f4ce 100644 --- a/manifests/deployment.yaml +++ b/manifests/deployment.yaml @@ -114,8 +114,10 @@ spec: {{- end }} {{- end }} securityContext: + {{- if eq .OrchestratorType "TKGS" }} seccompProfile: type: RuntimeDefault + {{- end }} {{- if .UserID }} runAsUser: {{ .UserID }} {{- end }}