Skip to content

Commit

Permalink
Add orchestrator type specific spec for pods created by the operator (N…
Browse files Browse the repository at this point in the history
…VIDIA#201)

* Add orchestrator type specific spec for pods created by the operator

For example, seccompProfile is required on TKGS, but it is not supported on OCP with the nonroot SCC.

Signed-off-by: Shiva Krishna, Merla <[email protected]>

---------

Signed-off-by: Shiva Krishna, Merla <[email protected]>
Co-authored-by: Vishesh Tanksale <[email protected]>
Signed-off-by: Vishesh Tanksale <[email protected]>
  • Loading branch information
shivamerla and visheshtanksale committed Nov 6, 2024
1 parent df1234b commit 58f1d4a
Show file tree
Hide file tree
Showing 10 changed files with 138 additions and 58 deletions.
74 changes: 56 additions & 18 deletions internal/controller/nimcache_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/conditions"
platform "github.com/NVIDIA/k8s-nim-operator/internal/controller/platform"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/nimparser"
"github.com/NVIDIA/k8s-nim-operator/internal/render"
"github.com/NVIDIA/k8s-nim-operator/internal/shared"
Expand Down Expand Up @@ -83,11 +84,12 @@ const (
// NIMCacheReconciler reconciles a NIMCache object
type NIMCacheReconciler struct {
client.Client
scheme *runtime.Scheme
log logr.Logger
Platform platform.Platform
updater conditions.Updater
recorder record.EventRecorder
scheme *runtime.Scheme
log logr.Logger
Platform platform.Platform
// orchestratorType caches the detected container orchestrator. It stays
// empty until GetOrchestratorType succeeds for the first time.
orchestratorType k8sutil.OrchestratorType
updater conditions.Updater
recorder record.EventRecorder
}

// Ensure NIMCacheReconciler implements the Reconciler interface
Expand Down Expand Up @@ -174,6 +176,13 @@ func (r *NIMCacheReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
return ctrl.Result{}, nil
}
}

// Fetch container orchestrator type
_, err = r.GetOrchestratorType()
if err != nil {
return ctrl.Result{}, fmt.Errorf("Unable to get container orchestrator type, %v", err)
}

// Handle nim-cache reconciliation
result, err = r.reconcileNIMCache(ctx, nimCache)
if err != nil {
Expand Down Expand Up @@ -221,6 +230,19 @@ func (r *NIMCacheReconciler) GetEventRecorder() record.EventRecorder {
return r.recorder
}

// GetOrchestratorType returns the container platform type.
//
// The type is detected through k8sutil.GetOrchestratorType on the first call
// and cached on the reconciler; later calls return the cached value without
// calling it again. When detection fails nothing is cached, so the next call
// retries; k8sutil.Unknown is returned together with an error that wraps the
// underlying cause.
func (r *NIMCacheReconciler) GetOrchestratorType() (k8sutil.OrchestratorType, error) {
	// Fast path: an earlier call already stored a non-empty type.
	if r.orchestratorType != "" {
		return r.orchestratorType, nil
	}

	orchestratorType, err := k8sutil.GetOrchestratorType(r.GetClient())
	if err != nil {
		// %w (rather than %v) keeps the cause reachable via errors.Is / errors.As.
		return k8sutil.Unknown, fmt.Errorf("Unable to get container orchestrator type, %w", err)
	}

	r.orchestratorType = orchestratorType
	r.GetLogger().Info("Container orchestrator is successfully set", "type", orchestratorType)
	return r.orchestratorType, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *NIMCacheReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.recorder = mgr.GetEventRecorderFor("nimcache-controller")
Expand Down Expand Up @@ -564,7 +586,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach

// Create a configmap by extracting the model manifest
// Create a temporary pod for parsing model manifest
pod := constructPodSpec(nimCache)
pod := constructPodSpec(nimCache, r.orchestratorType)
// Add nimCache as owner for watching on status change
if err := controllerutil.SetControllerReference(nimCache, pod, r.GetScheme()); err != nil {
return false, err
Expand Down Expand Up @@ -683,7 +705,7 @@ func (r *NIMCacheReconciler) reconcileJob(ctx context.Context, nimCache *appsv1a

// If Job does not exist and caching is not complete, create a new one
if err != nil && nimCache.Status.State != appsv1alpha1.NimCacheStatusReady {
job, err := r.constructJob(ctx, nimCache)
job, err := r.constructJob(ctx, nimCache, r.orchestratorType)
if err != nil {
logger.Error(err, "Failed to construct job")
return err
Expand Down Expand Up @@ -886,15 +908,19 @@ func getManifestConfigName(nimCache *appsv1alpha1.NIMCache) string {
}

// constructPodSpec constructs a Pod specification
func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod {
func constructPodSpec(nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType) *corev1.Pod {
labels := map[string]string{
"app": "k8s-nim-operator",
"app.kubernetes.io/name": nimCache.Name,
"app.kubernetes.io/managed-by": "k8s-nim-operator",
}

annotations := map[string]string{
"openshift.io/scc": "nonroot",
annotations := make(map[string]string)

if platformType == k8sutil.OpenShift {
annotations = map[string]string{
"openshift.io/scc": "nonroot",
}
}

pod := &corev1.Pod{
Expand Down Expand Up @@ -925,16 +951,20 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod {
RunAsUser: nimCache.GetUserID(),
FSGroup: nimCache.GetGroupID(),
RunAsNonRoot: ptr.To[bool](true),
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeRuntimeDefault,
},
},
ServiceAccountName: NIMCacheServiceAccount,
Tolerations: nimCache.GetTolerations(),
NodeSelector: nimCache.GetNodeSelectors(),
},
}

// SeccompProfile must be set for TKGS
if platformType == k8sutil.TKGS {
pod.Spec.Containers[0].SecurityContext.SeccompProfile = &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeRuntimeDefault,
}
}

pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{
{
Name: nimCache.Spec.Source.NGC.PullSecret,
Expand Down Expand Up @@ -971,7 +1001,7 @@ func (r *NIMCacheReconciler) getPodLogs(ctx context.Context, pod *corev1.Pod) (s
return buf.String(), nil
}

func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) {
func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType) (*batchv1.Job, error) {
logger := r.GetLogger()
pvcName := getPvcName(nimCache, nimCache.Spec.Storage.PVC)
labels := map[string]string{
Expand All @@ -981,10 +1011,13 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
}

annotations := map[string]string{
"openshift.io/scc": "nonroot",
"sidecar.istio.io/inject": "false",
}

if platformType == k8sutil.OpenShift {
annotations["openshift.io/scc"] = "nonroot"
}

job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: nimCache.Name + "-job",
Expand All @@ -1001,9 +1034,6 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
RunAsUser: nimCache.GetUserID(),
FSGroup: nimCache.GetGroupID(),
RunAsNonRoot: ptr.To[bool](true),
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeRuntimeDefault,
},
},
Containers: []corev1.Container{},
RestartPolicy: corev1.RestartPolicyNever,
Expand All @@ -1027,6 +1057,14 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
TTLSecondsAfterFinished: ptr.To[int32](600), // cleanup automatically after job finishes
},
}

// SeccompProfile must be set for TKGS
if platformType == k8sutil.TKGS {
job.Spec.Template.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeRuntimeDefault,
}
}

if nimCache.Spec.Source.DataStore != nil {
outputPath := "/output"
if nimCache.Spec.Storage.HostPath != nil {
Expand Down
15 changes: 8 additions & 7 deletions internal/controller/nimcache_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"

appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/nimparser"
)

Expand Down Expand Up @@ -338,7 +339,7 @@ var _ = Describe("NIMCache Controller", func() {
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret"}},
},
}
pod := constructPodSpec(nimCache)
pod := constructPodSpec(nimCache, k8sutil.K8s)
Expect(pod.Name).To(Equal(getPodName(nimCache)))
Expect(pod.Spec.Containers[0].Image).To(Equal("nvcr.io/nim:test"))
Expect(pod.Spec.ImagePullSecrets[0].Name).To(Equal("my-secret"))
Expand All @@ -359,7 +360,7 @@ var _ = Describe("NIMCache Controller", func() {
},
}

pod := constructPodSpec(nimCache)
pod := constructPodSpec(nimCache, k8sutil.K8s)

err := cli.Create(context.TODO(), pod)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -387,7 +388,7 @@ var _ = Describe("NIMCache Controller", func() {
},
}

job, err := reconciler.constructJob(context.TODO(), nimCache)
job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s)
Expect(err).ToNot(HaveOccurred())

Expect(job.Name).To(Equal(getJobName(nimCache)))
Expand Down Expand Up @@ -418,7 +419,7 @@ var _ = Describe("NIMCache Controller", func() {
},
}

job, err := reconciler.constructJob(context.TODO(), nimCache)
job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s)
Expect(err).ToNot(HaveOccurred())

Expect(job.Name).To(Equal(getJobName(nimCache)))
Expand All @@ -445,7 +446,7 @@ var _ = Describe("NIMCache Controller", func() {
},
}

job, err := reconciler.constructJob(context.TODO(), nimCache)
job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s)
Expect(err).ToNot(HaveOccurred())

Expect(job.Name).To(Equal(getJobName(nimCache)))
Expand Down Expand Up @@ -481,7 +482,7 @@ var _ = Describe("NIMCache Controller", func() {
},
}

job, err := reconciler.constructJob(context.TODO(), nimCache)
job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s)
Expect(err).ToNot(HaveOccurred())

err = cli.Create(context.TODO(), job)
Expand Down Expand Up @@ -544,7 +545,7 @@ var _ = Describe("NIMCache Controller", func() {
err := reconciler.Create(context.TODO(), configMap)
Expect(err).ToNot(HaveOccurred())

job, err := reconciler.constructJob(context.TODO(), nimCache)
job, err := reconciler.constructJob(context.TODO(), nimCache, k8sutil.K8s)
Expect(err).ToNot(HaveOccurred())

err = cli.Create(context.TODO(), job)
Expand Down
36 changes: 29 additions & 7 deletions internal/controller/nimservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ package controller

import (
"context"
"fmt"
"reflect"

appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/conditions"
platform "github.com/NVIDIA/k8s-nim-operator/internal/controller/platform"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/render"
"github.com/NVIDIA/k8s-nim-operator/internal/shared"
"github.com/go-logr/logr"
Expand All @@ -48,13 +50,14 @@ const NIMServiceFinalizer = "finalizer.nimservice.apps.nvidia.com"
// NIMServiceReconciler reconciles a NIMService object
type NIMServiceReconciler struct {
client.Client
scheme *runtime.Scheme
log logr.Logger
updater conditions.Updater
renderer render.Renderer
Config *rest.Config
Platform platform.Platform
recorder record.EventRecorder
scheme *runtime.Scheme
log logr.Logger
updater conditions.Updater
renderer render.Renderer
Config *rest.Config
Platform platform.Platform
// orchestratorType caches the detected container orchestrator. It stays
// empty until GetOrchestratorType succeeds for the first time.
orchestratorType k8sutil.OrchestratorType
recorder record.EventRecorder
}

// Ensure NIMServiceReconciler implements the Reconciler interface
Expand Down Expand Up @@ -145,6 +148,12 @@ func (r *NIMServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
}

// Fetch container orchestrator type
_, err := r.GetOrchestratorType()
if err != nil {
return ctrl.Result{}, fmt.Errorf("Unable to get container orchestrator type, %v", err)
}

// Handle platform-specific reconciliation
if result, err := r.Platform.Sync(ctx, r, nimService); err != nil {
logger.Error(err, "error reconciling NIMService", "name", nimService.Name)
Expand Down Expand Up @@ -189,6 +198,19 @@ func (r *NIMServiceReconciler) GetEventRecorder() record.EventRecorder {
return r.recorder
}

// GetOrchestratorType returns the container platform type.
//
// The type is detected through k8sutil.GetOrchestratorType on the first call
// and cached on the reconciler; later calls return the cached value without
// calling it again. When detection fails nothing is cached, so the next call
// retries; k8sutil.Unknown is returned together with an error that wraps the
// underlying cause.
func (r *NIMServiceReconciler) GetOrchestratorType() (k8sutil.OrchestratorType, error) {
	// Fast path: an earlier call already stored a non-empty type.
	if r.orchestratorType != "" {
		return r.orchestratorType, nil
	}

	orchestratorType, err := k8sutil.GetOrchestratorType(r.GetClient())
	if err != nil {
		// %w (rather than %v) keeps the cause reachable via errors.Is / errors.As.
		return k8sutil.Unknown, fmt.Errorf("Unable to get container orchestrator type, %w", err)
	}

	r.orchestratorType = orchestratorType
	r.GetLogger().Info("Container orchestrator is successfully set", "type", orchestratorType)
	return r.orchestratorType, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *NIMServiceReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.recorder = mgr.GetEventRecorderFor("nimservice-controller")
Expand Down
8 changes: 8 additions & 0 deletions internal/controller/platform/standalone/nimservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/conditions"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/render"
rendertypes "github.com/NVIDIA/k8s-nim-operator/internal/render/types"
"github.com/NVIDIA/k8s-nim-operator/internal/shared"
Expand Down Expand Up @@ -75,6 +76,11 @@ func (r *NIMServiceReconciler) GetEventRecorder() record.EventRecorder {
return r.recorder
}

// GetOrchestratorType returns the container platform type stored on this
// reconciler. It only reads the orchestratorType field and performs no
// detection of its own, so the result is empty if the field was never set.
func (r *NIMServiceReconciler) GetOrchestratorType() k8sutil.OrchestratorType {
return r.orchestratorType
}

func (r *NIMServiceReconciler) cleanupNIMService(ctx context.Context, nimService *appsv1alpha1.NIMService) error {
// All dependent (owned) objects will be automatically garbage collected.
// TODO: Handle any custom cleanup logic for the NIM microservice
Expand Down Expand Up @@ -173,6 +179,8 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
var modelPVC *appsv1alpha1.PersistentVolumeClaim
modelProfile := ""

deploymentParams.OrchestratorType = string(r.GetOrchestratorType())

// Select PVC for model store
if nimService.GetNIMCacheName() != "" {
// Fetch PVC for the associated NIMCache instance and mount it
Expand Down
25 changes: 15 additions & 10 deletions internal/controller/platform/standalone/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (

appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/conditions"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/render"
"github.com/NVIDIA/k8s-nim-operator/internal/shared"
"github.com/go-logr/logr"
Expand Down Expand Up @@ -54,11 +55,12 @@ type NIMCacheReconciler struct {
// NIMServiceReconciler represents the NIMService reconciler instance for standalone mode
type NIMServiceReconciler struct {
client.Client
scheme *runtime.Scheme
log logr.Logger
updater conditions.Updater
renderer render.Renderer
recorder record.EventRecorder
scheme *runtime.Scheme
log logr.Logger
updater conditions.Updater
renderer render.Renderer
recorder record.EventRecorder
// orchestratorType is the orchestrator type copied from the shared
// reconciler when this instance is built by NewNIMServiceReconciler.
orchestratorType k8sutil.OrchestratorType
}

// NewNIMCacheReconciler returns NIMCacheReconciler for standalone mode
Expand All @@ -74,12 +76,15 @@ func NewNIMCacheReconciler(r shared.Reconciler) *NIMCacheReconciler {

// NewNIMServiceReconciler returns NIMServiceReconciler for standalone mode.
// The client, scheme, logger, updater, event recorder and orchestrator type
// are all taken from the shared reconciler r.
func NewNIMServiceReconciler(r shared.Reconciler) *NIMServiceReconciler {
// NOTE(review): the error from GetOrchestratorType is discarded here, so a
// failed detection is not reported by this constructor. Presumably callers
// have already resolved the type before constructing this reconciler —
// confirm.
orchestratorType, _ := r.GetOrchestratorType()

return &NIMServiceReconciler{
Client: r.GetClient(),
scheme: r.GetScheme(),
log: r.GetLogger(),
updater: r.GetUpdater(),
recorder: r.GetEventRecorder(),
Client: r.GetClient(),
scheme: r.GetScheme(),
log: r.GetLogger(),
updater: r.GetUpdater(),
recorder: r.GetEventRecorder(),
orchestratorType: orchestratorType,
}
}

Expand Down
Loading

0 comments on commit 58f1d4a

Please sign in to comment.