Skip to content

Commit

Permalink
Adding support for runtimeclassname to NIMCache
Browse files Browse the repository at this point in the history
Signed-off-by: Vishesh Tanksale <[email protected]>
  • Loading branch information
visheshtanksale committed Nov 7, 2024
1 parent a53a852 commit d41f835
Show file tree
Hide file tree
Showing 14 changed files with 47 additions and 21 deletions.
7 changes: 7 additions & 0 deletions api/apps/v1alpha1/nimcache_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ type NIMCacheSpec struct {
CertConfig *CertConfig `json:"certConfig,omitempty"`
// Env are the additional custom environment variables for the caching job
Env []corev1.EnvVar `json:"env,omitempty"`
// RuntimeClassName is the runtimeclass for the caching job
RuntimeClassName string `json:"runtimeClassName,omitempty"`
}

// NIMSource defines the source for caching NIM model
Expand Down Expand Up @@ -272,6 +274,11 @@ func (n *NIMCache) GetNodeSelectors() map[string]string {
return n.Spec.NodeSelectors
}

// GetRuntimeClassName returns the runtime class name configured for the NIMCache Job.
// It returns Spec.RuntimeClassName unchanged, which is the empty string when the
// field has not been set.
func (n *NIMCache) GetRuntimeClassName() string {
return n.Spec.RuntimeClassName
}

// +kubebuilder:object:root=true

// NIMCacheList contains a list of NIMCache
Expand Down
18 changes: 9 additions & 9 deletions api/apps/v1alpha1/nimservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ type NIMServiceSpec struct {
Metrics Metrics `json:"metrics,omitempty"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:default:=1
Replicas int `json:"replicas,omitempty"`
UserID *int64 `json:"userID,omitempty"`
GroupID *int64 `json:"groupID,omitempty"`
RuntimeClass string `json:"runtimeClass,omitempty"`
Replicas int `json:"replicas,omitempty"`
UserID *int64 `json:"userID,omitempty"`
GroupID *int64 `json:"groupID,omitempty"`
RuntimeClassName string `json:"runtimeClassName,omitempty"`
}

// NIMCacheVolSpec defines the spec to use NIMCache volume
Expand Down Expand Up @@ -439,9 +439,9 @@ func (n *NIMService) GetServiceAccountName() string {
return n.Name
}

// GetRuntimeClass return the runtime class name for the NIMService deployment
func (n *NIMService) GetRuntimeClass() string {
return n.Spec.RuntimeClass
// GetRuntimeClassName returns the runtime class name for the NIMService deployment
func (n *NIMService) GetRuntimeClassName() string {
return n.Spec.RuntimeClassName
}

// GetNIMCacheName returns the NIMCache name to use for the NIMService deployment
Expand Down Expand Up @@ -586,7 +586,7 @@ func (n *NIMService) GetDeploymentParams() *rendertypes.DeploymentParams {
params.ServiceAccountName = n.GetServiceAccountName()

// Set runtime class
params.RuntimeClassName = n.GetRuntimeClass()
params.RuntimeClassName = n.GetRuntimeClassName()

return params
}
Expand Down Expand Up @@ -633,7 +633,7 @@ func (n *NIMService) GetStatefulSetParams() *rendertypes.StatefulSetParams {
params.ServiceAccountName = n.GetServiceAccountName()

// Set runtime class
params.RuntimeClassName = n.GetRuntimeClass()
params.RuntimeClassName = n.GetRuntimeClassName()
return params
}

Expand Down
4 changes: 4 additions & 0 deletions bundle/manifests/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
runtimeClassName:
description: RuntimeClassName is the runtimeclass for the caching
job
type: string
source:
description: Source is the NIM model source to cache
properties:
Expand Down
2 changes: 1 addition & 1 deletion bundle/manifests/apps.nvidia.com_nimpipelines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1348,7 +1348,7 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
runtimeClass:
runtimeClassName:
type: string
scale:
description: Autoscaling defines attributes to automatically
Expand Down
2 changes: 1 addition & 1 deletion bundle/manifests/apps.nvidia.com_nimservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
runtimeClass:
runtimeClassName:
type: string
scale:
description: Autoscaling defines attributes to automatically scale
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
runtimeClassName:
description: RuntimeClassName is the runtimeclass for the caching
job
type: string
source:
description: Source is the NIM model source to cache
properties:
Expand Down
2 changes: 1 addition & 1 deletion config/crd/bases/apps.nvidia.com_nimpipelines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1348,7 +1348,7 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
runtimeClass:
runtimeClassName:
type: string
scale:
description: Autoscaling defines attributes to automatically
Expand Down
2 changes: 1 addition & 1 deletion config/crd/bases/apps.nvidia.com_nimservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
runtimeClass:
runtimeClassName:
type: string
scale:
description: Autoscaling defines attributes to automatically scale
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
runtimeClassName:
description: RuntimeClassName is the runtimeclass for the caching
job
type: string
source:
description: Source is the NIM model source to cache
properties:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1348,7 +1348,7 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
runtimeClass:
runtimeClassName:
type: string
scale:
description: Autoscaling defines attributes to automatically
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
runtimeClass:
runtimeClassName:
type: string
scale:
description: Autoscaling defines attributes to automatically scale
Expand Down
5 changes: 4 additions & 1 deletion internal/controller/nimcache_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache, platformType k8sutil.Orch
"app.kubernetes.io/name": nimCache.Name,
"app.kubernetes.io/managed-by": "k8s-nim-operator",
}

runtimeClassName := nimCache.GetRuntimeClassName()
annotations := make(map[string]string)

if platformType == k8sutil.OpenShift {
Expand All @@ -931,6 +931,7 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache, platformType k8sutil.Orch
Annotations: annotations,
},
Spec: corev1.PodSpec{
RuntimeClassName: &runtimeClassName,
Containers: []corev1.Container{
{
Name: NIMCacheContainerName,
Expand Down Expand Up @@ -1004,6 +1005,7 @@ func (r *NIMCacheReconciler) getPodLogs(ctx context.Context, pod *corev1.Pod) (s
func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType) (*batchv1.Job, error) {
logger := r.GetLogger()
pvcName := getPvcName(nimCache, nimCache.Spec.Storage.PVC)
runtimeClassName := nimCache.GetRuntimeClassName()
labels := map[string]string{
"app": "k8s-nim-operator",
"app.kubernetes.io/name": nimCache.Name,
Expand All @@ -1030,6 +1032,7 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
Annotations: annotations,
},
Spec: corev1.PodSpec{
RuntimeClassName: &runtimeClassName,
SecurityContext: &corev1.PodSecurityContext{
RunAsUser: nimCache.GetUserID(),
FSGroup: nimCache.GetGroupID(),
Expand Down
10 changes: 7 additions & 3 deletions internal/controller/nimcache_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,16 @@ var _ = Describe("NIMCache Controller", func() {
Context("When creating a NIMCache", func() {
It("should create a Job and PVC", func() {
ctx := context.TODO()
runtimeClassName := "test-class"
NIMCache := &appsv1alpha1.NIMCache{
ObjectMeta: metav1.ObjectMeta{
Name: "test-nimcache",
Namespace: "default",
},
Spec: appsv1alpha1.NIMCacheSpec{
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "test-container", PullSecret: "my-secret"}},
Storage: appsv1alpha1.NIMCacheStorage{PVC: appsv1alpha1.PersistentVolumeClaim{Create: ptr.To[bool](true), StorageClass: "standard", Size: "1Gi"}},
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "test-container", PullSecret: "my-secret"}},
Storage: appsv1alpha1.NIMCacheStorage{PVC: appsv1alpha1.PersistentVolumeClaim{Create: ptr.To[bool](true), StorageClass: "standard", Size: "1Gi"}},
RuntimeClassName: runtimeClassName,
},
Status: appsv1alpha1.NIMCacheStatus{
State: appsv1alpha1.NimCacheStatusNotReady,
Expand All @@ -128,11 +130,13 @@ var _ = Describe("NIMCache Controller", func() {

// Check if the Job was created
// Wait for reconciliation to complete with a timeout
job := &batchv1.Job{}
Eventually(func() error {
job := &batchv1.Job{}
jobName := types.NamespacedName{Name: "test-nimcache-job", Namespace: "default"}
return cli.Get(ctx, jobName, job)
}, time.Second*10).Should(Succeed())
Expect(job.Spec.Template.Spec.Containers[0].Image).To(Equal("test-container"))
Expect(job.Spec.Template.Spec.RuntimeClassName).To(Equal(&runtimeClassName))

// Check if the PVC was created
// Wait for reconciliation to complete with a timeout
Expand Down
4 changes: 2 additions & 2 deletions internal/controller/platform/standalone/nimservice_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
},
},
},
RuntimeClass: "nvidia",
RuntimeClassName: "nvidia",
},
Status: appsv1alpha1.NIMServiceStatus{
State: conditions.NotReady,
Expand Down Expand Up @@ -460,7 +460,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
Expect(deployment.Spec.Template.Spec.Containers[0].ReadinessProbe).To(Equal(nimService.Spec.ReadinessProbe.Probe))
Expect(deployment.Spec.Template.Spec.Containers[0].LivenessProbe).To(Equal(nimService.Spec.LivenessProbe.Probe))
Expect(deployment.Spec.Template.Spec.Containers[0].StartupProbe).To(Equal(nimService.Spec.StartupProbe.Probe))
Expect(*deployment.Spec.Template.Spec.RuntimeClassName).To(Equal(nimService.Spec.RuntimeClass))
Expect(*deployment.Spec.Template.Spec.RuntimeClassName).To(Equal(nimService.Spec.RuntimeClassName))

sortEnvVars(deployment.Spec.Template.Spec.Containers[0].Env)
sortEnvVars(nimService.Spec.Env)
Expand Down

0 comments on commit d41f835

Please sign in to comment.