Skip to content

Commit

Permalink
Adding default node selector for NIMCache pod and job (#239)
Browse files Browse the repository at this point in the history
Signed-off-by: Vishesh Tanksale <[email protected]>
  • Loading branch information
visheshtanksale authored Nov 21, 2024
1 parent edb985e commit feb392b
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 23 deletions.
9 changes: 6 additions & 3 deletions api/apps/v1alpha1/nimcache_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ type NIMCacheSpec struct {
Resources Resources `json:"resources,omitempty"`
// Tolerations for running the job to cache the NIM model
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
// NodeSelectors are the node selector labels to schedule the caching job.
NodeSelectors map[string]string `json:"gpuSelectors,omitempty"`
// NodeSelector is the node selector labels to schedule the caching job.
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// UserID is the user ID for the caching job
UserID *int64 `json:"userID,omitempty"`
// GroupID is the group ID for the caching job
Expand Down Expand Up @@ -273,7 +273,10 @@ func (n *NIMCache) GetTolerations() []corev1.Toleration {

// GetNodeSelectors returns the node selector labels used to schedule the NIMCache Job.
//
// In the new implementation below, a nil Spec.NodeSelector yields a default selector
// requiring the label feature.node.kubernetes.io/pci-10de.present=true; any non-nil
// map (including an empty one) is returned unchanged.
func (n *NIMCache) GetNodeSelectors() map[string]string {
// NOTE(review): this return reads the old Spec.NodeSelectors field and looks like the
// pre-change line of this diff hunk shown next to its replacement — confirm against the
// commit; as written it would make the statements below unreachable.
return n.Spec.NodeSelectors
// Only a nil map triggers the default; the check is not len(...) == 0.
if n.Spec.NodeSelector == nil {
return map[string]string{"feature.node.kubernetes.io/pci-10de.present": "true"}
}
return n.Spec.NodeSelector
}

// GetRuntimeClassName returns the runtime class name for the NIMCache Job
Expand Down
4 changes: 2 additions & 2 deletions api/apps/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions bundle/manifests/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,16 +186,16 @@ spec:
- name
type: object
type: array
gpuSelectors:
additionalProperties:
type: string
description: NodeSelectors are the node selector labels to schedule
the caching job.
type: object
groupID:
description: GroupID is the group ID for the caching job
format: int64
type: integer
nodeSelector:
additionalProperties:
type: string
description: NodeSelector is the node selector labels to schedule
the caching job.
type: object
resources:
description: Resources defines the minimum resources required for
the caching job to run(cpu, memory, gpu).
Expand Down
12 changes: 6 additions & 6 deletions config/crd/bases/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,16 +186,16 @@ spec:
- name
type: object
type: array
gpuSelectors:
additionalProperties:
type: string
description: NodeSelectors are the node selector labels to schedule
the caching job.
type: object
groupID:
description: GroupID is the group ID for the caching job
format: int64
type: integer
nodeSelector:
additionalProperties:
type: string
description: NodeSelector is the node selector labels to schedule
the caching job.
type: object
resources:
description: Resources defines the minimum resources required for
the caching job to run(cpu, memory, gpu).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,16 +186,16 @@ spec:
- name
type: object
type: array
gpuSelectors:
additionalProperties:
type: string
description: NodeSelectors are the node selector labels to schedule
the caching job.
type: object
groupID:
description: GroupID is the group ID for the caching job
format: int64
type: integer
nodeSelector:
additionalProperties:
type: string
description: NodeSelector is the node selector labels to schedule
the caching job.
type: object
resources:
description: Resources defines the minimum resources required for
the caching job to run(cpu, memory, gpu).
Expand Down
28 changes: 28 additions & 0 deletions internal/controller/nimcache_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ var _ = Describe("NIMCache Controller", func() {
}, time.Second*10).Should(Succeed())
Expect(job.Spec.Template.Spec.Containers[0].Image).To(Equal("test-container"))
Expect(job.Spec.Template.Spec.RuntimeClassName).To(Equal(&runtimeClassName))
Expect(job.Spec.Template.Spec.NodeSelector).To(Equal(map[string]string{"feature.node.kubernetes.io/pci-10de.present": "true"}))

// Check if the PVC was created
// Wait for reconciliation to complete with a timeout
Expand Down Expand Up @@ -350,6 +351,33 @@ var _ = Describe("NIMCache Controller", func() {
Expect(*pod.Spec.SecurityContext.RunAsUser).To(Equal(int64(1000)))
Expect(*pod.Spec.SecurityContext.FSGroup).To(Equal(int64(2000)))
Expect(*pod.Spec.SecurityContext.RunAsNonRoot).To(Equal(true))
Expect(pod.Spec.NodeSelector["feature.node.kubernetes.io/pci-10de.present"]).To(Equal("true"))
})

It("should construct a pod with runtime class and node selector", func() {
	// Build a NIMCache that sets an explicit runtime class and a custom
	// node selector, then check that the constructed pod carries both.
	className := "test-class"
	customSelector := map[string]string{
		"test-label": "true",
	}
	ngcSource := &appsv1alpha1.NGCSource{
		ModelPuller: "nvcr.io/nim:test",
		PullSecret:  "my-secret",
	}
	cache := &appsv1alpha1.NIMCache{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-nimcache",
			Namespace: "default",
		},
		Spec: appsv1alpha1.NIMCacheSpec{
			Source:           appsv1alpha1.NIMSource{NGC: ngcSource},
			RuntimeClassName: className,
			NodeSelector:     customSelector,
		},
	}

	pod := constructPodSpec(cache, k8sutil.K8s)

	// Identity, image and pull secret come from the NIMCache and its NGC source.
	Expect(pod.Name).To(Equal(getPodName(cache)))
	Expect(pod.Spec.Containers[0].Image).To(Equal("nvcr.io/nim:test"))
	Expect(pod.Spec.ImagePullSecrets[0].Name).To(Equal("my-secret"))

	// Security context values expected when no user/group IDs are set in the spec.
	podSecurity := pod.Spec.SecurityContext
	Expect(*podSecurity.RunAsUser).To(Equal(int64(1000)))
	Expect(*podSecurity.FSGroup).To(Equal(int64(2000)))
	Expect(*podSecurity.RunAsNonRoot).To(Equal(true))

	// The user-supplied selector and runtime class must reach the pod spec.
	Expect(pod.Spec.NodeSelector["test-label"]).To(Equal("true"))
	Expect(pod.Spec.RuntimeClassName).To(Equal(&className))
})

It("should create a pod with the correct specifications", func() {
Expand Down

0 comments on commit feb392b

Please sign in to comment.