diff --git a/charts/intel-gpu-resource-driver/Chart.yaml b/charts/intel-gpu-resource-driver/Chart.yaml
index b32b263..9bd90bb 100644
--- a/charts/intel-gpu-resource-driver/Chart.yaml
+++ b/charts/intel-gpu-resource-driver/Chart.yaml
@@ -3,5 +3,5 @@ name: intel-gpu-resource-driver
 description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Resource Driver
 
 type: application
-version: 0.5.1
-appVersion: "v0.5.1"
+version: 0.6.0
+appVersion: "v0.6.0"
diff --git a/charts/intel-gpu-resource-driver/LICENSE b/charts/intel-gpu-resource-driver/LICENSE
index a4cc602..0378d97 100644
--- a/charts/intel-gpu-resource-driver/LICENSE
+++ b/charts/intel-gpu-resource-driver/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2023 Intel Corporation
+Copyright 2024 Intel Corporation
 SPDX-License-Identifier: Apache-2.0
 
 Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/charts/intel-gpu-resource-driver/README.md b/charts/intel-gpu-resource-driver/README.md
index 4e0681d..f27419b 100644
--- a/charts/intel-gpu-resource-driver/README.md
+++ b/charts/intel-gpu-resource-driver/README.md
@@ -16,8 +16,6 @@ helm repo update
 You can execute `helm search repo intel` command to see pulled charts [optional].
 
 ## Install Helm Chart
-CRDs of the GPU driver are installed as part of the chart first.
-
 ```
 helm install intel-gpu-resource-driver intel/intel-gpu-resource-driver
 ```
@@ -45,9 +43,7 @@ You may also run `helm show values` on this chart's dependencies for additional
 | image.repository | string | `intel` |
 | image.name | string | `"intel-gpu-resource-driver"` |
 | image.pullPolicy | string | `"IfNotPresent"` |
-| image.tag | string | `"v0.5.1"` |
-
-If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with CRDs and deployment YAMLs - they might change between releases.
+| image.tag | string | `"v0.6.0"` |
 
 > [!Note]
-> Helm does not support _upgrading_ (or deleting) CRDs to prevent data loss. Only installing CRDs is supported. Details: https://github.com/helm/community/blob/main/hips/hip-0011.md
\ No newline at end of file
+> When upgrading, CRDs from previous version need to be removed manually because Helm supports neither upgrading nor deleting CRDs, see: https://github.com/helm/community/blob/main/hips/hip-0011.md
diff --git a/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuallocationstates.yaml b/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuallocationstates.yaml
deleted file mode 100644
index 3f82a68..0000000
--- a/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuallocationstates.yaml
+++ /dev/null
@@ -1,191 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.15.0
-  name: gpuallocationstates.gpu.resource.intel.com
-spec:
-  group: gpu.resource.intel.com
-  names:
-    kind: GpuAllocationState
-    listKind: GpuAllocationStateList
-    plural: gpuallocationstates
-    singular: gas
-  scope: Namespaced
-  versions:
-    - name: v1alpha2
-      schema:
-        openAPIV3Schema:
-          description:
-            GpuAllocationState holds the state required for allocation on
-            a node.
-          properties:
-            apiVersion:
-              description: |-
-                APIVersion defines the versioned schema of this representation of an object.
-                Servers should convert recognized schemas to the latest internal value, and
-                may reject unrecognized values.
-                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-              type: string
-            kind:
-              description: |-
-                Kind is a string value representing the REST resource this object represents.
-                Servers may infer this from the endpoint the client submits requests to.
-                Cannot be updated.
-                In CamelCase.
-                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-              type: string
-            metadata:
-              type: object
-            spec:
-              description:
-                GpuAllocationStateSpec is the spec for the GpuAllocationState
-                CRD.
-              properties:
-                allocatableDevices:
-                  additionalProperties:
-                    description: AllocatableGpu represents an allocatable Gpu on a node.
-                    properties:
-                      ecc:
-                        description:
-                          True if ECC is enabled, might impact memory amount
-                          and VF profiles.
-                        type: boolean
-                      maxvfs:
-                        description: Greater than 0 if SR-IOV is supported / enabled.
-                        format: int64
-                        type: integer
-                      memory:
-                        description: Amount of local memory in MiB.
-                        format: int64
-                        maximum: 1048576
-                        minimum: 0
-                        type: integer
-                      millicores:
-                        description: Amount of GPU millicores.
-                        format: int64
-                        maximum: 1000
-                        minimum: 0
-                        type: integer
-                      model:
-                        description: pci-id of the Gpu device.
-                        type: string
-                      parentuid:
-                        description: Device where VF should be / is provisioned.
-                        type: string
-                      type:
-                        description:
-                          "Type of the device: bare-metal Gpu or SR-IOV Virtual
-                          Function (VF)."
-                        enum:
-                          - gpu
-                          - vf
-                          - any
-                        type: string
-                      uid:
-                        description:
-                          "Unique identifier of device: PCI address and PCI
-                          Device ID."
-                        type: string
-                      vfindex:
-                        description: Index of SR-IOV Virtual Function
-                        format: int64
-                        type: integer
-                    required:
-                      - ecc
-                      - maxvfs
-                      - memory
-                      - millicores
-                      - model
-                      - parentuid
-                      - type
-                      - uid
-                      - vfindex
-                    type: object
-                  type: object
-                allocatedClaims:
-                  additionalProperties:
-                    description: Resources that were allocated for the claim by controller.
-                    properties:
-                      gpus:
-                        description:
-                          AllocatedGpus represents a list of allocated devices
-                          on a node.
-                        items:
-                          description:
-                            AllocatedGpu represents an allocated Gpu on a
-                            node.
-                          properties:
-                            memory:
-                              description: Amount of local memory in MiB.
-                              format: int64
-                              maximum: 1048576
-                              minimum: 0
-                              type: integer
-                            millicores:
-                              description: Amount of GPU millicores.
-                              format: int64
-                              maximum: 1000
-                              minimum: 0
-                              type: integer
-                            parentuid:
-                              description: Device where VF should be / is provisioned.
-                              type: string
-                            profile:
-                              description:
-                                Virtual Function profile defines amount of
-                                local memory and time slice VF gets.
-                              type: string
-                            type:
-                              description:
-                                "Type of the device: bare-metal Gpu or SR-IOV
-                                Virtual Function (VF)."
-                              enum:
-                                - gpu
-                                - vf
-                                - any
-                              type: string
-                            uid:
-                              description:
-                                "Unique identifier of device: PCI address
-                                and PCI Device ID."
-                              type: string
-                            vfindex:
-                              description: Index of SR-IOV Virtual Function
-                              format: int64
-                              type: integer
-                          required:
-                            - memory
-                            - millicores
-                            - parentuid
-                            - profile
-                            - type
-                            - uid
-                            - vfindex
-                          type: object
-                        maxItems: 640
-                        type: array
-                    required:
-                      - gpus
-                    type: object
-                  type: object
-                taintedDevices:
-                  additionalProperties:
-                    description: TaintedGpu represents a tainted Gpu on a node.
-                    properties:
-                      reasons:
-                        additionalProperties:
-                          type: boolean
-                        description: |-
-                          Reasons why device is tainted, which _all_ need to be
-                          resolved, before device can be dropped from taints map.
-                        type: object
-                    type: object
-                  type: object
-              type: object
-            status:
-              type: string
-          type: object
-      served: true
-      storage: true
diff --git a/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuclaimparameters.yaml b/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuclaimparameters.yaml
deleted file mode 100644
index 161c96d..0000000
--- a/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuclaimparameters.yaml
+++ /dev/null
@@ -1,83 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.15.0
-  name: gpuclaimparameters.gpu.resource.intel.com
-spec:
-  group: gpu.resource.intel.com
-  names:
-    kind: GpuClaimParameters
-    listKind: GpuClaimParametersList
-    plural: gpuclaimparameters
-    singular: gpuclaimparameters
-  scope: Namespaced
-  versions:
-    - name: v1alpha2
-      schema:
-        openAPIV3Schema:
-          description:
-            GpuClaimParameters holds the set of parameters provided when
-            creating a resource claim for a GPU.
-          properties:
-            apiVersion:
-              description: |-
-                APIVersion defines the versioned schema of this representation of an object.
-                Servers should convert recognized schemas to the latest internal value, and
-                may reject unrecognized values.
-                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-              type: string
-            kind:
-              description: |-
-                Kind is a string value representing the REST resource this object represents.
-                Servers may infer this from the endpoint the client submits requests to.
-                Cannot be updated.
-                In CamelCase.
-                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-              type: string
-            metadata:
-              type: object
-            spec:
-              description:
-                GpuClaimParametersSpec is the spec for the GpuClaimParameters
-                CRD.
-              properties:
-                count:
-                  description:
-                    How many items of the Type are being requested. 10 PCIe
-                    devices x 64 SR-IOV VFs each = 640 items maximum on one Node.
-                  format: int64
-                  maximum: 640
-                  minimum: 1
-                  type: integer
-                memory:
-                  description: Per GPU memory request, in MiB, maximum 1048576 (1 TiB)
-                  format: int64
-                  maximum: 1048576
-                  minimum: 8
-                  type: integer
-                millicores:
-                  description: Per GPU millicores request.
-                  format: int64
-                  maximum: 1000
-                  minimum: 1
-                  type: integer
-                shareable:
-                  description:
-                    True if the same ResourceClaim can be shared by multiple
-                    Pods.
-                  type: boolean
-                type:
-                  description: "Type of the GPU device: physical or virtual or any."
-                  enum:
-                    - gpu
-                    - vf
-                    - any
-                  type: string
-              required:
-                - count
-              type: object
-          type: object
-      served: true
-      storage: true
diff --git a/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuclassparameters.yaml b/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuclassparameters.yaml
deleted file mode 100644
index 019a862..0000000
--- a/charts/intel-gpu-resource-driver/crds/gpu.resource.intel.com_gpuclassparameters.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
-  annotations:
-    controller-gen.kubebuilder.io/version: v0.15.0
-  name: gpuclassparameters.gpu.resource.intel.com
-spec:
-  group: gpu.resource.intel.com
-  names:
-    kind: GpuClassParameters
-    listKind: GpuClassParametersList
-    plural: gpuclassparameters
-    singular: gpuclassparameters
-  scope: Cluster
-  versions:
-    - name: v1alpha2
-      schema:
-        openAPIV3Schema:
-          description:
-            GpuClassParameters holds the set of parameters provided when
-            creating a resource class for this driver.
-          properties:
-            apiVersion:
-              description: |-
-                APIVersion defines the versioned schema of this representation of an object.
-                Servers should convert recognized schemas to the latest internal value, and
-                may reject unrecognized values.
-                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
-              type: string
-            kind:
-              description: |-
-                Kind is a string value representing the REST resource this object represents.
-                Servers may infer this from the endpoint the client submits requests to.
-                Cannot be updated.
-                In CamelCase.
-                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
-              type: string
-            metadata:
-              type: object
-            spec:
-              description:
-                GpuClassParametersSpec is the spec for the GpuClassParametersSpec
-                CRD.
-              properties:
-                deviceSelector:
-                  items:
-                    description:
-                      DeviceSelector allows one to match on a specific type
-                      of Device as part of the class.
-                    properties:
-                      name:
-                        type: string
-                      type:
-                        type: string
-                    required:
-                      - name
-                      - type
-                    type: object
-                  type: array
-                monitor:
-                  type: boolean
-                shared:
-                  description:
-                    If true, ResourceClaims of this class share GPU allocated
-                    to them. If false, the GPU is allocated to ResourceClaim exclusively.
-                  type: boolean
-              required:
-                - shared
-              type: object
-          type: object
-      served: true
-      storage: true
diff --git a/charts/intel-gpu-resource-driver/templates/clusterrole.yaml b/charts/intel-gpu-resource-driver/templates/clusterrole.yaml
index 7cf9846..a4ff6a7 100644
--- a/charts/intel-gpu-resource-driver/templates/clusterrole.yaml
+++ b/charts/intel-gpu-resource-driver/templates/clusterrole.yaml
@@ -5,11 +5,11 @@ metadata:
   namespace: {{ include "intel-gpu-resource-driver.namespace" . }}
 rules:
 - apiGroups: [""]
-  resources: ["pods", "nodes", "events"]
-  verbs: ["get", "list", "create", "watch", "patch"]
+  resources: ["nodes"]
+  verbs: ["get"]
 - apiGroups: ["resource.k8s.io"]
-  resources: ["resourceclaims", "resourceclasses", "podschedulings","resourceclaims/status", "podschedulings/status", "podschedulingcontexts", "podschedulingcontexts/status"]
-  verbs: ["get", "update", "list", "watch", "patch"]
-- apiGroups: ["gpu.resource.intel.com"]
-  resources: ["*"]
-  verbs: ["*"]
+  resources: ["resourceslices"]
+  verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+- apiGroups: ["resource.k8s.io"]
+  resources: ["resourceclaims"]
+  verbs: ["get"]
diff --git a/charts/intel-gpu-resource-driver/templates/device-class.yaml b/charts/intel-gpu-resource-driver/templates/device-class.yaml
new file mode 100644
index 0000000..cb28849
--- /dev/null
+++ b/charts/intel-gpu-resource-driver/templates/device-class.yaml
@@ -0,0 +1,9 @@
+apiVersion: resource.k8s.io/v1alpha3
+kind: DeviceClass
+metadata:
+  name: gpu.intel.com
+
+spec:
+  selectors:
+  - cel:
+      expression: device.driver == "gpu.intel.com"
diff --git a/charts/intel-gpu-resource-driver/templates/resource-class.yaml b/charts/intel-gpu-resource-driver/templates/resource-class.yaml
deleted file mode 100644
index 80e3f2c..0000000
--- a/charts/intel-gpu-resource-driver/templates/resource-class.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-apiVersion: gpu.resource.intel.com/v1alpha2
-kind: GpuClassParameters
-metadata:
-  name: intel-gpu-params-shared
-  labels:
-    {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }}
-spec:
-  shared: true
----
-apiVersion: resource.k8s.io/v1alpha2
-kind: ResourceClass
-metadata:
-  name: intel-gpu-shared
-driverName: gpu.resource.intel.com
-parametersRef:
-  apiGroup: gpu.resource.intel.com/v1alpha2
-  kind: GpuClassParameters
-  name: intel-gpu-params-shared
-
----
-apiVersion: gpu.resource.intel.com/v1alpha2
-kind: GpuClassParameters
-metadata:
-  name: intel-gpu-params
-spec:
-  shared: false
----
-apiVersion: resource.k8s.io/v1alpha2
-kind: ResourceClass
-metadata:
-  name: intel-gpu
-driverName: gpu.resource.intel.com
-parametersRef:
-  apiGroup: gpu.resource.intel.com/v1alpha2
-  kind: GpuClassParameters
-  name: intel-gpu-params
-
----
-apiVersion: gpu.resource.intel.com/v1alpha2
-kind: GpuClassParameters
-metadata:
-  name: intel-gpu-monitor-params
-spec:
-  monitor: true
-  shared: false
----
-apiVersion: resource.k8s.io/v1alpha2
-kind: ResourceClass
-metadata:
-  name: intel-gpu-monitor
-driverName: gpu.resource.intel.com
-parametersRef:
-  apiGroup: gpu.resource.intel.com/v1alpha2
-  kind: GpuClassParameters
-  name: intel-gpu-monitor-params
\ No newline at end of file
diff --git a/charts/intel-gpu-resource-driver/templates/resource-defaults.yaml b/charts/intel-gpu-resource-driver/templates/resource-defaults.yaml
deleted file mode 100644
index a941fa4..0000000
--- a/charts/intel-gpu-resource-driver/templates/resource-defaults.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: defaults
-  namespace: {{ include "intel-gpu-resource-driver.namespace" . }}
-data:
-# It is possible to override driver hardcoded defaults:
-# - resourceClaimParameters:
-#   if resource claim did not include any parameters, these will be used
-# - vf-memory
-#   per product amount of local memory, in MiB, the auto-added VFs will get
-  resourceClaimParameters.config: |
-    count=1
-    type=gpu
-    shareable=true
-  vf-memory.config: |
-    {
-      "max1550": 16385,
-      "max1450": 16384,
-      "max1100": 8192,
-      "flex140": 2048,
-      "flex170": 4096,
-    }
diff --git a/charts/intel-gpu-resource-driver/templates/resource-driver.yaml b/charts/intel-gpu-resource-driver/templates/resource-driver.yaml
index 05829ef..2fbba87 100644
--- a/charts/intel-gpu-resource-driver/templates/resource-driver.yaml
+++ b/charts/intel-gpu-resource-driver/templates/resource-driver.yaml
@@ -16,30 +16,6 @@ spec:
     spec:
       serviceAccount: intel-gpu-resource-driver-service-account
       serviceAccountName: {{ include "intel-gpu-resource-driver.serviceAccountName" . }}
-      initContainers:
-      - name: init
-        image: {{ include "intel-gpu-resource-driver.fullimage" . }}
-        imagePullPolicy: {{ .Values.image.pullPolicy }}
-        command: ["/kubelet-gpu-plugin", "--status", "NotReady"]
-        env:
-        - name: NODE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: spec.nodeName
-        - name: POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        securityContext:
-          privileged: false
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop: ["ALL"]
-          readOnlyRootFilesystem: true
-          runAsNonRoot: true
-          runAsUser: 10001
-          seccompProfile:
-            type: RuntimeDefault
       containers:
       - name: kubelet-plugin
         image: {{ include "intel-gpu-resource-driver.fullimage" . }}
@@ -56,11 +32,6 @@ spec:
               fieldPath: metadata.namespace
         - name: SYSFS_ROOT
           value: "/sysfs"
-        # Use this to tell kubelet-plugin where the DRI devices nodes should be.
-        # This will be prefix for CDI devices, runtime will try to mount devices
-        # with this prefix into workloads.
-        #- name: DEV_DRI_PATH
-        #  value: "/fake/dri"
         volumeMounts:
         - name: plugins-registry
           mountPath: /var/lib/kubelet/plugins_registry
@@ -70,9 +41,7 @@ spec:
           mountPath: /etc/cdi
         - name: varruncdi
           mountPath: /var/run/cdi
-        - name: defaults
-          mountPath: "/defaults"
-          readOnly: true
+        # when using fake sysfs - mount at the same place as on host
         - name: sysfs
           mountPath: "/sysfs"
         securityContext:
@@ -100,9 +69,6 @@ spec:
       - name: sysfs
         hostPath:
           path: /sys
-      - name: defaults
-        configMap:
-          name: defaults
       {{- with .Values.kubeletPlugin.tolerations }}
       tolerations:
         {{- toYaml . | nindent 8 }}
@@ -115,55 +81,3 @@ spec:
       affinity:
         {{- toYaml . | nindent 8 }}
       {{- end }}
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: intel-gpu-resource-driver-controller
-  namespace: {{ include "intel-gpu-resource-driver.namespace" . }}
-  labels:
-    {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }}
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: intel-gpu-resource-driver
-  template:
-    metadata:
-      labels:
-        app: intel-gpu-resource-driver
-    spec:
-      serviceAccount: intel-gpu-resource-driver-service-account
-      serviceAccountName: {{ include "intel-gpu-resource-driver.serviceAccountName" . }}
-      containers:
-      - name: controller
-        image: {{ include "intel-gpu-resource-driver.fullimage" . }}
-        imagePullPolicy: {{ .Values.image.pullPolicy }}
-        command: ["/gpu-controller"]
-        env:
-        - name: POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        securityContext:
-          privileged: false
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop: ["ALL"]
-          readOnlyRootFilesystem: true
-          runAsNonRoot: true
-          runAsUser: 10001
-          seccompProfile:
-            type: RuntimeDefault
-      {{- with .Values.controller.tolerations }}
-      tolerations:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.controller.nodeSelector }}
-      nodeSelector:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.controller.affinity }}
-      affinity:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
diff --git a/charts/intel-gpu-resource-driver/templates/validating-admission-policy-binding.yaml b/charts/intel-gpu-resource-driver/templates/validating-admission-policy-binding.yaml
new file mode 100644
index 0000000..d7fbece
--- /dev/null
+++ b/charts/intel-gpu-resource-driver/templates/validating-admission-policy-binding.yaml
@@ -0,0 +1,7 @@
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicyBinding
+metadata:
+  name: resourceslices-policy-dra-kubelet-plugin-gpu
+spec:
+  policyName: resourceslices-policy-dra-kubelet-plugin-gpu
+  validationActions: [Deny]
diff --git a/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml b/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml
new file mode 100644
index 0000000..dfa1256
--- /dev/null
+++ b/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml
@@ -0,0 +1,33 @@
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicy
+metadata:
+  name: resourceslices-policy-dra-kubelet-plugin-gpu
+spec:
+  failurePolicy: Fail
+  matchConstraints:
+    resourceRules:
+    - apiGroups: ["resource.k8s.io"]
+      apiVersions: ["v1alpha3"]
+      operations: ["CREATE", "UPDATE", "DELETE"]
+      resources: ["resourceslices"]
+  matchConditions:
+  # The restricted identity is rendered from the same chart helpers that the
+  # DaemonSet and ClusterRole templates use, so it cannot drift from them.
+  - name: isRestrictedUser
+    expression: >-
+      request.userInfo.username == "system:serviceaccount:{{ include "intel-gpu-resource-driver.namespace" . }}:{{ include "intel-gpu-resource-driver.serviceAccountName" . }}"
+  variables:
+  - name: userNodeName
+    expression: >-
+      request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
+  - name: objectNodeName
+    expression: >-
+      (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
+  validations:
+  - expression: variables.userNodeName != ""
+    message: >-
+      no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled
+  - expression: variables.userNodeName == variables.objectNodeName
+    messageExpression: >-
+      "this user running on node '"+variables.userNodeName+"' may not modify " +
+      (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
diff --git a/charts/intel-gpu-resource-driver/values.yaml b/charts/intel-gpu-resource-driver/values.yaml
index f48373e..a3ee0eb 100644
--- a/charts/intel-gpu-resource-driver/values.yaml
+++ b/charts/intel-gpu-resource-driver/values.yaml
@@ -9,29 +9,14 @@ image:
   repository: intel
   name: intel-gpu-resource-driver
   pullPolicy: IfNotPresent
-  tag: "v0.5.1"
+  tag: "v0.6.0"
 
 serviceAccount:
+  create: true
   annotations: {}
-  name: "intel-gpu-resource-driver-service-account"
+  name: intel-gpu-resource-driver-service-account
   automount: true
 
-# Define Controller Part
-controller:
-  podAnnotations: {}
-  tolerations:
-    - key: node-role.kubernetes.io/master
-      operator: Exists
-      effect: NoSchedule
-    - key: node-role.kubernetes.io/control-plane
-      operator: Exists
-      effect: NoSchedule
-  nodeSelector:
-    {}
-    #node-role.kubernetes.io/control-plane: ""
-  affinity: {}
-
-# Define Kubelet-Plugin Part
 kubeletPlugin:
   podAnnotations: {}
   tolerations: []