diff --git a/charts/kaito/gpu-provisioner/Chart.yaml b/charts/kaito/gpu-provisioner/Chart.yaml index 9c361639b..a5945884e 100644 --- a/charts/kaito/gpu-provisioner/Chart.yaml +++ b/charts/kaito/gpu-provisioner/Chart.yaml @@ -5,7 +5,7 @@ type: application version: 0.0.1 appVersion: 0.1.0 sources: -- https://github.com/Azure/gpu-provisioner + - https://github.com/Azure/gpu-provisioner maintainers: - name: Fei-Guo email: vrgf2003@gmail.com diff --git a/charts/kaito/gpu-provisioner/README.md b/charts/kaito/gpu-provisioner/README.md index 5fcf189b9..4f79b9ee5 100644 --- a/charts/kaito/gpu-provisioner/README.md +++ b/charts/kaito/gpu-provisioner/README.md @@ -14,57 +14,58 @@ helm install gpu-provisioner ./charts/kaito/gpu-provisioner ## Values -| Key | Type | Default | Description | -|------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | -| additionalClusterRoleRules | list | `[]` | Specifies additional rules for the core ClusterRole. | -| additionalLabels | object | `{}` | Additional labels to add into metadata. 
| -| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.azure.com/cluster","operator":"Exists"},{"key":"type","operator":"NotIn","values":["virtual-kubelet"]},{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]},{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. | -| controller.env | list | `[{"name":"ARM_SUBSCRIPTION_ID","value":null},{"name":"LOCATION","value":null},{"name":"AZURE_CLUSTER_NAME","value":null},{"name":"AZURE_NODE_RESOURCE_GROUP","value":null},{"name":"ARM_RESOURCE_GROUP","value":"l"},{"name":"LEADER_ELECT","value":"false"}]` | Additional environment variables for the controller pod. | -| controller.envFrom | list | `[]` | | -| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | -| controller.healthProbe.port | int | `8081` | The container port to use for http health probe. | -| controller.image.digest | string | `""` | SHA256 digest of the controller image. | -| controller.image.repository | string | `"mcr.microsoft.com/aks/kaito/gpu-provisioner"` | Repository path to the controller image. | -| controller.image.tag | string | `"0.0.1"` | Tag of the controller image. | -| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | -| controller.logLevel | string | `"debug"` | Controller log level, defaults to the global log level | -| controller.metrics.port | int | `8000` | The container port to use for metrics. 
| -| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | -| controller.resources | object | `{"limits":{"cpu":"500m"},"requests":{"cpu":"200m"}}` | Resources for the controller pod. | -| controller.securityContext | object | `{}` | SecurityContext for the controller container. | -| dnsConfig | object | `{}` | Configure DNS Config for the pod | -| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | -| extraVolumes | list | `[]` | Additional volumes for the pod. | -| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | -| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | -| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | -| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | -| logEncoding | string | `"console"` | Global log encoding | -| logLevel | string | `"debug"` | Global log level | -| nameOverride | string | `""` | Overrides the chart's name. | -| namespace | string | `"gpu-provisioner"` | | -| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | -| podAnnotations | object | `{}` | Additional annotations for the pod. | -| podDisruptionBudget.maxUnavailable | int | `1` | | -| podDisruptionBudget.name | string | `"karpenter"` | | -| podLabels | object | `{}` | Additional labels for the pod. | -| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | -| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | -| replicas | int | `1` | Number of replicas. | -| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | -| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. 
| -| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | -| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. | -| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | -| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | -| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | -| settings.azure | object | `{"clusterName":"new_demo","tags":null}` | Azure-specific configuration values | -| settings.azure.clusterName | string | `"new_demo"` | Cluster name. | -| settings.azure.tags | string | `nil` | The global tags to use on all Azure infrastructure resources (VMs, etc.) TODO: not propagated yet ... | -| settings.featureGates | string | `nil` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | -| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | -| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. | -| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | -| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. 
| -| workloadIdentity | object | `{"clientId":null,"tenantId":null}` | Global Settings to configure gpu-provisioner | +| Key | Type | Default | Description | +|------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | +| additionalClusterRoleRules | list | `[]` | Specifies additional rules for the core ClusterRole. | +| additionalLabels | object | `{}` | Additional labels to add into metadata. | +| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.azure.com/cluster","operator":"Exists"},{"key":"type","operator":"NotIn","values":["virtual-kubelet"]},{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]},{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. 
| +| controller.env | list | `[{"name":"ARM_SUBSCRIPTION_ID","value":null},{"name":"LOCATION","value":null},{"name":"AZURE_CLUSTER_NAME","value":null},{"name":"AZURE_NODE_RESOURCE_GROUP","value":null},{"name":"ARM_RESOURCE_GROUP","value":null},{"name":"LEADER_ELECT","value":"false"}]` | Additional environment variables for the controller pod. | +| controller.envFrom | list | `[]` | | +| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | +| controller.healthProbe.port | int | `8081` | The container port to use for http health probe. | +| controller.image.digest | string | `""` | SHA256 digest of the controller image. | +| controller.image.repository | string | `"ghcr.io/azure/gpu-provisioner"` | Repository path to the controller image. | +| controller.image.tag | string | `"0.0.1"` | Tag of the controller image. | +| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | +| controller.logLevel | string | `"debug"` | Controller log level, defaults to the global log level | +| controller.metrics.port | int | `8000` | The container port to use for metrics. | +| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | +| controller.resources | object | `{"limits":{"cpu":"500m"},"requests":{"cpu":"200m"}}` | Resources for the controller pod. | +| controller.securityContext | object | `{}` | SecurityContext for the controller container. | +| dnsConfig | object | `{}` | Configure DNS Config for the pod | +| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | +| extraVolumes | list | `[]` | Additional volumes for the pod. | +| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | +| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | +| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | +| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. 
| +| logEncoding | string | `"console"` | Global log encoding | +| logLevel | string | `"debug"` | Global log level | +| nameOverride | string | `""` | Overrides the chart's name. | +| namespace | string | `"gpu-provisioner"` | | +| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | +| podAnnotations | object | `{}` | Additional annotations for the pod. | +| podDisruptionBudget.maxUnavailable | int | `1` | | +| podDisruptionBudget.name | string | `"gpu-provisioner"` | | +| podLabels | object | `{}` | Additional labels for the pod. | +| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | +| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | +| replicas | int | `1` | Number of replicas. | +| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | +| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. | +| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | +| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. | +| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | +| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | +| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | +| settings.azure | object | `{"clusterName":null}` | Azure-specific configuration values | +| settings.azure.clusterName | string | `nil` | Cluster name. | +| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | +| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. 
| +| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | +| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. | +| workloadIdentity | object | `{"clientId":"","tenantId":""}` | Global Settings to configure gpu-provisioner | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.11.3](https://github.com/norwoodj/helm-docs/releases/v1.11.3) diff --git a/charts/kaito/gpu-provisioner/templates/deployment.yaml b/charts/kaito/gpu-provisioner/templates/deployment.yaml index 804e95eb4..9c6ab6ab2 100644 --- a/charts/kaito/gpu-provisioner/templates/deployment.yaml +++ b/charts/kaito/gpu-provisioner/templates/deployment.yaml @@ -75,14 +75,6 @@ spec: image: {{ include "gpu-provisioner.controller.image" . }} imagePullPolicy: {{ .Values.imagePullPolicy }} env: - - name: KUBERNETES_MIN_VERSION - value: "1.19.0-0" - - name: KARPENTER_SERVICE - value: {{ include "gpu-provisioner.fullname" . }} - - name: METRICS_PORT - value: "{{ .Values.controller.metrics.port }}" - - name: HEALTH_PROBE_PORT - value: "{{ .Values.controller.healthProbe.port }}" - name: SYSTEM_NAMESPACE valueFrom: fieldRef: diff --git a/charts/kaito/gpu-provisioner/values.yaml b/charts/kaito/gpu-provisioner/values.yaml index b9c98c09c..56cdf933d 100644 --- a/charts/kaito/gpu-provisioner/values.yaml +++ b/charts/kaito/gpu-provisioner/values.yaml @@ -43,7 +43,7 @@ podLabels: {} # -- Additional annotations for the pod. podAnnotations: {} podDisruptionBudget: - name: karpenter + name: gpu-provisioner maxUnavailable: 1 # -- SecurityContext for the pod. 
podSecurityContext: @@ -111,15 +111,15 @@ controller: # -- Additional environment variables for the controller pod. env: - name: ARM_SUBSCRIPTION_ID - value: "null" + value: - name: LOCATION - value: eastus + value: - name: AZURE_CLUSTER_NAME - value: new_demo + value: - name: AZURE_NODE_RESOURCE_GROUP - value: MC_llm-test_new_demo_eastus + value: - name: ARM_RESOURCE_GROUP - value: llm-test + value: - name: LEADER_ELECT # disable leader election for better debugging experience value: "false" envFrom: [] @@ -151,16 +151,10 @@ logLevel: debug logEncoding: console # -- Global Settings to configure gpu-provisioner workloadIdentity: - clientId: 00411c3a-8361-42f3-8917-50b6da46e9fc - tenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47 + clientId: "" + tenantId: "" settings: # -- Azure-specific configuration values azure: # -- Cluster name. - clusterName: new_demo - # -- The global tags to use on all Azure infrastructure resources (VMs, etc.) - # TODO: not propagated yet ... - tags: - # -- Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates - # in Kubernetes. 
More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features - featureGates: + clusterName: diff --git a/go.mod b/go.mod index 75b405836..6608dd2e8 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,6 @@ require ( k8s.io/apimachinery v0.27.3 k8s.io/client-go v0.27.3 k8s.io/klog/v2 v2.100.1 - k8s.io/kubernetes v1.27.3 k8s.io/utils v0.0.0-20230209194617-a36077c30491 knative.dev/pkg v0.0.0-20230712131115-7051d301e7f4 sigs.k8s.io/controller-runtime v0.15.2 diff --git a/go.sum b/go.sum index 21f3e322b..c406e1052 100644 --- a/go.sum +++ b/go.sum @@ -666,8 +666,6 @@ k8s.io/klog/v2 v2.100.1 h1:7WCHKK6K8fNhTqfBhISHQ97KrnJNFZMcQvKp7gP/tmg= k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f h1:2kWPakN3i/k81b0gvD5C5FJ2kxm1WrQFanWchyKuqGg= k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3ebc/QwanvYwhuMWF6yz2F8uwW8eg= -k8s.io/kubernetes v1.27.3 h1:gwufSj7y6X18Q2Gl8v4Ev+AJHdzWkG7A8VNFffS9vu0= -k8s.io/kubernetes v1.27.3/go.mod h1:U8ZXeKBAPxeb4J4/HOaxjw1A9K6WfSH+fY2SS7CR6IM= k8s.io/utils v0.0.0-20230209194617-a36077c30491 h1:r0BAOLElQnnFhE/ApUsg3iHdVYYPBjNSSOMowRZxxsY= k8s.io/utils v0.0.0-20230209194617-a36077c30491/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= knative.dev/pkg v0.0.0-20230712131115-7051d301e7f4 h1:oO/BQJpVCFTSTMHF/S6u+nPtIvbHDTsvbPZvdCZAFjs=