From 6e7e2e79dc34a9cba3455d2362437106aa583b30 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Sun, 17 Mar 2024 09:17:19 -0700 Subject: [PATCH 1/6] release: update manifest and helm charts for v0.2.1 Signed-off-by: Heba Elayoty --- Makefile | 2 +- charts/kaito/workspace/Chart.yaml | 2 +- charts/kaito/workspace/README.md | 2 +- charts/kaito/workspace/values.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index d3df4896c..ff48d589e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # Image URL to use all building/pushing image targets REGISTRY ?= YOUR_REGISTRY IMG_NAME ?= workspace -VERSION ?= v0.2.0 +VERSION ?= v0.2.1 IMG_TAG ?= $(subst v,,$(VERSION)) ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) diff --git a/charts/kaito/workspace/Chart.yaml b/charts/kaito/workspace/Chart.yaml index d38144ea8..3a434eda9 100644 --- a/charts/kaito/workspace/Chart.yaml +++ b/charts/kaito/workspace/Chart.yaml @@ -6,7 +6,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 0.2.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. 
Versions are not expected to diff --git a/charts/kaito/workspace/README.md b/charts/kaito/workspace/README.md index dcfe202f5..497ecbfb5 100644 --- a/charts/kaito/workspace/README.md +++ b/charts/kaito/workspace/README.md @@ -5,7 +5,7 @@ ```bash export REGISTRY= export IMG_NAME=workspace -export IMG_TAG=0.2.0 +export IMG_TAG=0.2.1 helm install workspace ./charts/kaito/workspace --set image.repository=${REGISTRY}/$(IMG_NAME) --set image.tag=$(IMG_TAG) ``` diff --git a/charts/kaito/workspace/values.yaml b/charts/kaito/workspace/values.yaml index 77ff062b7..90ae02156 100644 --- a/charts/kaito/workspace/values.yaml +++ b/charts/kaito/workspace/values.yaml @@ -5,7 +5,7 @@ replicaCount: 1 image: repository: mcr.microsoft.com/aks/kaito/workspace pullPolicy: IfNotPresent - tag: 0.2.0 + tag: 0.2.1 imagePullSecrets: [] podAnnotations: {} podSecurityContext: From 0f768718553c94bfe9603ff1ac5832970b61b022 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Sun, 17 Mar 2024 09:19:11 -0700 Subject: [PATCH 2/6] Fix protobuf to address CVE-2024-24786 Signed-off-by: Heba Elayoty --- go.mod | 4 ++-- go.sum | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 33cee9053..7f4ee8996 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/onsi/gomega v1.27.8 github.com/samber/lo v1.38.1 github.com/stretchr/testify v1.8.4 + gopkg.in/yaml.v2 v2.4.0 gotest.tools v2.2.0+incompatible k8s.io/api v0.27.7 k8s.io/apimachinery v0.27.7 @@ -90,9 +91,8 @@ require ( google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect google.golang.org/grpc v1.56.3 // indirect - google.golang.org/protobuf v1.30.0 // indirect + google.golang.org/protobuf v1.33.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.27.2 // indirect k8s.io/component-base v0.27.7 // indirect diff --git a/go.sum b/go.sum 
index 7b101bc9b..b5eed9dcc 100644 --- a/go.sum +++ b/go.sum @@ -629,8 +629,8 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= -google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From a7a8d7855769006ff05b2673b3a2de204e85a77a Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Sun, 17 Mar 2024 09:21:11 -0700 Subject: [PATCH 3/6] Fix fastapi to address CVE-2024-24762 Signed-off-by: Heba Elayoty --- docker/presets/inference/llama-2/Dockerfile | 2 +- presets/tuning/tfs/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/presets/inference/llama-2/Dockerfile b/docker/presets/inference/llama-2/Dockerfile index 641d158bc..285cb122a 100644 --- a/docker/presets/inference/llama-2/Dockerfile +++ b/docker/presets/inference/llama-2/Dockerfile @@ -20,7 +20,7 @@ WORKDIR /workspace/llama RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . 
-RUN pip install torch==2.1.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 +RUN pip install torch==2.1.0 fastapi==0.109.1 pydantic==1.10.9 gputil==1.4.0 RUN pip install 'uvicorn[standard]' ARG WEIGHTS_PATH diff --git a/presets/tuning/tfs/requirements.txt b/presets/tuning/tfs/requirements.txt index 091e6d21f..9848f3e67 100644 --- a/presets/tuning/tfs/requirements.txt +++ b/presets/tuning/tfs/requirements.txt @@ -3,7 +3,7 @@ peft==0.8.2 transformers==4.38.2 torch==2.2.0 accelerate==0.27.2 -fastapi==0.103.2 +fastapi==0.109.1 pydantic==1.10.9 uvicorn[standard]==0.23.2 bitsandbytes==0.42.0 From 2c50dc99d3329d8821362c4d4c16e89d5e9d5b24 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Sun, 17 Mar 2024 09:36:20 -0700 Subject: [PATCH 4/6] release: update manifest and helm charts for v0.2.1 Signed-off-by: Heba Elayoty --- Makefile | 2 +- charts/kaito/workspace/Chart.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index ff48d589e..5f760153b 100644 --- a/Makefile +++ b/Makefile @@ -266,7 +266,7 @@ lint: $(GOLANGCI_LINT) .PHONY: release-manifest release-manifest: @sed -i -e 's/^VERSION ?= .*/VERSION ?= ${VERSION}/' ./Makefile - @sed -i -e "s/version: .*/version: ${IMG_TAG}/" ./charts/kaito/workspace/Chart.yaml + @sed -i -e "s/appVersion: .*/appVersion: ${IMG_TAG}/" ./charts/kaito/workspace/Chart.yaml @sed -i -e "s/tag: .*/tag: ${IMG_TAG}/" ./charts/kaito/workspace/values.yaml @sed -i -e 's/IMG_TAG=.*/IMG_TAG=${IMG_TAG}/' ./charts/kaito/workspace/README.md git checkout -b release-${VERSION} diff --git a/charts/kaito/workspace/Chart.yaml b/charts/kaito/workspace/Chart.yaml index 3a434eda9..61f069ac9 100644 --- a/charts/kaito/workspace/Chart.yaml +++ b/charts/kaito/workspace/Chart.yaml @@ -6,13 +6,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.2.1 +version: 0.2.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "0.2.0" +appVersion: "0.2.1" home: https://github.com/Azure/kaito sources: - https://github.com/Azure/kaito From e40b75a31e50dd9ab9d8c234899430fcbea37b26 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Sun, 17 Mar 2024 09:38:02 -0700 Subject: [PATCH 5/6] Update gpu-provisioner chart Signed-off-by: Heba Elayoty --- charts/kaito/gpu-provisioner/Chart.yaml | 4 +- charts/kaito/gpu-provisioner/README.md | 107 +++++++++--------- .../templates/clusterrole-core.yaml | 2 +- .../templates/configmap-logging.yaml | 2 +- .../gpu-provisioner/templates/deployment.yaml | 2 + charts/kaito/gpu-provisioner/values.yaml | 6 +- 6 files changed, 61 insertions(+), 62 deletions(-) diff --git a/charts/kaito/gpu-provisioner/Chart.yaml b/charts/kaito/gpu-provisioner/Chart.yaml index 20f58d8cc..83889e614 100644 --- a/charts/kaito/gpu-provisioner/Chart.yaml +++ b/charts/kaito/gpu-provisioner/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: gpu-provisioner description: A Helm chart for gpu-provisioner type: application -version: 0.1.0 -appVersion: 0.1.0 +version: 0.2.0 +appVersion: 0.2.0 sources: - https://github.com/Azure/gpu-provisioner maintainers: diff --git a/charts/kaito/gpu-provisioner/README.md b/charts/kaito/gpu-provisioner/README.md index 952d2c976..1f90ed0ac 100644 --- a/charts/kaito/gpu-provisioner/README.md +++ b/charts/kaito/gpu-provisioner/README.md @@ -1,6 +1,6 @@ # Karpenter Azure provider gpu-provisioner -![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: 
application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.1.0](https://img.shields.io/badge/AppVersion-0.1.0-informational?style=flat-square) +![Version: 0.2.0](https://img.shields.io/badge/Version-0.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.2.0](https://img.shields.io/badge/AppVersion-0.2.0-informational?style=flat-square) A Helm chart for gpu-provisioner @@ -9,63 +9,58 @@ A Helm chart for gpu-provisioner To install the chart with the release name `gpu-provisioner`: ```bash -helm install gpu-provisioner ./charts/kaito/gpu-provisioner +helm install gpu-provisioner ./charts/gpu-provisioner ``` ## Values -| Key | Type | Default | Description | -|------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | -| additionalClusterRoleRules | list | `[]` | Specifies additional rules for the core ClusterRole. | -| additionalLabels | object | `{}` | Additional labels to add into metadata. 
| -| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.azure.com/cluster","operator":"Exists"},{"key":"type","operator":"NotIn","values":["virtual-kubelet"]},{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]},{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. | -| controller.env | list | `[{"name":"ARM_SUBSCRIPTION_ID","value":null},{"name":"AZURE_CLUSTER_NAME","value":null},{"name":"AZURE_NODE_RESOURCE_GROUP","value":null},{"name":"ARM_RESOURCE_GROUP","value":null}]` | Additional environment variables for the controller pod. | -| controller.envFrom | list | `[]` | | -| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | -| controller.healthProbe.port | int | `8081` | The container port to use for http health probe. | -| controller.image.digest | string | `""` | SHA256 digest of the controller image. | -| controller.image.repository | string | `"ghcr.io/azure/gpu-provisioner"` | Repository path to the controller image. | -| controller.image.tag | string | `"0.1.0"` | Tag of the controller image. | -| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | -| controller.logLevel | string | `"debug"` | Controller log level, defaults to the global log level | -| controller.metrics.port | int | `8000` | The container port to use for metrics. 
| -| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | -| controller.resources | object | `{"limits":{"cpu":"500m"},"requests":{"cpu":"200m"}}` | Resources for the controller pod. | -| controller.securityContext | object | `{}` | SecurityContext for the controller container. | -| dnsConfig | object | `{}` | Configure DNS Config for the pod | -| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | -| extraVolumes | list | `[]` | Additional volumes for the pod. | -| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | -| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | -| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | -| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | -| logEncoding | string | `"console"` | Global log encoding | -| logLevel | string | `"debug"` | Global log level | -| nameOverride | string | `""` | Overrides the chart's name. | -| namespace | string | `"gpu-provisioner"` | | -| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | -| podAnnotations | object | `{}` | Additional annotations for the pod. | -| podDisruptionBudget.maxUnavailable | int | `1` | | -| podDisruptionBudget.name | string | `"karpenter"` | | -| podLabels | object | `{}` | Additional labels for the pod. | -| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | -| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | -| replicas | int | `1` | Number of replicas. | -| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | -| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. 
| -| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | -| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. | -| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | -| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | -| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | -| settings.azure | object | `{"clusterName":null}` | Azure-specific configuration values | -| settings.azure.clusterName | string | `nil` | Cluster name. | -| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | -| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. | -| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | -| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. 
| -| workloadIdentity | object | `{"clientId":"","tenantId":""}` | Global Settings to configure gpu-provisioner | +| Key | Type | Default | Description | +|------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| +| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | +| additionalLabels | object | `{}` | Additional labels to add into metadata. | +| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}}}` | Affinity rules for scheduling the pod. | +| controller.env | list | `[]` | Additional environment variables for the controller pod. | +| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | +| controller.extraVolumeMounts | list | `[]` | Additional volumeMounts for the controller pod. | +| controller.image.repository | string | `ghcr.io/azure/gpu-provisioner` | | +| controller.image.tag | string | `0.2.0` | | +| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | +| controller.logLevel | string | `""` | Controller log level, defaults to the global log level | +| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | +| controller.resources | object | `{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":1,"memory":"1Gi"}}` | Resources for the controller pod. | +| controller.securityContext | object | `{}` | SecurityContext for the controller container. 
| +| controller.sidecarContainer | object | `{}` | Additional sideCarContainer config - this will also inherit volume mounts from deployment | +| dnsConfig | object | `{}` | Configure DNS Config for the pod | +| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | +| extraVolumes | list | `[]` | Additional volumes for the pod. | +| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | +| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | +| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | +| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | +| logEncoding | string | `"console"` | Global log encoding | +| logLevel | string | `"debug"` | Global log level | +| nameOverride | string | `""` | Overrides the chart's name. | +| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | +| podAnnotations | object | `{}` | Additional annotations for the pod. | +| podDisruptionBudget.maxUnavailable | int | `1` | | +| podDisruptionBudget.name | string | `"karpenter"` | | +| podLabels | object | `{}` | Additional labels for the pod. | +| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | +| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | +| replicas | int | `2` | Number of replicas. | +| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | +| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. | +| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | +| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. 
| +| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | +| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | +| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | +| settings | object | `{"azure":{"clusterName":"","tags":null}}` | Global Settings to configure Karpenter | +| settings.azure | object | `{"clusterName":"","tags":null}` | Azure-specific configuration values | +| settings.azure.clusterName | string | `""` | Cluster name. | +| settings.azure.tags | string | `nil` | The global tags to use on all Azure infrastructure resources (launch templates, instances, SQS queue, etc.) | +| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | +| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. | +| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. 
| +| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | topologySpreadConstraints to increase the controller resilience | ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.3](https://github.com/norwoodj/helm-docs/releases/v1.11.3) diff --git a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml index b697ce02a..b2d8c5aa2 100644 --- a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml +++ b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml @@ -56,7 +56,7 @@ rules: verbs: ["create", "patch"] - apiGroups: [""] resources: ["nodes"] - verbs: ["patch", "delete"] + verbs: ["create", "patch", "delete"] - apiGroups: [""] resources: ["pods/eviction"] verbs: ["create"] diff --git a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml index e6dcbd46e..ce293dd76 100644 --- a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml +++ b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: config-logging + name: gpu-provisioner-config-logging namespace: {{ .Values.namespace }} labels: {{- include "gpu-provisioner.labels" . | nindent 4 }} diff --git a/charts/kaito/gpu-provisioner/templates/deployment.yaml b/charts/kaito/gpu-provisioner/templates/deployment.yaml index 9c6ab6ab2..4234e024f 100644 --- a/charts/kaito/gpu-provisioner/templates/deployment.yaml +++ b/charts/kaito/gpu-provisioner/templates/deployment.yaml @@ -75,6 +75,8 @@ spec: image: {{ include "gpu-provisioner.controller.image" . 
}} imagePullPolicy: {{ .Values.imagePullPolicy }} env: + - name: CONFIG_LOGGING_NAME + value: "gpu-provisioner-config-logging" - name: SYSTEM_NAMESPACE valueFrom: fieldRef: diff --git a/charts/kaito/gpu-provisioner/values.yaml b/charts/kaito/gpu-provisioner/values.yaml index 11a0103c0..312a733c5 100644 --- a/charts/kaito/gpu-provisioner/values.yaml +++ b/charts/kaito/gpu-provisioner/values.yaml @@ -101,9 +101,9 @@ extraVolumes: [] controller: image: # -- Repository path to the controller image. - repository: mcr.microsoft.com/aks/kaito/gpu-provisioner + repository: ghcr.io/azure/gpu-provisioner # -- Tag of the controller image. - tag: 0.1.0 + tag: 0.2.0 # -- SHA256 digest of the controller image. digest: "" # -- SecurityContext for the controller container. @@ -122,6 +122,8 @@ controller: value: - name: LEADER_ELECT # disable leader election for better debugging experience value: "false" + - name: E2E_TEST_MODE + value: "false" envFrom: [] # -- Resources for the controller pod. resources: From 7211d84bd4b7116bd386f7535cb049ba878875b4 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Sun, 17 Mar 2024 14:47:03 -0700 Subject: [PATCH 6/6] release: update manifest and helm charts for v0.2.1 Signed-off-by: Heba Elayoty --- Makefile | 2 -- charts/kaito/gpu-provisioner/README.md | 2 +- charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml | 2 +- charts/kaito/gpu-provisioner/values.yaml | 2 +- charts/kaito/workspace/Chart.yaml | 2 +- presets/tuning/tfs/requirements.txt | 2 +- 6 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 5f760153b..c3bfe3385 100644 --- a/Makefile +++ b/Makefile @@ -208,8 +208,6 @@ gpu-provisioner-helm: ## Update Azure client env vars and settings in helm valu $(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId")) $(eval AZURE_SUBSCRIPTION_ID=$(shell az account show | jq -r ".id")) - yq -i '(.controller.image.repository) = "mcr.microsoft.com/aks/kaito/gpu-provisioner"' 
./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.image.tag) = "0.1.0"' ./charts/kaito/gpu-provisioner/values.yaml yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/kaito/gpu-provisioner/values.yaml yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/kaito/gpu-provisioner/values.yaml yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/kaito/gpu-provisioner/values.yaml diff --git a/charts/kaito/gpu-provisioner/README.md b/charts/kaito/gpu-provisioner/README.md index 1f90ed0ac..d2dc949cd 100644 --- a/charts/kaito/gpu-provisioner/README.md +++ b/charts/kaito/gpu-provisioner/README.md @@ -22,7 +22,7 @@ helm install gpu-provisioner ./charts/gpu-provisioner | controller.env | list | `[]` | Additional environment variables for the controller pod. | | controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | | controller.extraVolumeMounts | list | `[]` | Additional volumeMounts for the controller pod. 
| -| controller.image.repository | string | `ghcr.io/azure/gpu-provisioner` | | +| controller.image.repository | string | `mcr.microsoft.com/aks/kaito/gpu-provisioner` | | | controller.image.tag | string | `0.2.0` | | | controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | | controller.logLevel | string | `""` | Controller log level, defaults to the global log level | diff --git a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml index b2d8c5aa2..b697ce02a 100644 --- a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml +++ b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml @@ -56,7 +56,7 @@ rules: verbs: ["create", "patch"] - apiGroups: [""] resources: ["nodes"] - verbs: ["create", "patch", "delete"] + verbs: ["patch", "delete"] - apiGroups: [""] resources: ["pods/eviction"] verbs: ["create"] diff --git a/charts/kaito/gpu-provisioner/values.yaml b/charts/kaito/gpu-provisioner/values.yaml index 312a733c5..8c40f9e8d 100644 --- a/charts/kaito/gpu-provisioner/values.yaml +++ b/charts/kaito/gpu-provisioner/values.yaml @@ -101,7 +101,7 @@ extraVolumes: [] controller: image: # -- Repository path to the controller image. - repository: ghcr.io/azure/gpu-provisioner + repository: mcr.microsoft.com/aks/kaito/gpu-provisioner # -- Tag of the controller image. tag: 0.2.0 # -- SHA256 digest of the controller image. diff --git a/charts/kaito/workspace/Chart.yaml b/charts/kaito/workspace/Chart.yaml index 61f069ac9..35c17d199 100644 --- a/charts/kaito/workspace/Chart.yaml +++ b/charts/kaito/workspace/Chart.yaml @@ -6,7 +6,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.2.0 +version: 0.2.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/presets/tuning/tfs/requirements.txt b/presets/tuning/tfs/requirements.txt index 9848f3e67..091e6d21f 100644 --- a/presets/tuning/tfs/requirements.txt +++ b/presets/tuning/tfs/requirements.txt @@ -3,7 +3,7 @@ peft==0.8.2 transformers==4.38.2 torch==2.2.0 accelerate==0.27.2 -fastapi==0.109.1 +fastapi==0.103.2 pydantic==1.10.9 uvicorn[standard]==0.23.2 bitsandbytes==0.42.0