diff --git a/Makefile b/Makefile index d3df4896c..c3bfe3385 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # Image URL to use all building/pushing image targets REGISTRY ?= YOUR_REGISTRY IMG_NAME ?= workspace -VERSION ?= v0.2.0 +VERSION ?= v0.2.1 IMG_TAG ?= $(subst v,,$(VERSION)) ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -208,8 +208,6 @@ gpu-provisioner-helm: ## Update Azure client env vars and settings in helm valu $(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId")) $(eval AZURE_SUBSCRIPTION_ID=$(shell az account show | jq -r ".id")) - yq -i '(.controller.image.repository) = "mcr.microsoft.com/aks/kaito/gpu-provisioner"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.image.tag) = "0.1.0"' ./charts/kaito/gpu-provisioner/values.yaml yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/kaito/gpu-provisioner/values.yaml yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/kaito/gpu-provisioner/values.yaml yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/kaito/gpu-provisioner/values.yaml @@ -266,7 +264,7 @@ lint: $(GOLANGCI_LINT) .PHONY: release-manifest release-manifest: @sed -i -e 's/^VERSION ?= .*/VERSION ?= ${VERSION}/' ./Makefile - @sed -i -e "s/version: .*/version: ${IMG_TAG}/" ./charts/kaito/workspace/Chart.yaml + @sed -i -e "s/appVersion: .*/appVersion: ${IMG_TAG}/" ./charts/kaito/workspace/Chart.yaml @sed -i -e "s/tag: .*/tag: ${IMG_TAG}/" ./charts/kaito/workspace/values.yaml @sed -i -e 's/IMG_TAG=.*/IMG_TAG=${IMG_TAG}/' ./charts/kaito/workspace/README.md git checkout -b release-${VERSION} diff --git a/charts/kaito/gpu-provisioner/Chart.yaml b/charts/kaito/gpu-provisioner/Chart.yaml index 20f58d8cc..83889e614 100644 --- a/charts/kaito/gpu-provisioner/Chart.yaml +++ b/charts/kaito/gpu-provisioner/Chart.yaml @@ -2,8 +2,8 @@ 
apiVersion: v2 name: gpu-provisioner description: A Helm chart for gpu-provisioner type: application -version: 0.1.0 -appVersion: 0.1.0 +version: 0.2.0 +appVersion: 0.2.0 sources: - https://github.com/Azure/gpu-provisioner maintainers: diff --git a/charts/kaito/gpu-provisioner/README.md b/charts/kaito/gpu-provisioner/README.md index 952d2c976..d2dc949cd 100644 --- a/charts/kaito/gpu-provisioner/README.md +++ b/charts/kaito/gpu-provisioner/README.md @@ -1,6 +1,6 @@ # Karpenter Azure provider gpu-provisioner -![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.1.0](https://img.shields.io/badge/AppVersion-0.1.0-informational?style=flat-square) +![Version: 0.2.0](https://img.shields.io/badge/Version-0.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.2.0](https://img.shields.io/badge/AppVersion-0.2.0-informational?style=flat-square) A Helm chart for gpu-provisioner @@ -9,63 +9,58 @@ A Helm chart for gpu-provisioner To install the chart with the release name `gpu-provisioner`: ```bash -helm install gpu-provisioner ./charts/kaito/gpu-provisioner +helm install gpu-provisioner ./charts/kaito/gpu-provisioner ``` ## Values -| Key | Type | Default | Description | 
-|------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | -| additionalClusterRoleRules | list | `[]` | Specifies additional rules for the core ClusterRole. | -| additionalLabels | object | `{}` | Additional labels to add into metadata. | -| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.azure.com/cluster","operator":"Exists"},{"key":"type","operator":"NotIn","values":["virtual-kubelet"]},{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]},{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. 
| -| controller.env | list | `[{"name":"ARM_SUBSCRIPTION_ID","value":null},{"name":"AZURE_CLUSTER_NAME","value":null},{"name":"AZURE_NODE_RESOURCE_GROUP","value":null},{"name":"ARM_RESOURCE_GROUP","value":null}]` | Additional environment variables for the controller pod. | -| controller.envFrom | list | `[]` | | -| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | -| controller.healthProbe.port | int | `8081` | The container port to use for http health probe. | -| controller.image.digest | string | `""` | SHA256 digest of the controller image. | -| controller.image.repository | string | `"ghcr.io/azure/gpu-provisioner"` | Repository path to the controller image. | -| controller.image.tag | string | `"0.1.0"` | Tag of the controller image. | -| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | -| controller.logLevel | string | `"debug"` | Controller log level, defaults to the global log level | -| controller.metrics.port | int | `8000` | The container port to use for metrics. | -| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | -| controller.resources | object | `{"limits":{"cpu":"500m"},"requests":{"cpu":"200m"}}` | Resources for the controller pod. | -| controller.securityContext | object | `{}` | SecurityContext for the controller container. | -| dnsConfig | object | `{}` | Configure DNS Config for the pod | -| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | -| extraVolumes | list | `[]` | Additional volumes for the pod. | -| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | -| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | -| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | -| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. 
| -| logEncoding | string | `"console"` | Global log encoding | -| logLevel | string | `"debug"` | Global log level | -| nameOverride | string | `""` | Overrides the chart's name. | -| namespace | string | `"gpu-provisioner"` | | -| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | -| podAnnotations | object | `{}` | Additional annotations for the pod. | -| podDisruptionBudget.maxUnavailable | int | `1` | | -| podDisruptionBudget.name | string | `"karpenter"` | | -| podLabels | object | `{}` | Additional labels for the pod. | -| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | -| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | -| replicas | int | `1` | Number of replicas. | -| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | -| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. | -| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | -| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. | -| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | -| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | -| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | -| settings.azure | object | `{"clusterName":null}` | Azure-specific configuration values | -| settings.azure.clusterName | string | `nil` | Cluster name. | -| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | -| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. 
| -| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | -| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. | -| workloadIdentity | object | `{"clientId":"","tenantId":""}` | Global Settings to configure gpu-provisioner | +| Key | Type | Default | Description | +|------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| +| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | +| additionalLabels | object | `{}` | Additional labels to add into metadata. | +| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}}}` | Affinity rules for scheduling the pod. | +| controller.env | list | `[]` | Additional environment variables for the controller pod. | +| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | +| controller.extraVolumeMounts | list | `[]` | Additional volumeMounts for the controller pod. 
| +| controller.image.repository | string | `mcr.microsoft.com/aks/kaito/gpu-provisioner` | | +| controller.image.tag | string | `0.2.0` | | +| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | +| controller.logLevel | string | `""` | Controller log level, defaults to the global log level | +| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | +| controller.resources | object | `{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":1,"memory":"1Gi"}}` | Resources for the controller pod. | +| controller.securityContext | object | `{}` | SecurityContext for the controller container. | +| controller.sidecarContainer | object | `{}` | Additional sideCarContainer config - this will also inherit volume mounts from deployment | +| dnsConfig | object | `{}` | Configure DNS Config for the pod | +| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | +| extraVolumes | list | `[]` | Additional volumes for the pod. | +| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | +| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | +| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | +| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | +| logEncoding | string | `"console"` | Global log encoding | +| logLevel | string | `"debug"` | Global log level | +| nameOverride | string | `""` | Overrides the chart's name. | +| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | +| podAnnotations | object | `{}` | Additional annotations for the pod. | +| podDisruptionBudget.maxUnavailable | int | `1` | | +| podDisruptionBudget.name | string | `"karpenter"` | | +| podLabels | object | `{}` | Additional labels for the pod. 
| +| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | +| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | +| replicas | int | `2` | Number of replicas. | +| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | +| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. | +| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | +| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. | +| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | +| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | +| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | +| settings | object | `{"azure":{"clusterName":"","tags":null}}` | Global Settings to configure Karpenter | +| settings.azure | object | `{"clusterName":"","tags":null}` | Azure-specific configuration values | +| settings.azure.clusterName | string | `""` | Cluster name. | +| settings.azure.tags | string | `nil` | The global tags to use on all Azure infrastructure resources (launch templates, instances, SQS queue, etc.) | +| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | +| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. | +| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. 
| +| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | topologySpreadConstraints to increase the controller resilience | ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.3](https://github.com/norwoodj/helm-docs/releases/v1.11.3) diff --git a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml index e6dcbd46e..ce293dd76 100644 --- a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml +++ b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: config-logging + name: gpu-provisioner-config-logging namespace: {{ .Values.namespace }} labels: {{- include "gpu-provisioner.labels" . | nindent 4 }} diff --git a/charts/kaito/gpu-provisioner/templates/deployment.yaml b/charts/kaito/gpu-provisioner/templates/deployment.yaml index 9c6ab6ab2..4234e024f 100644 --- a/charts/kaito/gpu-provisioner/templates/deployment.yaml +++ b/charts/kaito/gpu-provisioner/templates/deployment.yaml @@ -75,6 +75,8 @@ spec: image: {{ include "gpu-provisioner.controller.image" . }} imagePullPolicy: {{ .Values.imagePullPolicy }} env: + - name: CONFIG_LOGGING_NAME + value: "gpu-provisioner-config-logging" - name: SYSTEM_NAMESPACE valueFrom: fieldRef: diff --git a/charts/kaito/gpu-provisioner/values.yaml b/charts/kaito/gpu-provisioner/values.yaml index 11a0103c0..8c40f9e8d 100644 --- a/charts/kaito/gpu-provisioner/values.yaml +++ b/charts/kaito/gpu-provisioner/values.yaml @@ -103,7 +103,7 @@ controller: # -- Repository path to the controller image. repository: mcr.microsoft.com/aks/kaito/gpu-provisioner # -- Tag of the controller image. - tag: 0.1.0 + tag: 0.2.0 # -- SHA256 digest of the controller image. digest: "" # -- SecurityContext for the controller container. 
@@ -122,6 +122,8 @@ controller: value: - name: LEADER_ELECT # disable leader election for better debugging experience value: "false" + - name: E2E_TEST_MODE + value: "false" envFrom: [] # -- Resources for the controller pod. resources: diff --git a/charts/kaito/workspace/Chart.yaml b/charts/kaito/workspace/Chart.yaml index d38144ea8..35c17d199 100644 --- a/charts/kaito/workspace/Chart.yaml +++ b/charts/kaito/workspace/Chart.yaml @@ -6,13 +6,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 0.2.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: "0.2.0" +appVersion: "0.2.1" home: https://github.com/Azure/kaito sources: - https://github.com/Azure/kaito diff --git a/charts/kaito/workspace/README.md b/charts/kaito/workspace/README.md index dcfe202f5..497ecbfb5 100644 --- a/charts/kaito/workspace/README.md +++ b/charts/kaito/workspace/README.md @@ -5,7 +5,7 @@ ```bash export REGISTRY= export IMG_NAME=workspace -export IMG_TAG=0.2.0 +export IMG_TAG=0.2.1 helm install workspace ./charts/kaito/workspace --set image.repository=${REGISTRY}/$(IMG_NAME) --set image.tag=$(IMG_TAG) ``` diff --git a/charts/kaito/workspace/values.yaml b/charts/kaito/workspace/values.yaml index 77ff062b7..90ae02156 100644 --- a/charts/kaito/workspace/values.yaml +++ b/charts/kaito/workspace/values.yaml @@ -5,7 +5,7 @@ replicaCount: 1 image: repository: mcr.microsoft.com/aks/kaito/workspace pullPolicy: IfNotPresent - tag: 0.2.0 + tag: 0.2.1 imagePullSecrets: [] podAnnotations: {} podSecurityContext: diff --git a/docker/presets/inference/llama-2/Dockerfile b/docker/presets/inference/llama-2/Dockerfile index 641d158bc..285cb122a 100644 --- a/docker/presets/inference/llama-2/Dockerfile +++ b/docker/presets/inference/llama-2/Dockerfile @@ -20,7 +20,7 @@ WORKDIR /workspace/llama RUN ["/bin/bash", "-c", "sed -i $'/torch.distributed.init_process_group(\"nccl\")/c\\ import datetime\\\n torch.distributed.init_process_group(\"nccl\", timeout=datetime.timedelta(days=365*100))' /workspace/llama/llama/generation.py"] RUN pip install -e . 
-RUN pip install torch==2.1.0 fastapi==0.103.2 pydantic==1.10.9 gputil==1.4.0 +RUN pip install torch==2.1.0 fastapi==0.109.1 pydantic==1.10.9 gputil==1.4.0 RUN pip install 'uvicorn[standard]' ARG WEIGHTS_PATH diff --git a/go.mod b/go.mod index 33cee9053..7f4ee8996 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/onsi/gomega v1.27.8 github.com/samber/lo v1.38.1 github.com/stretchr/testify v1.8.4 + gopkg.in/yaml.v2 v2.4.0 gotest.tools v2.2.0+incompatible k8s.io/api v0.27.7 k8s.io/apimachinery v0.27.7 @@ -90,9 +91,8 @@ require ( google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect google.golang.org/grpc v1.56.3 // indirect - google.golang.org/protobuf v1.30.0 // indirect + google.golang.org/protobuf v1.33.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.27.2 // indirect k8s.io/component-base v0.27.7 // indirect diff --git a/go.sum b/go.sum index 7b101bc9b..b5eed9dcc 100644 --- a/go.sum +++ b/go.sum @@ -629,8 +629,8 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= -google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=