diff --git a/.github/workflows/kaito-workspace-e2e.yaml b/.github/workflows/kaito-e2e.yaml similarity index 64% rename from .github/workflows/kaito-workspace-e2e.yaml rename to .github/workflows/kaito-e2e.yaml index bfd0d450a..0acf93e62 100644 --- a/.github/workflows/kaito-workspace-e2e.yaml +++ b/.github/workflows/kaito-e2e.yaml @@ -46,7 +46,7 @@ jobs: fi echo "VERSION=${rand}" >> $GITHUB_ENV - echo "CLUSTER_NAME=gpuprov${rand}" >> $GITHUB_ENV + echo "CLUSTER_NAME=kaito${rand}" >> $GITHUB_ENV - uses: azure/login@v1.4.6 with: @@ -74,6 +74,11 @@ jobs: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} + - name: Create Azure Identity + shell: bash + run: | + az identity create --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }} + - name: build KAITO image shell: bash run: | @@ -91,6 +96,34 @@ jobs: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + - name: Install gpu-provisioner helm chart + shell: bash + run: | + make gpu-provisioner-helm + kubectl wait --for=condition=available deploy "kaito-gpu-provisioner" -n gpu-provisioner --timeout=300s + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + + - uses: azure/login@v1.4.6 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + + - name: Create Role Assignment + shell: bash + run: | + IDENTITY_PRINCIPAL_ID="$(az identity show --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }} --query 'principalId' -otsv)" + az role assignment create --assignee ${IDENTITY_PRINCIPAL_ID} --scope "/subscriptions/${{ secrets.AZURE_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor" + + - name: Create Azure Federated Identity + shell: bash + run: | + AKS_OIDC_ISSUER="$(az aks show -n "${{ env.CLUSTER_NAME }}" -g "${{ env.CLUSTER_NAME }}" --query 
'oidcIssuerProfile.issuerUrl' -otsv)" + az identity federated-credential create --name gpu-fed-credential --identity-name gpuIdentity --resource-group "${{ env.CLUSTER_NAME }}" \ + --issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange + - name: Install KAITO Workspace helm chart shell: bash run: | diff --git a/Makefile b/Makefile index a6604d828..86714ea96 100644 --- a/Makefile +++ b/Makefile @@ -15,10 +15,10 @@ GOLANGCI_LINT_BIN := golangci-lint GOLANGCI_LINT := $(abspath $(TOOLS_BIN_DIR)/$(GOLANGCI_LINT_BIN)-$(GOLANGCI_LINT_VER)) -AZURE_SUBSCRIPTION_ID ?= ff05f55d-22b5-44a7-b704-f9a8efd493ed +AZURE_SUBSCRIPTION_ID ?= $(shell az account show --query id -o tsv) AZURE_LOCATION ?= eastus -AZURE_RESOURCE_GROUP ?= kaito-test -AZURE_CLUSTER_NAME ?= kaito-test +AZURE_RESOURCE_GROUP ?= demo +AZURE_CLUSTER_NAME ?= kaito-demo AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION) # Scripts @@ -38,7 +38,6 @@ endif $(GOLANGCI_LINT): GOBIN=$(TOOLS_BIN_DIR) $(GO_INSTALL) github.com/golangci/golangci-lint/cmd/golangci-lint $(GOLANGCI_LINT_BIN) $(GOLANGCI_LINT_VER) - # CONTAINER_TOOL defines the container tool to be used for building images. # Be aware that the target commands are only tested with Docker which is # scaffolded by default. However, you might want to replace it to use other @@ -50,26 +49,6 @@ CONTAINER_TOOL ?= docker SHELL = /usr/bin/env bash -o pipefail .SHELLFLAGS = -ec -.PHONY: all -all: build - -##@ General - -# The help target prints out all targets with their descriptions organized -# beneath their categories. The categories are represented by '##@' and the -# target descriptions by '##'. The awk commands is responsible for reading the -# entire set of makefiles included in this invocation, looking for lines of the -# file as xyz: ## something, and then pretty-format the target and help. 
Then, -# if there's a line with ##@ something, that gets pretty-printed as a category. -# More info on the usage of ANSI control characters for terminal formatting: -# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters -# More info on the awk command: -# http://linuxcommand.org/lc3_adv_awk.php - -.PHONY: help -help: ## Display this help. - @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) - ##@ Development .PHONY: manifests @@ -126,20 +105,20 @@ create-aks-cluster: ## Create test AKS cluster (with msi, oidc and workload iden az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --attach-acr $(AZURE_ACR_NAME) \ --node-count 1 --generate-ssh-keys --enable-managed-identity --enable-workload-identity --enable-oidc-issuer -o none - az aks nodepool add --cluster-name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --name gpunode \ - --node-count 1 --node-vm-size standard_nc96ads_a100_v4 --node-taints sku=gpu:NoSchedule \ - --aks-custom-headers UseGPUDedicatedVHD=true --node-osdisk-size 300 + # az aks nodepool add --cluster-name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --name gpunode \ + # --node-count 1 --node-vm-size standard_nc96ads_a100_v4 --node-taints sku=gpu:NoSchedule \ + # --aks-custom-headers UseGPUDedicatedVHD=true --node-osdisk-size 300 az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) .PHONY: az-patch-install-helm -az-patch-install-helm: ## Update Azure client env vars and settings in helm values.yml +az-patch-install-helm: ## Update Azure client env vars and settings in helm values.yml az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - yq -i '(.image.repository) = "$(REGISTRY)/workspace"' ./charts/kaito/values.yaml - yq -i 
'(.image.tag) = "$(IMG_TAG)"' ./charts/kaito/values.yaml + yq -i '(.image.repository) = "$(REGISTRY)/workspace"' ./charts/kaito/workspace/values.yaml + yq -i '(.image.tag) = "$(IMG_TAG)"' ./charts/kaito/workspace/values.yaml - helm install kaito-workspace ./charts/kaito + helm install kaito-workspace ./charts/kaito/workspace ##@ Build @@ -180,22 +159,40 @@ ifndef ignore-not-found ignore-not-found = false endif -.PHONY: install -install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. - $(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f - +##@ gpu-provisioner +.PHONY: gpu-provisioner-identity-perm +gpu-provisioner-identity-perm: ## Create gpuIdentity, grant it Contributor on the resource group and federate it with the gpu-provisioner service account + az identity create --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) -.PHONY: uninstall -uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. - $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + @# Recipe lines run in separate shells and make expands its shell function before any line runs, + @# so the principal ID is looked up by the shell itself, after the identity has been created. -.PHONY: deploy -deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. - cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} - $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - + az role assignment create --assignee "$$(az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId' -o tsv)" --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor" -.PHONY: undeploy -undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. 
Call with ignore-not-found=true to ignore resource not found errors during deletion. - $(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + @# The OIDC issuer URL is likewise resolved by the shell when the command below runs. + + az identity federated-credential create --name gpu-federatecredential --identity-name gpuIdentity --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer "$$(az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl" -o tsv)" \ + --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) + +.PHONY: gpu-provisioner-helm +gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) + $(eval IDENTITY_CLIENT_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --query 'clientId' -o tsv)) + $(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId")) + $(eval AZURE_SUBSCRIPTION_ID=$(shell az account show | jq -r ".subscriptionId")) + + yq -i '(.controller.image.repository) = "mcr.microsoft.com/aks/kaito/gpu-provisioner"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.controller.image.tag) = "0.0.1"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.controller.env[] | select(.name=="AZURE_NODE_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP_MC)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i 
'(.controller.env[] | select(.name=="AZURE_CLUSTER_NAME")) .value = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.settings.azure.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.workloadIdentity.clientId) = "$(IDENTITY_CLIENT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml + yq -i '(.workloadIdentity.tenantId) = "$(AZURE_TENANT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml + + helm install kaito-gpu-provisioner ./charts/kaito/gpu-provisioner ##@ Build Dependencies @@ -206,28 +203,18 @@ $(LOCALBIN): ## Tool Binaries KUBECTL ?= kubectl -KUSTOMIZE ?= $(LOCALBIN)/kustomize CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen ENVTEST ?= $(LOCALBIN)/setup-envtest ## Tool Versions -KUSTOMIZE_VERSION ?= v5.0.1 CONTROLLER_TOOLS_VERSION ?= v0.12.0 -.PHONY: kustomize -kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. If wrong version is installed, it will be removed before downloading. -$(KUSTOMIZE): $(LOCALBIN) - @if test -x $(LOCALBIN)/kustomize && ! $(LOCALBIN)/kustomize version | grep -q $(KUSTOMIZE_VERSION); then \ - echo "$(LOCALBIN)/kustomize version is not expected $(KUSTOMIZE_VERSION). Removing it before installing."; \ - rm -rf $(LOCALBIN)/kustomize; \ - fi - test -s $(LOCALBIN)/kustomize || GOBIN=$(LOCALBIN) GO111MODULE=on go install sigs.k8s.io/kustomize/kustomize/v5@$(KUSTOMIZE_VERSION) - .PHONY: controller-gen controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. If wrong version is installed, it will be overwritten. $(CONTROLLER_GEN): $(LOCALBIN) test -s $(LOCALBIN)/controller-gen && $(LOCALBIN)/controller-gen --version | grep -q $(CONTROLLER_TOOLS_VERSION) || \ GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION) + cp config/crd/bases/kaito.sh_workspaces.yaml charts/kaito/workspace/crds/ .PHONY: envtest envtest: $(ENVTEST) ## Download envtest-setup locally if necessary. 
@@ -252,9 +239,11 @@ lint: $(GOLANGCI_LINT) .PHONY: release-manifest release-manifest: @sed -i -e 's/^VERSION ?= .*/VERSION ?= ${VERSION}/' ./Makefile - @sed -i -e "s/version: .*/version: ${IMG_TAG}/" ./charts/kaito/Chart.yaml - @sed -i -e "s/tag: .*/tag: ${IMG_TAG}/" ./charts/kaito/values.yaml - @sed -i -e 's/IMG_TAG=.*/IMG_TAG=${IMG_TAG}/' ./charts/README.md + @sed -i -e "s/version: .*/version: ${IMG_TAG}/" ./charts/kaito/workspace/Chart.yaml + @sed -i -e "s/appVersion: .*/appVersion: ${IMG_TAG}/" ./charts/kaito/workspace/Chart.yaml + @sed -i -e "s/tag: .*/tag: ${IMG_TAG}/" ./charts/kaito/workspace/values.yaml + @sed -i -e 's/IMG_TAG=.*/IMG_TAG=${IMG_TAG}/' ./charts/kaito/workspace/README.md + @sed -i -e 's/image.tag=.*/image.tag=${IMG_TAG}/' ./charts/kaito/workspace/README.md git checkout -b release-${VERSION} - git add ./Makefile ./charts/kaito/Chart.yaml ./charts/kaito/values.yaml ./charts/README.md + git add ./Makefile ./charts/kaito/workspace/Chart.yaml ./charts/kaito/workspace/values.yaml ./charts/kaito/workspace/README.md git commit -s -m "release: update manifest and helm charts for ${VERSION}" diff --git a/charts/README.md b/charts/README.md deleted file mode 100644 index 39122191b..000000000 --- a/charts/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# KAITO Helm Chart - -## Install - -```bash -export REGISTRY= -export IMG_NAME=kaito -export IMG_TAG=0.0.1 -helm install kaito-workspace ./charts/kaito --set image.repository=${REGISTRY}/$(IMG_NAME) --set image.tag=$(IMG_TAG) -``` - -## Configuration - -The following table lists the configurable parameters of the KAITO chart and their default values. 
- -| Parameter | Description | Default | -|--------------------------------------------|-------------|-------------------------- | -| `replicaCount` | | `1` | -| `image.repository` | | `ghcr.io/Azure/kaito/kaito` | -| `image.pullPolicy` | | `"IfNotPresent"` | -| `image.tag` | | `latest` | -| `imagePullSecrets` | | `[]` | -| `podAnnotations` | | `{}` | -| `podSecurityContext.runAsNonRoot` | | `true` | -| `securityContext.allowPrivilegeEscalation` | | `false` | -| `securityContext.capabilities.drop` | | `["ALL"]` | -| `resources.limits.cpu` | | `"500m"` | -| `resources.limits.memory` | | `"128Mi"` | -| `resources.requests.cpu` | | `"10m"` | -| `resources.requests.memory` | | `"64Mi"` | -| `nodeSelector` | | `{}` | -| `tolerations` | | `[]` | -| `affinity` | | `{}` | diff --git a/charts/kaito/.helmignore b/charts/kaito/gpu-provisioner/.helmignore similarity index 100% rename from charts/kaito/.helmignore rename to charts/kaito/gpu-provisioner/.helmignore diff --git a/charts/kaito/gpu-provisioner/Chart.yaml b/charts/kaito/gpu-provisioner/Chart.yaml new file mode 100644 index 000000000..9c361639b --- /dev/null +++ b/charts/kaito/gpu-provisioner/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: gpu-provisioner +description: A Helm chart for gpu-provisioner +type: application +version: 0.0.1 +appVersion: 0.1.0 +sources: +- https://github.com/Azure/gpu-provisioner +maintainers: + - name: Fei-Guo + email: vrgf2003@gmail.com + - name: helayoty + email: hebaelayoty@gmail.com diff --git a/charts/kaito/gpu-provisioner/README.md b/charts/kaito/gpu-provisioner/README.md new file mode 100644 index 000000000..5fcf189b9 --- /dev/null +++ b/charts/kaito/gpu-provisioner/README.md @@ -0,0 +1,70 @@ +# Karpenter Azure provider gpu-provisioner + +![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 
0.1.0](https://img.shields.io/badge/AppVersion-0.1.0-informational?style=flat-square) + +A Helm chart for gpu-provisioner + +## Installing the Chart + +To install the chart with the release name `gpu-provisioner`: + +```bash +helm install gpu-provisioner ./charts/kaito/gpu-provisioner +``` + +## Values + +| Key | Type | Default | Description | +|------------------------------------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | +| additionalClusterRoleRules | list | `[]` | Specifies additional rules for the core ClusterRole. | +| additionalLabels | object | `{}` | Additional labels to add into metadata. 
| +| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.azure.com/cluster","operator":"Exists"},{"key":"type","operator":"NotIn","values":["virtual-kubelet"]},{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]},{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. | +| controller.env | list | `[{"name":"ARM_SUBSCRIPTION_ID","value":null},{"name":"LOCATION","value":null},{"name":"AZURE_CLUSTER_NAME","value":null},{"name":"AZURE_NODE_RESOURCE_GROUP","value":null},{"name":"ARM_RESOURCE_GROUP","value":"l"},{"name":"LEADER_ELECT","value":"false"}]` | Additional environment variables for the controller pod. | +| controller.envFrom | list | `[]` | | +| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | +| controller.healthProbe.port | int | `8081` | The container port to use for http health probe. | +| controller.image.digest | string | `""` | SHA256 digest of the controller image. | +| controller.image.repository | string | `"mcr.microsoft.com/aks/kaito/gpu-provisioner"` | Repository path to the controller image. | +| controller.image.tag | string | `"0.0.1"` | Tag of the controller image. | +| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | +| controller.logLevel | string | `"debug"` | Controller log level, defaults to the global log level | +| controller.metrics.port | int | `8000` | The container port to use for metrics. 
| +| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | +| controller.resources | object | `{"limits":{"cpu":"500m"},"requests":{"cpu":"200m"}}` | Resources for the controller pod. | +| controller.securityContext | object | `{}` | SecurityContext for the controller container. | +| dnsConfig | object | `{}` | Configure DNS Config for the pod | +| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | +| extraVolumes | list | `[]` | Additional volumes for the pod. | +| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | +| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | +| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | +| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | +| logEncoding | string | `"console"` | Global log encoding | +| logLevel | string | `"debug"` | Global log level | +| nameOverride | string | `""` | Overrides the chart's name. | +| namespace | string | `"gpu-provisioner"` | | +| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | +| podAnnotations | object | `{}` | Additional annotations for the pod. | +| podDisruptionBudget.maxUnavailable | int | `1` | | +| podDisruptionBudget.name | string | `"karpenter"` | | +| podLabels | object | `{}` | Additional labels for the pod. | +| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | +| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | +| replicas | int | `1` | Number of replicas. | +| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | +| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. 
| +| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | +| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. | +| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | +| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | +| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | +| settings.azure | object | `{"clusterName":"new_demo","tags":null}` | Azure-specific configuration values | +| settings.azure.clusterName | string | `"new_demo"` | Cluster name. | +| settings.azure.tags | string | `nil` | The global tags to use on all Azure infrastructure resources (VMs, etc.) TODO: not propagated yet ... | +| settings.featureGates | string | `nil` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | +| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | +| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. | +| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | +| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. 
| +| workloadIdentity | object | `{"clientId":null,"tenantId":null}` | Global Settings to configure gpu-provisioner | diff --git a/charts/kaito/gpu-provisioner/crds/karpenter.sh_machines.yaml b/charts/kaito/gpu-provisioner/crds/karpenter.sh_machines.yaml new file mode 100644 index 000000000..0304d1fce --- /dev/null +++ b/charts/kaito/gpu-provisioner/crds/karpenter.sh_machines.yaml @@ -0,0 +1,293 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.0 + name: machines.karpenter.sh +spec: + group: karpenter.sh + names: + categories: + - karpenter + kind: Machine + listKind: MachineList + plural: machines + singular: machine + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .metadata.labels.node\.kubernetes\.io/instance-type + name: Type + type: string + - jsonPath: .metadata.labels.karpenter\.k8s\.azure/zone + name: Zone + type: string + - jsonPath: .status.nodeName + name: Node + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .metadata.labels.karpenter\.sh/capacity-type + name: Capacity + priority: 1 + type: string + - jsonPath: .metadata.labels.karpenter\.sh/provisioner-name + name: Provisioner + priority: 1 + type: string + - jsonPath: .spec.machineTemplateRef.name + name: Template + priority: 1 + type: string + name: v1alpha5 + schema: + openAPIV3Schema: + description: Machine is the Schema for the Machines API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: MachineSpec describes the desired state of the Machine + properties: + kubelet: + description: Kubelet are options passed to the kubelet when provisioning nodes + properties: + clusterDNS: + description: clusterDNS is a list of IP addresses for the cluster DNS server. Note that not all providers may use all addresses. + items: + type: string + type: array + containerRuntime: + description: ContainerRuntime is the container runtime to be used with your worker nodes. + type: string + cpuCFSQuota: + description: CPUCFSQuota enables CPU CFS quota enforcement for containers that specify CPU limits. + type: boolean + evictionHard: + additionalProperties: + type: string + description: EvictionHard is the map of signal names to quantities that define hard eviction thresholds + type: object + evictionMaxPodGracePeriod: + description: EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in response to soft eviction thresholds being met. 
+ format: int32 + type: integer + evictionSoft: + additionalProperties: + type: string + description: EvictionSoft is the map of signal names to quantities that define soft eviction thresholds + type: object + evictionSoftGracePeriod: + additionalProperties: + type: string + description: EvictionSoftGracePeriod is the map of signal names to quantities that define grace periods for each eviction signal + type: object + imageGCHighThresholdPercent: + description: ImageGCHighThresholdPercent is the percent of disk usage after which image garbage collection is always run. The percent is calculated by dividing this field value by 100, so this field must be between 0 and 100, inclusive. When specified, the value must be greater than ImageGCLowThresholdPercent. + format: int32 + maximum: 100 + minimum: 0 + type: integer + imageGCLowThresholdPercent: + description: ImageGCLowThresholdPercent is the percent of disk usage before which image garbage collection is never run. Lowest disk usage to garbage collect to. The percent is calculated by dividing this field value by 100, so the field value must be between 0 and 100, inclusive. When specified, the value must be less than imageGCHighThresholdPercent + format: int32 + maximum: 100 + minimum: 0 + type: integer + kubeReserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: KubeReserved contains resources reserved for Kubernetes system components. + type: object + maxPods: + description: MaxPods is an override for the maximum number of pods that can run on a worker node instance. + format: int32 + minimum: 0 + type: integer + podsPerCore: + description: PodsPerCore is an override for the number of pods that can run on a worker node instance based on the number of cpu cores. 
This value cannot exceed MaxPods, so, if MaxPods is a lower value, that value will be used. + format: int32 + minimum: 0 + type: integer + systemReserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: SystemReserved contains resources reserved for OS system daemons and kernel memory. + type: object + type: object + machineTemplateRef: + description: MachineTemplateRef is a reference to an object that defines provider specific configuration + properties: + apiVersion: + description: API version of the referent + type: string + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + required: + - name + type: object + requirements: + description: Requirements are layered with Labels and applied to every node. + items: + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: The label key that the selector applies to. + type: string + operator: + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
+ items: + type: string + type: array + required: + - key + - operator + type: object + type: array + resources: + description: Resources models the resource requirements for the Machine to launch + properties: + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Requests describes the minimum required resources for the Machine to launch + type: object + type: object + startupTaints: + description: StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: + description: The node this Taint is attached to has the "effect" on any pod that does not tolerate the Taint. + properties: + effect: + description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + type: array + taints: + description: Taints will be applied to the machine's node. + items: + description: The node this Taint is attached to has the "effect" on any pod that does not tolerate the Taint. + properties: + effect: + description: Required. 
The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + type: array + type: object + status: + description: MachineStatus defines the observed state of Machine + properties: + allocatable: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Allocatable is the estimated allocatable capacity of the machine + type: object + capacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Capacity is the estimated full capacity of the machine + type: object + conditions: + description: Conditions contains signals for health and readiness + items: + description: 'Condition defines a readiness condition for a Knative resource. See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties' + properties: + lastTransitionTime: + description: LastTransitionTime is the last time the condition transitioned from one status to another. We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic differences (all other things held constant). 
+ type: string + message: + description: A human readable message indicating details about the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + severity: + description: Severity with which to treat failures of this type of condition. When this is not specified, it defaults to Error. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of condition. + type: string + required: + - status + - type + type: object + type: array + nodeName: + description: NodeName is the name of the corresponding node object + type: string + providerID: + description: ProviderID of the corresponding node object + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/kaito/gpu-provisioner/templates/_helpers.tpl b/charts/kaito/gpu-provisioner/templates/_helpers.tpl new file mode 100644 index 000000000..15b644f1f --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/_helpers.tpl @@ -0,0 +1,180 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "gpu-provisioner.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "gpu-provisioner.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "gpu-provisioner.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "gpu-provisioner.labels" -}} +helm.sh/chart: {{ include "gpu-provisioner.chart" . }} +{{ include "gpu-provisioner.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- with .Values.additionalLabels }} +{{ toYaml . }} +{{- end }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "gpu-provisioner.selectorLabels" -}} +app.kubernetes.io/name: {{ include "gpu-provisioner.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +gpu-provisioner image to use +*/}} +{{- define "gpu-provisioner.controller.image" -}} +{{- if .Values.controller.image.digest }} +{{- printf "%s:%s@%s" .Values.controller.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.controller.image.tag) .Values.controller.image.digest }} +{{- else }} +{{- printf "%s:%s" .Values.controller.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.controller.image.tag) }} +{{- end }} +{{- end }} + + +{{/* Get PodDisruptionBudget API Version */}} +{{- define "gpu-provisioner.pdb.apiVersion" -}} +{{- if and (.Capabilities.APIVersions.Has "policy/v1") (semverCompare ">= 1.21-0" .Capabilities.KubeVersion.Version) -}} +{{- print "policy/v1" -}} +{{- else -}} +{{- print "policy/v1beta1" -}} +{{- end -}} +{{- end -}} + +{{/* +Patch the label selector on an object +This template will add a labelSelector using matchLabels to the object referenced at _target if there is no labelSelector specified. +The matchLabels are created with the selectorLabels template. +This works because Helm treats dictionaries as mutable objects and allows passing them by reference. +*/}} +{{- define "gpu-provisioner.patchLabelSelector" -}} +{{- if not (hasKey ._target "labelSelector") }} +{{- $selectorLabels := (include "gpu-provisioner.selectorLabels" .) | fromYaml }} +{{- $_ := set ._target "labelSelector" (dict "matchLabels" $selectorLabels) }} +{{- end }} +{{- end }} + +{{/* +Patch pod affinity +This template uses the patchLabelSelector template to add a labelSelector to pod affinity objects if there is no labelSelector specified. +This works because Helm treats dictionaries as mutable objects and allows passing them by reference. 
+*/}} +{{- define "gpu-provisioner.patchPodAffinity" -}} +{{- if (hasKey ._podAffinity "requiredDuringSchedulingIgnoredDuringExecution") }} +{{- range $term := ._podAffinity.requiredDuringSchedulingIgnoredDuringExecution }} +{{- include "gpu-provisioner.patchLabelSelector" (merge (dict "_target" $term) $) }} +{{- end }} +{{- end }} +{{- if (hasKey ._podAffinity "preferredDuringSchedulingIgnoredDuringExecution") }} +{{- range $weightedTerm := ._podAffinity.preferredDuringSchedulingIgnoredDuringExecution }} +{{- include "gpu-provisioner.patchLabelSelector" (merge (dict "_target" $weightedTerm.podAffinityTerm) $) }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Patch affinity +This template uses patchPodAffinity template to add a labelSelector to podAffinity & podAntiAffinity if one isn't specified. +This works because Helm treats dictionaries as mutable objects and allows passing them by reference. +*/}} +{{- define "gpu-provisioner.patchAffinity" -}} +{{- if (hasKey .Values.affinity "podAffinity") }} +{{- include "gpu-provisioner.patchPodAffinity" (merge (dict "_podAffinity" .Values.affinity.podAffinity) .) }} +{{- end }} +{{- if (hasKey .Values.affinity "podAntiAffinity") }} +{{- include "gpu-provisioner.patchPodAffinity" (merge (dict "_podAffinity" .Values.affinity.podAntiAffinity) .) }} +{{- end }} +{{- end }} + +{{/* +Patch topology spread constraints +This template uses the patchLabelSelector template to add a labelSelector to topologySpreadConstraints if one isn't specified. +This works because Helm treats dictionaries as mutable objects and allows passing them by reference. +*/}} +{{- define "gpu-provisioner.patchTopologySpreadConstraints" -}} +{{- range $constraint := .Values.topologySpreadConstraints }} +{{- include "gpu-provisioner.patchLabelSelector" (merge (dict "_target" $constraint) $) }} +{{- end }} +{{- end }} + +{{/* +Flatten Settings Map using "." syntax +*/}} +{{- define "flattenSettings" -}} +{{- $map := first . -}} +{{- $label := last . 
-}} +{{- range $key := (keys $map | uniq | sortAlpha) }} + {{- $sublabel := $key -}} + {{- $val := (get $map $key) -}} + {{- if $label -}} + {{- $sublabel = list $label $key | join "." -}} + {{- end -}} + {{/* Special-case "tags" since we want this to be a JSON object */}} + {{- if eq $key "tags" -}} + {{- if not (kindIs "invalid" $val) -}} + {{- $sublabel | quote | nindent 2 }}: {{ $val | toJson | quote }} + {{- end -}} + {{- else if kindOf $val | eq "map" -}} + {{- list $val $sublabel | include "flattenSettings" -}} + {{- else -}} + {{- if not (kindIs "invalid" $val) -}} + {{- $sublabel | quote | nindent 2 -}}: {{ $val | quote }} + {{- end -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Flatten the stdout logging outputs from args provided +*/}} +{{- define "gpu-provisioner.controller.outputPathsList" -}} +{{ $paths := list -}} +{{- range .Values.controller.outputPaths -}} + {{- $paths = printf "%s" . | quote | append $paths -}} +{{- end -}} +{{ $paths | join ", " }} +{{- end -}} + +{{/* +Flatten the stderr logging outputs from args provided +*/}} +{{- define "gpu-provisioner.controller.errorOutputPathsList" -}} +{{ $paths := list -}} +{{- range .Values.controller.errorOutputPaths -}} + {{- $paths = printf "%s" . | quote | append $paths -}} +{{- end -}} +{{ $paths | join ", " }} +{{- end -}} \ No newline at end of file diff --git a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml new file mode 100644 index 000000000..b2d8c5aa2 --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml @@ -0,0 +1,66 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "gpu-provisioner.fullname" . }}-core + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "gpu-provisioner.fullname" . }}-core +subjects: + - kind: ServiceAccount + name: gpu-provisioner + namespace: {{ .Values.namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "gpu-provisioner.fullname" . }}-core + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +rules: + # Read + - apiGroups: [ "karpenter.sh" ] + resources: [ "provisioners" ] + verbs: [ "get", "list", "watch" ] + - apiGroups: ["karpenter.sh"] + resources: ["machines", "machines/status"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods", "nodes", "persistentvolumes", "persistentvolumeclaims", "replicationcontrollers", "namespaces"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses", "csinodes"] + verbs: ["get", "watch", "list"] + - apiGroups: ["apps"] + resources: ["daemonsets", "deployments", "replicasets", "statefulsets"] + verbs: ["list", "watch"] + - apiGroups: [ "policy" ] + resources: [ "poddisruptionbudgets" ] + verbs: [ "get", "list", "watch" ] + # Write + - apiGroups: ["karpenter.sh"] + resources: ["machines", "machines/status"] + verbs: ["create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["create", "patch", "delete"] + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + {{- with .Values.additionalClusterRoleRules -}} + {{ toYaml . 
| nindent 2 }} + {{- end -}} + diff --git a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml new file mode 100644 index 000000000..e6dcbd46e --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-logging + namespace: {{ .Values.namespace }} + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +data: + # https://github.com/uber-go/zap/blob/aa3e73ec0896f8b066ddf668597a02f89628ee50/config.go + zap-logger-config: | + { + "level": "{{ .Values.logLevel }}", + "development": false, + "disableStacktrace": true, + "disableCaller": true, + "sampling": { + "initial": 100, + "thereafter": 100 + }, + "outputPaths": [{{ include "gpu-provisioner.controller.outputPathsList" . }}], + "errorOutputPaths": [{{ include "gpu-provisioner.controller.errorOutputPathsList" . }}], + "encoding": "{{ .Values.logEncoding }}", + "encoderConfig": { + "timeKey": "time", + "levelKey": "level", + "nameKey": "logger", + "callerKey": "caller", + "messageKey": "message", + "stacktraceKey": "stacktrace", + "levelEncoder": "capital", + "timeEncoder": "iso8601" + } + } +{{- with .Values.controller.logLevel }} + loglevel.controller: {{ . | quote }} +{{- end }} diff --git a/charts/kaito/gpu-provisioner/templates/configmap.yaml b/charts/kaito/gpu-provisioner/templates/configmap.yaml new file mode 100644 index 000000000..10474bab0 --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/configmap.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: gpu-provisioner-global-settings + namespace: {{ .Values.namespace }} + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +data: + {{- list .Values.settings "" | include "flattenSettings" | indent 2 }} diff --git a/charts/kaito/gpu-provisioner/templates/deployment.yaml b/charts/kaito/gpu-provisioner/templates/deployment.yaml new file mode 100644 index 000000000..804e95eb4 --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/deployment.yaml @@ -0,0 +1,136 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: {{ .Values.namespace }} + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gpu-provisioner.fullname" . }} + namespace: {{ .Values.namespace }} + labels: + azure.workload.identity/use: "true" + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.replicas }} + revisionHistoryLimit: {{ .Values.revisionHistoryLimit }} + {{- with .Values.strategy }} + strategy: + {{- toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + {{- include "gpu-provisioner.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + azure.workload.identity/use: "true" + {{- include "gpu-provisioner.selectorLabels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + checksum/settings: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: gpu-provisioner + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . | quote }} + {{- end }} + {{- with .Values.terminationGracePeriodSeconds }} + terminationGracePeriodSeconds: {{ . 
}} + {{- end }} + {{- with .Values.dnsPolicy }} + dnsPolicy: {{ . }} + {{- end }} + {{- with .Values.dnsConfig }} + dnsConfig: + {{- toYaml . | nindent 8}} + {{- end }} + {{- if .Values.hostNetwork }} + hostNetwork: true + {{- end }} + containers: + - name: controller + {{- with .Values.controller.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + image: {{ include "gpu-provisioner.controller.image" . }} + imagePullPolicy: {{ .Values.imagePullPolicy }} + env: + - name: KUBERNETES_MIN_VERSION + value: "1.19.0-0" + - name: KARPENTER_SERVICE + value: {{ include "gpu-provisioner.fullname" . }} + - name: METRICS_PORT + value: "{{ .Values.controller.metrics.port }}" + - name: HEALTH_PROBE_PORT + value: "{{ .Values.controller.healthProbe.port }}" + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{- with .Values.controller.env }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.controller.envFrom }} + envFrom: + {{- toYaml . | nindent 12 }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.controller.healthProbe.port }} + protocol: TCP + livenessProbe: + initialDelaySeconds: 30 + timeoutSeconds: 30 + httpGet: + path: /healthz + port: http + readinessProbe: + initialDelaySeconds: 5 + timeoutSeconds: 30 + httpGet: + path: /readyz + port: http + {{- with .Values.controller.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + # The template below patches the .Values.affinity to add a default label selector where not specified + {{- $_ := include "gpu-provisioner.patchAffinity" $ }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.topologySpreadConstraints }} + # The template below patches the .Values.topologySpreadConstraints to add a default label selector where not specified + {{- $_ := include "gpu-provisioner.patchTopologySpreadConstraints" $ }} + topologySpreadConstraints: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/charts/kaito/gpu-provisioner/templates/role.yaml b/charts/kaito/gpu-provisioner/templates/role.yaml new file mode 100644 index 000000000..e23ac6fe5 --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/role.yaml @@ -0,0 +1,61 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "gpu-provisioner.fullname" . }} + namespace: {{ .Values.namespace }} + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +rules: + # Read + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "watch"] + - apiGroups: [""] + resources: ["configmaps", "namespaces", "secrets"] + verbs: ["get", "list", "watch"] + # Write + - apiGroups: [""] + resources: ["secrets"] + verbs: ["update"] + resourceNames: ["{{ include "gpu-provisioner.fullname" . 
}}-cert"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["update", "patch", "delete"] + resourceNames: + - gpu-provisioner-global-settings + - config-logging + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["patch", "update"] + resourceNames: + - "gpu-provisioner-leader-election" + # Cannot specify resourceNames on create + # https://kubernetes.io/docs/reference/access-authn-authz/rbac/#referring-to-resources + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "gpu-provisioner.fullname" . }}-dns + namespace: kube-system + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +rules: + # Read + - apiGroups: [""] + resources: ["services"] + resourceNames: ["kube-dns"] + verbs: ["get"] diff --git a/charts/kaito/gpu-provisioner/templates/rolebinding.yaml b/charts/kaito/gpu-provisioner/templates/rolebinding.yaml new file mode 100644 index 000000000..89606bb73 --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/rolebinding.yaml @@ -0,0 +1,39 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "gpu-provisioner.fullname" . }} + namespace: {{ .Values.namespace }} + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "gpu-provisioner.fullname" . }} +subjects: + - kind: ServiceAccount + name: gpu-provisioner + namespace: {{ .Values.namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "gpu-provisioner.fullname" . 
}}-dns + namespace: kube-system + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + {{- with .Values.additionalAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "gpu-provisioner.fullname" . }}-dns +subjects: + - kind: ServiceAccount + name: gpu-provisioner + namespace: {{ .Values.namespace }} \ No newline at end of file diff --git a/charts/kaito/gpu-provisioner/templates/serviceaccount.yaml b/charts/kaito/gpu-provisioner/templates/serviceaccount.yaml new file mode 100644 index 000000000..02e2e57a2 --- /dev/null +++ b/charts/kaito/gpu-provisioner/templates/serviceaccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gpu-provisioner + namespace: {{ .Values.namespace }} + labels: + {{- include "gpu-provisioner.labels" . | nindent 4 }} + annotations: # values are quoted: Kubernetes annotation values must be strings + azure.workload.identity/client-id: {{ .Values.workloadIdentity.clientId | quote }} + azure.workload.identity/tenant-id: {{ .Values.workloadIdentity.tenantId | quote }} diff --git a/charts/kaito/gpu-provisioner/values.yaml b/charts/kaito/gpu-provisioner/values.yaml new file mode 100644 index 000000000..b9c98c09c --- /dev/null +++ b/charts/kaito/gpu-provisioner/values.yaml @@ -0,0 +1,166 @@ +namespace: gpu-provisioner +# -- Overrides the chart's name. +nameOverride: "" +# -- Overrides the chart's computed fullname. +fullnameOverride: "" +# -- Additional labels to add into metadata. +additionalLabels: {} +# app: gpu-provisioner + +# -- Additional annotations to add into metadata. +additionalAnnotations: {} +# -- Image pull policy for Docker images. +imagePullPolicy: IfNotPresent +# -- Image pull secrets for Docker images. +imagePullSecrets: [] +serviceAccount: + # -- Specifies if a ServiceAccount should be created. + create: true + # -- The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template. 
+ name: "" + # -- Additional annotations for the ServiceAccount. + annotations: {} +# -- Specifies additional rules for the core ClusterRole. +additionalClusterRoleRules: [] +serviceMonitor: + # -- Specifies whether a ServiceMonitor should be created. + enabled: false + # -- Additional labels for the ServiceMonitor. + additionalLabels: {} + # -- Endpoint configuration for the ServiceMonitor. + endpointConfig: {} +# -- Number of replicas. +replicas: 1 +# -- The number of old ReplicaSets to retain to allow rollback. +revisionHistoryLimit: 10 +# -- Strategy for updating the pod. +strategy: + rollingUpdate: + maxUnavailable: 1 +# -- Additional labels for the pod. +podLabels: {} +# -- Additional annotations for the pod. +podAnnotations: {} +podDisruptionBudget: + name: karpenter + maxUnavailable: 1 +# -- SecurityContext for the pod. +podSecurityContext: + fsGroup: 1000 +# -- PriorityClass name for the pod. +priorityClassName: system-cluster-critical +# -- Override the default termination grace period for the pod. +terminationGracePeriodSeconds: +# -- Bind the pod to the host network. +# This is required when using a custom CNI. +hostNetwork: false +# -- Configure the DNS Policy for the pod +dnsPolicy: Default +# -- Configure DNS Config for the pod +dnsConfig: {} +# options: +# - name: ndots +# value: "1" +# -- Node selectors to schedule the pod to nodes with labels. +nodeSelector: + kubernetes.io/os: linux +# -- Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. 
+affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.azure.com/cluster + operator: Exists + - key: type + operator: NotIn + values: + - virtual-kubelet + - key: kubernetes.io/os + operator: In + values: + - linux + - matchExpressions: + - key: karpenter.sh/provisioner-name + operator: DoesNotExist + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - topologyKey: "kubernetes.io/hostname" +# -- Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. +topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway +# -- Tolerations to allow the pod to be scheduled to nodes with taints. +tolerations: + - key: CriticalAddonsOnly + operator: Exists +# -- Additional volumes for the pod. +extraVolumes: [] +controller: + image: + # -- Repository path to the controller image. + repository: mcr.microsoft.com/aks/kaito/gpu-provisioner + # -- Tag of the controller image. + tag: 0.0.1 + # -- SHA256 digest of the controller image. + digest: "" + # -- SecurityContext for the controller container. + securityContext: {} + # -- Additional environment variables for the controller pod. + env: + - name: ARM_SUBSCRIPTION_ID + value: "null" + - name: LOCATION + value: eastus + - name: AZURE_CLUSTER_NAME + value: new_demo + - name: AZURE_NODE_RESOURCE_GROUP + value: MC_llm-test_new_demo_eastus + - name: ARM_RESOURCE_GROUP + value: llm-test + - name: LEADER_ELECT # disable leader election for better debugging experience + value: "false" + envFrom: [] + # -- Resources for the controller pod. 
+ resources: + requests: + cpu: 200m + limits: + cpu: 500m + # -- Controller outputPaths - default to stdout only + outputPaths: + - stdout + # -- Controller errorOutputPaths - default to stderr only + errorOutputPaths: + - stderr + # -- Controller log level, defaults to the global log level + logLevel: debug + # -- Controller log encoding, defaults to the global log encoding + logEncoding: "" + metrics: + # -- The container port to use for metrics. + port: 8000 + healthProbe: + # -- The container port to use for http health probe. + port: 8081 +# -- Global log level +logLevel: debug +# -- Global log encoding +logEncoding: console +# -- Workload identity client/tenant IDs for the gpu-provisioner ServiceAccount annotations (global settings are under `settings` below) +workloadIdentity: + clientId: 00411c3a-8361-42f3-8917-50b6da46e9fc + tenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47 +settings: + # -- Azure-specific configuration values + azure: + # -- Cluster name. + clusterName: new_demo + # -- The global tags to use on all Azure infrastructure resources (VMs, etc.) + # TODO: not propagated yet ... + tags: + # -- Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates + # in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features + featureGates: diff --git a/charts/kaito/workspace/.helmignore b/charts/kaito/workspace/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/charts/kaito/workspace/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/kaito/Chart.yaml b/charts/kaito/workspace/Chart.yaml similarity index 52% rename from charts/kaito/Chart.yaml rename to charts/kaito/workspace/Chart.yaml index dad3b2ceb..15bd382ee 100644 --- a/charts/kaito/Chart.yaml +++ b/charts/kaito/workspace/Chart.yaml @@ -1,15 +1,6 @@ apiVersion: v2 -name: kaito -description: A Helm chart for Kubernetes - -# A chart can be either an 'application' or a 'library' chart. -# -# Application charts are a collection of templates that can be packaged into versioned archives -# to be deployed. -# -# Library charts provide useful utilities or functions for the chart developer. They're included as -# a dependency of application charts to inject those utilities and functions into the rendering -# pipeline. Library charts do not define any templates and therefore cannot be deployed. +name: workspace +description: A Helm chart to install AI Toolchain Operator Workspace type: application # This is the chart version. This version number should be incremented each time you make changes @@ -21,4 +12,14 @@ version: 0.0.1 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: "0.1.0" +appVersion: "0.0.1" +home: https://github.com/Azure/kaito +sources: + - https://github.com/Azure/kaito +maintainers: + - name: Fei-Guo + email: vrgf2003@gmail.com + - name: helayoty + email: hebaelayoty@gmail.com + - name: ishaansehgal99 + email: ishaanforthewin@gmail.com \ No newline at end of file diff --git a/charts/kaito/workspace/README.md b/charts/kaito/workspace/README.md new file mode 100644 index 000000000..ce1ec9ea2 --- /dev/null +++ b/charts/kaito/workspace/README.md @@ -0,0 +1,33 @@ +# KAITO Workspace Helm Chart + +## Install + +```bash +export REGISTRY= +export IMG_NAME=workspace +export IMG_TAG=0.0.1 +helm install workspace ./charts/kaito/workspace --set image.repository=${REGISTRY}/${IMG_NAME} --set image.tag=${IMG_TAG} +``` + +## Values + +| Key | Type | Default | Description | +|------------------------------------------|--------|-----------------------------------|-------------| +| affinity | object | `{}` | | +| image.pullPolicy | string | `"IfNotPresent"` | | +| image.repository | string | `"ghcr.io/azure/kaito/workspace"` | | +| image.tag | string | `"0.0.1"` | | +| imagePullSecrets | list | `[]` | | +| nodeSelector | object | `{}` | | +| podAnnotations | object | `{}` | | +| podSecurityContext.runAsNonRoot | bool | `true` | | +| presetRegistryName | string | `"mcr.microsoft.com/aks/kaito"` | | +| replicaCount | int | `1` | | +| resources.limits.cpu | string | `"500m"` | | +| resources.limits.memory | string | `"128Mi"` | | +| resources.requests.cpu | string | `"10m"` | | +| resources.requests.memory | string | `"64Mi"` | | +| securityContext.allowPrivilegeEscalation | bool | `false` | | +| securityContext.capabilities.drop[0] | string | `"ALL"` | | +| tolerations | list | `[]` | | +| webhook.port | int | `9443` | | diff --git a/charts/kaito/crds/kaito.io_workspaces.yaml b/charts/kaito/workspace/crds/kaito.sh_workspaces.yaml similarity index 100% rename from charts/kaito/crds/kaito.io_workspaces.yaml rename to
charts/kaito/workspace/crds/kaito.sh_workspaces.yaml diff --git a/charts/kaito/templates/_helpers.tpl b/charts/kaito/workspace/templates/_helpers.tpl similarity index 100% rename from charts/kaito/templates/_helpers.tpl rename to charts/kaito/workspace/templates/_helpers.tpl diff --git a/charts/kaito/templates/clusterrole.yaml b/charts/kaito/workspace/templates/clusterrole.yaml similarity index 100% rename from charts/kaito/templates/clusterrole.yaml rename to charts/kaito/workspace/templates/clusterrole.yaml diff --git a/charts/kaito/templates/clusterrole_binding.yaml b/charts/kaito/workspace/templates/clusterrole_binding.yaml similarity index 100% rename from charts/kaito/templates/clusterrole_binding.yaml rename to charts/kaito/workspace/templates/clusterrole_binding.yaml diff --git a/charts/kaito/templates/deployment.yaml b/charts/kaito/workspace/templates/deployment.yaml similarity index 100% rename from charts/kaito/templates/deployment.yaml rename to charts/kaito/workspace/templates/deployment.yaml diff --git a/charts/kaito/templates/nvidia-device-plugin-ds.yaml b/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml similarity index 100% rename from charts/kaito/templates/nvidia-device-plugin-ds.yaml rename to charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml diff --git a/charts/kaito/templates/role.yaml b/charts/kaito/workspace/templates/role.yaml similarity index 100% rename from charts/kaito/templates/role.yaml rename to charts/kaito/workspace/templates/role.yaml diff --git a/charts/kaito/templates/role_binding.yaml b/charts/kaito/workspace/templates/role_binding.yaml similarity index 100% rename from charts/kaito/templates/role_binding.yaml rename to charts/kaito/workspace/templates/role_binding.yaml diff --git a/charts/kaito/templates/secret-webhook-cert.yaml b/charts/kaito/workspace/templates/secret-webhook-cert.yaml similarity index 100% rename from charts/kaito/templates/secret-webhook-cert.yaml rename to 
charts/kaito/workspace/templates/secret-webhook-cert.yaml diff --git a/charts/kaito/templates/service.yaml b/charts/kaito/workspace/templates/service.yaml similarity index 100% rename from charts/kaito/templates/service.yaml rename to charts/kaito/workspace/templates/service.yaml diff --git a/charts/kaito/templates/serviceaccount.yaml b/charts/kaito/workspace/templates/serviceaccount.yaml similarity index 100% rename from charts/kaito/templates/serviceaccount.yaml rename to charts/kaito/workspace/templates/serviceaccount.yaml diff --git a/charts/kaito/templates/webhooks.yaml b/charts/kaito/workspace/templates/webhooks.yaml similarity index 100% rename from charts/kaito/templates/webhooks.yaml rename to charts/kaito/workspace/templates/webhooks.yaml diff --git a/charts/kaito/values.yaml b/charts/kaito/workspace/values.yaml similarity index 86% rename from charts/kaito/values.yaml rename to charts/kaito/workspace/values.yaml index 437011ed1..b5f08d950 100644 --- a/charts/kaito/values.yaml +++ b/charts/kaito/workspace/values.yaml @@ -1,32 +1,23 @@ # Default values for kaito. # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
- replicaCount: 1 - image: repository: ghcr.io/azure/kaito/workspace - pullPolicy: Always + pullPolicy: IfNotPresent tag: 0.0.1 - imagePullSecrets: [] - podAnnotations: {} - podSecurityContext: runAsNonRoot: true - securityContext: allowPrivilegeEscalation: false capabilities: drop: - "ALL" - webhook: port: 9443 - -presetRegistryName: aimodelsregistry.azurecr.io - +presetRegistryName: mcr.microsoft.com/aks/kaito resources: limits: cpu: 500m @@ -34,9 +25,6 @@ resources: requests: cpu: 10m memory: 64Mi - nodeSelector: {} - tolerations: [] - affinity: {} diff --git a/config/crd/bases/kaito.io_workspaces.yaml b/config/crd/bases/kaito.io_workspaces.yaml deleted file mode 100644 index 3049583e0..000000000 --- a/config/crd/bases/kaito.io_workspaces.yaml +++ /dev/null @@ -1,252 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.12.0 - name: workspaces.kaito.sh -spec: - group: kaito.sh - names: - categories: - - workspace - kind: Workspace - listKind: WorkspaceList - plural: workspaces - shortNames: - - wk - - wks - singular: workspace - scope: Namespaced - versions: - - additionalPrinterColumns: - - jsonPath: .resource.instanceType - name: Instance - type: string - - jsonPath: .status.condition[?(@.type=="ResourceReady")].status - name: ResourceReady - type: string - - jsonPath: .status.condition[?(@.type=="InferenceReady")].status - name: InferenceReady - type: string - - jsonPath: .status.condition[?(@.type=="WorkspaceReady")].status - name: WorkspaceReady - type: string - - jsonPath: .metadata.creationTimestamp - name: Age - type: date - name: v1alpha1 - schema: - openAPIV3Schema: - description: Workspace is the Schema for the workspaces API - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - inference: - properties: - preset: - description: Preset describles the model that will be deployed with - preset configurations. - properties: - accessMode: - description: AccessMode specifies whether the containerized model - image is accessible via public registry or private registry. - This field defaults to "public" if not specified. If this field - is "private", user needs to provide the private image information - in PresetOptions. - enum: - - public - - private - type: string - name: - description: Name of the supported models with preset configurations. - type: string - presetOptions: - properties: - image: - description: Image is the name of the containerized model - image. - type: string - imagePullSecrets: - description: ImagePullSecrets is a list of secret names in - the same namespace used for pulling the image. - items: - type: string - type: array - type: object - required: - - name - type: object - template: - description: Template specifies the Pod template used to run the inference - service. Users can specify custom Pod settings if the preset configurations - cannot meet the requirements. Note that if Preset is specified, - Template should not be specified and vice versa. - x-kubernetes-preserve-unknown-fields: true - type: object - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - resource: - description: ResourceSpec desicribes the resource requirement of running - the workload. 
If the number of nodes in the cluster that meet the InstanceType - and LabelSelector requirements is small than the Count, controller will - provision new nodes before deploying the workload. The final list of - nodes used to run the workload is presented in workspace Status. - properties: - count: - default: 1 - description: Count is the required number of GPU nodes. - type: integer - instanceType: - default: Standard_NC12s_v3 - description: InstanceType specifies the GPU node SKU. This field defaults - to "Standard_NC12s_v3" if not specified. - type: string - labelSelector: - description: LabelSelector specifies the required labels for the GPU - nodes. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: A label selector requirement is a selector that - contains values, a key, and an operator that relates the key - and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: operator represents a key's relationship to - a set of values. Valid operators are In, NotIn, Exists - and DoesNotExist. - type: string - values: - description: values is an array of string values. If the - operator is In or NotIn, the values array must be non-empty. - If the operator is Exists or DoesNotExist, the values - array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single - {key,value} in the matchLabels map is equivalent to an element - of matchExpressions, whose key field is "key", the operator - is "In", and the values array contains only "value". The requirements - are ANDed. 
- type: object - type: object - x-kubernetes-map-type: atomic - preferredNodes: - description: PreferredNodes is an optional node list specified by - the user. If a node in the list does not have the required labels - or the required instanceType, it will be ignored. - items: - type: string - type: array - required: - - labelSelector - type: object - status: - description: WorkspaceStatus defines the observed state of Workspace - properties: - conditions: - description: Conditions report the current conditions of the workspace. - items: - description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. // Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" - properties: - lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. 
For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - workerNodes: - description: WorkerNodes is the list of nodes chosen to run the workload - based on the workspace resource requirement. - items: - type: string - type: array - type: object - type: object - served: true - storage: true - subresources: - status: {}