diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index e31021b5c..3e5465b94 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -139,10 +139,13 @@ jobs: shell: bash run: | make gpu-provisioner-helm - kubectl wait --for=condition=available deploy "kaito-gpu-provisioner" -n gpu-provisioner --timeout=300s + kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s env: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }} + GPU_PROVISIONER_VERSION: ${{ vars.GPU_PROVISIONER_VERSION }} - uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0 with: diff --git a/Makefile b/Makefile index d10bfcb25..650ddc285 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ REGISTRY ?= YOUR_REGISTRY IMG_NAME ?= workspace VERSION ?= v0.2.2 +GPU_PROVISIONER_VERSION ?= 0.2.0 IMG_TAG ?= $(subst v,,$(VERSION)) ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -30,6 +31,8 @@ AZURE_CLUSTER_NAME ?= kaito-demo AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION) GPU_NAMESPACE ?= gpu-provisioner KAITO_NAMESPACE ?= kaito-workspace +GPU_PROVISIONER_MSI_NAME ?= gpuIdentity + RUN_LLAMA_13B ?= false AI_MODELS_REGISTRY ?= modelregistry.azurecr.io AI_MODELS_REGISTRY_SECRET ?= modelregistry @@ -191,37 +194,25 @@ ifndef ignore-not-found endif ##@ gpu-provider -.PHONY: gpu-provisioner-identity-perm gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner - az identity create --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) - - IDENTITY_PRINCIPAL_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId') - IDENTITY_CLIENT_ID=$(shell az identity show --name 
gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'clientId') - - az role assignment create --assignee $(IDENTITY_PRINCIPAL_ID) --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor" + az identity create --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl") + IDENTITY_PRINCIPAL_ID=$(shell az identity show --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId');\ + az role assignment create --assignee $$IDENTITY_PRINCIPAL_ID --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor" - az identity federated-credential create --name gpu-federatecredential --identity-name gpuIdentity --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer "$(AKS_OIDC_ISSUER)" \ - --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) + AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\ + az identity federated-credential create --name gpu-federatecredential --identity-name $(GPU_PROVISIONER_MSI_NAME) --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ + --subject system:serviceaccount:"$(GPU_NAMESPACE):$(GPU_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) .PHONY: gpu-provisioner-helm gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - $(eval IDENTITY_CLIENT_ID=$(shell az 
identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --query 'clientId' -o tsv)) - $(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId")) - $(eval AZURE_SUBSCRIPTION_ID=$(shell az account show | jq -r ".id")) - - yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="AZURE_NODE_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP_MC)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="AZURE_CLUSTER_NAME")) .value = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.settings.azure.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.workloadIdentity.clientId) = "$(IDENTITY_CLIENT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.workloadIdentity.tenantId) = "$(AZURE_TENANT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml - - helm install kaito-gpu-provisioner ./charts/kaito/gpu-provisioner --namespace $(GPU_NAMESPACE) --create-namespace + + curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh + chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME) + + helm install $(GPU_NAMESPACE) --values gpu-provisioner-values.yaml --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) --wait \ + https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz ##@ Build Dependencies diff --git a/README.md b/README.md index 4251a35bf..879da0507 100644 
--- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ Kaito follows the classic Kubernetes Custom Resource Definition(CRD)/controller The above figure presents the Kaito architecture overview. Its major components consist of: - **Workspace controller**: It reconciles the `workspace` custom resource, creates `machine` (explained below) custom resources to trigger node auto provisioning, and creates the inference workload (`deployment` or `statefulset`) based on the model preset configurations. -- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [Kaito helm chart](charts/kaito/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Kubernetes Service(AKS) APIs to add new GPU nodes to the AKS cluster. -Note that the *gpu-provisioner* is an open sourced component maintained in [this](https://github.com/Azure/gpu-provisioner) repository. It can be replaced by other controllers if they support Karpenter-core APIs. +- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Kubernetes Service(AKS) APIs to add new GPU nodes to the AKS cluster. +> Note: The [*gpu-provisioner*](https://github.com/Azure/gpu-provisioner) is an open sourced component. It can be replaced by other controllers if they support [Karpenter-core](https://sigs.k8s.io/karpenter) APIs. ## Installation diff --git a/charts/kaito/gpu-provisioner/.helmignore b/charts/kaito/gpu-provisioner/.helmignore deleted file mode 100644 index 0e8a0eb36..000000000 --- a/charts/kaito/gpu-provisioner/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. 
-# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/charts/kaito/gpu-provisioner/Chart.yaml b/charts/kaito/gpu-provisioner/Chart.yaml deleted file mode 100644 index 83889e614..000000000 --- a/charts/kaito/gpu-provisioner/Chart.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v2 -name: gpu-provisioner -description: A Helm chart for gpu-provisioner -type: application -version: 0.2.0 -appVersion: 0.2.0 -sources: - - https://github.com/Azure/gpu-provisioner -maintainers: - - name: Fei-Guo - email: vrgf2003@gmail.com - - name: helayoty - email: hebaelayoty@gmail.com diff --git a/charts/kaito/gpu-provisioner/README.md b/charts/kaito/gpu-provisioner/README.md deleted file mode 100644 index 81440c4e8..000000000 --- a/charts/kaito/gpu-provisioner/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Karpenter Azure provider gpu-provisioner - -![Version: 0.2.0](https://img.shields.io/badge/Version-0.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.2.0](https://img.shields.io/badge/AppVersion-0.2.0-informational?style=flat-square) - -A Helm chart for gpu-provisioner - -## Installing the Chart - -To install the chart with the release name `gpu-provisioner`: - -```bash -helm install gpu-provisioner ./charts/gpu-provisioner --namespace=gpu-provisioner --create-namespace -``` - -## Values - -| Key | Type | Default | Description | 
-|------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| -| additionalAnnotations | object | `{}` | Additional annotations to add into metadata. | -| additionalLabels | object | `{}` | Additional labels to add into metadata. | -| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}}}` | Affinity rules for scheduling the pod. | -| controller.env | list | `[]` | Additional environment variables for the controller pod. | -| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - default to stderr only | -| controller.extraVolumeMounts | list | `[]` | Additional volumeMounts for the controller pod. | -| controller.image.repository | string | `mcr.microsoft.com/aks/kaito/gpu-provisioner` | | -| controller.image.tag | string | `0.2.0` | | -| controller.logEncoding | string | `""` | Controller log encoding, defaults to the global log encoding | -| controller.logLevel | string | `""` | Controller log level, defaults to the global log level | -| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - default to stdout only | -| controller.resources | object | `{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":1,"memory":"1Gi"}}` | Resources for the controller pod. | -| controller.securityContext | object | `{}` | SecurityContext for the controller container. 
| -| controller.sidecarContainer | object | `{}` | Additional sideCarContainer config - this will also inherit volume mounts from deployment | -| dnsConfig | object | `{}` | Configure DNS Config for the pod | -| dnsPolicy | string | `"Default"` | Configure the DNS Policy for the pod | -| extraVolumes | list | `[]` | Additional volumes for the pod. | -| fullnameOverride | string | `""` | Overrides the chart's computed fullname. | -| hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | -| imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | -| imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | -| logEncoding | string | `"console"` | Gloabl log encoding | -| logLevel | string | `"debug"` | Global log level | -| nameOverride | string | `""` | Overrides the chart's name. | -| nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. | -| podAnnotations | object | `{}` | Additional annotations for the pod. | -| podDisruptionBudget.maxUnavailable | int | `1` | | -| podDisruptionBudget.name | string | `"karpenter"` | | -| podLabels | object | `{}` | Additional labels for the pod. | -| podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | -| priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | -| replicas | int | `2` | Number of replicas. | -| revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | -| serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. | -| serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. | -| serviceAccount.name | string | `""` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the fullname template. 
| -| serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | -| serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | -| serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | -| settings | object | `{"azure":{"clusterName":"","tags":null}}` | Global Settings to configure Karpenter | -| settings.azure | object | `{"clusterName":"","tags":null}` | Azure-specific configuration values | -| settings.azure.clusterName | string | `""` | Cluster name. | | -| settings.azure.tags | string | `nil` | The global tags to use on all Azure infrastructure resources (launch templates, instances, SQS queue, etc.) | -| strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | -| terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. | -| tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | -| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | topologySpreadConstraints to increase the controller resilience | - diff --git a/charts/kaito/gpu-provisioner/templates/_helpers.tpl b/charts/kaito/gpu-provisioner/templates/_helpers.tpl deleted file mode 100644 index 15b644f1f..000000000 --- a/charts/kaito/gpu-provisioner/templates/_helpers.tpl +++ /dev/null @@ -1,180 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "gpu-provisioner.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. 
-*/}} -{{- define "gpu-provisioner.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "gpu-provisioner.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "gpu-provisioner.labels" -}} -helm.sh/chart: {{ include "gpu-provisioner.chart" . }} -{{ include "gpu-provisioner.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- with .Values.additionalLabels }} -{{ toYaml . }} -{{- end }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "gpu-provisioner.selectorLabels" -}} -app.kubernetes.io/name: {{ include "gpu-provisioner.name" . 
}} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -gpu-provisioner image to use -*/}} -{{- define "gpu-provisioner.controller.image" -}} -{{- if .Values.controller.image.digest }} -{{- printf "%s:%s@%s" .Values.controller.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.controller.image.tag) .Values.controller.image.digest }} -{{- else }} -{{- printf "%s:%s" .Values.controller.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.controller.image.tag) }} -{{- end }} -{{- end }} - - -{{/* Get PodDisruptionBudget API Version */}} -{{- define "gpu-provisioner.pdb.apiVersion" -}} -{{- if and (.Capabilities.APIVersions.Has "policy/v1") (semverCompare ">= 1.21-0" .Capabilities.KubeVersion.Version) -}} -{{- print "policy/v1" -}} -{{- else -}} -{{- print "policy/v1beta1" -}} -{{- end -}} -{{- end -}} - -{{/* -Patch the label selector on an object -This template will add a labelSelector using matchLabels to the object referenced at _target if there is no labelSelector specified. -The matchLabels are created with the selectorLabels template. -This works because Helm treats dictionaries as mutable objects and allows passing them by reference. -*/}} -{{- define "gpu-provisioner.patchLabelSelector" -}} -{{- if not (hasKey ._target "labelSelector") }} -{{- $selectorLabels := (include "gpu-provisioner.selectorLabels" .) | fromYaml }} -{{- $_ := set ._target "labelSelector" (dict "matchLabels" $selectorLabels) }} -{{- end }} -{{- end }} - -{{/* -Patch pod affinity -This template uses the patchLabelSelector template to add a labelSelector to pod affinity objects if there is no labelSelector specified. -This works because Helm treats dictionaries as mutable objects and allows passing them by reference. 
-*/}} -{{- define "gpu-provisioner.patchPodAffinity" -}} -{{- if (hasKey ._podAffinity "requiredDuringSchedulingIgnoredDuringExecution") }} -{{- range $term := ._podAffinity.requiredDuringSchedulingIgnoredDuringExecution }} -{{- include "gpu-provisioner.patchLabelSelector" (merge (dict "_target" $term) $) }} -{{- end }} -{{- end }} -{{- if (hasKey ._podAffinity "preferredDuringSchedulingIgnoredDuringExecution") }} -{{- range $weightedTerm := ._podAffinity.preferredDuringSchedulingIgnoredDuringExecution }} -{{- include "gpu-provisioner.patchLabelSelector" (merge (dict "_target" $weightedTerm.podAffinityTerm) $) }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Patch affinity -This template uses patchPodAffinity template to add a labelSelector to podAffinity & podAntiAffinity if one isn't specified. -This works because Helm treats dictionaries as mutable objects and allows passing them by reference. -*/}} -{{- define "gpu-provisioner.patchAffinity" -}} -{{- if (hasKey .Values.affinity "podAffinity") }} -{{- include "gpu-provisioner.patchPodAffinity" (merge (dict "_podAffinity" .Values.affinity.podAffinity) .) }} -{{- end }} -{{- if (hasKey .Values.affinity "podAntiAffinity") }} -{{- include "gpu-provisioner.patchPodAffinity" (merge (dict "_podAffinity" .Values.affinity.podAntiAffinity) .) }} -{{- end }} -{{- end }} - -{{/* -Patch topology spread constraints -This template uses the patchLabelSelector template to add a labelSelector to topologySpreadConstraints if one isn't specified. -This works because Helm treats dictionaries as mutable objects and allows passing them by reference. -*/}} -{{- define "gpu-provisioner.patchTopologySpreadConstraints" -}} -{{- range $constraint := .Values.topologySpreadConstraints }} -{{- include "gpu-provisioner.patchLabelSelector" (merge (dict "_target" $constraint) $) }} -{{- end }} -{{- end }} - -{{/* -Flatten Settings Map using "." syntax -*/}} -{{- define "flattenSettings" -}} -{{- $map := first . -}} -{{- $label := last . 
-}} -{{- range $key := (keys $map | uniq | sortAlpha) }} - {{- $sublabel := $key -}} - {{- $val := (get $map $key) -}} - {{- if $label -}} - {{- $sublabel = list $label $key | join "." -}} - {{- end -}} - {{/* Special-case "tags" since we want this to be a JSON object */}} - {{- if eq $key "tags" -}} - {{- if not (kindIs "invalid" $val) -}} - {{- $sublabel | quote | nindent 2 }}: {{ $val | toJson | quote }} - {{- end -}} - {{- else if kindOf $val | eq "map" -}} - {{- list $val $sublabel | include "flattenSettings" -}} - {{- else -}} - {{- if not (kindIs "invalid" $val) -}} - {{- $sublabel | quote | nindent 2 -}}: {{ $val | quote }} - {{- end -}} -{{- end -}} -{{- end -}} -{{- end -}} - -{{/* -Flatten the stdout logging outputs from args provided -*/}} -{{- define "gpu-provisioner.controller.outputPathsList" -}} -{{ $paths := list -}} -{{- range .Values.controller.outputPaths -}} - {{- $paths = printf "%s" . | quote | append $paths -}} -{{- end -}} -{{ $paths | join ", " }} -{{- end -}} - -{{/* -Flatten the stderr logging outputs from args provided -*/}} -{{- define "gpu-provisioner.controller.errorOutputPathsList" -}} -{{ $paths := list -}} -{{- range .Values.controller.errorOutputPaths -}} - {{- $paths = printf "%s" . | quote | append $paths -}} -{{- end -}} -{{ $paths | join ", " }} -{{- end -}} \ No newline at end of file diff --git a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml b/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml deleted file mode 100644 index b9c2840f3..000000000 --- a/charts/kaito/gpu-provisioner/templates/clusterrole-core.yaml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "gpu-provisioner.fullname" . }}-core - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . 
| nindent 4 }} - {{- end }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "gpu-provisioner.fullname" . }}-core -subjects: - - kind: ServiceAccount - name: gpu-provisioner - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "gpu-provisioner.fullname" . }}-core - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -rules: - # Read - - apiGroups: [ "karpenter.sh" ] - resources: [ "provisioners" ] - verbs: [ "get", "list", "watch" ] - - apiGroups: ["karpenter.sh"] - resources: ["machines", "machines/status"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["pods", "nodes", "persistentvolumes", "persistentvolumeclaims", "replicationcontrollers", "namespaces"] - verbs: ["get", "list", "watch"] - - apiGroups: ["storage.k8s.io"] - resources: ["storageclasses", "csinodes"] - verbs: ["get", "watch", "list"] - - apiGroups: ["apps"] - resources: ["daemonsets", "deployments", "replicasets", "statefulsets"] - verbs: ["list", "watch"] - - apiGroups: [ "policy" ] - resources: [ "poddisruptionbudgets" ] - verbs: [ "get", "list", "watch" ] - # Write - - apiGroups: ["karpenter.sh"] - resources: ["machines", "machines/status"] - verbs: ["create", "delete", "update", "patch"] - - apiGroups: [""] - resources: ["events"] - verbs: ["create", "patch"] - - apiGroups: [""] - resources: ["nodes"] - verbs: ["patch", "delete"] - - apiGroups: [""] - resources: ["pods/eviction"] - verbs: ["create"] - {{- with .Values.additionalClusterRoleRules -}} - {{ toYaml . 
| nindent 2 }} - {{- end -}} - diff --git a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml b/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml deleted file mode 100644 index 5c27dc8b5..000000000 --- a/charts/kaito/gpu-provisioner/templates/configmap-logging.yaml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: gpu-provisioner-config-logging - namespace: {{ .Release.Namespace }} - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -data: - # https://github.com/uber-go/zap/blob/aa3e73ec0896f8b066ddf668597a02f89628ee50/config.go - zap-logger-config: | - { - "level": "{{ .Values.logLevel }}", - "development": false, - "disableStacktrace": true, - "disableCaller": true, - "sampling": { - "initial": 100, - "thereafter": 100 - }, - "outputPaths": [{{ include "gpu-provisioner.controller.outputPathsList" . }}], - "errorOutputPaths": [{{ include "gpu-provisioner.controller.errorOutputPathsList" . }}], - "encoding": "{{ .Values.logEncoding }}", - "encoderConfig": { - "timeKey": "time", - "levelKey": "level", - "nameKey": "logger", - "callerKey": "caller", - "messageKey": "message", - "stacktraceKey": "stacktrace", - "levelEncoder": "capital", - "timeEncoder": "iso8601" - } - } -{{- with .Values.controller.logLevel }} - loglevel.controller: {{ . | quote }} -{{- end }} diff --git a/charts/kaito/gpu-provisioner/templates/configmap.yaml b/charts/kaito/gpu-provisioner/templates/configmap.yaml deleted file mode 100644 index 3d51e651e..000000000 --- a/charts/kaito/gpu-provisioner/templates/configmap.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: gpu-provisioner-global-settings - namespace: {{ .Release.Namespace }} - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . 
| nindent 4 }} - {{- end }} -data: - {{- list .Values.settings "" | include "flattenSettings" | indent 2 }} diff --git a/charts/kaito/gpu-provisioner/templates/deployment.yaml b/charts/kaito/gpu-provisioner/templates/deployment.yaml deleted file mode 100644 index fe746958a..000000000 --- a/charts/kaito/gpu-provisioner/templates/deployment.yaml +++ /dev/null @@ -1,123 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "gpu-provisioner.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - azure.workload.identity/use: "true" - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - replicas: {{ .Values.replicas }} - revisionHistoryLimit: {{ .Values.revisionHistoryLimit }} - {{- with .Values.strategy }} - strategy: - {{- toYaml . | nindent 4 }} - {{- end }} - selector: - matchLabels: - {{- include "gpu-provisioner.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - azure.workload.identity/use: "true" - {{- include "gpu-provisioner.selectorLabels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - annotations: - {{- with .Values.podAnnotations }} - {{- toYaml . | nindent 8 }} - {{- end }} - checksum/settings: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: gpu-provisioner - {{- with .Values.podSecurityContext }} - securityContext: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.priorityClassName }} - priorityClassName: {{ . | quote }} - {{- end }} - {{- with .Values.terminationGracePeriodSeconds }} - terminationGracePeriodSeconds: {{ . }} - {{- end }} - {{- with .Values.dnsPolicy }} - dnsPolicy: {{ . }} - {{- end }} - {{- with .Values.dnsConfig }} - dnsConfig: - {{- toYaml . 
| nindent 8}} - {{- end }} - {{- if .Values.hostNetwork }} - hostNetwork: true - {{- end }} - containers: - - name: controller - {{- with .Values.controller.securityContext }} - securityContext: - {{- toYaml . | nindent 12 }} - {{- end }} - image: {{ include "gpu-provisioner.controller.image" . }} - imagePullPolicy: {{ .Values.imagePullPolicy }} - env: - - name: CONFIG_LOGGING_NAME - value: "gpu-provisioner-config-logging" - - name: SYSTEM_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - {{- with .Values.controller.env }} - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.controller.envFrom }} - envFrom: - {{- toYaml . | nindent 12 }} - {{- end }} - ports: - - name: http - containerPort: {{ .Values.controller.healthProbe.port }} - protocol: TCP - livenessProbe: - initialDelaySeconds: 30 - timeoutSeconds: 30 - httpGet: - path: /healthz - port: http - readinessProbe: - initialDelaySeconds: 5 - timeoutSeconds: 30 - httpGet: - path: /readyz - port: http - {{- with .Values.controller.resources }} - resources: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - # The template below patches the .Values.affinity to add a default label selector where not specified - {{- $_ := include "gpu-provisioner.patchAffinity" $ }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.topologySpreadConstraints }} - # The template below patches the .Values.topologySpreadConstraints to add a default label selector where not specified - {{- $_ := include "gpu-provisioner.patchTopologySpreadConstraints" $ }} - topologySpreadConstraints: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} diff --git a/charts/kaito/gpu-provisioner/templates/role.yaml b/charts/kaito/gpu-provisioner/templates/role.yaml deleted file mode 100644 index 3ebd01fce..000000000 --- a/charts/kaito/gpu-provisioner/templates/role.yaml +++ /dev/null @@ -1,61 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "gpu-provisioner.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -rules: - # Read - - apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["get", "watch"] - - apiGroups: [""] - resources: ["configmaps", "namespaces", "secrets"] - verbs: ["get", "list", "watch"] - # Write - - apiGroups: [""] - resources: ["secrets"] - verbs: ["update"] - resourceNames: ["{{ include "gpu-provisioner.fullname" . }}-cert"] - - apiGroups: [""] - resources: ["configmaps"] - verbs: ["update", "patch", "delete"] - resourceNames: - - gpu-provisioner-global-settings - - config-logging - - apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["patch", "update"] - resourceNames: - - "gpu-provisioner-leader-election" - # Cannot specify resourceNames on create - # https://kubernetes.io/docs/reference/access-authn-authz/rbac/#referring-to-resources - - apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["create"] - - apiGroups: [""] - resources: ["configmaps"] - verbs: ["create"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "gpu-provisioner.fullname" . }}-dns - namespace: kube-system - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . 
| nindent 4 }} - {{- end }} -rules: - # Read - - apiGroups: [""] - resources: ["services"] - resourceNames: ["kube-dns"] - verbs: ["get"] diff --git a/charts/kaito/gpu-provisioner/templates/rolebinding.yaml b/charts/kaito/gpu-provisioner/templates/rolebinding.yaml deleted file mode 100644 index 9c3140135..000000000 --- a/charts/kaito/gpu-provisioner/templates/rolebinding.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "gpu-provisioner.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ include "gpu-provisioner.fullname" . }} -subjects: - - kind: ServiceAccount - name: gpu-provisioner - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "gpu-provisioner.fullname" . }}-dns - namespace: kube-system - labels: - {{- include "gpu-provisioner.labels" . | nindent 4 }} - {{- with .Values.additionalAnnotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ include "gpu-provisioner.fullname" . }}-dns -subjects: - - kind: ServiceAccount - name: gpu-provisioner - namespace: {{ .Release.Namespace }} \ No newline at end of file diff --git a/charts/kaito/gpu-provisioner/templates/serviceaccount.yaml b/charts/kaito/gpu-provisioner/templates/serviceaccount.yaml deleted file mode 100644 index 0b6c5b11a..000000000 --- a/charts/kaito/gpu-provisioner/templates/serviceaccount.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: gpu-provisioner - namespace: {{ .Release.Namespace }} - labels: - {{- include "gpu-provisioner.labels" . 
| nindent 4 }} - annotations: - azure.workload.identity/client-id: {{ .Values.workloadIdentity.clientId }} - azure.workload.identity/tenant-id: {{ .Values.workloadIdentity.tenantId }} diff --git a/charts/kaito/gpu-provisioner/values.yaml b/charts/kaito/gpu-provisioner/values.yaml deleted file mode 100644 index 74ec2853f..000000000 --- a/charts/kaito/gpu-provisioner/values.yaml +++ /dev/null @@ -1,161 +0,0 @@ -# -- Overrides the chart's name. -nameOverride: "" -# -- Overrides the chart's computed fullname. -fullnameOverride: "" -# -- Additional labels to add into metadata. -additionalLabels: {} -# app: gpu-provisioner - -# -- Additional annotations to add into metadata. -additionalAnnotations: {} -# -- Image pull policy for Docker images. -imagePullPolicy: IfNotPresent -# -- Image pull secrets for Docker images. -imagePullSecrets: [] -serviceAccount: - # -- Specifies if a ServiceAccount should be created. - create: true - # -- The name of the ServiceAccount to use. - # If not set and create is true, a name is generated using the fullname template. - name: "" - # -- Additional annotations for the ServiceAccount. - annotations: {} -# -- Specifies additional rules for the core ClusterRole. -additionalClusterRoleRules: [] -serviceMonitor: - # -- Specifies whether a ServiceMonitor should be created. - enabled: false - # -- Additional labels for the ServiceMonitor. - additionalLabels: {} - # -- Endpoint configuration for the ServiceMonitor. - endpointConfig: {} -# -- Number of replicas. -replicas: 1 -# -- The number of old ReplicaSets to retain to allow rollback. -revisionHistoryLimit: 10 -# -- Strategy for updating the pod. -strategy: - rollingUpdate: - maxUnavailable: 1 -# -- Additional labels for the pod. -podLabels: {} -# -- Additional annotations for the pod. -podAnnotations: {} -podDisruptionBudget: - name: gpu-provisioner - maxUnavailable: 1 -# -- SecurityContext for the pod. -podSecurityContext: - fsGroup: 1000 -# -- PriorityClass name for the pod. 
-priorityClassName: system-cluster-critical -# -- Override the default termination grace period for the pod. -terminationGracePeriodSeconds: -# -- Bind the pod to the host network. -# This is required when using a custom CNI. -hostNetwork: false -# -- Configure the DNS Policy for the pod -dnsPolicy: Default -# -- Configure DNS Config for the pod -dnsConfig: {} -# options: -# - name: ndots -# value: "1" -# -- Node selectors to schedule the pod to nodes with labels. -nodeSelector: - kubernetes.io/os: linux -# -- Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. -affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.azure.com/cluster - operator: Exists - - key: type - operator: NotIn - values: - - virtual-kubelet - - key: kubernetes.io/os - operator: In - values: - - linux - - matchExpressions: - - key: karpenter.sh/provisioner-name - operator: DoesNotExist - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - topologyKey: "kubernetes.io/hostname" -# -- Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. -topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: ScheduleAnyway -# -- Tolerations to allow the pod to be scheduled to nodes with taints. -tolerations: - - key: CriticalAddonsOnly - operator: Exists -# -- Additional volumes for the pod. -extraVolumes: [] -controller: - image: - # -- Repository path to the controller image. - repository: mcr.microsoft.com/aks/kaito/gpu-provisioner - # -- Tag of the controller image. - tag: 0.2.0 - # -- SHA256 digest of the controller image. 
- digest: "" - # -- SecurityContext for the controller container. - securityContext: {} - # -- Additional environment variables for the controller pod. - env: - - name: ARM_SUBSCRIPTION_ID - value: - - name: LOCATION - value: - - name: AZURE_CLUSTER_NAME - value: - - name: AZURE_NODE_RESOURCE_GROUP - value: - - name: ARM_RESOURCE_GROUP - value: - - name: LEADER_ELECT # disable leader election for better debugging experience - value: "false" - - name: E2E_TEST_MODE - value: "false" - envFrom: [] - # -- Resources for the controller pod. - resources: - requests: - cpu: 200m - limits: - cpu: 500m - # -- Controller outputPaths - default to stdout only - outputPaths: - - stdout - # -- Controller errorOutputPaths - default to stderr only - errorOutputPaths: - - stderr - # -- Controller log level, defaults to the global log level - logLevel: debug - # -- Controller log encoding, defaults to the global log encoding - logEncoding: "" - metrics: - # -- The container port to use for metrics. - port: 8000 - healthProbe: - # -- The container port to use for http health probe. - port: 8081 -# -- Global log level -logLevel: debug -# -- Global log encoding -logEncoding: console -# -- Global Settings to configure gpu-provisioner -workloadIdentity: - clientId: "" - tenantId: "" -settings: - # -- Azure-specific configuration values - azure: - # -- Cluster name. 
- clusterName: diff --git a/charts/kaito/gpu-provisioner/crds/karpenter.sh_machines.yaml b/charts/kaito/workspace/crds/karpenter.sh_machines.yaml similarity index 100% rename from charts/kaito/gpu-provisioner/crds/karpenter.sh_machines.yaml rename to charts/kaito/workspace/crds/karpenter.sh_machines.yaml diff --git a/charts/kaito/workspace/crds/karpenter.sh_nodeclaims.yaml b/charts/kaito/workspace/crds/karpenter.sh_nodeclaims.yaml new file mode 100644 index 000000000..37abdeca8 --- /dev/null +++ b/charts/kaito/workspace/crds/karpenter.sh_nodeclaims.yaml @@ -0,0 +1,435 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodeclaims.karpenter.sh +spec: + group: karpenter.sh + names: + categories: + - karpenter + kind: NodeClaim + listKind: NodeClaimList + plural: nodeclaims + singular: nodeclaim + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .metadata.labels.node\.kubernetes\.io/instance-type + name: Type + type: string + - jsonPath: .metadata.labels.topology\.kubernetes\.io/zone + name: Zone + type: string + - jsonPath: .status.nodeName + name: Node + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .metadata.labels.karpenter\.sh/capacity-type + name: Capacity + priority: 1 + type: string + - jsonPath: .metadata.labels.karpenter\.sh/nodepool + name: NodePool + priority: 1 + type: string + - jsonPath: .spec.nodeClassRef.name + name: NodeClass + priority: 1 + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: NodeClaim is the Schema for the NodeClaims API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NodeClaimSpec describes the desired state of the NodeClaim + properties: + kubelet: + description: |- + Kubelet defines args to be used when configuring kubelet on provisioned nodes. + They are a subset of the upstream types, recognizing not all options may be supported. + Wherever possible, the types and names should reflect the upstream kubelet types. + properties: + clusterDNS: + description: |- + clusterDNS is a list of IP addresses for the cluster DNS server. + Note that not all providers may use all addresses. + items: + type: string + type: array + cpuCFSQuota: + description: CPUCFSQuota enables CPU CFS quota enforcement for containers that specify CPU limits. 
+ type: boolean + evictionHard: + additionalProperties: + type: string + pattern: ^((\d{1,2}(\.\d{1,2})?|100(\.0{1,2})?)%||(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?)$ + description: EvictionHard is the map of signal names to quantities that define hard eviction thresholds + type: object + x-kubernetes-validations: + - message: valid keys for evictionHard are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + evictionMaxPodGracePeriod: + description: |- + EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in + response to soft eviction thresholds being met. + format: int32 + type: integer + evictionSoft: + additionalProperties: + type: string + pattern: ^((\d{1,2}(\.\d{1,2})?|100(\.0{1,2})?)%||(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?)$ + description: EvictionSoft is the map of signal names to quantities that define soft eviction thresholds + type: object + x-kubernetes-validations: + - message: valid keys for evictionSoft are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + evictionSoftGracePeriod: + additionalProperties: + type: string + description: EvictionSoftGracePeriod is the map of signal names to quantities that define grace periods for each eviction signal + type: object + x-kubernetes-validations: + - message: valid keys for evictionSoftGracePeriod are 
['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + imageGCHighThresholdPercent: + description: |- + ImageGCHighThresholdPercent is the percent of disk usage after which image + garbage collection is always run. The percent is calculated by dividing this + field value by 100, so this field must be between 0 and 100, inclusive. + When specified, the value must be greater than ImageGCLowThresholdPercent. + format: int32 + maximum: 100 + minimum: 0 + type: integer + imageGCLowThresholdPercent: + description: |- + ImageGCLowThresholdPercent is the percent of disk usage before which image + garbage collection is never run. Lowest disk usage to garbage collect to. + The percent is calculated by dividing this field value by 100, + so the field value must be between 0 and 100, inclusive. + When specified, the value must be less than imageGCHighThresholdPercent + format: int32 + maximum: 100 + minimum: 0 + type: integer + kubeReserved: + additionalProperties: + type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + description: KubeReserved contains resources reserved for Kubernetes system components. + type: object + x-kubernetes-validations: + - message: valid keys for kubeReserved are ['cpu','memory','ephemeral-storage','pid'] + rule: self.all(x, x=='cpu' || x=='memory' || x=='ephemeral-storage' || x=='pid') + - message: kubeReserved value cannot be a negative resource quantity + rule: self.all(x, !self[x].startsWith('-')) + maxPods: + description: |- + MaxPods is an override for the maximum number of pods that can run on + a worker node instance. 
+ format: int32 + minimum: 0 + type: integer + podsPerCore: + description: |- + PodsPerCore is an override for the number of pods that can run on a worker node + instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if + MaxPods is a lower value, that value will be used. + format: int32 + minimum: 0 + type: integer + systemReserved: + additionalProperties: + type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + description: SystemReserved contains resources reserved for OS system daemons and kernel memory. + type: object + x-kubernetes-validations: + - message: valid keys for systemReserved are ['cpu','memory','ephemeral-storage','pid'] + rule: self.all(x, x=='cpu' || x=='memory' || x=='ephemeral-storage' || x=='pid') + - message: systemReserved value cannot be a negative resource quantity + rule: self.all(x, !self[x].startsWith('-')) + type: object + x-kubernetes-validations: + - message: imageGCHighThresholdPercent must be greater than imageGCLowThresholdPercent + rule: 'has(self.imageGCHighThresholdPercent) && has(self.imageGCLowThresholdPercent) ? self.imageGCHighThresholdPercent > self.imageGCLowThresholdPercent : true' + - message: evictionSoft OwnerKey does not have a matching evictionSoftGracePeriod + rule: has(self.evictionSoft) ? self.evictionSoft.all(e, (e in self.evictionSoftGracePeriod)):true + - message: evictionSoftGracePeriod OwnerKey does not have a matching evictionSoft + rule: has(self.evictionSoftGracePeriod) ? 
self.evictionSoftGracePeriod.all(e, (e in self.evictionSoft)):true + nodeClassRef: + description: NodeClassRef is a reference to an object that defines provider specific configuration + properties: + apiVersion: + description: API version of the referent + type: string + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + required: + - name + type: object + requirements: + description: Requirements are layered with GetLabels and applied to every node. + items: + description: |- + A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values + and minValues that represent the requirement to have at least that many values. + properties: + key: + description: The label key that the selector applies to. 
+ type: string + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.find("^([^/]+)").endsWith("node.kubernetes.io") || self.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !self.find("^([^/]+)").endsWith("kubernetes.io") + - message: label domain "k8s.io" is restricted + rule: self.find("^([^/]+)").endsWith("kops.k8s.io") || !self.find("^([^/]+)").endsWith("k8s.io") + - message: label domain "karpenter.sh" is restricted + rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") + - message: label "kubernetes.io/hostname" is restricted + rule: self != "kubernetes.io/hostname" + - message: label domain "karpenter.k8s.aws" is restricted + rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-cpu-manufacturer","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", 
"karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + minValues: + description: |- + This field is ALPHA and can be dropped or replaced at any time + MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. + maximum: 50 + minimum: 1 + type: integer + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist, Gt, and Lt. + type: string + enum: + - In + - NotIn + - Exists + - DoesNotExist + - Gt + - Lt + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + required: + - key + - operator + type: object + maxItems: 30 + type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' + - message: requirements with 'minValues' must have at least that many values specified in the 'values' field + rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? 
x.values.size() >= x.minValues : true)' + resources: + description: Resources models the resource requirements for the NodeClaim to launch + properties: + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Requests describes the minimum required resources for the NodeClaim to launch + type: object + type: object + startupTaints: + description: |- + StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically + within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by + daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning + purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. 
+ type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + taints: + description: Taints will be applied to the NodeClaim's node. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. 
+ type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + required: + - nodeClassRef + - requirements + type: object + status: + description: NodeClaimStatus defines the observed state of NodeClaim + properties: + allocatable: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Allocatable is the estimated allocatable capacity of the node + type: object + capacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Capacity is the estimated full capacity of the node + type: object + conditions: + description: Conditions contains signals for health and readiness + items: + description: |- + Condition defines a readiness condition for a Knative resource. + See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties + properties: + lastTransitionTime: + description: |- + LastTransitionTime is the last time the condition transitioned from one status to another. + We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic + differences (all other things held constant). + type: string + message: + description: A human readable message indicating details about the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + severity: + description: |- + Severity with which to treat failures of this type of condition. + When this is not specified, it defaults to Error. 
+ type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of condition. + type: string + required: + - status + - type + type: object + type: array + imageID: + description: ImageID is an identifier for the image that runs on the node + type: string + nodeName: + description: NodeName is the name of the corresponding node object + type: string + providerID: + description: ProviderID of the corresponding node object + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/kaito/workspace/crds/karpenter.sh_nodepools.yaml b/charts/kaito/workspace/crds/karpenter.sh_nodepools.yaml new file mode 100644 index 000000000..6595d6dd4 --- /dev/null +++ b/charts/kaito/workspace/crds/karpenter.sh_nodepools.yaml @@ -0,0 +1,527 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodepools.karpenter.sh +spec: + group: karpenter.sh + names: + categories: + - karpenter + kind: NodePool + listKind: NodePoolList + plural: nodepools + singular: nodepool + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.template.spec.nodeClassRef.name + name: NodeClass + type: string + - jsonPath: .spec.weight + name: Weight + priority: 1 + type: string + name: v1beta1 + schema: + openAPIV3Schema: + description: NodePool is the Schema for the NodePools API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + NodePoolSpec is the top level nodepool specification. Nodepools + launch nodes in response to pods that are unschedulable. A single nodepool + is capable of managing a diverse set of nodes. Node properties are determined + from a combination of nodepool and pod scheduling constraints. + properties: + disruption: + default: + consolidationPolicy: WhenUnderutilized + expireAfter: 720h + description: Disruption contains the parameters that relate to Karpenter's disruption logic + properties: + budgets: + default: + - nodes: 10% + description: |- + Budgets is a list of Budgets. + If there are multiple active budgets, Karpenter uses + the most restrictive value. If left undefined, + this will default to one budget with a value of 10%. + items: + description: |- + Budget defines when Karpenter will restrict the + number of Node Claims that can be terminating simultaneously. + properties: + duration: + description: |- + Duration determines how long a Budget is active since each Schedule hit. + Only minutes and hours are accepted, as cron does not work in seconds. + If omitted, the budget is always active. + This is required if Schedule is set. + This regex has an optional 0s at the end since the duration.String() always adds + a 0s at the end. + pattern: ^((([0-9]+(h|m))|([0-9]+h[0-9]+m))(0s)?)$ + type: string + nodes: + default: 10% + description: |- + Nodes dictates the maximum number of NodeClaims owned by this NodePool + that can be terminating at once. 
This is calculated by counting nodes that + have a deletion timestamp set, or are actively being deleted by Karpenter. + This field is required when specifying a budget. + This cannot be of type intstr.IntOrString since kubebuilder doesn't support pattern + checking for int nodes for IntOrString nodes. + Ref: https://github.com/kubernetes-sigs/controller-tools/blob/55efe4be40394a288216dab63156b0a64fb82929/pkg/crd/markers/validation.go#L379-L388 + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + type: string + schedule: + description: |- + Schedule specifies when a budget begins being active, following + the upstream cronjob syntax. If omitted, the budget is always active. + Timezones are not supported. + This field is required if Duration is set. + pattern: ^(@(annually|yearly|monthly|weekly|daily|midnight|hourly))|((.+)\s(.+)\s(.+)\s(.+)\s(.+))$ + type: string + required: + - nodes + type: object + maxItems: 50 + type: array + x-kubernetes-validations: + - message: '''schedule'' must be set with ''duration''' + rule: self.all(x, has(x.schedule) == has(x.duration)) + consolidateAfter: + description: |- + ConsolidateAfter is the duration the controller will wait + before attempting to terminate nodes that are underutilized. + Refer to ConsolidationPolicy for how underutilization is considered. + pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + consolidationPolicy: + default: WhenUnderutilized + description: |- + ConsolidationPolicy describes which nodes Karpenter can disrupt through its consolidation + algorithm. This policy defaults to "WhenUnderutilized" if not specified + enum: + - WhenEmpty + - WhenUnderutilized + type: string + expireAfter: + default: 720h + description: |- + ExpireAfter is the duration the controller will wait + before terminating a node, measured from when the node is created. This + is useful to implement features like eventually consistent node upgrade, + memory leak protection, and disruption testing. 
+ pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + type: object + x-kubernetes-validations: + - message: consolidateAfter cannot be combined with consolidationPolicy=WhenUnderutilized + rule: 'has(self.consolidateAfter) ? self.consolidationPolicy != ''WhenUnderutilized'' || self.consolidateAfter == ''Never'' : true' + - message: consolidateAfter must be specified with consolidationPolicy=WhenEmpty + rule: 'self.consolidationPolicy == ''WhenEmpty'' ? has(self.consolidateAfter) : true' + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Limits define a set of bounds for provisioning capacity. + type: object + template: + description: |- + Template contains the template of possibilities for the provisioning logic to launch a NodeClaim with. + NodeClaims launched from this NodePool will often be further constrained than the template specifies. + properties: + metadata: + properties: + annotations: + additionalProperties: + type: string + description: |- + Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations + type: object + labels: + additionalProperties: + type: string + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + description: |- + Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels + type: object + maxProperties: 100 + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self.all(x, x in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/region", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || x.find("^([^/]+)").endsWith("node.kubernetes.io") || x.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !x.find("^([^/]+)").endsWith("kubernetes.io")) + - message: label domain "k8s.io" is restricted + rule: self.all(x, x.find("^([^/]+)").endsWith("kops.k8s.io") || !x.find("^([^/]+)").endsWith("k8s.io")) + - message: label domain "karpenter.sh" is restricted + rule: self.all(x, x in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !x.find("^([^/]+)").endsWith("karpenter.sh")) + - message: label "karpenter.sh/nodepool" is restricted + rule: self.all(x, x != "karpenter.sh/nodepool") + - message: label "kubernetes.io/hostname" is restricted + rule: self.all(x, x != "kubernetes.io/hostname") + - message: label domain "karpenter.azure.com" is restricted + rule: self.all(x, x in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !x.find("^([^/]+)").endsWith("karpenter.azure.com")) + type: object + spec: + description: NodeClaimSpec describes the desired state of 
the NodeClaim + properties: + kubelet: + description: |- + Kubelet defines args to be used when configuring kubelet on provisioned nodes. + They are a subset of the upstream types, recognizing not all options may be supported. + Wherever possible, the types and names should reflect the upstream kubelet types. + properties: + clusterDNS: + description: |- + clusterDNS is a list of IP addresses for the cluster DNS server. + Note that not all providers may use all addresses. + items: + type: string + type: array + cpuCFSQuota: + description: CPUCFSQuota enables CPU CFS quota enforcement for containers that specify CPU limits. + type: boolean + evictionHard: + additionalProperties: + type: string + pattern: ^((\d{1,2}(\.\d{1,2})?|100(\.0{1,2})?)%||(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?)$ + description: EvictionHard is the map of signal names to quantities that define hard eviction thresholds + type: object + x-kubernetes-validations: + - message: valid keys for evictionHard are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + evictionMaxPodGracePeriod: + description: |- + EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in + response to soft eviction thresholds being met. 
+ format: int32 + type: integer + evictionSoft: + additionalProperties: + type: string + pattern: ^((\d{1,2}(\.\d{1,2})?|100(\.0{1,2})?)%||(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?)$ + description: EvictionSoft is the map of signal names to quantities that define soft eviction thresholds + type: object + x-kubernetes-validations: + - message: valid keys for evictionSoft are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + evictionSoftGracePeriod: + additionalProperties: + type: string + description: EvictionSoftGracePeriod is the map of signal names to quantities that define grace periods for each eviction signal + type: object + x-kubernetes-validations: + - message: valid keys for evictionSoftGracePeriod are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + imageGCHighThresholdPercent: + description: |- + ImageGCHighThresholdPercent is the percent of disk usage after which image + garbage collection is always run. The percent is calculated by dividing this + field value by 100, so this field must be between 0 and 100, inclusive. + When specified, the value must be greater than ImageGCLowThresholdPercent. + format: int32 + maximum: 100 + minimum: 0 + type: integer + imageGCLowThresholdPercent: + description: |- + ImageGCLowThresholdPercent is the percent of disk usage before which image + garbage collection is never run. Lowest disk usage to garbage collect to. 
+ The percent is calculated by dividing this field value by 100, + so the field value must be between 0 and 100, inclusive. + When specified, the value must be less than imageGCHighThresholdPercent + format: int32 + maximum: 100 + minimum: 0 + type: integer + kubeReserved: + additionalProperties: + type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + description: KubeReserved contains resources reserved for Kubernetes system components. + type: object + x-kubernetes-validations: + - message: valid keys for kubeReserved are ['cpu','memory','ephemeral-storage','pid'] + rule: self.all(x, x=='cpu' || x=='memory' || x=='ephemeral-storage' || x=='pid') + - message: kubeReserved value cannot be a negative resource quantity + rule: self.all(x, !self[x].startsWith('-')) + maxPods: + description: |- + MaxPods is an override for the maximum number of pods that can run on + a worker node instance. + format: int32 + minimum: 0 + type: integer + podsPerCore: + description: |- + PodsPerCore is an override for the number of pods that can run on a worker node + instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if + MaxPods is a lower value, that value will be used. + format: int32 + minimum: 0 + type: integer + systemReserved: + additionalProperties: + type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + description: SystemReserved contains resources reserved for OS system daemons and kernel memory. 
+ type: object + x-kubernetes-validations: + - message: valid keys for systemReserved are ['cpu','memory','ephemeral-storage','pid'] + rule: self.all(x, x=='cpu' || x=='memory' || x=='ephemeral-storage' || x=='pid') + - message: systemReserved value cannot be a negative resource quantity + rule: self.all(x, !self[x].startsWith('-')) + type: object + x-kubernetes-validations: + - message: imageGCHighThresholdPercent must be greater than imageGCLowThresholdPercent + rule: 'has(self.imageGCHighThresholdPercent) && has(self.imageGCLowThresholdPercent) ? self.imageGCHighThresholdPercent > self.imageGCLowThresholdPercent : true' + - message: evictionSoft OwnerKey does not have a matching evictionSoftGracePeriod + rule: has(self.evictionSoft) ? self.evictionSoft.all(e, (e in self.evictionSoftGracePeriod)):true + - message: evictionSoftGracePeriod OwnerKey does not have a matching evictionSoft + rule: has(self.evictionSoftGracePeriod) ? self.evictionSoftGracePeriod.all(e, (e in self.evictionSoft)):true + nodeClassRef: + description: NodeClassRef is a reference to an object that defines provider specific configuration + properties: + apiVersion: + description: API version of the referent + type: string + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + required: + - name + type: object + requirements: + description: Requirements are layered with GetLabels and applied to every node. + items: + description: |- + A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values + and minValues that represent the requirement to have at least that many values. + properties: + key: + description: The label key that the selector applies to. 
+ type: string + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.find("^([^/]+)").endsWith("node.kubernetes.io") || self.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !self.find("^([^/]+)").endsWith("kubernetes.io") + - message: label domain "k8s.io" is restricted + rule: self.find("^([^/]+)").endsWith("kops.k8s.io") || !self.find("^([^/]+)").endsWith("k8s.io") + - message: label domain "karpenter.sh" is restricted + rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") + - message: label "karpenter.sh/nodepool" is restricted + rule: self != "karpenter.sh/nodepool" + - message: label "kubernetes.io/hostname" is restricted + rule: self != "kubernetes.io/hostname" + - message: label domain "karpenter.azure.com" is restricted + rule: self in [ "karpenter.azure.com/sku-name", "karpenter.azure.com/sku-family", "karpenter.azure.com/sku-version", "karpenter.azure.com/sku-cpu", "karpenter.azure.com/sku-memory", "karpenter.azure.com/sku-accelerator", "karpenter.azure.com/sku-networking-accelerated", "karpenter.azure.com/sku-storage-premium-capable", "karpenter.azure.com/sku-storage-ephemeralos-maxsize", "karpenter.azure.com/sku-encryptionathost-capable", "karpenter.azure.com/sku-gpu-name", "karpenter.azure.com/sku-gpu-manufacturer", "karpenter.azure.com/sku-gpu-count" ] || !self.find("^([^/]+)").endsWith("karpenter.azure.com") + minValues: + description: |- + This field is ALPHA 
and can be dropped or replaced at any time + MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. + maximum: 50 + minimum: 1 + type: integer + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + enum: + - In + - NotIn + - Exists + - DoesNotExist + - Gt + - Lt + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + required: + - key + - operator + type: object + maxItems: 30 + type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' + - message: requirements with 'minValues' must have at least that many values specified in the 'values' field + rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? 
x.values.size() >= x.minValues : true)' + resources: + description: Resources models the resource requirements for the NodeClaim to launch + properties: + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Requests describes the minimum required resources for the NodeClaim to launch + type: object + type: object + maxProperties: 0 + startupTaints: + description: |- + StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically + within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by + daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning + purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. 
+ type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + taints: + description: Taints will be applied to the NodeClaim's node. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + required: + - nodeClassRef + - requirements + type: object + required: + - spec + type: object + weight: + description: |- + Weight is the priority given to the nodepool during scheduling. A higher + numerical weight indicates that this nodepool will be ordered + ahead of other nodepools with lower weights. A nodepool with no weight + will be treated as if it is a nodepool with a weight of 0. 
+ format: int32 + maximum: 100 + minimum: 1 + type: integer + required: + - template + type: object + status: + description: NodePoolStatus defines the observed state of NodePool + properties: + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resources is the list of resources that have been provisioned. + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 75ead6938..acf518e90 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -32,7 +32,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { //check gpu-provisioner deployment is up and running gpuProvisionerDeployment := &v1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: "kaito-gpu-provisioner", + Name: "gpu-provisioner", Namespace: gpuNamespace, }, }