diff --git a/.github/workflows/ragengine-e2e-workflow.yml b/.github/workflows/ragengine-e2e-workflow.yml new file mode 100644 index 000000000..a55a472c9 --- /dev/null +++ b/.github/workflows/ragengine-e2e-workflow.yml @@ -0,0 +1,284 @@ +name: ragengine-e2e-workflow + +on: + workflow_call: + inputs: + git_sha: + type: string + required: true + node_provisioner: + type: string + required: false + default: gpuprovisioner + tag: + type: string + isRelease: + type: boolean + default: false + registry: + type: string + region: + type: string + description: "the azure location to run the e2e test in" + default: "eastus" + k8s_version: + type: string + default: "1.30.0" + +jobs: + e2e-tests: + runs-on: [ "self-hosted", "hostname:kaito-e2e-github-runner" ] + name: e2e-tests-${{ inputs.node_provisioner }} + permissions: + contents: read + id-token: write # This is required for requesting the JWT + environment: e2e-test + env: + GO_VERSION: "1.22" + KARPENTER_NAMESPACE: "karpenter" + GPU_PROVISIONER_NAMESPACE: "gpu-provisioner" + + steps: + - name: Harden Runner + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ inputs.git_sha }} + + - name: Set e2e Resource and Cluster Name + run: | + rand=$(git rev-parse --short ${{ inputs.git_sha }}) + + if [ "$rand" = "" ]; then + rand=$RANDOM + fi + + echo "VERSION=${rand}" >> $GITHUB_ENV + echo "CLUSTER_NAME=${{ inputs.node_provisioner }}${rand}" >> $GITHUB_ENV + echo "REGISTRY=${{ inputs.node_provisioner }}${rand}.azurecr.io" >> $GITHUB_ENV + echo "RUN_LLAMA_13B=false" >> $GITHUB_ENV + + - name: Set Registry + if: ${{ inputs.isRelease }} + run: | + echo "REGISTRY=${{ inputs.registry }}" >> $GITHUB_ENV + echo "VERSION=$(echo ${{ inputs.tag }} | tr -d v)" >> $GITHUB_ENV + + - name: Remove existing Go modules directory + run: sudo rm -rf ~/go/pkg/mod + + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@v5.2.0 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install Azure CLI latest + run: | + if ! which az > /dev/null; then + echo "Azure CLI not found. Installing..." + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + else + echo "Azure CLI already installed." 
+ fi + + - name: Azure CLI Login + run: | + az login --identity + + - uses: azure/setup-helm@v4 + id: install + + - name: Create Resource Group + shell: bash + run: | + make create-rg + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + + - name: Create ACR + shell: bash + run: | + make create-acr + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} + + - name: Create Azure Identity + uses: azure/CLI@v2.1.0 + with: + inlineScript: | + az identity create --name ${{ inputs.node_provisioner }}Identity --resource-group ${{ env.CLUSTER_NAME }} + + - name: Generate APIs + run: | + make generate + + - name: build KAITO image + if: ${{ !inputs.isRelease }} + shell: bash + run: | + make docker-build-workspace + env: + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + + - name: build kaito RAG Engine image + if: ${{ !inputs.isRelease }} + shell: bash + run: | + make docker-build-ragengine + env: + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + + + + - name: create cluster + shell: bash + run: | + if [ "${{ inputs.node_provisioner }}" == "gpuprovisioner" ]; then + make create-aks-cluster + else + make create-aks-cluster-for-karpenter + fi + env: + AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + AZURE_LOCATION: ${{ inputs.region }} + AKS_K8S_VERSION: ${{ inputs.k8s_version }} + + - name: Create Identities and Permissions for ${{ inputs.node_provisioner }} + shell: bash + run: | + AZURE_SUBSCRIPTION_ID=$E2E_SUBSCRIPTION_ID \ + make generate-identities + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + TEST_SUITE: ${{ inputs.node_provisioner }} + + - name: Install gpu-provisioner helm chart + if: ${{ inputs.node_provisioner == 'gpuprovisioner' }} + shell: bash + run: | + AZURE_TENANT_ID=$E2E_TENANT_ID \ + AZURE_SUBSCRIPTION_ID=$E2E_SUBSCRIPTION_ID \ + make gpu-provisioner-helm + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + + - name: Install karpenter Azure provider helm chart + if: ${{ inputs.node_provisioner == 'azkarpenter' }} + shell: bash + run: | + AZURE_TENANT_ID=$E2E_TENANT_ID \ + AZURE_SUBSCRIPTION_ID=$E2E_SUBSCRIPTION_ID \ + make azure-karpenter-helm + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }} + KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }} + + - name: Install KAITO Workspace helm chart + shell: bash + run: | + make az-patch-install-helm + kubectl wait --for=condition=available deploy "kaito-workspace" -n kaito-workspace --timeout=300s + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + TEST_SUITE: ${{ inputs.node_provisioner }} + + - name: Install KAITO RAG Engine helm chart + shell: bash + run: | + make az-patch-install-ragengine-helm + kubectl wait --for=condition=available deploy "kaito-ragengine" -n kaito-ragengine --timeout=300s + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + TEST_SUITE: ${{ inputs.node_provisioner }} + + # Retrieve E2E ACR credentials and create Kubernetes secret + - name: Set up E2E ACR Credentials and Secret + shell: bash + run: | + # Retrieve the ACR username and 
password + ACR_USERNAME=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "username" -o tsv) + ACR_PASSWORD=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "passwords[0].value" -o tsv) + + # Ensure credentials were retrieved successfully + if [ -z "$ACR_USERNAME" ] || [ -z "$ACR_PASSWORD" ]; then + echo "Failed to retrieve ACR credentials" + exit 1 + fi + + # Create the Kubernetes secret with the retrieved credentials + kubectl create secret docker-registry ${{ env.CLUSTER_NAME }}-acr-secret \ + --docker-server=${{ env.CLUSTER_NAME }}.azurecr.io \ + --docker-username=${ACR_USERNAME} \ + --docker-password=${ACR_PASSWORD} + + # Add Private-Hosted ACR secret for private models like llama + - name: Add Private-Hosted ACR Secret Credentials + run: | + # Ensure E2E_AMRT_SECRET_NAME is sanitized to remove any accidental quotes + E2E_AMRT_SECRET_NAME=$(echo "$E2E_AMRT_SECRET_NAME" | sed 's/[\"'\'']//g') + + if kubectl get secret "$E2E_AMRT_SECRET_NAME" >/dev/null 2>&1; then + echo "Secret $E2E_AMRT_SECRET_NAME already exists. Skipping creation." + else + kubectl create secret docker-registry "$E2E_AMRT_SECRET_NAME" \ + --docker-server="$E2E_ACR_AMRT_USERNAME.azurecr.io" \ + --docker-username="$E2E_ACR_AMRT_USERNAME" \ + --docker-password="$E2E_ACR_AMRT_PASSWORD" + echo "Secret $E2E_AMRT_SECRET_NAME created successfully." + fi + + - name: Log ${{ inputs.node_provisioner }} + run: | + if [ "${{ inputs.node_provisioner }}" == "gpuprovisioner" ]; then + kubectl logs -n "${{ env.GPU_PROVISIONER_NAMESPACE }}" -l app.kubernetes.io/name=gpu-provisioner -c controller + else + kubectl logs -n "${{ env.KARPENTER_NAMESPACE }}" -l app.kubernetes.io/name=karpenter -c controller + fi + + - name: Log kaito-workspace + run: | + kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {} + + - name: Log kaito-ragengine + run: | + kubectl get pods -n kaito-ragengine -o name | grep "^pod/kaito-ragengine" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-ragengine {} + + - name: Run e2e test + run: | + AI_MODELS_REGISTRY=$E2E_ACR_AMRT_USERNAME.azurecr.io \ + AI_MODELS_REGISTRY_SECRET=$E2E_AMRT_SECRET_NAME \ + make kaito-ragengine-e2e-test + env: + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }} + REGISTRY: ${{ env.REGISTRY }} + TEST_SUITE: ${{ inputs.node_provisioner }} + E2E_ACR_REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io + E2E_ACR_REGISTRY_SECRET: ${{ env.CLUSTER_NAME }}-acr-secret + + - name: Cleanup e2e resources + if: ${{ always() }} + uses: azure/CLI@v2.1.0 + with: + inlineScript: | + set +e + az group delete --name "${{ env.CLUSTER_NAME }}" --yes --no-wait || true diff --git a/.github/workflows/ragengine-e2e.yml b/.github/workflows/ragengine-e2e.yml new file mode 100644 index 000000000..df0a2e847 --- /dev/null +++ b/.github/workflows/ragengine-e2e.yml @@ -0,0 +1,31 @@ +name: ragengine-e2e-test + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: + pull_request: + paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg'] + +env: + GO_VERSION: "1.22" + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + run-e2e: + strategy: + fail-fast: false + matrix: + node-provisioner: [gpuprovisioner] # WIP: azkarpenter] + 
    permissions:
+      contents: read
+      id-token: write
+      statuses: write
+    uses: ./.github/workflows/ragengine-e2e-workflow.yml
+    with:
+      git_sha: ${{ github.event.pull_request.head.sha }}
+      node_provisioner: ${{ matrix.node-provisioner }}
diff --git a/Makefile b/Makefile
index d78cfc18f..ed9d76359 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@
 # Image URL to use all building/pushing image targets
 REGISTRY ?= YOUR_REGISTRY
 IMG_NAME ?= workspace
+RAGENGINE_IMG_NAME ?= ragengine
 VERSION ?= v0.4.1
 GPU_PROVISIONER_VERSION ?= 0.2.1
 IMG_TAG ?= $(subst v,,$(VERSION))
@@ -32,6 +33,7 @@ AZURE_CLUSTER_NAME ?= kaito-demo
 AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION)
 GPU_PROVISIONER_NAMESPACE ?= gpu-provisioner
 KAITO_NAMESPACE ?= kaito-workspace
+KAITO_RAGENGINE_NAMESPACE ?= kaito-ragengine
 GPU_PROVISIONER_MSI_NAME ?= gpuprovisionerIdentity
 
 ## Azure Karpenter parameters
@@ -132,6 +134,9 @@ GINKGO_ARGS ?= -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_N
 $(E2E_TEST):
 	(cd test/e2e && go test -c . -o $(E2E_TEST))
 
+$(RAGENGINE_E2E_TEST):
+	(cd test/rage2e && go test -c . -o $(RAGENGINE_E2E_TEST))
+
 .PHONY: kaito-workspace-e2e-test
 kaito-workspace-e2e-test: $(E2E_TEST) $(GINKGO)
 	AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \
@@ -140,6 +145,14 @@ kaito-workspace-e2e-test: $(E2E_TEST) $(GINKGO)
 	SUPPORTED_MODELS_YAML_PATH=$(SUPPORTED_MODELS_YAML_PATH) \
 	$(GINKGO) -v -trace $(GINKGO_ARGS) $(E2E_TEST)
 
+.PHONY: kaito-ragengine-e2e-test
+kaito-ragengine-e2e-test: $(RAGENGINE_E2E_TEST) $(GINKGO)
+	AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \
+	AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_PROVISIONER_NAMESPACE=$(GPU_PROVISIONER_NAMESPACE) \
+	KARPENTER_NAMESPACE=$(KARPENTER_NAMESPACE) KAITO_RAGENGINE_NAMESPACE=$(KAITO_RAGENGINE_NAMESPACE) TEST_SUITE=$(TEST_SUITE) \
+	SUPPORTED_MODELS_YAML_PATH=$(SUPPORTED_MODELS_YAML_PATH) \
+	$(GINKGO) -v -trace $(GINKGO_ARGS) $(RAGENGINE_E2E_TEST)
+
 ## --------------------------------------
 ## Azure resources
 ## --------------------------------------
@@ -234,11 +247,11 @@ docker-build-workspace: docker-buildx
 .PHONY: docker-build-ragengine
 docker-build-ragengine: docker-buildx
 	docker buildx build \
-		--file ./docker/ragengine/Dockerfile \
-		--output=$(OUTPUT_TYPE) \
-		--platform="linux/$(ARCH)" \
-		--pull \
-		--tag $(REGISTRY)/$(RAGENGINE_IMG_NAME):$(RAGENGINE_IMG_TAG) .
+		--file ./docker/ragengine/Dockerfile \
+		--output=$(OUTPUT_TYPE) \
+		--platform="linux/$(ARCH)" \
+		--pull \
+		--tag $(REGISTRY)/$(RAGENGINE_IMG_NAME):$(IMG_TAG) .
.PHONY: docker-build-rag-service docker-build-ragservice: docker buildx @@ -318,6 +331,19 @@ az-patch-install-helm: ## Update Azure client env vars and settings in helm valu helm install kaito-workspace ./charts/kaito/workspace --namespace $(KAITO_NAMESPACE) --create-namespace +.PHONY: az-patch-install-ragengine-helm +az-patch-install-ragengine-helm: ## Update Azure client env vars and settings in helm values.yml + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) + + yq -i '(.image.repository) = "$(REGISTRY)/ragengine"' ./charts/kaito/ragengine/values.yaml + yq -i '(.image.tag) = "$(IMG_TAG)"' ./charts/kaito/ragengine/values.yaml + if [ $(TEST_SUITE) = "azkarpenter" ]; then \ + yq -i '(.featureGates.Karpenter) = "true"' ./charts/kaito/ragengine/values.yaml; \ + fi + yq -i '(.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/ragengine/values.yaml + + helm install kaito-ragengine ./charts/kaito/ragengine --namespace $(KAITO_RAGENGINE_NAMESPACE) --create-namespace + .PHONY: aws-patch-install-helm ##install kaito on AWS cluster aws-patch-install-helm: yq -i '(.image.repository) = "$(REGISTRY)/workspace"' ./charts/kaito/workspace/values.yaml diff --git a/charts/kaito/ragengine/values.yaml b/charts/kaito/ragengine/values.yaml index 0baf44396..db0e23cbf 100644 --- a/charts/kaito/ragengine/values.yaml +++ b/charts/kaito/ragengine/values.yaml @@ -1,7 +1,6 @@ # Default values for kaito. # This is a YAML-formatted file. # Declare variables to be passed into your templates. -fullnameOverride: ragengine replicaCount: 1 image: repository: mcr.microsoft.com/aks/kaito/ragengine diff --git a/test/rage2e/e2e_test.go b/test/rage2e/e2e_test.go new file mode 100644 index 000000000..a5e84fc7e --- /dev/null +++ b/test/rage2e/e2e_test.go @@ -0,0 +1,121 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package e2e + +import ( + "context" + "fmt" + "math/rand" + "os" + "testing" + + "github.com/kaito-project/kaito/test/rage2e/utils" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var ( + ctx = context.Background() + namespaceName = fmt.Sprint(utils.E2eNamespace, rand.Intn(100)) + nodeProvisionerName = os.Getenv("TEST_SUITE") +) + +var _ = BeforeSuite(func() { + utils.GetClusterClient(utils.TestingCluster) + + namespaceName = fmt.Sprintf("%s-%d", namespaceName, GinkgoParallelProcess()) + GinkgoWriter.Printf("Namespace %q for e2e tests\n", namespaceName) + + kaitoNamespace := os.Getenv("KAITO_RAGENGINE_NAMESPACE") + + if nodeProvisionerName == "azkarpenter" { + karpenterNamespace := os.Getenv("KARPENTER_NAMESPACE") + //check karpenter deployment is up and running + karpenterDeployment := &v1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "karpenter", + Namespace: karpenterNamespace, + }, + } + + Eventually(func() error { + return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: karpenterDeployment.Namespace, + Name: karpenterDeployment.Name, + }, karpenterDeployment, &client.GetOptions{}) + }, utils.PollTimeout, utils.PollInterval). 
+ Should(Succeed(), "Failed to wait for karpenter deployment") + } + + if nodeProvisionerName == "gpuprovisioner" { + gpuNamespace := os.Getenv("GPU_PROVISIONER_NAMESPACE") + //check gpu-provisioner deployment is up and running + gpuProvisionerDeployment := &v1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-provisioner", + Namespace: gpuNamespace, + }, + } + + Eventually(func() error { + return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: gpuProvisionerDeployment.Namespace, + Name: gpuProvisionerDeployment.Name, + }, gpuProvisionerDeployment, &client.GetOptions{}) + }, utils.PollTimeout, utils.PollInterval). + Should(Succeed(), "Failed to wait for gpu-provisioner deployment") + } + + //check kaito-workspace deployment is up and running + kaitoWorkspaceDeployment := &v1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kaito-workspace", + Namespace: kaitoNamespace, + }, + } + + Eventually(func() error { + return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: kaitoWorkspaceDeployment.Namespace, + Name: kaitoWorkspaceDeployment.Name, + }, kaitoWorkspaceDeployment, &client.GetOptions{}) + }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to wait for kaito-workspace deployment") + + // create testing namespace + err := utils.TestingCluster.KubeClient.Create(context.TODO(), &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespaceName, + }, + }) + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = AfterSuite(func() { + // delete testing namespace + Eventually(func() error { + return utils.TestingCluster.KubeClient.Delete(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespaceName, + }, + }, &client.DeleteOptions{}) + }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to delete namespace for e2e") + +}) + +func RunE2ETests(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "AI Toolchain Operator E2E Test Suite") +} + +func TestE2E(t *testing.T) { + RunE2ETests(t) +} + +func TestMain(m *testing.M) { + os.Exit(m.Run()) +} diff --git a/test/rage2e/preset_vllm_test.go b/test/rage2e/preset_vllm_test.go new file mode 100644 index 000000000..262bed795 --- /dev/null +++ b/test/rage2e/preset_vllm_test.go @@ -0,0 +1,324 @@ +package e2e + +import ( + "fmt" + "math/rand" + "os" + "strconv" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + appsv1 "k8s.io/api/apps/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/test/rage2e/utils" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + PresetPhi3Mini128kModel = "phi-3-mini-128k-instruct" +) + +func loadTestEnvVars() { + var err error + runLlama13B, err = strconv.ParseBool(os.Getenv("RUN_LLAMA_13B")) + if err != nil { + fmt.Print("Error: RUN_LLAMA_13B ENV Variable not set") + runLlama13B = false + } + + // Required for Llama models + aiModelsRegistry = utils.GetEnv("AI_MODELS_REGISTRY") + aiModelsRegistrySecret = utils.GetEnv("AI_MODELS_REGISTRY_SECRET") + // Currently required for uploading fine-tuning results + e2eACRSecret = utils.GetEnv("E2E_ACR_REGISTRY_SECRET") + supportedModelsYamlPath = utils.GetEnv("SUPPORTED_MODELS_YAML_PATH") + azureClusterName = utils.GetEnv("AZURE_CLUSTER_NAME") +} + +func loadModelVersions() { + // Load stable model versions + configs, err := utils.GetModelConfigInfo(supportedModelsYamlPath) + if err != nil { + fmt.Printf("Failed to load model configs: %v\n", err) + os.Exit(1) + } + + modelInfo, err = utils.ExtractModelVersion(configs) + if err != nil { + fmt.Printf("Failed to extract stable model versions: %v\n", err) + os.Exit(1) + } +} + +var runLlama13B bool +var aiModelsRegistry string +var aiModelsRegistrySecret string +var e2eACRSecret string +var supportedModelsYamlPath string +var modelInfo map[string]string +var azureClusterName string + +var _ = Describe("RAGEngine preset", func() { + BeforeEach(func() { + loadTestEnvVars() + loadModelVersions() + }) + + AfterEach(func() { + if CurrentSpecReport().Failed() { + utils.PrintPodLogsOnFailure(namespaceName, "") // The Preset Pod + utils.PrintPodLogsOnFailure("kaito-workspace", "") // The Kaito Workspace Pod + utils.PrintPodLogsOnFailure("gpu-provisioner", "") // The gpu-provisioner Pod + Fail("Fail threshold reached") + } + }) + + It("should create a Phi-3-mini-128k-instruct workspace with preset public mode successfully", func() { + numOfNode := 1 + workspaceObj := createPhi3WorkspaceWithPresetPublicModeAndVLLM(numOfNode) + + defer cleanupResources(workspaceObj) + time.Sleep(30 * time.Second) + + validateCreateNode(workspaceObj, numOfNode) + validateResourceStatus(workspaceObj) + + time.Sleep(30 * time.Second) + + validateAssociatedService(workspaceObj) + validateInferenceConfig(workspaceObj) + + validateInferenceResource(workspaceObj, int32(numOfNode), false) + + validateWorkspaceReadiness(workspaceObj) + }) +}) + +func createPhi3WorkspaceWithPresetPublicModeAndVLLM(numOfNode int) *kaitov1alpha1.Workspace { + workspaceObj := &kaitov1alpha1.Workspace{} + By("Creating a workspace CR with Phi-3-mini-128k-instruct preset public mode and vLLM", func() { + uniqueID := fmt.Sprint("preset-phi3-", rand.Intn(1000)) + workspaceObj = utils.GenerateInferenceWorkspaceManifestWithVLLM(uniqueID, namespaceName, "", numOfNode, "Standard_NC6s_v3", + &metav1.LabelSelector{ + MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-phi-3-mini-128k-instruct-vllm"}, + }, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil) + + createAndValidateWorkspace(workspaceObj) + }) + return workspaceObj +} + +func createAndValidateWorkspace(workspaceObj *kaitov1alpha1.Workspace) { + By("Creating workspace", func() { + Eventually(func() error { + 
return utils.TestingCluster.KubeClient.Create(ctx, workspaceObj, &client.CreateOptions{}) + }, utils.PollTimeout, utils.PollInterval). + Should(Succeed(), "Failed to create workspace %s", workspaceObj.Name) + + By("Validating workspace creation", func() { + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: workspaceObj.Name, + }, workspaceObj, &client.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + }) +} + +// validateWorkspaceReadiness validates workspace readiness +func validateWorkspaceReadiness(workspaceObj *kaitov1alpha1.Workspace) { + By("Checking the workspace status is ready", func() { + Eventually(func() bool { + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: workspaceObj.Name, + }, workspaceObj, &client.GetOptions{}) + + if err != nil { + return false + } + + _, conditionFound := lo.Find(workspaceObj.Status.Conditions, func(condition metav1.Condition) bool { + return condition.Type == string(kaitov1alpha1.WorkspaceConditionTypeSucceeded) && + condition.Status == metav1.ConditionTrue + }) + return conditionFound + }, 10*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for workspace to be ready") + }) +} + +func cleanupResources(workspaceObj *kaitov1alpha1.Workspace) { + By("Cleaning up resources", func() { + if !CurrentSpecReport().Failed() { + // delete workspace + err := deleteWorkspace(workspaceObj) + Expect(err).NotTo(HaveOccurred(), "Failed to delete workspace") + } else { + GinkgoWriter.Printf("test failed, keep %s \n", workspaceObj.Name) + } + }) +} + +func validateCreateNode(workspaceObj *kaitov1alpha1.Workspace, numOfNode int) { + if nodeProvisionerName == "azkarpenter" { + utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode) + } else { + utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode) + } +} + +// validateResourceStatus validates resource status +func validateResourceStatus(workspaceObj *kaitov1alpha1.Workspace) { + By("Checking the resource status", func() { + Eventually(func() bool { + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: workspaceObj.Name, + }, workspaceObj, &client.GetOptions{}) + + if err != nil { + return false + } + + _, conditionFound := lo.Find(workspaceObj.Status.Conditions, func(condition metav1.Condition) bool { + return condition.Type == string(kaitov1alpha1.ConditionTypeResourceStatus) && + condition.Status == metav1.ConditionTrue + }) + return conditionFound + }, 10*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for resource status to be ready") + }) +} + +func validateAssociatedService(workspaceObj *kaitov1alpha1.Workspace) { + serviceName := workspaceObj.Name + serviceNamespace := workspaceObj.Namespace + + By(fmt.Sprintf("Checking for service %s in namespace %s", serviceName, serviceNamespace), func() { + service := &v1.Service{} + + Eventually(func() bool { + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: serviceNamespace, + Name: serviceName, + }, service) + + if err != nil { + if errors.IsNotFound(err) { + GinkgoWriter.Printf("Service %s not found in namespace %s\n", serviceName, serviceNamespace) + } else { + GinkgoWriter.Printf("Error fetching service %s in namespace %s: %v\n", serviceName, serviceNamespace, err) + } + return false + } + + GinkgoWriter.Printf("Found service: %s in namespace %s\n", serviceName, serviceNamespace) + return true + }, 
10*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for service to be created") + }) +} + +// validateInferenceConfig validates that the inference config exists and contains data +func validateInferenceConfig(workspaceObj *kaitov1alpha1.Workspace) { + By("Checking the inference config exists", func() { + Eventually(func() bool { + configMap := &v1.ConfigMap{} + configName := kaitov1alpha1.DefaultInferenceConfigTemplate + if workspaceObj.Inference.Config != "" { + configName = workspaceObj.Inference.Config + } + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: configName, + }, configMap) + + if err != nil { + GinkgoWriter.Printf("Error fetching config: %v\n", err) + return false + } + + return len(configMap.Data) > 0 + }, 10*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for inference config to be ready") + }) +} + +// validateInferenceResource validates inference deployment +func validateInferenceResource(workspaceObj *kaitov1alpha1.Workspace, expectedReplicas int32, isStatefulSet bool) { + By("Checking the inference resource", func() { + Eventually(func() bool { + var err error + var readyReplicas int32 + + if isStatefulSet { + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: workspaceObj.Name, + Namespace: workspaceObj.Namespace, + }, + } + err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: workspaceObj.Name, + }, sts) + readyReplicas = sts.Status.ReadyReplicas + + } else { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: workspaceObj.Name, + Namespace: workspaceObj.Namespace, + }, + } + err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: workspaceObj.Name, + }, dep) + readyReplicas = dep.Status.ReadyReplicas + } + + if err != nil { + GinkgoWriter.Printf("Error fetching resource: %v\n", err) + return false + } + + if readyReplicas == expectedReplicas { + return true + } + + return false + }, 20*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for inference resource to be ready") + }) +} + +func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error { + By("Deleting workspace", func() { + Eventually(func() error { + // Check if the workspace exists + err := utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{ + Namespace: workspaceObj.Namespace, + Name: workspaceObj.Name, + }, workspaceObj) + + if errors.IsNotFound(err) { + GinkgoWriter.Printf("Workspace %s does not exist, no need to delete\n", workspaceObj.Name) + return nil + } + if err != nil { + return fmt.Errorf("error checking if workspace %s exists: %v", workspaceObj.Name, err) + } + + err = utils.TestingCluster.KubeClient.Delete(ctx, workspaceObj, &client.DeleteOptions{}) + if err != nil { + return fmt.Errorf("failed to delete workspace %s: %v", workspaceObj.Name, err) + } + return nil + }, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to delete workspace") + }) + + return nil +} diff --git a/test/rage2e/utils/cluster.go b/test/rage2e/utils/cluster.go new file mode 100644 index 000000000..555472e36 --- /dev/null +++ b/test/rage2e/utils/cluster.go @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package utils + +import ( + azurev1alpha2 "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + awsv1beta1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1" + kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/dynamic" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/kubernetes/test/e2e/framework" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/karpenter/pkg/apis/v1beta1" +) + +const ( + E2eNamespace = "kaito-e2e" +) + +var ( + scheme = runtime.NewScheme() + TestingCluster = NewCluster(scheme) +) + +// Cluster object defines the required clients of the test cluster. +type Cluster struct { + Scheme *runtime.Scheme + KubeClient client.Client + DynamicClient dynamic.Interface +} + +func NewCluster(scheme *runtime.Scheme) *Cluster { + return &Cluster{ + Scheme: scheme, + } +} + +// GetClusterClient returns a Cluster client for the cluster. +func GetClusterClient(cluster *Cluster) { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(kaitov1alpha1.AddToScheme(scheme)) + utilruntime.Must(v1alpha5.SchemeBuilder.AddToScheme(scheme)) + utilruntime.Must(v1beta1.SchemeBuilder.AddToScheme(scheme)) + utilruntime.Must(azurev1alpha2.SchemeBuilder.AddToScheme(scheme)) + utilruntime.Must(awsv1beta1.SchemeBuilder.AddToScheme(scheme)) + + restConfig := config.GetConfigOrDie() + + k8sClient, err := client.New(restConfig, client.Options{Scheme: cluster.Scheme}) + framework.ExpectNoError(err, "failed to create k8s client for e2e") + + gomega.Expect(err).Should(gomega.Succeed(), "Failed to set up Kube Client") + TestingCluster.KubeClient = k8sClient + + cluster.DynamicClient, err = dynamic.NewForConfig(restConfig) + gomega.Expect(err).Should(gomega.Succeed(), "Failed to set up Dynamic Client") + +} diff --git a/test/rage2e/utils/machine.go b/test/rage2e/utils/machine.go new file mode 100644 index 000000000..4d62fbe9a --- /dev/null +++ b/test/rage2e/utils/machine.go @@ -0,0 +1,61 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package utils + +import ( + "context" + "fmt" + "time" + + "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + "github.com/samber/lo" + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "knative.dev/pkg/apis" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ValidateMachineCreation Logic to validate machine creation +func ValidateMachineCreation(ctx context.Context, workspaceObj *v1alpha1.Workspace, expectedCount int) { + ginkgo.By("Checking machine created by the workspace CR", func() { + gomega.Eventually(func() bool { + machineList, err := getAllValidMachines(ctx, workspaceObj) + if err != nil { + fmt.Printf("Failed to get all valid machines: %v", err) + return false + } + + if len(machineList.Items) != expectedCount { + return false + } + + for _, machine := range machineList.Items { + _, conditionFound := lo.Find(machine.GetConditions(), func(condition apis.Condition) bool { + return condition.Type == apis.ConditionReady && condition.Status == v1.ConditionTrue + }) + if !conditionFound { + return false + } + } + return true + }, 20*time.Minute, PollInterval).Should(gomega.BeTrue(), "Failed to wait for machine to be ready") + }) +} + +func getAllValidMachines(ctx context.Context, workspaceObj *v1alpha1.Workspace) (*v1alpha5.MachineList, error) { + machineList := &v1alpha5.MachineList{} + ls := labels.Set{ + v1alpha1.LabelWorkspaceName: workspaceObj.Name, + v1alpha1.LabelWorkspaceNamespace: workspaceObj.Namespace, + } + + err := TestingCluster.KubeClient.List(ctx, machineList, &client.MatchingLabelsSelector{Selector: ls.AsSelector()}) + if err != nil { + return nil, err + } + return machineList, nil +} diff --git a/test/rage2e/utils/nodeclaim.go b/test/rage2e/utils/nodeclaim.go new file mode 100644 index 000000000..560f5fc27 --- /dev/null +++ b/test/rage2e/utils/nodeclaim.go @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package utils + +import ( + "context" + "fmt" + "time" + + "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "knative.dev/pkg/apis" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/karpenter/pkg/apis/v1beta1" +) + +// ValidateNodeClaimCreation Logic to validate the nodeClaim creation. +func ValidateNodeClaimCreation(ctx context.Context, workspaceObj *v1alpha1.Workspace, expectedCount int) { + ginkgo.By("Checking nodeClaim created by the workspace CR", func() { + gomega.Eventually(func() bool { + nodeClaimList, err := GetAllValidNodeClaims(ctx, workspaceObj) + if err != nil { + fmt.Printf("Failed to get all valid nodeClaim: %v", err) + return false + } + + if len(nodeClaimList.Items) != expectedCount { + return false + } + + for _, nodeClaim := range nodeClaimList.Items { + _, conditionFound := lo.Find(nodeClaim.GetConditions(), func(condition apis.Condition) bool { + return condition.Type == apis.ConditionReady && condition.Status == v1.ConditionTrue + }) + if !conditionFound { + return false + } + } + return true + }, 20*time.Minute, PollInterval).Should(gomega.BeTrue(), "Failed to wait for nodeClaim to be ready") + }) +} + +// GetAllValidNodeClaims get all valid nodeClaims. 
+func GetAllValidNodeClaims(ctx context.Context, workspaceObj *v1alpha1.Workspace) (*v1beta1.NodeClaimList, error) { + nodeClaimList := &v1beta1.NodeClaimList{} + ls := labels.Set{ + v1alpha1.LabelWorkspaceName: workspaceObj.Name, + v1alpha1.LabelWorkspaceNamespace: workspaceObj.Namespace, + } + + err := TestingCluster.KubeClient.List(ctx, nodeClaimList, &client.MatchingLabelsSelector{Selector: ls.AsSelector()}) + if err != nil { + return nil, err + } + return nodeClaimList, nil +} diff --git a/test/rage2e/utils/utils.go b/test/rage2e/utils/utils.go new file mode 100644 index 000000000..9dbbf7198 --- /dev/null +++ b/test/rage2e/utils/utils.go @@ -0,0 +1,480 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package utils + +import ( + "bytes" + "context" + "fmt" + "io" + "io/ioutil" + "log" + "math/rand" + "os" + "path/filepath" + "reflect" + "sort" + "strings" + "time" + + pkgscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/remotecommand" + + "github.com/kaito-project/kaito/api/v1alpha1" + kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/model" + "github.com/samber/lo" + "gopkg.in/yaml.v2" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +const ( + InferenceModeCustomTemplate kaitov1alpha1.ModelImageAccessMode = "customTemplate" + ExampleDatasetURL = "https://huggingface.co/datasets/philschmid/dolly-15k-oai-style/resolve/main/data/train-00000-of-00001-54e3756291ca09c6.parquet?download=true" +) + +var ( + // PollInterval defines the interval time for a poll operation. + PollInterval = 2 * time.Second + // PollTimeout defines the time after which the poll operation times out. + PollTimeout = 120 * time.Second +) + +func GetEnv(envVar string) string { + env := os.Getenv(envVar) + if env == "" { + fmt.Printf("%s is not set or is empty\n", envVar) + return "" + } + return env +} + +// GenerateRandomString generates a random number between 0 and 1000 and returns it as a string. 
+func GenerateRandomString() string { + rand.Seed(time.Now().UnixNano()) // Seed the random number generator + randomNumber := rand.Intn(1001) // Generate a random number between 0 and 1000 + return fmt.Sprintf("%d", randomNumber) +} + +func GetModelConfigInfo(configFilePath string) (map[string]interface{}, error) { + var data map[string]interface{} + + yamlData, err := ioutil.ReadFile(configFilePath) + if err != nil { + return nil, fmt.Errorf("error reading YAML file: %w", err) + } + + err = yaml.Unmarshal(yamlData, &data) + if err != nil { + return nil, fmt.Errorf("error unmarshalling YAML: %w", err) + } + + return data, nil +} + +func GetPodNameForJob(coreClient *kubernetes.Clientset, namespace, jobName string) (string, error) { + podList, err := coreClient.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{ + LabelSelector: fmt.Sprintf("job-name=%s", jobName), + }) + if err != nil { + return "", err + } + + if len(podList.Items) == 0 { + return "", fmt.Errorf("no pods found for job %s", jobName) + } + + return podList.Items[0].Name, nil +} + +func GetPodNameForDeployment(coreClient *kubernetes.Clientset, namespace, deploymentName string) (string, error) { + podList, err := coreClient.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{ + LabelSelector: fmt.Sprintf("kaito.sh/workspace=%s", deploymentName), + }) + if err != nil { + return "", err + } + + if len(podList.Items) == 0 { + return "", fmt.Errorf("no pods found for job %s", deploymentName) + } + + return podList.Items[0].Name, nil +} + +func GetK8sConfig() (*rest.Config, error) { + var config *rest.Config + var err error + + if os.Getenv("KUBERNETES_SERVICE_HOST") != "" && os.Getenv("KUBERNETES_SERVICE_PORT") != "" { + config, err = rest.InClusterConfig() + if err != nil { + log.Fatalf("Failed to get in-cluster config: %v", err) + } + } else { + // Use kubeconfig file for local development + kubeconfig := filepath.Join(os.Getenv("HOME"), ".kube", "config") + config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + log.Fatalf("Failed to load kubeconfig: %v", err) + } + } + + return config, err +} + +func GetK8sClientset() (*kubernetes.Clientset, error) { + config, err := GetK8sConfig() + if err != nil { + log.Fatalf("Failed to get k8s config: %v", err) + } + coreClient, err := kubernetes.NewForConfig(config) + if err != nil { + log.Fatalf("Failed to create core client: %v", err) + } + return coreClient, err +} + +func GetPodLogs(coreClient *kubernetes.Clientset, namespace, podName, containerName string) (string, error) { + options := &v1.PodLogOptions{} + if containerName != "" { + options.Container = containerName + } + + req := coreClient.CoreV1().Pods(namespace).GetLogs(podName, options) + logs, err := req.Stream(context.Background()) + if err != nil { + return "", err + } + defer logs.Close() + + buf := new(strings.Builder) + _, err = io.Copy(buf, logs) + if err != nil { + return "", err + } + + return buf.String(), nil +} + +func ExecSync(ctx context.Context, config *rest.Config, coreClient *kubernetes.Clientset, namespace, podName string, options v1.PodExecOptions) (string, error) { + req := coreClient.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(podName). + Namespace(namespace). 
+ SubResource("exec") + req.VersionedParams(&options, pkgscheme.ParameterCodec) + + exec, err := remotecommand.NewSPDYExecutor(config, "POST", req.URL()) + if err != nil { + return "", fmt.Errorf("failed to initialize SPDY executor: %w", err) + } + + var stdout, stderr bytes.Buffer + err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdout: &stdout, + Stderr: &stderr, + }) + if err != nil { + return "", fmt.Errorf("failed to execute command: %w, stderr: %q", err, stderr.String()) + } + + if stderr.Len() > 0 { + return "", fmt.Errorf("command error: %s", stderr.String()) + } + + return stdout.String(), nil +} + +func PrintPodLogsOnFailure(namespace, labelSelector string) { + coreClient, err := GetK8sClientset() + if err != nil { + log.Printf("Failed to create core client: %v", err) + } + pods, err := coreClient.CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + log.Printf("Failed to list pods: %v", err) + return + } + + for _, pod := range pods.Items { + for _, container := range pod.Spec.Containers { + logs, err := GetPodLogs(coreClient, namespace, pod.Name, container.Name) + if err != nil { + log.Printf("Failed to get logs from pod %s, container %s: %v", pod.Name, container.Name, err) + } else { + fmt.Printf("Logs from pod %s, container %s:\n%s\n", pod.Name, container.Name, string(logs)) + } + } + } +} + +func CopySecret(original *corev1.Secret, targetNamespace string) *corev1.Secret { + newSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: original.Name, + Namespace: targetNamespace, + }, + Data: original.Data, + Type: original.Type, + } + return newSecret +} + +func ExtractModelVersion(configs map[string]interface{}) (map[string]string, error) { + modelsInfo := make(map[string]string) + models, ok := configs["models"].([]interface{}) + if !ok { + return nil, fmt.Errorf("'models' key not found or is not a slice") + } + + for _, modelItem := range models { + model, ok := modelItem.(map[interface{}]interface{}) + if !ok { + return nil, fmt.Errorf("model item is not a map") + } + + modelName, ok := model["name"].(string) + if !ok { + return nil, fmt.Errorf("model name is not a string or not found") + } + + modelTag, ok := model["tag"].(string) // Using 'tag' as the version + if !ok { + return nil, fmt.Errorf("model version for %s is not a string or not found", modelName) + } + + modelsInfo[modelName] = modelTag + } + + return modelsInfo, nil +} + +func GenerateInferenceWorkspaceManifest(name, namespace, imageName string, resourceCount int, instanceType string, + labelSelector *metav1.LabelSelector, preferredNodes []string, presetName kaitov1alpha1.ModelName, + accessMode kaitov1alpha1.ModelImageAccessMode, imagePullSecret []string, + podTemplate *corev1.PodTemplateSpec, adapters []kaitov1alpha1.AdapterSpec) *kaitov1alpha1.Workspace { + + workspace := &kaitov1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers), + }, + }, + Resource: kaitov1alpha1.ResourceSpec{ + Count: lo.ToPtr(resourceCount), + InstanceType: instanceType, + LabelSelector: labelSelector, + PreferredNodes: preferredNodes, + }, + } + + var workspaceInference kaitov1alpha1.InferenceSpec + if accessMode == kaitov1alpha1.ModelImageAccessModePublic || + accessMode == kaitov1alpha1.ModelImageAccessModePrivate { + workspaceInference.Preset = &kaitov1alpha1.PresetSpec{ + 
PresetMeta: kaitov1alpha1.PresetMeta{ + Name: presetName, + AccessMode: accessMode, + }, + PresetOptions: kaitov1alpha1.PresetOptions{ + Image: imageName, + ImagePullSecrets: imagePullSecret, + }, + } + } + + if adapters != nil { + workspaceInference.Adapters = adapters + } + + if accessMode == InferenceModeCustomTemplate { + workspaceInference.Template = podTemplate + } + + workspace.Inference = &workspaceInference + + return workspace +} + +func GenerateInferenceWorkspaceManifestWithVLLM(name, namespace, imageName string, resourceCount int, instanceType string, + labelSelector *metav1.LabelSelector, preferredNodes []string, presetName kaitov1alpha1.ModelName, + accessMode kaitov1alpha1.ModelImageAccessMode, imagePullSecret []string, + podTemplate *corev1.PodTemplateSpec, adapters []kaitov1alpha1.AdapterSpec) *kaitov1alpha1.Workspace { + workspace := GenerateInferenceWorkspaceManifest(name, namespace, imageName, resourceCount, instanceType, + labelSelector, preferredNodes, presetName, accessMode, imagePullSecret, podTemplate, adapters) + + if workspace.Annotations == nil { + workspace.Annotations = make(map[string]string) + } + workspace.Annotations[kaitov1alpha1.AnnotationWorkspaceRuntime] = string(model.RuntimeNameVLLM) + return workspace +} + +func GenerateTuningWorkspaceManifest(name, namespace, imageName string, resourceCount int, instanceType string, + labelSelector *metav1.LabelSelector, preferredNodes []string, input *kaitov1alpha1.DataSource, + output *kaitov1alpha1.DataDestination, preset *kaitov1alpha1.PresetSpec, method kaitov1alpha1.TuningMethod) *kaitov1alpha1.Workspace { + + workspace := &kaitov1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Resource: kaitov1alpha1.ResourceSpec{ + Count: lo.ToPtr(resourceCount), + InstanceType: instanceType, + LabelSelector: labelSelector, + PreferredNodes: preferredNodes, + }, + Tuning: &kaitov1alpha1.TuningSpec{ + Method: method, + Input: input, + Output: output, + Preset: preset, + }, + } + + return workspace +} + +func GenerateE2ETuningWorkspaceManifest(name, namespace, imageName, datasetImageName, outputRegistry string, + resourceCount int, instanceType string, labelSelector *metav1.LabelSelector, + preferredNodes []string, presetName kaitov1alpha1.ModelName, accessMode kaitov1alpha1.ModelImageAccessMode, + imagePullSecret []string, customConfigMapName string) *kaitov1alpha1.Workspace { + workspace := &kaitov1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Resource: kaitov1alpha1.ResourceSpec{ + Count: lo.ToPtr(resourceCount), + InstanceType: instanceType, + LabelSelector: labelSelector, + PreferredNodes: preferredNodes, + }, + } + + var workspaceTuning kaitov1alpha1.TuningSpec + if accessMode == kaitov1alpha1.ModelImageAccessModePublic || + accessMode == kaitov1alpha1.ModelImageAccessModePrivate { + workspaceTuning.Preset = &kaitov1alpha1.PresetSpec{ + PresetMeta: kaitov1alpha1.PresetMeta{ + Name: presetName, + AccessMode: accessMode, + }, + PresetOptions: kaitov1alpha1.PresetOptions{ + Image: imageName, + ImagePullSecrets: imagePullSecret, + }, + } + } + + workspace.Tuning = &workspaceTuning + workspace.Tuning.Method = kaitov1alpha1.TuningMethodQLora + workspace.Tuning.Input = &kaitov1alpha1.DataSource{ + Image: datasetImageName, + } + workspace.Tuning.Output = &kaitov1alpha1.DataDestination{ + Image: outputRegistry, + ImagePushSecret: imagePullSecret[0], + } + + if customConfigMapName != "" { + workspace.Tuning.Config = 
customConfigMapName + } + + return workspace +} + +// GenerateE2ETuningConfigMapManifest generates a ConfigMap manifest for E2E tuning. +func GenerateE2ETuningConfigMapManifest(namespace string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "e2e-qlora-params-template", + Namespace: namespace, // Same as workspace namespace + }, + Data: map[string]string{ + "training_config.yaml": `training_config: + ModelConfig: + torch_dtype: "bfloat16" + local_files_only: true + device_map: "auto" + + QuantizationConfig: + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + + LoraConfig: + r: 8 + lora_alpha: 8 + lora_dropout: 0.0 + target_modules: ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"] + + TrainingArguments: + output_dir: "/mnt/results" + ddp_find_unused_parameters: false + save_strategy: "epoch" + per_device_train_batch_size: 1 + max_steps: 2 # Adding this line to limit training to 2 steps + + DataCollator: + mlm: true + + DatasetConfig: + shuffle_dataset: true + train_test_split: 1`, + }, + } +} + +func GeneratePodTemplate(name, namespace, image string, labels map[string]string) *corev1.PodTemplateSpec { + return &corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: labels, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: name, + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/sleep", "10000"}, + }, + }, + }, + } +} + +func CompareSecrets(refs []corev1.LocalObjectReference, secrets []string) bool { + if len(refs) != len(secrets) { + return false + } + + refSecrets := make([]string, len(refs)) + for i, ref := range refs { + refSecrets[i] = ref.Name + } + + sort.Strings(refSecrets) + sort.Strings(secrets) + + return reflect.DeepEqual(refSecrets, secrets) +}