From ecf78c10771eb4afd594e2eb6916d8608a05a6a0 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 15:39:51 -0800 Subject: [PATCH 1/7] nit: allow any runner --- .github/workflows/e2e-preset-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 8f2e0463e..8736d51e4 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -54,7 +54,7 @@ jobs: e2e-preset-tests: if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' needs: determine-models - runs-on: [self-hosted, 'username:runner-2','username:runner-3'] + runs-on: [self-hosted] strategy: fail-fast: false matrix: From b56d8ee8fa406e355292626709cf25ab74348863 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 15:41:20 -0800 Subject: [PATCH 2/7] nit: allow any runner --- .github/workflows/preset-image-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index 33b882011..7047e7877 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -63,7 +63,7 @@ jobs: build-models: needs: determine-models if: needs.determine-models.outputs.is_matrix_empty == 'false' - runs-on: [self-hosted, 'username:runner-2', 'username:runner-3'] + runs-on: [self-hosted] strategy: fail-fast: false matrix: From e288feabb315f6c933062226c4b1b470a323c956 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 17:08:01 -0800 Subject: [PATCH 3/7] feat: add new runners --- .github/workflows/e2e-preset-test.yml | 2 +- .github/workflows/preset-image-build.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 8736d51e4..7083562e8 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -54,7 +54,7 @@ jobs: e2e-preset-tests: if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' needs: determine-models - runs-on: [self-hosted] + runs-on: [self-hosted, 'hostname:aks-agentpool-38555668-vmss000007'] strategy: fail-fast: false matrix: diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index 7047e7877..ee08b7b6f 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -63,7 +63,7 @@ jobs: build-models: needs: determine-models if: needs.determine-models.outputs.is_matrix_empty == 'false' - runs-on: [self-hosted] + runs-on: [self-hosted, 'hostname:model-server'] strategy: fail-fast: false matrix: From 58cda26830f66976dca627d1f77ee22e86f5ec0f Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 18:17:06 -0800 Subject: [PATCH 4/7] feat: update login --- .github/workflows/e2e-preset-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 7083562e8..752ed9a3a 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -82,7 +82,7 @@ jobs: - name: 'Az CLI login' uses: azure/login@v1.6.0 with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} + client-id: ${{ secrets.AZURE_KDM_PRESET_SELF_RUNNER_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} allow-no-subscriptions: true From dc846b611d9c97da079a9b03e2304d98ebc6e5f5 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 18:33:16 -0800 Subject: [PATCH 5/7] nit: add env --- .github/workflows/e2e-preset-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 752ed9a3a..ea879f000 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -55,6 +55,7 @@ jobs: if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' needs: determine-models runs-on: [self-hosted, 'hostname:aks-agentpool-38555668-vmss000007'] + environment: e2e-test strategy: fail-fast: false matrix: From 324edfd122f9e09c46a4b698ccac6ee686647487 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 18:42:31 -0800 Subject: [PATCH 6/7] fix: attempt running e2e from git runner --- .github/workflows/e2e-preset-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index ea879f000..2446c72eb 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -54,7 +54,7 @@ jobs: e2e-preset-tests: if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' needs: determine-models - runs-on: [self-hosted, 'hostname:aks-agentpool-38555668-vmss000007'] + runs-on: ubuntu-latest # [self-hosted, 'hostname:aks-agentpool-38555668-vmss000007'] environment: e2e-test strategy: fail-fast: false @@ -83,7 +83,7 @@ jobs: - name: 'Az CLI login' uses: azure/login@v1.6.0 with: - client-id: ${{ secrets.AZURE_KDM_PRESET_SELF_RUNNER_CLIENT_ID }} + client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} allow-no-subscriptions: true From fa86bcc362242a6c69cc2b45ee35bfe4786dc5a0 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Mon, 29 Jan 2024 18:48:44 -0800 Subject: [PATCH 7/7] fix: matrix access bug --- .github/workflows/e2e-preset-test.yml | 60 ++++++++++++++------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 2446c72eb..28a728e5a 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -50,7 +50,11 @@ jobs: ') echo "matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT - + + - name: Print Combined Matrix + run: | + echo "Combined Matrix:" + echo '${{ steps.images.outputs.matrix }}' e2e-preset-tests: if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' needs: determine-models @@ -94,8 +98,8 @@ jobs: id: check_test_image run: | ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} - IMAGE_NAME=${{ matrix.name }} - TAG=${{ matrix.tag }} + IMAGE_NAME=${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv) @@ -110,8 +114,8 @@ jobs: id: check_prod_image run: | ACR_NAME=${{ secrets.ACR_AMR_USERNAME }} - IMAGE_NAME=${{ matrix.name }} - TAG=${{ matrix.tag }} + IMAGE_NAME=${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv) @@ -131,7 +135,7 @@ jobs: if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' id: get_nodepool_name run: | - NAME_SUFFIX=${{ matrix.name }} + NAME_SUFFIX=${{ matrix.model.name }} NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then @@ -156,9 +160,9 @@ jobs: --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ - --node-count ${{ matrix.node-count }} \ - --node-vm-size ${{ matrix.node-vm-size }} \ - --node-osdisk-size ${{ matrix.node-osdisk-size }} \ + --node-count ${{ matrix.model.node-count }} \ + --node-vm-size ${{ matrix.model.node-vm-size }} \ + --node-osdisk-size ${{ matrix.model.node-osdisk-size }} \ --labels pool=${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ --node-taints sku=gpu:NoSchedule \ --aks-custom-headers UseGPUDedicatedVHD=true @@ -179,14 +183,14 @@ jobs: - name: Create Service if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' - run: kubectl apply -f presets/test/manifests/${{ matrix.name }}/${{ matrix.name }}-service.yaml + run: kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml - name: Retrieve External Service IP if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' id: get_ip run: | while [[ -z $SERVICE_IP ]]; do - SERVICE_IP=$(kubectl get svc ${{ matrix.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') + SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') sleep 5 done echo "Service IP is $SERVICE_IP" @@ -195,15 +199,15 @@ jobs: - name: Replace IP and Deploy Statefulset to K8s if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' run: | - sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.name }}/${{ matrix.name }}-statefulset.yaml - sed -i "s/TAG_HERE/${{ matrix.tag }}/g" presets/test/manifests/${{ matrix.name }}/${{ matrix.name }}-statefulset.yaml - sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.name }}/${{ matrix.name }}-statefulset.yaml - kubectl apply -f presets/test/manifests/${{ matrix.name }}/${{ matrix.name }}-statefulset.yaml + sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml + sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml + sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml + kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml - name: Wait for Statefulset to be ready if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' run: | - kubectl rollout status statefulset/${{ matrix.name }} + kubectl rollout status statefulset/${{ matrix.model.name }} - name: Test home endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' @@ -218,8 +222,8 @@ jobs: - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' run: | - if [[ "${{ matrix.name }}" == *"llama"* && "${{ matrix.name }}" == *"-chat"* ]]; then - echo "Testing inference for ${{ matrix.name }}" + if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then + echo "Testing inference for ${{ matrix.model.name }}" curl -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -239,8 +243,8 @@ jobs: } }' \ http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat - elif [[ "${{ matrix.name }}" == *"llama"* ]]; then - echo "Testing inference for ${{ matrix.name }}" + elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then + echo "Testing inference for ${{ matrix.model.name }}" curl -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -255,8 +259,8 @@ jobs: } }' \ http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate - elif [[ "${{ matrix.name }}" == *"falcon"* ]]; then - echo "Testing inference for ${{ matrix.name }}" + elif [[ "${{ matrix.model.name }}" == *"falcon"* ]]; then + echo "Testing inference for ${{ matrix.model.name }}" curl -X POST \ -H "accept: application/json" \ -H "Content-Type: application/json" \ @@ -274,8 +278,8 @@ jobs: TEST_ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} PROD_ACR_NAME=${{ secrets.ACR_AMR_USERNAME }} - IMAGE_NAME=${{ matrix.name }} - TAG=${{ matrix.tag }} + IMAGE_NAME=${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} # Formulate the source image reference SOURCE_IMAGE="$TEST_ACR_NAME.azurecr.io/$IMAGE_NAME:$TAG" @@ -287,13 +291,13 @@ jobs: if: always() run: | # Check and Delete K8s Service if it exists - if kubectl get svc ${{ matrix.name }} > /dev/null 2>&1; then - kubectl delete svc ${{ matrix.name }} + if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then + kubectl delete svc ${{ matrix.model.name }} fi # Check and Delete K8s StatefulSet if it exists - if kubectl get statefulset ${{ matrix.name }} > /dev/null 2>&1; then - kubectl delete statefulset ${{ matrix.name }} + if kubectl get statefulset ${{ matrix.model.name }} > /dev/null 2>&1; then + kubectl delete statefulset ${{ matrix.model.name }} fi # Check and Delete AKS Nodepool if it exists