Commit: add some preset test for vllm

Signed-off-by: jerryzhuang <[email protected]>
zhuangqh committed Nov 13, 2024
1 parent f3ef4c8 commit 5898b4d
Showing 12 changed files with 303 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .github/e2e-preset-configs.json
@@ -70,15 +70,15 @@
"name": "phi-3-mini-4k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "phi-3-mini-128k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
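For context, the e2e workflow builds its test matrix from this config file; the matrix step itself is not part of this diff. A rough sketch of how such a matrix could be derived with jq, assuming the models live in a top-level array and use the field names shown above (both are assumptions, not confirmed by this diff):

# Hypothetical sketch only; the real workflow's filter and the file's top-level structure are not shown here.
MATRIX=$(jq -c '[.[] | select(.OSS == true)]' .github/e2e-preset-configs.json)
echo "matrix={\"model\":${MATRIX}}" >> "$GITHUB_OUTPUT"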
32 changes: 26 additions & 6 deletions .github/workflows/e2e-preset-test.yml
@@ -15,12 +15,17 @@ on:
         type: boolean
         default: false
         description: "Test all Phi models for E2E"
+      test-on-vllm:
+        type: boolean
+        default: false
+        description: "Test on VLLM runtime"
 
 env:
   GO_VERSION: "1.22"
   BRANCH_NAME: ${{ github.head_ref || github.ref_name}}
   FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
   FORCE_RUN_ALL_PHI: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}
+  RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }}
 
 permissions:
   id-token: write
@@ -232,7 +237,7 @@ jobs:
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}_${{ env.RUNTIME }}.yaml
- name: Wait for Resource to be ready
run: |
@@ -243,14 +248,10 @@
         run: |
           POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
           kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
-      - name: Test home endpoint
-        run: |
-          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
       - name: Test healthz endpoint
         run: |
-          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
+          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/health
       - name: Test inference endpoint
         run: |
@@ -291,6 +292,25 @@
             }
           }' \
           http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
+        elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
+          echo "Testing inference for ${{ matrix.model.name }}"
+          curl -X POST \
+            -H "accept: application/json" \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "test",
+              "messages": [
+                {
+                  "role": "system",
+                  "content": "You are a helpful assistant."
+                },
+                {
+                  "role": "user",
+                  "content": "Hello!"
+                }
+              ]
+            }' \
+            http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/v1/chat/completions
         else
           echo "Testing inference for ${{ matrix.model.name }}"
           curl -X POST \
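Not part of the commit, but worth noting: the vLLM chat-completions call above only checks that curl itself succeeds. Since vLLM serves the OpenAI-compatible response schema, the reply could also be asserted on. A minimal sketch ($SERVICE_IP stands in for the step output used in the workflow; the jq assertion is an illustration, not what the workflow does):

# Hypothetical extra assertion: fail if no completion text comes back.
RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
  -d '{"model": "test", "messages": [{"role": "user", "content": "Hello!"}]}' \
  http://$SERVICE_IP:80/v1/chat/completions)
echo "$RESPONSE" | jq -e '.choices[0].message.content' >/dev/null || (echo "No completion returned" && exit 1)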
55 changes: 55 additions & 0 deletions presets/test/manifests/falcon-7b/falcon-7b_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: falcon-7b
spec:
replicas: 1
selector:
matchLabels:
app: falcon
template:
metadata:
labels:
app: falcon
spec:
containers:
- name: falcon-container
image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: falcon7b
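Outside CI, a manifest like this one can be smoke-tested by hand once the pod is ready. A minimal sketch, assuming kubectl access to the cluster; port 5000 and the endpoints come from the probes above and the workflow's checks:

# Forward the vLLM server port locally and hit the same endpoints the e2e test uses.
kubectl port-forward deployment/falcon-7b 5000:5000 &
sleep 5
curl -s http://localhost:5000/health
curl -s -X POST -H "Content-Type: application/json" \
  -d '{"model": "test", "messages": [{"role": "user", "content": "Hello!"}]}' \
  http://localhost:5000/v1/chat/completions

The --chat-template flag in the container command points the server at a bundled Jinja template, presumably because the Falcon instruct tokenizer does not ship a chat template of its own.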
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-medium-128k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-medium-128k-instruct
template:
metadata:
labels:
app: phi-3-medium-128k-instruct
spec:
containers:
- name: phi-3-medium-128k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3medium12
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-medium-4k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-medium-4k-instruct
template:
metadata:
labels:
app: phi-3-medium-4k-instruct
spec:
containers:
- name: phi-3-medium-4k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3medium4k
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-mini-128k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-mini-128k-instruct
template:
metadata:
labels:
app: phi-3-mini-128k-instruct
spec:
containers:
- name: phi-3-mini-128k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3mini128k
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-mini-4k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-mini-4k-instruct
template:
metadata:
labels:
app: phi-3-mini-4k-instruct
spec:
containers:
- name: phi-3-mini-4k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3mini4kin
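All of these manifests use REPO_HERE and TAG_HERE placeholders, which the workflow fills in with sed (see the run step earlier in this diff). The same substitution works locally; a minimal sketch with made-up registry and tag values:

# Example only: "myregistry" and "0.0.1" are placeholders, not real values from this repo.
MODEL=phi-3-mini-4k-instruct
sed -e "s/REPO_HERE/myregistry/g" -e "s/TAG_HERE/0.0.1/g" \
  presets/test/manifests/$MODEL/${MODEL}_vllm.yaml | kubectl apply -f -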
