Commit: add some preset test for vllm

Signed-off-by: jerryzhuang <[email protected]>
zhuangqh committed Nov 13, 2024
1 parent f3ef4c8 commit 5898b4d
Showing 12 changed files with 303 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .github/e2e-preset-configs.json
@@ -70,15 +70,15 @@
"name": "phi-3-mini-4k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "phi-3-mini-128k-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50,
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
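For context, the e2e workflow builds its test matrix from this config file; the matrix step itself is not part of this diff. A rough sketch of how such a matrix could be derived with jq, assuming the models live in a top-level array and use the field names shown above (both are assumptions, not confirmed by this diff):

# Hypothetical sketch only; the real workflow's filter and the file's top-level structure are not shown here.
MATRIX=$(jq -c '[.[] | select(.OSS == true)]' .github/e2e-preset-configs.json)
echo "matrix={\"model\":${MATRIX}}" >> "$GITHUB_OUTPUT"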
32 changes: 26 additions & 6 deletions .github/workflows/e2e-preset-test.yml
@@ -15,12 +15,17 @@ on:
         type: boolean
         default: false
         description: "Test all Phi models for E2E"
+      test-on-vllm:
+        type: boolean
+        default: false
+        description: "Test on VLLM runtime"
 
 env:
   GO_VERSION: "1.22"
   BRANCH_NAME: ${{ github.head_ref || github.ref_name}}
   FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
   FORCE_RUN_ALL_PHI: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}
+  RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }}
 
 permissions:
   id-token: write
@@ -232,7 +237,7 @@ jobs:
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}_${{ env.RUNTIME }}.yaml
- name: Wait for Resource to be ready
run: |
@@ -243,14 +248,10 @@
         run: |
           POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
           kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
-      - name: Test home endpoint
-        run: |
-          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
       - name: Test healthz endpoint
         run: |
-          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
+          curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/health
       - name: Test inference endpoint
         run: |
@@ -291,6 +292,25 @@
             }
           }' \
           http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
+        elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
+          echo "Testing inference for ${{ matrix.model.name }}"
+          curl -X POST \
+            -H "accept: application/json" \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "test",
+              "messages": [
+                {
+                  "role": "system",
+                  "content": "You are a helpful assistant."
+                },
+                {
+                  "role": "user",
+                  "content": "Hello!"
+                }
+              ]
+            }' \
+            http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/v1/chat/completions
         else
           echo "Testing inference for ${{ matrix.model.name }}"
           curl -X POST \
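Not part of the commit, but worth noting: the vLLM chat-completions call above only checks that curl itself succeeds. Since vLLM serves the OpenAI-compatible response schema, the reply could also be asserted on. A minimal sketch ($SERVICE_IP stands in for the step output used in the workflow; the jq assertion is an illustration, not what the workflow does):

# Hypothetical extra assertion: fail if no completion text comes back.
RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" \
  -d '{"model": "test", "messages": [{"role": "user", "content": "Hello!"}]}' \
  http://$SERVICE_IP:80/v1/chat/completions)
echo "$RESPONSE" | jq -e '.choices[0].message.content' >/dev/null || (echo "No completion returned" && exit 1)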
55 changes: 55 additions & 0 deletions presets/test/manifests/falcon-7b/falcon-7b_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: falcon-7b
spec:
replicas: 1
selector:
matchLabels:
app: falcon
template:
metadata:
labels:
app: falcon
spec:
containers:
- name: falcon-container
image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: falcon7b
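Outside CI, a manifest like this one can be smoke-tested by hand once the pod is ready. A minimal sketch, assuming kubectl access to the cluster; port 5000 and the endpoints come from the probes above and the workflow's checks:

# Forward the vLLM server port locally and hit the same endpoints the e2e test uses.
kubectl port-forward deployment/falcon-7b 5000:5000 &
sleep 5
curl -s http://localhost:5000/health
curl -s -X POST -H "Content-Type: application/json" \
  -d '{"model": "test", "messages": [{"role": "user", "content": "Hello!"}]}' \
  http://localhost:5000/v1/chat/completions

The --chat-template flag in the container command points the server at a bundled Jinja template, presumably because the Falcon instruct tokenizer does not ship a chat template of its own.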
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-medium-128k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-medium-128k-instruct
template:
metadata:
labels:
app: phi-3-medium-128k-instruct
spec:
containers:
- name: phi-3-medium-128k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3medium12
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-medium-4k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-medium-4k-instruct
template:
metadata:
labels:
app: phi-3-medium-4k-instruct
spec:
containers:
- name: phi-3-medium-4k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3medium4k
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-mini-128k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-mini-128k-instruct
template:
metadata:
labels:
app: phi-3-mini-128k-instruct
spec:
containers:
- name: phi-3-mini-128k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3mini128k
55 changes: 55 additions & 0 deletions presets/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-mini-4k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-mini-4k-instruct
template:
metadata:
labels:
app: phi-3-mini-4k-instruct
spec:
containers:
- name: phi-3-mini-4k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3mini4kin
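All of these manifests use REPO_HERE and TAG_HERE placeholders, which the workflow fills in with sed (see the run step earlier in this diff). The same substitution works locally; a minimal sketch with made-up registry and tag values:

# Example only: "myregistry" and "0.0.1" are placeholders, not real values from this repo.
MODEL=phi-3-mini-4k-instruct
sed -e "s/REPO_HERE/myregistry/g" -e "s/TAG_HERE/0.0.1/g" \
  presets/test/manifests/$MODEL/${MODEL}_vllm.yaml | kubectl apply -f -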
