feat: implement inference server by using vllm (#624)
**Reason for Change**:
Implement an inference server preset backed by vLLM, run the Python test suites in isolated virtualenvs via a new hack/run-pytest-in-venv.sh helper, and rename the health endpoint from /healthz to /health.

**Requirements**

- [x] added unit tests and e2e tests (if applicable).

**Issue Fixed**:

**Notes for Reviewers**:

---------

Signed-off-by: zhuangqh <[email protected]>
Signed-off-by: jerryzhuang <[email protected]>
zhuangqh authored Oct 24, 2024
1 parent 8906190 commit 1d09da0
Showing 12 changed files with 2,369 additions and 16 deletions.
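Note: the new vLLM server code under presets/inference/vllm is referenced by the Makefile changes below, but its diff did not load in this view. Purely as a hypothetical sketch (not the actual Kaito implementation), a minimal vLLM-backed FastAPI server exposing the /health contract used throughout this commit could look like:

from fastapi import FastAPI
from pydantic import BaseModel
from vllm import LLM, SamplingParams

app = FastAPI()
# Placeholder model name; Kaito's presets wire in their own weights.
llm = LLM(model="facebook/opt-125m")

class CompletionRequest(BaseModel):
    prompt: str
    max_tokens: int = 64
    temperature: float = 0.8

@app.get("/health")
def health_check():
    # Matches the {"status": "Healthy"} contract asserted in the tests below.
    return {"status": "Healthy"}

@app.post("/v1/completions")
def complete(req: CompletionRequest):
    params = SamplingParams(temperature=req.temperature, max_tokens=req.max_tokens)
    outputs = llm.generate([req.prompt], params)
    return {"text": outputs[0].outputs[0].text}

vLLM batches and schedules generation internally, which is what makes a single shared LLM instance behind a web framework viable.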
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -50,7 +50,7 @@ jobs:
       - name: Run inference api e2e tests
         run: |
-          make inference-api-e2e
+          DEVICE=cpu make inference-api-e2e
       - name: Upload Codecov report
         uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673 # v4.5.0
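Setting DEVICE=cpu lets the e2e suite run on GPU-less CI runners. How the tests consume the variable is not shown in this diff; an assumed pattern on the Python side would be:

import os

import torch

# Assumed pattern (not shown in this diff): honor the DEVICE env var set by CI,
# defaulting to CUDA only when a GPU is actually available.
device = torch.device(os.environ.get("DEVICE", "cuda" if torch.cuda.is_available() else "cpu"))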
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@
 *.dylib
 bin/*
 Dockerfile.cross
+__pycache__/
 
 # Test binary, build with `go test -c`
 *.test
21 changes: 11 additions & 10 deletions Makefile
@@ -97,24 +97,25 @@ unit-test: ## Run unit tests.
 	-race -coverprofile=coverage.txt -covermode=atomic
 	go tool cover -func=coverage.txt
 
+.PHONY: virtualenv
+virtualenv:
+	pip install virtualenv
+
 .PHONY: rag-service-test
-rag-service-test:
-	pip install -r ragengine/requirements.txt
-	pytest -o log_cli=true -o log_cli_level=INFO ragengine/tests
+rag-service-test: virtualenv
+	./hack/run-pytest-in-venv.sh ragengine/tests ragengine/requirements.txt
 
 .PHONY: tuning-metrics-server-test
-tuning-metrics-server-test:
-	pip install -r presets/inference/text-generation/requirements.txt
-	pytest -o log_cli=true -o log_cli_level=INFO presets/tuning/text-generation/metrics
+tuning-metrics-server-test: virtualenv
+	./hack/run-pytest-in-venv.sh presets/tuning/text-generation/metrics presets/tuning/text-generation/requirements.txt
 
 ## --------------------------------------
 ## E2E tests
 ## --------------------------------------
 
 .PHONY: inference-api-e2e
-inference-api-e2e:
-	pip install -r presets/inference/text-generation/requirements.txt
-	pytest -o log_cli=true -o log_cli_level=INFO presets/inference/text-generation/tests
+inference-api-e2e: virtualenv
+	./hack/run-pytest-in-venv.sh presets/inference/vllm presets/inference/vllm/requirements.txt
+	./hack/run-pytest-in-venv.sh presets/inference/text-generation presets/inference/text-generation/requirements.txt
 
 # Ginkgo configurations
 GINKGO_FOCUS ?=
38 changes: 38 additions & 0 deletions hack/run-pytest-in-venv.sh
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Create a throwaway virtualenv, install the given requirements into it,
# and run pytest against the given test directory (both paths are relative
# to the repository root). Usage:
#   ./hack/run-pytest-in-venv.sh <test_dir> <requirements.txt>

set -ex

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <test_dir> <requirements.txt>"
  exit 1
fi

PROJECT_DIR=$(dirname "$(dirname "$(realpath "$0")")")

TEST_DIR="$PROJECT_DIR/$1"
REQUIREMENTS="$PROJECT_DIR/$2"
VENV_DIR=$(mktemp -d)

cleanup() {
  rm -rf "$VENV_DIR"
}
trap cleanup EXIT

cd "$VENV_DIR"
printf "Creating virtual environment in %s\n" "$VENV_DIR"
python3 -m virtualenv venv
# Under `set -e` a failing command exits the script immediately, so testing
# $? afterwards is dead code; guard the commands with `if !` instead.
if ! source "$VENV_DIR/venv/bin/activate"; then
  printf "Failed to activate virtual environment\n"
  exit 1
fi

printf "Installing requirements from %s\n" "$REQUIREMENTS"
# Capture stderr as well, so the log shown on failure is complete.
if ! pip install -r "$REQUIREMENTS" > "$VENV_DIR/pip.log" 2>&1; then
  cat "$VENV_DIR/pip.log"
  exit 1
fi

printf "Running tests in %s\n" "$TEST_DIR"
pytest -o log_cli=true -o log_cli_level=INFO "$TEST_DIR"
2 changes: 1 addition & 1 deletion presets/inference/llama2-chat/inference_api.py
@@ -192,7 +192,7 @@ def get_metrics():
         return {"error": str(e)}
 
 def setup_worker_routes():
-    @app_worker.get("/healthz")
+    @app_worker.get("/health")
     def health_check():
         if not torch.cuda.is_available():
             raise HTTPException(status_code=500, detail="No GPU available")
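Because this worker route returns 500 when CUDA is unavailable, exercising that branch in tests requires stubbing GPU detection. A hypothetical sketch (the llama2-chat tests are not part of this diff, and the configured_app fixture name is borrowed from the text-generation tests further down):

import torch
from fastapi.testclient import TestClient

def test_worker_health_without_gpu(configured_app, monkeypatch):
    # Hypothetical test: force the no-GPU branch by stubbing CUDA detection.
    monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
    client = TestClient(configured_app)
    assert client.get("/health").status_code == 500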
2 changes: 1 addition & 1 deletion presets/inference/text-generation/api_spec.json
@@ -24,7 +24,7 @@
       }
     }
   },
-  "/healthz": {
+  "/health": {
     "get": {
       "summary": "Health Check Endpoint",
       "operationId": "health_check_health_get",
4 changes: 2 additions & 2 deletions presets/inference/text-generation/inference_api.py
@@ -181,7 +181,7 @@ def home():
 class HealthStatus(BaseModel):
     status: str = Field(..., example="Healthy")
 @app.get(
-    "/healthz",
+    "/health",
     response_model=HealthStatus,
     summary="Health Check Endpoint",
     responses={
@@ -461,7 +461,7 @@ def get_metrics():
     if torch.cuda.is_available():
         gpus = GPUtil.getGPUs()
         gpu_info = [GPUInfo(
-            id=gpu.id,
+            id=str(gpu.id),
             name=gpu.name,
             load=f"{gpu.load * 100:.2f}%",
             temperature=f"{gpu.temperature} C",
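The id=str(gpu.id) cast implies the GPUInfo response model declares id as a string, while GPUtil reports GPU ids as integers. Assuming a Pydantic model along these lines (its definition is outside this hunk):

from pydantic import BaseModel

# Assumed shape of the response model; only the diff's usage is certain.
class GPUInfo(BaseModel):
    id: str           # GPUtil returns an int index, hence the str() cast above
    name: str
    load: str         # formatted as a percentage string, e.g. "42.00%"
    temperature: str  # formatted with units, e.g. "65.0 C"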
@@ -108,7 +108,7 @@ def test_read_main(configured_app):
 
 def test_health_check(configured_app):
     client = TestClient(configured_app)
-    response = client.get("/healthz")
+    response = client.get("/health")
     assert response.status_code == 200
     assert response.json() == {"status": "Healthy"}
(Diffs for the remaining changed files did not load in this view.)