diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5015d742f..4e0044db1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,7 +50,7 @@ jobs: - name: Run inference api e2e tests run: | - make inference-api-e2e + DEVICE=cpu make inference-api-e2e - name: Upload Codecov report uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673 # v4.5.0 diff --git a/.gitignore b/.gitignore index 055cbc388..501d73b21 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.dylib bin/* Dockerfile.cross +__pycache__/ # Test binary, build with `go test -c` *.test diff --git a/Makefile b/Makefile index 699e557b5..08ca8fcb0 100644 --- a/Makefile +++ b/Makefile @@ -97,24 +97,25 @@ unit-test: ## Run unit tests. -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt +.PHONY: virtualenv +virtualenv: + pip install virtualenv + .PHONY: rag-service-test -rag-service-test: - pip install -r ragengine/requirements.txt - pytest -o log_cli=true -o log_cli_level=INFO ragengine/tests +rag-service-test: virtualenv + ./hack/run-pytest-in-venv.sh ragengine/tests ragengine/requirements.txt .PHONY: tuning-metrics-server-test -tuning-metrics-server-test: - pip install -r presets/inference/text-generation/requirements.txt - pytest -o log_cli=true -o log_cli_level=INFO presets/tuning/text-generation/metrics +tuning-metrics-server-test: virtualenv + ./hack/run-pytest-in-venv.sh presets/tuning/text-generation/metrics presets/tuning/text-generation/requirements.txt ## -------------------------------------- ## E2E tests ## -------------------------------------- -.PHONY: inference-api-e2e -inference-api-e2e: - pip install -r presets/inference/text-generation/requirements.txt - pytest -o log_cli=true -o log_cli_level=INFO presets/inference/text-generation/tests +inference-api-e2e: virtualenv + ./hack/run-pytest-in-venv.sh presets/inference/vllm presets/inference/vllm/requirements.txt + ./hack/run-pytest-in-venv.sh 
presets/inference/text-generation presets/inference/text-generation/requirements.txt # Ginkgo configurations GINKGO_FOCUS ?= diff --git a/hack/run-pytest-in-venv.sh b/hack/run-pytest-in-venv.sh new file mode 100755 index 000000000..3e73b5cfd --- /dev/null +++ b/hack/run-pytest-in-venv.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <test_dir> <requirements_file>" + exit 1 +fi + +PROJECT_DIR=$(dirname "$(dirname "$(realpath "$0")")") + +TEST_DIR="$PROJECT_DIR/$1" +REQUIREMENTS="$PROJECT_DIR/$2" +VENV_DIR=$(mktemp -d) + +cleanup() { + rm -rf "$VENV_DIR" +} +trap cleanup EXIT + +cd $VENV_DIR +printf "Creating virtual environment in %s\n" "$VENV_DIR" +python3 -m virtualenv venv +source "$VENV_DIR/venv/bin/activate" +if [ "$?" -ne 0 ]; then + printf "Failed to activate virtual environment\n" + exit 1 +fi + +printf "Installing requirements from %s\n" "$REQUIREMENTS" +pip install -r "$REQUIREMENTS" > "$VENV_DIR/pip.log" +if [ "$?" -ne 0 ]; then + cat "$VENV_DIR/pip.log" + exit 1 +fi + +printf "Running tests in %s\n" "$TEST_DIR" +pytest -o log_cli=true -o log_cli_level=INFO "$TEST_DIR" diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index 11776bf3d..b5eb11d3a 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -192,7 +192,7 @@ def get_metrics(): return {"error": str(e)} def setup_worker_routes(): - @app_worker.get("/healthz") + @app_worker.get("/health") def health_check(): if not torch.cuda.is_available(): raise HTTPException(status_code=500, detail="No GPU available") diff --git a/presets/inference/text-generation/api_spec.json b/presets/inference/text-generation/api_spec.json index 480fa97e4..13d64722a 100644 --- a/presets/inference/text-generation/api_spec.json +++ b/presets/inference/text-generation/api_spec.json @@ -24,7 +24,7 @@ } } }, - "/healthz": { + "/health": { "get": { "summary": "Health Check Endpoint", "operationId": 
"health_check_healthz_get", diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index b9381e220..e9aca92e4 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -181,7 +181,7 @@ def home(): class HealthStatus(BaseModel): status: str = Field(..., example="Healthy") @app.get( - "/healthz", + "/health", response_model=HealthStatus, summary="Health Check Endpoint", responses={ @@ -461,7 +461,7 @@ def get_metrics(): if torch.cuda.is_available(): gpus = GPUtil.getGPUs() gpu_info = [GPUInfo( - id=gpu.id, + id=str(gpu.id), name=gpu.name, load=f"{gpu.load * 100:.2f}%", temperature=f"{gpu.temperature} C", diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index 667f5eab7..baedbb832 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -108,7 +108,7 @@ def test_read_main(configured_app): def test_health_check(configured_app): client = TestClient(configured_app) - response = client.get("/healthz") + response = client.get("/health") assert response.status_code == 200 assert response.json() == {"status": "Healthy"} diff --git a/presets/inference/vllm/api_spec.json b/presets/inference/vllm/api_spec.json new file mode 100644 index 000000000..a3ffa492b --- /dev/null +++ b/presets/inference/vllm/api_spec.json @@ -0,0 +1,2130 @@ +{ + "openapi": "3.1.0", + "info": { + "title": "FastAPI", + "version": "0.1.0" + }, + "paths": { + "/health": { + "get": { + "summary": "Health", + "description": "Health check.", + "operationId": "health_health_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/tokenize": { + "post": { + "summary": "Tokenize", + "operationId": 
"tokenize_tokenize_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/TokenizeCompletionRequest" + }, + { + "$ref": "#/components/schemas/TokenizeChatRequest" + } + ], + "title": "Request" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/detokenize": { + "post": { + "summary": "Detokenize", + "operationId": "detokenize_detokenize_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DetokenizeRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/models": { + "get": { + "summary": "Show Available Models", + "operationId": "show_available_models_v1_models_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/version": { + "get": { + "summary": "Show Version", + "operationId": "show_version_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/v1/chat/completions": { + "post": { + "summary": "Create Chat Completion", + "operationId": "create_chat_completion_v1_chat_completions_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatCompletionRequest" + } 
+ } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/completions": { + "post": { + "summary": "Create Completion", + "operationId": "create_completion_v1_completions_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CompletionRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/embeddings": { + "post": { + "summary": "Create Embedding", + "operationId": "create_embedding_v1_embeddings_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EmbeddingRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AudioURL": { + "properties": { + "url": { + "type": "string", + "title": "Url" + } + }, + "type": "object", + "required": [ + "url" + ], + "title": "AudioURL" + }, + "BaseModel": { + "properties": {}, + "type": "object", + "title": "BaseModel" + }, + "ChatCompletionAssistantMessageParam": { + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ], + "const": "assistant", + 
"title": "Role" + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartRefusalParam" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "function_call": { + "anyOf": [ + { + "$ref": "#/components/schemas/FunctionCall" + }, + { + "type": "null" + } + ] + }, + "name": { + "type": "string", + "title": "Name" + }, + "refusal": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Refusal" + }, + "tool_calls": { + "items": { + "$ref": "#/components/schemas/ChatCompletionMessageToolCallParam" + }, + "type": "array", + "title": "Tool Calls" + } + }, + "type": "object", + "required": [ + "role" + ], + "title": "ChatCompletionAssistantMessageParam" + }, + "ChatCompletionContentPartAudioParam": { + "properties": { + "audio_url": { + "$ref": "#/components/schemas/AudioURL" + }, + "type": { + "type": "string", + "enum": [ + "audio_url" + ], + "const": "audio_url", + "title": "Type" + } + }, + "type": "object", + "required": [ + "audio_url", + "type" + ], + "title": "ChatCompletionContentPartAudioParam" + }, + "ChatCompletionContentPartImageParam": { + "properties": { + "image_url": { + "$ref": "#/components/schemas/ImageURL" + }, + "type": { + "type": "string", + "enum": [ + "image_url" + ], + "const": "image_url", + "title": "Type" + } + }, + "type": "object", + "required": [ + "image_url", + "type" + ], + "title": "ChatCompletionContentPartImageParam" + }, + "ChatCompletionContentPartRefusalParam": { + "properties": { + "refusal": { + "type": "string", + "title": "Refusal" + }, + "type": { + "type": "string", + "enum": [ + "refusal" + ], + "const": "refusal", + "title": "Type" + } + }, + "type": "object", + "required": [ + "refusal", + "type" + ], + "title": "ChatCompletionContentPartRefusalParam" + }, + 
"ChatCompletionContentPartTextParam": { + "properties": { + "text": { + "type": "string", + "title": "Text" + }, + "type": { + "type": "string", + "enum": [ + "text" + ], + "const": "text", + "title": "Type" + } + }, + "type": "object", + "required": [ + "text", + "type" + ], + "title": "ChatCompletionContentPartTextParam" + }, + "ChatCompletionFunctionMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "name": { + "type": "string", + "title": "Name" + }, + "role": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Role" + } + }, + "type": "object", + "required": [ + "content", + "name", + "role" + ], + "title": "ChatCompletionFunctionMessageParam" + }, + "ChatCompletionMessageToolCallParam": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "function": { + "$ref": "#/components/schemas/Function" + }, + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type" + } + }, + "type": "object", + "required": [ + "id", + "function", + "type" + ], + "title": "ChatCompletionMessageToolCallParam" + }, + "ChatCompletionNamedFunction": { + "properties": { + "name": { + "type": "string", + "title": "Name" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "ChatCompletionNamedFunction" + }, + "ChatCompletionNamedToolChoiceParam": { + "properties": { + "function": { + "$ref": "#/components/schemas/ChatCompletionNamedFunction" + }, + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type", + "default": "function" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "function" + ], + "title": "ChatCompletionNamedToolChoiceParam" + }, + "ChatCompletionRequest": { + "properties": { + "messages": { + "items": { + "anyOf": [ + { + "$ref": 
"#/components/schemas/ChatCompletionSystemMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionUserMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionToolMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionFunctionMessageParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionMessageParam" + } + ] + }, + "type": "array", + "title": "Messages" + }, + "model": { + "type": "string", + "title": "Model" + }, + "frequency_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "default": 0.0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Logit Bias" + }, + "logprobs": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Logprobs", + "default": false + }, + "top_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Top Logprobs", + "default": 0 + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens" + }, + "n": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "N", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "default": 0.0 + }, + "response_format": { + "anyOf": [ + { + "$ref": "#/components/schemas/ResponseFormat" + }, + { + "type": "null" + } + ] + }, + "seed": { + "anyOf": [ + { + "type": "integer", + "maximum": 9.223372036854776e+18, + "minimum": -9.223372036854776e+18 + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "stop": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop" + 
}, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "default": false + }, + "stream_options": { + "anyOf": [ + { + "$ref": "#/components/schemas/StreamOptions" + }, + { + "type": "null" + } + ] + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "default": 0.7 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "default": 1.0 + }, + "tools": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ChatCompletionToolsParam" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tools" + }, + "tool_choice": { + "anyOf": [ + { + "type": "string", + "enum": [ + "none" + ], + "const": "none" + }, + { + "type": "string", + "enum": [ + "auto" + ], + "const": "auto" + }, + { + "$ref": "#/components/schemas/ChatCompletionNamedToolChoiceParam" + }, + { + "type": "null" + } + ], + "title": "Tool Choice", + "default": "none" + }, + "parallel_tool_calls": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Parallel Tool Calls", + "default": false + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "best_of": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Best Of" + }, + "use_beam_search": { + "type": "boolean", + "title": "Use Beam Search", + "default": false + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": -1 + }, + "min_p": { + "type": "number", + "title": "Min P", + "default": 0.0 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1.0 + }, + "length_penalty": { + "type": "number", + "title": "Length Penalty", + "default": 1.0 + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "stop_token_ids": { + "anyOf": [ + { + 
"items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop Token Ids" + }, + "include_stop_str_in_output": { + "type": "boolean", + "title": "Include Stop Str In Output", + "default": false + }, + "ignore_eos": { + "type": "boolean", + "title": "Ignore Eos", + "default": false + }, + "min_tokens": { + "type": "integer", + "title": "Min Tokens", + "default": 0 + }, + "skip_special_tokens": { + "type": "boolean", + "title": "Skip Special Tokens", + "default": true + }, + "spaces_between_special_tokens": { + "type": "boolean", + "title": "Spaces Between Special Tokens", + "default": true + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "prompt_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Prompt Logprobs" + }, + "echo": { + "type": "boolean", + "title": "Echo", + "description": "If true, the new message will be prepended with the last message if they belong to the same role.", + "default": false + }, + "add_generation_prompt": { + "type": "boolean", + "title": "Add Generation Prompt", + "description": "If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.", + "default": true + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "description": "If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. 
For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).", + "default": false + }, + "documents": { + "anyOf": [ + { + "items": { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Documents", + "description": "A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing \"title\" and \"text\" keys." + }, + "chat_template": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Chat Template", + "description": "A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one." + }, + "chat_template_kwargs": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Chat Template Kwargs", + "description": "Additional kwargs to pass to the template renderer. Will be accessible by the chat template." + }, + "guided_json": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "$ref": "#/components/schemas/BaseModel" + }, + { + "type": "null" + } + ], + "title": "Guided Json", + "description": "If specified, the output will follow the JSON schema." + }, + "guided_regex": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Regex", + "description": "If specified, the output will follow the regex pattern." + }, + "guided_choice": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Guided Choice", + "description": "If specified, the output will be exactly one of the choices." 
+ }, + "guided_grammar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Grammar", + "description": "If specified, the output will follow the context free grammar." + }, + "guided_decoding_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Decoding Backend", + "description": "If specified, will override the default guided decoding backend of the server for this specific request. If set, must be either 'outlines' / 'lm-format-enforcer'" + }, + "guided_whitespace_pattern": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Whitespace Pattern", + "description": "If specified, will override the default whitespace pattern for guided json decoding." + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "messages", + "model" + ], + "title": "ChatCompletionRequest" + }, + "ChatCompletionSystemMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "system" + ], + "const": "system", + "title": "Role" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "ChatCompletionSystemMessageParam" + }, + "ChatCompletionToolMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "tool" + ], + "const": "tool", + "title": "Role" + }, + "tool_call_id": { + "type": "string", + "title": "Tool Call Id" + } + }, + "type": "object", + "required": [ + "content", + "role", + "tool_call_id" + ], + "title": 
"ChatCompletionToolMessageParam" + }, + "ChatCompletionToolsParam": { + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type", + "default": "function" + }, + "function": { + "$ref": "#/components/schemas/FunctionDefinition" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "function" + ], + "title": "ChatCompletionToolsParam" + }, + "ChatCompletionUserMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartImageParam" + } + ] + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "user" + ], + "const": "user", + "title": "Role" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "ChatCompletionUserMessageParam" + }, + "CompletionRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "prompt": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "type": "array" + }, + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], + "title": "Prompt" + }, + "best_of": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Best Of" + }, + "echo": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Echo", + "default": false + }, + "frequency_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "default": 0.0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" + }, + { + "type": "null" + } + 
], + "title": "Logit Bias" + }, + "logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Logprobs" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens", + "default": 16 + }, + "n": { + "type": "integer", + "title": "N", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "default": 0.0 + }, + "seed": { + "anyOf": [ + { + "type": "integer", + "maximum": 9.223372036854776e+18, + "minimum": -9.223372036854776e+18 + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "stop": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop" + }, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "default": false + }, + "stream_options": { + "anyOf": [ + { + "$ref": "#/components/schemas/StreamOptions" + }, + { + "type": "null" + } + ] + }, + "suffix": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Suffix" + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "default": 1.0 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "default": 1.0 + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "use_beam_search": { + "type": "boolean", + "title": "Use Beam Search", + "default": false + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": -1 + }, + "min_p": { + "type": "number", + "title": "Min P", + "default": 0.0 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1.0 + }, + "length_penalty": { + "type": "number", + "title": "Length 
Penalty", + "default": 1.0 + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "stop_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop Token Ids" + }, + "include_stop_str_in_output": { + "type": "boolean", + "title": "Include Stop Str In Output", + "default": false + }, + "ignore_eos": { + "type": "boolean", + "title": "Ignore Eos", + "default": false + }, + "min_tokens": { + "type": "integer", + "title": "Min Tokens", + "default": 0 + }, + "skip_special_tokens": { + "type": "boolean", + "title": "Skip Special Tokens", + "default": true + }, + "spaces_between_special_tokens": { + "type": "boolean", + "title": "Spaces Between Special Tokens", + "default": true + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "allowed_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Allowed Token Ids" + }, + "prompt_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Prompt Logprobs" + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "description": "If true (the default), special tokens (e.g. BOS) will be added to the prompt.", + "default": true + }, + "response_format": { + "anyOf": [ + { + "$ref": "#/components/schemas/ResponseFormat" + }, + { + "type": "null" + } + ], + "description": "Similar to chat completion, this parameter specifies the format of output. Only {'type': 'json_object'} or {'type': 'text' } is supported." 
+ }, + "guided_json": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "$ref": "#/components/schemas/BaseModel" + }, + { + "type": "null" + } + ], + "title": "Guided Json", + "description": "If specified, the output will follow the JSON schema." + }, + "guided_regex": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Regex", + "description": "If specified, the output will follow the regex pattern." + }, + "guided_choice": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Guided Choice", + "description": "If specified, the output will be exactly one of the choices." + }, + "guided_grammar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Grammar", + "description": "If specified, the output will follow the context free grammar." + }, + "guided_decoding_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Decoding Backend", + "description": "If specified, will override the default guided decoding backend of the server for this specific request. If set, must be one of 'outlines' / 'lm-format-enforcer'" + }, + "guided_whitespace_pattern": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Whitespace Pattern", + "description": "If specified, will override the default whitespace pattern for guided json decoding." 
+ } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "prompt" + ], + "title": "CompletionRequest" + }, + "CustomChatCompletionContentPartParam": { + "properties": { + "type": { + "type": "string", + "title": "Type" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "type" + ], + "title": "CustomChatCompletionContentPartParam" + }, + "CustomChatCompletionMessageParam": { + "properties": { + "role": { + "type": "string", + "title": "Role" + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartImageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartAudioParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartRefusalParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionContentPartParam" + } + ] + }, + "type": "array" + } + ], + "title": "Content" + }, + "name": { + "type": "string", + "title": "Name" + }, + "tool_call_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Tool Call Id" + }, + "tool_calls": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ChatCompletionMessageToolCallParam" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool Calls" + } + }, + "type": "object", + "required": [ + "role" + ], + "title": "CustomChatCompletionMessageParam", + "description": "Enables custom roles in the Chat Completion API." 
+ }, + "DetokenizeRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "tokens": { + "items": { + "type": "integer" + }, + "type": "array", + "title": "Tokens" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "tokens" + ], + "title": "DetokenizeRequest" + }, + "EmbeddingRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "input": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "type": "array" + }, + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], + "title": "Input" + }, + "encoding_format": { + "type": "string", + "enum": [ + "float", + "base64" + ], + "title": "Encoding Format", + "default": "float" + }, + "dimensions": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Dimensions" + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "additional_data": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "title": "Additional Data" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "input" + ], + "title": "EmbeddingRequest" + }, + "Function": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "arguments", + "name" + ], + "title": "Function" + }, + "FunctionCall": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "arguments", + "name" + ], + "title": "FunctionCall" + }, + "FunctionDefinition": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ + { + 
"type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "parameters": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Parameters" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "FunctionDefinition" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "ImageURL": { + "properties": { + "url": { + "type": "string", + "title": "Url" + }, + "detail": { + "type": "string", + "enum": [ + "auto", + "low", + "high" + ], + "title": "Detail" + } + }, + "type": "object", + "required": [ + "url" + ], + "title": "ImageURL" + }, + "JsonSchemaResponseFormat": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "schema": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Schema" + }, + "strict": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Strict" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "JsonSchemaResponseFormat" + }, + "ResponseFormat": { + "properties": { + "type": { + "type": "string", + "enum": [ + "text", + "json_object", + "json_schema" + ], + "title": "Type" + }, + "json_schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "type": "null" + } + ] + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "type" + ], + "title": "ResponseFormat" + }, + "StreamOptions": { + "properties": { + "include_usage": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Include Usage", + "default": true + }, + 
"continuous_usage_stats": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Continuous Usage Stats", + "default": true + } + }, + "additionalProperties": false, + "type": "object", + "title": "StreamOptions" + }, + "TokenizeChatRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "messages": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionSystemMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionUserMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionToolMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionFunctionMessageParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionMessageParam" + } + ] + }, + "type": "array", + "title": "Messages" + }, + "add_generation_prompt": { + "type": "boolean", + "title": "Add Generation Prompt", + "default": true + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "default": false + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "messages" + ], + "title": "TokenizeChatRequest" + }, + "TokenizeCompletionRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "prompt": { + "type": "string", + "title": "Prompt" + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "default": true + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "prompt" + ], + "title": "TokenizeCompletionRequest" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" + } + }, + "type": "object", + 
"required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" + } + } + } +} \ No newline at end of file diff --git a/presets/inference/vllm/inference_api.py b/presets/inference/vllm/inference_api.py new file mode 100644 index 000000000..ab2613e9e --- /dev/null +++ b/presets/inference/vllm/inference_api.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +import logging +import os + +import uvloop +from vllm.utils import FlexibleArgumentParser +import vllm.entrypoints.openai.api_server as api_server + +# Initialize logger +logger = logging.getLogger(__name__) +debug_mode = os.environ.get('DEBUG_MODE', 'false').lower() == 'true' +logging.basicConfig(level=logging.DEBUG if debug_mode else logging.INFO) + +def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + local_rank = int(os.environ.get("LOCAL_RANK", + 0)) # Default to 0 if not set + port = 5000 + local_rank # Adjust port based on local rank + + server_default_args = { + "disable-frontend-multiprocessing": False, + "port": port + } + parser.set_defaults(**server_default_args) + + # See https://docs.vllm.ai/en/latest/models/engine_args.html for more args + engine_default_args = { + "model": "/workspace/tfs/weights", + "cpu-offload-gb": 0, + "gpu-memory-utilization": 0.9, + "swap-space": 4, + "disable-log-stats": False, + } + parser.set_defaults(**engine_default_args) + + return parser + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description='vLLM serving server') + parser = api_server.make_arg_parser(parser) + parser = make_arg_parser(parser) + args = parser.parse_args() + + # Run the serving server + logger.info(f"Starting server on port {args.port}") + # See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html for more + # details about serving server. 
+ # endpoints: + # - /health + # - /tokenize + # - /detokenize + # - /v1/models + # - /version + # - /v1/chat/completions + # - /v1/completions + # - /v1/embeddings + uvloop.run(api_server.run_server(args)) diff --git a/presets/inference/vllm/requirements.txt b/presets/inference/vllm/requirements.txt new file mode 100644 index 000000000..4481a9966 --- /dev/null +++ b/presets/inference/vllm/requirements.txt @@ -0,0 +1,11 @@ +# Dependencies for vllm + +# Core Dependencies +vllm==0.6.3 +torch==2.4.0 +uvloop +numpy + +# For UTs +pytest +requests \ No newline at end of file diff --git a/presets/inference/vllm/tests/test_inference_api.py b/presets/inference/vllm/tests/test_inference_api.py new file mode 100644 index 000000000..30ae9cc7f --- /dev/null +++ b/presets/inference/vllm/tests/test_inference_api.py @@ -0,0 +1,114 @@ +import sys +import os +import subprocess +import time +import socket +from pathlib import Path + +import pytest +import requests + +# Get the parent directory of the current file +parent_dir = str(Path(__file__).resolve().parent.parent) +# Add the parent directory to sys.path +sys.path.append(parent_dir) + +TEST_MODEL = "facebook/opt-125m" +CHAT_TEMPLATE = ("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}" + "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}" + "{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}") + +@pytest.fixture(scope="module", autouse=True) +def setup_server(request): + if os.getenv("DEVICE") == "cpu": + pytest.skip("Skipping test on cpu device") + print("\n>>> Doing setup") + port = find_available_port() + global TEST_PORT + TEST_PORT = port + + args = [ + "python3", + os.path.join(parent_dir, "inference_api.py"), + "--model", TEST_MODEL, + "--chat-template", CHAT_TEMPLATE, + "--port", str(TEST_PORT) + ] + print(f">>> Starting server on port {TEST_PORT}") + process = subprocess.Popen(args, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def fin(): + process.terminate() + process.wait() + stderr = process.stderr.read().decode() + print(f">>> Server stderr: {stderr}") + stdout = process.stdout.read().decode() + print(f">>> Server stdout: {stdout}") + print ("\n>>> Doing teardown") + + if not is_port_open("localhost", TEST_PORT): + fin() + pytest.fail("failed to launch vllm server") + + request.addfinalizer(fin) + +def is_port_open(host, port, timeout=60): + start_time = time.time() + while time.time() - start_time < timeout: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(1) # Set a short timeout for each connection attempt + result = sock.connect_ex((host, port)) + print(">>> waiting for server to start") + if result == 0: + print(f">>> server started in {int(time.time() - start_time)} seconds") + return True + time.sleep(1) # Wait a bit before retrying + return False + +def find_available_port(start_port=5000, end_port=8000): + for port in range(start_port, end_port + 1): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', port)) != 0: + return port + raise RuntimeError('No available ports found') + +def test_completions_api(setup_server): + request_data = { + "model": TEST_MODEL, + "prompt": "Say this is a test", + "max_tokens": 7, + "temperature": 0.5, + "n": 2 + } + + response = requests.post(f"http://127.0.0.1:{TEST_PORT}/v1/completions", json=request_data) + data = response.json() + assert "choices" in data, "The response should contain a 'choices' key" + assert len(data["choices"]) == 2, "The response should contain two completions" + + for choice in data["choices"]: + assert "text" in choice, "Each choice should contain a 'text' key" + assert len(choice["text"]) > 0, "The completion text should not be empty" + +def test_chat_completions_api(setup_server): + request_data = { + "model": TEST_MODEL, + "messages": [ + {"role": "user", "content": "Hello!"}, + {"role": 
"assistant", "content": "Hi there! How can I help you today?"} + ], + "max_tokens": 7, + "temperature": 0.5, + "n": 2 + } + + response = requests.post(f"http://127.0.0.1:{TEST_PORT}/v1/chat/completions", json=request_data) + data = response.json() + + assert "choices" in data, "The response should contain a 'choices' key" + assert len(data["choices"]) == 2, "The response should contain two completion" + + for choice in data["choices"]: + assert "message" in choice, "Each choice should contain a 'message' key" + assert "content" in choice["message"], "Each message should contain a 'content' key" + assert len(choice["message"]["content"]) > 0, "The completion text should not be empty" \ No newline at end of file