diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5015d742f..4e0044db1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,7 +50,7 @@ jobs: - name: Run inference api e2e tests run: | - make inference-api-e2e + DEVICE=cpu make inference-api-e2e - name: Upload Codecov report uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673 # v4.5.0 diff --git a/.gitignore b/.gitignore index 055cbc388..501d73b21 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.dylib bin/* Dockerfile.cross +__pycache__/ # Test binary, build with `go test -c` *.test diff --git a/Makefile b/Makefile index 699e557b5..08ca8fcb0 100644 --- a/Makefile +++ b/Makefile @@ -97,24 +97,25 @@ unit-test: ## Run unit tests. -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt +.PHONY: virtualenv +virtualenv: + pip install virtualenv + .PHONY: rag-service-test -rag-service-test: - pip install -r ragengine/requirements.txt - pytest -o log_cli=true -o log_cli_level=INFO ragengine/tests +rag-service-test: virtualenv + ./hack/run-pytest-in-venv.sh ragengine/tests ragengine/requirements.txt .PHONY: tuning-metrics-server-test -tuning-metrics-server-test: - pip install -r presets/inference/text-generation/requirements.txt - pytest -o log_cli=true -o log_cli_level=INFO presets/tuning/text-generation/metrics +tuning-metrics-server-test: virtualenv + ./hack/run-pytest-in-venv.sh presets/tuning/text-generation/metrics presets/tuning/text-generation/requirements.txt ## -------------------------------------- ## E2E tests ## -------------------------------------- -.PHONY: inference-api-e2e -inference-api-e2e: - pip install -r presets/inference/text-generation/requirements.txt - pytest -o log_cli=true -o log_cli_level=INFO presets/inference/text-generation/tests +inference-api-e2e: virtualenv + ./hack/run-pytest-in-venv.sh presets/inference/vllm presets/inference/vllm/requirements.txt + ./hack/run-pytest-in-venv.sh 
presets/inference/text-generation presets/inference/text-generation/requirements.txt # Ginkgo configurations GINKGO_FOCUS ?= diff --git a/hack/run-pytest-in-venv.sh b/hack/run-pytest-in-venv.sh new file mode 100755 index 000000000..3e73b5cfd --- /dev/null +++ b/hack/run-pytest-in-venv.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <test_dir> <requirements_file>" + exit 1 +fi + +PROJECT_DIR=$(dirname "$(dirname "$(realpath "$0")")") + +TEST_DIR="$PROJECT_DIR/$1" +REQUIREMENTS="$PROJECT_DIR/$2" +VENV_DIR=$(mktemp -d) + +cleanup() { + rm -rf "$VENV_DIR" +} +trap cleanup EXIT + +cd $VENV_DIR +printf "Creating virtual environment in %s\n" "$VENV_DIR" +python3 -m virtualenv venv +source "$VENV_DIR/venv/bin/activate" +if [ "$?" -ne 0 ]; then + printf "Failed to activate virtual environment\n" + exit 1 +fi + +printf "Installing requirements from %s\n" "$REQUIREMENTS" +pip install -r "$REQUIREMENTS" > "$VENV_DIR/pip.log" +if [ "$?" -ne 0 ]; then + cat "$VENV_DIR/pip.log" + exit 1 +fi + +printf "Running tests in %s\n" "$TEST_DIR" +pytest -o log_cli=true -o log_cli_level=INFO "$TEST_DIR" diff --git a/presets/inference/llama2-chat/inference_api.py b/presets/inference/llama2-chat/inference_api.py index 11776bf3d..b5eb11d3a 100644 --- a/presets/inference/llama2-chat/inference_api.py +++ b/presets/inference/llama2-chat/inference_api.py @@ -192,7 +192,7 @@ def get_metrics(): return {"error": str(e)} def setup_worker_routes(): - @app_worker.get("/healthz") + @app_worker.get("/health") def health_check(): if not torch.cuda.is_available(): raise HTTPException(status_code=500, detail="No GPU available") diff --git a/presets/inference/text-generation/api_spec.json b/presets/inference/text-generation/api_spec.json index 480fa97e4..13d64722a 100644 --- a/presets/inference/text-generation/api_spec.json +++ b/presets/inference/text-generation/api_spec.json @@ -24,7 +24,7 @@ } } }, - "/healthz": { + "/health": { "get": { "summary": "Health Check Endpoint", "operationId": 
"health_check_healthz_get", diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index b9381e220..e9aca92e4 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -181,7 +181,7 @@ def home(): class HealthStatus(BaseModel): status: str = Field(..., example="Healthy") @app.get( - "/healthz", + "/health", response_model=HealthStatus, summary="Health Check Endpoint", responses={ @@ -461,7 +461,7 @@ def get_metrics(): if torch.cuda.is_available(): gpus = GPUtil.getGPUs() gpu_info = [GPUInfo( - id=gpu.id, + id=str(gpu.id), name=gpu.name, load=f"{gpu.load * 100:.2f}%", temperature=f"{gpu.temperature} C", diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index 667f5eab7..baedbb832 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -108,7 +108,7 @@ def test_read_main(configured_app): def test_health_check(configured_app): client = TestClient(configured_app) - response = client.get("/healthz") + response = client.get("/health") assert response.status_code == 200 assert response.json() == {"status": "Healthy"} diff --git a/presets/inference/vllm/api_spec.json b/presets/inference/vllm/api_spec.json new file mode 100644 index 000000000..a3ffa492b --- /dev/null +++ b/presets/inference/vllm/api_spec.json @@ -0,0 +1,2130 @@ +{ + "openapi": "3.1.0", + "info": { + "title": "FastAPI", + "version": "0.1.0" + }, + "paths": { + "/health": { + "get": { + "summary": "Health", + "description": "Health check.", + "operationId": "health_health_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/tokenize": { + "post": { + "summary": "Tokenize", + "operationId": 
"tokenize_tokenize_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/TokenizeCompletionRequest" + }, + { + "$ref": "#/components/schemas/TokenizeChatRequest" + } + ], + "title": "Request" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/detokenize": { + "post": { + "summary": "Detokenize", + "operationId": "detokenize_detokenize_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DetokenizeRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/models": { + "get": { + "summary": "Show Available Models", + "operationId": "show_available_models_v1_models_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/version": { + "get": { + "summary": "Show Version", + "operationId": "show_version_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/v1/chat/completions": { + "post": { + "summary": "Create Chat Completion", + "operationId": "create_chat_completion_v1_chat_completions_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatCompletionRequest" + } 
+ } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/completions": { + "post": { + "summary": "Create Completion", + "operationId": "create_completion_v1_completions_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CompletionRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/embeddings": { + "post": { + "summary": "Create Embedding", + "operationId": "create_embedding_v1_embeddings_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EmbeddingRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AudioURL": { + "properties": { + "url": { + "type": "string", + "title": "Url" + } + }, + "type": "object", + "required": [ + "url" + ], + "title": "AudioURL" + }, + "BaseModel": { + "properties": {}, + "type": "object", + "title": "BaseModel" + }, + "ChatCompletionAssistantMessageParam": { + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ], + "const": "assistant", + 
"title": "Role" + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartRefusalParam" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "function_call": { + "anyOf": [ + { + "$ref": "#/components/schemas/FunctionCall" + }, + { + "type": "null" + } + ] + }, + "name": { + "type": "string", + "title": "Name" + }, + "refusal": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Refusal" + }, + "tool_calls": { + "items": { + "$ref": "#/components/schemas/ChatCompletionMessageToolCallParam" + }, + "type": "array", + "title": "Tool Calls" + } + }, + "type": "object", + "required": [ + "role" + ], + "title": "ChatCompletionAssistantMessageParam" + }, + "ChatCompletionContentPartAudioParam": { + "properties": { + "audio_url": { + "$ref": "#/components/schemas/AudioURL" + }, + "type": { + "type": "string", + "enum": [ + "audio_url" + ], + "const": "audio_url", + "title": "Type" + } + }, + "type": "object", + "required": [ + "audio_url", + "type" + ], + "title": "ChatCompletionContentPartAudioParam" + }, + "ChatCompletionContentPartImageParam": { + "properties": { + "image_url": { + "$ref": "#/components/schemas/ImageURL" + }, + "type": { + "type": "string", + "enum": [ + "image_url" + ], + "const": "image_url", + "title": "Type" + } + }, + "type": "object", + "required": [ + "image_url", + "type" + ], + "title": "ChatCompletionContentPartImageParam" + }, + "ChatCompletionContentPartRefusalParam": { + "properties": { + "refusal": { + "type": "string", + "title": "Refusal" + }, + "type": { + "type": "string", + "enum": [ + "refusal" + ], + "const": "refusal", + "title": "Type" + } + }, + "type": "object", + "required": [ + "refusal", + "type" + ], + "title": "ChatCompletionContentPartRefusalParam" + }, + 
"ChatCompletionContentPartTextParam": { + "properties": { + "text": { + "type": "string", + "title": "Text" + }, + "type": { + "type": "string", + "enum": [ + "text" + ], + "const": "text", + "title": "Type" + } + }, + "type": "object", + "required": [ + "text", + "type" + ], + "title": "ChatCompletionContentPartTextParam" + }, + "ChatCompletionFunctionMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "name": { + "type": "string", + "title": "Name" + }, + "role": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Role" + } + }, + "type": "object", + "required": [ + "content", + "name", + "role" + ], + "title": "ChatCompletionFunctionMessageParam" + }, + "ChatCompletionMessageToolCallParam": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "function": { + "$ref": "#/components/schemas/Function" + }, + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type" + } + }, + "type": "object", + "required": [ + "id", + "function", + "type" + ], + "title": "ChatCompletionMessageToolCallParam" + }, + "ChatCompletionNamedFunction": { + "properties": { + "name": { + "type": "string", + "title": "Name" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "ChatCompletionNamedFunction" + }, + "ChatCompletionNamedToolChoiceParam": { + "properties": { + "function": { + "$ref": "#/components/schemas/ChatCompletionNamedFunction" + }, + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type", + "default": "function" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "function" + ], + "title": "ChatCompletionNamedToolChoiceParam" + }, + "ChatCompletionRequest": { + "properties": { + "messages": { + "items": { + "anyOf": [ + { + "$ref": 
"#/components/schemas/ChatCompletionSystemMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionUserMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionToolMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionFunctionMessageParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionMessageParam" + } + ] + }, + "type": "array", + "title": "Messages" + }, + "model": { + "type": "string", + "title": "Model" + }, + "frequency_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "default": 0.0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Logit Bias" + }, + "logprobs": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Logprobs", + "default": false + }, + "top_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Top Logprobs", + "default": 0 + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens" + }, + "n": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "N", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "default": 0.0 + }, + "response_format": { + "anyOf": [ + { + "$ref": "#/components/schemas/ResponseFormat" + }, + { + "type": "null" + } + ] + }, + "seed": { + "anyOf": [ + { + "type": "integer", + "maximum": 9.223372036854776e+18, + "minimum": -9.223372036854776e+18 + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "stop": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop" + 
}, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "default": false + }, + "stream_options": { + "anyOf": [ + { + "$ref": "#/components/schemas/StreamOptions" + }, + { + "type": "null" + } + ] + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "default": 0.7 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "default": 1.0 + }, + "tools": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ChatCompletionToolsParam" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tools" + }, + "tool_choice": { + "anyOf": [ + { + "type": "string", + "enum": [ + "none" + ], + "const": "none" + }, + { + "type": "string", + "enum": [ + "auto" + ], + "const": "auto" + }, + { + "$ref": "#/components/schemas/ChatCompletionNamedToolChoiceParam" + }, + { + "type": "null" + } + ], + "title": "Tool Choice", + "default": "none" + }, + "parallel_tool_calls": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Parallel Tool Calls", + "default": false + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "best_of": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Best Of" + }, + "use_beam_search": { + "type": "boolean", + "title": "Use Beam Search", + "default": false + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": -1 + }, + "min_p": { + "type": "number", + "title": "Min P", + "default": 0.0 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1.0 + }, + "length_penalty": { + "type": "number", + "title": "Length Penalty", + "default": 1.0 + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "stop_token_ids": { + "anyOf": [ + { + 
"items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop Token Ids" + }, + "include_stop_str_in_output": { + "type": "boolean", + "title": "Include Stop Str In Output", + "default": false + }, + "ignore_eos": { + "type": "boolean", + "title": "Ignore Eos", + "default": false + }, + "min_tokens": { + "type": "integer", + "title": "Min Tokens", + "default": 0 + }, + "skip_special_tokens": { + "type": "boolean", + "title": "Skip Special Tokens", + "default": true + }, + "spaces_between_special_tokens": { + "type": "boolean", + "title": "Spaces Between Special Tokens", + "default": true + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "prompt_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Prompt Logprobs" + }, + "echo": { + "type": "boolean", + "title": "Echo", + "description": "If true, the new message will be prepended with the last message if they belong to the same role.", + "default": false + }, + "add_generation_prompt": { + "type": "boolean", + "title": "Add Generation Prompt", + "description": "If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.", + "default": true + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "description": "If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. 
For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).", + "default": false + }, + "documents": { + "anyOf": [ + { + "items": { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Documents", + "description": "A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing \"title\" and \"text\" keys." + }, + "chat_template": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Chat Template", + "description": "A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one." + }, + "chat_template_kwargs": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Chat Template Kwargs", + "description": "Additional kwargs to pass to the template renderer. Will be accessible by the chat template." + }, + "guided_json": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "$ref": "#/components/schemas/BaseModel" + }, + { + "type": "null" + } + ], + "title": "Guided Json", + "description": "If specified, the output will follow the JSON schema." + }, + "guided_regex": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Regex", + "description": "If specified, the output will follow the regex pattern." + }, + "guided_choice": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Guided Choice", + "description": "If specified, the output will be exactly one of the choices." 
+ }, + "guided_grammar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Grammar", + "description": "If specified, the output will follow the context free grammar." + }, + "guided_decoding_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Decoding Backend", + "description": "If specified, will override the default guided decoding backend of the server for this specific request. If set, must be either 'outlines' / 'lm-format-enforcer'" + }, + "guided_whitespace_pattern": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Whitespace Pattern", + "description": "If specified, will override the default whitespace pattern for guided json decoding." + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "messages", + "model" + ], + "title": "ChatCompletionRequest" + }, + "ChatCompletionSystemMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "system" + ], + "const": "system", + "title": "Role" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "ChatCompletionSystemMessageParam" + }, + "ChatCompletionToolMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "tool" + ], + "const": "tool", + "title": "Role" + }, + "tool_call_id": { + "type": "string", + "title": "Tool Call Id" + } + }, + "type": "object", + "required": [ + "content", + "role", + "tool_call_id" + ], + "title": 
"ChatCompletionToolMessageParam" + }, + "ChatCompletionToolsParam": { + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type", + "default": "function" + }, + "function": { + "$ref": "#/components/schemas/FunctionDefinition" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "function" + ], + "title": "ChatCompletionToolsParam" + }, + "ChatCompletionUserMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartImageParam" + } + ] + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "user" + ], + "const": "user", + "title": "Role" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "ChatCompletionUserMessageParam" + }, + "CompletionRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "prompt": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "type": "array" + }, + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], + "title": "Prompt" + }, + "best_of": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Best Of" + }, + "echo": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Echo", + "default": false + }, + "frequency_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "default": 0.0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" + }, + { + "type": "null" + } + 
], + "title": "Logit Bias" + }, + "logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Logprobs" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens", + "default": 16 + }, + "n": { + "type": "integer", + "title": "N", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "default": 0.0 + }, + "seed": { + "anyOf": [ + { + "type": "integer", + "maximum": 9.223372036854776e+18, + "minimum": -9.223372036854776e+18 + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "stop": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop" + }, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "default": false + }, + "stream_options": { + "anyOf": [ + { + "$ref": "#/components/schemas/StreamOptions" + }, + { + "type": "null" + } + ] + }, + "suffix": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Suffix" + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "default": 1.0 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "default": 1.0 + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "use_beam_search": { + "type": "boolean", + "title": "Use Beam Search", + "default": false + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": -1 + }, + "min_p": { + "type": "number", + "title": "Min P", + "default": 0.0 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1.0 + }, + "length_penalty": { + "type": "number", + "title": "Length 
Penalty", + "default": 1.0 + }, + "early_stopping": { + "type": "boolean", + "title": "Early Stopping", + "default": false + }, + "stop_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop Token Ids" + }, + "include_stop_str_in_output": { + "type": "boolean", + "title": "Include Stop Str In Output", + "default": false + }, + "ignore_eos": { + "type": "boolean", + "title": "Ignore Eos", + "default": false + }, + "min_tokens": { + "type": "integer", + "title": "Min Tokens", + "default": 0 + }, + "skip_special_tokens": { + "type": "boolean", + "title": "Skip Special Tokens", + "default": true + }, + "spaces_between_special_tokens": { + "type": "boolean", + "title": "Spaces Between Special Tokens", + "default": true + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "allowed_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Allowed Token Ids" + }, + "prompt_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Prompt Logprobs" + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "description": "If true (the default), special tokens (e.g. BOS) will be added to the prompt.", + "default": true + }, + "response_format": { + "anyOf": [ + { + "$ref": "#/components/schemas/ResponseFormat" + }, + { + "type": "null" + } + ], + "description": "Similar to chat completion, this parameter specifies the format of output. Only {'type': 'json_object'} or {'type': 'text' } is supported." 
+ }, + "guided_json": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "$ref": "#/components/schemas/BaseModel" + }, + { + "type": "null" + } + ], + "title": "Guided Json", + "description": "If specified, the output will follow the JSON schema." + }, + "guided_regex": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Regex", + "description": "If specified, the output will follow the regex pattern." + }, + "guided_choice": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Guided Choice", + "description": "If specified, the output will be exactly one of the choices." + }, + "guided_grammar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Grammar", + "description": "If specified, the output will follow the context free grammar." + }, + "guided_decoding_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Decoding Backend", + "description": "If specified, will override the default guided decoding backend of the server for this specific request. If set, must be one of 'outlines' / 'lm-format-enforcer'" + }, + "guided_whitespace_pattern": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Whitespace Pattern", + "description": "If specified, will override the default whitespace pattern for guided json decoding." 
+ } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "prompt" + ], + "title": "CompletionRequest" + }, + "CustomChatCompletionContentPartParam": { + "properties": { + "type": { + "type": "string", + "title": "Type" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "type" + ], + "title": "CustomChatCompletionContentPartParam" + }, + "CustomChatCompletionMessageParam": { + "properties": { + "role": { + "type": "string", + "title": "Role" + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartImageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartAudioParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartRefusalParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionContentPartParam" + } + ] + }, + "type": "array" + } + ], + "title": "Content" + }, + "name": { + "type": "string", + "title": "Name" + }, + "tool_call_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Tool Call Id" + }, + "tool_calls": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ChatCompletionMessageToolCallParam" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool Calls" + } + }, + "type": "object", + "required": [ + "role" + ], + "title": "CustomChatCompletionMessageParam", + "description": "Enables custom roles in the Chat Completion API." 
+ }, + "DetokenizeRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "tokens": { + "items": { + "type": "integer" + }, + "type": "array", + "title": "Tokens" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "tokens" + ], + "title": "DetokenizeRequest" + }, + "EmbeddingRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "input": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "type": "array" + }, + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], + "title": "Input" + }, + "encoding_format": { + "type": "string", + "enum": [ + "float", + "base64" + ], + "title": "Encoding Format", + "default": "float" + }, + "dimensions": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Dimensions" + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "additional_data": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "title": "Additional Data" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "input" + ], + "title": "EmbeddingRequest" + }, + "Function": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "arguments", + "name" + ], + "title": "Function" + }, + "FunctionCall": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "arguments", + "name" + ], + "title": "FunctionCall" + }, + "FunctionDefinition": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ + { + 
"type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "parameters": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Parameters" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "FunctionDefinition" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "ImageURL": { + "properties": { + "url": { + "type": "string", + "title": "Url" + }, + "detail": { + "type": "string", + "enum": [ + "auto", + "low", + "high" + ], + "title": "Detail" + } + }, + "type": "object", + "required": [ + "url" + ], + "title": "ImageURL" + }, + "JsonSchemaResponseFormat": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "schema": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Schema" + }, + "strict": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Strict" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "JsonSchemaResponseFormat" + }, + "ResponseFormat": { + "properties": { + "type": { + "type": "string", + "enum": [ + "text", + "json_object", + "json_schema" + ], + "title": "Type" + }, + "json_schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "type": "null" + } + ] + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "type" + ], + "title": "ResponseFormat" + }, + "StreamOptions": { + "properties": { + "include_usage": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Include Usage", + "default": true + }, + 
"continuous_usage_stats": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Continuous Usage Stats", + "default": true + } + }, + "additionalProperties": false, + "type": "object", + "title": "StreamOptions" + }, + "TokenizeChatRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "messages": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionSystemMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionUserMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionToolMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionFunctionMessageParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionMessageParam" + } + ] + }, + "type": "array", + "title": "Messages" + }, + "add_generation_prompt": { + "type": "boolean", + "title": "Add Generation Prompt", + "default": true + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "default": false + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "messages" + ], + "title": "TokenizeChatRequest" + }, + "TokenizeCompletionRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "prompt": { + "type": "string", + "title": "Prompt" + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "default": true + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "prompt" + ], + "title": "TokenizeCompletionRequest" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" + } + }, + "type": "object", + 
"required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" + } + } + } +} \ No newline at end of file diff --git a/presets/inference/vllm/inference_api.py b/presets/inference/vllm/inference_api.py new file mode 100644 index 000000000..ab2613e9e --- /dev/null +++ b/presets/inference/vllm/inference_api.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +import logging +import os + +import uvloop +from vllm.utils import FlexibleArgumentParser +import vllm.entrypoints.openai.api_server as api_server + +# Initialize logger +logger = logging.getLogger(__name__) +debug_mode = os.environ.get('DEBUG_MODE', 'false').lower() == 'true' +logging.basicConfig(level=logging.DEBUG if debug_mode else logging.INFO) + +def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + local_rank = int(os.environ.get("LOCAL_RANK", + 0)) # Default to 0 if not set + port = 5000 + local_rank # Adjust port based on local rank + + server_default_args = { + "disable-frontend-multiprocessing": False, + "port": port + } + parser.set_defaults(**server_default_args) + + # See https://docs.vllm.ai/en/latest/models/engine_args.html for more args + engine_default_args = { + "model": "/workspace/tfs/weights", + "cpu-offload-gb": 0, + "gpu-memory-utilization": 0.9, + "swap-space": 4, + "disable-log-stats": False, + } + parser.set_defaults(**engine_default_args) + + return parser + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description='vLLM serving server') + parser = api_server.make_arg_parser(parser) + parser = make_arg_parser(parser) + args = parser.parse_args() + + # Run the serving server + logger.info(f"Starting server on port {args.port}") + # See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html for more + # details about serving server. 
+ # endpoints: + # - /health + # - /tokenize + # - /detokenize + # - /v1/models + # - /version + # - /v1/chat/completions + # - /v1/completions + # - /v1/embeddings + uvloop.run(api_server.run_server(args)) diff --git a/presets/inference/vllm/requirements.txt b/presets/inference/vllm/requirements.txt new file mode 100644 index 000000000..4481a9966 --- /dev/null +++ b/presets/inference/vllm/requirements.txt @@ -0,0 +1,11 @@ +# Dependencies for vllm + +# Core Dependencies +vllm==0.6.3 +torch==2.4.0 +uvloop +numpy + +# For UTs +pytest +requests \ No newline at end of file diff --git a/presets/inference/vllm/tests/test_inference_api.py b/presets/inference/vllm/tests/test_inference_api.py new file mode 100644 index 000000000..30ae9cc7f --- /dev/null +++ b/presets/inference/vllm/tests/test_inference_api.py @@ -0,0 +1,114 @@ +import sys +import os +import subprocess +import time +import socket +from pathlib import Path + +import pytest +import requests + +# Get the parent directory of the current file +parent_dir = str(Path(__file__).resolve().parent.parent) +# Add the parent directory to sys.path +sys.path.append(parent_dir) + +TEST_MODEL = "facebook/opt-125m" +CHAT_TEMPLATE = ("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}" + "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}" + "{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}") + +@pytest.fixture(scope="module", autouse=True) +def setup_server(request): + if os.getenv("DEVICE") == "cpu": + pytest.skip("Skipping test on cpu device") + print("\n>>> Doing setup") + port = find_available_port() + global TEST_PORT + TEST_PORT = port + + args = [ + "python3", + os.path.join(parent_dir, "inference_api.py"), + "--model", TEST_MODEL, + "--chat-template", CHAT_TEMPLATE, + "--port", str(TEST_PORT) + ] + print(f">>> Starting server on port {TEST_PORT}") + process = subprocess.Popen(args, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def fin(): + process.terminate() + process.wait() + stderr = process.stderr.read().decode() + print(f">>> Server stderr: {stderr}") + stdout = process.stdout.read().decode() + print(f">>> Server stdout: {stdout}") + print ("\n>>> Doing teardown") + + if not is_port_open("localhost", TEST_PORT): + fin() + pytest.fail("failed to launch vllm server") + + request.addfinalizer(fin) + +def is_port_open(host, port, timeout=60): + start_time = time.time() + while time.time() - start_time < timeout: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(1) # Set a short timeout for each connection attempt + result = sock.connect_ex((host, port)) + print(">>> waiting for server to start") + if result == 0: + print(f">>> server started in {int(time.time() - start_time)} seconds") + return True + time.sleep(1) # Wait a bit before retrying + return False + +def find_available_port(start_port=5000, end_port=8000): + for port in range(start_port, end_port + 1): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', port)) != 0: + return port + raise RuntimeError('No available ports found') + +def test_completions_api(setup_server): + request_data = { + "model": TEST_MODEL, + "prompt": "Say this is a test", + "max_tokens": 7, + "temperature": 0.5, + "n": 2 + } + + response = requests.post(f"http://127.0.0.1:{TEST_PORT}/v1/completions", json=request_data) + data = response.json() + assert "choices" in data, "The response should contain a 'choices' key" + assert len(data["choices"]) == 2, "The response should contain two completions" + + for choice in data["choices"]: + assert "text" in choice, "Each choice should contain a 'text' key" + assert len(choice["text"]) > 0, "The completion text should not be empty" + +def test_chat_completions_api(setup_server): + request_data = { + "model": TEST_MODEL, + "messages": [ + {"role": "user", "content": "Hello!"}, + {"role": 
"assistant", "content": "Hi there! How can I help you today?"} + ], + "max_tokens": 7, + "temperature": 0.5, + "n": 2 + } + + response = requests.post(f"http://127.0.0.1:{TEST_PORT}/v1/chat/completions", json=request_data) + data = response.json() + + assert "choices" in data, "The response should contain a 'choices' key" + assert len(data["choices"]) == 2, "The response should contain two completion" + + for choice in data["choices"]: + assert "message" in choice, "Each choice should contain a 'message' key" + assert "content" in choice["message"], "Each message should contain a 'content' key" + assert len(choice["message"]["content"]) > 0, "The completion text should not be empty" \ No newline at end of file