From 02a2a7134794b1b274b0519dca1daa21b9b52a6e Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 28 Oct 2024 09:18:36 +0000 Subject: [PATCH] Update 2024-10-28 09:18:36 --- README.html | 7 +- _sources/README.md | 1 + _sources/embedding_model.ipynb | 259 ++++++-- _sources/openai_api.ipynb | 1006 +++++++++++++++++++++++++------- _sources/send_request.ipynb | 273 ++++++++- _static/css/custom_log.css | 29 + backend.html | 7 +- benchmark_and_profiling.html | 7 +- choices_methods.html | 7 +- contributor_guide.html | 7 +- custom_chat_template.html | 7 +- embedding_model.html | 111 +++- embedding_model.ipynb | 259 ++++++-- frontend.html | 7 +- genindex.html | 7 +- hyperparameter_tuning.html | 7 +- index.html | 7 +- install.html | 7 +- model_support.html | 7 +- openai_api.html | 561 ++++++++++++------ openai_api.ipynb | 1006 +++++++++++++++++++++++++------- release_process.html | 7 +- sampling_params.html | 7 +- search.html | 7 +- searchindex.js | 2 +- send_request.html | 96 ++- send_request.ipynb | 273 ++++++++- setup_github_runner.html | 7 +- troubleshooting.html | 7 +- 29 files changed, 3179 insertions(+), 816 deletions(-) create mode 100644 _static/css/custom_log.css diff --git a/README.html b/README.html index c136ab6..b6dc768 100644 --- a/README.html +++ b/README.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -523,7 +524,7 @@

 Deploy
-Last updated on Oct 27, 2024.
+Last updated on Oct 28, 2024.

diff --git a/_sources/README.md b/_sources/README.md index 052acbc..5dba730 100644 --- a/_sources/README.md +++ b/_sources/README.md @@ -20,6 +20,7 @@ make clean ### Serve (preview) Run an HTTP server and visit http://localhost:8000 in your browser. + ``` python3 -m http.server --d _build/html ``` diff --git a/_sources/embedding_model.ipynb b/_sources/embedding_model.ipynb index 0370084..d26743c 100644 --- a/_sources/embedding_model.ipynb +++ b/_sources/embedding_model.ipynb @@ -21,7 +21,7 @@ "The following code is equivalent to running this in the shell:\n", "```bash\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "```\n", "\n", "Remember to add `--is-embedding` to the command." @@ -32,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:22:53.085503Z", - "iopub.status.busy": "2024-10-27T23:22:53.085120Z", - "iopub.status.idle": "2024-10-27T23:23:32.527591Z", - "shell.execute_reply": "2024-10-27T23:23:32.526838Z" + "iopub.execute_input": "2024-10-28T09:15:14.536811Z", + "iopub.status.busy": "2024-10-28T09:15:14.536653Z", + "iopub.status.idle": "2024-10-28T09:15:54.999497Z", + "shell.execute_reply": "2024-10-28T09:15:54.998849Z" } }, "outputs": [ @@ -43,23 +43,144 @@ "name": "stdout", "output_type": "stream", "text": [ - "Embedding server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:15:25] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=237179517, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:40 TP0] Init torch distributed begin.\n" + ] + 
}, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] Load weight begin. avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:15:41 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00Embedding server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "embedding_process = execute_shell_command(\n", + "embedding_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30010\")\n", "\n", - "print(\"Embedding server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Embedding server is ready. Proceeding with the next steps.\")" ] }, { @@ -74,10 +195,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:32.562075Z", - "iopub.status.busy": "2024-10-27T23:23:32.561818Z", - "iopub.status.idle": "2024-10-27T23:23:33.771076Z", - "shell.execute_reply": "2024-10-27T23:23:33.770326Z" + "iopub.execute_input": "2024-10-28T09:15:55.001608Z", + "iopub.status.busy": "2024-10-28T09:15:55.001359Z", + "iopub.status.idle": "2024-10-28T09:15:56.216067Z", + "shell.execute_reply": "2024-10-28T09:15:56.215410Z" } }, "outputs": [ @@ -85,8 +206,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]\n" + "[2024-10-28 09:15:55 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:55] INFO: 127.0.0.1:59280 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59274 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -103,7 +251,7 @@ " \"embedding\"\n", "]\n", "\n", - "print(f\"Text embedding (first 10): {text_embedding[:10]}\")" + "highlight_text(f\"Text embedding (first 10): {text_embedding[:10]}\")" ] }, { @@ -118,10 +266,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:33.773259Z", - "iopub.status.busy": "2024-10-27T23:23:33.772776Z", - "iopub.status.idle": "2024-10-27T23:23:34.250269Z", - "shell.execute_reply": "2024-10-27T23:23:34.249623Z" + "iopub.execute_input": "2024-10-28T09:15:56.218030Z", + "iopub.status.busy": "2024-10-28T09:15:56.217835Z", + "iopub.status.idle": "2024-10-28T09:15:56.696733Z", + "shell.execute_reply": "2024-10-28T09:15:56.696187Z" } }, "outputs": [ @@ -129,8 +277,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59290 - \"POST /encode HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:15:56] The server is fired up and ready to roll!\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59300 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -145,7 +314,7 @@ ")\n", "\n", "embedding = response.data[0].embedding[:10]\n", - "print(f\"Text embedding (first 10): {embedding}\")" + "highlight_text(f\"Text embedding (first 10): {embedding}\")" ] }, { @@ -162,10 +331,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:34.252332Z", - "iopub.status.busy": "2024-10-27T23:23:34.251830Z", - "iopub.status.idle": "2024-10-27T23:23:40.028848Z", - "shell.execute_reply": "2024-10-27T23:23:40.028041Z" + "iopub.execute_input": "2024-10-28T09:15:56.698501Z", + "iopub.status.busy": "2024-10-28T09:15:56.698324Z", + "iopub.status.idle": "2024-10-28T09:16:02.484649Z", + "shell.execute_reply": "2024-10-28T09:16:02.483955Z" } }, "outputs": [ @@ -173,8 +342,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:16:02 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:02] INFO: 127.0.0.1:59034 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -196,7 +378,7 @@ " 0\n", "][\"embedding\"]\n", "\n", - "print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" + "highlight_text(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" ] }, { @@ -204,13 +386,24 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:40.031161Z", - "iopub.status.busy": "2024-10-27T23:23:40.030680Z", - "iopub.status.idle": "2024-10-27T23:23:42.843192Z", - "shell.execute_reply": "2024-10-27T23:23:42.842506Z" + "iopub.execute_input": "2024-10-28T09:16:02.486791Z", + "iopub.status.busy": "2024-10-28T09:16:02.486434Z", + "iopub.status.idle": "2024-10-28T09:16:05.293548Z", + "shell.execute_reply": "2024-10-28T09:16:05.292820Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:02] INFO: Shutting down\n", + "[2024-10-28 09:16:02] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:16:02] INFO: Application shutdown complete.\n", + "[2024-10-28 09:16:02] INFO: Finished server process [509328]\n" + ] + } + ], "source": [ "terminate_process(embedding_process)" ] diff --git a/_sources/openai_api.ipynb b/_sources/openai_api.ipynb index 3f07a6b..bcd5c32 
100644 --- a/_sources/openai_api.ipynb +++ b/_sources/openai_api.ipynb @@ -6,7 +6,9 @@ "source": [ "# OpenAI Compatible API\n", "\n", - "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n", + "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", + "\n", + "This tutorial aims at these popular APIs:\n", "\n", "- `chat/completions`\n", "- `completions`\n", @@ -30,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:45.484181Z", - "iopub.status.busy": "2024-10-27T23:23:45.484018Z", - "iopub.status.idle": "2024-10-27T23:24:23.959941Z", - "shell.execute_reply": "2024-10-27T23:24:23.959208Z" + "iopub.execute_input": "2024-10-28T09:16:07.904473Z", + "iopub.status.busy": "2024-10-28T09:16:07.904311Z", + "iopub.status.idle": "2024-10-28T09:16:46.330698Z", + "shell.execute_reply": "2024-10-28T09:16:46.330038Z" } }, "outputs": [ @@ -41,22 +43,124 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:16:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=52609006, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:16:35 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "server_process = execute_shell_command(\n", - " \"\"\"\n", - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", - "\"\"\"\n", + "server_process = lauch_sglang_server(\n", + " command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "\n", + "highlight_text(\"Server is ready. Proceeding with the next steps.\")" ] }, { @@ -64,10 +168,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:23.995371Z", - "iopub.status.busy": "2024-10-27T23:24:23.995106Z", - "iopub.status.idle": "2024-10-27T23:24:24.788840Z", - "shell.execute_reply": "2024-10-27T23:24:24.788201Z" + "iopub.execute_input": "2024-10-28T09:16:46.332812Z", + "iopub.status.busy": "2024-10-28T09:16:46.332554Z", + "iopub.status.idle": "2024-10-28T09:16:47.129366Z", + "shell.execute_reply": "2024-10-28T09:16:47.128802Z" } }, "outputs": [ @@ -75,8 +179,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='77e45b23e9b34ef0a65afd9598521768', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071464, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36690 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36696 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46] The server is fired up and ready to roll!\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. 
#running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 21.55, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='bdb569b5e77147d0b4ebe2a79b451814', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -98,7 +226,8 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -107,25 +236,7 @@ "source": [ "### Parameters\n", "\n", - "The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n", - "\n", - "- `messages`: List of messages in the conversation, each containing `role` and `content`\n", - "- `model`: The model identifier to use for completion\n", - "- `max_tokens`: Maximum number of tokens to generate in the response\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `n`: Number of chat completion choices to generate\n", - "- `stream`: If true, partial message deltas will be sent as they become available\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `top_logprobs`: Number of most likely tokens to return probabilities for\n", - "- `seed`: Random seed for deterministic results\n", - "- `response_format`: Specify the format of the response (e.g., JSON)\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `user`: A unique identifier representing your end-user\n", + "The chat completions API accepts OpenAI Chat Completions API's parameters. 
Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n", "\n", "Here is an example of a detailed chat completion request:" ] @@ -135,10 +246,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.790616Z", - "iopub.status.busy": "2024-10-27T23:24:24.790426Z", - "iopub.status.idle": "2024-10-27T23:24:24.902228Z", - "shell.execute_reply": "2024-10-27T23:24:24.901651Z" + "iopub.execute_input": "2024-10-28T09:16:47.131245Z", + "iopub.status.busy": "2024-10-28T09:16:47.131061Z", + "iopub.status.idle": "2024-10-28T09:16:47.242225Z", + "shell.execute_reply": "2024-10-28T09:16:47.241691Z" } }, "outputs": [ @@ -146,8 +257,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "Ancient Rome's major achievements include:" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='84ab9ffd558f4c5595addde9e7a9b40c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -173,11 +303,9 @@ " frequency_penalty=0.2, # Mild penalty for more natural language\n", " n=1, # Single response is usually more stable\n", " seed=42, # Keep for reproducibility\n", - " stream=True, # Keep streaming for real-time output\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].delta.content or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -188,7 +316,7 @@ "\n", "### Usage\n", "\n", - "Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details." + "Completions API is similar to Chat Completions API, but without the `messages` parameter." ] }, { @@ -196,10 +324,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.903908Z", - "iopub.status.busy": "2024-10-27T23:24:24.903730Z", - "iopub.status.idle": "2024-10-27T23:24:25.361829Z", - "shell.execute_reply": "2024-10-27T23:24:25.361272Z" + "iopub.execute_input": "2024-10-28T09:16:47.243956Z", + "iopub.status.busy": "2024-10-28T09:16:47.243779Z", + "iopub.status.idle": "2024-10-28T09:16:47.703807Z", + "shell.execute_reply": "2024-10-28T09:16:47.703265Z" } }, "outputs": [ @@ -207,8 +335,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Completion(id='50da1b57333242cca0b8c6d8706f94b2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 
3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730071465, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 30, token usage: 0.00, gen throughput (token/s): 108.70, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 70, token usage: 0.00, gen throughput (token/s): 142.82, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='8dd58c0e0eff4036ab377324851c1726', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -220,7 +375,8 @@ " n=1,\n", " stop=None,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -229,26 +385,7 @@ "source": [ "### Parameters\n", "\n", - "The completions API accepts the following parameters:\n", - "\n", - "- `model`: The model identifier to use for completion\n", - "- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n", - "- `best_of`: Number of completions to generate server-side and return the best one\n", - "- `echo`: If true, the prompt will be included in the response\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n", - "- `n`: Number of completion choices to generate\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `seed`: Random seed for deterministic results\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `stream`: If true, partial completion deltas will be sent as they become available\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `suffix`: Text to append to the completion\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `user`: A unique identifier representing your end-user\n", + "The completions API accepts OpenAI Completions API's parameters. 
Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n", "\n", "Here is an example of a detailed completions request:" ] @@ -258,10 +395,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:25.363510Z", - "iopub.status.busy": "2024-10-27T23:24:25.363334Z", - "iopub.status.idle": "2024-10-27T23:24:26.087507Z", - "shell.execute_reply": "2024-10-27T23:24:26.086953Z" + "iopub.execute_input": "2024-10-28T09:16:47.705617Z", + "iopub.status.busy": "2024-10-28T09:16:47.705438Z", + "iopub.status.idle": "2024-10-28T09:16:48.612422Z", + "shell.execute_reply": "2024-10-28T09:16:48.611889Z" } }, "outputs": [ @@ -269,51 +406,42 @@ "name": "stdout", "output_type": "stream", "text": [ - " Be sure to include a new planet, a strange creature, and a discovery that changes everything.\n", - "As Captain Zara Black" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wood pil" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "oted her ship, the Celestial Quest, through the vast expanse of space, she couldn't help but feel a sense" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, gen throughput (token/s): 125.91, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " of excitement" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, gen throughput (token/s): 134.54, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " and trepidation. Her crew had been searching for weeks, scanning the galaxy for any sign of a new planet that fit" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 128, token usage: 0.00, gen throughput (token/s): 133.40, #queue-req: 0\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " their criteria" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ". And finally, after months of searching, they had found it." - ] + "data": { + "text/html": [ + "Response: Completion(id='390b6931283540278af6151e5665b9e6', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As you write, include sensory details to help bring the planet to life for your reader. The space explorer, Lyra, is on a mission to explore the newly discovered planet, Xylophia-IV.\\nLyra stepped out of the landing craft and onto the dusty surface of Xylophia-IV. The sky above was a deep shade of indigo, and the air was crisp with an otherworldly scent – a mix of ozone and something sweetly floral. 
She took a deep breath, feeling the cool breeze fill her lungs as she gazed out at the alien landscape.', matched_stop='\\n\\n')], created=1730107008, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=10, total_tokens=130, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -328,11 +456,9 @@ " frequency_penalty=0.3, # Reduce repetitive phrases\n", " n=1, # Generate one completion\n", " seed=123, # For reproducible results\n", - " stream=True, # Stream the response\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].text or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -357,10 +483,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.089195Z", - "iopub.status.busy": "2024-10-27T23:24:26.089017Z", - "iopub.status.idle": "2024-10-27T23:24:26.169406Z", - "shell.execute_reply": "2024-10-27T23:24:26.168852Z" + "iopub.execute_input": "2024-10-28T09:16:48.614261Z", + "iopub.status.busy": "2024-10-28T09:16:48.614081Z", + "iopub.status.idle": "2024-10-28T09:16:48.695988Z", + "shell.execute_reply": "2024-10-28T09:16:48.695467Z" } }, "outputs": [ @@ -368,8 +494,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Batch job created with ID: batch_a8bb0663-1cc5-487b-b170-d8f2a76dbf60\n" + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/batches HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] + }, + { + "data": { + "text/html": [ + "Batch job created with ID: batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -419,7 +559,7 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Batch job created with ID: {batch_response.id}\")" + "highlight_text(f\"Batch job created with ID: {batch_response.id}\")" ] }, { @@ -427,28 +567,96 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.171258Z", - "iopub.status.busy": "2024-10-27T23:24:26.170832Z", - "iopub.status.idle": "2024-10-27T23:24:29.186895Z", - "shell.execute_reply": "2024-10-27T23:24:29.186293Z" + "iopub.execute_input": "2024-10-28T09:16:48.697904Z", + "iopub.status.busy": "2024-10-28T09:16:48.697486Z", + "iopub.status.idle": "2024-10-28T09:16:51.719102Z", + "shell.execute_reply": "2024-10-28T09:16:51.718503Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:48 TP0] Decode batch. 
#running-req: 1, #token: 78, token usage: 0.00, gen throughput (token/s): 135.43, #queue-req: 0\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Batch job status: validating...trying again in 3 seconds...\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/batches/batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402 HTTP/1.1\" 200 OK\n", "Batch job completed successfully!\n", "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "\n", - "Request request-1:\n", - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n", - "\n", - "Request request-2:\n", - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as:\\n\\n1. **Web Development**: Python is used in web development frameworks like Django and Flask to build fast, scalable, and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n", - "\n", - "Cleaning up files...\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea/content HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Request request-1:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request request-2:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. 
It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Cleaning up files..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"DELETE /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea HTTP/1.1\" 200 OK\n" ] } ], @@ -471,16 +679,16 @@ " ]\n", "\n", " for result in results:\n", - " print(f\"\\nRequest {result['custom_id']}:\")\n", - " print(f\"Response: {result['response']}\")\n", + " highlight_text(f\"Request {result['custom_id']}:\")\n", + " highlight_text(f\"Response: {result['response']}\")\n", "\n", - " print(\"\\nCleaning up files...\")\n", + " highlight_text(\"Cleaning up files...\")\n", " # Only delete the result file ID since file_response is just content\n", " client.files.delete(result_file_id)\n", "else:\n", - " print(f\"Batch job failed with status: {batch_response.status}\")\n", + " highlight_text(f\"Batch job failed with status: {batch_response.status}\")\n", " if hasattr(batch_response, \"errors\"):\n", - " print(f\"Errors: {batch_response.errors}\")" + " highlight_text(f\"Errors: {batch_response.errors}\")" ] }, { @@ -500,10 +708,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:29.188845Z", - "iopub.status.busy": "2024-10-27T23:24:29.188552Z", - "iopub.status.idle": "2024-10-27T23:24:54.305285Z", - "shell.execute_reply": "2024-10-27T23:24:54.304629Z" + "iopub.execute_input": "2024-10-28T09:16:51.720917Z", + "iopub.status.busy": "2024-10-28T09:16:51.720728Z", + "iopub.status.idle": "2024-10-28T09:17:16.852156Z", + "shell.execute_reply": "2024-10-28T09:17:16.851486Z" } }, "outputs": [ @@ -511,89 +719,280 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Initial status: validating\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 7, #new-token: 210, #cached-token: 175, cache hit rate: 41.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 93, #new-token: 2790, #cached-token: 2325, cache hit rate: 45.04%, token usage: 0.00, #running-req: 7, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 6025, token usage: 0.01, gen throughput (token/s): 927.84, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. 
#running-req: 100, #token: 10025, token usage: 0.02, gen throughput (token/s): 10850.25, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 14025, token usage: 0.03, gen throughput (token/s): 10640.61, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 1/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 18025, token usage: 0.04, gen throughput (token/s): 10399.84, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 2/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 22025, token usage: 0.05, gen throughput (token/s): 10192.34, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 3/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 26025, token usage: 0.06, gen throughput (token/s): 9969.00, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 4/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 30025, token usage: 0.07, gen throughput (token/s): 9754.98, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 5/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 34025, token usage: 0.08, gen throughput (token/s): 9570.09, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. 
#running-req: 100, #token: 38025, token usage: 0.09, gen throughput (token/s): 9370.66, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 42025, token usage: 0.09, gen throughput (token/s): 9157.62, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 46025, token usage: 0.10, gen throughput (token/s): 9012.88, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 50025, token usage: 0.11, gen throughput (token/s): 8840.89, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:01] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 1 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:04] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 2 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:07] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 3 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:10] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 4 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: 
backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:13] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 5 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -641,25 +1040,21 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "max_checks = 5\n", "for i in range(max_checks):\n", " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", - " print(f\"Batch job details (check {i+1}/{max_checks}):\")\n", - " print(f\"ID: {batch_details.id}\")\n", - " print(f\"Status: {batch_details.status}\")\n", - " print(f\"Created at: {batch_details.created_at}\")\n", - " print(f\"Input file ID: {batch_details.input_file_id}\")\n", - " print(f\"Output file ID: {batch_details.output_file_id}\")\n", - "\n", - " print(\"Request counts:\")\n", - " print(f\"Total: {batch_details.request_counts.total}\")\n", - " print(f\"Completed: {batch_details.request_counts.completed}\")\n", - " print(f\"Failed: {batch_details.request_counts.failed}\")\n", + "\n", + " highlight_text(\n", + " f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n", + " )\n", + " highlight_text(\n", + " f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n", + " )\n", "\n", " time.sleep(3)" ] @@ -676,10 +1071,10 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:54.307459Z", - "iopub.status.busy": "2024-10-27T23:24:54.307266Z", - "iopub.status.idle": "2024-10-27T23:25:07.414717Z", - "shell.execute_reply": "2024-10-27T23:25:07.413989Z" + "iopub.execute_input": "2024-10-28T09:17:16.854434Z", + "iopub.status.busy": "2024-10-28T09:17:16.854239Z", + "iopub.status.idle": "2024-10-28T09:17:29.967949Z", + "shell.execute_reply": "2024-10-28T09:17:29.967373Z" } }, "outputs": [ @@ -687,25 +1082,187 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_08ed9e0c-386d-4286-b879-eab3380d686a\n", - "Initial status: validating\n" + 
"[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_9c319ff5-29c7-40db-9b8d-9225459caab5" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 39, #new-token: 39, #cached-token: 2106, cache hit rate: 59.51%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 333, #new-token: 8192, #cached-token: 10094, cache hit rate: 56.50%, token usage: 0.01, #running-req: 39, #queue-req: 128\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 129, #new-token: 3869, #cached-token: 3226, cache hit rate: 54.14%, token usage: 0.03, #running-req: 371, #queue-req: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Decode batch. #running-req: 500, #token: 20525, token usage: 0.05, gen throughput (token/s): 395.72, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:18 TP0] Decode batch. #running-req: 500, #token: 40525, token usage: 0.09, gen throughput (token/s): 24587.43, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:19 TP0] Decode batch. #running-req: 500, #token: 60525, token usage: 0.14, gen throughput (token/s): 23385.77, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:20 TP0] Decode batch. #running-req: 500, #token: 80525, token usage: 0.18, gen throughput (token/s): 22312.99, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:21 TP0] Decode batch. #running-req: 500, #token: 100525, token usage: 0.23, gen throughput (token/s): 21433.76, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Cancellation initiated. Status: cancelling\n" + "[2024-10-28 09:17:22 TP0] Decode batch. #running-req: 500, #token: 120525, token usage: 0.27, gen throughput (token/s): 20585.73, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Current status: cancelled\n", - "Batch job successfully cancelled\n", - "Successfully cleaned up input file\n" + "[2024-10-28 09:17:23 TP0] Decode batch. #running-req: 500, #token: 140525, token usage: 0.32, gen throughput (token/s): 19807.72, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:24 TP0] Decode batch. #running-req: 500, #token: 160525, token usage: 0.36, gen throughput (token/s): 19058.59, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:25 TP0] Decode batch. #running-req: 500, #token: 180525, token usage: 0.41, gen throughput (token/s): 18388.08, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26 TP0] Decode batch. 
#running-req: 500, #token: 200525, token usage: 0.45, gen throughput (token/s): 17734.98, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26] INFO: 127.0.0.1:54868 - \"POST /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5/cancel HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Cancellation initiated. Status: cancelling" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"GET /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Current status: cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Batch job successfully cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"DELETE /v1/files/backend_input_file-33df398d-2394-4995-8dd8-890cb3111446 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Successfully cleaned up input file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -753,37 +1310,37 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "try:\n", " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", - " print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n", + " highlight_text(f\"Cancellation initiated. 
Status: {cancelled_job.status}\")\n", " assert cancelled_job.status == \"cancelling\"\n", "\n", " # Monitor the cancellation process\n", " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", " cancelled_job = client.batches.retrieve(batch_job.id)\n", - " print(f\"Current status: {cancelled_job.status}\")\n", + " highlight_text(f\"Current status: {cancelled_job.status}\")\n", "\n", " # Verify final status\n", " assert cancelled_job.status == \"cancelled\"\n", - " print(\"Batch job successfully cancelled\")\n", + " highlight_text(\"Batch job successfully cancelled\")\n", "\n", "except Exception as e:\n", - " print(f\"Error during cancellation: {e}\")\n", + " highlight_text(f\"Error during cancellation: {e}\")\n", " raise e\n", "\n", "finally:\n", " try:\n", " del_response = client.files.delete(uploaded_file.id)\n", " if del_response.deleted:\n", - " print(\"Successfully cleaned up input file\")\n", + " highlight_text(\"Successfully cleaned up input file\")\n", " except Exception as e:\n", - " print(f\"Error cleaning up: {e}\")\n", + " highlight_text(f\"Error cleaning up: {e}\")\n", " raise e" ] }, @@ -792,13 +1349,24 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:07.416667Z", - "iopub.status.busy": "2024-10-27T23:25:07.416471Z", - "iopub.status.idle": "2024-10-27T23:25:10.222119Z", - "shell.execute_reply": "2024-10-27T23:25:10.221434Z" + "iopub.execute_input": "2024-10-28T09:17:29.969798Z", + "iopub.status.busy": "2024-10-28T09:17:29.969613Z", + "iopub.status.idle": "2024-10-28T09:17:32.811800Z", + "shell.execute_reply": "2024-10-28T09:17:32.811092Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: Shutting down\n", + "[2024-10-28 09:17:30] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:17:30] INFO: Application shutdown complete.\n", + "[2024-10-28 09:17:30] INFO: Finished server process [510260]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/_sources/send_request.ipynb b/_sources/send_request.ipynb index ea93b12..ea640a6 100644 --- a/_sources/send_request.ipynb +++ b/_sources/send_request.ipynb @@ -19,7 +19,7 @@ "\n", "```bash\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "```\n", "\n", "in your command line and wait for the server to be ready." @@ -30,10 +30,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:12.782403Z", - "iopub.status.busy": "2024-10-27T23:25:12.781995Z", - "iopub.status.idle": "2024-10-27T23:25:50.292760Z", - "shell.execute_reply": "2024-10-27T23:25:50.291723Z" + "iopub.execute_input": "2024-10-28T09:17:35.325923Z", + "iopub.status.busy": "2024-10-28T09:17:35.325748Z", + "iopub.status.idle": "2024-10-28T09:18:13.770765Z", + "shell.execute_reply": "2024-10-28T09:18:13.770130Z" } }, "outputs": [ @@ -41,23 +41,127 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. 
Proceeding with the next steps.\n" + "[2024-10-28 09:17:45] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=347192970, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Load weight begin. avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:02 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:18:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", "\n", - "server_process = execute_shell_command(\n", + "server_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Server is ready. 
Proceeding with the next steps.\")" ] }, { @@ -74,10 +178,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:50.328286Z", - "iopub.status.busy": "2024-10-27T23:25:50.327797Z", - "iopub.status.idle": "2024-10-27T23:25:53.479602Z", - "shell.execute_reply": "2024-10-27T23:25:53.478670Z" + "iopub.execute_input": "2024-10-28T09:18:13.772846Z", + "iopub.status.busy": "2024-10-28T09:18:13.772593Z", + "iopub.status.idle": "2024-10-28T09:18:16.416442Z", + "shell.execute_reply": "2024-10-28T09:18:16.415708Z" } }, "outputs": [ @@ -85,7 +189,87 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"id\":\"6ae7fabfd4c54054a8017e2aa7c6bc5a\",\"object\":\"chat.completion\",\"created\":1730071553,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and structures of language.\\n\\nLarge Language Models are typically characterized by their ability to:\\n\\n1. **Understand natural language**: LLMs can comprehend and interpret human language, including nuances, idioms, and context.\\n2. **Generate text**: LLMs can create coherent and context-specific text, such as responses to questions, summaries of articles, or even entire stories.\\n3. **Answer questions**: LLMs can provide accurate and informative answers to a wide range of questions, from simple facts to complex topics.\\n4. **Translate languages**: LLMs can translate text from one language to another, often with high accuracy.\\n5. **Summarize content**: LLMs can condense long pieces of text into shorter, more digestible summaries.\\n\\nThe core of an LLM is its **neural network architecture**, which is composed of multiple layers of interconnected nodes (neurons) that process and transform the input data. This architecture allows LLMs to learn complex patterns and relationships in language, enabling them to generate human-like text.\\n\\nSome popular examples of LLMs include:\\n\\n* **Chatbots**: Virtual assistants that use LLMs to understand and respond to user queries.\\n* **Language translation tools**: Services that use LLMs to translate text from one language to another.\\n* **Content generation platforms**: Tools that use LLMs to generate text, such as articles, social media posts, or even entire books.\\n* **Virtual assistants**: AI-powered assistants, like Siri, Alexa, or Google Assistant, that use LLMs to understand and respond to user queries.\\n\\nOverall, LLMs have revolutionized the field of natural language processing (NLP) and have numerous applications in various industries, from customer service to content creation.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":450,\"completion_tokens\":403,\"prompt_tokens_details\":null}}" + "[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35536 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13 TP0] Prefill batch. 
#new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35540 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:18:13] The server is fired up and ready to roll!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 25.58, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 139.75, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 138.20, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 138.10, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 138.22, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16] INFO: 127.0.0.1:35530 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "{\"id\":\"ad61027db61649d0bd69f6aa901f1d8c\",\"object\":\"chat.completion\",\"created\":1730107096,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and nuances of language.\\n\\nLarge Language Models like myself are trained on a massive corpus of text, often sourced from the internet, books, and other digital sources. This training enables us to:\\n\\n1. **Understand**: We can comprehend the meaning of text, including context, syntax, and semantics.\\n2. **Generate**: We can create coherent and context-specific text, such as responses to questions, articles, or even entire stories.\\n3. **Complete**: We can fill in the blanks, summarize long texts, or translate languages.\\n\\nSome common applications of LLMs include:\\n\\n1. **Virtual assistants**: Like myself, we can provide information, answer questions, and even engage in conversations.\\n2. **Language translation**: We can translate text from one language to another, often with high accuracy.\\n3. 
**Text summarization**: We can condense long texts into concise summaries, highlighting key points and main ideas.\\n4. **Content creation**: We can generate text, such as articles, social media posts, or even entire books.\\n\\nLarge Language Models have the potential to revolutionize various industries, including education, customer service, and content creation. However, they also raise important questions about the role of AI in society, the potential for bias in language models, and the need for responsible AI development and deployment.\\n\\nIf you have any specific questions or topics you'd like to discuss, feel free to ask!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":378,\"completion_tokens\":331,\"prompt_tokens_details\":null}}" ] } ], @@ -110,10 +294,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:53.481936Z", - "iopub.status.busy": "2024-10-27T23:25:53.481707Z", - "iopub.status.idle": "2024-10-27T23:25:54.273214Z", - "shell.execute_reply": "2024-10-27T23:25:54.272434Z" + "iopub.execute_input": "2024-10-28T09:18:16.418642Z", + "iopub.status.busy": "2024-10-28T09:18:16.418313Z", + "iopub.status.idle": "2024-10-28T09:18:17.213494Z", + "shell.execute_reply": "2024-10-28T09:18:17.212929Z" } }, "outputs": [ @@ -121,8 +305,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='da93c64364af475cbdd2cb19155fd68d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071554, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:18:16 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17 TP0] Decode batch. #running-req: 1, #token: 79, token usage: 0.00, gen throughput (token/s): 46.61, #queue-req: 0\n", + "[2024-10-28 09:18:17] INFO: 127.0.0.1:35554 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "ChatCompletion(id='29542e83d53f44eea0c01d1f517c4b40', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. 
**Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107097, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -144,7 +348,7 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "highlight_text(response)" ] }, { @@ -152,13 +356,30 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:54.275385Z", - "iopub.status.busy": "2024-10-27T23:25:54.274807Z", - "iopub.status.idle": "2024-10-27T23:25:57.082401Z", - "shell.execute_reply": "2024-10-27T23:25:57.080829Z" + "iopub.execute_input": "2024-10-28T09:18:17.215264Z", + "iopub.status.busy": "2024-10-28T09:18:17.215073Z", + "iopub.status.idle": "2024-10-28T09:18:20.076158Z", + "shell.execute_reply": "2024-10-28T09:18:20.075276Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Shutting down\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:18:17] INFO: Application shutdown complete.\n", + "[2024-10-28 09:18:17] INFO: Finished server process [511197]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/_static/css/custom_log.css b/_static/css/custom_log.css new file mode 100644 index 0000000..86ee951 --- /dev/null +++ b/_static/css/custom_log.css @@ -0,0 +1,29 @@ +.output_area { + color: #615656; +} + +table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.output_area.stderr { + color: #d3d3d3 !important; /* 浅灰色 */ +} + +.output_area.stdout { + color: #d3d3d3 !important; +} + +div.output_area.stderr { + color: #d3d3d3 !important; /* 浅灰色 */ +} + +div.output_area.stdout { + color: #d3d3d3 !important; +} \ No newline at end of file diff --git a/backend.html b/backend.html index c91aa06..ba0d99d 100644 --- a/backend.html +++ b/backend.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -733,7 +734,7 @@

Benchmark Performance

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/benchmark_and_profiling.html b/benchmark_and_profiling.html index f48a351..6ce6710 100644 --- a/benchmark_and_profiling.html +++ b/benchmark_and_profiling.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -555,7 +556,7 @@

Other tips

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/choices_methods.html b/choices_methods.html index 2aa46a8..1acd2ca 100644 --- a/choices_methods.html +++ b/choices_methods.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -570,7 +571,7 @@

Unconditional Likelihood Normalized

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/contributor_guide.html b/contributor_guide.html index a635db9..65be0cf 100644 --- a/contributor_guide.html +++ b/contributor_guide.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -506,7 +507,7 @@

Add Unit Tests

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/custom_chat_template.html b/custom_chat_template.html index f53a65d..f551c17 100644 --- a/custom_chat_template.html +++ b/custom_chat_template.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -469,7 +470,7 @@

Custom Chat Template in SGLang Runtime

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/embedding_model.html b/embedding_model.html index 156f5b6..171b2a8 100644 --- a/embedding_model.html +++ b/embedding_model.html @@ -33,8 +33,9 @@ + - + @@ -58,7 +59,7 @@ - + @@ -411,7 +412,11 @@

Contents

-
+

Embedding Model#

SGLang supports embedding models in the same way as completion models. Here are some example models:

Use Curl#

@@ -473,18 +506,27 @@

Use Curl "embedding" ] -print(f"Text embedding (first 10): {text_embedding[:10]}") +highlight_text(f"Text embedding (first 10): {text_embedding[:10]}") -
+
-Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]
+[2024-10-28 09:15:55 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:15:55] INFO:     127.0.0.1:59280 - "GET /get_model_info HTTP/1.1" 200 OK
+[2024-10-28 09:15:56 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 1, #queue-req: 0
+[2024-10-28 09:15:56] INFO:     127.0.0.1:59274 - "POST /v1/embeddings HTTP/1.1" 200 OK
 
+
+
+
+
+Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]
+

Using OpenAI Compatible API#

@@ -503,18 +545,27 @@

Using OpenAI Compatible API) embedding = response.data[0].embedding[:10] -print(f"Text embedding (first 10): {embedding}") +highlight_text(f"Text embedding (first 10): {embedding}") -
+
-Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]
+[2024-10-28 09:15:56] INFO:     127.0.0.1:59290 - "POST /encode HTTP/1.1" 200 OK
+[2024-10-28 09:15:56] The server is fired up and ready to roll!
+[2024-10-28 09:15:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:15:56] INFO:     127.0.0.1:59300 - "POST /v1/embeddings HTTP/1.1" 200 OK
 
+
+
+
+
+Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]
+

Using Input IDs#

@@ -541,19 +592,26 @@

Using Input IDs0 ]["embedding"] -print(f"Input IDs embedding (first 10): {input_ids_embedding[:10]}") +highlight_text(f"Input IDs embedding (first 10): {input_ids_embedding[:10]}") -

@@ -638,7 +707,7 @@

Using Input IDs

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/embedding_model.ipynb b/embedding_model.ipynb index 0370084..d26743c 100644 --- a/embedding_model.ipynb +++ b/embedding_model.ipynb @@ -21,7 +21,7 @@ "The following code is equivalent to running this in the shell:\n", "```bash\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "```\n", "\n", "Remember to add `--is-embedding` to the command." @@ -32,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:22:53.085503Z", - "iopub.status.busy": "2024-10-27T23:22:53.085120Z", - "iopub.status.idle": "2024-10-27T23:23:32.527591Z", - "shell.execute_reply": "2024-10-27T23:23:32.526838Z" + "iopub.execute_input": "2024-10-28T09:15:14.536811Z", + "iopub.status.busy": "2024-10-28T09:15:14.536653Z", + "iopub.status.idle": "2024-10-28T09:15:54.999497Z", + "shell.execute_reply": "2024-10-28T09:15:54.998849Z" } }, "outputs": [ @@ -43,23 +43,144 @@ "name": "stdout", "output_type": "stream", "text": [ - "Embedding server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:15:25] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=237179517, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:40 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:15:41 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00Embedding server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "embedding_process = execute_shell_command(\n", + "embedding_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30010\")\n", "\n", - "print(\"Embedding server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Embedding server is ready. Proceeding with the next steps.\")" ] }, { @@ -74,10 +195,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:32.562075Z", - "iopub.status.busy": "2024-10-27T23:23:32.561818Z", - "iopub.status.idle": "2024-10-27T23:23:33.771076Z", - "shell.execute_reply": "2024-10-27T23:23:33.770326Z" + "iopub.execute_input": "2024-10-28T09:15:55.001608Z", + "iopub.status.busy": "2024-10-28T09:15:55.001359Z", + "iopub.status.idle": "2024-10-28T09:15:56.216067Z", + "shell.execute_reply": "2024-10-28T09:15:56.215410Z" } }, "outputs": [ @@ -85,8 +206,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]\n" + "[2024-10-28 09:15:55 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:55] INFO: 127.0.0.1:59280 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59274 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -103,7 +251,7 @@ " \"embedding\"\n", "]\n", "\n", - "print(f\"Text embedding (first 10): {text_embedding[:10]}\")" + "highlight_text(f\"Text embedding (first 10): {text_embedding[:10]}\")" ] }, { @@ -118,10 +266,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:33.773259Z", - "iopub.status.busy": "2024-10-27T23:23:33.772776Z", - "iopub.status.idle": "2024-10-27T23:23:34.250269Z", - "shell.execute_reply": "2024-10-27T23:23:34.249623Z" + "iopub.execute_input": "2024-10-28T09:15:56.218030Z", + "iopub.status.busy": "2024-10-28T09:15:56.217835Z", + "iopub.status.idle": "2024-10-28T09:15:56.696733Z", + "shell.execute_reply": "2024-10-28T09:15:56.696187Z" } }, "outputs": [ @@ -129,8 +277,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59290 - \"POST /encode HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:15:56] The server is fired up and ready to roll!\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59300 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -145,7 +314,7 @@ ")\n", "\n", "embedding = response.data[0].embedding[:10]\n", - "print(f\"Text embedding (first 10): {embedding}\")" + "highlight_text(f\"Text embedding (first 10): {embedding}\")" ] }, { @@ -162,10 +331,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:34.252332Z", - "iopub.status.busy": "2024-10-27T23:23:34.251830Z", - "iopub.status.idle": "2024-10-27T23:23:40.028848Z", - "shell.execute_reply": "2024-10-27T23:23:40.028041Z" + "iopub.execute_input": "2024-10-28T09:15:56.698501Z", + "iopub.status.busy": "2024-10-28T09:15:56.698324Z", + "iopub.status.idle": "2024-10-28T09:16:02.484649Z", + "shell.execute_reply": "2024-10-28T09:16:02.483955Z" } }, "outputs": [ @@ -173,8 +342,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:16:02 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:02] INFO: 127.0.0.1:59034 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -196,7 +378,7 @@ " 0\n", "][\"embedding\"]\n", "\n", - "print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" + "highlight_text(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" ] }, { @@ -204,13 +386,24 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:40.031161Z", - "iopub.status.busy": "2024-10-27T23:23:40.030680Z", - "iopub.status.idle": "2024-10-27T23:23:42.843192Z", - "shell.execute_reply": "2024-10-27T23:23:42.842506Z" + "iopub.execute_input": "2024-10-28T09:16:02.486791Z", + "iopub.status.busy": "2024-10-28T09:16:02.486434Z", + "iopub.status.idle": "2024-10-28T09:16:05.293548Z", + "shell.execute_reply": "2024-10-28T09:16:05.292820Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:02] INFO: Shutting down\n", + "[2024-10-28 09:16:02] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:16:02] INFO: Application shutdown complete.\n", + "[2024-10-28 09:16:02] INFO: Finished server process [509328]\n" + ] + } + ], "source": [ "terminate_process(embedding_process)" ] diff --git a/frontend.html b/frontend.html index 46dae62..f219b07 100644 --- 
a/frontend.html +++ b/frontend.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -761,7 +762,7 @@

Tips and Implementation Details

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/genindex.html b/genindex.html index ed6d6d0..71c62b4 100644 --- a/genindex.html +++ b/genindex.html @@ -32,7 +32,8 @@ - + + @@ -61,7 +62,7 @@ - + @@ -374,7 +375,7 @@

Index

diff --git a/hyperparameter_tuning.html b/hyperparameter_tuning.html index 78aa95a..b3aee03 100644 --- a/hyperparameter_tuning.html +++ b/hyperparameter_tuning.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -551,7 +552,7 @@

Tune --schedule diff --git a/index.html b/index.html index 2085e38..a7246cb 100644 --- a/index.html +++ b/index.html @@ -33,7 +33,8 @@ - + + @@ -57,7 +58,7 @@ - + @@ -500,7 +501,7 @@

SGLang Documentation

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/install.html b/install.html index 831ca90..e3124f2 100644 --- a/install.html +++ b/install.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -604,7 +605,7 @@

Common Notes

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/model_support.html b/model_support.html index 7e21319..1959fc1 100644 --- a/model_support.html +++ b/model_support.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -537,7 +538,7 @@

Port a model from vLLM to SGLang

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/openai_api.html b/openai_api.html index c39d006..918f43d 100644 --- a/openai_api.html +++ b/openai_api.html @@ -33,8 +33,9 @@ + - + @@ -59,7 +60,7 @@ - + @@ -419,9 +420,14 @@

Contents

-
+

OpenAI Compatible API#

-

SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.

+

SGLang provides an OpenAI-compatible API for a smooth transition from OpenAI services. A full reference of the API is available at the OpenAI API Reference.
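For example, the official `openai` Python client can be pointed at a local SGLang server by overriding the base URL. This is a minimal sketch, assuming a server launched on port 30000 as in the rest of this tutorial; that server is started without an API key, so any placeholder value works:

```python
import openai

# Point the standard OpenAI client at the local SGLang server (assumed to run on port 30000).
client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")

# A small chat completion request against the locally served model.
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```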

+

This tutorial covers these popular APIs:

  • chat/completions

  • completions

  • @@ -437,28 +443,52 @@

    Usage#<
    [1]:
     
    -
    from sglang.utils import execute_shell_command, wait_for_server, terminate_process
    +
    from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text
     
    -server_process = execute_shell_command(
    -    """
    -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
    ---port 30000 --host 0.0.0.0 --log-level warning
    -"""
    +server_process = lauch_sglang_server(
    +    command="python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0"
     )
     
     wait_for_server("http://localhost:30000")
    -print("Server is ready. Proceeding with the next steps.")
    +
    +highlight_text("Server is ready. Proceeding with the next steps.")
     
    -
    +
    -Server is ready. Proceeding with the next steps.
    +[2024-10-28 09:16:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=52609006, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)
    +[2024-10-28 09:16:34 TP0] Init torch distributed begin.
    +[2024-10-28 09:16:34 TP0] Load weight begin. avail mem=78.59 GB
    +[2024-10-28 09:16:34 TP0] lm_eval is not installed, GPTQ may not be usable
    +INFO 10-28 09:16:35 weight_utils.py:243] Using model weights format ['*.safetensors']
    +Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
    +Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.21it/s]
    +Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.12it/s]
    +Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.12it/s]
    +Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.51it/s]
    +Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.35it/s]
    +
    +[2024-10-28 09:16:38 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=63.50 GB
    +[2024-10-28 09:16:38 TP0] Memory pool end. avail mem=8.37 GB
    +[2024-10-28 09:16:38 TP0] Capture cuda graph begin. This can take up to several minutes.
    +[2024-10-28 09:16:45 TP0] max_total_num_tokens=442913, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072
    +[2024-10-28 09:16:45] INFO:     Started server process [510260]
    +[2024-10-28 09:16:45] INFO:     Waiting for application startup.
    +[2024-10-28 09:16:45] INFO:     Application startup complete.
    +[2024-10-28 09:16:45] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)
    +[2024-10-28 09:16:46] INFO:     127.0.0.1:36680 - "GET /v1/models HTTP/1.1" 200 OK
     
    +
    +
    +
    +
    +Server is ready. Proceeding with the next steps.
    +
    [2]:
     
    @@ -481,41 +511,35 @@

    Usage#< temperature=0, max_tokens=64, ) -print(response) + +highlight_text(f"Response: {response}")

    -
    +
    -ChatCompletion(id='77e45b23e9b34ef0a65afd9598521768', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. **Country:** Japan\n**Capital:** Tokyo\n\n2. **Country:** Australia\n**Capital:** Canberra\n\n3. **Country:** Brazil\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071464, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))
    +[2024-10-28 09:16:46] INFO:     127.0.0.1:36690 - "GET /get_model_info HTTP/1.1" 200 OK
    +[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
    +[2024-10-28 09:16:46] INFO:     127.0.0.1:36696 - "POST /generate HTTP/1.1" 200 OK
    +[2024-10-28 09:16:46] The server is fired up and ready to roll!
    +[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0
    +[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 21.55, #queue-req: 0
    +[2024-10-28 09:16:47] INFO:     127.0.0.1:36706 - "POST /v1/chat/completions HTTP/1.1" 200 OK
     
    +
    +
    +
    +
    +Response: ChatCompletion(id='bdb569b5e77147d0b4ebe2a79b451814', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. **Country:** Japan\n**Capital:** Tokyo\n\n2. **Country:** Australia\n**Capital:** Canberra\n\n3. **Country:** Brazil\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))
    +

Parameters#

-

The chat completions API accepts the following parameters (refer to OpenAI Chat Completions API for more details):

-
    -
  • messages: List of messages in the conversation, each containing role and content

  • -
  • model: The model identifier to use for completion

  • -
  • max_tokens: Maximum number of tokens to generate in the response

  • -
  • temperature: Controls randomness (0-2). Lower values make output more focused and deterministic

  • -
  • top_p: Alternative to temperature. Controls diversity via nucleus sampling

  • -
  • n: Number of chat completion choices to generate

  • -
  • stream: If true, partial message deltas will be sent as they become available

  • -
  • stop: Sequences where the API will stop generating further tokens

  • -
  • presence_penalty: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)

  • -
  • frequency_penalty: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)

  • -
  • logit_bias: Modify the likelihood of specified tokens appearing in the completion

  • -
  • logprobs: Include log probabilities of tokens in the response

  • -
  • top_logprobs: Number of most likely tokens to return probabilities for

  • -
  • seed: Random seed for deterministic results

  • -
  • response_format: Specify the format of the response (e.g., JSON)

  • -
  • stream_options: Additional options for streaming responses

  • -
  • user: A unique identifier representing your end-user

  • -
+

The chat completions API accepts the same parameters as the OpenAI Chat Completions API. Refer to the OpenAI Chat Completions API for more details.

Here is an example of a detailed chat completion request:

-
+
-Ancient Rome's major achievements include:
+[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:47] INFO:     127.0.0.1:36706 - "POST /v1/chat/completions HTTP/1.1" 200 OK
 
+
+
+
+
+Response: ChatCompletion(id='84ab9ffd558f4c5595addde9e7a9b40c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Ancient Rome's major achievements include:", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop='\n\n')], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None))
+

Completions#

Usage#

-

Completions API is similar to Chat Completions API, but without the messages parameter. Refer to OpenAI Completions API for more details.

+

The Completions API is similar to the Chat Completions API, but without the messages parameter.

[4]:
 
@@ -578,42 +607,32 @@

Usage#n=1, stop=None, ) -print(response) + +highlight_text(f"Response: {response}")

-
+
-Completion(id='50da1b57333242cca0b8c6d8706f94b2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\n1.  United States - Washington D.C. 2.  Japan - Tokyo 3.  Australia - Canberra\nList 3 countries and their capitals. 1. 2. 3.\n1.  China - Beijing 2.  Brazil - Bras', matched_stop=None)], created=1730071465, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))
+[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 30, token usage: 0.00, gen throughput (token/s): 108.70, #queue-req: 0
+[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 70, token usage: 0.00, gen throughput (token/s): 142.82, #queue-req: 0
+[2024-10-28 09:16:47] INFO:     127.0.0.1:36706 - "POST /v1/completions HTTP/1.1" 200 OK
 
+
+
+
+
+Response: Completion(id='8dd58c0e0eff4036ab377324851c1726', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\nList 3 countries and their capitals. 1. 2. 3.\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))
+

Parameters#

-

The completions API accepts the following parameters:

-
    -
  • model: The model identifier to use for completion

  • -
  • prompt: Input text to generate completions for. Can be a string, array of strings, or token arrays

  • -
  • best_of: Number of completions to generate server-side and return the best one

  • -
  • echo: If true, the prompt will be included in the response

  • -
  • frequency_penalty: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)

  • -
  • logit_bias: Modify the likelihood of specified tokens appearing in the completion

  • -
  • logprobs: Include log probabilities of tokens in the response

  • -
  • max_tokens: Maximum number of tokens to generate in the response (default: 16)

  • -
  • n: Number of completion choices to generate

  • -
  • presence_penalty: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)

  • -
  • seed: Random seed for deterministic results

  • -
  • stop: Sequences where the API will stop generating further tokens

  • -
  • stream: If true, partial completion deltas will be sent as they become available

  • -
  • stream_options: Additional options for streaming responses

  • -
  • suffix: Text to append to the completion

  • -
  • temperature: Controls randomness (0-2). Lower values make output more focused and deterministic

  • -
  • top_p: Alternative to temperature. Controls diversity via nucleus sampling

  • -
  • user: A unique identifier representing your end-user

  • -
+

The completions API accepts the same parameters as the OpenAI Completions API. Refer to the OpenAI Completions API for more details.

Here is an example of a detailed completions request:

-
+
-  Be sure to include a new planet, a strange creature, and a discovery that changes everything.
-As Captain Zara Blackwood piloted her ship, the Celestial Quest, through the vast expanse of space, she couldn't help but feel a sense of excitement and trepidation. Her crew had been searching for weeks, scanning the galaxy for any sign of a new planet that fit their criteria. And finally, after months of searching, they had found it.
+[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, gen throughput (token/s): 125.91, #queue-req: 0
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, gen throughput (token/s): 134.54, #queue-req: 0
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 128, token usage: 0.00, gen throughput (token/s): 133.40, #queue-req: 0
+[2024-10-28 09:16:48] INFO:     127.0.0.1:36706 - "POST /v1/completions HTTP/1.1" 200 OK
 
+
+
+
+
+Response: Completion(id='390b6931283540278af6151e5665b9e6', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As you write, include sensory details to help bring the planet to life for your reader. The space explorer, Lyra, is on a mission to explore the newly discovered planet, Xylophia-IV.\nLyra stepped out of the landing craft and onto the dusty surface of Xylophia-IV. The sky above was a deep shade of indigo, and the air was crisp with an otherworldly scent – a mix of ozone and something sweetly floral. She took a deep breath, feeling the cool breeze fill her lungs as she gazed out at the alien landscape.', matched_stop='\n\n')], created=1730107008, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=10, total_tokens=130, completion_tokens_details=None, prompt_tokens_details=None))
+
@@ -709,18 +735,26 @@

Batches#< completion_window="24h", ) -print(f"Batch job created with ID: {batch_response.id}") +highlight_text(f"Batch job created with ID: {batch_response.id}") - -
+
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 78, token usage: 0.00, gen throughput (token/s): 135.43, #queue-req: 0
 Batch job status: validating...trying again in 3 seconds...
+[2024-10-28 09:16:51] INFO:     127.0.0.1:36708 - "GET /v1/batches/batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402 HTTP/1.1" 200 OK
 Batch job completed successfully!
 Request counts: BatchRequestCounts(completed=2, failed=0, total=2)
-
-Request request-1:
-Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\n\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}
-
-Request request-2:
-Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\n\nPython is a high-level, interpreted programming language that is widely used for various purposes such as:\n\n1.  **Web Development**: Python is used in web development frameworks like Django and Flask to build fast, scalable, and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}
-
-Cleaning up files...
+[2024-10-28 09:16:51] INFO:     127.0.0.1:36708 - "GET /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea/content HTTP/1.1" 200 OK
+
+
+
+
+
+
+Request request-1:
+
+
+
+
+
+Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\n\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}
+
+
+
+
+
+Request request-2:
+
+
+
+
+
+Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\n\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}
+
+
+
+
+
+Cleaning up files...
+
+
+
+
+
+
+[2024-10-28 09:16:51] INFO:     127.0.0.1:36708 - "DELETE /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea HTTP/1.1" 200 OK
 

The batch job takes a while to complete. You can use these two APIs to retrieve the batch job status or to cancel a job that is still running.
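For reference, a minimal sketch of those two calls is shown below (assuming the `client` and `batch_job` objects created in the cells above; the polling loop and error handling from the full examples in this section are omitted):

```python
# Retrieve the current status of the batch job created earlier.
batch_details = client.batches.retrieve(batch_id=batch_job.id)
print(batch_details.status, batch_details.request_counts)

# Cancel the batch job if it has not finished yet; the status is expected to
# move to "cancelling" and then "cancelled", as in the full examples here.
cancelled_job = client.batches.cancel(batch_id=batch_job.id)
print(cancelled_job.status)
```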

@@ -828,89 +895,161 @@

Batches#< completion_window="24h", ) -print(f"Created batch job with ID: {batch_job.id}") -print(f"Initial status: {batch_job.status}") +highlight_text(f"Created batch job with ID: {batch_job.id}") +highlight_text(f"Initial status: {batch_job.status}") time.sleep(10) max_checks = 5 for i in range(max_checks): batch_details = client.batches.retrieve(batch_id=batch_job.id) - print(f"Batch job details (check {i+1}/{max_checks}):") - print(f"ID: {batch_details.id}") - print(f"Status: {batch_details.status}") - print(f"Created at: {batch_details.created_at}") - print(f"Input file ID: {batch_details.input_file_id}") - print(f"Output file ID: {batch_details.output_file_id}") - - print("Request counts:") - print(f"Total: {batch_details.request_counts.total}") - print(f"Completed: {batch_details.request_counts.completed}") - print(f"Failed: {batch_details.request_counts.failed}") + + highlight_text( + f"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}" + ) + highlight_text( + f"<strong>Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}</strong>" + ) time.sleep(3)

+
+
+
+
+
+[2024-10-28 09:16:51] INFO:     127.0.0.1:53788 - "POST /v1/files HTTP/1.1" 200 OK
+[2024-10-28 09:16:51] INFO:     127.0.0.1:53788 - "POST /v1/batches HTTP/1.1" 200 OK
+
+
+
+
+
+
+Created batch job with ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc
+
+
+
+
+
+Initial status: validating
+
+
+
+
+
+
+[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 7, #new-token: 210, #cached-token: 175, cache hit rate: 41.56%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 93, #new-token: 2790, #cached-token: 2325, cache hit rate: 45.04%, token usage: 0.00, #running-req: 7, #queue-req: 0
+[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 6025, token usage: 0.01, gen throughput (token/s): 927.84, #queue-req: 0
+[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 10025, token usage: 0.02, gen throughput (token/s): 10850.25, #queue-req: 0
+[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 14025, token usage: 0.03, gen throughput (token/s): 10640.61, #queue-req: 0
+[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 18025, token usage: 0.04, gen throughput (token/s): 10399.84, #queue-req: 0
+[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 22025, token usage: 0.05, gen throughput (token/s): 10192.34, #queue-req: 0
+[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 26025, token usage: 0.06, gen throughput (token/s): 9969.00, #queue-req: 0
+[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 30025, token usage: 0.07, gen throughput (token/s): 9754.98, #queue-req: 0
+[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 34025, token usage: 0.08, gen throughput (token/s): 9570.09, #queue-req: 0
+[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 38025, token usage: 0.09, gen throughput (token/s): 9370.66, #queue-req: 0
+[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 42025, token usage: 0.09, gen throughput (token/s): 9157.62, #queue-req: 0
+[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 46025, token usage: 0.10, gen throughput (token/s): 9012.88, #queue-req: 0
+[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 50025, token usage: 0.11, gen throughput (token/s): 8840.89, #queue-req: 0
+[2024-10-28 09:17:01] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 1 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
-Created batch job with ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Initial status: validating
-Batch job details (check 1/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 2/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 3/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 4/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 5/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
+[2024-10-28 09:17:04] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
 
+
+
+
+
+Batch job details (check 2 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
+
+
+[2024-10-28 09:17:07] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 3 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
+
+
+[2024-10-28 09:17:10] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 4 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
+
+
+[2024-10-28 09:17:13] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 5 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+

Here is an example of cancelling a batch job.
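As a minimal sketch of the cancellation flow (again assuming the `client` and `batch_job` objects from the earlier cells; the full notebook cell follows below):

```python
import time

# Minimal sketch, assuming `client` and `batch_job` from the cells above.
cancelled_job = client.batches.cancel(batch_id=batch_job.id)
print(cancelled_job.status)  # expected: "cancelling"

# Poll until the job reaches a terminal state.
while cancelled_job.status not in ("failed", "cancelled"):
    time.sleep(3)
    cancelled_job = client.batches.retrieve(batch_job.id)

print(cancelled_job.status)  # expected: "cancelled"
```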

-
+
-Created batch job with ID: batch_08ed9e0c-386d-4286-b879-eab3380d686a
-Initial status: validating
-Cancellation initiated. Status: cancelling
-Current status: cancelled
-Batch job successfully cancelled
-Successfully cleaned up input file
+[2024-10-28 09:17:16] INFO:     127.0.0.1:48056 - "POST /v1/files HTTP/1.1" 200 OK
+[2024-10-28 09:17:16] INFO:     127.0.0.1:48056 - "POST /v1/batches HTTP/1.1" 200 OK
 
-
+
+
+
+
+Created batch job with ID: batch_9c319ff5-29c7-40db-9b8d-9225459caab5
+
+
+
+
+
+Initial status: validating
+
+
+
+
+
+
+[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 39, #new-token: 39, #cached-token: 2106, cache hit rate: 59.51%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 333, #new-token: 8192, #cached-token: 10094, cache hit rate: 56.50%, token usage: 0.01, #running-req: 39, #queue-req: 128
+[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 129, #new-token: 3869, #cached-token: 3226, cache hit rate: 54.14%, token usage: 0.03, #running-req: 371, #queue-req: 1
+[2024-10-28 09:17:17 TP0] Decode batch. #running-req: 500, #token: 20525, token usage: 0.05, gen throughput (token/s): 395.72, #queue-req: 0
+[2024-10-28 09:17:18 TP0] Decode batch. #running-req: 500, #token: 40525, token usage: 0.09, gen throughput (token/s): 24587.43, #queue-req: 0
+[2024-10-28 09:17:19 TP0] Decode batch. #running-req: 500, #token: 60525, token usage: 0.14, gen throughput (token/s): 23385.77, #queue-req: 0
+[2024-10-28 09:17:20 TP0] Decode batch. #running-req: 500, #token: 80525, token usage: 0.18, gen throughput (token/s): 22312.99, #queue-req: 0
+[2024-10-28 09:17:21 TP0] Decode batch. #running-req: 500, #token: 100525, token usage: 0.23, gen throughput (token/s): 21433.76, #queue-req: 0
+[2024-10-28 09:17:22 TP0] Decode batch. #running-req: 500, #token: 120525, token usage: 0.27, gen throughput (token/s): 20585.73, #queue-req: 0
+[2024-10-28 09:17:23 TP0] Decode batch. #running-req: 500, #token: 140525, token usage: 0.32, gen throughput (token/s): 19807.72, #queue-req: 0
+[2024-10-28 09:17:24 TP0] Decode batch. #running-req: 500, #token: 160525, token usage: 0.36, gen throughput (token/s): 19058.59, #queue-req: 0
+[2024-10-28 09:17:25 TP0] Decode batch. #running-req: 500, #token: 180525, token usage: 0.41, gen throughput (token/s): 18388.08, #queue-req: 0
+[2024-10-28 09:17:26 TP0] Decode batch. #running-req: 500, #token: 200525, token usage: 0.45, gen throughput (token/s): 17734.98, #queue-req: 0
+[2024-10-28 09:17:26] INFO:     127.0.0.1:54868 - "POST /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5/cancel HTTP/1.1" 200 OK
+
+
+
+
+
+
+Cancellation initiated. Status: cancelling
+
+
+
+
+
+
+[2024-10-28 09:17:29] INFO:     127.0.0.1:54868 - "GET /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5 HTTP/1.1" 200 OK
+
+
+
+
+
+
+Current status: cancelled
+
+
+
+
+
+Batch job successfully cancelled
+
+
+
+
+
+
+[2024-10-28 09:17:29] INFO:     127.0.0.1:54868 - "DELETE /v1/files/backend_input_file-33df398d-2394-4995-8dd8-890cb3111446 HTTP/1.1" 200 OK
+
+
+
+
+
+
+Successfully cleaned up input file
+
+
+
+
+
+
+
+[2024-10-28 09:17:29] INFO:     Shutting down
+[2024-10-28 09:17:30] INFO:     Waiting for application shutdown.
+[2024-10-28 09:17:30] INFO:     Application shutdown complete.
+[2024-10-28 09:17:30] INFO:     Finished server process [510260]
+
+

@@ -1109,7 +1328,7 @@

Batches#< diff --git a/openai_api.ipynb b/openai_api.ipynb index 3f07a6b..bcd5c32 100644 --- a/openai_api.ipynb +++ b/openai_api.ipynb @@ -6,7 +6,9 @@ "source": [ "# OpenAI Compatible API\n", "\n", - "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n", + "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", + "\n", + "This tutorial aims at these popular APIs:\n", "\n", "- `chat/completions`\n", "- `completions`\n", @@ -30,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:45.484181Z", - "iopub.status.busy": "2024-10-27T23:23:45.484018Z", - "iopub.status.idle": "2024-10-27T23:24:23.959941Z", - "shell.execute_reply": "2024-10-27T23:24:23.959208Z" + "iopub.execute_input": "2024-10-28T09:16:07.904473Z", + "iopub.status.busy": "2024-10-28T09:16:07.904311Z", + "iopub.status.idle": "2024-10-28T09:16:46.330698Z", + "shell.execute_reply": "2024-10-28T09:16:46.330038Z" } }, "outputs": [ @@ -41,22 +43,124 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:16:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=52609006, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:16:35 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "server_process = execute_shell_command(\n", - " \"\"\"\n", - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", - "\"\"\"\n", + "server_process = lauch_sglang_server(\n", + " command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "\n", + "highlight_text(\"Server is ready. Proceeding with the next steps.\")" ] }, { @@ -64,10 +168,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:23.995371Z", - "iopub.status.busy": "2024-10-27T23:24:23.995106Z", - "iopub.status.idle": "2024-10-27T23:24:24.788840Z", - "shell.execute_reply": "2024-10-27T23:24:24.788201Z" + "iopub.execute_input": "2024-10-28T09:16:46.332812Z", + "iopub.status.busy": "2024-10-28T09:16:46.332554Z", + "iopub.status.idle": "2024-10-28T09:16:47.129366Z", + "shell.execute_reply": "2024-10-28T09:16:47.128802Z" } }, "outputs": [ @@ -75,8 +179,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='77e45b23e9b34ef0a65afd9598521768', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071464, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36690 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36696 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46] The server is fired up and ready to roll!\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. 
#running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 21.55, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='bdb569b5e77147d0b4ebe2a79b451814', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -98,7 +226,8 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -107,25 +236,7 @@ "source": [ "### Parameters\n", "\n", - "The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n", - "\n", - "- `messages`: List of messages in the conversation, each containing `role` and `content`\n", - "- `model`: The model identifier to use for completion\n", - "- `max_tokens`: Maximum number of tokens to generate in the response\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `n`: Number of chat completion choices to generate\n", - "- `stream`: If true, partial message deltas will be sent as they become available\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `top_logprobs`: Number of most likely tokens to return probabilities for\n", - "- `seed`: Random seed for deterministic results\n", - "- `response_format`: Specify the format of the response (e.g., JSON)\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `user`: A unique identifier representing your end-user\n", + "The chat completions API accepts OpenAI Chat Completions API's parameters. 
Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n", "\n", "Here is an example of a detailed chat completion request:" ] @@ -135,10 +246,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.790616Z", - "iopub.status.busy": "2024-10-27T23:24:24.790426Z", - "iopub.status.idle": "2024-10-27T23:24:24.902228Z", - "shell.execute_reply": "2024-10-27T23:24:24.901651Z" + "iopub.execute_input": "2024-10-28T09:16:47.131245Z", + "iopub.status.busy": "2024-10-28T09:16:47.131061Z", + "iopub.status.idle": "2024-10-28T09:16:47.242225Z", + "shell.execute_reply": "2024-10-28T09:16:47.241691Z" } }, "outputs": [ @@ -146,8 +257,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "Ancient Rome's major achievements include:" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='84ab9ffd558f4c5595addde9e7a9b40c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -173,11 +303,9 @@ " frequency_penalty=0.2, # Mild penalty for more natural language\n", " n=1, # Single response is usually more stable\n", " seed=42, # Keep for reproducibility\n", - " stream=True, # Keep streaming for real-time output\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].delta.content or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -188,7 +316,7 @@ "\n", "### Usage\n", "\n", - "Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details." + "Completions API is similar to Chat Completions API, but without the `messages` parameter." ] }, { @@ -196,10 +324,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.903908Z", - "iopub.status.busy": "2024-10-27T23:24:24.903730Z", - "iopub.status.idle": "2024-10-27T23:24:25.361829Z", - "shell.execute_reply": "2024-10-27T23:24:25.361272Z" + "iopub.execute_input": "2024-10-28T09:16:47.243956Z", + "iopub.status.busy": "2024-10-28T09:16:47.243779Z", + "iopub.status.idle": "2024-10-28T09:16:47.703807Z", + "shell.execute_reply": "2024-10-28T09:16:47.703265Z" } }, "outputs": [ @@ -207,8 +335,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Completion(id='50da1b57333242cca0b8c6d8706f94b2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 
3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730071465, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 30, token usage: 0.00, gen throughput (token/s): 108.70, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 70, token usage: 0.00, gen throughput (token/s): 142.82, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='8dd58c0e0eff4036ab377324851c1726', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -220,7 +375,8 @@ " n=1,\n", " stop=None,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -229,26 +385,7 @@ "source": [ "### Parameters\n", "\n", - "The completions API accepts the following parameters:\n", - "\n", - "- `model`: The model identifier to use for completion\n", - "- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n", - "- `best_of`: Number of completions to generate server-side and return the best one\n", - "- `echo`: If true, the prompt will be included in the response\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n", - "- `n`: Number of completion choices to generate\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `seed`: Random seed for deterministic results\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `stream`: If true, partial completion deltas will be sent as they become available\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `suffix`: Text to append to the completion\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `user`: A unique identifier representing your end-user\n", + "The completions API accepts OpenAI Completions API's parameters. 
Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n", "\n", "Here is an example of a detailed completions request:" ] @@ -258,10 +395,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:25.363510Z", - "iopub.status.busy": "2024-10-27T23:24:25.363334Z", - "iopub.status.idle": "2024-10-27T23:24:26.087507Z", - "shell.execute_reply": "2024-10-27T23:24:26.086953Z" + "iopub.execute_input": "2024-10-28T09:16:47.705617Z", + "iopub.status.busy": "2024-10-28T09:16:47.705438Z", + "iopub.status.idle": "2024-10-28T09:16:48.612422Z", + "shell.execute_reply": "2024-10-28T09:16:48.611889Z" } }, "outputs": [ @@ -269,51 +406,42 @@ "name": "stdout", "output_type": "stream", "text": [ - " Be sure to include a new planet, a strange creature, and a discovery that changes everything.\n", - "As Captain Zara Black" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wood pil" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "oted her ship, the Celestial Quest, through the vast expanse of space, she couldn't help but feel a sense" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, gen throughput (token/s): 125.91, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " of excitement" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, gen throughput (token/s): 134.54, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " and trepidation. Her crew had been searching for weeks, scanning the galaxy for any sign of a new planet that fit" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 128, token usage: 0.00, gen throughput (token/s): 133.40, #queue-req: 0\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " their criteria" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ". And finally, after months of searching, they had found it." - ] + "data": { + "text/html": [ + "Response: Completion(id='390b6931283540278af6151e5665b9e6', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As you write, include sensory details to help bring the planet to life for your reader. The space explorer, Lyra, is on a mission to explore the newly discovered planet, Xylophia-IV.\\nLyra stepped out of the landing craft and onto the dusty surface of Xylophia-IV. The sky above was a deep shade of indigo, and the air was crisp with an otherworldly scent – a mix of ozone and something sweetly floral. 
She took a deep breath, feeling the cool breeze fill her lungs as she gazed out at the alien landscape.', matched_stop='\\n\\n')], created=1730107008, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=10, total_tokens=130, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -328,11 +456,9 @@ " frequency_penalty=0.3, # Reduce repetitive phrases\n", " n=1, # Generate one completion\n", " seed=123, # For reproducible results\n", - " stream=True, # Stream the response\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].text or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -357,10 +483,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.089195Z", - "iopub.status.busy": "2024-10-27T23:24:26.089017Z", - "iopub.status.idle": "2024-10-27T23:24:26.169406Z", - "shell.execute_reply": "2024-10-27T23:24:26.168852Z" + "iopub.execute_input": "2024-10-28T09:16:48.614261Z", + "iopub.status.busy": "2024-10-28T09:16:48.614081Z", + "iopub.status.idle": "2024-10-28T09:16:48.695988Z", + "shell.execute_reply": "2024-10-28T09:16:48.695467Z" } }, "outputs": [ @@ -368,8 +494,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Batch job created with ID: batch_a8bb0663-1cc5-487b-b170-d8f2a76dbf60\n" + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/batches HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] + }, + { + "data": { + "text/html": [ + "Batch job created with ID: batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -419,7 +559,7 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Batch job created with ID: {batch_response.id}\")" + "highlight_text(f\"Batch job created with ID: {batch_response.id}\")" ] }, { @@ -427,28 +567,96 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.171258Z", - "iopub.status.busy": "2024-10-27T23:24:26.170832Z", - "iopub.status.idle": "2024-10-27T23:24:29.186895Z", - "shell.execute_reply": "2024-10-27T23:24:29.186293Z" + "iopub.execute_input": "2024-10-28T09:16:48.697904Z", + "iopub.status.busy": "2024-10-28T09:16:48.697486Z", + "iopub.status.idle": "2024-10-28T09:16:51.719102Z", + "shell.execute_reply": "2024-10-28T09:16:51.718503Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:48 TP0] Decode batch. 
#running-req: 1, #token: 78, token usage: 0.00, gen throughput (token/s): 135.43, #queue-req: 0\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Batch job status: validating...trying again in 3 seconds...\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/batches/batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402 HTTP/1.1\" 200 OK\n", "Batch job completed successfully!\n", "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "\n", - "Request request-1:\n", - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n", - "\n", - "Request request-2:\n", - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as:\\n\\n1. **Web Development**: Python is used in web development frameworks like Django and Flask to build fast, scalable, and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n", - "\n", - "Cleaning up files...\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea/content HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Request request-1:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request request-2:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. 
It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Cleaning up files..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"DELETE /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea HTTP/1.1\" 200 OK\n" ] } ], @@ -471,16 +679,16 @@ " ]\n", "\n", " for result in results:\n", - " print(f\"\\nRequest {result['custom_id']}:\")\n", - " print(f\"Response: {result['response']}\")\n", + " highlight_text(f\"Request {result['custom_id']}:\")\n", + " highlight_text(f\"Response: {result['response']}\")\n", "\n", - " print(\"\\nCleaning up files...\")\n", + " highlight_text(\"Cleaning up files...\")\n", " # Only delete the result file ID since file_response is just content\n", " client.files.delete(result_file_id)\n", "else:\n", - " print(f\"Batch job failed with status: {batch_response.status}\")\n", + " highlight_text(f\"Batch job failed with status: {batch_response.status}\")\n", " if hasattr(batch_response, \"errors\"):\n", - " print(f\"Errors: {batch_response.errors}\")" + " highlight_text(f\"Errors: {batch_response.errors}\")" ] }, { @@ -500,10 +708,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:29.188845Z", - "iopub.status.busy": "2024-10-27T23:24:29.188552Z", - "iopub.status.idle": "2024-10-27T23:24:54.305285Z", - "shell.execute_reply": "2024-10-27T23:24:54.304629Z" + "iopub.execute_input": "2024-10-28T09:16:51.720917Z", + "iopub.status.busy": "2024-10-28T09:16:51.720728Z", + "iopub.status.idle": "2024-10-28T09:17:16.852156Z", + "shell.execute_reply": "2024-10-28T09:17:16.851486Z" } }, "outputs": [ @@ -511,89 +719,280 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Initial status: validating\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 7, #new-token: 210, #cached-token: 175, cache hit rate: 41.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 93, #new-token: 2790, #cached-token: 2325, cache hit rate: 45.04%, token usage: 0.00, #running-req: 7, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 6025, token usage: 0.01, gen throughput (token/s): 927.84, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. 
#running-req: 100, #token: 10025, token usage: 0.02, gen throughput (token/s): 10850.25, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 14025, token usage: 0.03, gen throughput (token/s): 10640.61, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 1/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 18025, token usage: 0.04, gen throughput (token/s): 10399.84, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 2/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 22025, token usage: 0.05, gen throughput (token/s): 10192.34, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 3/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 26025, token usage: 0.06, gen throughput (token/s): 9969.00, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 4/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 30025, token usage: 0.07, gen throughput (token/s): 9754.98, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 5/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 34025, token usage: 0.08, gen throughput (token/s): 9570.09, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. 
#running-req: 100, #token: 38025, token usage: 0.09, gen throughput (token/s): 9370.66, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 42025, token usage: 0.09, gen throughput (token/s): 9157.62, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 46025, token usage: 0.10, gen throughput (token/s): 9012.88, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 50025, token usage: 0.11, gen throughput (token/s): 8840.89, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:01] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 1 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:04] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 2 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:07] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 3 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:10] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 4 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: 
backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:13] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 5 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -641,25 +1040,21 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "max_checks = 5\n", "for i in range(max_checks):\n", " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", - " print(f\"Batch job details (check {i+1}/{max_checks}):\")\n", - " print(f\"ID: {batch_details.id}\")\n", - " print(f\"Status: {batch_details.status}\")\n", - " print(f\"Created at: {batch_details.created_at}\")\n", - " print(f\"Input file ID: {batch_details.input_file_id}\")\n", - " print(f\"Output file ID: {batch_details.output_file_id}\")\n", - "\n", - " print(\"Request counts:\")\n", - " print(f\"Total: {batch_details.request_counts.total}\")\n", - " print(f\"Completed: {batch_details.request_counts.completed}\")\n", - " print(f\"Failed: {batch_details.request_counts.failed}\")\n", + "\n", + " highlight_text(\n", + " f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n", + " )\n", + " highlight_text(\n", + " f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n", + " )\n", "\n", " time.sleep(3)" ] @@ -676,10 +1071,10 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:54.307459Z", - "iopub.status.busy": "2024-10-27T23:24:54.307266Z", - "iopub.status.idle": "2024-10-27T23:25:07.414717Z", - "shell.execute_reply": "2024-10-27T23:25:07.413989Z" + "iopub.execute_input": "2024-10-28T09:17:16.854434Z", + "iopub.status.busy": "2024-10-28T09:17:16.854239Z", + "iopub.status.idle": "2024-10-28T09:17:29.967949Z", + "shell.execute_reply": "2024-10-28T09:17:29.967373Z" } }, "outputs": [ @@ -687,25 +1082,187 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_08ed9e0c-386d-4286-b879-eab3380d686a\n", - "Initial status: validating\n" + 
"[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_9c319ff5-29c7-40db-9b8d-9225459caab5" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 39, #new-token: 39, #cached-token: 2106, cache hit rate: 59.51%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 333, #new-token: 8192, #cached-token: 10094, cache hit rate: 56.50%, token usage: 0.01, #running-req: 39, #queue-req: 128\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 129, #new-token: 3869, #cached-token: 3226, cache hit rate: 54.14%, token usage: 0.03, #running-req: 371, #queue-req: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Decode batch. #running-req: 500, #token: 20525, token usage: 0.05, gen throughput (token/s): 395.72, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:18 TP0] Decode batch. #running-req: 500, #token: 40525, token usage: 0.09, gen throughput (token/s): 24587.43, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:19 TP0] Decode batch. #running-req: 500, #token: 60525, token usage: 0.14, gen throughput (token/s): 23385.77, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:20 TP0] Decode batch. #running-req: 500, #token: 80525, token usage: 0.18, gen throughput (token/s): 22312.99, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:21 TP0] Decode batch. #running-req: 500, #token: 100525, token usage: 0.23, gen throughput (token/s): 21433.76, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Cancellation initiated. Status: cancelling\n" + "[2024-10-28 09:17:22 TP0] Decode batch. #running-req: 500, #token: 120525, token usage: 0.27, gen throughput (token/s): 20585.73, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Current status: cancelled\n", - "Batch job successfully cancelled\n", - "Successfully cleaned up input file\n" + "[2024-10-28 09:17:23 TP0] Decode batch. #running-req: 500, #token: 140525, token usage: 0.32, gen throughput (token/s): 19807.72, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:24 TP0] Decode batch. #running-req: 500, #token: 160525, token usage: 0.36, gen throughput (token/s): 19058.59, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:25 TP0] Decode batch. #running-req: 500, #token: 180525, token usage: 0.41, gen throughput (token/s): 18388.08, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26 TP0] Decode batch. 
#running-req: 500, #token: 200525, token usage: 0.45, gen throughput (token/s): 17734.98, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26] INFO: 127.0.0.1:54868 - \"POST /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5/cancel HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Cancellation initiated. Status: cancelling" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"GET /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Current status: cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Batch job successfully cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"DELETE /v1/files/backend_input_file-33df398d-2394-4995-8dd8-890cb3111446 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Successfully cleaned up input file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -753,37 +1310,37 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "try:\n", " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", - " print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n", + " highlight_text(f\"Cancellation initiated. 
Status: {cancelled_job.status}\")\n", " assert cancelled_job.status == \"cancelling\"\n", "\n", " # Monitor the cancellation process\n", " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", " cancelled_job = client.batches.retrieve(batch_job.id)\n", - " print(f\"Current status: {cancelled_job.status}\")\n", + " highlight_text(f\"Current status: {cancelled_job.status}\")\n", "\n", " # Verify final status\n", " assert cancelled_job.status == \"cancelled\"\n", - " print(\"Batch job successfully cancelled\")\n", + " highlight_text(\"Batch job successfully cancelled\")\n", "\n", "except Exception as e:\n", - " print(f\"Error during cancellation: {e}\")\n", + " highlight_text(f\"Error during cancellation: {e}\")\n", " raise e\n", "\n", "finally:\n", " try:\n", " del_response = client.files.delete(uploaded_file.id)\n", " if del_response.deleted:\n", - " print(\"Successfully cleaned up input file\")\n", + " highlight_text(\"Successfully cleaned up input file\")\n", " except Exception as e:\n", - " print(f\"Error cleaning up: {e}\")\n", + " highlight_text(f\"Error cleaning up: {e}\")\n", " raise e" ] }, @@ -792,13 +1349,24 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:07.416667Z", - "iopub.status.busy": "2024-10-27T23:25:07.416471Z", - "iopub.status.idle": "2024-10-27T23:25:10.222119Z", - "shell.execute_reply": "2024-10-27T23:25:10.221434Z" + "iopub.execute_input": "2024-10-28T09:17:29.969798Z", + "iopub.status.busy": "2024-10-28T09:17:29.969613Z", + "iopub.status.idle": "2024-10-28T09:17:32.811800Z", + "shell.execute_reply": "2024-10-28T09:17:32.811092Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: Shutting down\n", + "[2024-10-28 09:17:30] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:17:30] INFO: Application shutdown complete.\n", + "[2024-10-28 09:17:30] INFO: Finished server process [510260]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/release_process.html b/release_process.html index 299c1a3..c610c19 100644 --- a/release_process.html +++ b/release_process.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -494,7 +495,7 @@

Make a release in GitHub

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/sampling_params.html b/sampling_params.html index b186a12..334bff8 100644 --- a/sampling_params.html +++ b/sampling_params.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -942,7 +943,7 @@

Min New Tokens

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/search.html b/search.html index 69d4c1a..3e0a808 100644 --- a/search.html +++ b/search.html @@ -31,7 +31,8 @@ - + + @@ -63,7 +64,7 @@ - + @@ -385,7 +386,7 @@

Search

diff --git a/searchindex.js b/searchindex.js index 378d3ee..ceca047 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Achieving Peak Throughput": [[8, "achieving-peak-throughput"]], "Add Unit Tests": [[4, "add-unit-tests"]], "Add a Runner": [[16, "add-a-runner"]], "Add the model to the test suite": [[11, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "All Together": [[14, "all-together"]], "Avoid out-of-memory by Tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[8, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[9, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Baseline": [[14, "baseline"]], "Batches": [[12, "Batches"]], "Batching": [[7, "batching"]], "Benchmark": [[2, "benchmark"]], "Benchmark Performance": [[1, "benchmark-performance"]], "Benchmark and Profiling": [[2, null]], "Benchmarks": [[14, "benchmarks"]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[17, "cuda-error-an-illegal-memory-access-was-encountered"]], "Chat Completions": [[12, "Chat-Completions"]], "Choices Methods in SGLang": [[3, null]], "Clean": [[0, "clean"]], "Common Notes": [[10, "common-notes"]], "Completions": [[12, "Completions"]], "Constrained Decoding": [[7, "constrained-decoding"]], "Contributor Guide": [[4, null]], "Control Flow": [[7, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[5, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Model": [[6, null]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Examples": [[14, "examples"]], "Format Your Code": [[4, "format-your-code"]], "Frequency Penalty": [[14, "frequency-penalty"]], "Frontend Tutorial": [[9, null]], "Frontend: Structured Generation Language (SGLang)": [[7, null]], "Getting Started": [[9, null]], "Greedy Token Selection": [[3, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[8, null]], "How to Support a New Model": [[11, null]], "Install SGLang": [[10, null]], "Interactive debugging": [[11, "interactive-debugging"]], "JSON Decoding": [[7, "json-decoding"]], "Language Feature": [[7, "language-feature"]], "Latency": [[14, "latency"]], "Launch A Server": [[6, "Launch-A-Server"]], "Launch a server": [[15, "Launch-a-server"]], "Make a release in GitHub": [[13, "make-a-release-in-github"]], "Memory": [[14, "memory"]], "Method 1: With pip": [[10, "method-1-with-pip"]], "Method 2: From source": [[10, "method-2-from-source"]], "Method 3: Using docker": [[10, "method-3-using-docker"]], "Method 4: Using docker compose": [[10, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[10, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[3, "methods"]], "Min New Tokens": [[14, "min-new-tokens"]], "More Examples": [[7, "more-examples"]], "Multi modal": [[14, "multi-modal"]], "Multi-Modality": [[7, "multi-modality"]], "Normal": [[14, "normal"]], "OpenAI Compatible API": [[1, "openai-compatible-api"], [12, null]], "Other tips": [[2, "other-tips"]], "Parallelism": [[7, "parallelism"]], "Parameters": [[12, "Parameters"], [12, "id2"]], "Performance Implications on Penalties": [[14, "performance-implications-on-penalties"]], "Port a model from vLLM to SGLang": [[11, "port-a-model-from-vllm-to-sglang"]], "Presence 
Penalty": [[14, "presence-penalty"]], "Profile with Nsight": [[2, "profile-with-nsight"]], "PyPI Package Release Process": [[13, null]], "Quick Start": [[1, "quick-start"], [7, "quick-start"]], "Quick Start: Launch A Server and Send Requests": [[15, null]], "References": [[9, null]], "Repetition Penalty": [[14, "repetition-penalty"]], "Roles": [[7, "roles"]], "Run Llama 3.1 405B": [[1, "run-llama-3-1-405b"]], "SGLang Documentation": [[0, null], [9, null]], "Sampling Parameters in SGLang Runtime": [[14, null]], "Send a Request": [[15, "Send-a-Request"]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-hosted Runners for GitHub Action": [[16, null]], "Step 1: Start a docker container.": [[16, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[16, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[16, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[7, "streaming"], [14, "streaming"]], "Supported Models": [[1, "supported-models"]], "Test the correctness": [[11, "test-the-correctness"]], "The server hangs": [[17, "the-server-hangs"]], "Tips and Implementation Details": [[7, "tips-and-implementation-details"]], "Token Length Normalized": [[3, "token-length-normalized"]], "Troubleshooting": [[17, null]], "Try Advanced Options": [[8, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[8, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[8, "tune-schedule-conservativeness"]], "Tune --schedule-policy": [[8, "tune-schedule-policy"]], "Tune Your Request Submission Speed": [[8, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[3, "unconditional-likelihood-normalized"]], "Update the version in code": [[13, "update-the-version-in-code"]], "Upload the PyPI package": [[13, "upload-the-pypi-package"]], "Usage": [[12, "Usage"], [12, "id1"]], "Use Curl": [[6, "Use-Curl"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Input IDs": [[6, "Using-Input-IDs"]], "Using Local Models": [[7, "using-local-models"]], "Using OpenAI Compatible API": [[6, "Using-OpenAI-Compatible-API"], [15, "Using-OpenAI-Compatible-API"]], "Using OpenAI Models": [[7, "using-openai-models"]]}, "docnames": ["README", "backend", "benchmark_and_profiling", "choices_methods", "contributor_guide", "custom_chat_template", "embedding_model", "frontend", "hyperparameter_tuning", "index", "install", "model_support", "openai_api", "release_process", "sampling_params", "send_request", "setup_github_runner", "troubleshooting"], "envversion": {"nbsphinx": 4, "sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend.md", "benchmark_and_profiling.md", "choices_methods.md", "contributor_guide.md", "custom_chat_template.md", "embedding_model.ipynb", "frontend.md", "hyperparameter_tuning.md", "index.rst", "install.md", "model_support.md", "openai_api.ipynb", "release_process.md", "sampling_params.md", "send_request.ipynb", "setup_github_runner.md", "troubleshooting.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 6, 7, 8, 10, 11, 12, 14, 15], "0": [1, 6, 7, 8, 10, 12, 14, 15, 16, 17], "0000": 8, "0006747245788574219": 6, "0006804466247558594": 
6, "000682830810546875": 6, "0020961761474609375": 6, "0020999908447265625": 6, "003025054931640625": 6, "0030345916748046875": 6, "006198883056640625": 6, "006214141845703125": 6, "00807952880859375": 6, "00830078125": 6, "00830841064453125": 6, "009002685546875": 6, "01": [7, 8, 14], "01239013671875": 6, "01438140869140625": 6, "02": 14, "03": 14, "04": [14, 16], "05": 14, "06": 14, "08": 14, "0_rocm6": 16, "0_triton3": 16, "1": [2, 6, 7, 8, 11, 12, 14, 15], "10": [1, 2, 6, 12, 14], "100": [7, 12], "101": 14, "103": 14, "104": 14, "10405": 14, "10666": 14, "107": 14, "10767": 14, "11": 14, "114": 14, "11586": 14, "117": 14, "11732": 14, "12": [10, 14, 16], "123": 12, "127": [1, 6, 12, 15], "128": [1, 14], "128009": [12, 15], "13": [12, 14], "14226": 14, "150": 12, "158c": 12, "16": [1, 7, 12, 14], "16219": 14, "16740": 14, "17": 14, "17125": 14, "17167": 14, "172": 1, "1730071464": 12, "1730071465": 12, "1730071466": 12, "1730071469": 12, "1730071553": 15, "1730071554": 15, "174": 14, "179": 14, "18": 14, "18895": 14, "189": 14, "191": 14, "195": 14, "19884": 14, "1cc5": 12, "1st": 14, "2": [1, 5, 6, 7, 9, 12, 14, 15], "200": 12, "20000": 1, "2048": [2, 8], "205": 14, "20866": 14, "22095": 14, "22363": 14, "22603": 14, "233": [8, 14], "23892": 14, "24": 14, "2499": 12, "24h": 12, "25": 7, "256": [1, 2, 7, 14], "26": 14, "268": 14, "27": 14, "271": 14, "29": 14, "293": 14, "3": [2, 5, 6, 7, 8, 9, 12, 14, 15], "30": 14, "3000": 14, "30000": [1, 5, 7, 10, 12, 14, 15], "30010": 6, "308": 14, "31": 14, "317": 8, "32": [1, 2, 14], "320": 14, "34": 15, "35": 14, "36": 14, "37": 14, "370959": 8, "378633": 14, "38": 14, "386d": 12, "39": [12, 14, 15], "4": [1, 6, 7, 12, 15], "40": 14, "403": 15, "40881": 14, "409": 14, "4096": [1, 2, 8], "41": [12, 14], "41888": 14, "42": 12, "4286": 12, "433": 14, "43967": 14, "44": 14, "440": 14, "447": 14, "44926": 14, "45": 14, "450": 15, "453": 14, "45354": 14, "45445": 14, "455": 14, "4594": 8, "46": [12, 14, 15], "4640": 12, "46530": 14, "47": [14, 15], "47738": 14, "4811": 12, "48302": 14, "4832": 14, "487b": 12, "48960": 14, "49": [12, 15], "49017": 14, "4918": 12, "49263": 14, "5": [1, 6, 7, 12, 14], "50": [8, 12, 14], "500": [8, 12], "50000": 1, "50302": 14, "5079": 14, "50da1b57333242cca0b8c6d8706f94b2": 12, "51": 14, "512": [2, 14], "52": 1, "5206": 14, "5255": 14, "52554": 14, "52825": 14, "52920": 14, "54": [12, 14], "54497": 14, "55": 14, "56": 14, "5656": 14, "5727": 14, "57426": 14, "58": 14, "59": 14, "5b": 11, "6": [1, 12, 16], "60": [2, 14], "6000": 2, "6040f73b": 12, "61": 14, "64": [1, 2, 12, 14, 15], "64g": 16, "65": 14, "66": 14, "67": 14, "68": 14, "69": 14, "6ae7fabfd4c54054a8017e2aa7c6bc5a": 15, "6d26915780e2": 12, "6fd9": 12, "7": [1, 12], "70": [2, 14], "71": 14, "72": 14, "72b": 1, "73": [12, 14], "74": 14, "75": 14, "76": 14, "766008": 14, "774756": 14, "774955": 14, "775118": 14, "775210": 14, "775220": 14, "775651": 14, "77e45b23e9b34ef0a65afd9598521768": 12, "78": 14, "79": 14, "7b": [1, 5, 6, 14], "7fa2af80": 2, "8": [1, 12, 14, 17], "8000": 0, "81": 14, "82": 8, "8235c9a12fd8": 12, "83": 14, "84": 14, "8413": 14, "85": 14, "8522": 12, "86": 14, "88": 14, "89": [12, 14], "8b": [1, 2, 7, 10, 12, 14, 15], "9": [1, 7, 8, 12, 17], "90": 14, "900097bd": 12, "91": 14, "92fe": 12, "93": 14, "94": 14, "95": [1, 12, 14, 15], "96": 14, "97": 14, "98": 14, "9900": 14, "9998": 8, "9a0c": 12, "A": [1, 2, 7, 8, 9, 10, 12], "And": 12, "As": 12, "Be": 12, "By": [5, 14], "For": [1, 2, 3, 11, 12, 14], "If": [1, 5, 8, 10, 12, 14, 17], "In": [1, 
7, 17], "It": [1, 3, 5, 7, 8, 9, 10, 12, 14, 15], "NOT": 5, "On": 8, "THE": 12, "The": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 14, 16], "Then": [7, 16], "There": 5, "These": 14, "To": [0, 1, 2, 7, 8, 10, 11], "__init__": 13, "__main__": 1, "__name__": 1, "_build": 0, "a10": 10, "a100": 10, "abil": 15, "abl": 11, "about": [1, 5, 7, 8, 12], "abov": [2, 3, 10, 14], "acceler": [1, 8, 10], "accept": [12, 14], "access": [0, 1, 10], "accord": [2, 7, 10], "accur": [1, 2, 15], "accuraci": 15, "achiev": 12, "across": 3, "activ": 9, "ad": 10, "add": [1, 2, 6, 7, 8, 14, 17], "addit": [3, 7, 12, 14], "addr": 1, "address": [1, 7], "adopt": 9, "adv": 2, "advanc": 9, "after": [12, 15], "again": 12, "against": 3, "ai": [1, 10, 12, 15], "alexa": 15, "algorithm": 14, "alibaba": [1, 6], "aliv": 7, "all": [0, 1, 3, 4, 7, 8, 10, 11, 16], "all_other_model": 11, "allow": [2, 10, 15], "almost": [1, 8, 11], "also": [1, 5, 6, 7, 8, 14], "altern": [3, 7, 12], "alwai": [8, 12, 15], "amd": 16, "amount": 15, "an": [0, 1, 3, 7, 9, 10, 12, 14, 15, 16], "ancient": 12, "ani": [1, 7, 10, 12, 14], "annot": 2, "anoth": [11, 15], "answer": [3, 7, 15], "answer_1": 7, "answer_2": 7, "anthrop": 7, "antidisestablishmentarian": 3, "api": [3, 5, 7, 9, 10, 14], "api_kei": [1, 6, 12, 15], "appear": [12, 14], "append": 12, "appli": 14, "applic": [1, 6, 9, 15], "approach": 10, "apt": [2, 16], "ar": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15], "arch": 16, "architectur": [2, 15], "arg": 3, "argument": [2, 7, 14], "around": 17, "arrai": 12, "articl": 15, "artifici": 15, "assert": 12, "assign": [12, 15], "assist": [1, 3, 5, 7, 12, 14, 15], "assistant_begin": 7, "assistant_end": 7, "attain": 8, "attent": [9, 10, 11], "attract": [3, 7, 12], "audio": [12, 15], "auror": 7, "australia": [12, 15], "author": [6, 15], "automat": 14, "autoregress": 7, "autosc": 10, "autotoken": 6, "avail": [1, 10, 12], "averag": 3, "avoid": [10, 12], "awq": 9, "b": 10, "b170": 12, "b23150459375": 12, "b879": 12, "back": 9, "backend": [2, 3, 10, 14, 17], "backend_input_fil": 12, "backend_result_fil": 12, "bad": 3, "baichuan2": 1, "balanc": [7, 12], "base": [3, 12, 14], "base64": 14, "base_url": [1, 6, 12, 15], "bash": [13, 16], "basic": 14, "batch": [1, 2, 8, 9, 14], "batch_08ed9e0c": 12, "batch_3bed32fb": 12, "batch_a8bb0663": 12, "batch_detail": 12, "batch_id": 12, "batch_job": 12, "batch_request": 12, "batch_respons": 12, "batchrequestcount": 12, "bearer": [6, 15], "becaus": [7, 8], "becom": 12, "been": 12, "befor": [2, 14], "begin": 7, "beij": 12, "being": 8, "below": [7, 10, 14, 16], "bench_lat": [1, 2, 11], "bench_serv": [1, 2, 14], "benchmark": 9, "berlin": 3, "bespok": 3, "best": 12, "best_of": 12, "better": [1, 8, 10, 11, 12], "between": [1, 14], "bia": 7, "bin": 16, "blackwood": 12, "blob": 14, "block": [7, 15], "blogpost": 3, "blood": 7, "bodi": [7, 12, 14], "bogart": 7, "book": 15, "bool": 14, "born": 7, "both": 8, "bottleneck": 8, "bra": 12, "branch": 10, "bras\u00edlia": [12, 15], "brazil": [12, 15], "break": 14, "browser": 0, "bug": 12, "build": [1, 10, 12, 13], "built": 10, "c": [10, 12], "cach": [1, 2, 8, 9, 10, 16], "calcul": 7, "call": [3, 7, 9], "can": [1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "canberra": [12, 15], "cancel": 12, "cancelled_job": 12, "cannot": 14, "capit": [1, 3, 7, 12, 14, 15], "captain": 12, "case": [8, 17], "cd": [4, 10, 13], "celesti": 12, "center": 12, "chain": 9, "chang": [11, 12, 16], "charact": 7, "character": 15, "character_gen": 7, "character_regex": 7, "chat": [1, 7, 14, 15], "chat_exampl": 7, "chat_templ": 5, "chatbot": 15, 
"chatcomplet": [12, 15], "chatcompletionmessag": [12, 15], "chatglm": 1, "chatml": [1, 5, 14], "check": [1, 10, 12], "check_output": 6, "checkpoint": [1, 2], "china": 12, "choic": [7, 9, 12, 15], "choices_method": 3, "chunk": [1, 9, 12, 14], "ci": 4, "civil": 12, "class": 14, "clean": 12, "cli": 2, "client": [1, 2, 6, 12, 15], "clone": [0, 10], "cluster": 10, "co": 9, "code": [2, 6, 7, 15], "coher": 15, "color": 2, "com": [2, 10, 13, 14, 16], "come": [8, 14], "command": [1, 2, 4, 6, 10, 11, 15, 16], "commit": 4, "common": 17, "commun": 9, "compar": 11, "comparison": [3, 11], "compat": [5, 7, 9, 14], "compil": [1, 8], "complet": [1, 6, 7, 15], "completion_token": [12, 15], "completion_tokens_detail": [12, 15], "completion_window": 12, "completionchoic": 12, "completionusag": [12, 15], "complex": [7, 15], "compos": 15, "comprehend": 15, "comput": [2, 7, 8, 14], "concis": 12, "conda": 10, "condens": 15, "confid": 3, "config": [1, 2], "connect": [7, 10], "consid": [2, 14], "constrain": [8, 9, 14], "constraint": 7, "contain": [3, 12], "content": [1, 6, 7, 12, 15], "context": 15, "continu": [7, 9], "contribut": 5, "contributor": 9, "control": [9, 12], "convers": [5, 12], "convert": 11, "copi": 10, "core": [7, 9, 15], "correct": [2, 14], "cost": 12, "could": 14, "couldn": 12, "count": 12, "countri": [1, 12, 15], "coverag": 11, "cpu": 8, "creat": [1, 6, 11, 12, 15], "created_at": 12, "creation": 15, "creativ": 12, "creatur": 12, "crew": 12, "criteria": 12, "critic": 2, "cu121": 10, "cuda": [1, 2, 10, 14, 16], "cuda_visible_devic": 16, "curl": [1, 14, 15, 16], "curl_id": 6, "curl_text": 6, "currenli": [1, 8], "current": 12, "custom": [1, 15], "custom_id": 12, "d": [0, 1, 2, 6, 7, 10, 12, 15], "d8f2a76dbf60": 12, "da93c64364af475cbdd2cb19155fd68d": 15, "dark": 12, "data": [1, 6, 8, 14, 15], "dataclass": 14, "dataset": [2, 14], "dbrx": 1, "dc9d06d886151707f97d0b78095df9de262fd3c9": 14, "deactiv": 10, "deadlock": 1, "death": 7, "deb": 2, "deceas": 7, "decod": [8, 9, 12, 14], "decode_unicod": 14, "decor": 7, "decreas": 8, "deepseek": [1, 9], "def": [1, 3, 7], "default": [1, 3, 5, 8, 10, 12, 14, 17], "defin": [5, 7], "del_respons": 12, "delai": 2, "delet": 12, "delta": 12, "depend": 10, "deploi": 10, "deploy": 10, "describ": [3, 14], "descript": [2, 14], "design": [9, 15], "desir": 14, "detail": [12, 14], "detailed_tip": 7, "determin": 3, "determinist": 12, "detoken": 14, "dev": [1, 16], "devel": 16, "develop": [2, 12], "devic": [1, 10, 16], "devtool": 2, "dict": 14, "diet": 7, "differ": 11, "difficult": 14, "digest": 15, "directli": 1, "directori": 11, "disabl": [1, 2, 14, 17], "discoveri": 12, "dislik": 14, "distrib_releas": 2, "divers": 12, "django": 12, "dn": 7, "do": [2, 8, 12, 14, 16], "doc": [2, 3, 10, 14], "doc_site_path": 0, "dockerfil": 10, "document": [5, 10], "doe": [1, 2, 8], "donald": 3, "done": [14, 16], "down": 3, "download": [2, 14], "dp": 1, "dpkg": 2, "drawback": 14, "dri": 16, "dtype": 1, "duck": 3, "due": [3, 8, 17], "dummi": 2, "dump": [6, 12], "durat": [2, 14], "dure": [1, 8, 12, 14, 15], "dynam": [1, 2], "e": [2, 10, 11, 12, 16], "e2": 14, "e5": [1, 6, 9], "eab3380d686a": 12, "each": [1, 12], "earli": 8, "earlier": 3, "easi": [9, 11, 17], "easier": 7, "eater": 7, "echo": [2, 12, 16], "edit": 16, "effici": [1, 9], "either": 14, "element": 12, "eleutherai": 3, "elif": 7, "els": 12, "embed": [1, 9, 12], "embedding_model": 12, "embedding_process": 6, "empti": 1, "enabl": [1, 7, 8, 10, 15], "encod": [6, 14], "encount": 10, "encourag": [12, 14], "end": [7, 11, 12, 14], "endpoint": [1, 
10, 12, 14], "engin": 7, "enough": [1, 8], "entir": 15, "entryclass": 11, "enumer": 7, "env": 10, "environ": [1, 6, 16], "eo": [8, 14], "equival": [6, 15], "error": [1, 6, 8, 12], "especi": 8, "etc": [2, 9], "eth0": 1, "even": [3, 12, 15], "everi": 14, "everyth": 12, "exampl": [1, 3, 6, 11, 12, 15, 16], "example_imag": 14, "exaon": 1, "except": 12, "excit": 12, "excl": 14, "exec": 2, "execut": [10, 15], "execute_shell_command": [6, 12, 15], "exercis": 7, "exist": 11, "expand": 7, "expans": 12, "experiment": 8, "explor": 12, "export": [0, 1, 7, 16], "express": [7, 14], "extend": 3, "extens": [9, 11], "extern": [7, 9], "extra": 14, "f": [1, 6, 7, 12], "face": [1, 5], "fact": 15, "fail": [3, 12], "failur": 10, "fals": [6, 14], "far": [12, 14], "fast": [9, 12], "faster": 9, "favor": 8, "fcf": 8, "featur": [1, 9], "feel": 12, "fetch": 2, "field": 15, "file": [0, 2, 4, 11, 12, 14], "file_respons": 12, "fill": 7, "fillmor": 3, "final": 12, "find": [7, 11, 14], "finish_reason": [12, 15], "first": [1, 2, 6, 7, 8, 14], "fit": 12, "fix": 17, "flashinf": [9, 10], "flask": 12, "flexibl": 9, "float": 14, "flow": 9, "fluenci": 12, "flush": [7, 14], "focus": 12, "folder": [2, 4, 16], "follow": [1, 2, 5, 6, 7, 8, 11, 12, 14, 16], "forev": 16, "fork": [2, 7], "format": [2, 7, 12, 14], "forward": [9, 11], "forward_batch": 11, "found": [7, 12], "fp16": 1, "fp8": [1, 8, 9], "fp8_e5m2": 1, "fraction": [1, 14, 17], "framework": [9, 12], "franc": [1, 3, 7, 14], "frequenc": 12, "frequency_penalti": [12, 14], "frequent": 8, "from": [4, 5, 6, 7, 12, 15], "from_pretrain": 6, "frontend": [5, 10], "full": [1, 8], "function": [3, 7, 11], "function_cal": [12, 15], "further": [10, 12], "futur": [1, 11], "g": [2, 10, 11, 12, 16], "galaxi": 12, "gemini": 7, "gemma": [1, 9], "gen": [3, 7, 8], "gener": [0, 1, 9, 12, 14, 15], "generatereqinput": 14, "get": [6, 10, 11, 14], "git": [10, 16], "github": [0, 10, 14], "give": [11, 16], "given": 14, "glimps": 14, "gloo_socket_ifnam": 1, "gnupg": 2, "good": 8, "googl": [7, 15], "gpt": 7, "gptq": 9, "gpu": [1, 8, 10, 14, 16], "graph": [1, 2, 17], "greedy_token_select": 3, "grok": 1, "group": 16, "gryffindor": 7, "gte": [1, 6], "guid": [9, 10, 14, 15], "h": [1, 6, 15], "h100": [10, 14], "ha": [8, 11], "had": 12, "haisgl": 16, "half": 7, "hand": 8, "handl": [1, 2, 14], "happen": 8, "hardwar": 14, "harri": 7, "hasattr": 12, "have": [0, 1, 3, 8, 12, 14, 15], "healthi": [7, 8], "hello": 1, "help": [1, 7, 8, 11, 12, 14, 15], "henryx": 16, "her": 12, "here": [1, 6, 7, 12, 15], "hf": 5, "hf_home": 16, "hf_token": [10, 16], "hf_xxx": 16, "high": [3, 8, 12, 14, 15], "higher": [12, 14], "highest": [3, 7], "historian": 12, "hit": 14, "host": [6, 10, 12, 15], "hostnam": 1, "hous": 7, "how": [1, 3, 4, 7, 9], "html": [0, 2], "http": [0, 2, 6, 7, 10, 12, 13, 14, 15, 16], "hub": 10, "hufflepuff": 7, "hug": [1, 5], "huggingfac": [10, 11, 16], "human": 15, "hyperparamet": [1, 9], "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17], "id": [12, 14, 15], "ident": 11, "identifi": 12, "idiom": 15, "ignor": 14, "ignore_eo": 14, "im_end": [5, 14], "im_start": [5, 14], "imag": [7, 10, 14], "image_data": 14, "image_fil": 7, "image_id": 10, "image_qa": 7, "implement": [3, 11, 12], "import": [1, 2, 6, 7, 8, 12, 14, 15], "includ": [7, 9, 12, 15], "incorrect": 3, "increas": 8, "incur": 3, "independ": 10, "index": [2, 12, 15], "indic": 8, "industri": [9, 15], "inf": 14, "infer": [1, 14], "inform": [7, 14, 15], "infra": 10, "init": 1, "initi": [3, 12, 15], "input": [1, 2, 7, 9, 12, 14, 15], "input_file_id": 12, 
"input_file_path": 12, "input_id": [6, 14], "input_ids_embed": 6, "insid": 16, "instal": [0, 2, 4, 9, 13, 15, 16], "installationguid": 2, "instanc": 3, "instead": [1, 17], "instruct": [1, 2, 6, 7, 10, 12, 14, 15], "int": 14, "int4": 9, "int4wo": 1, "integr": 9, "intellig": 15, "inter": 14, "interact": 9, "interconnect": 15, "interfac": [9, 11], "internlm": 1, "interpret": [12, 15], "intfloat": 6, "intuit": 9, "invok": 7, "io": 0, "ip": [1, 7], "ipc": 10, "ipynb": 12, "issu": [7, 10, 17], "itali": 12, "iter_lin": 14, "itl": 14, "its": [3, 15], "japan": [7, 12, 15], "job": 12, "joke": 12, "json": [1, 2, 5, 6, 12, 14, 15], "json_decod": 7, "json_output": 7, "json_schema": 14, "jsonl": 12, "jump": 9, "just": [5, 12], "k": 14, "k8": 10, "keep": 12, "kei": [2, 7, 12, 15], "kernel": [9, 10, 17], "kfd": 16, "kingdom": 7, "knowledg": 12, "kv": [1, 8], "l": 14, "l4": 10, "l40": 10, "lab": [1, 14], "label": 16, "lang": 14, "languag": [5, 9, 10, 12, 15], "larg": [1, 2, 8, 9, 15], "last": 10, "later": [3, 16], "latest": 10, "launch": [1, 2, 5, 7, 9, 10, 14, 17], "launch_serv": [1, 2, 5, 6, 7, 10, 12, 14, 15], "layer": [11, 15], "layer_id": 11, "learn": [1, 4, 11, 15], "least": 14, "len": [1, 2, 14], "length": [7, 12, 14], "less": 12, "let": 1, "level": [6, 12, 14, 15], "librari": 7, "light": 12, "like": [8, 12, 15], "likelihood": 12, "limit": 3, "line": [12, 15], "lint": 4, "linux": 16, "list": [1, 2, 7, 11, 12, 14, 15, 17], "llama": [2, 5, 7, 9, 10, 11, 12, 14, 15], "llama3": 1, "llava": [1, 9, 14], "llava_llama_3": 1, "llm": [1, 3, 9, 15], "lmm": [1, 14], "lmsysorg": 10, "load": [1, 2, 6, 8, 12, 14], "load_imag": 14, "local": 10, "local_example_llava_next": 7, "localhost": [0, 1, 6, 7, 12, 14, 15], "locat": 14, "log": [6, 7, 8, 12, 15], "logic": 14, "logit": [7, 11, 14], "logit_bia": 12, "logitsprocessor": 11, "logprob": [3, 12, 14, 15], "logprob_start_len": 14, "london": 3, "long": [1, 12, 15], "longer": [3, 12], "longest": 8, "look": [5, 8], "loop": 7, "low": 14, "lower": [8, 12], "lpm": 8, "lsb": 2, "m": [0, 1, 2, 5, 6, 7, 10, 11, 12, 14, 15], "machin": 10, "magic": 7, "mai": [1, 2, 7, 17], "main": [1, 14], "maintain": 11, "major": [11, 12], "make": [0, 8, 9, 11, 12, 14], "manag": 7, "mani": [3, 8, 11], "manner": 14, "mask": 7, "match": 8, "matched_stop": [12, 15], "math": 7, "max": 14, "max_check": 12, "max_new_token": [1, 8, 14], "max_token": [1, 7, 12, 15], "maximum": [12, 14], "md": 4, "me": 12, "mean": [8, 14], "meanwhil": 5, "measur": 14, "media": 15, "median": 14, "meet": 1, "mem": [1, 14, 17], "memori": [1, 2], "messag": [1, 7, 12, 15], "meta": [1, 2, 5, 7, 10, 12, 14, 15], "method": [9, 12], "mild": 12, "millard": 3, "min_new_token": 14, "min_p": 14, "minicpm": 1, "ministri": 7, "mislead": 3, "miss": 5, "mistral": [1, 6, 9], "mix": 14, "mixtral": 1, "modal": [1, 9], "mode": 12, "model": [2, 3, 5, 8, 9, 10, 12, 14, 15, 16], "model_path": 1, "moder": 12, "modifi": 12, "moe": 1, "monitor": 12, "month": 12, "more": [1, 9, 10, 12, 14, 15], "most": [5, 8, 11, 12], "mount": 16, "muggl": 7, "mulit": 7, "multi": [1, 9], "multi_turn_quest": 7, "multipl": [1, 12, 15], "multipli": 14, "must": 14, "my": 1, "my_model": 5, "my_model_templ": 5, "n": [7, 12, 14, 15], "n1": [12, 15], "n2": [12, 15], "n3": [12, 15], "n4": 15, "n5": 15, "name": [1, 2, 3, 5, 7, 14, 16], "natur": [12, 15], "nbecaus": 12, "nccl": 1, "nclean": 12, "ndescrib": 14, "need": [2, 5, 7, 10, 11, 16], "nemo": 1, "nest": 7, "network": 15, "neural": 15, "neuron": 15, "new": [1, 8, 9, 12, 13, 16], "new_token_ratio": 8, "next": [1, 6, 12, 
15], "ngener": 1, "nlarg": 15, "nlist": 12, "nlp": [1, 6, 15], "nnode": 1, "node": [1, 2, 15], "non": 7, "none": [6, 12, 14, 15], "normal": 7, "note": [1, 2, 5, 11, 14, 16], "novel": 12, "noveral": 15, "now": 7, "npython": 12, "nrequest": 12, "nsome": 15, "nsy": 2, "nthe": 15, "nuanc": 15, "nucleu": 12, "null": [10, 15], "num": [1, 2, 14], "number": [8, 12, 14], "numer": 15, "nvidia": [2, 14, 16], "nvtx": 2, "nyou": 14, "o": [2, 6, 14, 16], "object": [12, 15], "obtain": 3, "occasion": 8, "occup": 7, "offer": 9, "offici": 5, "offlin": 1, "often": 15, "okai": 8, "olmo": 1, "omit": 3, "onc": [1, 3, 6, 15], "one": [3, 7, 12, 14, 15], "onevis": [1, 14], "onli": [2, 3, 7, 10, 11, 12, 14], "onlin": [1, 2], "only_run": 11, "oom": [8, 14], "open": [9, 10, 12], "openai": [3, 5, 9, 10, 14], "openai_api_kei": [7, 16], "oper": 10, "optim": 17, "option": [3, 12, 14], "order": 7, "other": [3, 8, 10, 11, 14], "out": [1, 2, 7, 10, 17], "output": [1, 2, 11, 12, 14], "output_file_id": 12, "ov": [1, 14], "overhead": [8, 14], "overlap": [3, 8], "overrid": 5, "own": [1, 10, 14], "p": [10, 14], "p2p": 1, "p99": 14, "page": [9, 17], "paragraph": 7, "parallel": [1, 8, 9, 14], "paramet": [8, 9], "pari": 3, "part": 11, "partial": 12, "pass": [4, 7, 11], "path": [0, 1, 2, 3, 5, 6, 7, 10, 12, 14, 15], "patronu": 7, "pattern": 15, "peer": 1, "penal": [12, 14], "penalti": 12, "per": 14, "perform": 3, "phoenix": 7, "phrase": 12, "piec": 15, "pilot": 12, "pip": [0, 2, 13, 16], "pip3": 4, "plan": 10, "planet": 12, "platform": 15, "playground": 11, "pleas": [1, 7, 10], "png": 14, "pool": [1, 8], "poorli": 3, "popular": 15, "port": [1, 5, 6, 7, 10, 12, 14, 15], "post": [12, 14, 15], "post2": 10, "post3_vllm0": 16, "potter": 7, "power": 15, "pre": 4, "predict": 3, "prefer": 12, "prefil": [1, 2, 9, 11], "prefix": [8, 9], "prerequisit": 2, "presenc": 12, "presence_penalti": [12, 14], "presid": [1, 3], "prev": 14, "primit": [3, 7], "print": [1, 2, 6, 7, 12, 14, 15], "probabl": [7, 12], "proceed": [6, 12, 15], "process": [12, 14, 15], "profil": 9, "program": [9, 10, 12], "programm": 12, "progress_bar": 7, "project": [0, 5, 10, 13, 14, 16], "prompt": [1, 2, 7, 9, 12, 14], "prompt_token": [12, 15], "prompt_tokens_detail": [12, 15], "proper": 10, "provid": [1, 2, 7, 9, 10, 12, 15], "pub": 2, "pull": 16, "pure": 7, "purpos": 12, "py": [0, 1, 2, 5, 7, 11, 13, 14], "pydant": 7, "pyproject": 13, "python": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15], "python3": [0, 1, 2, 10, 11, 14, 16], "pytorch": [10, 17], "q": 7, "quantiz": [1, 9], "queri": [1, 15], "quest": 12, "question": [7, 15], "question_1": 7, "question_2": 7, "queue": 8, "quick": [2, 9], "quick_start": 7, "qwen": [1, 9, 11], "qwen2": [1, 6, 11, 14], "r": [0, 1, 7], "radix": 2, "radixattent": [9, 11], "rais": 12, "ran": 14, "random": [2, 12, 14], "rang": [8, 9, 12, 15], "rank": 1, "rate": 14, "ravenclaw": 7, "raw": 14, "rb": 12, "reach": 14, "read": 12, "readi": [6, 12, 15], "readm": 4, "readme_exampl": 7, "real": [1, 2, 12], "reason": 12, "recommend": [2, 10, 12, 14, 15], "recoveri": 10, "reduc": [1, 8, 12], "refer": [1, 11, 12], "reference_hf": 11, "refus": [12, 15], "regex": [7, 14], "regist": 5, "regular": [7, 14], "regular_expression_gen": 7, "relat": [5, 10], "relationship": 15, "releas": [2, 10], "relev": 14, "rememb": 6, "remot": 10, "remov": [0, 11], "repeat": 14, "repetit": 12, "repetition_penalti": 14, "replac": [1, 10, 11], "repo": 2, "report": [1, 17], "repres": 12, "reproduc": 12, "req": [8, 12, 14], "request": [1, 7, 9, 12, 14], "request_count": 12, "request_id": 12, 
"requir": 0, "resourc": [10, 11], "respond": 15, "respons": [1, 3, 6, 12, 14, 15], "response_format": 12, "restart": 16, "result": [3, 12, 14], "result_cont": 12, "result_file_id": 12, "retoken": 14, "retracted_req": 8, "retriev": 12, "return": [12, 14], "return_logprob": 14, "return_text_in_logprob": 14, "reus": 11, "revolution": 15, "rid": 14, "rm": 16, "rmsnorm": 11, "role": [1, 12, 15], "rome": 12, "root": 10, "run": [0, 2, 4, 6, 7, 11, 14, 15], "run_batch": 7, "runner_allow_runasroot": 16, "running_request": 14, "runtim": [9, 10], "runtimeendpoint": [3, 7], "same": [1, 2, 6, 7, 11, 14], "sampl": [9, 10, 11, 12, 17], "sampling_param": [1, 14], "scalabl": 12, "scale": [10, 14], "scan": 12, "schema": [7, 14], "script": 11, "search": [7, 12], "second": 12, "secret": 10, "section": [14, 15], "see": [1, 7, 8, 10, 14], "seed": 12, "select": [7, 10], "send": [1, 8, 9, 12, 14], "send_request": 12, "sens": 12, "sent": 12, "sentenc": 14, "sep": 5, "sep_styl": 5, "sequenc": 12, "serv": [1, 2, 8, 9, 10, 14], "server": [0, 2, 5, 7, 8, 9, 12, 14], "server_process": [12, 15], "servic": [10, 12, 15], "service_ti": [12, 15], "set": [1, 2, 5, 7, 10, 12, 14, 15, 17], "set_default_backend": 7, "sever": [1, 2], "sgl": [0, 1, 3, 7, 10, 13, 14, 16], "sgl0": 16, "sglang": [2, 4, 6, 12, 13, 15, 16], "sglang_is_in_ci": 16, "sglang_use_modelscop": 1, "sh": 13, "share": [8, 16], "she": 12, "shell": 6, "ship": 12, "shm": 16, "short": [12, 14], "shorter": [3, 15], "should": [5, 11], "show": 7, "side": 12, "sign": 12, "siluandmul": 11, "similar": [11, 12, 14], "simpl": [7, 12, 15], "simpli": 3, "sinc": [12, 14], "singl": [1, 2, 10, 11, 12, 14], "siri": 15, "size": [1, 2, 16], "sk": [7, 16], "skip": 14, "skip_special_token": 14, "sky": 10, "skyserv": 10, "sleep": [12, 16], "slightli": 12, "slytherin": 7, "sm75": 10, "small": [1, 8], "smaller": [1, 17], "smollm": 1, "smooth": 12, "snippet": 2, "so": [1, 2, 12, 14], "social": 15, "some": [2, 6, 7, 11, 14, 16, 17], "sometim": 17, "sourc": [2, 9], "space": [12, 14], "spaces_between_special_token": 14, "special": 14, "specif": [1, 10, 11, 15], "specifi": [1, 3, 5, 7, 12, 14, 15, 16], "split": 12, "srt": [9, 10, 14], "stabl": 12, "stablelm": 1, "stai": 7, "stand": [8, 15], "start": [11, 14], "startswith": 14, "state": [1, 7, 12], "static": [1, 2, 14, 17], "statu": [7, 10, 12], "status_cod": 12, "step": [6, 12, 15], "still": 14, "stop": [7, 8, 12, 14, 15], "stop_str": 5, "stop_token_id": 14, "store": 14, "stori": [12, 15], "str": 14, "strang": 12, "strategi": 1, "stream": [1, 12], "stream_opt": 12, "string": [8, 12, 14], "strip": [12, 14], "strong": 3, "strongli": [12, 15], "structur": [9, 15], "student": 7, "subprocess": 6, "subset": 3, "success": 14, "successfulli": 12, "suffix": 12, "suggest": 8, "summar": 15, "summari": [7, 15], "suppli": [3, 14], "support": [3, 6, 7, 9, 10, 14, 15], "sure": [0, 11, 12, 14], "switch": 10, "sxm5": 14, "system": [1, 2, 5, 7, 12, 14, 15], "system_fingerprint": [12, 15], "t": 12, "t4": 10, "take": [8, 12], "teacher": 7, "tee": 2, "tell": 12, "temperatur": [1, 7, 12, 14, 15], "templat": [1, 7, 14], "temporarili": 5, "tensor": [1, 9], "termin": 10, "terminate_process": [6, 12, 15], "test": [1, 2, 14, 15, 16], "test_generation_model": 11, "test_oth": 11, "test_vision_openai_serv": 1, "testgenerationmodel": 11, "text": [1, 6, 11, 12, 14, 15], "text_complet": 12, "text_embed": 6, "text_it": 7, "text_qa": 7, "thei": [12, 14], "them": [10, 15, 17], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 14, 15, 17], "thing": 8, "through": [7, 12, 14], 
"throughput": [1, 14], "time": [1, 2, 6, 12, 14], "tip": 17, "tip_suggest": 7, "tmp": 16, "todai": 1, "togeth": [1, 8], "tok": 14, "token": [1, 5, 6, 7, 8, 9, 10, 12], "token_id": 14, "token_length_norm": 3, "tokenizers_parallel": 6, "tokyo": [12, 15], "toml": 13, "too": 8, "tool": [7, 15], "tool_cal": [12, 15], "tool_us": 7, "top": 14, "top_k": 14, "top_logprob": 12, "top_logprobs_num": 14, "top_p": [1, 12, 14], "topic": [12, 15], "torch": [1, 8], "torch2": 10, "torchao": 1, "total": [1, 12, 14], "total_token": [12, 15], "tp": 1, "tpot": 14, "tr": 2, "trace": 2, "traffic": 14, "train": [2, 15], "transform": [6, 11, 15], "transit": 12, "translat": 15, "trepid": 12, "triton": 10, "troubleshoot": 9, "true": [1, 2, 6, 7, 12, 14, 16], "truncat": [1, 2], "try": [1, 12, 14, 17], "ttft": 14, "tune": [1, 9, 14], "turbo": 7, "turn": 7, "twine": 13, "two": [1, 5, 7, 11, 12], "txt": 0, "type": [1, 6, 15], "typic": 15, "u": 3, "ubuntu": 2, "ubuntu1804": 2, "ubuntu22": 16, "unconditional_likelihood_norm": 3, "under": [2, 4, 11], "understand": [11, 15], "union": 14, "uniqu": 12, "unit": [1, 2, 7, 12], "unittest": 11, "until": 14, "up": [10, 12], "updat": [0, 2, 16], "upgrad": 10, "upload": 12, "upload_pypi": 13, "uploaded_fil": 12, "upon": [1, 6], "url": [12, 14], "us": [2, 3, 4, 5, 8, 12, 14, 16], "us_president_exampl": 3, "usag": [1, 3, 8, 15], "user": [1, 3, 5, 7, 8, 12, 14, 15], "usual": [12, 14], "utf": [12, 14], "util": [6, 8, 12, 14, 15], "v": [10, 16], "v0": 10, "v1": [1, 6, 12, 15], "valid": 12, "valu": [1, 8, 12, 14, 17], "valuabl": 11, "variabl": [1, 16], "variant": 2, "variou": [1, 12, 15], "vast": [12, 15], "veri": [8, 11, 12, 14], "verifi": 12, "version": 10, "vertexai": 7, "via": 12, "video": 16, "view": 1, "virtual": 15, "vision": [1, 9], "visit": 0, "vl": 1, "vocab_s": 14, "w": [7, 12], "wa": 12, "wai": [6, 11], "wait": 15, "wait_for_serv": [6, 12, 15], "wand": 7, "want": [1, 14], "warn": [8, 12, 15], "washington": 12, "we": [1, 12, 14], "web": 12, "week": 12, "weight": [1, 2, 16], "welcom": 5, "well": 11, "were": [12, 14], "what": [3, 7, 12, 15], "when": [5, 7, 8, 12, 14, 17], "where": [3, 12], "whether": 14, "which": [8, 12, 14, 15], "while": [1, 2, 10, 12, 14, 16], "whl": 10, "who": 12, "why": 12, "wide": [9, 12, 15], "within": 7, "without": [2, 10, 12], "wood": 7, "word": [7, 12], "work": [1, 5, 8, 16], "workflow": 7, "workload": 8, "write": [0, 12], "x64": 16, "x86_64": 2, "xvers": 1, "xxx": 16, "y": [2, 16], "yaml": 10, "yi": 1, "yml": 10, "you": [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "your": [0, 1, 5, 7, 9, 10, 12, 14, 15], "zara": 12, "zip": 1}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Benchmark and Profiling", "Choices Methods in SGLang", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Embedding Model", "Frontend: Structured Generation Language (SGLang)", "Guide on Hyperparameter Tuning", "SGLang Documentation", "Install SGLang", "How to Support a New Model", "OpenAI Compatible API", "PyPI Package Release Process", "Sampling Parameters in SGLang Runtime", "Quick Start: Launch A Server and Send Requests", "Set Up Self-hosted Runners for GitHub Action", "Troubleshooting"], "titleterms": {"1": [1, 10, 16], "2": [10, 16], "3": [1, 10, 16], "4": 10, "405b": 1, "5": 10, "A": [6, 15], "The": 17, "With": 10, "access": 17, "achiev": 8, "action": 16, "add": [4, 11, 16], "addit": 1, "advanc": 8, "all": 14, "an": 17, "api": [1, 6, 12, 15], "argument": 1, "avoid": 8, "backend": [1, 9], "baselin": 14, "batch": [7, 12], "benchmark": [1, 2, 
14], "build": 0, "chat": [5, 12], "choic": 3, "chunk": 8, "clean": 0, "cloud": 10, "code": [4, 13], "common": 10, "compat": [1, 6, 12, 15], "complet": 12, "compos": 10, "config": 16, "configur": 16, "conserv": 8, "constrain": 7, "contain": 16, "contributor": 4, "control": 7, "correct": 11, "cuda": 17, "curl": 6, "custom": 5, "debug": 11, "decod": 7, "depend": 0, "deploi": 0, "detail": 7, "docker": [10, 16], "document": [0, 9], "dp": 8, "embed": 6, "encount": 17, "engin": 1, "error": 17, "exampl": [7, 14], "featur": 7, "flow": 7, "format": 4, "fraction": 8, "frequenc": 14, "from": [1, 10, 11], "frontend": [7, 9], "gener": 7, "get": 9, "github": [13, 16], "greedi": 3, "guid": [4, 8], "hang": 17, "host": 16, "how": 11, "http": 1, "hyperparamet": 8, "id": 6, "illeg": 17, "implement": 7, "implic": 14, "input": 6, "instal": 10, "interact": 11, "json": 7, "kubernet": 10, "languag": 7, "latenc": 14, "launch": [6, 15], "length": 3, "likelihood": 3, "llama": 1, "local": 7, "make": 13, "max": 8, "mem": 8, "memori": [8, 14, 17], "method": [3, 10], "min": 14, "modal": [7, 14], "model": [1, 6, 7, 11], "modelscop": 1, "more": 7, "multi": [7, 14], "new": [11, 14], "normal": [3, 14], "note": 10, "nsight": 2, "openai": [1, 6, 7, 12, 15], "option": 8, "other": 2, "out": 8, "packag": 13, "parallel": 7, "paramet": [12, 14], "peak": 8, "penalti": 14, "perform": [1, 14], "pip": 10, "polici": 8, "port": 11, "prefil": 8, "presenc": 14, "preview": 0, "process": 13, "profil": 2, "pypi": 13, "quick": [1, 7, 15], "refer": 9, "releas": 13, "repetit": 14, "request": [8, 15], "role": 7, "run": [1, 8, 10, 16], "runner": 16, "runtim": [1, 5, 14], "sampl": 14, "schedul": 8, "select": 3, "self": 16, "send": 15, "serv": 0, "server": [1, 6, 15, 17], "set": 16, "sglang": [0, 1, 3, 5, 7, 9, 10, 11, 14], "sh": 16, "size": 8, "skypilot": 10, "sourc": 10, "speed": 8, "srt": 1, "start": [1, 7, 9, 15, 16], "static": 8, "step": 16, "stream": [7, 14], "structur": 7, "submiss": 8, "suit": 11, "support": [1, 11], "templat": 5, "test": [4, 11], "throughput": 8, "tip": [2, 7], "togeth": 14, "token": [3, 14], "tp": 8, "troubleshoot": 17, "try": 8, "tune": 8, "tutori": 9, "uncondit": 3, "unit": 4, "up": 16, "updat": 13, "upload": 13, "us": [1, 6, 7, 10, 15], "usag": 12, "version": 13, "vllm": 11, "wa": 17, "websit": 0, "without": 1, "your": [4, 8]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Achieving Peak Throughput": [[8, "achieving-peak-throughput"]], "Add Unit Tests": [[4, "add-unit-tests"]], "Add a Runner": [[16, "add-a-runner"]], "Add the model to the test suite": [[11, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "All Together": [[14, "all-together"]], "Avoid out-of-memory by Tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[8, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[9, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Baseline": [[14, "baseline"]], "Batches": [[12, "Batches"]], "Batching": [[7, "batching"]], "Benchmark": [[2, "benchmark"]], "Benchmark Performance": [[1, "benchmark-performance"]], "Benchmark and Profiling": [[2, null]], "Benchmarks": [[14, "benchmarks"]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[17, "cuda-error-an-illegal-memory-access-was-encountered"]], "Chat Completions": [[12, "Chat-Completions"]], 
"Choices Methods in SGLang": [[3, null]], "Clean": [[0, "clean"]], "Common Notes": [[10, "common-notes"]], "Completions": [[12, "Completions"]], "Constrained Decoding": [[7, "constrained-decoding"]], "Contributor Guide": [[4, null]], "Control Flow": [[7, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[5, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Model": [[6, null]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Examples": [[14, "examples"]], "Format Your Code": [[4, "format-your-code"]], "Frequency Penalty": [[14, "frequency-penalty"]], "Frontend Tutorial": [[9, null]], "Frontend: Structured Generation Language (SGLang)": [[7, null]], "Getting Started": [[9, null]], "Greedy Token Selection": [[3, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[8, null]], "How to Support a New Model": [[11, null]], "Install SGLang": [[10, null]], "Interactive debugging": [[11, "interactive-debugging"]], "JSON Decoding": [[7, "json-decoding"]], "Language Feature": [[7, "language-feature"]], "Latency": [[14, "latency"]], "Launch A Server": [[6, "Launch-A-Server"]], "Launch a server": [[15, "Launch-a-server"]], "Make a release in GitHub": [[13, "make-a-release-in-github"]], "Memory": [[14, "memory"]], "Method 1: With pip": [[10, "method-1-with-pip"]], "Method 2: From source": [[10, "method-2-from-source"]], "Method 3: Using docker": [[10, "method-3-using-docker"]], "Method 4: Using docker compose": [[10, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[10, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[3, "methods"]], "Min New Tokens": [[14, "min-new-tokens"]], "More Examples": [[7, "more-examples"]], "Multi modal": [[14, "multi-modal"]], "Multi-Modality": [[7, "multi-modality"]], "Normal": [[14, "normal"]], "OpenAI Compatible API": [[1, "openai-compatible-api"], [12, null]], "Other tips": [[2, "other-tips"]], "Parallelism": [[7, "parallelism"]], "Parameters": [[12, "Parameters"], [12, "id2"]], "Performance Implications on Penalties": [[14, "performance-implications-on-penalties"]], "Port a model from vLLM to SGLang": [[11, "port-a-model-from-vllm-to-sglang"]], "Presence Penalty": [[14, "presence-penalty"]], "Profile with Nsight": [[2, "profile-with-nsight"]], "PyPI Package Release Process": [[13, null]], "Quick Start": [[1, "quick-start"], [7, "quick-start"]], "Quick Start: Launch A Server and Send Requests": [[15, null]], "References": [[9, null]], "Repetition Penalty": [[14, "repetition-penalty"]], "Roles": [[7, "roles"]], "Run Llama 3.1 405B": [[1, "run-llama-3-1-405b"]], "SGLang Documentation": [[0, null], [9, null]], "Sampling Parameters in SGLang Runtime": [[14, null]], "Send a Request": [[15, "Send-a-Request"]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-hosted Runners for GitHub Action": [[16, null]], "Step 1: Start a docker container.": [[16, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[16, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[16, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[7, "streaming"], [14, "streaming"]], "Supported Models": [[1, "supported-models"]], "Test the correctness": [[11, "test-the-correctness"]], "The server hangs": [[17, "the-server-hangs"]], "Tips and Implementation Details": [[7, "tips-and-implementation-details"]], "Token Length Normalized": [[3, "token-length-normalized"]], "Troubleshooting": [[17, null]], "Try Advanced 
Options": [[8, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[8, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[8, "tune-schedule-conservativeness"]], "Tune --schedule-policy": [[8, "tune-schedule-policy"]], "Tune Your Request Submission Speed": [[8, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[3, "unconditional-likelihood-normalized"]], "Update the version in code": [[13, "update-the-version-in-code"]], "Upload the PyPI package": [[13, "upload-the-pypi-package"]], "Usage": [[12, "Usage"], [12, "id1"]], "Use Curl": [[6, "Use-Curl"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Input IDs": [[6, "Using-Input-IDs"]], "Using Local Models": [[7, "using-local-models"]], "Using OpenAI Compatible API": [[6, "Using-OpenAI-Compatible-API"], [15, "Using-OpenAI-Compatible-API"]], "Using OpenAI Models": [[7, "using-openai-models"]]}, "docnames": ["README", "backend", "benchmark_and_profiling", "choices_methods", "contributor_guide", "custom_chat_template", "embedding_model", "frontend", "hyperparameter_tuning", "index", "install", "model_support", "openai_api", "release_process", "sampling_params", "send_request", "setup_github_runner", "troubleshooting"], "envversion": {"nbsphinx": 4, "sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend.md", "benchmark_and_profiling.md", "choices_methods.md", "contributor_guide.md", "custom_chat_template.md", "embedding_model.ipynb", "frontend.md", "hyperparameter_tuning.md", "index.rst", "install.md", "model_support.md", "openai_api.ipynb", "release_process.md", "sampling_params.md", "send_request.ipynb", "setup_github_runner.md", "troubleshooting.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 6, 7, 8, 10, 11, 12, 14, 15], "0": [1, 6, 7, 8, 10, 12, 14, 15, 16, 17], "00": [6, 12, 15], "0000": 8, "0006747245788574219": 6, "0006804466247558594": 6, "000682830810546875": 6, "0020961761474609375": 6, "0020999908447265625": 6, "003025054931640625": 6, "0030345916748046875": 6, "006198883056640625": 6, "006214141845703125": 6, "00807952880859375": 6, "00830078125": 6, "00830841064453125": 6, "009002685546875": 6, "01": [6, 7, 8, 12, 14, 15], "01239013671875": 6, "01438140869140625": 6, "02": [6, 12, 14, 15], "03": [6, 12, 14], "04": [12, 14, 16], "04a5": 12, "05": [6, 12, 14, 15], "06": [12, 14], "07": [6, 12], "08": [12, 14], "09": [6, 12, 15], "0_rocm6": 16, "0_triton3": 16, "1": [2, 6, 7, 8, 11, 12, 14, 15], "10": [1, 2, 6, 12, 14, 15], "100": [6, 7, 12, 15], "10025": 12, "100525": 12, "10094": 12, "101": 14, "10192": 12, "1025173": 6, "103": 14, "10399": 12, "104": 14, "10405": 14, "10640": 12, "10666": 14, "107": 14, "10767": 14, "108": 12, "10850": 12, "11": [6, 12, 14], "114": 14, "11586": 14, "117": 14, "11732": 14, "12": [6, 10, 14, 15, 16], "120": 12, "120525": 12, "123": 12, "125": 12, "127": [1, 6, 12, 15], "128": [1, 12, 14], "128009": [12, 15], "129": 12, "12it": 12, "13": [12, 14, 15], "130": 12, "131072": [6, 12, 15], "133": 12, "134": 12, "135": 12, "137": 15, "138": 15, "139": 15, "14": [6, 12, 15], "14025": 12, "140525": 12, "142": 12, "14226": 14, "15": [6, 15], "150": 12, "16": [1, 6, 7, 12, 
14, 15], "160": [6, 12, 15], "160525": 12, "16219": 14, "16384": [6, 12, 15], "167": 15, "16740": 14, "16it": 15, "17": [12, 14, 15], "17125": 14, "17167": 14, "172": 1, "1730107007": 12, "1730107008": 12, "1730107009": 12, "1730107011": 12, "1730107096": 15, "1730107097": 15, "174": 14, "175": 12, "17734": 12, "179": 14, "17it": 15, "18": [6, 12, 14, 15], "18025": 12, "180525": 12, "18388": 12, "18895": 14, "189": 14, "19": 12, "19058": 12, "191": 14, "195": 14, "1980": 12, "19807": 12, "19884": 14, "1st": 14, "2": [1, 5, 6, 7, 9, 12, 14, 15], "20": [12, 15], "200": [6, 12, 15], "20000": 1, "200525": 12, "2024": [6, 12, 15], "2048": [2, 8], "2049": [12, 15], "205": 14, "20525": 12, "20585": 12, "207": 15, "20866": 14, "21": [6, 12], "210": 12, "2106": 12, "21433": 12, "21it": 12, "22": [12, 15], "22025": 12, "22095": 14, "22312": 12, "22363": 14, "22603": 14, "23": 12, "2325": 12, "233": [8, 14], "23385": 12, "237179517": 6, "23892": 14, "2394": 12, "24": [12, 14], "243": [6, 12, 15], "24587": 12, "247": 15, "24h": 12, "24it": 15, "25": [6, 7, 12, 15], "256": [1, 2, 6, 7, 12, 14, 15], "26": [12, 14], "26025": 12, "268": 14, "27": [12, 14], "271": 14, "27879a06": 12, "2790": 12, "28": [6, 12, 15], "287": 15, "29": [6, 12, 14, 15], "293": 14, "29542e83d53f44eea0c01d1f517c4b40": 15, "29c7": 12, "3": [2, 5, 6, 7, 8, 9, 12, 14, 15], "30": [12, 14], "3000": 14, "30000": [1, 5, 7, 10, 12, 14, 15], "30010": 6, "30025": 12, "308": 14, "31": 14, "317": 8, "32": [1, 2, 6, 12, 14, 15], "320": 14, "3226": 12, "327": 15, "33": 6, "331": 15, "333": 12, "33df398d": 12, "34": [6, 12, 15], "34025": 12, "347192970": 15, "35": [12, 14], "35516": 15, "35530": 15, "35536": 15, "35540": 15, "35554": 15, "35it": 12, "36": [12, 14], "36680": 12, "36690": 12, "36696": 12, "367": 15, "36706": 12, "36708": 12, "37": [12, 14, 15], "370959": 8, "371": 12, "378": 15, "378633": 14, "38": [12, 14], "38025": 12, "3869": 12, "39": [6, 12, 14, 15], "390b6931283540278af6151e5665b9e6": 12, "395": 12, "4": [1, 6, 7, 12, 15], "40": [6, 12, 14], "4005": 6, "40525": 12, "40866": 12, "40881": 14, "409": 14, "4096": [1, 2, 6, 8, 12, 15], "40db": 12, "40it": 15, "41": [6, 12, 14], "41888": 14, "41ef": 12, "42": 12, "42025": 12, "43": [6, 12], "433": 14, "43967": 14, "43c2": 12, "44": 14, "440": 14, "442913": [12, 15], "4456": 12, "447": 14, "44926": 14, "45": [12, 14, 15], "453": 14, "45354": 14, "45445": 14, "455": 14, "4594": 8, "46": [12, 14, 15], "46025": 12, "46530": 14, "47": [12, 14, 15], "47738": 14, "48": 12, "48056": 12, "48302": 14, "4832": 14, "48960": 14, "49": [12, 15], "49017": 14, "49263": 14, "495a": 12, "4995": 12, "4e7f": 12, "5": [1, 6, 7, 12, 14], "50": [8, 12, 14, 15], "500": [8, 12], "50000": 1, "50025": 12, "50302": 14, "5079": 14, "509328": 6, "51": [12, 14], "510260": 12, "511197": 15, "512": [2, 14], "51it": 12, "52": [1, 6, 12], "5206": 14, "5255": 14, "52554": 14, "52609006": 12, "52825": 14, "52920": 14, "53": 12, "53788": 12, "54": [6, 12, 14], "54497": 14, "54868": 12, "55": [6, 12, 14], "56": [6, 12, 14], "5656": 14, "56c1c364": 12, "57": 6, "5727": 14, "57426": 14, "57it": 15, "58": [14, 15], "59": [6, 12, 14, 15], "59034": 6, "59258": 6, "59274": 6, "59280": 6, "59290": 6, "59300": 6, "5b": 11, "6": [1, 6, 12, 15, 16], "60": [2, 14], "600": [6, 12, 15], "6000": 2, "6025": 12, "60525": 12, "61": [12, 14, 15], "62": 12, "63": [12, 15], "64": [1, 2, 6, 12, 14, 15], "64g": 16, "65": 14, "66": [12, 14], "67": 14, "68": 14, "69": 14, "7": [1, 6, 12], "70": [2, 12, 14], "71": [6, 14], "72": [12, 14], 
"72b": 1, "73": [12, 14], "74": [6, 14], "75": [12, 14, 15], "76": [12, 14], "766008": 14, "77": 12, "774756": 14, "774955": 14, "775118": 14, "775210": 14, "775220": 14, "775651": 14, "78": [6, 12, 14, 15], "79": [12, 14, 15], "7b": [1, 5, 6, 14], "7fa2af80": 2, "8": [1, 6, 12, 14, 15, 17], "8000": 0, "80525": 12, "81": [6, 14], "8192": [6, 12, 15], "82": [8, 12], "83": 14, "84": [12, 14], "8413": 14, "84ab9ffd558f4c5595addde9e7a9b40c": 12, "85": [14, 15], "86": [6, 14], "87": 15, "88": [6, 12, 14, 15], "8840": 12, "89": [6, 12, 14], "890cb3111446": 12, "8925": 12, "8b": [1, 2, 7, 10, 12, 14, 15], "8dd58c0e0eff4036ab377324851c1726": 12, "8dd8": 12, "8fc3": 12, "9": [1, 7, 8, 12, 17], "90": 14, "9012": 12, "91": [12, 14], "9157": 12, "92": 6, "9225459caab5": 12, "927": 12, "93": [12, 14], "9370": 12, "94": 14, "95": [1, 6, 12, 14, 15], "9570": 12, "96": [14, 15], "97": [6, 12, 14], "9739540beefc": 12, "9754": 12, "97b7": 12, "98": [12, 14], "99": 12, "9900": 14, "9969": 12, "9976380bf402": 12, "9982": 12, "9998": 8, "9b8d": 12, "A": [1, 2, 7, 8, 9, 10], "As": 12, "By": [5, 14], "For": [1, 2, 3, 11, 12, 14], "If": [1, 5, 8, 10, 14, 17], "In": [1, 7, 17], "It": [1, 3, 5, 7, 8, 9, 10, 12, 14, 15], "NOT": 5, "On": 8, "THE": 12, "The": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16], "Then": [7, 16], "There": 5, "These": 14, "To": [0, 1, 2, 7, 8, 10, 11], "__init__": 13, "__main__": 1, "__name__": 1, "_build": 0, "a10": 10, "a100": 10, "abl": 11, "about": [1, 5, 7, 8, 12, 15], "abov": [2, 3, 10, 12, 14], "acceler": [1, 8, 10], "accept": [12, 14], "access": [0, 1, 10], "accord": [2, 7, 10], "accur": [1, 2], "accuraci": 15, "achiev": 12, "across": 3, "activ": 9, "ad": 10, "ad61027db61649d0bd69f6aa901f1d8c": 15, "add": [1, 2, 6, 7, 8, 14, 17], "addit": [3, 7, 14], "addr": 1, "address": [1, 7], "adopt": 9, "adv": 2, "advanc": 9, "af09": 12, "af5c": 12, "after": 15, "again": 12, "against": 3, "ai": [1, 10, 12, 15], "aim": 12, "air": 12, "algorithm": 14, "alibaba": [1, 6], "alien": 12, "aliv": 7, "all": [0, 1, 3, 4, 7, 8, 10, 11, 16], "all_other_model": 11, "allow": [2, 10, 15], "almost": [1, 8, 11], "also": [1, 5, 6, 7, 8, 14, 15], "altern": [3, 7], "alwai": [8, 12, 15], "amd": 16, "amount": 15, "an": [0, 1, 3, 7, 9, 10, 12, 14, 15, 16], "analysi": 12, "ancient": 12, "ani": [1, 7, 10, 14, 15], "annot": 2, "anoth": [11, 15], "answer": [3, 7, 15], "answer_1": 7, "answer_2": 7, "anthrop": 7, "antidisestablishmentarian": 3, "api": [3, 5, 7, 9, 10, 14], "api_kei": [1, 6, 12, 15], "appear": 14, "append": 12, "appli": 14, "applic": [1, 6, 9, 12, 15], "approach": 10, "apt": [2, 16], "ar": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15], "arch": 16, "architectur": 2, "arg": 3, "argument": [2, 7, 14], "around": 17, "articl": 15, "artifici": [12, 15], "ask": 15, "assert": 12, "assign": [12, 15], "assist": [1, 3, 5, 7, 12, 14, 15], "assistant_begin": 7, "assistant_end": 7, "attain": 8, "attent": [9, 10, 11], "attention_backend": [6, 12, 15], "attract": [3, 7, 12], "audio": [12, 15], "auror": 7, "australia": [12, 15], "author": [6, 15], "auto": [6, 12, 15], "automat": 14, "autoregress": 7, "autosc": 10, "autotoken": 6, "avail": [1, 6, 10, 12, 15], "averag": 3, "avoid": [10, 12], "awq": 9, "b": 10, "b590": 12, "baccd9a49bff": 12, "back": 9, "backend": [2, 3, 10, 14, 17], "backend_input_fil": 12, "backend_result_fil": 12, "bad": 3, "baichuan2": 1, "balanc": [7, 12], "base": [3, 14], "base64": 14, "base_url": [1, 6, 12, 15], "bash": [13, 16], "basic": 14, "batch": [1, 2, 6, 8, 9, 14, 15], "batch_4c254e9a": 12, 
"batch_9c319ff5": 12, "batch_bb7ab5e0": 12, "batch_detail": 12, "batch_id": 12, "batch_job": 12, "batch_request": 12, "batch_respons": 12, "batchrequestcount": 12, "bdb569b5e77147d0b4ebe2a79b451814": 12, "be08": 12, "bearer": [6, 15], "becaus": [7, 8], "befor": [2, 14], "begin": [6, 7, 12, 15], "beij": 12, "being": 8, "below": [7, 10, 14, 16], "bench_lat": [1, 2, 11], "bench_serv": [1, 2, 14], "benchmark": 9, "berlin": 3, "bespok": 3, "better": [1, 8, 10, 11, 12], "between": [1, 14], "bfloat16": [12, 15], "bia": [7, 15], "bin": 16, "blank": 15, "blob": 14, "block": [7, 15], "blogpost": 3, "blood": 7, "bodi": [7, 12, 14], "bogart": 7, "book": 15, "bool": 14, "born": 7, "both": 8, "bottleneck": 8, "bra": 12, "branch": 10, "bras\u00edlia": [12, 15], "brazil": [12, 15], "break": 14, "breath": 12, "breez": 12, "bring": 12, "browser": 0, "bug": 12, "build": [1, 10, 13], "built": 10, "c": [6, 10, 12, 15], "cach": [1, 2, 6, 8, 9, 10, 12, 15, 16], "calcul": 7, "call": [3, 7, 9], "can": [1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "canberra": [12, 15], "cancel": 12, "cancelled_job": 12, "cannot": 14, "capit": [1, 3, 7, 12, 14, 15], "captur": [12, 15], "case": [8, 17], "cb3d35cd73d4": 12, "cd": [4, 10, 13], "ce58": 12, "center": 12, "chain": 9, "chang": [11, 16], "charact": 7, "character_gen": 7, "character_regex": 7, "chat": [1, 7, 14, 15], "chat_exampl": 7, "chat_templ": [5, 6, 12, 15], "chatcomplet": [12, 15], "chatcompletionmessag": [12, 15], "chatglm": 1, "chatml": [1, 5, 14], "check": [1, 10, 12], "check_output": 6, "checkpoint": [1, 2, 6, 12, 15], "china": 12, "choic": [7, 9, 12, 15], "choices_method": 3, "chunk": [1, 9, 14], "chunked_prefill_s": [6, 12, 15], "ci": 4, "civil": 12, "class": 14, "clean": 12, "cli": 2, "client": [1, 2, 6, 12, 15], "clone": [0, 10], "cluster": 10, "co": 9, "code": [2, 6, 7, 15], "coher": 15, "color": 2, "com": [2, 10, 13, 14, 16], "come": [8, 14], "command": [1, 2, 4, 6, 10, 11, 12, 15, 16], "commit": 4, "common": [15, 17], "commun": 9, "compar": 11, "comparison": [3, 11], "compat": [5, 7, 9, 14], "compil": [1, 8], "complet": [1, 6, 7, 15], "completion_token": [12, 15], "completion_tokens_detail": [12, 15], "completion_window": 12, "completionchoic": 12, "completionusag": [12, 15], "complex": 7, "comprehend": 15, "comput": [2, 7, 8, 12, 14], "concis": [12, 15], "conda": 10, "condens": 15, "confid": 3, "config": [1, 2], "connect": [7, 10], "consid": [2, 14], "constrain": [8, 9, 14], "constrained_json_whitespace_pattern": [6, 12, 15], "constraint": 7, "contain": 3, "content": [1, 6, 7, 12, 15], "context": 15, "context_len": [6, 12, 15], "context_length": [6, 12, 15], "continu": [7, 9], "contribut": 5, "contributor": 9, "control": 9, "convers": [5, 15], "convert": 11, "cool": 12, "copi": 10, "core": [7, 9], "corpu": 15, "correct": [2, 14], "cost": 12, "could": 14, "count": 12, "countri": [1, 12, 15], "coverag": 11, "cpu": 8, "craft": 12, "creat": [1, 6, 11, 12, 15], "created_at": 12, "creation": 15, "creativ": 12, "crisp": 12, "critic": 2, "ctrl": [6, 12, 15], "cu121": 10, "cuda": [1, 2, 6, 10, 12, 14, 15, 16], "cuda_graph_max_b": [6, 12, 15], "cuda_visible_devic": 16, "curl": [1, 14, 15, 16], "curl_id": 6, "curl_text": 6, "currenli": [1, 8], "current": 12, "custom": [1, 15], "custom_id": 12, "d": [0, 1, 2, 6, 7, 10, 12, 15], "dark": 12, "data": [1, 6, 8, 12, 14, 15], "dataclass": 14, "dataset": [2, 14], "dbrx": 1, "dc9d06d886151707f97d0b78095df9de262fd3c9": 14, "dd4a2fc580ea": 12, "deactiv": 10, "deadlock": 1, "death": 7, "deb": 2, "deceas": 7, "decod": [8, 9, 12, 
14, 15], "decode_unicod": 14, "decor": 7, "decreas": 8, "deep": 12, "deepseek": [1, 9], "def": [1, 3, 7], "default": [1, 3, 5, 8, 10, 14, 17], "defin": [5, 7], "del_respons": 12, "delai": 2, "delet": 12, "depend": 10, "deploi": 10, "deploy": [10, 15], "describ": [3, 14], "descript": [2, 14], "design": [9, 15], "desir": 14, "detail": [12, 14], "detailed_tip": 7, "determin": 3, "detoken": 14, "dev": [1, 16], "devel": 16, "develop": [2, 12, 15], "devic": [1, 6, 10, 12, 15, 16], "devtool": 2, "dict": 14, "diet": 7, "differ": 11, "difficult": 14, "digit": 15, "directli": 1, "directori": 11, "disabl": [1, 2, 14, 17], "disable_cuda_graph": [6, 12, 15], "disable_cuda_graph_pad": [6, 12, 15], "disable_custom_all_reduc": [6, 12, 15], "disable_disk_cach": [6, 12, 15], "disable_flashinf": [6, 12, 15], "disable_flashinfer_sampl": [6, 12, 15], "disable_mla": [6, 12, 15], "disable_nan_detect": [6, 12, 15], "disable_pen": [6, 12, 15], "disable_radix_cach": [6, 12, 15], "disable_regex_jump_forward": [6, 12, 15], "discov": 12, "discuss": 15, "dislik": 14, "dist_init_addr": [6, 12, 15], "distrib_releas": 2, "distribut": [6, 12, 15], "divers": 12, "dn": 7, "do": [2, 8, 12, 14, 16], "doc": [2, 3, 10, 14], "doc_site_path": 0, "dockerfil": 10, "document": [5, 10], "doe": [1, 2, 8], "donald": 3, "done": [14, 16], "down": [3, 6, 12, 15], "download": [2, 14], "dp": 1, "dp_size": [6, 12, 15], "dpkg": 2, "drawback": 14, "dri": 16, "ds_channel_config_path": [6, 12, 15], "ds_heavy_channel_num": [6, 12, 15], "ds_heavy_channel_typ": [6, 12, 15], "ds_heavy_token_num": [6, 12, 15], "ds_sparse_decode_threshold": [6, 12, 15], "dtype": [1, 6, 12, 15], "duck": 3, "due": [3, 8, 17], "dummi": 2, "dump": [6, 12], "durat": [2, 14], "dure": [1, 8, 12, 14, 15], "dusti": 12, "dynam": [1, 2], "e": [2, 10, 11, 12, 16], "e2": 14, "e5": [1, 6, 9], "each": 1, "earli": 8, "earlier": 3, "easi": [9, 11, 17], "easier": 7, "eater": 7, "echo": [2, 16], "edit": 16, "educ": 15, "effici": [1, 9], "either": 14, "element": 12, "eleutherai": 3, "elif": 7, "els": 12, "embed": [1, 9, 12], "embedding_model": 12, "embedding_process": 6, "empti": 1, "enabl": [1, 7, 8, 10, 15], "enable_cache_report": [6, 12, 15], "enable_double_spars": [6, 12, 15], "enable_mixed_chunk": [6, 12, 15], "enable_overlap_schedul": [6, 12, 15], "enable_p2p_check": [6, 12, 15], "enable_torch_compil": [6, 12, 15], "encod": [6, 14], "encount": 10, "encourag": [12, 14], "end": [6, 7, 11, 12, 14, 15], "endpoint": [1, 10, 12, 14], "engag": 15, "engin": 7, "enough": [1, 8], "entir": 15, "entryclass": 11, "enumer": 7, "env": 10, "environ": [1, 6, 16], "eo": [8, 14], "equival": [6, 15], "error": [1, 8, 12], "especi": 8, "etc": [2, 9], "eth0": 1, "even": [3, 12, 15], "everi": 14, "exampl": [1, 3, 6, 11, 12, 15, 16], "example_imag": 14, "exaon": 1, "except": 12, "excl": 14, "exec": 2, "execut": [10, 15], "exercis": 7, "exist": 11, "expand": 7, "experiment": 8, "explor": 12, "export": [0, 1, 7, 16], "express": [7, 14], "extend": 3, "extens": [9, 11], "extern": [7, 9], "extra": 14, "f": [1, 6, 7, 12], "fa4ddf26": 12, "face": [1, 5], "fail": [3, 12], "failur": 10, "fals": [6, 12, 14, 15], "far": 14, "fast": 9, "faster": 9, "favor": 8, "fcf": 8, "featur": [1, 9], "feel": [12, 15], "fetch": 2, "file": [0, 2, 4, 11, 12, 14], "file_respons": 12, "file_storage_pth": [6, 12, 15], "fill": [7, 12, 15], "fillmor": 3, "final": 12, "find": [7, 11, 14], "finish": [6, 12, 15], "finish_reason": [12, 15], "fire": [6, 12, 15], "first": [1, 2, 6, 7, 8, 14], "fix": 17, "flashinf": [6, 9, 10, 12, 15], "flexibl": 
9, "float": 14, "float16": 6, "floral": 12, "flow": 9, "fluenci": 12, "flush": [7, 14], "focus": 12, "folder": [2, 4, 16], "follow": [1, 2, 5, 6, 7, 8, 11, 14, 16], "forev": 16, "fork": [2, 7], "format": [2, 6, 7, 12, 14, 15], "forward": [9, 11], "forward_batch": 11, "found": 7, "fp16": 1, "fp8": [1, 8, 9], "fp8_e5m2": 1, "fraction": [1, 14, 17], "framework": 9, "franc": [1, 3, 7, 14], "free": 15, "frequency_penalti": [12, 14], "frequent": 8, "from": [4, 5, 6, 7, 12, 15], "from_pretrain": 6, "frontend": [5, 10], "full": [1, 8, 12], "function": [3, 7, 11], "function_cal": [12, 15], "further": 10, "futur": [1, 11], "g": [2, 10, 11, 16], "gaze": 12, "gb": [6, 12, 15], "gemini": 7, "gemma": [1, 9], "gen": [3, 7, 8, 12, 15], "gener": [0, 1, 9, 12, 14, 15], "generatereqinput": 14, "get": [6, 10, 11, 12, 14, 15], "get_model_info": [6, 12, 15], "git": [10, 16], "github": [0, 10, 14], "give": [11, 16], "given": 14, "glimps": 14, "gloo_socket_ifnam": 1, "gnupg": 2, "good": 8, "googl": 7, "gpt": 7, "gptq": [6, 9, 12, 15], "gpu": [1, 8, 10, 14, 16], "grammar_backend": [6, 12, 15], "graph": [1, 2, 12, 15, 17], "greedy_token_select": 3, "grok": 1, "group": 16, "gryffindor": 7, "gte": [1, 6], "guid": [9, 10, 14, 15], "h": [1, 6, 15], "h100": [10, 14], "ha": [8, 11], "haisgl": 16, "half": 7, "hand": 8, "handl": [1, 2, 14], "happen": 8, "hardwar": 14, "harri": 7, "hasattr": 12, "have": [0, 1, 3, 8, 12, 14, 15], "healthi": [7, 8], "hello": 1, "help": [1, 7, 8, 11, 12, 14, 15], "henryx": 16, "her": 12, "here": [1, 6, 7, 12, 15], "hf": 5, "hf_home": 16, "hf_token": [10, 16], "hf_xxx": 16, "high": [3, 8, 12, 14, 15], "higher": [12, 14], "highest": [3, 7], "highlight": 15, "highlight_text": [6, 12, 15], "historian": 12, "hit": [6, 12, 14, 15], "host": [6, 10, 12, 15], "hostnam": 1, "hous": 7, "how": [1, 3, 4, 7, 9], "howev": 15, "html": [0, 2], "http": [0, 2, 6, 7, 10, 12, 13, 14, 15, 16], "hub": 10, "hufflepuff": 7, "hug": [1, 5], "huggingfac": [10, 11, 16], "human": 15, "hyperparamet": [1, 9], "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17], "id": [12, 14, 15], "idea": 15, "ident": 11, "ignor": 14, "ignore_eo": 14, "im_end": [5, 14], "im_start": [5, 14], "imag": [7, 10, 14], "image_data": 14, "image_fil": 7, "image_id": 10, "image_qa": 7, "implement": [3, 11, 12], "import": [1, 2, 6, 7, 8, 12, 14, 15], "includ": [7, 9, 12, 15], "incorrect": 3, "increas": 8, "incur": 3, "independ": 10, "index": [2, 12, 15], "indic": 8, "indigo": 12, "industri": [9, 15], "inf": 14, "infer": [1, 14], "info": [6, 12, 15], "inform": [7, 14, 15], "infra": 10, "init": [1, 6, 12, 15], "initi": [3, 12, 15], "input": [1, 2, 7, 9, 12, 14], "input_file_id": 12, "input_file_path": 12, "input_id": [6, 14], "input_ids_embed": 6, "insid": 16, "instal": [0, 2, 4, 6, 9, 12, 13, 15, 16], "installationguid": 2, "instanc": 3, "instead": [1, 17], "instruct": [1, 2, 6, 7, 10, 12, 14, 15], "int": 14, "int4": 9, "int4wo": 1, "integr": 9, "intellig": [12, 15], "inter": 14, "interact": 9, "interfac": [9, 11], "internet": 15, "internlm": 1, "interpret": 12, "intfloat": 6, "intuit": 9, "invok": 7, "io": 0, "ip": [1, 7], "ipc": 10, "ipynb": 12, "is_embed": [6, 12, 15], "issu": [7, 10, 17], "itali": 12, "iter_lin": 14, "itl": 14, "its": 3, "iv": 12, "japan": [7, 12, 15], "job": 12, "joke": 12, "json": [1, 2, 5, 6, 12, 14, 15], "json_decod": 7, "json_model_override_arg": [6, 12, 15], "json_output": 7, "json_schema": 14, "jsonl": 12, "jump": 9, "just": [5, 12], "k": 14, "k8": 10, "keep": 12, "kei": [2, 7, 12, 15], "kernel": [9, 10, 17], "kfd": 16, 
"kingdom": 7, "knowledg": 12, "kv": [1, 8], "kv_cache_dtyp": [6, 12, 15], "l": 14, "l4": 10, "l40": 10, "lab": [1, 14], "label": 16, "land": 12, "landscap": 12, "lang": 14, "languag": [5, 9, 10, 12, 15], "larg": [1, 2, 8, 9, 15], "last": 10, "late": 12, "later": [3, 16], "latest": 10, "lauch_sglang_serv": [6, 12, 15], "launch": [1, 2, 5, 7, 9, 10, 14, 17], "launch_serv": [1, 2, 5, 6, 7, 10, 12, 14, 15], "layer": 11, "layer_id": 11, "learn": [1, 4, 11, 15], "least": 14, "len": [1, 2, 14], "length": [7, 12, 14], "less": 12, "let": 1, "level": [12, 14], "librari": 7, "life": 12, "light": 12, "like": [8, 15], "limit": 3, "line": [12, 15], "lint": 4, "linux": 16, "list": [1, 2, 7, 11, 12, 14, 15, 17], "llama": [2, 5, 7, 9, 10, 11, 12, 14, 15], "llama3": 1, "llamaforcausallm": [12, 15], "llava": [1, 9, 14], "llava_llama_3": 1, "llm": [1, 3, 9, 15], "lm_eval": [6, 12, 15], "lmm": [1, 14], "lmsysorg": 10, "load": [1, 2, 6, 8, 12, 14, 15], "load_balance_method": [6, 12, 15], "load_format": [6, 12, 15], "load_imag": 14, "local": 10, "local_example_llava_next": 7, "localhost": [0, 1, 6, 7, 12, 14, 15], "locat": 14, "log": [7, 8], "log_level": [6, 12, 15], "log_level_http": [6, 12, 15], "log_request": [6, 12, 15], "logic": 14, "logit": [7, 11, 14], "logitsprocessor": 11, "logprob": [3, 12, 14, 15], "logprob_start_len": 14, "london": 3, "long": [1, 12, 15], "longer": [3, 12], "longest": 8, "look": [5, 8], "loop": 7, "lora_path": [6, 12, 15], "low": 14, "lower": [8, 12], "lpm": [6, 8, 12, 15], "lsb": 2, "lt": [6, 12, 15], "lung": 12, "lyra": 12, "m": [0, 1, 2, 5, 6, 7, 10, 11, 12, 14, 15], "machin": 10, "magic": 7, "mai": [1, 2, 6, 7, 12, 15, 17], "main": [1, 14, 15], "maintain": 11, "major": [11, 12], "make": [0, 8, 9, 11, 12, 14], "manag": 7, "mani": [3, 8, 11], "manner": 14, "mask": 7, "massiv": 15, "match": 8, "matched_stop": [12, 15], "math": 7, "max": 14, "max_check": 12, "max_loras_per_batch": [6, 12, 15], "max_new_token": [1, 8, 14], "max_prefill_token": [6, 12, 15], "max_running_request": [6, 12, 15], "max_token": [1, 7, 12, 15], "max_total_num_token": [6, 12, 15], "max_total_token": [6, 12, 15], "maximum": 14, "md": 4, "me": 12, "mean": [8, 14, 15], "meanwhil": 5, "measur": 14, "media": 15, "median": 14, "meet": 1, "mem": [1, 6, 12, 14, 15, 17], "mem_fraction_stat": [6, 12, 15], "memori": [1, 2, 6, 12, 15], "messag": [1, 7, 12, 15], "meta": [1, 2, 5, 7, 10, 12, 14, 15], "method": [9, 12], "mild": 12, "millard": 3, "min_new_token": 14, "min_p": 14, "minicpm": 1, "ministri": 7, "minut": [12, 15], "mislead": 3, "miss": 5, "mission": 12, "mistral": [1, 6, 9], "mix": [12, 14], "mixtral": 1, "modal": [1, 9], "mode": 12, "model": [2, 3, 5, 8, 9, 10, 12, 14, 15, 16], "model_path": [1, 6, 12, 15], "moder": 12, "moe": 1, "monitor": 12, "more": [1, 9, 10, 12, 14], "most": [5, 8, 11], "mount": 16, "muggl": 7, "mulit": 7, "multi": [1, 9], "multi_turn_quest": 7, "multipl": [1, 12], "multipli": 14, "must": 14, "my": 1, "my_model": 5, "my_model_templ": 5, "myself": 15, "n": [7, 12, 14, 15], "n1": [12, 15], "n2": [12, 15], "n3": [12, 15], "n4": 15, "name": [1, 2, 3, 5, 7, 14, 16], "natur": 12, "nbecaus": 12, "nccl": 1, "ndescrib": 14, "need": [2, 5, 7, 10, 11, 15, 16], "nemo": 1, "nest": 7, "new": [1, 6, 8, 9, 12, 13, 15, 16], "new_token_ratio": 8, "newli": 12, "next": [1, 6, 12, 15], "ngener": 1, "nif": 15, "nlarg": 15, "nlist": 12, "nlp": [1, 6], "nlyra": 12, "nnode": [1, 6, 12, 15], "node": [1, 2], "node_rank": [6, 12, 15], "non": 7, "none": [6, 12, 14, 15], "normal": 7, "note": [1, 2, 5, 11, 14, 16], 
"novel": 12, "now": 7, "npython": 12, "nsome": 15, "nsy": 2, "nuanc": 15, "null": [10, 15], "num": [1, 2, 14], "num_continuous_decode_step": [6, 12, 15], "number": [8, 14], "nvidia": [2, 14, 16], "nvtx": 2, "nyou": 14, "o": [2, 6, 14, 16], "object": [12, 15], "obtain": 3, "occasion": 8, "occup": 7, "offer": 9, "offici": 5, "offlin": 1, "often": 15, "ok": [6, 12, 15], "okai": 8, "olmo": 1, "omit": 3, "onc": [1, 3, 6, 15], "one": [3, 7, 12, 14, 15], "onevis": [1, 14], "onli": [2, 3, 7, 10, 11, 12, 14], "onlin": [1, 2], "only_run": 11, "onto": 12, "oom": [8, 14], "open": [9, 10, 12], "openai": [3, 5, 9, 10, 14], "openai_api_kei": [7, 16], "oper": 10, "optim": 17, "option": [3, 14], "order": 7, "other": [3, 8, 10, 11, 14, 15], "otherworldli": 12, "out": [1, 2, 7, 10, 12, 17], "outlin": [6, 12, 15], "output": [1, 2, 11, 12, 14], "output_file_id": 12, "ov": [1, 14], "overhead": [8, 14], "overlap": [3, 8], "overrid": 5, "own": [1, 10, 14], "ozon": 12, "p": [10, 14], "p2p": 1, "p99": 14, "page": [9, 17], "paragraph": 7, "parallel": [1, 8, 9, 14], "paramet": [8, 9], "pari": 3, "part": 11, "pass": [4, 7, 11], "path": [0, 1, 2, 3, 5, 6, 7, 10, 12, 14, 15], "patronu": 7, "pattern": 15, "peer": 1, "penal": 14, "penalti": 12, "per": 14, "perform": 3, "phoenix": 7, "phrase": 12, "pip": [0, 2, 13, 16], "pip3": 4, "plan": 10, "planet": 12, "playground": 11, "pleas": [1, 7, 10], "png": 14, "point": 15, "pool": [1, 6, 8, 12, 15], "poorli": 3, "popular": 12, "port": [1, 5, 6, 7, 10, 12, 14, 15], "post": [6, 12, 14, 15], "post2": 10, "post3_vllm0": 16, "potenti": 15, "potter": 7, "pre": 4, "predict": 3, "prefer": 12, "prefil": [1, 2, 6, 9, 11, 12, 15], "prefix": [8, 9], "prerequisit": 2, "presence_penalti": [12, 14], "presid": [1, 3], "press": [6, 12, 15], "prev": 14, "primit": [3, 7], "print": [1, 2, 7, 12, 14], "probabl": 7, "proceed": [6, 12, 15], "process": [6, 12, 14, 15], "profil": 9, "program": [9, 10, 12], "programm": 12, "progress_bar": 7, "project": [0, 5, 10, 13, 14, 16], "prompt": [1, 2, 7, 9, 12, 14], "prompt_token": [12, 15], "prompt_tokens_detail": [12, 15], "proper": 10, "provid": [1, 2, 7, 9, 10, 12, 15], "pub": 2, "pull": 16, "pure": 7, "purpos": 12, "py": [0, 1, 2, 5, 6, 7, 11, 12, 13, 14, 15], "pydant": 7, "pyproject": 13, "python": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15], "python3": [0, 1, 2, 10, 11, 14, 16], "pytorch": [10, 17], "q": 7, "qk": [6, 12, 15], "quantiz": [1, 6, 9, 12, 15], "queri": 1, "question": [7, 15], "question_1": 7, "question_2": 7, "queue": [6, 8, 12, 15], "quick": [2, 9], "quick_start": 7, "quit": [6, 12, 15], "qwen": [1, 9, 11], "qwen2": [1, 6, 11, 14], "qwen2forcausallm": 6, "r": [0, 1, 7], "radix": 2, "radixattent": [9, 11], "rais": [12, 15], "ran": 14, "random": [2, 14], "random_se": [6, 12, 15], "rang": [8, 9, 12], "rank": 1, "rate": [6, 12, 14, 15], "ravenclaw": 7, "raw": 14, "rb": 12, "reach": 14, "read": 12, "reader": 12, "readi": [6, 12, 15], "readm": 4, "readme_exampl": 7, "real": [1, 2], "reason": 12, "recommend": [2, 10, 12, 14, 15], "recoveri": 10, "reduc": [1, 8, 12], "refer": [1, 11, 12], "reference_hf": 11, "refus": [12, 15], "regex": [7, 14], "regist": 5, "regular": [7, 14], "regular_expression_gen": 7, "relat": [5, 10], "relationship": 15, "releas": [2, 10], "relev": 14, "rememb": 6, "remot": 10, "remov": [0, 11], "repeat": 14, "repetit": 12, "repetition_penalti": 14, "replac": [1, 10, 11], "repo": 2, "report": [1, 17], "reproduc": 12, "req": [6, 8, 12, 14, 15], "request": [1, 7, 9, 12, 14], "request_count": 12, "request_id": 12, "requir": 0, "resourc": 
[10, 11], "respons": [1, 3, 6, 12, 14, 15], "restart": 16, "result": [3, 12, 14], "result_cont": 12, "result_file_id": 12, "retoken": 14, "retracted_req": 8, "retriev": 12, "return": 14, "return_logprob": 14, "return_text_in_logprob": 14, "reus": 11, "revolution": 15, "rid": 14, "rm": 16, "rmsnorm": 11, "role": [1, 12, 15], "roll": [6, 12, 15], "rome": 12, "root": 10, "round_robin": [6, 12, 15], "run": [0, 2, 4, 6, 7, 11, 12, 14, 15], "run_batch": 7, "runner_allow_runasroot": 16, "running_request": 14, "runtim": [9, 10], "runtimeendpoint": [3, 7], "safetensor": [6, 12, 15], "same": [1, 2, 6, 7, 11, 14], "sampl": [9, 10, 11, 17], "sampling_backend": [6, 12, 15], "sampling_param": [1, 14], "scale": [10, 14], "scent": 12, "schedule_conserv": [6, 12, 15], "schedule_polici": [6, 12, 15], "schema": [7, 14], "scientif": 12, "script": 11, "search": 7, "second": 12, "secret": 10, "section": [14, 15], "see": [1, 7, 8, 10, 14], "seed": 12, "select": [7, 10], "semant": 15, "send": [1, 8, 9, 12, 14], "send_request": 12, "sensori": 12, "sentenc": 14, "sep": 5, "sep_styl": 5, "seq": [6, 12, 15], "sequenc": 12, "serv": [1, 2, 8, 9, 10, 14], "served_model_nam": [6, 12, 15], "server": [0, 2, 5, 7, 8, 9, 12, 14], "server_arg": [6, 12, 15], "server_process": [12, 15], "serverarg": [6, 12, 15], "servic": [10, 12, 15], "service_ti": [12, 15], "set": [1, 2, 5, 7, 10, 12, 14, 15, 17], "set_default_backend": 7, "sever": [1, 2, 12, 15], "sgl": [0, 1, 3, 7, 10, 13, 14, 16], "sgl0": 16, "sglang": [2, 4, 6, 12, 13, 15, 16], "sglang_is_in_ci": 16, "sglang_storag": [6, 12, 15], "sglang_use_modelscop": 1, "sh": 13, "shade": 12, "shard": [6, 12, 15], "share": [8, 16], "she": 12, "shell": 6, "shm": 16, "short": [12, 14], "shorter": 3, "should": [5, 11], "show": 7, "show_time_cost": [6, 12, 15], "shut": [6, 12, 15], "shutdown": [6, 12, 15], "siluandmul": 11, "similar": [11, 12, 14], "simpl": [7, 12], "simpli": 3, "sinc": [12, 14], "singl": [1, 2, 10, 11, 12, 14], "size": [1, 2, 16], "sk": [7, 16], "skip": 14, "skip_special_token": 14, "skip_tokenizer_init": [6, 12, 15], "sky": [10, 12], "skyserv": 10, "sleep": [12, 16], "slightli": 12, "slytherin": 7, "sm75": 10, "small": [1, 8], "smaller": [1, 17], "smollm": 1, "smooth": 12, "snippet": 2, "so": [1, 2, 14], "social": 15, "societi": 15, "some": [2, 6, 7, 11, 14, 16, 17], "someth": 12, "sometim": 17, "sourc": [2, 9, 15], "space": [12, 14], "spaces_between_special_token": 14, "special": 14, "specif": [1, 10, 11, 15], "specifi": [1, 3, 5, 7, 12, 14, 15, 16], "split": 12, "srt": [9, 10, 14], "stabl": 12, "stablelm": 1, "stai": 7, "stand": [8, 15], "start": [6, 11, 12, 14], "startswith": 14, "startup": [6, 12, 15], "state": [1, 7, 12], "static": [1, 2, 14, 17], "statu": [7, 10, 12], "status_cod": 12, "step": [6, 12, 15], "still": 14, "stop": [7, 8, 12, 14, 15], "stop_str": 5, "stop_token_id": 14, "store": 14, "stori": [12, 15], "str": 14, "strategi": 1, "stream": 1, "stream_interv": [6, 12, 15], "string": [8, 14], "strip": [12, 14], "strong": [3, 12], "strongli": [12, 15], "structur": 9, "student": 7, "subprocess": 6, "subset": 3, "success": 14, "successfulli": 12, "suggest": 8, "summar": 15, "summari": [7, 15], "suppli": [3, 14], "support": [3, 6, 7, 9, 10, 14, 15], "sure": [0, 11, 14], "surfac": 12, "sweetli": 12, "switch": 10, "sxm5": 14, "syntax": 15, "system": [1, 2, 5, 7, 12, 14, 15], "system_fingerprint": [12, 15], "t4": 10, "take": [8, 12, 15], "teacher": 7, "tee": 2, "tell": 12, "temperatur": [1, 7, 12, 14, 15], "templat": [1, 7, 14], "temporarili": 5, "tensor": [1, 9], 
"termin": 10, "terminate_process": [6, 12, 15], "test": [1, 2, 14, 15, 16], "test_generation_model": 11, "test_oth": 11, "test_vision_openai_serv": 1, "testgenerationmodel": 11, "text": [1, 6, 11, 12, 14, 15], "text_complet": 12, "text_embed": 6, "text_it": 7, "text_qa": 7, "thei": [14, 15], "them": [10, 15, 17], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17], "thing": 8, "through": [7, 14], "throughput": [1, 12, 14, 15], "time": [1, 2, 6, 12, 14], "tip": 17, "tip_suggest": 7, "tmp": 16, "todai": 1, "togeth": [1, 8], "tok": 14, "token": [1, 5, 6, 7, 8, 9, 10, 12, 15], "token_id": 14, "token_length_norm": 3, "tokenizer_mod": [6, 12, 15], "tokenizer_path": [6, 12, 15], "tokenizers_parallel": 6, "tokyo": [12, 15], "toml": 13, "too": 8, "took": 12, "tool": 7, "tool_cal": [12, 15], "tool_us": 7, "top": 14, "top_k": 14, "top_logprobs_num": 14, "top_p": [1, 12, 14], "topic": [12, 15], "torch": [1, 6, 8, 12, 15], "torch2": 10, "torch_compile_max_b": [6, 12, 15], "torchao": 1, "torchao_config": [6, 12, 15], "total": [1, 12, 14], "total_token": [12, 15], "tp": 1, "tp0": [6, 12, 15], "tp_size": [6, 12, 15], "tpot": 14, "tr": 2, "trace": 2, "traffic": 14, "train": [2, 15], "transform": [6, 11], "transit": 12, "translat": 15, "triton": 10, "triton_attention_reduce_in_fp32": [6, 12, 15], "troubleshoot": 9, "true": [1, 2, 6, 7, 14, 16], "truncat": [1, 2], "trust_remote_cod": [6, 12, 15], "try": [1, 12, 14, 17], "ttft": 14, "tune": [1, 9, 14], "turbo": 7, "turn": 7, "tutori": 12, "twine": 13, "two": [1, 5, 7, 11, 12], "txt": 0, "type": [1, 6, 12, 15], "u": [3, 15], "ubuntu": 2, "ubuntu1804": 2, "ubuntu22": 16, "unconditional_likelihood_norm": 3, "under": [2, 4, 11], "understand": [11, 15], "union": 14, "unit": [1, 2, 7, 12], "unittest": 11, "until": 14, "up": [6, 10, 12, 15], "updat": [0, 2, 16], "upgrad": 10, "upload": 12, "upload_pypi": 13, "uploaded_fil": 12, "upon": [1, 6], "url": [12, 14], "us": [2, 3, 4, 5, 8, 12, 14, 16], "us_president_exampl": 3, "usabl": [6, 12, 15], "usag": [1, 3, 6, 8, 15], "user": [1, 3, 5, 7, 8, 12, 14, 15], "usual": [12, 14], "utf": [12, 14], "util": [6, 8, 12, 14, 15], "uvicorn": [6, 12, 15], "v": [10, 16], "v0": 10, "v1": [1, 6, 12, 15], "valid": 12, "valu": [1, 8, 14, 17], "valuabl": 11, "variabl": [1, 16], "variant": 2, "variou": [1, 12, 15], "vast": 15, "veri": [8, 11, 12, 14], "verifi": 12, "version": 10, "vertexai": 7, "video": 16, "view": 1, "virtual": 15, "vision": [1, 9], "visit": 0, "vl": 1, "vocab_s": 14, "w": [7, 12], "wa": 12, "wai": [6, 11], "wait": [6, 12, 15], "wait_for_serv": [6, 12, 15], "wand": 7, "want": [1, 14], "warn": 8, "washington": 12, "watchdog_timeout": [6, 12, 15], "we": [1, 12, 14, 15], "web": 12, "weight": [1, 2, 6, 12, 15, 16], "weight_util": [6, 12, 15], "welcom": 5, "well": 11, "were": [12, 14], "what": [3, 7, 12, 15], "when": [5, 7, 8, 12, 14, 17], "where": 3, "whether": 14, "which": [8, 12, 14, 15], "while": [1, 2, 10, 12, 14, 16], "whl": 10, "who": 12, "why": 12, "wide": [9, 12], "within": 7, "without": [2, 10, 12], "wood": 7, "word": [7, 12], "work": [1, 5, 8, 16], "workflow": 7, "workload": 8, "write": [0, 12], "x64": 16, "x86_64": 2, "xvers": 1, "xxx": 16, "xylophia": 12, "y": [2, 16], "yaml": 10, "yi": 1, "yml": 10, "you": [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "your": [0, 1, 5, 7, 9, 10, 12, 14, 15], "zip": 1}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Benchmark and Profiling", "Choices Methods in SGLang", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Embedding Model", 
"Frontend: Structured Generation Language (SGLang)", "Guide on Hyperparameter Tuning", "SGLang Documentation", "Install SGLang", "How to Support a New Model", "OpenAI Compatible API", "PyPI Package Release Process", "Sampling Parameters in SGLang Runtime", "Quick Start: Launch A Server and Send Requests", "Set Up Self-hosted Runners for GitHub Action", "Troubleshooting"], "titleterms": {"1": [1, 10, 16], "2": [10, 16], "3": [1, 10, 16], "4": 10, "405b": 1, "5": 10, "A": [6, 15], "The": 17, "With": 10, "access": 17, "achiev": 8, "action": 16, "add": [4, 11, 16], "addit": 1, "advanc": 8, "all": 14, "an": 17, "api": [1, 6, 12, 15], "argument": 1, "avoid": 8, "backend": [1, 9], "baselin": 14, "batch": [7, 12], "benchmark": [1, 2, 14], "build": 0, "chat": [5, 12], "choic": 3, "chunk": 8, "clean": 0, "cloud": 10, "code": [4, 13], "common": 10, "compat": [1, 6, 12, 15], "complet": 12, "compos": 10, "config": 16, "configur": 16, "conserv": 8, "constrain": 7, "contain": 16, "contributor": 4, "control": 7, "correct": 11, "cuda": 17, "curl": 6, "custom": 5, "debug": 11, "decod": 7, "depend": 0, "deploi": 0, "detail": 7, "docker": [10, 16], "document": [0, 9], "dp": 8, "embed": 6, "encount": 17, "engin": 1, "error": 17, "exampl": [7, 14], "featur": 7, "flow": 7, "format": 4, "fraction": 8, "frequenc": 14, "from": [1, 10, 11], "frontend": [7, 9], "gener": 7, "get": 9, "github": [13, 16], "greedi": 3, "guid": [4, 8], "hang": 17, "host": 16, "how": 11, "http": 1, "hyperparamet": 8, "id": 6, "illeg": 17, "implement": 7, "implic": 14, "input": 6, "instal": 10, "interact": 11, "json": 7, "kubernet": 10, "languag": 7, "latenc": 14, "launch": [6, 15], "length": 3, "likelihood": 3, "llama": 1, "local": 7, "make": 13, "max": 8, "mem": 8, "memori": [8, 14, 17], "method": [3, 10], "min": 14, "modal": [7, 14], "model": [1, 6, 7, 11], "modelscop": 1, "more": 7, "multi": [7, 14], "new": [11, 14], "normal": [3, 14], "note": 10, "nsight": 2, "openai": [1, 6, 7, 12, 15], "option": 8, "other": 2, "out": 8, "packag": 13, "parallel": 7, "paramet": [12, 14], "peak": 8, "penalti": 14, "perform": [1, 14], "pip": 10, "polici": 8, "port": 11, "prefil": 8, "presenc": 14, "preview": 0, "process": 13, "profil": 2, "pypi": 13, "quick": [1, 7, 15], "refer": 9, "releas": 13, "repetit": 14, "request": [8, 15], "role": 7, "run": [1, 8, 10, 16], "runner": 16, "runtim": [1, 5, 14], "sampl": 14, "schedul": 8, "select": 3, "self": 16, "send": 15, "serv": 0, "server": [1, 6, 15, 17], "set": 16, "sglang": [0, 1, 3, 5, 7, 9, 10, 11, 14], "sh": 16, "size": 8, "skypilot": 10, "sourc": 10, "speed": 8, "srt": 1, "start": [1, 7, 9, 15, 16], "static": 8, "step": 16, "stream": [7, 14], "structur": 7, "submiss": 8, "suit": 11, "support": [1, 11], "templat": 5, "test": [4, 11], "throughput": 8, "tip": [2, 7], "togeth": 14, "token": [3, 14], "tp": 8, "troubleshoot": 17, "try": 8, "tune": 8, "tutori": 9, "uncondit": 3, "unit": 4, "up": 16, "updat": 13, "upload": 13, "us": [1, 6, 7, 10, 15], "usag": 12, "version": 13, "vllm": 11, "wa": 17, "websit": 0, "without": 1, "your": [4, 8]}}) \ No newline at end of file diff --git a/send_request.html b/send_request.html index 10e8c2a..95f2121 100644 --- a/send_request.html +++ b/send_request.html @@ -33,8 +33,9 @@ + - + @@ -59,7 +60,7 @@ - + @@ -411,14 +412,18 @@

Contents

-
+

Quick Start: Launch A Server and Send Requests#

This section provides a quick start guide to using SGLang after installation.

Launch a server#

This code block is equivalent to executing

python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
---port 30000 --host 0.0.0.0 --log-level warning
+--port 30000 --host 0.0.0.0
 

in your command line and waiting for the server to be ready.
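For readers following along outside the notebook, here is a minimal sketch of what this launch-and-wait step amounts to. It is an illustration, not the notebook's own helper implementation: the use of `subprocess` is an assumption, while polling `/v1/models` as the readiness probe mirrors the `GET /v1/models` request visible in the server logs further down.

```python
# Minimal sketch: start the server as a subprocess and poll until it responds.
# Assumption: /v1/models (seen in the logs below) is used as the readiness probe;
# the notebook's own helpers may work differently.
import subprocess
import time

import requests

server = subprocess.Popen(
    "python -m sglang.launch_server "
    "--model-path meta-llama/Meta-Llama-3.1-8B-Instruct "
    "--port 30000 --host 0.0.0.0",
    shell=True,
)

while True:
    try:
        if requests.get("http://localhost:30000/v1/models", timeout=5).status_code == 200:
            break
    except requests.exceptions.RequestException:
        pass  # server not up yet, keep polling
    time.sleep(1)

print("Server is ready. Proceeding with the next steps.")
```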

@@ -426,29 +431,55 @@

Launch a server
[1]:
 
-
from sglang.utils import execute_shell_command, wait_for_server, terminate_process
+
from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text
 
 
-server_process = execute_shell_command(
+server_process = lauch_sglang_server(
     """
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
---port 30000 --host 0.0.0.0 --log-level warning
+--port 30000 --host 0.0.0.0
 """
 )
 
 wait_for_server("http://localhost:30000")
-print("Server is ready. Proceeding with the next steps.")
+highlight_text("Server is ready. Proceeding with the next steps.")
 
-
+
-Server is ready. Proceeding with the next steps.
+[2024-10-28 09:17:45] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=347192970, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)
+[2024-10-28 09:18:01 TP0] Init torch distributed begin.
+[2024-10-28 09:18:01 TP0] Load weight begin. avail mem=78.59 GB
+[2024-10-28 09:18:02 TP0] lm_eval is not installed, GPTQ may not be usable
+INFO 10-28 09:18:02 weight_utils.py:243] Using model weights format ['*.safetensors']
+Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
+Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.24it/s]
+Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.16it/s]
+Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.17it/s]
+Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.57it/s]
+Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.40it/s]
+
+[2024-10-28 09:18:05 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=63.50 GB
+[2024-10-28 09:18:05 TP0] Memory pool end. avail mem=8.37 GB
+[2024-10-28 09:18:05 TP0] Capture cuda graph begin. This can take up to several minutes.
+[2024-10-28 09:18:12 TP0] max_total_num_tokens=442913, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072
+[2024-10-28 09:18:12] INFO:     Started server process [511197]
+[2024-10-28 09:18:12] INFO:     Waiting for application startup.
+[2024-10-28 09:18:12] INFO:     Application startup complete.
+[2024-10-28 09:18:12] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)
+[2024-10-28 09:18:13] INFO:     127.0.0.1:35516 - "GET /v1/models HTTP/1.1" 200 OK
 
+
+
+
+
+Server is ready. Proceeding with the next steps.
+

Send a Request#

@@ -469,7 +500,21 @@

Send a Request
-{"id":"6ae7fabfd4c54054a8017e2aa7c6bc5a","object":"chat.completion","created":1730071553,"model":"meta-llama/Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and structures of language.\n\nLarge Language Models are typically characterized by their ability to:\n\n1. **Understand natural language**: LLMs can comprehend and interpret human language, including nuances, idioms, and context.\n2. **Generate text**: LLMs can create coherent and context-specific text, such as responses to questions, summaries of articles, or even entire stories.\n3. **Answer questions**: LLMs can provide accurate and informative answers to a wide range of questions, from simple facts to complex topics.\n4. **Translate languages**: LLMs can translate text from one language to another, often with high accuracy.\n5. **Summarize content**: LLMs can condense long pieces of text into shorter, more digestible summaries.\n\nThe core of an LLM is its **neural network architecture**, which is composed of multiple layers of interconnected nodes (neurons) that process and transform the input data. This architecture allows LLMs to learn complex patterns and relationships in language, enabling them to generate human-like text.\n\nSome popular examples of LLMs include:\n\n* **Chatbots**: Virtual assistants that use LLMs to understand and respond to user queries.\n* **Language translation tools**: Services that use LLMs to translate text from one language to another.\n* **Content generation platforms**: Tools that use LLMs to generate text, such as articles, social media posts, or even entire books.\n* **Virtual assistants**: AI-powered assistants, like Siri, Alexa, or Google Assistant, that use LLMs to understand and respond to user queries.\n\nOverall, LLMs have revolutionized the field of natural language processing (NLP) and have numerous applications in various industries, from customer service to content creation."},"logprobs":null,"finish_reason":"stop","matched_stop":128009}],"usage":{"prompt_tokens":47,"total_tokens":450,"completion_tokens":403,"prompt_tokens_details":null}}
+[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:18:13] INFO:     127.0.0.1:35536 - "GET /get_model_info HTTP/1.1" 200 OK
+[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0
+[2024-10-28 09:18:13] INFO:     127.0.0.1:35540 - "POST /generate HTTP/1.1" 200 OK
+[2024-10-28 09:18:13] The server is fired up and ready to roll!
+[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 25.58, #queue-req: 0
+[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 139.75, #queue-req: 0
+[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 138.20, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 138.10, #queue-req: 0
+[2024-10-28 09:18:16 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 138.22, #queue-req: 0
+[2024-10-28 09:18:16] INFO:     127.0.0.1:35530 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+{"id":"ad61027db61649d0bd69f6aa901f1d8c","object":"chat.completion","created":1730107096,"model":"meta-llama/Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and nuances of language.\n\nLarge Language Models like myself are trained on a massive corpus of text, often sourced from the internet, books, and other digital sources. This training enables us to:\n\n1. **Understand**: We can comprehend the meaning of text, including context, syntax, and semantics.\n2. **Generate**: We can create coherent and context-specific text, such as responses to questions, articles, or even entire stories.\n3. **Complete**: We can fill in the blanks, summarize long texts, or translate languages.\n\nSome common applications of LLMs include:\n\n1. **Virtual assistants**: Like myself, we can provide information, answer questions, and even engage in conversations.\n2. **Language translation**: We can translate text from one language to another, often with high accuracy.\n3. **Text summarization**: We can condense long texts into concise summaries, highlighting key points and main ideas.\n4. **Content creation**: We can generate text, such as articles, social media posts, or even entire books.\n\nLarge Language Models have the potential to revolutionize various industries, including education, customer service, and content creation. However, they also raise important questions about the role of AI in society, the potential for bias in language models, and the need for responsible AI development and deployment.\n\nIf you have any specific questions or topics you'd like to discuss, feel free to ask!"},"logprobs":null,"finish_reason":"stop","matched_stop":128009}],"usage":{"prompt_tokens":47,"total_tokens":378,"completion_tokens":331,"prompt_tokens_details":null}}
 

@@ -498,19 +543,27 @@

Using OpenAI Compatible API

    temperature=0,
    max_tokens=64,
)
-print(response)
+highlight_text(response)
-
+
-ChatCompletion(id='da93c64364af475cbdd2cb19155fd68d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. **Country:** Japan\n**Capital:** Tokyo\n\n2. **Country:** Australia\n**Capital:** Canberra\n\n3. **Country:** Brazil\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071554, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))
+[2024-10-28 09:18:16 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:18:17 TP0] Decode batch. #running-req: 1, #token: 79, token usage: 0.00, gen throughput (token/s): 46.61, #queue-req: 0
+[2024-10-28 09:18:17] INFO:     127.0.0.1:35554 - "POST /v1/chat/completions HTTP/1.1" 200 OK
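For reference, a minimal sketch of the OpenAI-client call behind this output is shown below. The `temperature=0`, `max_tokens=64`, and model name appear in the excerpt above; the `base_url`, the placeholder API key, and the prompt "List 3 countries and their capitals." are assumptions inferred from the local server setup and the reply text.

```python
# Sketch of using the OpenAI-compatible API served at localhost:30000.
# Assumptions: any placeholder API key works against the local server,
# and the prompt is inferred from the reply shown above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response)
```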
 
-

@@ -603,7 +667,7 @@

Using OpenAI Compatible API

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/send_request.ipynb b/send_request.ipynb index ea93b12..ea640a6 100644 --- a/send_request.ipynb +++ b/send_request.ipynb @@ -19,7 +19,7 @@ "\n", "```bash\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "```\n", "\n", "in your command line and wait for the server to be ready." @@ -30,10 +30,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:12.782403Z", - "iopub.status.busy": "2024-10-27T23:25:12.781995Z", - "iopub.status.idle": "2024-10-27T23:25:50.292760Z", - "shell.execute_reply": "2024-10-27T23:25:50.291723Z" + "iopub.execute_input": "2024-10-28T09:17:35.325923Z", + "iopub.status.busy": "2024-10-28T09:17:35.325748Z", + "iopub.status.idle": "2024-10-28T09:18:13.770765Z", + "shell.execute_reply": "2024-10-28T09:18:13.770130Z" } }, "outputs": [ @@ -41,23 +41,127 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:17:45] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=347192970, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:02 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:18:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", "\n", - "server_process = execute_shell_command(\n", + "server_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Server is ready. Proceeding with the next steps.\")" ] }, { @@ -74,10 +178,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:50.328286Z", - "iopub.status.busy": "2024-10-27T23:25:50.327797Z", - "iopub.status.idle": "2024-10-27T23:25:53.479602Z", - "shell.execute_reply": "2024-10-27T23:25:53.478670Z" + "iopub.execute_input": "2024-10-28T09:18:13.772846Z", + "iopub.status.busy": "2024-10-28T09:18:13.772593Z", + "iopub.status.idle": "2024-10-28T09:18:16.416442Z", + "shell.execute_reply": "2024-10-28T09:18:16.415708Z" } }, "outputs": [ @@ -85,7 +189,87 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"id\":\"6ae7fabfd4c54054a8017e2aa7c6bc5a\",\"object\":\"chat.completion\",\"created\":1730071553,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and structures of language.\\n\\nLarge Language Models are typically characterized by their ability to:\\n\\n1. **Understand natural language**: LLMs can comprehend and interpret human language, including nuances, idioms, and context.\\n2. **Generate text**: LLMs can create coherent and context-specific text, such as responses to questions, summaries of articles, or even entire stories.\\n3. **Answer questions**: LLMs can provide accurate and informative answers to a wide range of questions, from simple facts to complex topics.\\n4. **Translate languages**: LLMs can translate text from one language to another, often with high accuracy.\\n5. **Summarize content**: LLMs can condense long pieces of text into shorter, more digestible summaries.\\n\\nThe core of an LLM is its **neural network architecture**, which is composed of multiple layers of interconnected nodes (neurons) that process and transform the input data. 
This architecture allows LLMs to learn complex patterns and relationships in language, enabling them to generate human-like text.\\n\\nSome popular examples of LLMs include:\\n\\n* **Chatbots**: Virtual assistants that use LLMs to understand and respond to user queries.\\n* **Language translation tools**: Services that use LLMs to translate text from one language to another.\\n* **Content generation platforms**: Tools that use LLMs to generate text, such as articles, social media posts, or even entire books.\\n* **Virtual assistants**: AI-powered assistants, like Siri, Alexa, or Google Assistant, that use LLMs to understand and respond to user queries.\\n\\nOverall, LLMs have revolutionized the field of natural language processing (NLP) and have numerous applications in various industries, from customer service to content creation.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":450,\"completion_tokens\":403,\"prompt_tokens_details\":null}}" + "[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35536 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35540 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:18:13] The server is fired up and ready to roll!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 25.58, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 139.75, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 138.20, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 138.10, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16 TP0] Decode batch. 
#running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 138.22, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16] INFO: 127.0.0.1:35530 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "{\"id\":\"ad61027db61649d0bd69f6aa901f1d8c\",\"object\":\"chat.completion\",\"created\":1730107096,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and nuances of language.\\n\\nLarge Language Models like myself are trained on a massive corpus of text, often sourced from the internet, books, and other digital sources. This training enables us to:\\n\\n1. **Understand**: We can comprehend the meaning of text, including context, syntax, and semantics.\\n2. **Generate**: We can create coherent and context-specific text, such as responses to questions, articles, or even entire stories.\\n3. **Complete**: We can fill in the blanks, summarize long texts, or translate languages.\\n\\nSome common applications of LLMs include:\\n\\n1. **Virtual assistants**: Like myself, we can provide information, answer questions, and even engage in conversations.\\n2. **Language translation**: We can translate text from one language to another, often with high accuracy.\\n3. **Text summarization**: We can condense long texts into concise summaries, highlighting key points and main ideas.\\n4. **Content creation**: We can generate text, such as articles, social media posts, or even entire books.\\n\\nLarge Language Models have the potential to revolutionize various industries, including education, customer service, and content creation. However, they also raise important questions about the role of AI in society, the potential for bias in language models, and the need for responsible AI development and deployment.\\n\\nIf you have any specific questions or topics you'd like to discuss, feel free to ask!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":378,\"completion_tokens\":331,\"prompt_tokens_details\":null}}" ] } ], @@ -110,10 +294,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:53.481936Z", - "iopub.status.busy": "2024-10-27T23:25:53.481707Z", - "iopub.status.idle": "2024-10-27T23:25:54.273214Z", - "shell.execute_reply": "2024-10-27T23:25:54.272434Z" + "iopub.execute_input": "2024-10-28T09:18:16.418642Z", + "iopub.status.busy": "2024-10-28T09:18:16.418313Z", + "iopub.status.idle": "2024-10-28T09:18:17.213494Z", + "shell.execute_reply": "2024-10-28T09:18:17.212929Z" } }, "outputs": [ @@ -121,8 +305,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='da93c64364af475cbdd2cb19155fd68d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. 
**Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071554, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:18:16 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17 TP0] Decode batch. #running-req: 1, #token: 79, token usage: 0.00, gen throughput (token/s): 46.61, #queue-req: 0\n", + "[2024-10-28 09:18:17] INFO: 127.0.0.1:35554 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "ChatCompletion(id='29542e83d53f44eea0c01d1f517c4b40', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107097, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -144,7 +348,7 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "highlight_text(response)" ] }, { @@ -152,13 +356,30 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:54.275385Z", - "iopub.status.busy": "2024-10-27T23:25:54.274807Z", - "iopub.status.idle": "2024-10-27T23:25:57.082401Z", - "shell.execute_reply": "2024-10-27T23:25:57.080829Z" + "iopub.execute_input": "2024-10-28T09:18:17.215264Z", + "iopub.status.busy": "2024-10-28T09:18:17.215073Z", + "iopub.status.idle": "2024-10-28T09:18:20.076158Z", + "shell.execute_reply": "2024-10-28T09:18:20.075276Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Shutting down\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:18:17] INFO: Application shutdown complete.\n", + "[2024-10-28 09:18:17] INFO: Finished server process [511197]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/setup_github_runner.html b/setup_github_runner.html index 31a4cad..4ed9f38 100644 --- a/setup_github_runner.html +++ b/setup_github_runner.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -530,7 +531,7 @@

Step 3: Run the runner by

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/troubleshooting.html b/troubleshooting.html index e75cf7f..b9a9cea 100644 --- a/troubleshooting.html +++ b/troubleshooting.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -509,7 +510,7 @@

The server hangs

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.