From 02a2a7134794b1b274b0519dca1daa21b9b52a6e Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 28 Oct 2024 09:18:36 +0000 Subject: [PATCH] Update 2024-10-28 09:18:36 --- README.html | 7 +- _sources/README.md | 1 + _sources/embedding_model.ipynb | 259 ++++++-- _sources/openai_api.ipynb | 1006 +++++++++++++++++++++++++------- _sources/send_request.ipynb | 273 ++++++++- _static/css/custom_log.css | 29 + backend.html | 7 +- benchmark_and_profiling.html | 7 +- choices_methods.html | 7 +- contributor_guide.html | 7 +- custom_chat_template.html | 7 +- embedding_model.html | 111 +++- embedding_model.ipynb | 259 ++++++-- frontend.html | 7 +- genindex.html | 7 +- hyperparameter_tuning.html | 7 +- index.html | 7 +- install.html | 7 +- model_support.html | 7 +- openai_api.html | 561 ++++++++++++------ openai_api.ipynb | 1006 +++++++++++++++++++++++++------- release_process.html | 7 +- sampling_params.html | 7 +- search.html | 7 +- searchindex.js | 2 +- send_request.html | 96 ++- send_request.ipynb | 273 ++++++++- setup_github_runner.html | 7 +- troubleshooting.html | 7 +- 29 files changed, 3179 insertions(+), 816 deletions(-) create mode 100644 _static/css/custom_log.css diff --git a/README.html b/README.html index c136ab6..b6dc768 100644 --- a/README.html +++ b/README.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -523,7 +524,7 @@

 Deploy
-Last updated on Oct 27, 2024.
+Last updated on Oct 28, 2024.

diff --git a/_sources/README.md b/_sources/README.md index 052acbc..5dba730 100644 --- a/_sources/README.md +++ b/_sources/README.md @@ -20,6 +20,7 @@ make clean ### Serve (preview) Run an HTTP server and visit http://localhost:8000 in your browser. + ``` python3 -m http.server --d _build/html ``` diff --git a/_sources/embedding_model.ipynb b/_sources/embedding_model.ipynb index 0370084..d26743c 100644 --- a/_sources/embedding_model.ipynb +++ b/_sources/embedding_model.ipynb @@ -21,7 +21,7 @@ "The following code is equivalent to running this in the shell:\n", "```bash\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "```\n", "\n", "Remember to add `--is-embedding` to the command." @@ -32,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:22:53.085503Z", - "iopub.status.busy": "2024-10-27T23:22:53.085120Z", - "iopub.status.idle": "2024-10-27T23:23:32.527591Z", - "shell.execute_reply": "2024-10-27T23:23:32.526838Z" + "iopub.execute_input": "2024-10-28T09:15:14.536811Z", + "iopub.status.busy": "2024-10-28T09:15:14.536653Z", + "iopub.status.idle": "2024-10-28T09:15:54.999497Z", + "shell.execute_reply": "2024-10-28T09:15:54.998849Z" } }, "outputs": [ @@ -43,23 +43,144 @@ "name": "stdout", "output_type": "stream", "text": [ - "Embedding server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:15:25] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=237179517, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:40 TP0] Init torch distributed begin.\n" + ] + 
}, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] Load weight begin. avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:15:41 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00Embedding server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "embedding_process = execute_shell_command(\n", + "embedding_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30010\")\n", "\n", - "print(\"Embedding server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Embedding server is ready. Proceeding with the next steps.\")" ] }, { @@ -74,10 +195,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:32.562075Z", - "iopub.status.busy": "2024-10-27T23:23:32.561818Z", - "iopub.status.idle": "2024-10-27T23:23:33.771076Z", - "shell.execute_reply": "2024-10-27T23:23:33.770326Z" + "iopub.execute_input": "2024-10-28T09:15:55.001608Z", + "iopub.status.busy": "2024-10-28T09:15:55.001359Z", + "iopub.status.idle": "2024-10-28T09:15:56.216067Z", + "shell.execute_reply": "2024-10-28T09:15:56.215410Z" } }, "outputs": [ @@ -85,8 +206,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]\n" + "[2024-10-28 09:15:55 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:55] INFO: 127.0.0.1:59280 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59274 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -103,7 +251,7 @@ " \"embedding\"\n", "]\n", "\n", - "print(f\"Text embedding (first 10): {text_embedding[:10]}\")" + "highlight_text(f\"Text embedding (first 10): {text_embedding[:10]}\")" ] }, { @@ -118,10 +266,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:33.773259Z", - "iopub.status.busy": "2024-10-27T23:23:33.772776Z", - "iopub.status.idle": "2024-10-27T23:23:34.250269Z", - "shell.execute_reply": "2024-10-27T23:23:34.249623Z" + "iopub.execute_input": "2024-10-28T09:15:56.218030Z", + "iopub.status.busy": "2024-10-28T09:15:56.217835Z", + "iopub.status.idle": "2024-10-28T09:15:56.696733Z", + "shell.execute_reply": "2024-10-28T09:15:56.696187Z" } }, "outputs": [ @@ -129,8 +277,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59290 - \"POST /encode HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:15:56] The server is fired up and ready to roll!\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59300 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -145,7 +314,7 @@ ")\n", "\n", "embedding = response.data[0].embedding[:10]\n", - "print(f\"Text embedding (first 10): {embedding}\")" + "highlight_text(f\"Text embedding (first 10): {embedding}\")" ] }, { @@ -162,10 +331,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:34.252332Z", - "iopub.status.busy": "2024-10-27T23:23:34.251830Z", - "iopub.status.idle": "2024-10-27T23:23:40.028848Z", - "shell.execute_reply": "2024-10-27T23:23:40.028041Z" + "iopub.execute_input": "2024-10-28T09:15:56.698501Z", + "iopub.status.busy": "2024-10-28T09:15:56.698324Z", + "iopub.status.idle": "2024-10-28T09:16:02.484649Z", + "shell.execute_reply": "2024-10-28T09:16:02.483955Z" } }, "outputs": [ @@ -173,8 +342,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:16:02 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:02] INFO: 127.0.0.1:59034 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -196,7 +378,7 @@ " 0\n", "][\"embedding\"]\n", "\n", - "print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" + "highlight_text(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" ] }, { @@ -204,13 +386,24 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:40.031161Z", - "iopub.status.busy": "2024-10-27T23:23:40.030680Z", - "iopub.status.idle": "2024-10-27T23:23:42.843192Z", - "shell.execute_reply": "2024-10-27T23:23:42.842506Z" + "iopub.execute_input": "2024-10-28T09:16:02.486791Z", + "iopub.status.busy": "2024-10-28T09:16:02.486434Z", + "iopub.status.idle": "2024-10-28T09:16:05.293548Z", + "shell.execute_reply": "2024-10-28T09:16:05.292820Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:02] INFO: Shutting down\n", + "[2024-10-28 09:16:02] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:16:02] INFO: Application shutdown complete.\n", + "[2024-10-28 09:16:02] INFO: Finished server process [509328]\n" + ] + } + ], "source": [ "terminate_process(embedding_process)" ] diff --git a/_sources/openai_api.ipynb b/_sources/openai_api.ipynb index 3f07a6b..bcd5c32 
100644 --- a/_sources/openai_api.ipynb +++ b/_sources/openai_api.ipynb @@ -6,7 +6,9 @@ "source": [ "# OpenAI Compatible API\n", "\n", - "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n", + "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", + "\n", + "This tutorial aims at these popular APIs:\n", "\n", "- `chat/completions`\n", "- `completions`\n", @@ -30,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:45.484181Z", - "iopub.status.busy": "2024-10-27T23:23:45.484018Z", - "iopub.status.idle": "2024-10-27T23:24:23.959941Z", - "shell.execute_reply": "2024-10-27T23:24:23.959208Z" + "iopub.execute_input": "2024-10-28T09:16:07.904473Z", + "iopub.status.busy": "2024-10-28T09:16:07.904311Z", + "iopub.status.idle": "2024-10-28T09:16:46.330698Z", + "shell.execute_reply": "2024-10-28T09:16:46.330038Z" } }, "outputs": [ @@ -41,22 +43,124 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:16:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=52609006, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:16:35 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "server_process = execute_shell_command(\n", - " \"\"\"\n", - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", - "\"\"\"\n", + "server_process = lauch_sglang_server(\n", + " command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "\n", + "highlight_text(\"Server is ready. Proceeding with the next steps.\")" ] }, { @@ -64,10 +168,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:23.995371Z", - "iopub.status.busy": "2024-10-27T23:24:23.995106Z", - "iopub.status.idle": "2024-10-27T23:24:24.788840Z", - "shell.execute_reply": "2024-10-27T23:24:24.788201Z" + "iopub.execute_input": "2024-10-28T09:16:46.332812Z", + "iopub.status.busy": "2024-10-28T09:16:46.332554Z", + "iopub.status.idle": "2024-10-28T09:16:47.129366Z", + "shell.execute_reply": "2024-10-28T09:16:47.128802Z" } }, "outputs": [ @@ -75,8 +179,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='77e45b23e9b34ef0a65afd9598521768', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071464, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36690 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36696 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46] The server is fired up and ready to roll!\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. 
#running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 21.55, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='bdb569b5e77147d0b4ebe2a79b451814', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -98,7 +226,8 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -107,25 +236,7 @@ "source": [ "### Parameters\n", "\n", - "The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n", - "\n", - "- `messages`: List of messages in the conversation, each containing `role` and `content`\n", - "- `model`: The model identifier to use for completion\n", - "- `max_tokens`: Maximum number of tokens to generate in the response\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `n`: Number of chat completion choices to generate\n", - "- `stream`: If true, partial message deltas will be sent as they become available\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `top_logprobs`: Number of most likely tokens to return probabilities for\n", - "- `seed`: Random seed for deterministic results\n", - "- `response_format`: Specify the format of the response (e.g., JSON)\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `user`: A unique identifier representing your end-user\n", + "The chat completions API accepts OpenAI Chat Completions API's parameters. 
Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n", "\n", "Here is an example of a detailed chat completion request:" ] @@ -135,10 +246,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.790616Z", - "iopub.status.busy": "2024-10-27T23:24:24.790426Z", - "iopub.status.idle": "2024-10-27T23:24:24.902228Z", - "shell.execute_reply": "2024-10-27T23:24:24.901651Z" + "iopub.execute_input": "2024-10-28T09:16:47.131245Z", + "iopub.status.busy": "2024-10-28T09:16:47.131061Z", + "iopub.status.idle": "2024-10-28T09:16:47.242225Z", + "shell.execute_reply": "2024-10-28T09:16:47.241691Z" } }, "outputs": [ @@ -146,8 +257,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "Ancient Rome's major achievements include:" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='84ab9ffd558f4c5595addde9e7a9b40c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -173,11 +303,9 @@ " frequency_penalty=0.2, # Mild penalty for more natural language\n", " n=1, # Single response is usually more stable\n", " seed=42, # Keep for reproducibility\n", - " stream=True, # Keep streaming for real-time output\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].delta.content or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -188,7 +316,7 @@ "\n", "### Usage\n", "\n", - "Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details." + "Completions API is similar to Chat Completions API, but without the `messages` parameter." ] }, { @@ -196,10 +324,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.903908Z", - "iopub.status.busy": "2024-10-27T23:24:24.903730Z", - "iopub.status.idle": "2024-10-27T23:24:25.361829Z", - "shell.execute_reply": "2024-10-27T23:24:25.361272Z" + "iopub.execute_input": "2024-10-28T09:16:47.243956Z", + "iopub.status.busy": "2024-10-28T09:16:47.243779Z", + "iopub.status.idle": "2024-10-28T09:16:47.703807Z", + "shell.execute_reply": "2024-10-28T09:16:47.703265Z" } }, "outputs": [ @@ -207,8 +335,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Completion(id='50da1b57333242cca0b8c6d8706f94b2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 
3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730071465, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 30, token usage: 0.00, gen throughput (token/s): 108.70, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 70, token usage: 0.00, gen throughput (token/s): 142.82, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='8dd58c0e0eff4036ab377324851c1726', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -220,7 +375,8 @@ " n=1,\n", " stop=None,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -229,26 +385,7 @@ "source": [ "### Parameters\n", "\n", - "The completions API accepts the following parameters:\n", - "\n", - "- `model`: The model identifier to use for completion\n", - "- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n", - "- `best_of`: Number of completions to generate server-side and return the best one\n", - "- `echo`: If true, the prompt will be included in the response\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n", - "- `n`: Number of completion choices to generate\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `seed`: Random seed for deterministic results\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `stream`: If true, partial completion deltas will be sent as they become available\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `suffix`: Text to append to the completion\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `user`: A unique identifier representing your end-user\n", + "The completions API accepts OpenAI Completions API's parameters. 
Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n", "\n", "Here is an example of a detailed completions request:" ] @@ -258,10 +395,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:25.363510Z", - "iopub.status.busy": "2024-10-27T23:24:25.363334Z", - "iopub.status.idle": "2024-10-27T23:24:26.087507Z", - "shell.execute_reply": "2024-10-27T23:24:26.086953Z" + "iopub.execute_input": "2024-10-28T09:16:47.705617Z", + "iopub.status.busy": "2024-10-28T09:16:47.705438Z", + "iopub.status.idle": "2024-10-28T09:16:48.612422Z", + "shell.execute_reply": "2024-10-28T09:16:48.611889Z" } }, "outputs": [ @@ -269,51 +406,42 @@ "name": "stdout", "output_type": "stream", "text": [ - " Be sure to include a new planet, a strange creature, and a discovery that changes everything.\n", - "As Captain Zara Black" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wood pil" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "oted her ship, the Celestial Quest, through the vast expanse of space, she couldn't help but feel a sense" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, gen throughput (token/s): 125.91, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " of excitement" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, gen throughput (token/s): 134.54, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " and trepidation. Her crew had been searching for weeks, scanning the galaxy for any sign of a new planet that fit" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 128, token usage: 0.00, gen throughput (token/s): 133.40, #queue-req: 0\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " their criteria" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ". And finally, after months of searching, they had found it." - ] + "data": { + "text/html": [ + "Response: Completion(id='390b6931283540278af6151e5665b9e6', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As you write, include sensory details to help bring the planet to life for your reader. The space explorer, Lyra, is on a mission to explore the newly discovered planet, Xylophia-IV.\\nLyra stepped out of the landing craft and onto the dusty surface of Xylophia-IV. The sky above was a deep shade of indigo, and the air was crisp with an otherworldly scent – a mix of ozone and something sweetly floral. 
She took a deep breath, feeling the cool breeze fill her lungs as she gazed out at the alien landscape.', matched_stop='\\n\\n')], created=1730107008, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=10, total_tokens=130, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -328,11 +456,9 @@ " frequency_penalty=0.3, # Reduce repetitive phrases\n", " n=1, # Generate one completion\n", " seed=123, # For reproducible results\n", - " stream=True, # Stream the response\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].text or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -357,10 +483,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.089195Z", - "iopub.status.busy": "2024-10-27T23:24:26.089017Z", - "iopub.status.idle": "2024-10-27T23:24:26.169406Z", - "shell.execute_reply": "2024-10-27T23:24:26.168852Z" + "iopub.execute_input": "2024-10-28T09:16:48.614261Z", + "iopub.status.busy": "2024-10-28T09:16:48.614081Z", + "iopub.status.idle": "2024-10-28T09:16:48.695988Z", + "shell.execute_reply": "2024-10-28T09:16:48.695467Z" } }, "outputs": [ @@ -368,8 +494,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Batch job created with ID: batch_a8bb0663-1cc5-487b-b170-d8f2a76dbf60\n" + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/batches HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] + }, + { + "data": { + "text/html": [ + "Batch job created with ID: batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -419,7 +559,7 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Batch job created with ID: {batch_response.id}\")" + "highlight_text(f\"Batch job created with ID: {batch_response.id}\")" ] }, { @@ -427,28 +567,96 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.171258Z", - "iopub.status.busy": "2024-10-27T23:24:26.170832Z", - "iopub.status.idle": "2024-10-27T23:24:29.186895Z", - "shell.execute_reply": "2024-10-27T23:24:29.186293Z" + "iopub.execute_input": "2024-10-28T09:16:48.697904Z", + "iopub.status.busy": "2024-10-28T09:16:48.697486Z", + "iopub.status.idle": "2024-10-28T09:16:51.719102Z", + "shell.execute_reply": "2024-10-28T09:16:51.718503Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:48 TP0] Decode batch. 
#running-req: 1, #token: 78, token usage: 0.00, gen throughput (token/s): 135.43, #queue-req: 0\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Batch job status: validating...trying again in 3 seconds...\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/batches/batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402 HTTP/1.1\" 200 OK\n", "Batch job completed successfully!\n", "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "\n", - "Request request-1:\n", - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n", - "\n", - "Request request-2:\n", - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as:\\n\\n1. **Web Development**: Python is used in web development frameworks like Django and Flask to build fast, scalable, and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n", - "\n", - "Cleaning up files...\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea/content HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Request request-1:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request request-2:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. 
It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Cleaning up files..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"DELETE /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea HTTP/1.1\" 200 OK\n" ] } ], @@ -471,16 +679,16 @@ " ]\n", "\n", " for result in results:\n", - " print(f\"\\nRequest {result['custom_id']}:\")\n", - " print(f\"Response: {result['response']}\")\n", + " highlight_text(f\"Request {result['custom_id']}:\")\n", + " highlight_text(f\"Response: {result['response']}\")\n", "\n", - " print(\"\\nCleaning up files...\")\n", + " highlight_text(\"Cleaning up files...\")\n", " # Only delete the result file ID since file_response is just content\n", " client.files.delete(result_file_id)\n", "else:\n", - " print(f\"Batch job failed with status: {batch_response.status}\")\n", + " highlight_text(f\"Batch job failed with status: {batch_response.status}\")\n", " if hasattr(batch_response, \"errors\"):\n", - " print(f\"Errors: {batch_response.errors}\")" + " highlight_text(f\"Errors: {batch_response.errors}\")" ] }, { @@ -500,10 +708,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:29.188845Z", - "iopub.status.busy": "2024-10-27T23:24:29.188552Z", - "iopub.status.idle": "2024-10-27T23:24:54.305285Z", - "shell.execute_reply": "2024-10-27T23:24:54.304629Z" + "iopub.execute_input": "2024-10-28T09:16:51.720917Z", + "iopub.status.busy": "2024-10-28T09:16:51.720728Z", + "iopub.status.idle": "2024-10-28T09:17:16.852156Z", + "shell.execute_reply": "2024-10-28T09:17:16.851486Z" } }, "outputs": [ @@ -511,89 +719,280 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Initial status: validating\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 7, #new-token: 210, #cached-token: 175, cache hit rate: 41.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 93, #new-token: 2790, #cached-token: 2325, cache hit rate: 45.04%, token usage: 0.00, #running-req: 7, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 6025, token usage: 0.01, gen throughput (token/s): 927.84, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. 
#running-req: 100, #token: 10025, token usage: 0.02, gen throughput (token/s): 10850.25, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 14025, token usage: 0.03, gen throughput (token/s): 10640.61, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 1/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 18025, token usage: 0.04, gen throughput (token/s): 10399.84, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 2/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 22025, token usage: 0.05, gen throughput (token/s): 10192.34, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 3/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 26025, token usage: 0.06, gen throughput (token/s): 9969.00, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 4/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 30025, token usage: 0.07, gen throughput (token/s): 9754.98, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 5/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 34025, token usage: 0.08, gen throughput (token/s): 9570.09, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. 
#running-req: 100, #token: 38025, token usage: 0.09, gen throughput (token/s): 9370.66, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 42025, token usage: 0.09, gen throughput (token/s): 9157.62, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 46025, token usage: 0.10, gen throughput (token/s): 9012.88, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 50025, token usage: 0.11, gen throughput (token/s): 8840.89, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:01] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 1 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:04] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 2 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:07] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 3 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:10] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 4 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: 
backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:13] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 5 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -641,25 +1040,21 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "max_checks = 5\n", "for i in range(max_checks):\n", " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", - " print(f\"Batch job details (check {i+1}/{max_checks}):\")\n", - " print(f\"ID: {batch_details.id}\")\n", - " print(f\"Status: {batch_details.status}\")\n", - " print(f\"Created at: {batch_details.created_at}\")\n", - " print(f\"Input file ID: {batch_details.input_file_id}\")\n", - " print(f\"Output file ID: {batch_details.output_file_id}\")\n", - "\n", - " print(\"Request counts:\")\n", - " print(f\"Total: {batch_details.request_counts.total}\")\n", - " print(f\"Completed: {batch_details.request_counts.completed}\")\n", - " print(f\"Failed: {batch_details.request_counts.failed}\")\n", + "\n", + " highlight_text(\n", + " f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n", + " )\n", + " highlight_text(\n", + " f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n", + " )\n", "\n", " time.sleep(3)" ] @@ -676,10 +1071,10 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:54.307459Z", - "iopub.status.busy": "2024-10-27T23:24:54.307266Z", - "iopub.status.idle": "2024-10-27T23:25:07.414717Z", - "shell.execute_reply": "2024-10-27T23:25:07.413989Z" + "iopub.execute_input": "2024-10-28T09:17:16.854434Z", + "iopub.status.busy": "2024-10-28T09:17:16.854239Z", + "iopub.status.idle": "2024-10-28T09:17:29.967949Z", + "shell.execute_reply": "2024-10-28T09:17:29.967373Z" } }, "outputs": [ @@ -687,25 +1082,187 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_08ed9e0c-386d-4286-b879-eab3380d686a\n", - "Initial status: validating\n" + 
"[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_9c319ff5-29c7-40db-9b8d-9225459caab5" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 39, #new-token: 39, #cached-token: 2106, cache hit rate: 59.51%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 333, #new-token: 8192, #cached-token: 10094, cache hit rate: 56.50%, token usage: 0.01, #running-req: 39, #queue-req: 128\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 129, #new-token: 3869, #cached-token: 3226, cache hit rate: 54.14%, token usage: 0.03, #running-req: 371, #queue-req: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Decode batch. #running-req: 500, #token: 20525, token usage: 0.05, gen throughput (token/s): 395.72, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:18 TP0] Decode batch. #running-req: 500, #token: 40525, token usage: 0.09, gen throughput (token/s): 24587.43, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:19 TP0] Decode batch. #running-req: 500, #token: 60525, token usage: 0.14, gen throughput (token/s): 23385.77, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:20 TP0] Decode batch. #running-req: 500, #token: 80525, token usage: 0.18, gen throughput (token/s): 22312.99, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:21 TP0] Decode batch. #running-req: 500, #token: 100525, token usage: 0.23, gen throughput (token/s): 21433.76, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Cancellation initiated. Status: cancelling\n" + "[2024-10-28 09:17:22 TP0] Decode batch. #running-req: 500, #token: 120525, token usage: 0.27, gen throughput (token/s): 20585.73, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Current status: cancelled\n", - "Batch job successfully cancelled\n", - "Successfully cleaned up input file\n" + "[2024-10-28 09:17:23 TP0] Decode batch. #running-req: 500, #token: 140525, token usage: 0.32, gen throughput (token/s): 19807.72, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:24 TP0] Decode batch. #running-req: 500, #token: 160525, token usage: 0.36, gen throughput (token/s): 19058.59, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:25 TP0] Decode batch. #running-req: 500, #token: 180525, token usage: 0.41, gen throughput (token/s): 18388.08, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26 TP0] Decode batch. 
#running-req: 500, #token: 200525, token usage: 0.45, gen throughput (token/s): 17734.98, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26] INFO: 127.0.0.1:54868 - \"POST /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5/cancel HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Cancellation initiated. Status: cancelling" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"GET /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Current status: cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Batch job successfully cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"DELETE /v1/files/backend_input_file-33df398d-2394-4995-8dd8-890cb3111446 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Successfully cleaned up input file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -753,37 +1310,37 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "try:\n", " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", - " print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n", + " highlight_text(f\"Cancellation initiated. 
Status: {cancelled_job.status}\")\n", " assert cancelled_job.status == \"cancelling\"\n", "\n", " # Monitor the cancellation process\n", " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", " cancelled_job = client.batches.retrieve(batch_job.id)\n", - " print(f\"Current status: {cancelled_job.status}\")\n", + " highlight_text(f\"Current status: {cancelled_job.status}\")\n", "\n", " # Verify final status\n", " assert cancelled_job.status == \"cancelled\"\n", - " print(\"Batch job successfully cancelled\")\n", + " highlight_text(\"Batch job successfully cancelled\")\n", "\n", "except Exception as e:\n", - " print(f\"Error during cancellation: {e}\")\n", + " highlight_text(f\"Error during cancellation: {e}\")\n", " raise e\n", "\n", "finally:\n", " try:\n", " del_response = client.files.delete(uploaded_file.id)\n", " if del_response.deleted:\n", - " print(\"Successfully cleaned up input file\")\n", + " highlight_text(\"Successfully cleaned up input file\")\n", " except Exception as e:\n", - " print(f\"Error cleaning up: {e}\")\n", + " highlight_text(f\"Error cleaning up: {e}\")\n", " raise e" ] }, @@ -792,13 +1349,24 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:07.416667Z", - "iopub.status.busy": "2024-10-27T23:25:07.416471Z", - "iopub.status.idle": "2024-10-27T23:25:10.222119Z", - "shell.execute_reply": "2024-10-27T23:25:10.221434Z" + "iopub.execute_input": "2024-10-28T09:17:29.969798Z", + "iopub.status.busy": "2024-10-28T09:17:29.969613Z", + "iopub.status.idle": "2024-10-28T09:17:32.811800Z", + "shell.execute_reply": "2024-10-28T09:17:32.811092Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: Shutting down\n", + "[2024-10-28 09:17:30] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:17:30] INFO: Application shutdown complete.\n", + "[2024-10-28 09:17:30] INFO: Finished server process [510260]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/_sources/send_request.ipynb b/_sources/send_request.ipynb index ea93b12..ea640a6 100644 --- a/_sources/send_request.ipynb +++ b/_sources/send_request.ipynb @@ -19,7 +19,7 @@ "\n", "```bash\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "```\n", "\n", "in your command line and wait for the server to be ready." @@ -30,10 +30,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:12.782403Z", - "iopub.status.busy": "2024-10-27T23:25:12.781995Z", - "iopub.status.idle": "2024-10-27T23:25:50.292760Z", - "shell.execute_reply": "2024-10-27T23:25:50.291723Z" + "iopub.execute_input": "2024-10-28T09:17:35.325923Z", + "iopub.status.busy": "2024-10-28T09:17:35.325748Z", + "iopub.status.idle": "2024-10-28T09:18:13.770765Z", + "shell.execute_reply": "2024-10-28T09:18:13.770130Z" } }, "outputs": [ @@ -41,23 +41,127 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. 
Proceeding with the next steps.\n" + "[2024-10-28 09:17:45] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=347192970, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Load weight begin. avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:02 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:18:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", "\n", - "server_process = execute_shell_command(\n", + "server_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Server is ready. 
Proceeding with the next steps.\")" ] }, { @@ -74,10 +178,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:50.328286Z", - "iopub.status.busy": "2024-10-27T23:25:50.327797Z", - "iopub.status.idle": "2024-10-27T23:25:53.479602Z", - "shell.execute_reply": "2024-10-27T23:25:53.478670Z" + "iopub.execute_input": "2024-10-28T09:18:13.772846Z", + "iopub.status.busy": "2024-10-28T09:18:13.772593Z", + "iopub.status.idle": "2024-10-28T09:18:16.416442Z", + "shell.execute_reply": "2024-10-28T09:18:16.415708Z" } }, "outputs": [ @@ -85,7 +189,87 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"id\":\"6ae7fabfd4c54054a8017e2aa7c6bc5a\",\"object\":\"chat.completion\",\"created\":1730071553,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and structures of language.\\n\\nLarge Language Models are typically characterized by their ability to:\\n\\n1. **Understand natural language**: LLMs can comprehend and interpret human language, including nuances, idioms, and context.\\n2. **Generate text**: LLMs can create coherent and context-specific text, such as responses to questions, summaries of articles, or even entire stories.\\n3. **Answer questions**: LLMs can provide accurate and informative answers to a wide range of questions, from simple facts to complex topics.\\n4. **Translate languages**: LLMs can translate text from one language to another, often with high accuracy.\\n5. **Summarize content**: LLMs can condense long pieces of text into shorter, more digestible summaries.\\n\\nThe core of an LLM is its **neural network architecture**, which is composed of multiple layers of interconnected nodes (neurons) that process and transform the input data. This architecture allows LLMs to learn complex patterns and relationships in language, enabling them to generate human-like text.\\n\\nSome popular examples of LLMs include:\\n\\n* **Chatbots**: Virtual assistants that use LLMs to understand and respond to user queries.\\n* **Language translation tools**: Services that use LLMs to translate text from one language to another.\\n* **Content generation platforms**: Tools that use LLMs to generate text, such as articles, social media posts, or even entire books.\\n* **Virtual assistants**: AI-powered assistants, like Siri, Alexa, or Google Assistant, that use LLMs to understand and respond to user queries.\\n\\nOverall, LLMs have revolutionized the field of natural language processing (NLP) and have numerous applications in various industries, from customer service to content creation.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":450,\"completion_tokens\":403,\"prompt_tokens_details\":null}}" + "[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35536 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13 TP0] Prefill batch. 
#new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35540 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:18:13] The server is fired up and ready to roll!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 25.58, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 139.75, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 138.20, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 138.10, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 138.22, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16] INFO: 127.0.0.1:35530 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "{\"id\":\"ad61027db61649d0bd69f6aa901f1d8c\",\"object\":\"chat.completion\",\"created\":1730107096,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and nuances of language.\\n\\nLarge Language Models like myself are trained on a massive corpus of text, often sourced from the internet, books, and other digital sources. This training enables us to:\\n\\n1. **Understand**: We can comprehend the meaning of text, including context, syntax, and semantics.\\n2. **Generate**: We can create coherent and context-specific text, such as responses to questions, articles, or even entire stories.\\n3. **Complete**: We can fill in the blanks, summarize long texts, or translate languages.\\n\\nSome common applications of LLMs include:\\n\\n1. **Virtual assistants**: Like myself, we can provide information, answer questions, and even engage in conversations.\\n2. **Language translation**: We can translate text from one language to another, often with high accuracy.\\n3. 
**Text summarization**: We can condense long texts into concise summaries, highlighting key points and main ideas.\\n4. **Content creation**: We can generate text, such as articles, social media posts, or even entire books.\\n\\nLarge Language Models have the potential to revolutionize various industries, including education, customer service, and content creation. However, they also raise important questions about the role of AI in society, the potential for bias in language models, and the need for responsible AI development and deployment.\\n\\nIf you have any specific questions or topics you'd like to discuss, feel free to ask!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":378,\"completion_tokens\":331,\"prompt_tokens_details\":null}}" ] } ], @@ -110,10 +294,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:53.481936Z", - "iopub.status.busy": "2024-10-27T23:25:53.481707Z", - "iopub.status.idle": "2024-10-27T23:25:54.273214Z", - "shell.execute_reply": "2024-10-27T23:25:54.272434Z" + "iopub.execute_input": "2024-10-28T09:18:16.418642Z", + "iopub.status.busy": "2024-10-28T09:18:16.418313Z", + "iopub.status.idle": "2024-10-28T09:18:17.213494Z", + "shell.execute_reply": "2024-10-28T09:18:17.212929Z" } }, "outputs": [ @@ -121,8 +305,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='da93c64364af475cbdd2cb19155fd68d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071554, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:18:16 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17 TP0] Decode batch. #running-req: 1, #token: 79, token usage: 0.00, gen throughput (token/s): 46.61, #queue-req: 0\n", + "[2024-10-28 09:18:17] INFO: 127.0.0.1:35554 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "ChatCompletion(id='29542e83d53f44eea0c01d1f517c4b40', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. 
**Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107097, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -144,7 +348,7 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "highlight_text(response)" ] }, { @@ -152,13 +356,30 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:54.275385Z", - "iopub.status.busy": "2024-10-27T23:25:54.274807Z", - "iopub.status.idle": "2024-10-27T23:25:57.082401Z", - "shell.execute_reply": "2024-10-27T23:25:57.080829Z" + "iopub.execute_input": "2024-10-28T09:18:17.215264Z", + "iopub.status.busy": "2024-10-28T09:18:17.215073Z", + "iopub.status.idle": "2024-10-28T09:18:20.076158Z", + "shell.execute_reply": "2024-10-28T09:18:20.075276Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Shutting down\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:18:17] INFO: Application shutdown complete.\n", + "[2024-10-28 09:18:17] INFO: Finished server process [511197]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/_static/css/custom_log.css b/_static/css/custom_log.css new file mode 100644 index 0000000..86ee951 --- /dev/null +++ b/_static/css/custom_log.css @@ -0,0 +1,29 @@ +.output_area { + color: #615656; +} + +table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.output_area.stderr { + color: #d3d3d3 !important; /* 浅灰色 */ +} + +.output_area.stdout { + color: #d3d3d3 !important; +} + +div.output_area.stderr { + color: #d3d3d3 !important; /* 浅灰色 */ +} + +div.output_area.stdout { + color: #d3d3d3 !important; +} \ No newline at end of file diff --git a/backend.html b/backend.html index c91aa06..ba0d99d 100644 --- a/backend.html +++ b/backend.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -733,7 +734,7 @@

Benchmark Performance

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/benchmark_and_profiling.html b/benchmark_and_profiling.html index f48a351..6ce6710 100644 --- a/benchmark_and_profiling.html +++ b/benchmark_and_profiling.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -555,7 +556,7 @@

Other tips

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/choices_methods.html b/choices_methods.html index 2aa46a8..1acd2ca 100644 --- a/choices_methods.html +++ b/choices_methods.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -570,7 +571,7 @@

Unconditional Likelihood Normalized

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/contributor_guide.html b/contributor_guide.html index a635db9..65be0cf 100644 --- a/contributor_guide.html +++ b/contributor_guide.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -506,7 +507,7 @@

Add Unit Tests

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/custom_chat_template.html b/custom_chat_template.html index f53a65d..f551c17 100644 --- a/custom_chat_template.html +++ b/custom_chat_template.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -469,7 +470,7 @@

Custom Chat Template in SGLang Runtime

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/embedding_model.html b/embedding_model.html index 156f5b6..171b2a8 100644 --- a/embedding_model.html +++ b/embedding_model.html @@ -33,8 +33,9 @@ + - + @@ -58,7 +59,7 @@ - + @@ -411,7 +412,11 @@

Contents

-
+

Embedding Model#

SGLang supports embedding models in the same way as completion models. Here are some example models:

Use Curl#

@@ -473,18 +506,27 @@

Use Curl "embedding" ] -print(f"Text embedding (first 10): {text_embedding[:10]}") +highlight_text(f"Text embedding (first 10): {text_embedding[:10]}") -
+
-Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]
+[2024-10-28 09:15:55 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:15:55] INFO:     127.0.0.1:59280 - "GET /get_model_info HTTP/1.1" 200 OK
+[2024-10-28 09:15:56 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 1, #queue-req: 0
+[2024-10-28 09:15:56] INFO:     127.0.0.1:59274 - "POST /v1/embeddings HTTP/1.1" 200 OK
 
+
+
+
+
+Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]
+

Using OpenAI Compatible API#

@@ -503,18 +545,27 @@

Using OpenAI Compatible API) embedding = response.data[0].embedding[:10] -print(f"Text embedding (first 10): {embedding}") +highlight_text(f"Text embedding (first 10): {embedding}") -
+
-Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]
+[2024-10-28 09:15:56] INFO:     127.0.0.1:59290 - "POST /encode HTTP/1.1" 200 OK
+[2024-10-28 09:15:56] The server is fired up and ready to roll!
+[2024-10-28 09:15:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:15:56] INFO:     127.0.0.1:59300 - "POST /v1/embeddings HTTP/1.1" 200 OK
 
+
+
+
+
+Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]
+

Using Input IDs#

@@ -541,19 +592,26 @@

Using Input IDs0 ]["embedding"] -print(f"Input IDs embedding (first 10): {input_ids_embedding[:10]}") +highlight_text(f"Input IDs embedding (first 10): {input_ids_embedding[:10]}") -

@@ -638,7 +707,7 @@

Using Input IDs

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/embedding_model.ipynb b/embedding_model.ipynb index 0370084..d26743c 100644 --- a/embedding_model.ipynb +++ b/embedding_model.ipynb @@ -21,7 +21,7 @@ "The following code is equivalent to running this in the shell:\n", "```bash\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "```\n", "\n", "Remember to add `--is-embedding` to the command." @@ -32,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:22:53.085503Z", - "iopub.status.busy": "2024-10-27T23:22:53.085120Z", - "iopub.status.idle": "2024-10-27T23:23:32.527591Z", - "shell.execute_reply": "2024-10-27T23:23:32.526838Z" + "iopub.execute_input": "2024-10-28T09:15:14.536811Z", + "iopub.status.busy": "2024-10-28T09:15:14.536653Z", + "iopub.status.idle": "2024-10-28T09:15:54.999497Z", + "shell.execute_reply": "2024-10-28T09:15:54.998849Z" } }, "outputs": [ @@ -43,23 +43,144 @@ "name": "stdout", "output_type": "stream", "text": [ - "Embedding server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:15:25] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=237179517, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:40 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:41 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:15:41 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00Embedding server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "embedding_process = execute_shell_command(\n", + "embedding_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", - " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + " --port 30010 --host 0.0.0.0 --is-embedding\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30010\")\n", "\n", - "print(\"Embedding server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Embedding server is ready. Proceeding with the next steps.\")" ] }, { @@ -74,10 +195,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:32.562075Z", - "iopub.status.busy": "2024-10-27T23:23:32.561818Z", - "iopub.status.idle": "2024-10-27T23:23:33.771076Z", - "shell.execute_reply": "2024-10-27T23:23:33.770326Z" + "iopub.execute_input": "2024-10-28T09:15:55.001608Z", + "iopub.status.busy": "2024-10-28T09:15:55.001359Z", + "iopub.status.idle": "2024-10-28T09:15:56.216067Z", + "shell.execute_reply": "2024-10-28T09:15:56.215410Z" } }, "outputs": [ @@ -85,8 +206,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]\n" + "[2024-10-28 09:15:55 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:55] INFO: 127.0.0.1:59280 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59274 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830841064453125, 0.0006804466247558594, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020999908447265625, 0.006214141845703125, -0.0030345916748046875]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -103,7 +251,7 @@ " \"embedding\"\n", "]\n", "\n", - "print(f\"Text embedding (first 10): {text_embedding[:10]}\")" + "highlight_text(f\"Text embedding (first 10): {text_embedding[:10]}\")" ] }, { @@ -118,10 +266,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:33.773259Z", - "iopub.status.busy": "2024-10-27T23:23:33.772776Z", - "iopub.status.idle": "2024-10-27T23:23:34.250269Z", - "shell.execute_reply": "2024-10-27T23:23:34.249623Z" + "iopub.execute_input": "2024-10-28T09:15:56.218030Z", + "iopub.status.busy": "2024-10-28T09:15:56.217835Z", + "iopub.status.idle": "2024-10-28T09:15:56.696733Z", + "shell.execute_reply": "2024-10-28T09:15:56.696187Z" } }, "outputs": [ @@ -129,8 +277,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59290 - \"POST /encode HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:15:56] The server is fired up and ready to roll!\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:15:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:15:56] INFO: 127.0.0.1:59300 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Text embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -145,7 +314,7 @@ ")\n", "\n", "embedding = response.data[0].embedding[:10]\n", - "print(f\"Text embedding (first 10): {embedding}\")" + "highlight_text(f\"Text embedding (first 10): {embedding}\")" ] }, { @@ -162,10 +331,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:34.252332Z", - "iopub.status.busy": "2024-10-27T23:23:34.251830Z", - "iopub.status.idle": "2024-10-27T23:23:40.028848Z", - "shell.execute_reply": "2024-10-27T23:23:40.028041Z" + "iopub.execute_input": "2024-10-28T09:15:56.698501Z", + "iopub.status.busy": "2024-10-28T09:15:56.698324Z", + "iopub.status.idle": "2024-10-28T09:16:02.484649Z", + "shell.execute_reply": "2024-10-28T09:16:02.483955Z" } }, "outputs": [ @@ -173,8 +342,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]\n" + "[2024-10-28 09:16:02 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:02] INFO: 127.0.0.1:59034 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Input IDs embedding (first 10): [0.00830078125, 0.0006747245788574219, -0.00807952880859375, -0.000682830810546875, 0.01438140869140625, -0.009002685546875, 0.01239013671875, 0.0020961761474609375, 0.006198883056640625, -0.003025054931640625]" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -196,7 +378,7 @@ " 0\n", "][\"embedding\"]\n", "\n", - "print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" + "highlight_text(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" ] }, { @@ -204,13 +386,24 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:40.031161Z", - "iopub.status.busy": "2024-10-27T23:23:40.030680Z", - "iopub.status.idle": "2024-10-27T23:23:42.843192Z", - "shell.execute_reply": "2024-10-27T23:23:42.842506Z" + "iopub.execute_input": "2024-10-28T09:16:02.486791Z", + "iopub.status.busy": "2024-10-28T09:16:02.486434Z", + "iopub.status.idle": "2024-10-28T09:16:05.293548Z", + "shell.execute_reply": "2024-10-28T09:16:05.292820Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:02] INFO: Shutting down\n", + "[2024-10-28 09:16:02] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:16:02] INFO: Application shutdown complete.\n", + "[2024-10-28 09:16:02] INFO: Finished server process [509328]\n" + ] + } + ], "source": [ "terminate_process(embedding_process)" ] diff --git a/frontend.html b/frontend.html index 46dae62..f219b07 100644 --- 
a/frontend.html +++ b/frontend.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -761,7 +762,7 @@

Tips and Implementation Details

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/genindex.html b/genindex.html index ed6d6d0..71c62b4 100644 --- a/genindex.html +++ b/genindex.html @@ -32,7 +32,8 @@ - + + @@ -61,7 +62,7 @@ - + @@ -374,7 +375,7 @@

Index

diff --git a/hyperparameter_tuning.html b/hyperparameter_tuning.html index 78aa95a..b3aee03 100644 --- a/hyperparameter_tuning.html +++ b/hyperparameter_tuning.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -551,7 +552,7 @@

Tune --schedule diff --git a/index.html b/index.html index 2085e38..a7246cb 100644 --- a/index.html +++ b/index.html @@ -33,7 +33,8 @@ - + + @@ -57,7 +58,7 @@ - + @@ -500,7 +501,7 @@

SGLang Documentation

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/install.html b/install.html index 831ca90..e3124f2 100644 --- a/install.html +++ b/install.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -604,7 +605,7 @@

Common Notes

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/model_support.html b/model_support.html index 7e21319..1959fc1 100644 --- a/model_support.html +++ b/model_support.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -537,7 +538,7 @@

Port a model from vLLM to SGLang

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/openai_api.html b/openai_api.html index c39d006..918f43d 100644 --- a/openai_api.html +++ b/openai_api.html @@ -33,8 +33,9 @@ + - + @@ -59,7 +60,7 @@ - + @@ -419,9 +420,14 @@

Contents

-
+

OpenAI Compatible API#

-

SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.

+

SGLang provides an OpenAI-compatible API for a smooth transition from OpenAI services. A full reference of the API is available at the OpenAI API Reference.
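For example, the official `openai` Python client can be pointed at a local SGLang server by overriding the base URL. This is a minimal sketch, assuming a server launched on port 30000 as in the rest of this tutorial; that server is started without an API key, so any placeholder value works:

```python
import openai

# Point the standard OpenAI client at the local SGLang server (assumed to run on port 30000).
client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")

# A small chat completion request against the locally served model.
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```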

+

This tutorial covers these popular APIs:

  • chat/completions

  • completions

  • @@ -437,28 +443,52 @@

    Usage#<
    [1]:
     
    -
    from sglang.utils import execute_shell_command, wait_for_server, terminate_process
    +
    from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text
     
    -server_process = execute_shell_command(
    -    """
    -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
    ---port 30000 --host 0.0.0.0 --log-level warning
    -"""
    +server_process = lauch_sglang_server(
    +    command="python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0"
     )
     
     wait_for_server("http://localhost:30000")
    -print("Server is ready. Proceeding with the next steps.")
    +
    +highlight_text("Server is ready. Proceeding with the next steps.")
     
    -
    +
    -Server is ready. Proceeding with the next steps.
    +[2024-10-28 09:16:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=52609006, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)
    +[2024-10-28 09:16:34 TP0] Init torch distributed begin.
    +[2024-10-28 09:16:34 TP0] Load weight begin. avail mem=78.59 GB
    +[2024-10-28 09:16:34 TP0] lm_eval is not installed, GPTQ may not be usable
    +INFO 10-28 09:16:35 weight_utils.py:243] Using model weights format ['*.safetensors']
    +Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
    +Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.21it/s]
    +Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.12it/s]
    +Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.12it/s]
    +Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.51it/s]
    +Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.35it/s]
    +
    +[2024-10-28 09:16:38 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=63.50 GB
    +[2024-10-28 09:16:38 TP0] Memory pool end. avail mem=8.37 GB
    +[2024-10-28 09:16:38 TP0] Capture cuda graph begin. This can take up to several minutes.
    +[2024-10-28 09:16:45 TP0] max_total_num_tokens=442913, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072
    +[2024-10-28 09:16:45] INFO:     Started server process [510260]
    +[2024-10-28 09:16:45] INFO:     Waiting for application startup.
    +[2024-10-28 09:16:45] INFO:     Application startup complete.
    +[2024-10-28 09:16:45] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)
    +[2024-10-28 09:16:46] INFO:     127.0.0.1:36680 - "GET /v1/models HTTP/1.1" 200 OK
     
    +
    +
    +
    +
    +Server is ready. Proceeding with the next steps.
    +
    [2]:
     
    @@ -481,41 +511,35 @@

    Usage#< temperature=0, max_tokens=64, ) -print(response) + +highlight_text(f"Response: {response}")

    -
    +
    -ChatCompletion(id='77e45b23e9b34ef0a65afd9598521768', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. **Country:** Japan\n**Capital:** Tokyo\n\n2. **Country:** Australia\n**Capital:** Canberra\n\n3. **Country:** Brazil\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071464, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))
    +[2024-10-28 09:16:46] INFO:     127.0.0.1:36690 - "GET /get_model_info HTTP/1.1" 200 OK
    +[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
    +[2024-10-28 09:16:46] INFO:     127.0.0.1:36696 - "POST /generate HTTP/1.1" 200 OK
    +[2024-10-28 09:16:46] The server is fired up and ready to roll!
    +[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0
    +[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 21.55, #queue-req: 0
    +[2024-10-28 09:16:47] INFO:     127.0.0.1:36706 - "POST /v1/chat/completions HTTP/1.1" 200 OK
     
    +
    +
    +
    +
    +Response: ChatCompletion(id='bdb569b5e77147d0b4ebe2a79b451814', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. **Country:** Japan\n**Capital:** Tokyo\n\n2. **Country:** Australia\n**Capital:** Canberra\n\n3. **Country:** Brazil\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))
    +

Parameters#

-

The chat completions API accepts the following parameters (refer to OpenAI Chat Completions API for more details):

-
    -
  • messages: List of messages in the conversation, each containing role and content

  • -
  • model: The model identifier to use for completion

  • -
  • max_tokens: Maximum number of tokens to generate in the response

  • -
  • temperature: Controls randomness (0-2). Lower values make output more focused and deterministic

  • -
  • top_p: Alternative to temperature. Controls diversity via nucleus sampling

  • -
  • n: Number of chat completion choices to generate

  • -
  • stream: If true, partial message deltas will be sent as they become available

  • -
  • stop: Sequences where the API will stop generating further tokens

  • -
  • presence_penalty: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)

  • -
  • frequency_penalty: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)

  • -
  • logit_bias: Modify the likelihood of specified tokens appearing in the completion

  • -
  • logprobs: Include log probabilities of tokens in the response

  • -
  • top_logprobs: Number of most likely tokens to return probabilities for

  • -
  • seed: Random seed for deterministic results

  • -
  • response_format: Specify the format of the response (e.g., JSON)

  • -
  • stream_options: Additional options for streaming responses

  • -
  • user: A unique identifier representing your end-user

  • -
+

The chat completions API accepts the same parameters as the OpenAI Chat Completions API. Refer to the OpenAI Chat Completions API for more details.

Here is an example of a detailed chat completion request:

-
+
-Ancient Rome's major achievements include:
+[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:47] INFO:     127.0.0.1:36706 - "POST /v1/chat/completions HTTP/1.1" 200 OK
 
+
+
+
+
+Response: ChatCompletion(id='84ab9ffd558f4c5595addde9e7a9b40c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Ancient Rome's major achievements include:", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop='\n\n')], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None))
+

Completions#

Usage#

-

Completions API is similar to Chat Completions API, but without the messages parameter. Refer to OpenAI Completions API for more details.

+

The Completions API is similar to the Chat Completions API, but without the messages parameter.

[4]:
 
@@ -578,42 +607,32 @@

Usage#n=1, stop=None, ) -print(response) + +highlight_text(f"Response: {response}")

-
+
-Completion(id='50da1b57333242cca0b8c6d8706f94b2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\n1.  United States - Washington D.C. 2.  Japan - Tokyo 3.  Australia - Canberra\nList 3 countries and their capitals. 1. 2. 3.\n1.  China - Beijing 2.  Brazil - Bras', matched_stop=None)], created=1730071465, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))
+[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 30, token usage: 0.00, gen throughput (token/s): 108.70, #queue-req: 0
+[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 70, token usage: 0.00, gen throughput (token/s): 142.82, #queue-req: 0
+[2024-10-28 09:16:47] INFO:     127.0.0.1:36706 - "POST /v1/completions HTTP/1.1" 200 OK
 
+
+
+
+
+Response: Completion(id='8dd58c0e0eff4036ab377324851c1726', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\nList 3 countries and their capitals. 1. 2. 3.\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))
+

Parameters#

-

The completions API accepts the following parameters:

-
    -
  • model: The model identifier to use for completion

  • -
  • prompt: Input text to generate completions for. Can be a string, array of strings, or token arrays

  • -
  • best_of: Number of completions to generate server-side and return the best one

  • -
  • echo: If true, the prompt will be included in the response

  • -
  • frequency_penalty: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)

  • -
  • logit_bias: Modify the likelihood of specified tokens appearing in the completion

  • -
  • logprobs: Include log probabilities of tokens in the response

  • -
  • max_tokens: Maximum number of tokens to generate in the response (default: 16)

  • -
  • n: Number of completion choices to generate

  • -
  • presence_penalty: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)

  • -
  • seed: Random seed for deterministic results

  • -
  • stop: Sequences where the API will stop generating further tokens

  • -
  • stream: If true, partial completion deltas will be sent as they become available

  • -
  • stream_options: Additional options for streaming responses

  • -
  • suffix: Text to append to the completion

  • -
  • temperature: Controls randomness (0-2). Lower values make output more focused and deterministic

  • -
  • top_p: Alternative to temperature. Controls diversity via nucleus sampling

  • -
  • user: A unique identifier representing your end-user

  • -
+

The completions API accepts the same parameters as the OpenAI Completions API. Refer to the OpenAI Completions API for more details.

Here is an example of a detailed completions request:

-
+
-  Be sure to include a new planet, a strange creature, and a discovery that changes everything.
-As Captain Zara Blackwood piloted her ship, the Celestial Quest, through the vast expanse of space, she couldn't help but feel a sense of excitement and trepidation. Her crew had been searching for weeks, scanning the galaxy for any sign of a new planet that fit their criteria. And finally, after months of searching, they had found it.
+[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, gen throughput (token/s): 125.91, #queue-req: 0
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, gen throughput (token/s): 134.54, #queue-req: 0
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 128, token usage: 0.00, gen throughput (token/s): 133.40, #queue-req: 0
+[2024-10-28 09:16:48] INFO:     127.0.0.1:36706 - "POST /v1/completions HTTP/1.1" 200 OK
 
+
+
+
+
+Response: Completion(id='390b6931283540278af6151e5665b9e6', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As you write, include sensory details to help bring the planet to life for your reader. The space explorer, Lyra, is on a mission to explore the newly discovered planet, Xylophia-IV.\nLyra stepped out of the landing craft and onto the dusty surface of Xylophia-IV. The sky above was a deep shade of indigo, and the air was crisp with an otherworldly scent – a mix of ozone and something sweetly floral. She took a deep breath, feeling the cool breeze fill her lungs as she gazed out at the alien landscape.', matched_stop='\n\n')], created=1730107008, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=10, total_tokens=130, completion_tokens_details=None, prompt_tokens_details=None))
+
@@ -709,18 +735,26 @@

Batches#< completion_window="24h", ) -print(f"Batch job created with ID: {batch_response.id}") +highlight_text(f"Batch job created with ID: {batch_response.id}") - -
+
+[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 78, token usage: 0.00, gen throughput (token/s): 135.43, #queue-req: 0
 Batch job status: validating...trying again in 3 seconds...
+[2024-10-28 09:16:51] INFO:     127.0.0.1:36708 - "GET /v1/batches/batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402 HTTP/1.1" 200 OK
 Batch job completed successfully!
 Request counts: BatchRequestCounts(completed=2, failed=0, total=2)
-
-Request request-1:
-Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\n\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}
-
-Request request-2:
-Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\n\nPython is a high-level, interpreted programming language that is widely used for various purposes such as:\n\n1.  **Web Development**: Python is used in web development frameworks like Django and Flask to build fast, scalable, and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}
-
-Cleaning up files...
+[2024-10-28 09:16:51] INFO:     127.0.0.1:36708 - "GET /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea/content HTTP/1.1" 200 OK
+
+
+
+
+
+
+Request request-1:
+
+
+
+
+
+Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\n\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}
+
+
+
+
+
+Request request-2:
+
+
+
+
+
+Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\n\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}
+
+
+
+
+
+Cleaning up files...
+
+
+
+
+
+
+[2024-10-28 09:16:51] INFO:     127.0.0.1:36708 - "DELETE /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea HTTP/1.1" 200 OK
 

The batch job takes a while to complete. You can use these two APIs to retrieve the batch job status or to cancel a job that is still running.
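For reference, a minimal sketch of those two calls is shown below (assuming the `client` and `batch_job` objects created in the cells above; the polling loop and error handling from the full examples in this section are omitted):

```python
# Retrieve the current status of the batch job created earlier.
batch_details = client.batches.retrieve(batch_id=batch_job.id)
print(batch_details.status, batch_details.request_counts)

# Cancel the batch job if it has not finished yet; the status is expected to
# move to "cancelling" and then "cancelled", as in the full examples here.
cancelled_job = client.batches.cancel(batch_id=batch_job.id)
print(cancelled_job.status)
```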

@@ -828,89 +895,161 @@

Batches#< completion_window="24h", ) -print(f"Created batch job with ID: {batch_job.id}") -print(f"Initial status: {batch_job.status}") +highlight_text(f"Created batch job with ID: {batch_job.id}") +highlight_text(f"Initial status: {batch_job.status}") time.sleep(10) max_checks = 5 for i in range(max_checks): batch_details = client.batches.retrieve(batch_id=batch_job.id) - print(f"Batch job details (check {i+1}/{max_checks}):") - print(f"ID: {batch_details.id}") - print(f"Status: {batch_details.status}") - print(f"Created at: {batch_details.created_at}") - print(f"Input file ID: {batch_details.input_file_id}") - print(f"Output file ID: {batch_details.output_file_id}") - - print("Request counts:") - print(f"Total: {batch_details.request_counts.total}") - print(f"Completed: {batch_details.request_counts.completed}") - print(f"Failed: {batch_details.request_counts.failed}") + + highlight_text( + f"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}" + ) + highlight_text( + f"<strong>Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}</strong>" + ) time.sleep(3)

+
+
+
+
+
+[2024-10-28 09:16:51] INFO:     127.0.0.1:53788 - "POST /v1/files HTTP/1.1" 200 OK
+[2024-10-28 09:16:51] INFO:     127.0.0.1:53788 - "POST /v1/batches HTTP/1.1" 200 OK
+
+
+
+
+
+
+Created batch job with ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc
+
+
+
+
+
+Initial status: validating
+
+
+
+
+
+
+[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 7, #new-token: 210, #cached-token: 175, cache hit rate: 41.56%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 93, #new-token: 2790, #cached-token: 2325, cache hit rate: 45.04%, token usage: 0.00, #running-req: 7, #queue-req: 0
+[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 6025, token usage: 0.01, gen throughput (token/s): 927.84, #queue-req: 0
+[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 10025, token usage: 0.02, gen throughput (token/s): 10850.25, #queue-req: 0
+[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 14025, token usage: 0.03, gen throughput (token/s): 10640.61, #queue-req: 0
+[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 18025, token usage: 0.04, gen throughput (token/s): 10399.84, #queue-req: 0
+[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 22025, token usage: 0.05, gen throughput (token/s): 10192.34, #queue-req: 0
+[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 26025, token usage: 0.06, gen throughput (token/s): 9969.00, #queue-req: 0
+[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 30025, token usage: 0.07, gen throughput (token/s): 9754.98, #queue-req: 0
+[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 34025, token usage: 0.08, gen throughput (token/s): 9570.09, #queue-req: 0
+[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 38025, token usage: 0.09, gen throughput (token/s): 9370.66, #queue-req: 0
+[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 42025, token usage: 0.09, gen throughput (token/s): 9157.62, #queue-req: 0
+[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 46025, token usage: 0.10, gen throughput (token/s): 9012.88, #queue-req: 0
+[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 50025, token usage: 0.11, gen throughput (token/s): 8840.89, #queue-req: 0
+[2024-10-28 09:17:01] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 1 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
-Created batch job with ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Initial status: validating
-Batch job details (check 1/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 2/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 3/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 4/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
-Batch job details (check 5/5):
-ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8
-Status: completed
-Created at: 1730071469
-Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375
-Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2
-Request counts:
-Total: 100
-Completed: 100
-Failed: 0
+[2024-10-28 09:17:04] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
 
+
+
+
+
+Batch job details (check 2 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
+
+
+[2024-10-28 09:17:07] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 3 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
+
+
+[2024-10-28 09:17:10] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 4 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+
+
+
+
+
+
+[2024-10-28 09:17:13] INFO:     127.0.0.1:40866 - "GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1" 200 OK
+
+
+
+
+
+
+Batch job details (check 5 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff
+
+
+
+
+
+Request counts: Total: 100 // Completed: 100 // Failed: 0
+

Here is an example of cancelling a batch job.
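As a minimal sketch of the cancellation flow (again assuming the `client` and `batch_job` objects from the earlier cells; the full notebook cell follows below):

```python
import time

# Minimal sketch, assuming `client` and `batch_job` from the cells above.
cancelled_job = client.batches.cancel(batch_id=batch_job.id)
print(cancelled_job.status)  # expected: "cancelling"

# Poll until the job reaches a terminal state.
while cancelled_job.status not in ("failed", "cancelled"):
    time.sleep(3)
    cancelled_job = client.batches.retrieve(batch_job.id)

print(cancelled_job.status)  # expected: "cancelled"
```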

-
+
-Created batch job with ID: batch_08ed9e0c-386d-4286-b879-eab3380d686a
-Initial status: validating
-Cancellation initiated. Status: cancelling
-Current status: cancelled
-Batch job successfully cancelled
-Successfully cleaned up input file
+[2024-10-28 09:17:16] INFO:     127.0.0.1:48056 - "POST /v1/files HTTP/1.1" 200 OK
+[2024-10-28 09:17:16] INFO:     127.0.0.1:48056 - "POST /v1/batches HTTP/1.1" 200 OK
 
-
+
+
+
+
+Created batch job with ID: batch_9c319ff5-29c7-40db-9b8d-9225459caab5
+
+
+
+
+
+Initial status: validating
+
+
+
+
+
+
+[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 39, #new-token: 39, #cached-token: 2106, cache hit rate: 59.51%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 333, #new-token: 8192, #cached-token: 10094, cache hit rate: 56.50%, token usage: 0.01, #running-req: 39, #queue-req: 128
+[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 129, #new-token: 3869, #cached-token: 3226, cache hit rate: 54.14%, token usage: 0.03, #running-req: 371, #queue-req: 1
+[2024-10-28 09:17:17 TP0] Decode batch. #running-req: 500, #token: 20525, token usage: 0.05, gen throughput (token/s): 395.72, #queue-req: 0
+[2024-10-28 09:17:18 TP0] Decode batch. #running-req: 500, #token: 40525, token usage: 0.09, gen throughput (token/s): 24587.43, #queue-req: 0
+[2024-10-28 09:17:19 TP0] Decode batch. #running-req: 500, #token: 60525, token usage: 0.14, gen throughput (token/s): 23385.77, #queue-req: 0
+[2024-10-28 09:17:20 TP0] Decode batch. #running-req: 500, #token: 80525, token usage: 0.18, gen throughput (token/s): 22312.99, #queue-req: 0
+[2024-10-28 09:17:21 TP0] Decode batch. #running-req: 500, #token: 100525, token usage: 0.23, gen throughput (token/s): 21433.76, #queue-req: 0
+[2024-10-28 09:17:22 TP0] Decode batch. #running-req: 500, #token: 120525, token usage: 0.27, gen throughput (token/s): 20585.73, #queue-req: 0
+[2024-10-28 09:17:23 TP0] Decode batch. #running-req: 500, #token: 140525, token usage: 0.32, gen throughput (token/s): 19807.72, #queue-req: 0
+[2024-10-28 09:17:24 TP0] Decode batch. #running-req: 500, #token: 160525, token usage: 0.36, gen throughput (token/s): 19058.59, #queue-req: 0
+[2024-10-28 09:17:25 TP0] Decode batch. #running-req: 500, #token: 180525, token usage: 0.41, gen throughput (token/s): 18388.08, #queue-req: 0
+[2024-10-28 09:17:26 TP0] Decode batch. #running-req: 500, #token: 200525, token usage: 0.45, gen throughput (token/s): 17734.98, #queue-req: 0
+[2024-10-28 09:17:26] INFO:     127.0.0.1:54868 - "POST /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5/cancel HTTP/1.1" 200 OK
+
+
+
+
+
+
+Cancellation initiated. Status: cancelling
+
+
+
+
+
+
+[2024-10-28 09:17:29] INFO:     127.0.0.1:54868 - "GET /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5 HTTP/1.1" 200 OK
+
+
+
+
+
+
+Current status: cancelled
+
+
+
+
+
+Batch job successfully cancelled
+
+
+
+
+
+
+[2024-10-28 09:17:29] INFO:     127.0.0.1:54868 - "DELETE /v1/files/backend_input_file-33df398d-2394-4995-8dd8-890cb3111446 HTTP/1.1" 200 OK
+
+
+
+
+
+
+Successfully cleaned up input file
+
+
+
+
+
+
+
+[2024-10-28 09:17:29] INFO:     Shutting down
+[2024-10-28 09:17:30] INFO:     Waiting for application shutdown.
+[2024-10-28 09:17:30] INFO:     Application shutdown complete.
+[2024-10-28 09:17:30] INFO:     Finished server process [510260]
+
+

@@ -1109,7 +1328,7 @@

Batches#< diff --git a/openai_api.ipynb b/openai_api.ipynb index 3f07a6b..bcd5c32 100644 --- a/openai_api.ipynb +++ b/openai_api.ipynb @@ -6,7 +6,9 @@ "source": [ "# OpenAI Compatible API\n", "\n", - "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n", + "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", + "\n", + "This tutorial aims at these popular APIs:\n", "\n", "- `chat/completions`\n", "- `completions`\n", @@ -30,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:23:45.484181Z", - "iopub.status.busy": "2024-10-27T23:23:45.484018Z", - "iopub.status.idle": "2024-10-27T23:24:23.959941Z", - "shell.execute_reply": "2024-10-27T23:24:23.959208Z" + "iopub.execute_input": "2024-10-28T09:16:07.904473Z", + "iopub.status.busy": "2024-10-28T09:16:07.904311Z", + "iopub.status.idle": "2024-10-28T09:16:46.330698Z", + "shell.execute_reply": "2024-10-28T09:16:46.330038Z" } }, "outputs": [ @@ -41,22 +43,124 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:16:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=52609006, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:34 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:16:35 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", - "server_process = execute_shell_command(\n", - " \"\"\"\n", - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", - "\"\"\"\n", + "server_process = lauch_sglang_server(\n", + " command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "\n", + "highlight_text(\"Server is ready. Proceeding with the next steps.\")" ] }, { @@ -64,10 +168,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:23.995371Z", - "iopub.status.busy": "2024-10-27T23:24:23.995106Z", - "iopub.status.idle": "2024-10-27T23:24:24.788840Z", - "shell.execute_reply": "2024-10-27T23:24:24.788201Z" + "iopub.execute_input": "2024-10-28T09:16:46.332812Z", + "iopub.status.busy": "2024-10-28T09:16:46.332554Z", + "iopub.status.idle": "2024-10-28T09:16:47.129366Z", + "shell.execute_reply": "2024-10-28T09:16:47.128802Z" } }, "outputs": [ @@ -75,8 +179,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='77e45b23e9b34ef0a65afd9598521768', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071464, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36690 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:46] INFO: 127.0.0.1:36696 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:46] The server is fired up and ready to roll!\n", + "[2024-10-28 09:16:46 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. 
#running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 21.55, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='bdb569b5e77147d0b4ebe2a79b451814', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -98,7 +226,8 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -107,25 +236,7 @@ "source": [ "### Parameters\n", "\n", - "The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n", - "\n", - "- `messages`: List of messages in the conversation, each containing `role` and `content`\n", - "- `model`: The model identifier to use for completion\n", - "- `max_tokens`: Maximum number of tokens to generate in the response\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `n`: Number of chat completion choices to generate\n", - "- `stream`: If true, partial message deltas will be sent as they become available\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `top_logprobs`: Number of most likely tokens to return probabilities for\n", - "- `seed`: Random seed for deterministic results\n", - "- `response_format`: Specify the format of the response (e.g., JSON)\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `user`: A unique identifier representing your end-user\n", + "The chat completions API accepts OpenAI Chat Completions API's parameters. 
Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n", "\n", "Here is an example of a detailed chat completion request:" ] @@ -135,10 +246,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.790616Z", - "iopub.status.busy": "2024-10-27T23:24:24.790426Z", - "iopub.status.idle": "2024-10-27T23:24:24.902228Z", - "shell.execute_reply": "2024-10-27T23:24:24.901651Z" + "iopub.execute_input": "2024-10-28T09:16:47.131245Z", + "iopub.status.busy": "2024-10-28T09:16:47.131061Z", + "iopub.status.idle": "2024-10-28T09:16:47.242225Z", + "shell.execute_reply": "2024-10-28T09:16:47.241691Z" } }, "outputs": [ @@ -146,8 +257,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "Ancient Rome's major achievements include:" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='84ab9ffd558f4c5595addde9e7a9b40c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -173,11 +303,9 @@ " frequency_penalty=0.2, # Mild penalty for more natural language\n", " n=1, # Single response is usually more stable\n", " seed=42, # Keep for reproducibility\n", - " stream=True, # Keep streaming for real-time output\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].delta.content or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -188,7 +316,7 @@ "\n", "### Usage\n", "\n", - "Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details." + "Completions API is similar to Chat Completions API, but without the `messages` parameter." ] }, { @@ -196,10 +324,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:24.903908Z", - "iopub.status.busy": "2024-10-27T23:24:24.903730Z", - "iopub.status.idle": "2024-10-27T23:24:25.361829Z", - "shell.execute_reply": "2024-10-27T23:24:25.361272Z" + "iopub.execute_input": "2024-10-28T09:16:47.243956Z", + "iopub.status.busy": "2024-10-28T09:16:47.243779Z", + "iopub.status.idle": "2024-10-28T09:16:47.703807Z", + "shell.execute_reply": "2024-10-28T09:16:47.703265Z" } }, "outputs": [ @@ -207,8 +335,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Completion(id='50da1b57333242cca0b8c6d8706f94b2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 
3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730071465, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 30, token usage: 0.00, gen throughput (token/s): 108.70, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:47 TP0] Decode batch. #running-req: 1, #token: 70, token usage: 0.00, gen throughput (token/s): 142.82, #queue-req: 0\n", + "[2024-10-28 09:16:47] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='8dd58c0e0eff4036ab377324851c1726', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730107007, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -220,7 +375,8 @@ " n=1,\n", " stop=None,\n", ")\n", - "print(response)" + "\n", + "highlight_text(f\"Response: {response}\")" ] }, { @@ -229,26 +385,7 @@ "source": [ "### Parameters\n", "\n", - "The completions API accepts the following parameters:\n", - "\n", - "- `model`: The model identifier to use for completion\n", - "- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n", - "- `best_of`: Number of completions to generate server-side and return the best one\n", - "- `echo`: If true, the prompt will be included in the response\n", - "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", - "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", - "- `logprobs`: Include log probabilities of tokens in the response\n", - "- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n", - "- `n`: Number of completion choices to generate\n", - "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", - "- `seed`: Random seed for deterministic results\n", - "- `stop`: Sequences where the API will stop generating further tokens\n", - "- `stream`: If true, partial completion deltas will be sent as they become available\n", - "- `stream_options`: Additional options for streaming responses\n", - "- `suffix`: Text to append to the completion\n", - "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", - "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", - "- `user`: A unique identifier representing your end-user\n", + "The completions API accepts OpenAI Completions API's parameters. 
Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n", "\n", "Here is an example of a detailed completions request:" ] @@ -258,10 +395,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:25.363510Z", - "iopub.status.busy": "2024-10-27T23:24:25.363334Z", - "iopub.status.idle": "2024-10-27T23:24:26.087507Z", - "shell.execute_reply": "2024-10-27T23:24:26.086953Z" + "iopub.execute_input": "2024-10-28T09:16:47.705617Z", + "iopub.status.busy": "2024-10-28T09:16:47.705438Z", + "iopub.status.idle": "2024-10-28T09:16:48.612422Z", + "shell.execute_reply": "2024-10-28T09:16:48.611889Z" } }, "outputs": [ @@ -269,51 +406,42 @@ "name": "stdout", "output_type": "stream", "text": [ - " Be sure to include a new planet, a strange creature, and a discovery that changes everything.\n", - "As Captain Zara Black" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wood pil" + "[2024-10-28 09:16:47 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "oted her ship, the Celestial Quest, through the vast expanse of space, she couldn't help but feel a sense" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, gen throughput (token/s): 125.91, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " of excitement" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, gen throughput (token/s): 134.54, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " and trepidation. Her crew had been searching for weeks, scanning the galaxy for any sign of a new planet that fit" + "[2024-10-28 09:16:48 TP0] Decode batch. #running-req: 1, #token: 128, token usage: 0.00, gen throughput (token/s): 133.40, #queue-req: 0\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36706 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " their criteria" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ". And finally, after months of searching, they had found it." - ] + "data": { + "text/html": [ + "Response: Completion(id='390b6931283540278af6151e5665b9e6', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' As you write, include sensory details to help bring the planet to life for your reader. The space explorer, Lyra, is on a mission to explore the newly discovered planet, Xylophia-IV.\\nLyra stepped out of the landing craft and onto the dusty surface of Xylophia-IV. The sky above was a deep shade of indigo, and the air was crisp with an otherworldly scent – a mix of ozone and something sweetly floral. 
She took a deep breath, feeling the cool breeze fill her lungs as she gazed out at the alien landscape.', matched_stop='\\n\\n')], created=1730107008, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=10, total_tokens=130, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -328,11 +456,9 @@ " frequency_penalty=0.3, # Reduce repetitive phrases\n", " n=1, # Generate one completion\n", " seed=123, # For reproducible results\n", - " stream=True, # Stream the response\n", ")\n", "\n", - "for chunk in response:\n", - " print(chunk.choices[0].text or \"\", end=\"\")" + "highlight_text(f\"Response: {response}\")" ] }, { @@ -357,10 +483,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.089195Z", - "iopub.status.busy": "2024-10-27T23:24:26.089017Z", - "iopub.status.idle": "2024-10-27T23:24:26.169406Z", - "shell.execute_reply": "2024-10-27T23:24:26.168852Z" + "iopub.execute_input": "2024-10-28T09:16:48.614261Z", + "iopub.status.busy": "2024-10-28T09:16:48.614081Z", + "iopub.status.idle": "2024-10-28T09:16:48.695988Z", + "shell.execute_reply": "2024-10-28T09:16:48.695467Z" } }, "outputs": [ @@ -368,8 +494,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Batch job created with ID: batch_a8bb0663-1cc5-487b-b170-d8f2a76dbf60\n" + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48] INFO: 127.0.0.1:36708 - \"POST /v1/batches HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:48 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] + }, + { + "data": { + "text/html": [ + "Batch job created with ID: batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -419,7 +559,7 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Batch job created with ID: {batch_response.id}\")" + "highlight_text(f\"Batch job created with ID: {batch_response.id}\")" ] }, { @@ -427,28 +567,96 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:26.171258Z", - "iopub.status.busy": "2024-10-27T23:24:26.170832Z", - "iopub.status.idle": "2024-10-27T23:24:29.186895Z", - "shell.execute_reply": "2024-10-27T23:24:29.186293Z" + "iopub.execute_input": "2024-10-28T09:16:48.697904Z", + "iopub.status.busy": "2024-10-28T09:16:48.697486Z", + "iopub.status.idle": "2024-10-28T09:16:51.719102Z", + "shell.execute_reply": "2024-10-28T09:16:51.718503Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:48 TP0] Decode batch. 
#running-req: 1, #token: 78, token usage: 0.00, gen throughput (token/s): 135.43, #queue-req: 0\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Batch job status: validating...trying again in 3 seconds...\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/batches/batch_bb7ab5e0-97b7-41ef-8fc3-9976380bf402 HTTP/1.1\" 200 OK\n", "Batch job completed successfully!\n", "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "\n", - "Request request-1:\n", - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n", - "\n", - "Request request-2:\n", - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730071466, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as:\\n\\n1. **Web Development**: Python is used in web development frameworks like Django and Flask to build fast, scalable, and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n", - "\n", - "Cleaning up files...\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"GET /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea/content HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Request request-1:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request request-2:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730107009, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. 
It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Cleaning up files..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51] INFO: 127.0.0.1:36708 - \"DELETE /v1/files/backend_result_file-fa4ddf26-be08-43c2-af09-dd4a2fc580ea HTTP/1.1\" 200 OK\n" ] } ], @@ -471,16 +679,16 @@ " ]\n", "\n", " for result in results:\n", - " print(f\"\\nRequest {result['custom_id']}:\")\n", - " print(f\"Response: {result['response']}\")\n", + " highlight_text(f\"Request {result['custom_id']}:\")\n", + " highlight_text(f\"Response: {result['response']}\")\n", "\n", - " print(\"\\nCleaning up files...\")\n", + " highlight_text(\"Cleaning up files...\")\n", " # Only delete the result file ID since file_response is just content\n", " client.files.delete(result_file_id)\n", "else:\n", - " print(f\"Batch job failed with status: {batch_response.status}\")\n", + " highlight_text(f\"Batch job failed with status: {batch_response.status}\")\n", " if hasattr(batch_response, \"errors\"):\n", - " print(f\"Errors: {batch_response.errors}\")" + " highlight_text(f\"Errors: {batch_response.errors}\")" ] }, { @@ -500,10 +708,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:29.188845Z", - "iopub.status.busy": "2024-10-27T23:24:29.188552Z", - "iopub.status.idle": "2024-10-27T23:24:54.305285Z", - "shell.execute_reply": "2024-10-27T23:24:54.304629Z" + "iopub.execute_input": "2024-10-28T09:16:51.720917Z", + "iopub.status.busy": "2024-10-28T09:16:51.720728Z", + "iopub.status.idle": "2024-10-28T09:17:16.852156Z", + "shell.execute_reply": "2024-10-28T09:17:16.851486Z" } }, "outputs": [ @@ -511,89 +719,280 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Initial status: validating\n" + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:16:51] INFO: 127.0.0.1:53788 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 7, #new-token: 210, #cached-token: 175, cache hit rate: 41.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:16:51 TP0] Prefill batch. #new-seq: 93, #new-token: 2790, #cached-token: 2325, cache hit rate: 45.04%, token usage: 0.00, #running-req: 7, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 6025, token usage: 0.01, gen throughput (token/s): 927.84, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. 
#running-req: 100, #token: 10025, token usage: 0.02, gen throughput (token/s): 10850.25, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:52 TP0] Decode batch. #running-req: 100, #token: 14025, token usage: 0.03, gen throughput (token/s): 10640.61, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 1/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 18025, token usage: 0.04, gen throughput (token/s): 10399.84, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 2/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:53 TP0] Decode batch. #running-req: 100, #token: 22025, token usage: 0.05, gen throughput (token/s): 10192.34, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 3/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 26025, token usage: 0.06, gen throughput (token/s): 9969.00, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 4/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 30025, token usage: 0.07, gen throughput (token/s): 9754.98, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Batch job details (check 5/5):\n", - "ID: batch_3bed32fb-158c-4918-8522-8235c9a12fd8\n", - "Status: completed\n", - "Created at: 1730071469\n", - "Input file ID: backend_input_file-6040f73b-6fd9-4811-92fe-b23150459375\n", - "Output file ID: backend_result_file-900097bd-2499-4640-9a0c-6d26915780e2\n", - "Request counts:\n", - "Total: 100\n", - "Completed: 100\n", - "Failed: 0\n" + "[2024-10-28 09:16:54 TP0] Decode batch. #running-req: 100, #token: 34025, token usage: 0.08, gen throughput (token/s): 9570.09, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. 
#running-req: 100, #token: 38025, token usage: 0.09, gen throughput (token/s): 9370.66, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:55 TP0] Decode batch. #running-req: 100, #token: 42025, token usage: 0.09, gen throughput (token/s): 9157.62, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 46025, token usage: 0.10, gen throughput (token/s): 9012.88, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:16:56 TP0] Decode batch. #running-req: 100, #token: 50025, token usage: 0.11, gen throughput (token/s): 8840.89, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:01] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 1 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:04] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 2 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:07] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 3 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:10] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 4 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: 
backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:13] INFO: 127.0.0.1:40866 - \"GET /v1/batches/batch_4c254e9a-af5c-4e7f-9982-9739540beefc HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 5 / 5) // ID: batch_4c254e9a-af5c-4e7f-9982-9739540beefc // Status: completed // Created at: 1730107011 // Input file ID: backend_input_file-56c1c364-04a5-495a-8925-cb3d35cd73d4 // Output file ID: backend_result_file-27879a06-ce58-4456-b590-baccd9a49bff" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -641,25 +1040,21 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "max_checks = 5\n", "for i in range(max_checks):\n", " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", - " print(f\"Batch job details (check {i+1}/{max_checks}):\")\n", - " print(f\"ID: {batch_details.id}\")\n", - " print(f\"Status: {batch_details.status}\")\n", - " print(f\"Created at: {batch_details.created_at}\")\n", - " print(f\"Input file ID: {batch_details.input_file_id}\")\n", - " print(f\"Output file ID: {batch_details.output_file_id}\")\n", - "\n", - " print(\"Request counts:\")\n", - " print(f\"Total: {batch_details.request_counts.total}\")\n", - " print(f\"Completed: {batch_details.request_counts.completed}\")\n", - " print(f\"Failed: {batch_details.request_counts.failed}\")\n", + "\n", + " highlight_text(\n", + " f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n", + " )\n", + " highlight_text(\n", + " f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n", + " )\n", "\n", " time.sleep(3)" ] @@ -676,10 +1071,10 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:24:54.307459Z", - "iopub.status.busy": "2024-10-27T23:24:54.307266Z", - "iopub.status.idle": "2024-10-27T23:25:07.414717Z", - "shell.execute_reply": "2024-10-27T23:25:07.413989Z" + "iopub.execute_input": "2024-10-28T09:17:16.854434Z", + "iopub.status.busy": "2024-10-28T09:17:16.854239Z", + "iopub.status.idle": "2024-10-28T09:17:29.967949Z", + "shell.execute_reply": "2024-10-28T09:17:29.967373Z" } }, "outputs": [ @@ -687,25 +1082,187 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created batch job with ID: batch_08ed9e0c-386d-4286-b879-eab3380d686a\n", - "Initial status: validating\n" + 
"[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:17:16] INFO: 127.0.0.1:48056 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_9c319ff5-29c7-40db-9b8d-9225459caab5" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 39, #new-token: 39, #cached-token: 2106, cache hit rate: 59.51%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 333, #new-token: 8192, #cached-token: 10094, cache hit rate: 56.50%, token usage: 0.01, #running-req: 39, #queue-req: 128\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Prefill batch. #new-seq: 129, #new-token: 3869, #cached-token: 3226, cache hit rate: 54.14%, token usage: 0.03, #running-req: 371, #queue-req: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:17 TP0] Decode batch. #running-req: 500, #token: 20525, token usage: 0.05, gen throughput (token/s): 395.72, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:18 TP0] Decode batch. #running-req: 500, #token: 40525, token usage: 0.09, gen throughput (token/s): 24587.43, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:19 TP0] Decode batch. #running-req: 500, #token: 60525, token usage: 0.14, gen throughput (token/s): 23385.77, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:20 TP0] Decode batch. #running-req: 500, #token: 80525, token usage: 0.18, gen throughput (token/s): 22312.99, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:21 TP0] Decode batch. #running-req: 500, #token: 100525, token usage: 0.23, gen throughput (token/s): 21433.76, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Cancellation initiated. Status: cancelling\n" + "[2024-10-28 09:17:22 TP0] Decode batch. #running-req: 500, #token: 120525, token usage: 0.27, gen throughput (token/s): 20585.73, #queue-req: 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Current status: cancelled\n", - "Batch job successfully cancelled\n", - "Successfully cleaned up input file\n" + "[2024-10-28 09:17:23 TP0] Decode batch. #running-req: 500, #token: 140525, token usage: 0.32, gen throughput (token/s): 19807.72, #queue-req: 0\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:24 TP0] Decode batch. #running-req: 500, #token: 160525, token usage: 0.36, gen throughput (token/s): 19058.59, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:25 TP0] Decode batch. #running-req: 500, #token: 180525, token usage: 0.41, gen throughput (token/s): 18388.08, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26 TP0] Decode batch. 
#running-req: 500, #token: 200525, token usage: 0.45, gen throughput (token/s): 17734.98, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:26] INFO: 127.0.0.1:54868 - \"POST /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5/cancel HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Cancellation initiated. Status: cancelling" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"GET /v1/batches/batch_9c319ff5-29c7-40db-9b8d-9225459caab5 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Current status: cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Batch job successfully cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: 127.0.0.1:54868 - \"DELETE /v1/files/backend_input_file-33df398d-2394-4995-8dd8-890cb3111446 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Successfully cleaned up input file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -753,37 +1310,37 @@ " completion_window=\"24h\",\n", ")\n", "\n", - "print(f\"Created batch job with ID: {batch_job.id}\")\n", - "print(f\"Initial status: {batch_job.status}\")\n", + "highlight_text(f\"Created batch job with ID: {batch_job.id}\")\n", + "highlight_text(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "try:\n", " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", - " print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n", + " highlight_text(f\"Cancellation initiated. 
Status: {cancelled_job.status}\")\n", " assert cancelled_job.status == \"cancelling\"\n", "\n", " # Monitor the cancellation process\n", " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", " cancelled_job = client.batches.retrieve(batch_job.id)\n", - " print(f\"Current status: {cancelled_job.status}\")\n", + " highlight_text(f\"Current status: {cancelled_job.status}\")\n", "\n", " # Verify final status\n", " assert cancelled_job.status == \"cancelled\"\n", - " print(\"Batch job successfully cancelled\")\n", + " highlight_text(\"Batch job successfully cancelled\")\n", "\n", "except Exception as e:\n", - " print(f\"Error during cancellation: {e}\")\n", + " highlight_text(f\"Error during cancellation: {e}\")\n", " raise e\n", "\n", "finally:\n", " try:\n", " del_response = client.files.delete(uploaded_file.id)\n", " if del_response.deleted:\n", - " print(\"Successfully cleaned up input file\")\n", + " highlight_text(\"Successfully cleaned up input file\")\n", " except Exception as e:\n", - " print(f\"Error cleaning up: {e}\")\n", + " highlight_text(f\"Error cleaning up: {e}\")\n", " raise e" ] }, @@ -792,13 +1349,24 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:07.416667Z", - "iopub.status.busy": "2024-10-27T23:25:07.416471Z", - "iopub.status.idle": "2024-10-27T23:25:10.222119Z", - "shell.execute_reply": "2024-10-27T23:25:10.221434Z" + "iopub.execute_input": "2024-10-28T09:17:29.969798Z", + "iopub.status.busy": "2024-10-28T09:17:29.969613Z", + "iopub.status.idle": "2024-10-28T09:17:32.811800Z", + "shell.execute_reply": "2024-10-28T09:17:32.811092Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:17:29] INFO: Shutting down\n", + "[2024-10-28 09:17:30] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:17:30] INFO: Application shutdown complete.\n", + "[2024-10-28 09:17:30] INFO: Finished server process [510260]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/release_process.html b/release_process.html index 299c1a3..c610c19 100644 --- a/release_process.html +++ b/release_process.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -494,7 +495,7 @@

Make a release in GitHub

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/sampling_params.html b/sampling_params.html index b186a12..334bff8 100644 --- a/sampling_params.html +++ b/sampling_params.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -942,7 +943,7 @@

Min New Tokens

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/search.html b/search.html index 69d4c1a..3e0a808 100644 --- a/search.html +++ b/search.html @@ -31,7 +31,8 @@ - + + @@ -63,7 +64,7 @@ - + @@ -385,7 +386,7 @@

Search

diff --git a/searchindex.js b/searchindex.js index 378d3ee..ceca047 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Achieving Peak Throughput": [[8, "achieving-peak-throughput"]], "Add Unit Tests": [[4, "add-unit-tests"]], "Add a Runner": [[16, "add-a-runner"]], "Add the model to the test suite": [[11, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "All Together": [[14, "all-together"]], "Avoid out-of-memory by Tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[8, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[9, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Baseline": [[14, "baseline"]], "Batches": [[12, "Batches"]], "Batching": [[7, "batching"]], "Benchmark": [[2, "benchmark"]], "Benchmark Performance": [[1, "benchmark-performance"]], "Benchmark and Profiling": [[2, null]], "Benchmarks": [[14, "benchmarks"]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[17, "cuda-error-an-illegal-memory-access-was-encountered"]], "Chat Completions": [[12, "Chat-Completions"]], "Choices Methods in SGLang": [[3, null]], "Clean": [[0, "clean"]], "Common Notes": [[10, "common-notes"]], "Completions": [[12, "Completions"]], "Constrained Decoding": [[7, "constrained-decoding"]], "Contributor Guide": [[4, null]], "Control Flow": [[7, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[5, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Model": [[6, null]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Examples": [[14, "examples"]], "Format Your Code": [[4, "format-your-code"]], "Frequency Penalty": [[14, "frequency-penalty"]], "Frontend Tutorial": [[9, null]], "Frontend: Structured Generation Language (SGLang)": [[7, null]], "Getting Started": [[9, null]], "Greedy Token Selection": [[3, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[8, null]], "How to Support a New Model": [[11, null]], "Install SGLang": [[10, null]], "Interactive debugging": [[11, "interactive-debugging"]], "JSON Decoding": [[7, "json-decoding"]], "Language Feature": [[7, "language-feature"]], "Latency": [[14, "latency"]], "Launch A Server": [[6, "Launch-A-Server"]], "Launch a server": [[15, "Launch-a-server"]], "Make a release in GitHub": [[13, "make-a-release-in-github"]], "Memory": [[14, "memory"]], "Method 1: With pip": [[10, "method-1-with-pip"]], "Method 2: From source": [[10, "method-2-from-source"]], "Method 3: Using docker": [[10, "method-3-using-docker"]], "Method 4: Using docker compose": [[10, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[10, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[3, "methods"]], "Min New Tokens": [[14, "min-new-tokens"]], "More Examples": [[7, "more-examples"]], "Multi modal": [[14, "multi-modal"]], "Multi-Modality": [[7, "multi-modality"]], "Normal": [[14, "normal"]], "OpenAI Compatible API": [[1, "openai-compatible-api"], [12, null]], "Other tips": [[2, "other-tips"]], "Parallelism": [[7, "parallelism"]], "Parameters": [[12, "Parameters"], [12, "id2"]], "Performance Implications on Penalties": [[14, "performance-implications-on-penalties"]], "Port a model from vLLM to SGLang": [[11, "port-a-model-from-vllm-to-sglang"]], "Presence 
Penalty": [[14, "presence-penalty"]], "Profile with Nsight": [[2, "profile-with-nsight"]], "PyPI Package Release Process": [[13, null]], "Quick Start": [[1, "quick-start"], [7, "quick-start"]], "Quick Start: Launch A Server and Send Requests": [[15, null]], "References": [[9, null]], "Repetition Penalty": [[14, "repetition-penalty"]], "Roles": [[7, "roles"]], "Run Llama 3.1 405B": [[1, "run-llama-3-1-405b"]], "SGLang Documentation": [[0, null], [9, null]], "Sampling Parameters in SGLang Runtime": [[14, null]], "Send a Request": [[15, "Send-a-Request"]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-hosted Runners for GitHub Action": [[16, null]], "Step 1: Start a docker container.": [[16, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[16, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[16, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[7, "streaming"], [14, "streaming"]], "Supported Models": [[1, "supported-models"]], "Test the correctness": [[11, "test-the-correctness"]], "The server hangs": [[17, "the-server-hangs"]], "Tips and Implementation Details": [[7, "tips-and-implementation-details"]], "Token Length Normalized": [[3, "token-length-normalized"]], "Troubleshooting": [[17, null]], "Try Advanced Options": [[8, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[8, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[8, "tune-schedule-conservativeness"]], "Tune --schedule-policy": [[8, "tune-schedule-policy"]], "Tune Your Request Submission Speed": [[8, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[3, "unconditional-likelihood-normalized"]], "Update the version in code": [[13, "update-the-version-in-code"]], "Upload the PyPI package": [[13, "upload-the-pypi-package"]], "Usage": [[12, "Usage"], [12, "id1"]], "Use Curl": [[6, "Use-Curl"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Input IDs": [[6, "Using-Input-IDs"]], "Using Local Models": [[7, "using-local-models"]], "Using OpenAI Compatible API": [[6, "Using-OpenAI-Compatible-API"], [15, "Using-OpenAI-Compatible-API"]], "Using OpenAI Models": [[7, "using-openai-models"]]}, "docnames": ["README", "backend", "benchmark_and_profiling", "choices_methods", "contributor_guide", "custom_chat_template", "embedding_model", "frontend", "hyperparameter_tuning", "index", "install", "model_support", "openai_api", "release_process", "sampling_params", "send_request", "setup_github_runner", "troubleshooting"], "envversion": {"nbsphinx": 4, "sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend.md", "benchmark_and_profiling.md", "choices_methods.md", "contributor_guide.md", "custom_chat_template.md", "embedding_model.ipynb", "frontend.md", "hyperparameter_tuning.md", "index.rst", "install.md", "model_support.md", "openai_api.ipynb", "release_process.md", "sampling_params.md", "send_request.ipynb", "setup_github_runner.md", "troubleshooting.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 6, 7, 8, 10, 11, 12, 14, 15], "0": [1, 6, 7, 8, 10, 12, 14, 15, 16, 17], "0000": 8, "0006747245788574219": 6, "0006804466247558594": 
6, "000682830810546875": 6, "0020961761474609375": 6, "0020999908447265625": 6, "003025054931640625": 6, "0030345916748046875": 6, "006198883056640625": 6, "006214141845703125": 6, "00807952880859375": 6, "00830078125": 6, "00830841064453125": 6, "009002685546875": 6, "01": [7, 8, 14], "01239013671875": 6, "01438140869140625": 6, "02": 14, "03": 14, "04": [14, 16], "05": 14, "06": 14, "08": 14, "0_rocm6": 16, "0_triton3": 16, "1": [2, 6, 7, 8, 11, 12, 14, 15], "10": [1, 2, 6, 12, 14], "100": [7, 12], "101": 14, "103": 14, "104": 14, "10405": 14, "10666": 14, "107": 14, "10767": 14, "11": 14, "114": 14, "11586": 14, "117": 14, "11732": 14, "12": [10, 14, 16], "123": 12, "127": [1, 6, 12, 15], "128": [1, 14], "128009": [12, 15], "13": [12, 14], "14226": 14, "150": 12, "158c": 12, "16": [1, 7, 12, 14], "16219": 14, "16740": 14, "17": 14, "17125": 14, "17167": 14, "172": 1, "1730071464": 12, "1730071465": 12, "1730071466": 12, "1730071469": 12, "1730071553": 15, "1730071554": 15, "174": 14, "179": 14, "18": 14, "18895": 14, "189": 14, "191": 14, "195": 14, "19884": 14, "1cc5": 12, "1st": 14, "2": [1, 5, 6, 7, 9, 12, 14, 15], "200": 12, "20000": 1, "2048": [2, 8], "205": 14, "20866": 14, "22095": 14, "22363": 14, "22603": 14, "233": [8, 14], "23892": 14, "24": 14, "2499": 12, "24h": 12, "25": 7, "256": [1, 2, 7, 14], "26": 14, "268": 14, "27": 14, "271": 14, "29": 14, "293": 14, "3": [2, 5, 6, 7, 8, 9, 12, 14, 15], "30": 14, "3000": 14, "30000": [1, 5, 7, 10, 12, 14, 15], "30010": 6, "308": 14, "31": 14, "317": 8, "32": [1, 2, 14], "320": 14, "34": 15, "35": 14, "36": 14, "37": 14, "370959": 8, "378633": 14, "38": 14, "386d": 12, "39": [12, 14, 15], "4": [1, 6, 7, 12, 15], "40": 14, "403": 15, "40881": 14, "409": 14, "4096": [1, 2, 8], "41": [12, 14], "41888": 14, "42": 12, "4286": 12, "433": 14, "43967": 14, "44": 14, "440": 14, "447": 14, "44926": 14, "45": 14, "450": 15, "453": 14, "45354": 14, "45445": 14, "455": 14, "4594": 8, "46": [12, 14, 15], "4640": 12, "46530": 14, "47": [14, 15], "47738": 14, "4811": 12, "48302": 14, "4832": 14, "487b": 12, "48960": 14, "49": [12, 15], "49017": 14, "4918": 12, "49263": 14, "5": [1, 6, 7, 12, 14], "50": [8, 12, 14], "500": [8, 12], "50000": 1, "50302": 14, "5079": 14, "50da1b57333242cca0b8c6d8706f94b2": 12, "51": 14, "512": [2, 14], "52": 1, "5206": 14, "5255": 14, "52554": 14, "52825": 14, "52920": 14, "54": [12, 14], "54497": 14, "55": 14, "56": 14, "5656": 14, "5727": 14, "57426": 14, "58": 14, "59": 14, "5b": 11, "6": [1, 12, 16], "60": [2, 14], "6000": 2, "6040f73b": 12, "61": 14, "64": [1, 2, 12, 14, 15], "64g": 16, "65": 14, "66": 14, "67": 14, "68": 14, "69": 14, "6ae7fabfd4c54054a8017e2aa7c6bc5a": 15, "6d26915780e2": 12, "6fd9": 12, "7": [1, 12], "70": [2, 14], "71": 14, "72": 14, "72b": 1, "73": [12, 14], "74": 14, "75": 14, "76": 14, "766008": 14, "774756": 14, "774955": 14, "775118": 14, "775210": 14, "775220": 14, "775651": 14, "77e45b23e9b34ef0a65afd9598521768": 12, "78": 14, "79": 14, "7b": [1, 5, 6, 14], "7fa2af80": 2, "8": [1, 12, 14, 17], "8000": 0, "81": 14, "82": 8, "8235c9a12fd8": 12, "83": 14, "84": 14, "8413": 14, "85": 14, "8522": 12, "86": 14, "88": 14, "89": [12, 14], "8b": [1, 2, 7, 10, 12, 14, 15], "9": [1, 7, 8, 12, 17], "90": 14, "900097bd": 12, "91": 14, "92fe": 12, "93": 14, "94": 14, "95": [1, 12, 14, 15], "96": 14, "97": 14, "98": 14, "9900": 14, "9998": 8, "9a0c": 12, "A": [1, 2, 7, 8, 9, 10, 12], "And": 12, "As": 12, "Be": 12, "By": [5, 14], "For": [1, 2, 3, 11, 12, 14], "If": [1, 5, 8, 10, 12, 14, 17], "In": [1, 
7, 17], "It": [1, 3, 5, 7, 8, 9, 10, 12, 14, 15], "NOT": 5, "On": 8, "THE": 12, "The": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 14, 16], "Then": [7, 16], "There": 5, "These": 14, "To": [0, 1, 2, 7, 8, 10, 11], "__init__": 13, "__main__": 1, "__name__": 1, "_build": 0, "a10": 10, "a100": 10, "abil": 15, "abl": 11, "about": [1, 5, 7, 8, 12], "abov": [2, 3, 10, 14], "acceler": [1, 8, 10], "accept": [12, 14], "access": [0, 1, 10], "accord": [2, 7, 10], "accur": [1, 2, 15], "accuraci": 15, "achiev": 12, "across": 3, "activ": 9, "ad": 10, "add": [1, 2, 6, 7, 8, 14, 17], "addit": [3, 7, 12, 14], "addr": 1, "address": [1, 7], "adopt": 9, "adv": 2, "advanc": 9, "after": [12, 15], "again": 12, "against": 3, "ai": [1, 10, 12, 15], "alexa": 15, "algorithm": 14, "alibaba": [1, 6], "aliv": 7, "all": [0, 1, 3, 4, 7, 8, 10, 11, 16], "all_other_model": 11, "allow": [2, 10, 15], "almost": [1, 8, 11], "also": [1, 5, 6, 7, 8, 14], "altern": [3, 7, 12], "alwai": [8, 12, 15], "amd": 16, "amount": 15, "an": [0, 1, 3, 7, 9, 10, 12, 14, 15, 16], "ancient": 12, "ani": [1, 7, 10, 12, 14], "annot": 2, "anoth": [11, 15], "answer": [3, 7, 15], "answer_1": 7, "answer_2": 7, "anthrop": 7, "antidisestablishmentarian": 3, "api": [3, 5, 7, 9, 10, 14], "api_kei": [1, 6, 12, 15], "appear": [12, 14], "append": 12, "appli": 14, "applic": [1, 6, 9, 15], "approach": 10, "apt": [2, 16], "ar": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15], "arch": 16, "architectur": [2, 15], "arg": 3, "argument": [2, 7, 14], "around": 17, "arrai": 12, "articl": 15, "artifici": 15, "assert": 12, "assign": [12, 15], "assist": [1, 3, 5, 7, 12, 14, 15], "assistant_begin": 7, "assistant_end": 7, "attain": 8, "attent": [9, 10, 11], "attract": [3, 7, 12], "audio": [12, 15], "auror": 7, "australia": [12, 15], "author": [6, 15], "automat": 14, "autoregress": 7, "autosc": 10, "autotoken": 6, "avail": [1, 10, 12], "averag": 3, "avoid": [10, 12], "awq": 9, "b": 10, "b170": 12, "b23150459375": 12, "b879": 12, "back": 9, "backend": [2, 3, 10, 14, 17], "backend_input_fil": 12, "backend_result_fil": 12, "bad": 3, "baichuan2": 1, "balanc": [7, 12], "base": [3, 12, 14], "base64": 14, "base_url": [1, 6, 12, 15], "bash": [13, 16], "basic": 14, "batch": [1, 2, 8, 9, 14], "batch_08ed9e0c": 12, "batch_3bed32fb": 12, "batch_a8bb0663": 12, "batch_detail": 12, "batch_id": 12, "batch_job": 12, "batch_request": 12, "batch_respons": 12, "batchrequestcount": 12, "bearer": [6, 15], "becaus": [7, 8], "becom": 12, "been": 12, "befor": [2, 14], "begin": 7, "beij": 12, "being": 8, "below": [7, 10, 14, 16], "bench_lat": [1, 2, 11], "bench_serv": [1, 2, 14], "benchmark": 9, "berlin": 3, "bespok": 3, "best": 12, "best_of": 12, "better": [1, 8, 10, 11, 12], "between": [1, 14], "bia": 7, "bin": 16, "blackwood": 12, "blob": 14, "block": [7, 15], "blogpost": 3, "blood": 7, "bodi": [7, 12, 14], "bogart": 7, "book": 15, "bool": 14, "born": 7, "both": 8, "bottleneck": 8, "bra": 12, "branch": 10, "bras\u00edlia": [12, 15], "brazil": [12, 15], "break": 14, "browser": 0, "bug": 12, "build": [1, 10, 12, 13], "built": 10, "c": [10, 12], "cach": [1, 2, 8, 9, 10, 16], "calcul": 7, "call": [3, 7, 9], "can": [1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "canberra": [12, 15], "cancel": 12, "cancelled_job": 12, "cannot": 14, "capit": [1, 3, 7, 12, 14, 15], "captain": 12, "case": [8, 17], "cd": [4, 10, 13], "celesti": 12, "center": 12, "chain": 9, "chang": [11, 12, 16], "charact": 7, "character": 15, "character_gen": 7, "character_regex": 7, "chat": [1, 7, 14, 15], "chat_exampl": 7, "chat_templ": 5, "chatbot": 15, 
"chatcomplet": [12, 15], "chatcompletionmessag": [12, 15], "chatglm": 1, "chatml": [1, 5, 14], "check": [1, 10, 12], "check_output": 6, "checkpoint": [1, 2], "china": 12, "choic": [7, 9, 12, 15], "choices_method": 3, "chunk": [1, 9, 12, 14], "ci": 4, "civil": 12, "class": 14, "clean": 12, "cli": 2, "client": [1, 2, 6, 12, 15], "clone": [0, 10], "cluster": 10, "co": 9, "code": [2, 6, 7, 15], "coher": 15, "color": 2, "com": [2, 10, 13, 14, 16], "come": [8, 14], "command": [1, 2, 4, 6, 10, 11, 15, 16], "commit": 4, "common": 17, "commun": 9, "compar": 11, "comparison": [3, 11], "compat": [5, 7, 9, 14], "compil": [1, 8], "complet": [1, 6, 7, 15], "completion_token": [12, 15], "completion_tokens_detail": [12, 15], "completion_window": 12, "completionchoic": 12, "completionusag": [12, 15], "complex": [7, 15], "compos": 15, "comprehend": 15, "comput": [2, 7, 8, 14], "concis": 12, "conda": 10, "condens": 15, "confid": 3, "config": [1, 2], "connect": [7, 10], "consid": [2, 14], "constrain": [8, 9, 14], "constraint": 7, "contain": [3, 12], "content": [1, 6, 7, 12, 15], "context": 15, "continu": [7, 9], "contribut": 5, "contributor": 9, "control": [9, 12], "convers": [5, 12], "convert": 11, "copi": 10, "core": [7, 9, 15], "correct": [2, 14], "cost": 12, "could": 14, "couldn": 12, "count": 12, "countri": [1, 12, 15], "coverag": 11, "cpu": 8, "creat": [1, 6, 11, 12, 15], "created_at": 12, "creation": 15, "creativ": 12, "creatur": 12, "crew": 12, "criteria": 12, "critic": 2, "cu121": 10, "cuda": [1, 2, 10, 14, 16], "cuda_visible_devic": 16, "curl": [1, 14, 15, 16], "curl_id": 6, "curl_text": 6, "currenli": [1, 8], "current": 12, "custom": [1, 15], "custom_id": 12, "d": [0, 1, 2, 6, 7, 10, 12, 15], "d8f2a76dbf60": 12, "da93c64364af475cbdd2cb19155fd68d": 15, "dark": 12, "data": [1, 6, 8, 14, 15], "dataclass": 14, "dataset": [2, 14], "dbrx": 1, "dc9d06d886151707f97d0b78095df9de262fd3c9": 14, "deactiv": 10, "deadlock": 1, "death": 7, "deb": 2, "deceas": 7, "decod": [8, 9, 12, 14], "decode_unicod": 14, "decor": 7, "decreas": 8, "deepseek": [1, 9], "def": [1, 3, 7], "default": [1, 3, 5, 8, 10, 12, 14, 17], "defin": [5, 7], "del_respons": 12, "delai": 2, "delet": 12, "delta": 12, "depend": 10, "deploi": 10, "deploy": 10, "describ": [3, 14], "descript": [2, 14], "design": [9, 15], "desir": 14, "detail": [12, 14], "detailed_tip": 7, "determin": 3, "determinist": 12, "detoken": 14, "dev": [1, 16], "devel": 16, "develop": [2, 12], "devic": [1, 10, 16], "devtool": 2, "dict": 14, "diet": 7, "differ": 11, "difficult": 14, "digest": 15, "directli": 1, "directori": 11, "disabl": [1, 2, 14, 17], "discoveri": 12, "dislik": 14, "distrib_releas": 2, "divers": 12, "django": 12, "dn": 7, "do": [2, 8, 12, 14, 16], "doc": [2, 3, 10, 14], "doc_site_path": 0, "dockerfil": 10, "document": [5, 10], "doe": [1, 2, 8], "donald": 3, "done": [14, 16], "down": 3, "download": [2, 14], "dp": 1, "dpkg": 2, "drawback": 14, "dri": 16, "dtype": 1, "duck": 3, "due": [3, 8, 17], "dummi": 2, "dump": [6, 12], "durat": [2, 14], "dure": [1, 8, 12, 14, 15], "dynam": [1, 2], "e": [2, 10, 11, 12, 16], "e2": 14, "e5": [1, 6, 9], "eab3380d686a": 12, "each": [1, 12], "earli": 8, "earlier": 3, "easi": [9, 11, 17], "easier": 7, "eater": 7, "echo": [2, 12, 16], "edit": 16, "effici": [1, 9], "either": 14, "element": 12, "eleutherai": 3, "elif": 7, "els": 12, "embed": [1, 9, 12], "embedding_model": 12, "embedding_process": 6, "empti": 1, "enabl": [1, 7, 8, 10, 15], "encod": [6, 14], "encount": 10, "encourag": [12, 14], "end": [7, 11, 12, 14], "endpoint": [1, 
10, 12, 14], "engin": 7, "enough": [1, 8], "entir": 15, "entryclass": 11, "enumer": 7, "env": 10, "environ": [1, 6, 16], "eo": [8, 14], "equival": [6, 15], "error": [1, 6, 8, 12], "especi": 8, "etc": [2, 9], "eth0": 1, "even": [3, 12, 15], "everi": 14, "everyth": 12, "exampl": [1, 3, 6, 11, 12, 15, 16], "example_imag": 14, "exaon": 1, "except": 12, "excit": 12, "excl": 14, "exec": 2, "execut": [10, 15], "execute_shell_command": [6, 12, 15], "exercis": 7, "exist": 11, "expand": 7, "expans": 12, "experiment": 8, "explor": 12, "export": [0, 1, 7, 16], "express": [7, 14], "extend": 3, "extens": [9, 11], "extern": [7, 9], "extra": 14, "f": [1, 6, 7, 12], "face": [1, 5], "fact": 15, "fail": [3, 12], "failur": 10, "fals": [6, 14], "far": [12, 14], "fast": [9, 12], "faster": 9, "favor": 8, "fcf": 8, "featur": [1, 9], "feel": 12, "fetch": 2, "field": 15, "file": [0, 2, 4, 11, 12, 14], "file_respons": 12, "fill": 7, "fillmor": 3, "final": 12, "find": [7, 11, 14], "finish_reason": [12, 15], "first": [1, 2, 6, 7, 8, 14], "fit": 12, "fix": 17, "flashinf": [9, 10], "flask": 12, "flexibl": 9, "float": 14, "flow": 9, "fluenci": 12, "flush": [7, 14], "focus": 12, "folder": [2, 4, 16], "follow": [1, 2, 5, 6, 7, 8, 11, 12, 14, 16], "forev": 16, "fork": [2, 7], "format": [2, 7, 12, 14], "forward": [9, 11], "forward_batch": 11, "found": [7, 12], "fp16": 1, "fp8": [1, 8, 9], "fp8_e5m2": 1, "fraction": [1, 14, 17], "framework": [9, 12], "franc": [1, 3, 7, 14], "frequenc": 12, "frequency_penalti": [12, 14], "frequent": 8, "from": [4, 5, 6, 7, 12, 15], "from_pretrain": 6, "frontend": [5, 10], "full": [1, 8], "function": [3, 7, 11], "function_cal": [12, 15], "further": [10, 12], "futur": [1, 11], "g": [2, 10, 11, 12, 16], "galaxi": 12, "gemini": 7, "gemma": [1, 9], "gen": [3, 7, 8], "gener": [0, 1, 9, 12, 14, 15], "generatereqinput": 14, "get": [6, 10, 11, 14], "git": [10, 16], "github": [0, 10, 14], "give": [11, 16], "given": 14, "glimps": 14, "gloo_socket_ifnam": 1, "gnupg": 2, "good": 8, "googl": [7, 15], "gpt": 7, "gptq": 9, "gpu": [1, 8, 10, 14, 16], "graph": [1, 2, 17], "greedy_token_select": 3, "grok": 1, "group": 16, "gryffindor": 7, "gte": [1, 6], "guid": [9, 10, 14, 15], "h": [1, 6, 15], "h100": [10, 14], "ha": [8, 11], "had": 12, "haisgl": 16, "half": 7, "hand": 8, "handl": [1, 2, 14], "happen": 8, "hardwar": 14, "harri": 7, "hasattr": 12, "have": [0, 1, 3, 8, 12, 14, 15], "healthi": [7, 8], "hello": 1, "help": [1, 7, 8, 11, 12, 14, 15], "henryx": 16, "her": 12, "here": [1, 6, 7, 12, 15], "hf": 5, "hf_home": 16, "hf_token": [10, 16], "hf_xxx": 16, "high": [3, 8, 12, 14, 15], "higher": [12, 14], "highest": [3, 7], "historian": 12, "hit": 14, "host": [6, 10, 12, 15], "hostnam": 1, "hous": 7, "how": [1, 3, 4, 7, 9], "html": [0, 2], "http": [0, 2, 6, 7, 10, 12, 13, 14, 15, 16], "hub": 10, "hufflepuff": 7, "hug": [1, 5], "huggingfac": [10, 11, 16], "human": 15, "hyperparamet": [1, 9], "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17], "id": [12, 14, 15], "ident": 11, "identifi": 12, "idiom": 15, "ignor": 14, "ignore_eo": 14, "im_end": [5, 14], "im_start": [5, 14], "imag": [7, 10, 14], "image_data": 14, "image_fil": 7, "image_id": 10, "image_qa": 7, "implement": [3, 11, 12], "import": [1, 2, 6, 7, 8, 12, 14, 15], "includ": [7, 9, 12, 15], "incorrect": 3, "increas": 8, "incur": 3, "independ": 10, "index": [2, 12, 15], "indic": 8, "industri": [9, 15], "inf": 14, "infer": [1, 14], "inform": [7, 14, 15], "infra": 10, "init": 1, "initi": [3, 12, 15], "input": [1, 2, 7, 9, 12, 14, 15], "input_file_id": 12, 
"input_file_path": 12, "input_id": [6, 14], "input_ids_embed": 6, "insid": 16, "instal": [0, 2, 4, 9, 13, 15, 16], "installationguid": 2, "instanc": 3, "instead": [1, 17], "instruct": [1, 2, 6, 7, 10, 12, 14, 15], "int": 14, "int4": 9, "int4wo": 1, "integr": 9, "intellig": 15, "inter": 14, "interact": 9, "interconnect": 15, "interfac": [9, 11], "internlm": 1, "interpret": [12, 15], "intfloat": 6, "intuit": 9, "invok": 7, "io": 0, "ip": [1, 7], "ipc": 10, "ipynb": 12, "issu": [7, 10, 17], "itali": 12, "iter_lin": 14, "itl": 14, "its": [3, 15], "japan": [7, 12, 15], "job": 12, "joke": 12, "json": [1, 2, 5, 6, 12, 14, 15], "json_decod": 7, "json_output": 7, "json_schema": 14, "jsonl": 12, "jump": 9, "just": [5, 12], "k": 14, "k8": 10, "keep": 12, "kei": [2, 7, 12, 15], "kernel": [9, 10, 17], "kfd": 16, "kingdom": 7, "knowledg": 12, "kv": [1, 8], "l": 14, "l4": 10, "l40": 10, "lab": [1, 14], "label": 16, "lang": 14, "languag": [5, 9, 10, 12, 15], "larg": [1, 2, 8, 9, 15], "last": 10, "later": [3, 16], "latest": 10, "launch": [1, 2, 5, 7, 9, 10, 14, 17], "launch_serv": [1, 2, 5, 6, 7, 10, 12, 14, 15], "layer": [11, 15], "layer_id": 11, "learn": [1, 4, 11, 15], "least": 14, "len": [1, 2, 14], "length": [7, 12, 14], "less": 12, "let": 1, "level": [6, 12, 14, 15], "librari": 7, "light": 12, "like": [8, 12, 15], "likelihood": 12, "limit": 3, "line": [12, 15], "lint": 4, "linux": 16, "list": [1, 2, 7, 11, 12, 14, 15, 17], "llama": [2, 5, 7, 9, 10, 11, 12, 14, 15], "llama3": 1, "llava": [1, 9, 14], "llava_llama_3": 1, "llm": [1, 3, 9, 15], "lmm": [1, 14], "lmsysorg": 10, "load": [1, 2, 6, 8, 12, 14], "load_imag": 14, "local": 10, "local_example_llava_next": 7, "localhost": [0, 1, 6, 7, 12, 14, 15], "locat": 14, "log": [6, 7, 8, 12, 15], "logic": 14, "logit": [7, 11, 14], "logit_bia": 12, "logitsprocessor": 11, "logprob": [3, 12, 14, 15], "logprob_start_len": 14, "london": 3, "long": [1, 12, 15], "longer": [3, 12], "longest": 8, "look": [5, 8], "loop": 7, "low": 14, "lower": [8, 12], "lpm": 8, "lsb": 2, "m": [0, 1, 2, 5, 6, 7, 10, 11, 12, 14, 15], "machin": 10, "magic": 7, "mai": [1, 2, 7, 17], "main": [1, 14], "maintain": 11, "major": [11, 12], "make": [0, 8, 9, 11, 12, 14], "manag": 7, "mani": [3, 8, 11], "manner": 14, "mask": 7, "match": 8, "matched_stop": [12, 15], "math": 7, "max": 14, "max_check": 12, "max_new_token": [1, 8, 14], "max_token": [1, 7, 12, 15], "maximum": [12, 14], "md": 4, "me": 12, "mean": [8, 14], "meanwhil": 5, "measur": 14, "media": 15, "median": 14, "meet": 1, "mem": [1, 14, 17], "memori": [1, 2], "messag": [1, 7, 12, 15], "meta": [1, 2, 5, 7, 10, 12, 14, 15], "method": [9, 12], "mild": 12, "millard": 3, "min_new_token": 14, "min_p": 14, "minicpm": 1, "ministri": 7, "mislead": 3, "miss": 5, "mistral": [1, 6, 9], "mix": 14, "mixtral": 1, "modal": [1, 9], "mode": 12, "model": [2, 3, 5, 8, 9, 10, 12, 14, 15, 16], "model_path": 1, "moder": 12, "modifi": 12, "moe": 1, "monitor": 12, "month": 12, "more": [1, 9, 10, 12, 14, 15], "most": [5, 8, 11, 12], "mount": 16, "muggl": 7, "mulit": 7, "multi": [1, 9], "multi_turn_quest": 7, "multipl": [1, 12, 15], "multipli": 14, "must": 14, "my": 1, "my_model": 5, "my_model_templ": 5, "n": [7, 12, 14, 15], "n1": [12, 15], "n2": [12, 15], "n3": [12, 15], "n4": 15, "n5": 15, "name": [1, 2, 3, 5, 7, 14, 16], "natur": [12, 15], "nbecaus": 12, "nccl": 1, "nclean": 12, "ndescrib": 14, "need": [2, 5, 7, 10, 11, 16], "nemo": 1, "nest": 7, "network": 15, "neural": 15, "neuron": 15, "new": [1, 8, 9, 12, 13, 16], "new_token_ratio": 8, "next": [1, 6, 12, 
15], "ngener": 1, "nlarg": 15, "nlist": 12, "nlp": [1, 6, 15], "nnode": 1, "node": [1, 2, 15], "non": 7, "none": [6, 12, 14, 15], "normal": 7, "note": [1, 2, 5, 11, 14, 16], "novel": 12, "noveral": 15, "now": 7, "npython": 12, "nrequest": 12, "nsome": 15, "nsy": 2, "nthe": 15, "nuanc": 15, "nucleu": 12, "null": [10, 15], "num": [1, 2, 14], "number": [8, 12, 14], "numer": 15, "nvidia": [2, 14, 16], "nvtx": 2, "nyou": 14, "o": [2, 6, 14, 16], "object": [12, 15], "obtain": 3, "occasion": 8, "occup": 7, "offer": 9, "offici": 5, "offlin": 1, "often": 15, "okai": 8, "olmo": 1, "omit": 3, "onc": [1, 3, 6, 15], "one": [3, 7, 12, 14, 15], "onevis": [1, 14], "onli": [2, 3, 7, 10, 11, 12, 14], "onlin": [1, 2], "only_run": 11, "oom": [8, 14], "open": [9, 10, 12], "openai": [3, 5, 9, 10, 14], "openai_api_kei": [7, 16], "oper": 10, "optim": 17, "option": [3, 12, 14], "order": 7, "other": [3, 8, 10, 11, 14], "out": [1, 2, 7, 10, 17], "output": [1, 2, 11, 12, 14], "output_file_id": 12, "ov": [1, 14], "overhead": [8, 14], "overlap": [3, 8], "overrid": 5, "own": [1, 10, 14], "p": [10, 14], "p2p": 1, "p99": 14, "page": [9, 17], "paragraph": 7, "parallel": [1, 8, 9, 14], "paramet": [8, 9], "pari": 3, "part": 11, "partial": 12, "pass": [4, 7, 11], "path": [0, 1, 2, 3, 5, 6, 7, 10, 12, 14, 15], "patronu": 7, "pattern": 15, "peer": 1, "penal": [12, 14], "penalti": 12, "per": 14, "perform": 3, "phoenix": 7, "phrase": 12, "piec": 15, "pilot": 12, "pip": [0, 2, 13, 16], "pip3": 4, "plan": 10, "planet": 12, "platform": 15, "playground": 11, "pleas": [1, 7, 10], "png": 14, "pool": [1, 8], "poorli": 3, "popular": 15, "port": [1, 5, 6, 7, 10, 12, 14, 15], "post": [12, 14, 15], "post2": 10, "post3_vllm0": 16, "potter": 7, "power": 15, "pre": 4, "predict": 3, "prefer": 12, "prefil": [1, 2, 9, 11], "prefix": [8, 9], "prerequisit": 2, "presenc": 12, "presence_penalti": [12, 14], "presid": [1, 3], "prev": 14, "primit": [3, 7], "print": [1, 2, 6, 7, 12, 14, 15], "probabl": [7, 12], "proceed": [6, 12, 15], "process": [12, 14, 15], "profil": 9, "program": [9, 10, 12], "programm": 12, "progress_bar": 7, "project": [0, 5, 10, 13, 14, 16], "prompt": [1, 2, 7, 9, 12, 14], "prompt_token": [12, 15], "prompt_tokens_detail": [12, 15], "proper": 10, "provid": [1, 2, 7, 9, 10, 12, 15], "pub": 2, "pull": 16, "pure": 7, "purpos": 12, "py": [0, 1, 2, 5, 7, 11, 13, 14], "pydant": 7, "pyproject": 13, "python": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15], "python3": [0, 1, 2, 10, 11, 14, 16], "pytorch": [10, 17], "q": 7, "quantiz": [1, 9], "queri": [1, 15], "quest": 12, "question": [7, 15], "question_1": 7, "question_2": 7, "queue": 8, "quick": [2, 9], "quick_start": 7, "qwen": [1, 9, 11], "qwen2": [1, 6, 11, 14], "r": [0, 1, 7], "radix": 2, "radixattent": [9, 11], "rais": 12, "ran": 14, "random": [2, 12, 14], "rang": [8, 9, 12, 15], "rank": 1, "rate": 14, "ravenclaw": 7, "raw": 14, "rb": 12, "reach": 14, "read": 12, "readi": [6, 12, 15], "readm": 4, "readme_exampl": 7, "real": [1, 2, 12], "reason": 12, "recommend": [2, 10, 12, 14, 15], "recoveri": 10, "reduc": [1, 8, 12], "refer": [1, 11, 12], "reference_hf": 11, "refus": [12, 15], "regex": [7, 14], "regist": 5, "regular": [7, 14], "regular_expression_gen": 7, "relat": [5, 10], "relationship": 15, "releas": [2, 10], "relev": 14, "rememb": 6, "remot": 10, "remov": [0, 11], "repeat": 14, "repetit": 12, "repetition_penalti": 14, "replac": [1, 10, 11], "repo": 2, "report": [1, 17], "repres": 12, "reproduc": 12, "req": [8, 12, 14], "request": [1, 7, 9, 12, 14], "request_count": 12, "request_id": 12, 
"requir": 0, "resourc": [10, 11], "respond": 15, "respons": [1, 3, 6, 12, 14, 15], "response_format": 12, "restart": 16, "result": [3, 12, 14], "result_cont": 12, "result_file_id": 12, "retoken": 14, "retracted_req": 8, "retriev": 12, "return": [12, 14], "return_logprob": 14, "return_text_in_logprob": 14, "reus": 11, "revolution": 15, "rid": 14, "rm": 16, "rmsnorm": 11, "role": [1, 12, 15], "rome": 12, "root": 10, "run": [0, 2, 4, 6, 7, 11, 14, 15], "run_batch": 7, "runner_allow_runasroot": 16, "running_request": 14, "runtim": [9, 10], "runtimeendpoint": [3, 7], "same": [1, 2, 6, 7, 11, 14], "sampl": [9, 10, 11, 12, 17], "sampling_param": [1, 14], "scalabl": 12, "scale": [10, 14], "scan": 12, "schema": [7, 14], "script": 11, "search": [7, 12], "second": 12, "secret": 10, "section": [14, 15], "see": [1, 7, 8, 10, 14], "seed": 12, "select": [7, 10], "send": [1, 8, 9, 12, 14], "send_request": 12, "sens": 12, "sent": 12, "sentenc": 14, "sep": 5, "sep_styl": 5, "sequenc": 12, "serv": [1, 2, 8, 9, 10, 14], "server": [0, 2, 5, 7, 8, 9, 12, 14], "server_process": [12, 15], "servic": [10, 12, 15], "service_ti": [12, 15], "set": [1, 2, 5, 7, 10, 12, 14, 15, 17], "set_default_backend": 7, "sever": [1, 2], "sgl": [0, 1, 3, 7, 10, 13, 14, 16], "sgl0": 16, "sglang": [2, 4, 6, 12, 13, 15, 16], "sglang_is_in_ci": 16, "sglang_use_modelscop": 1, "sh": 13, "share": [8, 16], "she": 12, "shell": 6, "ship": 12, "shm": 16, "short": [12, 14], "shorter": [3, 15], "should": [5, 11], "show": 7, "side": 12, "sign": 12, "siluandmul": 11, "similar": [11, 12, 14], "simpl": [7, 12, 15], "simpli": 3, "sinc": [12, 14], "singl": [1, 2, 10, 11, 12, 14], "siri": 15, "size": [1, 2, 16], "sk": [7, 16], "skip": 14, "skip_special_token": 14, "sky": 10, "skyserv": 10, "sleep": [12, 16], "slightli": 12, "slytherin": 7, "sm75": 10, "small": [1, 8], "smaller": [1, 17], "smollm": 1, "smooth": 12, "snippet": 2, "so": [1, 2, 12, 14], "social": 15, "some": [2, 6, 7, 11, 14, 16, 17], "sometim": 17, "sourc": [2, 9], "space": [12, 14], "spaces_between_special_token": 14, "special": 14, "specif": [1, 10, 11, 15], "specifi": [1, 3, 5, 7, 12, 14, 15, 16], "split": 12, "srt": [9, 10, 14], "stabl": 12, "stablelm": 1, "stai": 7, "stand": [8, 15], "start": [11, 14], "startswith": 14, "state": [1, 7, 12], "static": [1, 2, 14, 17], "statu": [7, 10, 12], "status_cod": 12, "step": [6, 12, 15], "still": 14, "stop": [7, 8, 12, 14, 15], "stop_str": 5, "stop_token_id": 14, "store": 14, "stori": [12, 15], "str": 14, "strang": 12, "strategi": 1, "stream": [1, 12], "stream_opt": 12, "string": [8, 12, 14], "strip": [12, 14], "strong": 3, "strongli": [12, 15], "structur": [9, 15], "student": 7, "subprocess": 6, "subset": 3, "success": 14, "successfulli": 12, "suffix": 12, "suggest": 8, "summar": 15, "summari": [7, 15], "suppli": [3, 14], "support": [3, 6, 7, 9, 10, 14, 15], "sure": [0, 11, 12, 14], "switch": 10, "sxm5": 14, "system": [1, 2, 5, 7, 12, 14, 15], "system_fingerprint": [12, 15], "t": 12, "t4": 10, "take": [8, 12], "teacher": 7, "tee": 2, "tell": 12, "temperatur": [1, 7, 12, 14, 15], "templat": [1, 7, 14], "temporarili": 5, "tensor": [1, 9], "termin": 10, "terminate_process": [6, 12, 15], "test": [1, 2, 14, 15, 16], "test_generation_model": 11, "test_oth": 11, "test_vision_openai_serv": 1, "testgenerationmodel": 11, "text": [1, 6, 11, 12, 14, 15], "text_complet": 12, "text_embed": 6, "text_it": 7, "text_qa": 7, "thei": [12, 14], "them": [10, 15, 17], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 14, 15, 17], "thing": 8, "through": [7, 12, 14], 
"throughput": [1, 14], "time": [1, 2, 6, 12, 14], "tip": 17, "tip_suggest": 7, "tmp": 16, "todai": 1, "togeth": [1, 8], "tok": 14, "token": [1, 5, 6, 7, 8, 9, 10, 12], "token_id": 14, "token_length_norm": 3, "tokenizers_parallel": 6, "tokyo": [12, 15], "toml": 13, "too": 8, "tool": [7, 15], "tool_cal": [12, 15], "tool_us": 7, "top": 14, "top_k": 14, "top_logprob": 12, "top_logprobs_num": 14, "top_p": [1, 12, 14], "topic": [12, 15], "torch": [1, 8], "torch2": 10, "torchao": 1, "total": [1, 12, 14], "total_token": [12, 15], "tp": 1, "tpot": 14, "tr": 2, "trace": 2, "traffic": 14, "train": [2, 15], "transform": [6, 11, 15], "transit": 12, "translat": 15, "trepid": 12, "triton": 10, "troubleshoot": 9, "true": [1, 2, 6, 7, 12, 14, 16], "truncat": [1, 2], "try": [1, 12, 14, 17], "ttft": 14, "tune": [1, 9, 14], "turbo": 7, "turn": 7, "twine": 13, "two": [1, 5, 7, 11, 12], "txt": 0, "type": [1, 6, 15], "typic": 15, "u": 3, "ubuntu": 2, "ubuntu1804": 2, "ubuntu22": 16, "unconditional_likelihood_norm": 3, "under": [2, 4, 11], "understand": [11, 15], "union": 14, "uniqu": 12, "unit": [1, 2, 7, 12], "unittest": 11, "until": 14, "up": [10, 12], "updat": [0, 2, 16], "upgrad": 10, "upload": 12, "upload_pypi": 13, "uploaded_fil": 12, "upon": [1, 6], "url": [12, 14], "us": [2, 3, 4, 5, 8, 12, 14, 16], "us_president_exampl": 3, "usag": [1, 3, 8, 15], "user": [1, 3, 5, 7, 8, 12, 14, 15], "usual": [12, 14], "utf": [12, 14], "util": [6, 8, 12, 14, 15], "v": [10, 16], "v0": 10, "v1": [1, 6, 12, 15], "valid": 12, "valu": [1, 8, 12, 14, 17], "valuabl": 11, "variabl": [1, 16], "variant": 2, "variou": [1, 12, 15], "vast": [12, 15], "veri": [8, 11, 12, 14], "verifi": 12, "version": 10, "vertexai": 7, "via": 12, "video": 16, "view": 1, "virtual": 15, "vision": [1, 9], "visit": 0, "vl": 1, "vocab_s": 14, "w": [7, 12], "wa": 12, "wai": [6, 11], "wait": 15, "wait_for_serv": [6, 12, 15], "wand": 7, "want": [1, 14], "warn": [8, 12, 15], "washington": 12, "we": [1, 12, 14], "web": 12, "week": 12, "weight": [1, 2, 16], "welcom": 5, "well": 11, "were": [12, 14], "what": [3, 7, 12, 15], "when": [5, 7, 8, 12, 14, 17], "where": [3, 12], "whether": 14, "which": [8, 12, 14, 15], "while": [1, 2, 10, 12, 14, 16], "whl": 10, "who": 12, "why": 12, "wide": [9, 12, 15], "within": 7, "without": [2, 10, 12], "wood": 7, "word": [7, 12], "work": [1, 5, 8, 16], "workflow": 7, "workload": 8, "write": [0, 12], "x64": 16, "x86_64": 2, "xvers": 1, "xxx": 16, "y": [2, 16], "yaml": 10, "yi": 1, "yml": 10, "you": [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "your": [0, 1, 5, 7, 9, 10, 12, 14, 15], "zara": 12, "zip": 1}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Benchmark and Profiling", "Choices Methods in SGLang", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Embedding Model", "Frontend: Structured Generation Language (SGLang)", "Guide on Hyperparameter Tuning", "SGLang Documentation", "Install SGLang", "How to Support a New Model", "OpenAI Compatible API", "PyPI Package Release Process", "Sampling Parameters in SGLang Runtime", "Quick Start: Launch A Server and Send Requests", "Set Up Self-hosted Runners for GitHub Action", "Troubleshooting"], "titleterms": {"1": [1, 10, 16], "2": [10, 16], "3": [1, 10, 16], "4": 10, "405b": 1, "5": 10, "A": [6, 15], "The": 17, "With": 10, "access": 17, "achiev": 8, "action": 16, "add": [4, 11, 16], "addit": 1, "advanc": 8, "all": 14, "an": 17, "api": [1, 6, 12, 15], "argument": 1, "avoid": 8, "backend": [1, 9], "baselin": 14, "batch": [7, 12], "benchmark": [1, 2, 
14], "build": 0, "chat": [5, 12], "choic": 3, "chunk": 8, "clean": 0, "cloud": 10, "code": [4, 13], "common": 10, "compat": [1, 6, 12, 15], "complet": 12, "compos": 10, "config": 16, "configur": 16, "conserv": 8, "constrain": 7, "contain": 16, "contributor": 4, "control": 7, "correct": 11, "cuda": 17, "curl": 6, "custom": 5, "debug": 11, "decod": 7, "depend": 0, "deploi": 0, "detail": 7, "docker": [10, 16], "document": [0, 9], "dp": 8, "embed": 6, "encount": 17, "engin": 1, "error": 17, "exampl": [7, 14], "featur": 7, "flow": 7, "format": 4, "fraction": 8, "frequenc": 14, "from": [1, 10, 11], "frontend": [7, 9], "gener": 7, "get": 9, "github": [13, 16], "greedi": 3, "guid": [4, 8], "hang": 17, "host": 16, "how": 11, "http": 1, "hyperparamet": 8, "id": 6, "illeg": 17, "implement": 7, "implic": 14, "input": 6, "instal": 10, "interact": 11, "json": 7, "kubernet": 10, "languag": 7, "latenc": 14, "launch": [6, 15], "length": 3, "likelihood": 3, "llama": 1, "local": 7, "make": 13, "max": 8, "mem": 8, "memori": [8, 14, 17], "method": [3, 10], "min": 14, "modal": [7, 14], "model": [1, 6, 7, 11], "modelscop": 1, "more": 7, "multi": [7, 14], "new": [11, 14], "normal": [3, 14], "note": 10, "nsight": 2, "openai": [1, 6, 7, 12, 15], "option": 8, "other": 2, "out": 8, "packag": 13, "parallel": 7, "paramet": [12, 14], "peak": 8, "penalti": 14, "perform": [1, 14], "pip": 10, "polici": 8, "port": 11, "prefil": 8, "presenc": 14, "preview": 0, "process": 13, "profil": 2, "pypi": 13, "quick": [1, 7, 15], "refer": 9, "releas": 13, "repetit": 14, "request": [8, 15], "role": 7, "run": [1, 8, 10, 16], "runner": 16, "runtim": [1, 5, 14], "sampl": 14, "schedul": 8, "select": 3, "self": 16, "send": 15, "serv": 0, "server": [1, 6, 15, 17], "set": 16, "sglang": [0, 1, 3, 5, 7, 9, 10, 11, 14], "sh": 16, "size": 8, "skypilot": 10, "sourc": 10, "speed": 8, "srt": 1, "start": [1, 7, 9, 15, 16], "static": 8, "step": 16, "stream": [7, 14], "structur": 7, "submiss": 8, "suit": 11, "support": [1, 11], "templat": 5, "test": [4, 11], "throughput": 8, "tip": [2, 7], "togeth": 14, "token": [3, 14], "tp": 8, "troubleshoot": 17, "try": 8, "tune": 8, "tutori": 9, "uncondit": 3, "unit": 4, "up": 16, "updat": 13, "upload": 13, "us": [1, 6, 7, 10, 15], "usag": 12, "version": 13, "vllm": 11, "wa": 17, "websit": 0, "without": 1, "your": [4, 8]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Achieving Peak Throughput": [[8, "achieving-peak-throughput"]], "Add Unit Tests": [[4, "add-unit-tests"]], "Add a Runner": [[16, "add-a-runner"]], "Add the model to the test suite": [[11, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "All Together": [[14, "all-together"]], "Avoid out-of-memory by Tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[8, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[9, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Baseline": [[14, "baseline"]], "Batches": [[12, "Batches"]], "Batching": [[7, "batching"]], "Benchmark": [[2, "benchmark"]], "Benchmark Performance": [[1, "benchmark-performance"]], "Benchmark and Profiling": [[2, null]], "Benchmarks": [[14, "benchmarks"]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[17, "cuda-error-an-illegal-memory-access-was-encountered"]], "Chat Completions": [[12, "Chat-Completions"]], 
"Choices Methods in SGLang": [[3, null]], "Clean": [[0, "clean"]], "Common Notes": [[10, "common-notes"]], "Completions": [[12, "Completions"]], "Constrained Decoding": [[7, "constrained-decoding"]], "Contributor Guide": [[4, null]], "Control Flow": [[7, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[5, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Model": [[6, null]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Examples": [[14, "examples"]], "Format Your Code": [[4, "format-your-code"]], "Frequency Penalty": [[14, "frequency-penalty"]], "Frontend Tutorial": [[9, null]], "Frontend: Structured Generation Language (SGLang)": [[7, null]], "Getting Started": [[9, null]], "Greedy Token Selection": [[3, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[8, null]], "How to Support a New Model": [[11, null]], "Install SGLang": [[10, null]], "Interactive debugging": [[11, "interactive-debugging"]], "JSON Decoding": [[7, "json-decoding"]], "Language Feature": [[7, "language-feature"]], "Latency": [[14, "latency"]], "Launch A Server": [[6, "Launch-A-Server"]], "Launch a server": [[15, "Launch-a-server"]], "Make a release in GitHub": [[13, "make-a-release-in-github"]], "Memory": [[14, "memory"]], "Method 1: With pip": [[10, "method-1-with-pip"]], "Method 2: From source": [[10, "method-2-from-source"]], "Method 3: Using docker": [[10, "method-3-using-docker"]], "Method 4: Using docker compose": [[10, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[10, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[3, "methods"]], "Min New Tokens": [[14, "min-new-tokens"]], "More Examples": [[7, "more-examples"]], "Multi modal": [[14, "multi-modal"]], "Multi-Modality": [[7, "multi-modality"]], "Normal": [[14, "normal"]], "OpenAI Compatible API": [[1, "openai-compatible-api"], [12, null]], "Other tips": [[2, "other-tips"]], "Parallelism": [[7, "parallelism"]], "Parameters": [[12, "Parameters"], [12, "id2"]], "Performance Implications on Penalties": [[14, "performance-implications-on-penalties"]], "Port a model from vLLM to SGLang": [[11, "port-a-model-from-vllm-to-sglang"]], "Presence Penalty": [[14, "presence-penalty"]], "Profile with Nsight": [[2, "profile-with-nsight"]], "PyPI Package Release Process": [[13, null]], "Quick Start": [[1, "quick-start"], [7, "quick-start"]], "Quick Start: Launch A Server and Send Requests": [[15, null]], "References": [[9, null]], "Repetition Penalty": [[14, "repetition-penalty"]], "Roles": [[7, "roles"]], "Run Llama 3.1 405B": [[1, "run-llama-3-1-405b"]], "SGLang Documentation": [[0, null], [9, null]], "Sampling Parameters in SGLang Runtime": [[14, null]], "Send a Request": [[15, "Send-a-Request"]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-hosted Runners for GitHub Action": [[16, null]], "Step 1: Start a docker container.": [[16, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[16, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[16, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[7, "streaming"], [14, "streaming"]], "Supported Models": [[1, "supported-models"]], "Test the correctness": [[11, "test-the-correctness"]], "The server hangs": [[17, "the-server-hangs"]], "Tips and Implementation Details": [[7, "tips-and-implementation-details"]], "Token Length Normalized": [[3, "token-length-normalized"]], "Troubleshooting": [[17, null]], "Try Advanced 
Options": [[8, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[8, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[8, "tune-schedule-conservativeness"]], "Tune --schedule-policy": [[8, "tune-schedule-policy"]], "Tune Your Request Submission Speed": [[8, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[3, "unconditional-likelihood-normalized"]], "Update the version in code": [[13, "update-the-version-in-code"]], "Upload the PyPI package": [[13, "upload-the-pypi-package"]], "Usage": [[12, "Usage"], [12, "id1"]], "Use Curl": [[6, "Use-Curl"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Input IDs": [[6, "Using-Input-IDs"]], "Using Local Models": [[7, "using-local-models"]], "Using OpenAI Compatible API": [[6, "Using-OpenAI-Compatible-API"], [15, "Using-OpenAI-Compatible-API"]], "Using OpenAI Models": [[7, "using-openai-models"]]}, "docnames": ["README", "backend", "benchmark_and_profiling", "choices_methods", "contributor_guide", "custom_chat_template", "embedding_model", "frontend", "hyperparameter_tuning", "index", "install", "model_support", "openai_api", "release_process", "sampling_params", "send_request", "setup_github_runner", "troubleshooting"], "envversion": {"nbsphinx": 4, "sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend.md", "benchmark_and_profiling.md", "choices_methods.md", "contributor_guide.md", "custom_chat_template.md", "embedding_model.ipynb", "frontend.md", "hyperparameter_tuning.md", "index.rst", "install.md", "model_support.md", "openai_api.ipynb", "release_process.md", "sampling_params.md", "send_request.ipynb", "setup_github_runner.md", "troubleshooting.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 6, 7, 8, 10, 11, 12, 14, 15], "0": [1, 6, 7, 8, 10, 12, 14, 15, 16, 17], "00": [6, 12, 15], "0000": 8, "0006747245788574219": 6, "0006804466247558594": 6, "000682830810546875": 6, "0020961761474609375": 6, "0020999908447265625": 6, "003025054931640625": 6, "0030345916748046875": 6, "006198883056640625": 6, "006214141845703125": 6, "00807952880859375": 6, "00830078125": 6, "00830841064453125": 6, "009002685546875": 6, "01": [6, 7, 8, 12, 14, 15], "01239013671875": 6, "01438140869140625": 6, "02": [6, 12, 14, 15], "03": [6, 12, 14], "04": [12, 14, 16], "04a5": 12, "05": [6, 12, 14, 15], "06": [12, 14], "07": [6, 12], "08": [12, 14], "09": [6, 12, 15], "0_rocm6": 16, "0_triton3": 16, "1": [2, 6, 7, 8, 11, 12, 14, 15], "10": [1, 2, 6, 12, 14, 15], "100": [6, 7, 12, 15], "10025": 12, "100525": 12, "10094": 12, "101": 14, "10192": 12, "1025173": 6, "103": 14, "10399": 12, "104": 14, "10405": 14, "10640": 12, "10666": 14, "107": 14, "10767": 14, "108": 12, "10850": 12, "11": [6, 12, 14], "114": 14, "11586": 14, "117": 14, "11732": 14, "12": [6, 10, 14, 15, 16], "120": 12, "120525": 12, "123": 12, "125": 12, "127": [1, 6, 12, 15], "128": [1, 12, 14], "128009": [12, 15], "129": 12, "12it": 12, "13": [12, 14, 15], "130": 12, "131072": [6, 12, 15], "133": 12, "134": 12, "135": 12, "137": 15, "138": 15, "139": 15, "14": [6, 12, 15], "14025": 12, "140525": 12, "142": 12, "14226": 14, "15": [6, 15], "150": 12, "16": [1, 6, 7, 12, 
14, 15], "160": [6, 12, 15], "160525": 12, "16219": 14, "16384": [6, 12, 15], "167": 15, "16740": 14, "16it": 15, "17": [12, 14, 15], "17125": 14, "17167": 14, "172": 1, "1730107007": 12, "1730107008": 12, "1730107009": 12, "1730107011": 12, "1730107096": 15, "1730107097": 15, "174": 14, "175": 12, "17734": 12, "179": 14, "17it": 15, "18": [6, 12, 14, 15], "18025": 12, "180525": 12, "18388": 12, "18895": 14, "189": 14, "19": 12, "19058": 12, "191": 14, "195": 14, "1980": 12, "19807": 12, "19884": 14, "1st": 14, "2": [1, 5, 6, 7, 9, 12, 14, 15], "20": [12, 15], "200": [6, 12, 15], "20000": 1, "200525": 12, "2024": [6, 12, 15], "2048": [2, 8], "2049": [12, 15], "205": 14, "20525": 12, "20585": 12, "207": 15, "20866": 14, "21": [6, 12], "210": 12, "2106": 12, "21433": 12, "21it": 12, "22": [12, 15], "22025": 12, "22095": 14, "22312": 12, "22363": 14, "22603": 14, "23": 12, "2325": 12, "233": [8, 14], "23385": 12, "237179517": 6, "23892": 14, "2394": 12, "24": [12, 14], "243": [6, 12, 15], "24587": 12, "247": 15, "24h": 12, "24it": 15, "25": [6, 7, 12, 15], "256": [1, 2, 6, 7, 12, 14, 15], "26": [12, 14], "26025": 12, "268": 14, "27": [12, 14], "271": 14, "27879a06": 12, "2790": 12, "28": [6, 12, 15], "287": 15, "29": [6, 12, 14, 15], "293": 14, "29542e83d53f44eea0c01d1f517c4b40": 15, "29c7": 12, "3": [2, 5, 6, 7, 8, 9, 12, 14, 15], "30": [12, 14], "3000": 14, "30000": [1, 5, 7, 10, 12, 14, 15], "30010": 6, "30025": 12, "308": 14, "31": 14, "317": 8, "32": [1, 2, 6, 12, 14, 15], "320": 14, "3226": 12, "327": 15, "33": 6, "331": 15, "333": 12, "33df398d": 12, "34": [6, 12, 15], "34025": 12, "347192970": 15, "35": [12, 14], "35516": 15, "35530": 15, "35536": 15, "35540": 15, "35554": 15, "35it": 12, "36": [12, 14], "36680": 12, "36690": 12, "36696": 12, "367": 15, "36706": 12, "36708": 12, "37": [12, 14, 15], "370959": 8, "371": 12, "378": 15, "378633": 14, "38": [12, 14], "38025": 12, "3869": 12, "39": [6, 12, 14, 15], "390b6931283540278af6151e5665b9e6": 12, "395": 12, "4": [1, 6, 7, 12, 15], "40": [6, 12, 14], "4005": 6, "40525": 12, "40866": 12, "40881": 14, "409": 14, "4096": [1, 2, 6, 8, 12, 15], "40db": 12, "40it": 15, "41": [6, 12, 14], "41888": 14, "41ef": 12, "42": 12, "42025": 12, "43": [6, 12], "433": 14, "43967": 14, "43c2": 12, "44": 14, "440": 14, "442913": [12, 15], "4456": 12, "447": 14, "44926": 14, "45": [12, 14, 15], "453": 14, "45354": 14, "45445": 14, "455": 14, "4594": 8, "46": [12, 14, 15], "46025": 12, "46530": 14, "47": [12, 14, 15], "47738": 14, "48": 12, "48056": 12, "48302": 14, "4832": 14, "48960": 14, "49": [12, 15], "49017": 14, "49263": 14, "495a": 12, "4995": 12, "4e7f": 12, "5": [1, 6, 7, 12, 14], "50": [8, 12, 14, 15], "500": [8, 12], "50000": 1, "50025": 12, "50302": 14, "5079": 14, "509328": 6, "51": [12, 14], "510260": 12, "511197": 15, "512": [2, 14], "51it": 12, "52": [1, 6, 12], "5206": 14, "5255": 14, "52554": 14, "52609006": 12, "52825": 14, "52920": 14, "53": 12, "53788": 12, "54": [6, 12, 14], "54497": 14, "54868": 12, "55": [6, 12, 14], "56": [6, 12, 14], "5656": 14, "56c1c364": 12, "57": 6, "5727": 14, "57426": 14, "57it": 15, "58": [14, 15], "59": [6, 12, 14, 15], "59034": 6, "59258": 6, "59274": 6, "59280": 6, "59290": 6, "59300": 6, "5b": 11, "6": [1, 6, 12, 15, 16], "60": [2, 14], "600": [6, 12, 15], "6000": 2, "6025": 12, "60525": 12, "61": [12, 14, 15], "62": 12, "63": [12, 15], "64": [1, 2, 6, 12, 14, 15], "64g": 16, "65": 14, "66": [12, 14], "67": 14, "68": 14, "69": 14, "7": [1, 6, 12], "70": [2, 12, 14], "71": [6, 14], "72": [12, 14], 
"72b": 1, "73": [12, 14], "74": [6, 14], "75": [12, 14, 15], "76": [12, 14], "766008": 14, "77": 12, "774756": 14, "774955": 14, "775118": 14, "775210": 14, "775220": 14, "775651": 14, "78": [6, 12, 14, 15], "79": [12, 14, 15], "7b": [1, 5, 6, 14], "7fa2af80": 2, "8": [1, 6, 12, 14, 15, 17], "8000": 0, "80525": 12, "81": [6, 14], "8192": [6, 12, 15], "82": [8, 12], "83": 14, "84": [12, 14], "8413": 14, "84ab9ffd558f4c5595addde9e7a9b40c": 12, "85": [14, 15], "86": [6, 14], "87": 15, "88": [6, 12, 14, 15], "8840": 12, "89": [6, 12, 14], "890cb3111446": 12, "8925": 12, "8b": [1, 2, 7, 10, 12, 14, 15], "8dd58c0e0eff4036ab377324851c1726": 12, "8dd8": 12, "8fc3": 12, "9": [1, 7, 8, 12, 17], "90": 14, "9012": 12, "91": [12, 14], "9157": 12, "92": 6, "9225459caab5": 12, "927": 12, "93": [12, 14], "9370": 12, "94": 14, "95": [1, 6, 12, 14, 15], "9570": 12, "96": [14, 15], "97": [6, 12, 14], "9739540beefc": 12, "9754": 12, "97b7": 12, "98": [12, 14], "99": 12, "9900": 14, "9969": 12, "9976380bf402": 12, "9982": 12, "9998": 8, "9b8d": 12, "A": [1, 2, 7, 8, 9, 10], "As": 12, "By": [5, 14], "For": [1, 2, 3, 11, 12, 14], "If": [1, 5, 8, 10, 14, 17], "In": [1, 7, 17], "It": [1, 3, 5, 7, 8, 9, 10, 12, 14, 15], "NOT": 5, "On": 8, "THE": 12, "The": [1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16], "Then": [7, 16], "There": 5, "These": 14, "To": [0, 1, 2, 7, 8, 10, 11], "__init__": 13, "__main__": 1, "__name__": 1, "_build": 0, "a10": 10, "a100": 10, "abl": 11, "about": [1, 5, 7, 8, 12, 15], "abov": [2, 3, 10, 12, 14], "acceler": [1, 8, 10], "accept": [12, 14], "access": [0, 1, 10], "accord": [2, 7, 10], "accur": [1, 2], "accuraci": 15, "achiev": 12, "across": 3, "activ": 9, "ad": 10, "ad61027db61649d0bd69f6aa901f1d8c": 15, "add": [1, 2, 6, 7, 8, 14, 17], "addit": [3, 7, 14], "addr": 1, "address": [1, 7], "adopt": 9, "adv": 2, "advanc": 9, "af09": 12, "af5c": 12, "after": 15, "again": 12, "against": 3, "ai": [1, 10, 12, 15], "aim": 12, "air": 12, "algorithm": 14, "alibaba": [1, 6], "alien": 12, "aliv": 7, "all": [0, 1, 3, 4, 7, 8, 10, 11, 16], "all_other_model": 11, "allow": [2, 10, 15], "almost": [1, 8, 11], "also": [1, 5, 6, 7, 8, 14, 15], "altern": [3, 7], "alwai": [8, 12, 15], "amd": 16, "amount": 15, "an": [0, 1, 3, 7, 9, 10, 12, 14, 15, 16], "analysi": 12, "ancient": 12, "ani": [1, 7, 10, 14, 15], "annot": 2, "anoth": [11, 15], "answer": [3, 7, 15], "answer_1": 7, "answer_2": 7, "anthrop": 7, "antidisestablishmentarian": 3, "api": [3, 5, 7, 9, 10, 14], "api_kei": [1, 6, 12, 15], "appear": 14, "append": 12, "appli": 14, "applic": [1, 6, 9, 12, 15], "approach": 10, "apt": [2, 16], "ar": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15], "arch": 16, "architectur": 2, "arg": 3, "argument": [2, 7, 14], "around": 17, "articl": 15, "artifici": [12, 15], "ask": 15, "assert": 12, "assign": [12, 15], "assist": [1, 3, 5, 7, 12, 14, 15], "assistant_begin": 7, "assistant_end": 7, "attain": 8, "attent": [9, 10, 11], "attention_backend": [6, 12, 15], "attract": [3, 7, 12], "audio": [12, 15], "auror": 7, "australia": [12, 15], "author": [6, 15], "auto": [6, 12, 15], "automat": 14, "autoregress": 7, "autosc": 10, "autotoken": 6, "avail": [1, 6, 10, 12, 15], "averag": 3, "avoid": [10, 12], "awq": 9, "b": 10, "b590": 12, "baccd9a49bff": 12, "back": 9, "backend": [2, 3, 10, 14, 17], "backend_input_fil": 12, "backend_result_fil": 12, "bad": 3, "baichuan2": 1, "balanc": [7, 12], "base": [3, 14], "base64": 14, "base_url": [1, 6, 12, 15], "bash": [13, 16], "basic": 14, "batch": [1, 2, 6, 8, 9, 14, 15], "batch_4c254e9a": 12, 
"batch_9c319ff5": 12, "batch_bb7ab5e0": 12, "batch_detail": 12, "batch_id": 12, "batch_job": 12, "batch_request": 12, "batch_respons": 12, "batchrequestcount": 12, "bdb569b5e77147d0b4ebe2a79b451814": 12, "be08": 12, "bearer": [6, 15], "becaus": [7, 8], "befor": [2, 14], "begin": [6, 7, 12, 15], "beij": 12, "being": 8, "below": [7, 10, 14, 16], "bench_lat": [1, 2, 11], "bench_serv": [1, 2, 14], "benchmark": 9, "berlin": 3, "bespok": 3, "better": [1, 8, 10, 11, 12], "between": [1, 14], "bfloat16": [12, 15], "bia": [7, 15], "bin": 16, "blank": 15, "blob": 14, "block": [7, 15], "blogpost": 3, "blood": 7, "bodi": [7, 12, 14], "bogart": 7, "book": 15, "bool": 14, "born": 7, "both": 8, "bottleneck": 8, "bra": 12, "branch": 10, "bras\u00edlia": [12, 15], "brazil": [12, 15], "break": 14, "breath": 12, "breez": 12, "bring": 12, "browser": 0, "bug": 12, "build": [1, 10, 13], "built": 10, "c": [6, 10, 12, 15], "cach": [1, 2, 6, 8, 9, 10, 12, 15, 16], "calcul": 7, "call": [3, 7, 9], "can": [1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "canberra": [12, 15], "cancel": 12, "cancelled_job": 12, "cannot": 14, "capit": [1, 3, 7, 12, 14, 15], "captur": [12, 15], "case": [8, 17], "cb3d35cd73d4": 12, "cd": [4, 10, 13], "ce58": 12, "center": 12, "chain": 9, "chang": [11, 16], "charact": 7, "character_gen": 7, "character_regex": 7, "chat": [1, 7, 14, 15], "chat_exampl": 7, "chat_templ": [5, 6, 12, 15], "chatcomplet": [12, 15], "chatcompletionmessag": [12, 15], "chatglm": 1, "chatml": [1, 5, 14], "check": [1, 10, 12], "check_output": 6, "checkpoint": [1, 2, 6, 12, 15], "china": 12, "choic": [7, 9, 12, 15], "choices_method": 3, "chunk": [1, 9, 14], "chunked_prefill_s": [6, 12, 15], "ci": 4, "civil": 12, "class": 14, "clean": 12, "cli": 2, "client": [1, 2, 6, 12, 15], "clone": [0, 10], "cluster": 10, "co": 9, "code": [2, 6, 7, 15], "coher": 15, "color": 2, "com": [2, 10, 13, 14, 16], "come": [8, 14], "command": [1, 2, 4, 6, 10, 11, 12, 15, 16], "commit": 4, "common": [15, 17], "commun": 9, "compar": 11, "comparison": [3, 11], "compat": [5, 7, 9, 14], "compil": [1, 8], "complet": [1, 6, 7, 15], "completion_token": [12, 15], "completion_tokens_detail": [12, 15], "completion_window": 12, "completionchoic": 12, "completionusag": [12, 15], "complex": 7, "comprehend": 15, "comput": [2, 7, 8, 12, 14], "concis": [12, 15], "conda": 10, "condens": 15, "confid": 3, "config": [1, 2], "connect": [7, 10], "consid": [2, 14], "constrain": [8, 9, 14], "constrained_json_whitespace_pattern": [6, 12, 15], "constraint": 7, "contain": 3, "content": [1, 6, 7, 12, 15], "context": 15, "context_len": [6, 12, 15], "context_length": [6, 12, 15], "continu": [7, 9], "contribut": 5, "contributor": 9, "control": 9, "convers": [5, 15], "convert": 11, "cool": 12, "copi": 10, "core": [7, 9], "corpu": 15, "correct": [2, 14], "cost": 12, "could": 14, "count": 12, "countri": [1, 12, 15], "coverag": 11, "cpu": 8, "craft": 12, "creat": [1, 6, 11, 12, 15], "created_at": 12, "creation": 15, "creativ": 12, "crisp": 12, "critic": 2, "ctrl": [6, 12, 15], "cu121": 10, "cuda": [1, 2, 6, 10, 12, 14, 15, 16], "cuda_graph_max_b": [6, 12, 15], "cuda_visible_devic": 16, "curl": [1, 14, 15, 16], "curl_id": 6, "curl_text": 6, "currenli": [1, 8], "current": 12, "custom": [1, 15], "custom_id": 12, "d": [0, 1, 2, 6, 7, 10, 12, 15], "dark": 12, "data": [1, 6, 8, 12, 14, 15], "dataclass": 14, "dataset": [2, 14], "dbrx": 1, "dc9d06d886151707f97d0b78095df9de262fd3c9": 14, "dd4a2fc580ea": 12, "deactiv": 10, "deadlock": 1, "death": 7, "deb": 2, "deceas": 7, "decod": [8, 9, 12, 
14, 15], "decode_unicod": 14, "decor": 7, "decreas": 8, "deep": 12, "deepseek": [1, 9], "def": [1, 3, 7], "default": [1, 3, 5, 8, 10, 14, 17], "defin": [5, 7], "del_respons": 12, "delai": 2, "delet": 12, "depend": 10, "deploi": 10, "deploy": [10, 15], "describ": [3, 14], "descript": [2, 14], "design": [9, 15], "desir": 14, "detail": [12, 14], "detailed_tip": 7, "determin": 3, "detoken": 14, "dev": [1, 16], "devel": 16, "develop": [2, 12, 15], "devic": [1, 6, 10, 12, 15, 16], "devtool": 2, "dict": 14, "diet": 7, "differ": 11, "difficult": 14, "digit": 15, "directli": 1, "directori": 11, "disabl": [1, 2, 14, 17], "disable_cuda_graph": [6, 12, 15], "disable_cuda_graph_pad": [6, 12, 15], "disable_custom_all_reduc": [6, 12, 15], "disable_disk_cach": [6, 12, 15], "disable_flashinf": [6, 12, 15], "disable_flashinfer_sampl": [6, 12, 15], "disable_mla": [6, 12, 15], "disable_nan_detect": [6, 12, 15], "disable_pen": [6, 12, 15], "disable_radix_cach": [6, 12, 15], "disable_regex_jump_forward": [6, 12, 15], "discov": 12, "discuss": 15, "dislik": 14, "dist_init_addr": [6, 12, 15], "distrib_releas": 2, "distribut": [6, 12, 15], "divers": 12, "dn": 7, "do": [2, 8, 12, 14, 16], "doc": [2, 3, 10, 14], "doc_site_path": 0, "dockerfil": 10, "document": [5, 10], "doe": [1, 2, 8], "donald": 3, "done": [14, 16], "down": [3, 6, 12, 15], "download": [2, 14], "dp": 1, "dp_size": [6, 12, 15], "dpkg": 2, "drawback": 14, "dri": 16, "ds_channel_config_path": [6, 12, 15], "ds_heavy_channel_num": [6, 12, 15], "ds_heavy_channel_typ": [6, 12, 15], "ds_heavy_token_num": [6, 12, 15], "ds_sparse_decode_threshold": [6, 12, 15], "dtype": [1, 6, 12, 15], "duck": 3, "due": [3, 8, 17], "dummi": 2, "dump": [6, 12], "durat": [2, 14], "dure": [1, 8, 12, 14, 15], "dusti": 12, "dynam": [1, 2], "e": [2, 10, 11, 12, 16], "e2": 14, "e5": [1, 6, 9], "each": 1, "earli": 8, "earlier": 3, "easi": [9, 11, 17], "easier": 7, "eater": 7, "echo": [2, 16], "edit": 16, "educ": 15, "effici": [1, 9], "either": 14, "element": 12, "eleutherai": 3, "elif": 7, "els": 12, "embed": [1, 9, 12], "embedding_model": 12, "embedding_process": 6, "empti": 1, "enabl": [1, 7, 8, 10, 15], "enable_cache_report": [6, 12, 15], "enable_double_spars": [6, 12, 15], "enable_mixed_chunk": [6, 12, 15], "enable_overlap_schedul": [6, 12, 15], "enable_p2p_check": [6, 12, 15], "enable_torch_compil": [6, 12, 15], "encod": [6, 14], "encount": 10, "encourag": [12, 14], "end": [6, 7, 11, 12, 14, 15], "endpoint": [1, 10, 12, 14], "engag": 15, "engin": 7, "enough": [1, 8], "entir": 15, "entryclass": 11, "enumer": 7, "env": 10, "environ": [1, 6, 16], "eo": [8, 14], "equival": [6, 15], "error": [1, 8, 12], "especi": 8, "etc": [2, 9], "eth0": 1, "even": [3, 12, 15], "everi": 14, "exampl": [1, 3, 6, 11, 12, 15, 16], "example_imag": 14, "exaon": 1, "except": 12, "excl": 14, "exec": 2, "execut": [10, 15], "exercis": 7, "exist": 11, "expand": 7, "experiment": 8, "explor": 12, "export": [0, 1, 7, 16], "express": [7, 14], "extend": 3, "extens": [9, 11], "extern": [7, 9], "extra": 14, "f": [1, 6, 7, 12], "fa4ddf26": 12, "face": [1, 5], "fail": [3, 12], "failur": 10, "fals": [6, 12, 14, 15], "far": 14, "fast": 9, "faster": 9, "favor": 8, "fcf": 8, "featur": [1, 9], "feel": [12, 15], "fetch": 2, "file": [0, 2, 4, 11, 12, 14], "file_respons": 12, "file_storage_pth": [6, 12, 15], "fill": [7, 12, 15], "fillmor": 3, "final": 12, "find": [7, 11, 14], "finish": [6, 12, 15], "finish_reason": [12, 15], "fire": [6, 12, 15], "first": [1, 2, 6, 7, 8, 14], "fix": 17, "flashinf": [6, 9, 10, 12, 15], "flexibl": 
9, "float": 14, "float16": 6, "floral": 12, "flow": 9, "fluenci": 12, "flush": [7, 14], "focus": 12, "folder": [2, 4, 16], "follow": [1, 2, 5, 6, 7, 8, 11, 14, 16], "forev": 16, "fork": [2, 7], "format": [2, 6, 7, 12, 14, 15], "forward": [9, 11], "forward_batch": 11, "found": 7, "fp16": 1, "fp8": [1, 8, 9], "fp8_e5m2": 1, "fraction": [1, 14, 17], "framework": 9, "franc": [1, 3, 7, 14], "free": 15, "frequency_penalti": [12, 14], "frequent": 8, "from": [4, 5, 6, 7, 12, 15], "from_pretrain": 6, "frontend": [5, 10], "full": [1, 8, 12], "function": [3, 7, 11], "function_cal": [12, 15], "further": 10, "futur": [1, 11], "g": [2, 10, 11, 16], "gaze": 12, "gb": [6, 12, 15], "gemini": 7, "gemma": [1, 9], "gen": [3, 7, 8, 12, 15], "gener": [0, 1, 9, 12, 14, 15], "generatereqinput": 14, "get": [6, 10, 11, 12, 14, 15], "get_model_info": [6, 12, 15], "git": [10, 16], "github": [0, 10, 14], "give": [11, 16], "given": 14, "glimps": 14, "gloo_socket_ifnam": 1, "gnupg": 2, "good": 8, "googl": 7, "gpt": 7, "gptq": [6, 9, 12, 15], "gpu": [1, 8, 10, 14, 16], "grammar_backend": [6, 12, 15], "graph": [1, 2, 12, 15, 17], "greedy_token_select": 3, "grok": 1, "group": 16, "gryffindor": 7, "gte": [1, 6], "guid": [9, 10, 14, 15], "h": [1, 6, 15], "h100": [10, 14], "ha": [8, 11], "haisgl": 16, "half": 7, "hand": 8, "handl": [1, 2, 14], "happen": 8, "hardwar": 14, "harri": 7, "hasattr": 12, "have": [0, 1, 3, 8, 12, 14, 15], "healthi": [7, 8], "hello": 1, "help": [1, 7, 8, 11, 12, 14, 15], "henryx": 16, "her": 12, "here": [1, 6, 7, 12, 15], "hf": 5, "hf_home": 16, "hf_token": [10, 16], "hf_xxx": 16, "high": [3, 8, 12, 14, 15], "higher": [12, 14], "highest": [3, 7], "highlight": 15, "highlight_text": [6, 12, 15], "historian": 12, "hit": [6, 12, 14, 15], "host": [6, 10, 12, 15], "hostnam": 1, "hous": 7, "how": [1, 3, 4, 7, 9], "howev": 15, "html": [0, 2], "http": [0, 2, 6, 7, 10, 12, 13, 14, 15, 16], "hub": 10, "hufflepuff": 7, "hug": [1, 5], "huggingfac": [10, 11, 16], "human": 15, "hyperparamet": [1, 9], "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17], "id": [12, 14, 15], "idea": 15, "ident": 11, "ignor": 14, "ignore_eo": 14, "im_end": [5, 14], "im_start": [5, 14], "imag": [7, 10, 14], "image_data": 14, "image_fil": 7, "image_id": 10, "image_qa": 7, "implement": [3, 11, 12], "import": [1, 2, 6, 7, 8, 12, 14, 15], "includ": [7, 9, 12, 15], "incorrect": 3, "increas": 8, "incur": 3, "independ": 10, "index": [2, 12, 15], "indic": 8, "indigo": 12, "industri": [9, 15], "inf": 14, "infer": [1, 14], "info": [6, 12, 15], "inform": [7, 14, 15], "infra": 10, "init": [1, 6, 12, 15], "initi": [3, 12, 15], "input": [1, 2, 7, 9, 12, 14], "input_file_id": 12, "input_file_path": 12, "input_id": [6, 14], "input_ids_embed": 6, "insid": 16, "instal": [0, 2, 4, 6, 9, 12, 13, 15, 16], "installationguid": 2, "instanc": 3, "instead": [1, 17], "instruct": [1, 2, 6, 7, 10, 12, 14, 15], "int": 14, "int4": 9, "int4wo": 1, "integr": 9, "intellig": [12, 15], "inter": 14, "interact": 9, "interfac": [9, 11], "internet": 15, "internlm": 1, "interpret": 12, "intfloat": 6, "intuit": 9, "invok": 7, "io": 0, "ip": [1, 7], "ipc": 10, "ipynb": 12, "is_embed": [6, 12, 15], "issu": [7, 10, 17], "itali": 12, "iter_lin": 14, "itl": 14, "its": 3, "iv": 12, "japan": [7, 12, 15], "job": 12, "joke": 12, "json": [1, 2, 5, 6, 12, 14, 15], "json_decod": 7, "json_model_override_arg": [6, 12, 15], "json_output": 7, "json_schema": 14, "jsonl": 12, "jump": 9, "just": [5, 12], "k": 14, "k8": 10, "keep": 12, "kei": [2, 7, 12, 15], "kernel": [9, 10, 17], "kfd": 16, 
"kingdom": 7, "knowledg": 12, "kv": [1, 8], "kv_cache_dtyp": [6, 12, 15], "l": 14, "l4": 10, "l40": 10, "lab": [1, 14], "label": 16, "land": 12, "landscap": 12, "lang": 14, "languag": [5, 9, 10, 12, 15], "larg": [1, 2, 8, 9, 15], "last": 10, "late": 12, "later": [3, 16], "latest": 10, "lauch_sglang_serv": [6, 12, 15], "launch": [1, 2, 5, 7, 9, 10, 14, 17], "launch_serv": [1, 2, 5, 6, 7, 10, 12, 14, 15], "layer": 11, "layer_id": 11, "learn": [1, 4, 11, 15], "least": 14, "len": [1, 2, 14], "length": [7, 12, 14], "less": 12, "let": 1, "level": [12, 14], "librari": 7, "life": 12, "light": 12, "like": [8, 15], "limit": 3, "line": [12, 15], "lint": 4, "linux": 16, "list": [1, 2, 7, 11, 12, 14, 15, 17], "llama": [2, 5, 7, 9, 10, 11, 12, 14, 15], "llama3": 1, "llamaforcausallm": [12, 15], "llava": [1, 9, 14], "llava_llama_3": 1, "llm": [1, 3, 9, 15], "lm_eval": [6, 12, 15], "lmm": [1, 14], "lmsysorg": 10, "load": [1, 2, 6, 8, 12, 14, 15], "load_balance_method": [6, 12, 15], "load_format": [6, 12, 15], "load_imag": 14, "local": 10, "local_example_llava_next": 7, "localhost": [0, 1, 6, 7, 12, 14, 15], "locat": 14, "log": [7, 8], "log_level": [6, 12, 15], "log_level_http": [6, 12, 15], "log_request": [6, 12, 15], "logic": 14, "logit": [7, 11, 14], "logitsprocessor": 11, "logprob": [3, 12, 14, 15], "logprob_start_len": 14, "london": 3, "long": [1, 12, 15], "longer": [3, 12], "longest": 8, "look": [5, 8], "loop": 7, "lora_path": [6, 12, 15], "low": 14, "lower": [8, 12], "lpm": [6, 8, 12, 15], "lsb": 2, "lt": [6, 12, 15], "lung": 12, "lyra": 12, "m": [0, 1, 2, 5, 6, 7, 10, 11, 12, 14, 15], "machin": 10, "magic": 7, "mai": [1, 2, 6, 7, 12, 15, 17], "main": [1, 14, 15], "maintain": 11, "major": [11, 12], "make": [0, 8, 9, 11, 12, 14], "manag": 7, "mani": [3, 8, 11], "manner": 14, "mask": 7, "massiv": 15, "match": 8, "matched_stop": [12, 15], "math": 7, "max": 14, "max_check": 12, "max_loras_per_batch": [6, 12, 15], "max_new_token": [1, 8, 14], "max_prefill_token": [6, 12, 15], "max_running_request": [6, 12, 15], "max_token": [1, 7, 12, 15], "max_total_num_token": [6, 12, 15], "max_total_token": [6, 12, 15], "maximum": 14, "md": 4, "me": 12, "mean": [8, 14, 15], "meanwhil": 5, "measur": 14, "media": 15, "median": 14, "meet": 1, "mem": [1, 6, 12, 14, 15, 17], "mem_fraction_stat": [6, 12, 15], "memori": [1, 2, 6, 12, 15], "messag": [1, 7, 12, 15], "meta": [1, 2, 5, 7, 10, 12, 14, 15], "method": [9, 12], "mild": 12, "millard": 3, "min_new_token": 14, "min_p": 14, "minicpm": 1, "ministri": 7, "minut": [12, 15], "mislead": 3, "miss": 5, "mission": 12, "mistral": [1, 6, 9], "mix": [12, 14], "mixtral": 1, "modal": [1, 9], "mode": 12, "model": [2, 3, 5, 8, 9, 10, 12, 14, 15, 16], "model_path": [1, 6, 12, 15], "moder": 12, "moe": 1, "monitor": 12, "more": [1, 9, 10, 12, 14], "most": [5, 8, 11], "mount": 16, "muggl": 7, "mulit": 7, "multi": [1, 9], "multi_turn_quest": 7, "multipl": [1, 12], "multipli": 14, "must": 14, "my": 1, "my_model": 5, "my_model_templ": 5, "myself": 15, "n": [7, 12, 14, 15], "n1": [12, 15], "n2": [12, 15], "n3": [12, 15], "n4": 15, "name": [1, 2, 3, 5, 7, 14, 16], "natur": 12, "nbecaus": 12, "nccl": 1, "ndescrib": 14, "need": [2, 5, 7, 10, 11, 15, 16], "nemo": 1, "nest": 7, "new": [1, 6, 8, 9, 12, 13, 15, 16], "new_token_ratio": 8, "newli": 12, "next": [1, 6, 12, 15], "ngener": 1, "nif": 15, "nlarg": 15, "nlist": 12, "nlp": [1, 6], "nlyra": 12, "nnode": [1, 6, 12, 15], "node": [1, 2], "node_rank": [6, 12, 15], "non": 7, "none": [6, 12, 14, 15], "normal": 7, "note": [1, 2, 5, 11, 14, 16], 
"novel": 12, "now": 7, "npython": 12, "nsome": 15, "nsy": 2, "nuanc": 15, "null": [10, 15], "num": [1, 2, 14], "num_continuous_decode_step": [6, 12, 15], "number": [8, 14], "nvidia": [2, 14, 16], "nvtx": 2, "nyou": 14, "o": [2, 6, 14, 16], "object": [12, 15], "obtain": 3, "occasion": 8, "occup": 7, "offer": 9, "offici": 5, "offlin": 1, "often": 15, "ok": [6, 12, 15], "okai": 8, "olmo": 1, "omit": 3, "onc": [1, 3, 6, 15], "one": [3, 7, 12, 14, 15], "onevis": [1, 14], "onli": [2, 3, 7, 10, 11, 12, 14], "onlin": [1, 2], "only_run": 11, "onto": 12, "oom": [8, 14], "open": [9, 10, 12], "openai": [3, 5, 9, 10, 14], "openai_api_kei": [7, 16], "oper": 10, "optim": 17, "option": [3, 14], "order": 7, "other": [3, 8, 10, 11, 14, 15], "otherworldli": 12, "out": [1, 2, 7, 10, 12, 17], "outlin": [6, 12, 15], "output": [1, 2, 11, 12, 14], "output_file_id": 12, "ov": [1, 14], "overhead": [8, 14], "overlap": [3, 8], "overrid": 5, "own": [1, 10, 14], "ozon": 12, "p": [10, 14], "p2p": 1, "p99": 14, "page": [9, 17], "paragraph": 7, "parallel": [1, 8, 9, 14], "paramet": [8, 9], "pari": 3, "part": 11, "pass": [4, 7, 11], "path": [0, 1, 2, 3, 5, 6, 7, 10, 12, 14, 15], "patronu": 7, "pattern": 15, "peer": 1, "penal": 14, "penalti": 12, "per": 14, "perform": 3, "phoenix": 7, "phrase": 12, "pip": [0, 2, 13, 16], "pip3": 4, "plan": 10, "planet": 12, "playground": 11, "pleas": [1, 7, 10], "png": 14, "point": 15, "pool": [1, 6, 8, 12, 15], "poorli": 3, "popular": 12, "port": [1, 5, 6, 7, 10, 12, 14, 15], "post": [6, 12, 14, 15], "post2": 10, "post3_vllm0": 16, "potenti": 15, "potter": 7, "pre": 4, "predict": 3, "prefer": 12, "prefil": [1, 2, 6, 9, 11, 12, 15], "prefix": [8, 9], "prerequisit": 2, "presence_penalti": [12, 14], "presid": [1, 3], "press": [6, 12, 15], "prev": 14, "primit": [3, 7], "print": [1, 2, 7, 12, 14], "probabl": 7, "proceed": [6, 12, 15], "process": [6, 12, 14, 15], "profil": 9, "program": [9, 10, 12], "programm": 12, "progress_bar": 7, "project": [0, 5, 10, 13, 14, 16], "prompt": [1, 2, 7, 9, 12, 14], "prompt_token": [12, 15], "prompt_tokens_detail": [12, 15], "proper": 10, "provid": [1, 2, 7, 9, 10, 12, 15], "pub": 2, "pull": 16, "pure": 7, "purpos": 12, "py": [0, 1, 2, 5, 6, 7, 11, 12, 13, 14, 15], "pydant": 7, "pyproject": 13, "python": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15], "python3": [0, 1, 2, 10, 11, 14, 16], "pytorch": [10, 17], "q": 7, "qk": [6, 12, 15], "quantiz": [1, 6, 9, 12, 15], "queri": 1, "question": [7, 15], "question_1": 7, "question_2": 7, "queue": [6, 8, 12, 15], "quick": [2, 9], "quick_start": 7, "quit": [6, 12, 15], "qwen": [1, 9, 11], "qwen2": [1, 6, 11, 14], "qwen2forcausallm": 6, "r": [0, 1, 7], "radix": 2, "radixattent": [9, 11], "rais": [12, 15], "ran": 14, "random": [2, 14], "random_se": [6, 12, 15], "rang": [8, 9, 12], "rank": 1, "rate": [6, 12, 14, 15], "ravenclaw": 7, "raw": 14, "rb": 12, "reach": 14, "read": 12, "reader": 12, "readi": [6, 12, 15], "readm": 4, "readme_exampl": 7, "real": [1, 2], "reason": 12, "recommend": [2, 10, 12, 14, 15], "recoveri": 10, "reduc": [1, 8, 12], "refer": [1, 11, 12], "reference_hf": 11, "refus": [12, 15], "regex": [7, 14], "regist": 5, "regular": [7, 14], "regular_expression_gen": 7, "relat": [5, 10], "relationship": 15, "releas": [2, 10], "relev": 14, "rememb": 6, "remot": 10, "remov": [0, 11], "repeat": 14, "repetit": 12, "repetition_penalti": 14, "replac": [1, 10, 11], "repo": 2, "report": [1, 17], "reproduc": 12, "req": [6, 8, 12, 14, 15], "request": [1, 7, 9, 12, 14], "request_count": 12, "request_id": 12, "requir": 0, "resourc": 
[10, 11], "respons": [1, 3, 6, 12, 14, 15], "restart": 16, "result": [3, 12, 14], "result_cont": 12, "result_file_id": 12, "retoken": 14, "retracted_req": 8, "retriev": 12, "return": 14, "return_logprob": 14, "return_text_in_logprob": 14, "reus": 11, "revolution": 15, "rid": 14, "rm": 16, "rmsnorm": 11, "role": [1, 12, 15], "roll": [6, 12, 15], "rome": 12, "root": 10, "round_robin": [6, 12, 15], "run": [0, 2, 4, 6, 7, 11, 12, 14, 15], "run_batch": 7, "runner_allow_runasroot": 16, "running_request": 14, "runtim": [9, 10], "runtimeendpoint": [3, 7], "safetensor": [6, 12, 15], "same": [1, 2, 6, 7, 11, 14], "sampl": [9, 10, 11, 17], "sampling_backend": [6, 12, 15], "sampling_param": [1, 14], "scale": [10, 14], "scent": 12, "schedule_conserv": [6, 12, 15], "schedule_polici": [6, 12, 15], "schema": [7, 14], "scientif": 12, "script": 11, "search": 7, "second": 12, "secret": 10, "section": [14, 15], "see": [1, 7, 8, 10, 14], "seed": 12, "select": [7, 10], "semant": 15, "send": [1, 8, 9, 12, 14], "send_request": 12, "sensori": 12, "sentenc": 14, "sep": 5, "sep_styl": 5, "seq": [6, 12, 15], "sequenc": 12, "serv": [1, 2, 8, 9, 10, 14], "served_model_nam": [6, 12, 15], "server": [0, 2, 5, 7, 8, 9, 12, 14], "server_arg": [6, 12, 15], "server_process": [12, 15], "serverarg": [6, 12, 15], "servic": [10, 12, 15], "service_ti": [12, 15], "set": [1, 2, 5, 7, 10, 12, 14, 15, 17], "set_default_backend": 7, "sever": [1, 2, 12, 15], "sgl": [0, 1, 3, 7, 10, 13, 14, 16], "sgl0": 16, "sglang": [2, 4, 6, 12, 13, 15, 16], "sglang_is_in_ci": 16, "sglang_storag": [6, 12, 15], "sglang_use_modelscop": 1, "sh": 13, "shade": 12, "shard": [6, 12, 15], "share": [8, 16], "she": 12, "shell": 6, "shm": 16, "short": [12, 14], "shorter": 3, "should": [5, 11], "show": 7, "show_time_cost": [6, 12, 15], "shut": [6, 12, 15], "shutdown": [6, 12, 15], "siluandmul": 11, "similar": [11, 12, 14], "simpl": [7, 12], "simpli": 3, "sinc": [12, 14], "singl": [1, 2, 10, 11, 12, 14], "size": [1, 2, 16], "sk": [7, 16], "skip": 14, "skip_special_token": 14, "skip_tokenizer_init": [6, 12, 15], "sky": [10, 12], "skyserv": 10, "sleep": [12, 16], "slightli": 12, "slytherin": 7, "sm75": 10, "small": [1, 8], "smaller": [1, 17], "smollm": 1, "smooth": 12, "snippet": 2, "so": [1, 2, 14], "social": 15, "societi": 15, "some": [2, 6, 7, 11, 14, 16, 17], "someth": 12, "sometim": 17, "sourc": [2, 9, 15], "space": [12, 14], "spaces_between_special_token": 14, "special": 14, "specif": [1, 10, 11, 15], "specifi": [1, 3, 5, 7, 12, 14, 15, 16], "split": 12, "srt": [9, 10, 14], "stabl": 12, "stablelm": 1, "stai": 7, "stand": [8, 15], "start": [6, 11, 12, 14], "startswith": 14, "startup": [6, 12, 15], "state": [1, 7, 12], "static": [1, 2, 14, 17], "statu": [7, 10, 12], "status_cod": 12, "step": [6, 12, 15], "still": 14, "stop": [7, 8, 12, 14, 15], "stop_str": 5, "stop_token_id": 14, "store": 14, "stori": [12, 15], "str": 14, "strategi": 1, "stream": 1, "stream_interv": [6, 12, 15], "string": [8, 14], "strip": [12, 14], "strong": [3, 12], "strongli": [12, 15], "structur": 9, "student": 7, "subprocess": 6, "subset": 3, "success": 14, "successfulli": 12, "suggest": 8, "summar": 15, "summari": [7, 15], "suppli": [3, 14], "support": [3, 6, 7, 9, 10, 14, 15], "sure": [0, 11, 14], "surfac": 12, "sweetli": 12, "switch": 10, "sxm5": 14, "syntax": 15, "system": [1, 2, 5, 7, 12, 14, 15], "system_fingerprint": [12, 15], "t4": 10, "take": [8, 12, 15], "teacher": 7, "tee": 2, "tell": 12, "temperatur": [1, 7, 12, 14, 15], "templat": [1, 7, 14], "temporarili": 5, "tensor": [1, 9], 
"termin": 10, "terminate_process": [6, 12, 15], "test": [1, 2, 14, 15, 16], "test_generation_model": 11, "test_oth": 11, "test_vision_openai_serv": 1, "testgenerationmodel": 11, "text": [1, 6, 11, 12, 14, 15], "text_complet": 12, "text_embed": 6, "text_it": 7, "text_qa": 7, "thei": [14, 15], "them": [10, 15, 17], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17], "thing": 8, "through": [7, 14], "throughput": [1, 12, 14, 15], "time": [1, 2, 6, 12, 14], "tip": 17, "tip_suggest": 7, "tmp": 16, "todai": 1, "togeth": [1, 8], "tok": 14, "token": [1, 5, 6, 7, 8, 9, 10, 12, 15], "token_id": 14, "token_length_norm": 3, "tokenizer_mod": [6, 12, 15], "tokenizer_path": [6, 12, 15], "tokenizers_parallel": 6, "tokyo": [12, 15], "toml": 13, "too": 8, "took": 12, "tool": 7, "tool_cal": [12, 15], "tool_us": 7, "top": 14, "top_k": 14, "top_logprobs_num": 14, "top_p": [1, 12, 14], "topic": [12, 15], "torch": [1, 6, 8, 12, 15], "torch2": 10, "torch_compile_max_b": [6, 12, 15], "torchao": 1, "torchao_config": [6, 12, 15], "total": [1, 12, 14], "total_token": [12, 15], "tp": 1, "tp0": [6, 12, 15], "tp_size": [6, 12, 15], "tpot": 14, "tr": 2, "trace": 2, "traffic": 14, "train": [2, 15], "transform": [6, 11], "transit": 12, "translat": 15, "triton": 10, "triton_attention_reduce_in_fp32": [6, 12, 15], "troubleshoot": 9, "true": [1, 2, 6, 7, 14, 16], "truncat": [1, 2], "trust_remote_cod": [6, 12, 15], "try": [1, 12, 14, 17], "ttft": 14, "tune": [1, 9, 14], "turbo": 7, "turn": 7, "tutori": 12, "twine": 13, "two": [1, 5, 7, 11, 12], "txt": 0, "type": [1, 6, 12, 15], "u": [3, 15], "ubuntu": 2, "ubuntu1804": 2, "ubuntu22": 16, "unconditional_likelihood_norm": 3, "under": [2, 4, 11], "understand": [11, 15], "union": 14, "unit": [1, 2, 7, 12], "unittest": 11, "until": 14, "up": [6, 10, 12, 15], "updat": [0, 2, 16], "upgrad": 10, "upload": 12, "upload_pypi": 13, "uploaded_fil": 12, "upon": [1, 6], "url": [12, 14], "us": [2, 3, 4, 5, 8, 12, 14, 16], "us_president_exampl": 3, "usabl": [6, 12, 15], "usag": [1, 3, 6, 8, 15], "user": [1, 3, 5, 7, 8, 12, 14, 15], "usual": [12, 14], "utf": [12, 14], "util": [6, 8, 12, 14, 15], "uvicorn": [6, 12, 15], "v": [10, 16], "v0": 10, "v1": [1, 6, 12, 15], "valid": 12, "valu": [1, 8, 14, 17], "valuabl": 11, "variabl": [1, 16], "variant": 2, "variou": [1, 12, 15], "vast": 15, "veri": [8, 11, 12, 14], "verifi": 12, "version": 10, "vertexai": 7, "video": 16, "view": 1, "virtual": 15, "vision": [1, 9], "visit": 0, "vl": 1, "vocab_s": 14, "w": [7, 12], "wa": 12, "wai": [6, 11], "wait": [6, 12, 15], "wait_for_serv": [6, 12, 15], "wand": 7, "want": [1, 14], "warn": 8, "washington": 12, "watchdog_timeout": [6, 12, 15], "we": [1, 12, 14, 15], "web": 12, "weight": [1, 2, 6, 12, 15, 16], "weight_util": [6, 12, 15], "welcom": 5, "well": 11, "were": [12, 14], "what": [3, 7, 12, 15], "when": [5, 7, 8, 12, 14, 17], "where": 3, "whether": 14, "which": [8, 12, 14, 15], "while": [1, 2, 10, 12, 14, 16], "whl": 10, "who": 12, "why": 12, "wide": [9, 12], "within": 7, "without": [2, 10, 12], "wood": 7, "word": [7, 12], "work": [1, 5, 8, 16], "workflow": 7, "workload": 8, "write": [0, 12], "x64": 16, "x86_64": 2, "xvers": 1, "xxx": 16, "xylophia": 12, "y": [2, 16], "yaml": 10, "yi": 1, "yml": 10, "you": [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16], "your": [0, 1, 5, 7, 9, 10, 12, 14, 15], "zip": 1}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Benchmark and Profiling", "Choices Methods in SGLang", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Embedding Model", 
"Frontend: Structured Generation Language (SGLang)", "Guide on Hyperparameter Tuning", "SGLang Documentation", "Install SGLang", "How to Support a New Model", "OpenAI Compatible API", "PyPI Package Release Process", "Sampling Parameters in SGLang Runtime", "Quick Start: Launch A Server and Send Requests", "Set Up Self-hosted Runners for GitHub Action", "Troubleshooting"], "titleterms": {"1": [1, 10, 16], "2": [10, 16], "3": [1, 10, 16], "4": 10, "405b": 1, "5": 10, "A": [6, 15], "The": 17, "With": 10, "access": 17, "achiev": 8, "action": 16, "add": [4, 11, 16], "addit": 1, "advanc": 8, "all": 14, "an": 17, "api": [1, 6, 12, 15], "argument": 1, "avoid": 8, "backend": [1, 9], "baselin": 14, "batch": [7, 12], "benchmark": [1, 2, 14], "build": 0, "chat": [5, 12], "choic": 3, "chunk": 8, "clean": 0, "cloud": 10, "code": [4, 13], "common": 10, "compat": [1, 6, 12, 15], "complet": 12, "compos": 10, "config": 16, "configur": 16, "conserv": 8, "constrain": 7, "contain": 16, "contributor": 4, "control": 7, "correct": 11, "cuda": 17, "curl": 6, "custom": 5, "debug": 11, "decod": 7, "depend": 0, "deploi": 0, "detail": 7, "docker": [10, 16], "document": [0, 9], "dp": 8, "embed": 6, "encount": 17, "engin": 1, "error": 17, "exampl": [7, 14], "featur": 7, "flow": 7, "format": 4, "fraction": 8, "frequenc": 14, "from": [1, 10, 11], "frontend": [7, 9], "gener": 7, "get": 9, "github": [13, 16], "greedi": 3, "guid": [4, 8], "hang": 17, "host": 16, "how": 11, "http": 1, "hyperparamet": 8, "id": 6, "illeg": 17, "implement": 7, "implic": 14, "input": 6, "instal": 10, "interact": 11, "json": 7, "kubernet": 10, "languag": 7, "latenc": 14, "launch": [6, 15], "length": 3, "likelihood": 3, "llama": 1, "local": 7, "make": 13, "max": 8, "mem": 8, "memori": [8, 14, 17], "method": [3, 10], "min": 14, "modal": [7, 14], "model": [1, 6, 7, 11], "modelscop": 1, "more": 7, "multi": [7, 14], "new": [11, 14], "normal": [3, 14], "note": 10, "nsight": 2, "openai": [1, 6, 7, 12, 15], "option": 8, "other": 2, "out": 8, "packag": 13, "parallel": 7, "paramet": [12, 14], "peak": 8, "penalti": 14, "perform": [1, 14], "pip": 10, "polici": 8, "port": 11, "prefil": 8, "presenc": 14, "preview": 0, "process": 13, "profil": 2, "pypi": 13, "quick": [1, 7, 15], "refer": 9, "releas": 13, "repetit": 14, "request": [8, 15], "role": 7, "run": [1, 8, 10, 16], "runner": 16, "runtim": [1, 5, 14], "sampl": 14, "schedul": 8, "select": 3, "self": 16, "send": 15, "serv": 0, "server": [1, 6, 15, 17], "set": 16, "sglang": [0, 1, 3, 5, 7, 9, 10, 11, 14], "sh": 16, "size": 8, "skypilot": 10, "sourc": 10, "speed": 8, "srt": 1, "start": [1, 7, 9, 15, 16], "static": 8, "step": 16, "stream": [7, 14], "structur": 7, "submiss": 8, "suit": 11, "support": [1, 11], "templat": 5, "test": [4, 11], "throughput": 8, "tip": [2, 7], "togeth": 14, "token": [3, 14], "tp": 8, "troubleshoot": 17, "try": 8, "tune": 8, "tutori": 9, "uncondit": 3, "unit": 4, "up": 16, "updat": 13, "upload": 13, "us": [1, 6, 7, 10, 15], "usag": 12, "version": 13, "vllm": 11, "wa": 17, "websit": 0, "without": 1, "your": [4, 8]}}) \ No newline at end of file diff --git a/send_request.html b/send_request.html index 10e8c2a..95f2121 100644 --- a/send_request.html +++ b/send_request.html @@ -33,8 +33,9 @@ + - + @@ -59,7 +60,7 @@ - + @@ -411,14 +412,18 @@

Contents

-
+

Quick Start: Launch A Server and Send Requests#

This section provides a quick start guide to using SGLang after installation.

Launch a server#

This code block is equivalent to executing

python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
---port 30000 --host 0.0.0.0 --log-level warning
+--port 30000 --host 0.0.0.0
 

in your command line and waiting for the server to be ready.
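For readers following along outside the notebook, here is a minimal sketch of what this launch-and-wait step amounts to. It is an illustration, not the notebook's own helper implementation: the use of `subprocess` is an assumption, while polling `/v1/models` as the readiness probe mirrors the `GET /v1/models` request visible in the server logs further down.

```python
# Minimal sketch: start the server as a subprocess and poll until it responds.
# Assumption: /v1/models (seen in the logs below) is used as the readiness probe;
# the notebook's own helpers may work differently.
import subprocess
import time

import requests

server = subprocess.Popen(
    "python -m sglang.launch_server "
    "--model-path meta-llama/Meta-Llama-3.1-8B-Instruct "
    "--port 30000 --host 0.0.0.0",
    shell=True,
)

while True:
    try:
        if requests.get("http://localhost:30000/v1/models", timeout=5).status_code == 200:
            break
    except requests.exceptions.RequestException:
        pass  # server not up yet, keep polling
    time.sleep(1)

print("Server is ready. Proceeding with the next steps.")
```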

@@ -426,29 +431,55 @@

Launch a server
[1]:
 
-
from sglang.utils import execute_shell_command, wait_for_server, terminate_process
+
from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text
 
 
-server_process = execute_shell_command(
+server_process = lauch_sglang_server(
     """
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
---port 30000 --host 0.0.0.0 --log-level warning
+--port 30000 --host 0.0.0.0
 """
 )
 
 wait_for_server("http://localhost:30000")
-print("Server is ready. Proceeding with the next steps.")
+highlight_text("Server is ready. Proceeding with the next steps.")
 
-
+
-Server is ready. Proceeding with the next steps.
+[2024-10-28 09:17:45] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=347192970, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)
+[2024-10-28 09:18:01 TP0] Init torch distributed begin.
+[2024-10-28 09:18:01 TP0] Load weight begin. avail mem=78.59 GB
+[2024-10-28 09:18:02 TP0] lm_eval is not installed, GPTQ may not be usable
+INFO 10-28 09:18:02 weight_utils.py:243] Using model weights format ['*.safetensors']
+Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
+Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.24it/s]
+Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.16it/s]
+Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.17it/s]
+Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.57it/s]
+Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.40it/s]
+
+[2024-10-28 09:18:05 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=63.50 GB
+[2024-10-28 09:18:05 TP0] Memory pool end. avail mem=8.37 GB
+[2024-10-28 09:18:05 TP0] Capture cuda graph begin. This can take up to several minutes.
+[2024-10-28 09:18:12 TP0] max_total_num_tokens=442913, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072
+[2024-10-28 09:18:12] INFO:     Started server process [511197]
+[2024-10-28 09:18:12] INFO:     Waiting for application startup.
+[2024-10-28 09:18:12] INFO:     Application startup complete.
+[2024-10-28 09:18:12] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)
+[2024-10-28 09:18:13] INFO:     127.0.0.1:35516 - "GET /v1/models HTTP/1.1" 200 OK
 
+
+
+
+
+Server is ready. Proceeding with the next steps.
+

Send a Request#

@@ -469,7 +500,21 @@

Send a Request
-{"id":"6ae7fabfd4c54054a8017e2aa7c6bc5a","object":"chat.completion","created":1730071553,"model":"meta-llama/Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and structures of language.\n\nLarge Language Models are typically characterized by their ability to:\n\n1. **Understand natural language**: LLMs can comprehend and interpret human language, including nuances, idioms, and context.\n2. **Generate text**: LLMs can create coherent and context-specific text, such as responses to questions, summaries of articles, or even entire stories.\n3. **Answer questions**: LLMs can provide accurate and informative answers to a wide range of questions, from simple facts to complex topics.\n4. **Translate languages**: LLMs can translate text from one language to another, often with high accuracy.\n5. **Summarize content**: LLMs can condense long pieces of text into shorter, more digestible summaries.\n\nThe core of an LLM is its **neural network architecture**, which is composed of multiple layers of interconnected nodes (neurons) that process and transform the input data. This architecture allows LLMs to learn complex patterns and relationships in language, enabling them to generate human-like text.\n\nSome popular examples of LLMs include:\n\n* **Chatbots**: Virtual assistants that use LLMs to understand and respond to user queries.\n* **Language translation tools**: Services that use LLMs to translate text from one language to another.\n* **Content generation platforms**: Tools that use LLMs to generate text, such as articles, social media posts, or even entire books.\n* **Virtual assistants**: AI-powered assistants, like Siri, Alexa, or Google Assistant, that use LLMs to understand and respond to user queries.\n\nOverall, LLMs have revolutionized the field of natural language processing (NLP) and have numerous applications in various industries, from customer service to content creation."},"logprobs":null,"finish_reason":"stop","matched_stop":128009}],"usage":{"prompt_tokens":47,"total_tokens":450,"completion_tokens":403,"prompt_tokens_details":null}}
+[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:18:13] INFO:     127.0.0.1:35536 - "GET /get_model_info HTTP/1.1" 200 OK
+[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0
+[2024-10-28 09:18:13] INFO:     127.0.0.1:35540 - "POST /generate HTTP/1.1" 200 OK
+[2024-10-28 09:18:13] The server is fired up and ready to roll!
+[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 25.58, #queue-req: 0
+[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 139.75, #queue-req: 0
+[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 138.20, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0
+[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 138.10, #queue-req: 0
+[2024-10-28 09:18:16 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 138.22, #queue-req: 0
+[2024-10-28 09:18:16] INFO:     127.0.0.1:35530 - "POST /v1/chat/completions HTTP/1.1" 200 OK
+{"id":"ad61027db61649d0bd69f6aa901f1d8c","object":"chat.completion","created":1730107096,"model":"meta-llama/Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and nuances of language.\n\nLarge Language Models like myself are trained on a massive corpus of text, often sourced from the internet, books, and other digital sources. This training enables us to:\n\n1. **Understand**: We can comprehend the meaning of text, including context, syntax, and semantics.\n2. **Generate**: We can create coherent and context-specific text, such as responses to questions, articles, or even entire stories.\n3. **Complete**: We can fill in the blanks, summarize long texts, or translate languages.\n\nSome common applications of LLMs include:\n\n1. **Virtual assistants**: Like myself, we can provide information, answer questions, and even engage in conversations.\n2. **Language translation**: We can translate text from one language to another, often with high accuracy.\n3. **Text summarization**: We can condense long texts into concise summaries, highlighting key points and main ideas.\n4. **Content creation**: We can generate text, such as articles, social media posts, or even entire books.\n\nLarge Language Models have the potential to revolutionize various industries, including education, customer service, and content creation. However, they also raise important questions about the role of AI in society, the potential for bias in language models, and the need for responsible AI development and deployment.\n\nIf you have any specific questions or topics you'd like to discuss, feel free to ask!"},"logprobs":null,"finish_reason":"stop","matched_stop":128009}],"usage":{"prompt_tokens":47,"total_tokens":378,"completion_tokens":331,"prompt_tokens_details":null}}
 

@@ -498,19 +543,27 @@

Using OpenAI Compatible API

    temperature=0,
    max_tokens=64,
)
-print(response)
+highlight_text(response)
-
+
-ChatCompletion(id='da93c64364af475cbdd2cb19155fd68d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. **Country:** Japan\n**Capital:** Tokyo\n\n2. **Country:** Australia\n**Capital:** Canberra\n\n3. **Country:** Brazil\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071554, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))
+[2024-10-28 09:18:16 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2024-10-28 09:18:17 TP0] Decode batch. #running-req: 1, #token: 79, token usage: 0.00, gen throughput (token/s): 46.61, #queue-req: 0
+[2024-10-28 09:18:17] INFO:     127.0.0.1:35554 - "POST /v1/chat/completions HTTP/1.1" 200 OK
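For reference, a minimal sketch of the OpenAI-client call behind this output is shown below. The `temperature=0`, `max_tokens=64`, and model name appear in the excerpt above; the `base_url`, the placeholder API key, and the prompt "List 3 countries and their capitals." are assumptions inferred from the local server setup and the reply text.

```python
# Sketch of using the OpenAI-compatible API served at localhost:30000.
# Assumptions: any placeholder API key works against the local server,
# and the prompt is inferred from the reply shown above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response)
```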
 
-

@@ -603,7 +667,7 @@

Using OpenAI Compatible API

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/send_request.ipynb b/send_request.ipynb index ea93b12..ea640a6 100644 --- a/send_request.ipynb +++ b/send_request.ipynb @@ -19,7 +19,7 @@ "\n", "```bash\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "```\n", "\n", "in your command line and wait for the server to be ready." @@ -30,10 +30,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:12.782403Z", - "iopub.status.busy": "2024-10-27T23:25:12.781995Z", - "iopub.status.idle": "2024-10-27T23:25:50.292760Z", - "shell.execute_reply": "2024-10-27T23:25:50.291723Z" + "iopub.execute_input": "2024-10-28T09:17:35.325923Z", + "iopub.status.busy": "2024-10-28T09:17:35.325748Z", + "iopub.status.idle": "2024-10-28T09:18:13.770765Z", + "shell.execute_reply": "2024-10-28T09:18:13.770130Z" } }, "outputs": [ @@ -41,23 +41,127 @@ "name": "stdout", "output_type": "stream", "text": [ - "Server is ready. Proceeding with the next steps.\n" + "[2024-10-28 09:17:45] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=347192970, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:01 TP0] Load weight begin. 
avail mem=78.59 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:02 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 10-28 09:18:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00Server is ready. Proceeding with the next steps." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "from sglang.utils import lauch_sglang_server, wait_for_server, terminate_process, highlight_text\n", "\n", "\n", - "server_process = execute_shell_command(\n", + "server_process = lauch_sglang_server(\n", " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", - "--port 30000 --host 0.0.0.0 --log-level warning\n", + "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", - "print(\"Server is ready. Proceeding with the next steps.\")" + "highlight_text(\"Server is ready. Proceeding with the next steps.\")" ] }, { @@ -74,10 +178,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:50.328286Z", - "iopub.status.busy": "2024-10-27T23:25:50.327797Z", - "iopub.status.idle": "2024-10-27T23:25:53.479602Z", - "shell.execute_reply": "2024-10-27T23:25:53.478670Z" + "iopub.execute_input": "2024-10-28T09:18:13.772846Z", + "iopub.status.busy": "2024-10-28T09:18:13.772593Z", + "iopub.status.idle": "2024-10-28T09:18:16.416442Z", + "shell.execute_reply": "2024-10-28T09:18:16.415708Z" } }, "outputs": [ @@ -85,7 +189,87 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"id\":\"6ae7fabfd4c54054a8017e2aa7c6bc5a\",\"object\":\"chat.completion\",\"created\":1730071553,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and structures of language.\\n\\nLarge Language Models are typically characterized by their ability to:\\n\\n1. **Understand natural language**: LLMs can comprehend and interpret human language, including nuances, idioms, and context.\\n2. **Generate text**: LLMs can create coherent and context-specific text, such as responses to questions, summaries of articles, or even entire stories.\\n3. **Answer questions**: LLMs can provide accurate and informative answers to a wide range of questions, from simple facts to complex topics.\\n4. **Translate languages**: LLMs can translate text from one language to another, often with high accuracy.\\n5. **Summarize content**: LLMs can condense long pieces of text into shorter, more digestible summaries.\\n\\nThe core of an LLM is its **neural network architecture**, which is composed of multiple layers of interconnected nodes (neurons) that process and transform the input data. 
This architecture allows LLMs to learn complex patterns and relationships in language, enabling them to generate human-like text.\\n\\nSome popular examples of LLMs include:\\n\\n* **Chatbots**: Virtual assistants that use LLMs to understand and respond to user queries.\\n* **Language translation tools**: Services that use LLMs to translate text from one language to another.\\n* **Content generation platforms**: Tools that use LLMs to generate text, such as articles, social media posts, or even entire books.\\n* **Virtual assistants**: AI-powered assistants, like Siri, Alexa, or Google Assistant, that use LLMs to understand and respond to user queries.\\n\\nOverall, LLMs have revolutionized the field of natural language processing (NLP) and have numerous applications in various industries, from customer service to content creation.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":450,\"completion_tokens\":403,\"prompt_tokens_details\":null}}" + "[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35536 - \"GET /get_model_info HTTP/1.1\" 200 OK\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:13] INFO: 127.0.0.1:35540 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-10-28 09:18:13] The server is fired up and ready to roll!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 25.58, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 139.75, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:14 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 138.20, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 137.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:15 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 138.10, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16 TP0] Decode batch. 
#running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 138.22, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:16] INFO: 127.0.0.1:35530 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "{\"id\":\"ad61027db61649d0bd69f6aa901f1d8c\",\"object\":\"chat.completion\",\"created\":1730107096,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and nuances of language.\\n\\nLarge Language Models like myself are trained on a massive corpus of text, often sourced from the internet, books, and other digital sources. This training enables us to:\\n\\n1. **Understand**: We can comprehend the meaning of text, including context, syntax, and semantics.\\n2. **Generate**: We can create coherent and context-specific text, such as responses to questions, articles, or even entire stories.\\n3. **Complete**: We can fill in the blanks, summarize long texts, or translate languages.\\n\\nSome common applications of LLMs include:\\n\\n1. **Virtual assistants**: Like myself, we can provide information, answer questions, and even engage in conversations.\\n2. **Language translation**: We can translate text from one language to another, often with high accuracy.\\n3. **Text summarization**: We can condense long texts into concise summaries, highlighting key points and main ideas.\\n4. **Content creation**: We can generate text, such as articles, social media posts, or even entire books.\\n\\nLarge Language Models have the potential to revolutionize various industries, including education, customer service, and content creation. However, they also raise important questions about the role of AI in society, the potential for bias in language models, and the need for responsible AI development and deployment.\\n\\nIf you have any specific questions or topics you'd like to discuss, feel free to ask!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":378,\"completion_tokens\":331,\"prompt_tokens_details\":null}}" ] } ], @@ -110,10 +294,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:53.481936Z", - "iopub.status.busy": "2024-10-27T23:25:53.481707Z", - "iopub.status.idle": "2024-10-27T23:25:54.273214Z", - "shell.execute_reply": "2024-10-27T23:25:54.272434Z" + "iopub.execute_input": "2024-10-28T09:18:16.418642Z", + "iopub.status.busy": "2024-10-28T09:18:16.418313Z", + "iopub.status.idle": "2024-10-28T09:18:17.213494Z", + "shell.execute_reply": "2024-10-28T09:18:17.212929Z" } }, "outputs": [ @@ -121,8 +305,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "ChatCompletion(id='da93c64364af475cbdd2cb19155fd68d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. 
**Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730071554, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))\n" + "[2024-10-28 09:18:16 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17 TP0] Decode batch. #running-req: 1, #token: 79, token usage: 0.00, gen throughput (token/s): 46.61, #queue-req: 0\n", + "[2024-10-28 09:18:17] INFO: 127.0.0.1:35554 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] + }, + { + "data": { + "text/html": [ + "ChatCompletion(id='29542e83d53f44eea0c01d1f517c4b40', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730107097, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -144,7 +348,7 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "print(response)" + "highlight_text(response)" ] }, { @@ -152,13 +356,30 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-10-27T23:25:54.275385Z", - "iopub.status.busy": "2024-10-27T23:25:54.274807Z", - "iopub.status.idle": "2024-10-27T23:25:57.082401Z", - "shell.execute_reply": "2024-10-27T23:25:57.080829Z" + "iopub.execute_input": "2024-10-28T09:18:17.215264Z", + "iopub.status.busy": "2024-10-28T09:18:17.215073Z", + "iopub.status.idle": "2024-10-28T09:18:20.076158Z", + "shell.execute_reply": "2024-10-28T09:18:20.075276Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Shutting down\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-10-28 09:18:17] INFO: Waiting for application shutdown.\n", + "[2024-10-28 09:18:17] INFO: Application shutdown complete.\n", + "[2024-10-28 09:18:17] INFO: Finished server process [511197]\n" + ] + } + ], "source": [ "terminate_process(server_process)" ] diff --git a/setup_github_runner.html b/setup_github_runner.html index 31a4cad..4ed9f38 100644 --- a/setup_github_runner.html +++ b/setup_github_runner.html @@ -33,7 +33,8 @@ - + + @@ -54,7 +55,7 @@ - + @@ -530,7 +531,7 @@

Step 3: Run the runner by

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.

diff --git a/troubleshooting.html b/troubleshooting.html index e75cf7f..b9a9cea 100644 --- a/troubleshooting.html +++ b/troubleshooting.html @@ -33,7 +33,8 @@ - + + @@ -56,7 +57,7 @@ - + @@ -509,7 +510,7 @@

The server hangs

- Last updated on Oct 27, 2024. + Last updated on Oct 28, 2024.