Learn more
You can find more blogs, slides, and videos about SGLang at sgl-project/sgl-learning-materials.