Learn more
You can find more blogs, slides, and videos about SGLang at sgl-project/sgl-learning-materials.