diff --git a/dev/bench/data.js b/dev/bench/data.js index 782e05e..a1d578f 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,54 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1726370457248, + "lastUpdate": 1726452712435, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "Robert Shaw", - "username": "robertgshaw2-neuralmagic", - "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "ea0fbbd5bd1581d4d71ce3e8c151db7c792d3bdd", - "message": "Upstream sync 2024 08 18 (#59)\n\nSUMMARY:\r\n- Upstream sync from\r\nhttps://github.com/vllm-project/vllm/commit/4cf256ae7f8b0be8f06f6b85821e55d4f5bdaa13\r\n(`v0.5.2`) to\r\nhttps://github.com/vllm-project/vllm/commit/38c4b7e863570a045308af814c72f4504297222e\r\n(`v0.5.3.post1`)\r\n- Comprare\r\nhttps://github.com/neuralmagic/nm-vllm-ent/compare/upstream-sync-2024-08-18..upstream-v0.5.3.post1?expand=1\r\n\r\nTEST PLAN:\r\n- Automation\r\n\r\n---------\r\n\r\nSigned-off-by: kevin \r\nSigned-off-by: Thomas Parnell \r\nSigned-off-by: Muralidhar Andoorveedu \r\nSigned-off-by: Rui Qiao \r\nSigned-off-by: Travis Johnson \r\nCo-authored-by: Woosuk Kwon \r\nCo-authored-by: kevin \r\nCo-authored-by: Mor Zusman \r\nCo-authored-by: Mor Zusman \r\nCo-authored-by: Joe \r\nCo-authored-by: Cyrus Leung \r\nCo-authored-by: sasha0552 \r\nCo-authored-by: Thomas Parnell \r\nCo-authored-by: Peng Guanwen \r\nCo-authored-by: youkaichao \r\nCo-authored-by: Jiaxin Shan \r\nCo-authored-by: Cody Yu \r\nCo-authored-by: Michael Goin \r\nCo-authored-by: Wushi Dong <33078715+wushidonguc@users.noreply.github.com>\r\nCo-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>\r\nCo-authored-by: shangmingc \r\nCo-authored-by: caishangming.csm \r\nCo-authored-by: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>\r\nCo-authored-by: milo157 <43028253+milo157@users.noreply.github.com>\r\nCo-authored-by: Antoni Baum \r\nCo-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>\r\nCo-authored-by: Varun Sundar Rabindranath \r\nCo-authored-by: Varun Sundar Rabindranath \r\nCo-authored-by: Nick Hill \r\nCo-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>\r\nCo-authored-by: Stephanie Wang \r\nCo-authored-by: Cyrus Leung \r\nCo-authored-by: Noam Gat \r\nCo-authored-by: Tyler Michael Smith \r\nCo-authored-by: Simon Mo \r\nCo-authored-by: Murali Andoorveedu \r\nCo-authored-by: Woo-Yeon Lee \r\nCo-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com>\r\nCo-authored-by: Travis Johnson \r\nCo-authored-by: Matt Wong <156021403+mawong-amd@users.noreply.github.com>\r\nCo-authored-by: Roger Wang \r\nCo-authored-by: sroy745 <142070531+sroy745@users.noreply.github.com>\r\nCo-authored-by: Isotr0py <2037008807@qq.com>\r\nCo-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>\r\nCo-authored-by: Jae-Won Chung \r\nCo-authored-by: Cheng Li \r\nCo-authored-by: zhaotyer <89376832+zhaotyer@users.noreply.github.com>\r\nCo-authored-by: tianyi.zhao \r\nCo-authored-by: youkaichao \r\nCo-authored-by: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com>", - "timestamp": "2024-09-04T18:05:15Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/ea0fbbd5bd1581d4d71ce3e8c151db7c792d3bdd" - }, - "date": 1725492577004, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 71.92389405410115, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-04 23:07:48 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 24.259627568782342, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-04 23:07:48 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 217.5625408285608, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-04 23:28:20 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 17.966568827981423, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-04 23:28:20 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" - } - ] - }, { "commit": { "author": { @@ -2302,6 +2256,52 @@ window.BENCHMARK_DATA = { "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-15 02:59:09 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" } ] + }, + { + "commit": { + "author": { + "name": "Derek Kozikowski", + "username": "derekk-nm", + "email": "106621615+derekk-nm@users.noreply.github.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "75459fec6c533165eaad8fc8f028d614e46629d6", + "message": "fix use of code_coverage indicator (#73)\n\n# SUMMARY:\r\nTEST jobs on Nightly runs are failing when code_coverage is false (e.g.\r\n[NIGHTLY / TEST (3.10.12, gcp-k8s-l4-duo,\r\nneuralmagic/tests/test_skip_env_vars/duo-quad-full.txt) / TEST (3.10.12,\r\ngcp-k8s-l4-duo)](https://github.com/neuralmagic/nm-vllm-ent/actions/runs/10822514793/job/30048898792#logs)).\r\nnm-nightly.yml was always passing \"true\" to nn-build-test, and\r\nnm-test.yml was not passing the code_coverage to nm-test-whl, so it\r\nwould never generate the cc-vllm.json output.\r\n\r\nTEST PLAN:\r\nI triggered a NIGHTLY against this branch. Need to watch for the result.", + "timestamp": "2024-09-13T16:08:29Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/75459fec6c533165eaad8fc8f028d614e46629d6" + }, + "date": 1726452711784, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 40.757129975439355, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n \"cuda_device_names\": [\n \"NVIDIA A100-SXM4-80GB\"\n ]\n },\n \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-16 02:10:35 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 14.185489582777091, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n \"cuda_device_names\": [\n \"NVIDIA A100-SXM4-80GB\"\n ]\n },\n \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-16 02:10:35 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 19.436769941045593, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n \"cuda_device_names\": [\n \"NVIDIA A100-SXM4-80GB\"\n ]\n },\n \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-16 02:04:30 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 3.8925990412190035, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n \"cuda_device_names\": [\n \"NVIDIA A100-SXM4-80GB\"\n ]\n },\n \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-16 02:04:30 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" + } + ] } ] }