From ffcc5e66da473314aae259e96df6add40d67c18f Mon Sep 17 00:00:00 2001 From: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:32:03 -0700 Subject: [PATCH] Add tensorrtllm_engine option to service-kind and update testing (#700) (#762) * Add tensorrtllm_engine option to service-kind and update testing * Add output format check for tensorrtllm_engine Co-authored-by: Elias Bermudez <6505145+debermudez@users.noreply.github.com> --- .../genai_perf/llm_inputs/llm_inputs.py | 1 + .../genai-perf/genai_perf/parser.py | 45 ++++++++++--------- .../genai-perf/tests/test_cli.py | 5 +++ 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 057c33562..e9d35bb37 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -53,6 +53,7 @@ class OutputFormat(Enum): RANKINGS = auto() TENSORRTLLM = auto() VLLM = auto() + TENSORRTLLM_ENGINE = auto() def to_lowercase(self): return self.name.lower() diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 776535d15..c415da9e4 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -177,6 +177,9 @@ def _check_conditional_args( args = _convert_str_to_enum_entry(args, "backend", OutputFormat) args.output_format = args.backend + if args.service_kind == "tensorrtllm_engine": + args.output_format = OutputFormat.TENSORRTLLM_ENGINE + # Output token distribution checks if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN: if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV: @@ -268,6 +271,8 @@ def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace: name += [f"{args.service_kind}-{args.endpoint_type}"] elif args.service_kind == "triton": name += [f"{args.service_kind}-{args.backend.to_lowercase()}"] + elif args.service_kind == "tensorrtllm_engine": + name += [f"{args.service_kind}"] else: raise ValueError(f"Unknown service kind '{args.service_kind}'.") @@ -528,25 +533,6 @@ def _add_profile_args(parser): def _add_endpoint_args(parser): endpoint_group = parser.add_argument_group("Endpoint") - endpoint_group.add_argument( - "-m", - "--model", - nargs="+", - default=[], - help=f"The name of the model(s) to benchmark.", - ) - endpoint_group.add_argument( - "--model-selection-strategy", - type=str, - choices=utils.get_enum_names(ModelSelectionStrategy), - default="round_robin", - required=False, - help=f"When multiple model are specified, this is how a specific model " - "should be assigned to a prompt. round_robin means that ith prompt in the " - "list gets assigned to i mod len(models). random means that assignment is " - "uniformly random", - ) - endpoint_group.add_argument( "--backend", type=str, @@ -576,10 +562,29 @@ def _add_endpoint_args(parser): 'server. This is only used with the "openai" service-kind.', ) + endpoint_group.add_argument( + "-m", + "--model", + nargs="+", + default=[], + help=f"The name of the model(s) to benchmark.", + ) + endpoint_group.add_argument( + "--model-selection-strategy", + type=str, + choices=utils.get_enum_names(ModelSelectionStrategy), + default="round_robin", + required=False, + help=f"When multiple model are specified, this is how a specific model " + "should be assigned to a prompt. round_robin means that ith prompt in the " + "list gets assigned to i mod len(models). random means that assignment is " + "uniformly random", + ) + endpoint_group.add_argument( "--service-kind", type=str, - choices=["triton", "openai"], + choices=["triton", "openai", "tensorrtllm_engine"], default="triton", required=False, help="The kind of service perf_analyzer will " diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py index 2ef5d52ba..d35f4cf11 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py @@ -203,6 +203,10 @@ def test_help_version_arguments_output_and_exit( (["--request-rate", "9.0"], {"request_rate": 9.0}), (["-s", "99.5"], {"stability_percentage": 99.5}), (["--service-kind", "triton"], {"service_kind": "triton"}), + ( + ["--service-kind", "tensorrtllm_engine"], + {"service_kind": "tensorrtllm_engine"}, + ), ( ["--service-kind", "openai", "--endpoint-type", "chat"], {"service_kind": "openai", "endpoint": "v1/chat/completions"}, @@ -654,6 +658,7 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): OutputFormat.TENSORRTLLM, ), (["--service-kind", "triton", "--backend", "vllm"], OutputFormat.VLLM), + (["--service-kind", "tensorrtllm_engine"], OutputFormat.TENSORRTLLM_ENGINE), ], ) def test_inferred_output_format(self, monkeypatch, args, expected_format):