Skip to content

Commit

Permalink
support input payload generation for tensorrtllm engine
Browse files Browse the repository at this point in the history
  • Loading branch information
nv-hwoo committed Jul 25, 2024
1 parent ffcc5e6 commit be921b6
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def create_llm_inputs(

json_in_pa_format = cls._convert_generic_json_to_output_format(
output_format,
tokenizer,
generic_dataset_json,
add_model_name,
add_stream,
Expand Down Expand Up @@ -689,6 +690,7 @@ def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
def _convert_generic_json_to_output_format(
cls,
output_format: OutputFormat,
tokenizer: Tokenizer,
generic_dataset: Dict,
add_model_name: bool,
add_stream: bool,
Expand Down Expand Up @@ -764,6 +766,16 @@ def _convert_generic_json_to_output_format(
model_name,
model_selection_strategy,
)
elif output_format == OutputFormat.TENSORRTLLM_ENGINE:
output_json = cls._convert_generic_json_to_trtllm_engine_format(
generic_dataset,
tokenizer,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
)
else:
raise GenAIPerfException(
f"Output format {output_format} is not currently supported"
Expand Down Expand Up @@ -1011,6 +1023,28 @@ def _convert_generic_json_to_trtllm_format(

return pa_json

@classmethod
def _convert_generic_json_to_trtllm_engine_format(
    cls,
    dataset_json: Dict,
    tokenizer: Tokenizer,
    add_stream: bool,
    extra_inputs: Dict,
    output_tokens_mean: int,
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
) -> Dict:
    """Convert the generic dataset into the TensorRT-LLM engine PA format.

    Thin entry point that forwards every argument unchanged to
    ``_populate_trtllm_engine_output_json`` and returns its result.
    """
    # No pre- or post-processing happens at this level; all of the work is
    # delegated to the populate helper.
    return cls._populate_trtllm_engine_output_json(
        dataset_json,
        tokenizer,
        add_stream,
        extra_inputs,
        output_tokens_mean,
        output_tokens_stddev,
        output_tokens_deterministic,
    )

@classmethod
def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None:
filename = output_dir / DEFAULT_INPUT_DATA_JSON
Expand Down Expand Up @@ -1262,6 +1296,43 @@ def _populate_trtllm_output_json(

return pa_json

@classmethod
def _populate_trtllm_engine_output_json(
    cls,
    dataset_json: Dict,
    tokenizer: Tokenizer,
    add_stream: bool,
    extra_inputs: Dict,
    output_tokens_mean: int,
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
) -> Dict:
    """Build the TensorRT-LLM engine payload from the generic dataset.

    Each entry of ``dataset_json["rows"]`` has its ``"text_input"`` encoded
    with ``tokenizer`` and becomes one entry of ``pa_json["data"]`` carrying
    the token IDs, their count, and the default output length. The optional
    tags (streaming, custom output length, extra inputs) are then applied to
    that same entry.

    Returns:
        The populated PA JSON dictionary.
    """
    pa_json = cls._create_empty_trtllm_pa_json()

    for row_idx, row in enumerate(dataset_json["rows"]):
        input_ids = tokenizer.encode(row["text_input"])
        num_input_ids = len(input_ids)

        request = {
            "input_ids": {
                "content": input_ids,
                "shape": [num_input_ids],
            },
            "input_lengths": [num_input_ids],
            "request_output_len": [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS],
        }
        pa_json["data"].append(request)

        # The helper looks the entry up by position, so the row index must
        # line up with the entry that was just appended.
        pa_json = cls._add_optional_tags_to_trtllm_engine_json(
            pa_json,
            row_idx,
            add_stream,
            extra_inputs,
            output_tokens_mean,
            output_tokens_stddev,
            output_tokens_deterministic,
        )

    return pa_json

@classmethod
def _create_empty_openai_pa_json(cls) -> Dict:
empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT)
Expand Down Expand Up @@ -1478,6 +1549,31 @@ def _add_optional_tags_to_trtllm_json(

return pa_json

@classmethod
def _add_optional_tags_to_trtllm_engine_json(
    cls,
    pa_json: Dict,
    index: int,
    add_stream: bool,
    extra_inputs: Dict,
    output_tokens_mean: int,
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
) -> Dict:
    """Apply the optional request tags to ``pa_json["data"][index]`` in place.

    * ``add_stream`` adds ``"streaming": [True]``.
    * When ``output_tokens_mean`` differs from
      ``cls.DEFAULT_OUTPUT_TOKENS_MEAN``, an output length is drawn from a
      Gaussian with the given mean and stddev, truncated to ``int``, and
      stored as ``"request_output_len"``. If ``output_tokens_deterministic``
      is also set, the same value is stored as ``"min_length"``.
    * Every ``extra_inputs`` item is stored under its key, wrapped in a
      single-element list.

    Returns:
        The same ``pa_json`` object that was passed in.
    """
    request = pa_json["data"][index]

    if add_stream:
        request["streaming"] = [True]

    wants_custom_output_len = output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN
    if wants_custom_output_len:
        # NOTE(review): the sampled value is not clamped, so a large stddev
        # relative to the mean can yield zero or a negative length — confirm
        # whether callers validate this upstream.
        sampled_len = int(random.gauss(output_tokens_mean, output_tokens_stddev))
        request["request_output_len"] = [sampled_len]
        if output_tokens_deterministic:
            request["min_length"] = [sampled_len]

    # Extra inputs are applied last, so they override any tag set above that
    # shares the same key.
    request.update({name: [setting] for name, setting in extra_inputs.items()})

    return pa_json

@classmethod
def _add_required_tags_to_trtllm_json(
cls,
Expand Down
5 changes: 3 additions & 2 deletions src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,11 @@ def _check_conditional_args(
"The --output-tokens-mean option is required when using --output-tokens-mean-deterministic."
)

if args.service_kind != "triton":
if args.service_kind not in ["triton", "tensorrtllm_engine"]:
if args.output_tokens_mean_deterministic:
parser.error(
"The --output-tokens-mean-deterministic option is only supported with the Triton service-kind."
"The --output-tokens-mean-deterministic option is only supported "
"with the Triton and TensorRT-LLM Engine service-kind."
)

_check_conditional_args_embeddings_rankings(parser, args)
Expand Down
7 changes: 7 additions & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
cmd += [f"-{arg}"]
else:
cmd += [f"--{arg}"]

# TODO(TPA-237): GenAI-Perf (GAP) should call Perf Analyzer (PA) with the
# triton_c_api service kind for the tensorrtllm_engine service kind. For now
# it falls back to the triton service kind only to verify that it runs.
elif arg == "service_kind" and value == "tensorrtllm_engine":
cmd += ["--service-kind", "triton"]
args.service_kind = "triton"
else:
if len(arg) == 1:
cmd += [f"-{arg}", f"{value}"]
Expand Down
2 changes: 1 addition & 1 deletion src/c++/perf_analyzer/genai-perf/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys):
"100",
"--output-tokens-mean-deterministic",
],
"The --output-tokens-mean-deterministic option is only supported with the Triton service-kind",
"The --output-tokens-mean-deterministic option is only supported with the Triton and TensorRT-LLM Engine service-kind",
),
(
[
Expand Down
102 changes: 102 additions & 0 deletions src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,107 @@ def test_llm_inputs_with_defaults(self, default_configured_url):
# else:
# assert False, f"Unsupported output format: {output_format}"

@pytest.mark.parametrize(
    "generic_json, add_stream, output_tokens_mean, output_tokens_deterministic, expected_json",
    [
        # Case 1: no streaming, mean of -1, non-deterministic. Only the
        # required tensors are expected, with the default output length.
        (
            {
                "rows": [
                    {"text_input": "test input one"},
                    {"text_input": "test input two"},
                ]
            },
            False,
            -1,
            False,
            {
                "data": [
                    {
                        "input_ids": {"content": [1243, 1881, 697], "shape": [3]},
                        "input_lengths": [3],
                        "request_output_len": [
                            LlmInputs.DEFAULT_TENSORRTLLM_MAX_TOKENS
                        ],
                    },
                    {
                        "input_ids": {"content": [1243, 1881, 1023], "shape": [3]},
                        "input_lengths": [3],
                        "request_output_len": [
                            LlmInputs.DEFAULT_TENSORRTLLM_MAX_TOKENS
                        ],
                    },
                ],
            },
        ),
        # Case 2: streaming on, mean of 999 (stddev is 0 in the test body, so
        # the sampled length is exactly 999), deterministic output length.
        (
            {
                "rows": [
                    {"text_input": "test input one"},
                    {"text_input": "test input two"},
                ]
            },
            True,
            999,
            True,
            {
                "data": [
                    {
                        "input_ids": {"content": [1243, 1881, 697], "shape": [3]},
                        "input_lengths": [3],
                        "request_output_len": [999],
                        "min_length": [999],
                        "streaming": [True],
                    },
                    {
                        "input_ids": {"content": [1243, 1881, 1023], "shape": [3]},
                        "input_lengths": [3],
                        "request_output_len": [999],
                        "min_length": [999],
                        "streaming": [True],
                    },
                ],
            },
        ),
    ],
)
def test_generic_json_to_trtllm_engine_format(
    self,
    generic_json,
    add_stream,
    output_tokens_mean,
    output_tokens_deterministic,
    expected_json,
) -> None:
    """Check generic-JSON conversion for the TENSORRTLLM_ENGINE format.

    Converts each parametrized generic dataset and compares the result with
    the expected payload of token IDs, lengths, and optional tags.
    """
    tokenizer = get_tokenizer(DEFAULT_TOKENIZER)

    actual_json = LlmInputs._convert_generic_json_to_output_format(
        output_format=OutputFormat.TENSORRTLLM_ENGINE,
        tokenizer=tokenizer,
        generic_dataset=generic_json,
        add_model_name=False,
        add_stream=add_stream,
        extra_inputs={},
        output_tokens_mean=output_tokens_mean,
        # A zero stddev keeps the sampled output length equal to the mean.
        output_tokens_stddev=0,
        output_tokens_deterministic=output_tokens_deterministic,
    )

    assert actual_json == expected_json

def test_add_image_inputs_openai_vision(self) -> None:
generic_json = {
"rows": [
Expand Down Expand Up @@ -606,6 +707,7 @@ def test_add_image_inputs_openai_vision(self) -> None:
OutputFormat.OPENAI_VISION,
OutputFormat.VLLM,
OutputFormat.TENSORRTLLM,
OutputFormat.TENSORRTLLM_ENGINE,
],
)
def test_get_input_dataset_from_synthetic(
Expand Down

0 comments on commit be921b6

Please sign in to comment.