Update TensorRT-LLM (#1725)
* Update TensorRT-LLM

---------

Co-authored-by: RunningLeon <[email protected]>
Co-authored-by: Tlntin <[email protected]>
Co-authored-by: ZHENG, Zhen <[email protected]>
Co-authored-by: Pham Van Ngoan <[email protected]>
Co-authored-by: Nathan Price <[email protected]>
Co-authored-by: Tushar Goel <[email protected]>
Co-authored-by: Mati <[email protected]>
8 people authored Jun 4, 2024
1 parent f430a4b commit b777bd6
Showing 368 changed files with 21,445 additions and 8,977 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ __pycache__/
*.nsys-rep
.VSCodeCounter
build*/
!builders/
*.egg-info/
.coverage
*.onnx
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -46,5 +46,5 @@ repos:
args:
- --skip=".git,3rdparty"
- --exclude-file=examples/whisper/tokenizer.py
- --ignore-words-list=rouge,inout,atleast,strat,nd
- --ignore-words-list=rouge,inout,atleast,strat,nd,subtile
exclude: 'tests/llm-test-defs/turtle/test_input_files'
3 changes: 3 additions & 0 deletions README.md
@@ -75,3 +75,6 @@ To get started with TensorRT-LLM, visit our documentation:
- [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
- [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
- [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)

## Community
- [Model zoo](https://huggingface.co/TheFloat16) (generated by TRT-LLM rel 0.9 a9356d4b7610330e89c1010f342a9ac644215c52)
14 changes: 6 additions & 8 deletions benchmarks/cpp/README.md
@@ -210,8 +210,10 @@ TP=2
PP=1
MAX_LEN=1024
MAX_BATCH=32
MAX_LORA_RANK=32
NUM_LAYERS=40
MAX_LORA_RANK=64
NUM_LORA_MODS=7
EOS_ID=2
SOURCE_LORA=chinese-llama-2-lora-13b
CPP_LORA=chinese-llama-2-lora-13b-cpp
@@ -234,7 +236,7 @@ ${HOME}/.local/bin/trtllm-build \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--lora_target_modules attn_qkv \
--lora_target_modules attn_q attn_k attn_v attn_dense mlp_h_to_4h mlp_4h_to_h mlp_gate \
--max_lora_rank ${MAX_LORA_RANK}
NUM_LORAS=(8 16 24 32 64 128 256)
@@ -252,8 +254,6 @@ mkdir -p $EG_DIR/data
# Prepare dataset without lora_task_id
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist.json" \
--request-rate -1 \
--time-delay-dist constant \
--tokenizer $TOKENIZER \
token-norm-dist \
--num-requests $NUM_REQUESTS \
@@ -263,8 +263,6 @@ python benchmarks/cpp/prepare_dataset.py \
for nloras in ${NUM_LORAS[@]}; do
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--request-rate -1 \
--time-delay-dist constant \
--rand-task-id 0 $(( $nloras - 1 )) \
--tokenizer $TOKENIZER \
token-norm-dist \
@@ -292,7 +290,7 @@ mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \
# Now run inference with various numbers of LoRAs
# The host cache is set large enough to hold all the LoRAs in lora_dir
# GPU cache is set to hold 32 LoRAs
# GPU cache is set to hold 16 LoRAs
# This benchmark will preload all the LoRAs into the host cache
# We run inference on a range of active LoRAs exercising different cache miss rates.
for nloras in ${NUM_LORAS[@]}; do
@@ -303,7 +301,7 @@ for nloras in ${NUM_LORAS[@]}; do
--type IFB \
--dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--lora_host_cache_bytes 8589934592 \
--lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--lora_num_device_mod_layers $(( 16 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--kv_cache_free_gpu_mem_fraction 0.80 \
--log_level info \
--eos_id ${EOS_ID} \
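The `--lora_num_device_mod_layers` value is simply the number of LoRAs the GPU cache should hold (16 after this change) multiplied by the layer count, the number of LoRA target modules, and the maximum rank. A minimal Python sketch of that arithmetic, using the variable values set at the top of this benchmark script; the helper name is illustrative, not part of the benchmark:

```python
# Sizing of the GPU LoRA cache, mirroring:
#   --lora_num_device_mod_layers $(( 16 * NUM_LAYERS * NUM_LORA_MODS * MAX_LORA_RANK ))
def lora_device_mod_layers(cached_loras: int,
                           num_layers: int = 40,    # NUM_LAYERS
                           num_lora_mods: int = 7,  # NUM_LORA_MODS
                           max_lora_rank: int = 64  # MAX_LORA_RANK
                           ) -> int:
    """Total (layer, module) LoRA slots the device cache must provide."""
    return cached_loras * num_layers * num_lora_mods * max_lora_rank

print(lora_device_mod_layers(16))  # 286720, i.e. room for 16 resident LoRAs
```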
32 changes: 26 additions & 6 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -458,10 +458,6 @@ class Recorder
{
this->recordEnd(requestId, hasError);

if (mRespJsonFile.empty())
return;
int32_t outputSeqLen;

for (auto& tensor : responseTensors)
{
if (tensor.name == inference_request::kOutputIdsTensorName)
@@ -471,7 +467,7 @@ class Recorder
else if (tensor.name == inference_request::kSequenceLengthTensorName)
{
// Tensor of shape nBeams, and we only need the first one
outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
int32_t outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
if (mOutputHasInput)
{
int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
@@ -482,6 +478,30 @@ class Recorder
}
}

void recordEnd(uint64_t requestId, texec::Response const& response)
{

this->recordEnd(requestId, response.hasError());

// Get the actual output length
if (!response.hasError())
{
auto outputTokenIds = response.getResult().outputTokenIds;

int32_t outSeqLen = 0;
for (auto const& beam : outputTokenIds)
{
outSeqLen = std::max(static_cast<int32_t>(beam.size()), outSeqLen);
}
if (mOutputHasInput)
{
int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
outSeqLen -= inputSeqLen;
}
mRequestBenchInfos[requestId].outputLength = outSeqLen;
}
}

float calcPercentile(std::vector<float> const& latencies, int percentile)
{
int const index = static_cast<int>(std::ceil((percentile / 100.0) * latencies.size())) - 1;
@@ -827,7 +847,7 @@ class ExecutorServer
numFinished++;
if (!warmup)
{
mRecorder->recordEnd(reqId, response.hasError());
mRecorder->recordEnd(reqId, response);
}
}
}
3 changes: 3 additions & 0 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -34,6 +34,7 @@
#include <NvInfer.h>
#include <atomic>
#include <chrono>
#include <cuda_profiler_api.h>
#include <cxxopts.hpp>
#include <future>
#include <sstream>
@@ -213,6 +214,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
std::vector<float> latencies;
std::vector<float> generationTimes;
auto generationProfiler = std::make_shared<GptSession::GenerationProfiler>();
cudaProfilerStart();
while (iterIdx < numRuns)
{
auto const start = std::chrono::steady_clock::now();
@@ -242,6 +244,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
break;
}
}
cudaProfilerStop();

TLLM_LOG_INFO(memoryCounter.toString());
done = true;
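The `cudaProfilerStart()`/`cudaProfilerStop()` calls added above restrict profiling to the timed benchmark loop. A rough Python sketch of the same pattern, assuming PyTorch's thin wrappers around the CUDA profiler API and a placeholder workload:

```python
import torch
import torch.cuda.profiler as profiler  # wraps cudaProfilerStart/cudaProfilerStop


def run_one_iteration() -> None:
    """Placeholder benchmark step (hypothetical)."""
    x = torch.randn(1024, 1024, device="cuda")
    _ = x @ x
    torch.cuda.synchronize()


# With a profiler launched in capture-range mode (e.g. Nsight Systems'
# --capture-range=cudaProfilerApi; assumed flag), only this region is recorded.
profiler.start()   # corresponds to cudaProfilerStart()
for _ in range(10):
    run_one_iteration()
profiler.stop()    # corresponds to cudaProfilerStop()
```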
4 changes: 0 additions & 4 deletions benchmarks/python/benchmark.py
@@ -198,10 +198,6 @@ def parse_arguments():
help=
'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
)
parser.add_argument('--strongly_typed',
default=False,
action='store_true',
help='This option will reduce the building time.')
parser.add_argument(
'--gpu_weights_percent',
type=str,
50 changes: 21 additions & 29 deletions benchmarks/python/build.py
@@ -151,10 +151,6 @@ def parse_arguments():
default=False,
action='store_true',
help="Build engines serially")
parser.add_argument('--strongly_typed',
default=False,
action='store_true',
help='This option will reduce the building time.')
parser.add_argument(
'--multiple_profiles',
default=False,
@@ -251,9 +247,6 @@ def build_gpt(args):
if not args.serial_build:
torch.cuda.set_device(runtime_rank)

strongly_typed = args.strongly_typed
if args.quantization is not None and "fp8" in args.quantization:
strongly_typed = True
num_kv_heads = build_config['num_heads'] \
if build_config['num_kv_heads'] is None else build_config['num_kv_heads']
apply_query_key_layer_scaling = False
@@ -321,7 +314,7 @@ def build_gpt(args):
quant_mode=quant_mode,
use_refit=False,
opt_level=build_config['builder_opt'],
strongly_typed=strongly_typed,
strongly_typed=True,
weight_streaming=is_weight_streaming,
**builder_config_extra_kwargs)
engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -363,8 +356,10 @@ def build_gpt(args):
'apply_query_key_layer_scaling':
builder_config.apply_query_key_layer_scaling,
'rotary_pct': build_config['rotary_pct'],
'moe_num_experts': build_config["moe_num_experts"],
'moe_top_k': build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.GPTForCausalLM(config)
@@ -399,7 +394,7 @@ def build_gpt(args):
elif family == "llama":
config = {
'architecture':
'LLaMAForCausalLM',
'LlamaForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
@@ -430,10 +425,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
}
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)
@@ -602,9 +597,6 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)
tensorrt_llm_model = optimize_model(
tensorrt_llm_model,
use_parallel_embedding=config.use_parallel_embedding)
elif family == "falcon":
config = {
'architecture':
@@ -696,7 +688,7 @@ def build_gpt(args):
elif family == "internlm":
config = {
'architecture':
'LLaMAForCausalLM',
'LlamaForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
@@ -778,10 +770,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
'qwen_type':
'qwen',
}
@@ -821,10 +813,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
'qwen_type':
'qwen2',
}
@@ -1029,7 +1021,7 @@ def build_bert(args):
max_batch_size=max_batch_size,
max_input_len=max_input_len,
opt_level=build_config['builder_opt'],
strongly_typed=args.strongly_typed,
strongly_typed=True,
weight_streaming=is_weight_streaming,
)
engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -1207,7 +1199,7 @@ def enc_dec_build_helper(component, config, args):
cross_attention=(component == 'decoder'),
has_position_embedding=has_position_embedding,
has_token_type_embedding=False, # by default
strongly_typed=False, # by default
strongly_typed=True,
gather_all_token_logits=False, # by default
int8=(quant_mode.has_act_and_weight_quant()
or quant_mode.is_int8_weight_only()),
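Several hunks above make the same structural change to the benchmark model configs: the flat `moe_num_experts`/`moe_top_k` keys are replaced by a nested `moe` dictionary before the dict is passed to `PretrainedConfig.from_dict`. A minimal before/after sketch with placeholder values:

```python
# Old flat layout (removed in this commit):
old_style = {
    "architecture": "GPTForCausalLM",
    "moe_num_experts": 8,   # placeholder value
    "moe_top_k": 2,         # placeholder value
}

# New nested layout used by these benchmarks:
new_style = {
    "architecture": "GPTForCausalLM",
    "moe": {
        "num_experts": 8,
        "top_k": 2,
    },
}
```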
19 changes: 16 additions & 3 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -1,4 +1,5 @@
import json
import os
from enum import Enum

import evaluate
@@ -82,9 +83,11 @@ def calculate_toks_per_sample(preds, eos_id):
return avg_len / num_samples


def calculate_rouge_score(preds, targets):
def calculate_rouge_score(preds, targets, rouge_dir=None):
print("Calculating ROUGE scores...")
metric = evaluate.load("rouge")
rouge_dir = rouge_dir if rouge_dir and os.path.exists(
rouge_dir) else "rouge"
metric = evaluate.load(rouge_dir)
preds, targets = postprocess_text(preds, targets[0:len(preds)])
result = metric.compute(predictions=preds,
references=targets,
@@ -114,6 +117,15 @@ def parse_arguments():
parser.add_argument("--base_model",
type=str,
help="Location of the model used (to create tokenizer)")

parser.add_argument(
'--rouge_dir',
default=None,
type=str,
help=
"evaluate.load('rouge') will attempt to pull the rouge package from HF. Using a locally cached copy avoids failures when the host or HF is unreachable."
)

args = parser.parse_args()

return args
@@ -146,7 +158,8 @@ def main():
tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token)

pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
achieved_scores = calculate_rouge_score(pred_texts, target_texts)
achieved_scores = calculate_rouge_score(pred_texts, target_texts,
args.rouge_dir)

achieved_scores['tokens_per_sample'] = tps_score
targets = ACCURACY_TARGETS[model]
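The new `--rouge_dir` option lets `calculate_rouge_score` load the ROUGE metric from a local directory instead of fetching it from the Hugging Face Hub. A small standalone sketch of the same fallback; the local path is a placeholder:

```python
import os
from typing import Optional

import evaluate


def load_rouge(rouge_dir: Optional[str] = None):
    """Use a local copy of the rouge metric when available, else pull it from the Hub."""
    path = rouge_dir if rouge_dir and os.path.exists(rouge_dir) else "rouge"
    return evaluate.load(path)


metric = load_rouge("/data/metrics/rouge")  # placeholder path to a cached metric script
scores = metric.compute(predictions=["hello there"], references=["hello there"])
print(scores["rouge1"])
```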
12 changes: 4 additions & 8 deletions benchmarks/python/gpt_benchmark.py
@@ -279,14 +279,10 @@ def check_memory(self, io_shapes: list, raise_exception=False):
self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
# when MHA is OOTB, it requires extra KV cache size, because OOTB don't support inplace updating KV cache.
if not self.use_gpt_attention_plugin:
if os.getenv('TRTLLM_DISABLE_OOTB_KVCACHE_REUSE') != 'ON':
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)
else:
# without reusing, we need one for past as engine inputs, one for present as engine outputs.
kv_cache_size_in_bytes *= 2
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)

kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
"MiB")
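With the `TRTLLM_DISABLE_OOTB_KVCACHE_REUSE` branch removed, the OOTB (non-plugin) attention path always reserves one extra layer's worth of KV cache in this estimate, since OOTB attention cannot update the cache in place. A standalone sketch of that adjustment; the byte count is a placeholder:

```python
from math import ceil


def ootb_kv_cache_bytes(kv_cache_bytes: float, num_layers: int, pp_size: int) -> float:
    """Scale a KV-cache estimate for OOTB attention: size * (local_n_layer + 1) / local_n_layer."""
    local_n_layer = ceil(num_layers / pp_size)
    return kv_cache_bytes / local_n_layer * (local_n_layer + 1)


# Placeholder numbers: a 4 GiB estimate for a 40-layer model with no pipeline parallelism.
print(ootb_kv_cache_bytes(4 * 1024**3, num_layers=40, pp_size=1))
```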
4 changes: 1 addition & 3 deletions benchmarks/suite/tensorrt_llm_bench/utils/enums.py
@@ -51,9 +51,7 @@ def get_build_options(self, dtype: str) -> List[str]:
List[str]: A list of command line arguments to be added to build
commands.
"""
if self.value == self.FP8:
return ["--strongly_typed"]
else:
if not self.value == self.FP8:
return ["--gemm_plugin", dtype]

