diff --git a/benchmarks/cpp/README.md b/benchmarks/cpp/README.md index 3d61331bd..9639b5eb2 100644 --- a/benchmarks/cpp/README.md +++ b/benchmarks/cpp/README.md @@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \ --output_dir ${LORA_ENGINE} \ --max_batch_size ${MAX_BATCH} \ --max_input_len $MAX_LEN \ - --max_output_len $MAX_LEN \ + --max_seq_len $((2*${MAX_LEN})) \ --gemm_plugin float16 \ --lora_plugin float16 \ --use_paged_context_fmha enable \ diff --git a/benchmarks/cpp/bertBenchmark.cpp b/benchmarks/cpp/bertBenchmark.cpp index 8e50c9244..2b303170d 100644 --- a/benchmarks/cpp/bertBenchmark.cpp +++ b/benchmarks/cpp/bertBenchmark.cpp @@ -17,6 +17,7 @@ #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/rawEngine.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "tensorrt_llm/runtime/tllmRuntime.h" #include "tensorrt_llm/runtime/worldConfig.h" @@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da { auto const worldConfig = WorldConfig::mpi(); auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName); - auto engineBlob = loadEngine(enginePath.string()); for (float gpuWeightsPercent : gpuWeightsPercents) { - auto rt = std::make_shared(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger); + auto rt = std::make_shared(RawEngine(enginePath), logger.get(), gpuWeightsPercent); rt->addContext(0); for (auto inLen : inLens) { diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index 09bc18408..02abc189b 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -150,6 +150,7 @@ struct BenchmarkParams bool streaming{false}; bool enableExpDelays{false}; std::optional requestRate{std::nullopt}; + std::optional maxBatchSize{std::nullopt}; int randomSeed = 430; std::optional maxAttentionWindow{std::nullopt}; @@ -785,6 +786,10 @@ class ExecutorServer executorConfig.setPeftCacheConfig(peftCacheConfig); executorConfig.setBatchingType( modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT); + if (benchmarkParams.maxBatchSize) + { + executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value()); + } mExecutor = std::make_unique(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig); @@ -1339,6 +1344,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType optionalParams.kvCacheConfig.onboardBlocks = benchmarkParams.kvOnboardBlocks; optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent; optionalParams.maxBeamWidth = beamWidth; + optionalParams.maxBatchSize = benchmarkParams.maxBatchSize; optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy}; auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json"); @@ -1628,6 +1634,7 @@ int main(int argc, char* argv[]) options.add_options()("request_rate", "request rate in reqs/sec. 
Skipping this arg or negative value will trigger offline/0-delay.", cxxopts::value()); + options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value()); options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution", cxxopts::value()->default_value("false")); options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival", @@ -1777,6 +1784,12 @@ int main(int argc, char* argv[]) benchmarkParams.requestRate = result["request_rate"].as(); } + // Argument: max batch size + if (result.count("max_batch_size")) + { + benchmarkParams.maxBatchSize = result["max_batch_size"].as(); + } + benchmarkParams.enableExpDelays = result["enable_exp_delays"].as(); // Argument: Enable batch stats output diff --git a/benchmarks/python/allowed_configs.py b/benchmarks/python/allowed_configs.py index 3a0109216..45cf050c5 100644 --- a/benchmarks/python/allowed_configs.py +++ b/benchmarks/python/allowed_configs.py @@ -32,7 +32,7 @@ class BuildConfig: max_batch_size: int max_input_len: Optional[int] = None num_kv_heads: Optional[int] = None - max_output_len: Optional[int] = None + max_seq_len: Optional[int] = None max_beam_width: int = 1 # TRT builder_optimization_level from 0 to 5 builder_opt: Optional[int] = None @@ -89,7 +89,7 @@ class EncDecBuildConfig: normalize_before: Optional[bool] = None max_encoder_input_len: Optional[int] = None max_decoder_input_len: Optional[int] = None - max_output_len: Optional[int] = None + max_seq_len: Optional[int] = None builder_opt: Optional[int] = None n_mels: Optional[int] = None skip_cross_qkv: bool = False @@ -122,7 +122,7 @@ class ModelConfig: n_positions=1024, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "gpt_1.5b": @@ -138,7 +138,7 @@ class ModelConfig: n_positions=1024, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "gpt_175b": @@ -154,7 +154,7 @@ class ModelConfig: n_positions=2048, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "gpt_350m_moe": @@ -170,7 +170,7 @@ class ModelConfig: n_positions=1024, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, moe_num_experts=8, moe_top_k=1, @@ -188,7 +188,7 @@ class ModelConfig: n_positions=1024, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, quantization="int8_sq_per_tensor", )), @@ -205,7 +205,7 @@ class ModelConfig: n_positions=1024, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, quantization="int8_sq_per_token_channel", )), @@ -222,7 +222,7 @@ class ModelConfig: n_positions=1024, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, position_embedding_type='rope_gpt_neox', rotary_pct=0.5, @@ -241,7 +241,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, pre_norm=False, do_layer_norm_before=False, @@ -259,7 +259,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, pre_norm=False, do_layer_norm_before=True, @@ -277,7 +277,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, pre_norm=False, 
do_layer_norm_before=True, @@ -295,7 +295,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, pre_norm=False, do_layer_norm_before=True, @@ -313,7 +313,7 @@ class ModelConfig: n_positions=2048, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, pre_norm=True, do_layer_norm_before=True, @@ -332,7 +332,7 @@ class ModelConfig: n_positions=8192, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "starcoder2_3b": @@ -351,7 +351,7 @@ class ModelConfig: rotary_pct=1.0, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "llama_7b": @@ -368,7 +368,7 @@ class ModelConfig: inter_size=11008, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "llama_13b": @@ -385,7 +385,7 @@ class ModelConfig: inter_size=13824, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "llama_30b": @@ -402,7 +402,7 @@ class ModelConfig: inter_size=17920, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "llama_70b": @@ -420,7 +420,7 @@ class ModelConfig: inter_size=28672, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "llama_70b_long_context": @@ -437,7 +437,7 @@ class ModelConfig: inter_size=28672, max_batch_size=16, max_input_len=8000, - max_output_len=200, + max_seq_len=8200, builder_opt=None, enable_multi_block_mode=True)), "llama_70b_long_generation": @@ -454,7 +454,7 @@ class ModelConfig: inter_size=28672, max_batch_size=64, max_input_len=200, - max_output_len=16384, + max_seq_len=16584, builder_opt=None, enable_multi_block_mode=True)), "llama_70b_sq_per_tensor": @@ -471,7 +471,7 @@ class ModelConfig: inter_size=28672, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, quantization="int8_sq_per_tensor")), "mixtral_8x7b": @@ -489,7 +489,7 @@ class ModelConfig: inter_size=14336, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, moe_num_experts=8, moe_top_k=2, @@ -508,7 +508,7 @@ class ModelConfig: rotary_dim=64, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "gptneox_20b": @@ -525,7 +525,7 @@ class ModelConfig: rotary_dim=24, max_batch_size=16, max_input_len=512, - max_output_len=512, + max_seq_len=1024, builder_opt=None, )), "chatglm_6b": @@ -543,7 +543,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, remove_input_padding=False, )), @@ -562,7 +562,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, remove_input_padding=False, )), @@ -581,7 +581,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, remove_input_padding=False, )), @@ -600,7 +600,7 @@ class ModelConfig: n_positions=1024, max_batch_size=128, max_input_len=1024, - max_output_len=256, + max_seq_len=1280, builder_opt=None, remove_input_padding=False, )), @@ -617,7 +617,7 @@ class ModelConfig: n_positions=2048, max_batch_size=32, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, builder_opt=None, )), 
"bloom_176b": @@ -633,7 +633,7 @@ class ModelConfig: n_positions=2048, max_batch_size=8, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, builder_opt=None, )), "bert_base": @@ -703,7 +703,7 @@ class ModelConfig: n_positions=2048, max_batch_size=256, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, builder_opt=None, bias=True, use_alibi=True, @@ -724,7 +724,7 @@ class ModelConfig: n_positions=2048, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, bias=False, use_alibi=False, @@ -745,7 +745,7 @@ class ModelConfig: n_positions=2048, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, bias=False, use_alibi=False, @@ -766,7 +766,7 @@ class ModelConfig: n_positions=2048, max_batch_size=8, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, builder_opt=None, bias=False, use_alibi=False, @@ -791,7 +791,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "t5_base": @@ -812,7 +812,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "t5_large": @@ -833,7 +833,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "t5_3b": @@ -854,7 +854,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "t5_11b": @@ -875,7 +875,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "flan_t5_small": @@ -897,7 +897,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "flan_t5_base": @@ -919,7 +919,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "flan_t5_large": @@ -941,7 +941,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "flan_t5_xl": @@ -963,7 +963,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "flan_t5_xxl": @@ -985,7 +985,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "bart_large_cnn": @@ -1008,7 +1008,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "mbart_large_50_many_to_one_mmt": @@ -1030,7 +1030,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1024, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "baichuan_7b": @@ -1047,7 +1047,7 @@ class ModelConfig: inter_size=11008, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "baichuan2_7b_chat": @@ -1064,7 +1064,7 @@ class ModelConfig: inter_size=11008, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), 
"baichuan_13b_chat": @@ -1081,7 +1081,7 @@ class ModelConfig: inter_size=13696, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "baichuan2_13b_chat": @@ -1098,7 +1098,7 @@ class ModelConfig: inter_size=13696, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "internlm_chat_7b": @@ -1116,7 +1116,7 @@ class ModelConfig: inter_size=11008, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, bias=True, )), @@ -1135,7 +1135,7 @@ class ModelConfig: inter_size=13824, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, bias=False, )), @@ -1152,7 +1152,7 @@ class ModelConfig: inter_size=22016, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, bias=False)), "qwen_14b_chat": @@ -1169,7 +1169,7 @@ class ModelConfig: inter_size=27392, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "qwen1.5_7b_chat": @@ -1185,7 +1185,7 @@ class ModelConfig: inter_size=11008, max_batch_size=128, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, bias=False)), "qwen1.5_14b_chat": @@ -1202,7 +1202,7 @@ class ModelConfig: inter_size=13696, max_batch_size=64, max_input_len=512, - max_output_len=200, + max_seq_len=712, builder_opt=None, )), "mamba_2.8b": @@ -1218,7 +1218,7 @@ class ModelConfig: n_positions=8192, max_batch_size=64, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, state_size=16, conv_kernel=4, rnn_hidden_size=5120, @@ -1238,7 +1238,7 @@ class ModelConfig: n_positions=8192, max_batch_size=64, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, state_size=16, conv_kernel=4, rnn_hidden_size=4096, @@ -1258,7 +1258,7 @@ class ModelConfig: n_positions=8192, max_batch_size=64, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, state_size=16, conv_kernel=4, rnn_hidden_size=3072, @@ -1278,7 +1278,7 @@ class ModelConfig: n_positions=8192, max_batch_size=64, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, state_size=16, conv_kernel=4, rnn_hidden_size=2048, @@ -1298,7 +1298,7 @@ class ModelConfig: n_positions=8192, max_batch_size=64, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, state_size=16, conv_kernel=4, rnn_hidden_size=1536, @@ -1323,7 +1323,7 @@ class ModelConfig: max_batch_size=8, max_encoder_input_len=1500, max_decoder_input_len=1, - max_output_len=200, + max_seq_len=201, builder_opt=None, )), "recurrentgemma_2b": @@ -1341,7 +1341,7 @@ class ModelConfig: n_positions=8192, max_batch_size=64, max_input_len=1024, - max_output_len=1024, + max_seq_len=2048, position_embedding_type='rope_gpt_neox', rotary_pct=0.5, conv_kernel=4, diff --git a/benchmarks/python/benchmark.py b/benchmarks/python/benchmark.py index c081cb140..75e52aa5e 100644 --- a/benchmarks/python/benchmark.py +++ b/benchmarks/python/benchmark.py @@ -184,6 +184,15 @@ def parse_arguments(): help= ('If this option is specified, it will override the max output len of ' 'TRT engines to the specified value instead of using pre-defined one')) + parser.add_argument( + '--max_seq_len', + '--max_decoder_seq_len', + dest='max_seq_len', + type=int, + default=None, + help= + ('If this option is specified, it will override the max sequence len of ' + 'TRT engines to the specified value instead of using pre-defined one')) parser.add_argument( '--max_batch_size', type=int, @@ -351,6 
+360,21 @@ def main(args): rank = tensorrt_llm.mpi_rank() world_size = tensorrt_llm.mpi_world_size() + if args.max_output_len: + logger.warning( + '--max_output_len has been deprecated in favor of --max_seq_len') + if args.max_input_len: + if args.max_seq_len: + logger.warning( + '--max_seq_len has been overwritten due to --max_output_len being specified' + ) + args.max_seq_len = args.max_input_len + args.max_output_len + else: + raise Exception( + f"--max_output_len is specified but not --max_input_len") + + del args.max_output_len + # TODO: Re-enable memory monitor for multi-gpu benchmarks. # Current Mem Monitor will cause benchmark script hang # because MPI does not work well with multiprocessing. diff --git a/benchmarks/python/build.py b/benchmarks/python/build.py index 9168a9742..07447aaa8 100644 --- a/benchmarks/python/build.py +++ b/benchmarks/python/build.py @@ -136,6 +136,15 @@ def parse_arguments(): help= ('If this option is specified, it will override the max output len of ' 'TRT engines to the specified value instead of using pre-defined one')) + parser.add_argument( + '--max_seq_len', + '--max_decoder_seq_len', + dest='max_seq_len', + type=int, + default=None, + help= + ('If this option is specified, it will override the max sequence len of ' + 'TRT engines to the specified value instead of using pre-defined one')) parser.add_argument( '--max_batch_size', type=int, @@ -254,8 +263,24 @@ def build_gpt(args): if args.max_batch_size is None else args.max_batch_size max_input_len = build_config['max_input_len'] \ if args.max_input_len is None else args.max_input_len - max_output_len = build_config['max_output_len'] \ - if args.max_output_len is None else args.max_output_len + + if args.max_output_len: + logger.warning( + '--max_output_len has been deprecated in favor of --max_seq_len') + if args.max_input_len: + if args.max_seq_len: + logger.warning( + '--max_seq_len has been overwritten due to --max_output_len being specified' + ) + args.max_seq_len = args.max_input_len + args.max_output_len + else: + raise Exception( + f"max_output_len is specified but not max_input_len") + + del args.max_output_len + + max_seq_len = build_config['max_seq_len'] \ + if args.max_seq_len is None else args.max_seq_len max_beam_width = build_config['max_beam_width'] \ if args.max_beam_width is None else args.max_beam_width @@ -308,7 +333,7 @@ def build_gpt(args): max_batch_size=max_batch_size, max_beam_width=max_beam_width, max_input_len=max_input_len, - max_output_len=max_output_len, + max_seq_len=max_seq_len, max_num_tokens=max_num_tokens, int8=(quant_mode.has_act_and_weight_quant() or quant_mode.is_int8_weight_only()), @@ -675,7 +700,6 @@ def build_gpt(args): config['quantization'].update({ 'has_zero_point': False, 'pre_quant_scale': True, - 'exclude_modules': [], }) config = PretrainedConfig.from_dict(config) tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM(config) @@ -759,7 +783,6 @@ def build_gpt(args): "group_size": 128, "has_zero_point": False, "pre_quant_scale": True, - "exclude_modules": [], }) elif 'gptq' in args.quantization: config['quantization'].update({ @@ -968,14 +991,14 @@ def build_gpt(args): # Forward print( - f"max_batch_size: {max_batch_size}, max_input_len: {max_input_len}, max_output_len: {max_output_len}, max_beam_width: {max_beam_width}" + f"max_batch_size: {max_batch_size}, max_input_len: {max_input_len}, max_seq_len: {max_seq_len}, max_beam_width: {max_beam_width}" ) # NOTE: all other models use PretrainedModel.prepare_inputs(...) 
# except RecurrentGemmaForCausalLM and MambaForCausalLM inputs = tensorrt_llm_model.prepare_inputs( max_batch_size=max_batch_size, max_input_len=max_input_len, - max_seq_len=max_input_len + max_output_len, + max_seq_len=max_seq_len, max_num_tokens=max_num_tokens, use_cache=True, max_beam_width=max_beam_width, @@ -1231,7 +1254,7 @@ def enc_dec_build_helper(component, config, args): max_batch_size=config['max_batch_size'], max_beam_width=config['max_beam_width'], max_decoder_input_len=config['max_decoder_input_len'], - max_output_len=config['max_output_len'], + max_seq_len=config['max_seq_len'], max_encoder_input_len=config['max_encoder_input_len'], opt_level=config['builder_opt'], cross_attention=(component == 'decoder'), @@ -1473,7 +1496,7 @@ def enc_dec_build_helper(component, config, args): max_batch_size=config['max_batch_size'], max_beam_width=config['max_beam_width'], max_decoder_input_len=config['max_decoder_input_len'], - max_seq_len=config['max_output_len'], + max_seq_len=config['max_seq_len'], max_encoder_input_len=1500, # n_audio_ctx ) tllm_model(**inputs) @@ -1482,7 +1505,7 @@ def enc_dec_build_helper(component, config, args): max_batch_size=config['max_batch_size'], max_beam_width=config['max_beam_width'], max_decoder_input_len=config['max_decoder_input_len'], - max_seq_len=config['max_output_len'], + max_seq_len=config['max_seq_len'], max_encoder_input_len=config['max_encoder_input_len'], ) @@ -1548,8 +1571,24 @@ def build_enc_dec(args): build_config['max_encoder_input_len'] = build_config['max_encoder_input_len'] \ if args.max_input_len is None else args.max_input_len build_config['max_decoder_input_len'] = 1 - build_config['max_output_len'] = build_config['max_output_len'] \ - if args.max_output_len is None else args.max_output_len + + if args.max_output_len: + logger.warning( + '--max_output_len has been deprecated in favor of --max_seq_len') + if args.max_input_len: + if args.max_seq_len: + logger.warning( + '--max_seq_len has been overwritten due to --max_output_len being specified' + ) + args.max_seq_len = args.max_input_len + args.max_output_len + else: + raise Exception( + f"max_output_len is specified but not max_input_len") + + del args.max_output_len + + build_config['max_seq_len'] = build_config['max_seq_len'] \ + if args.max_seq_len is None else args.max_seq_len build_config[ 'max_beam_width'] = 1 if args.max_beam_width is None else args.max_beam_width diff --git a/benchmarks/python/enc_dec_benchmark.py b/benchmarks/python/enc_dec_benchmark.py index 7fdbd18a6..0f40f7e10 100644 --- a/benchmarks/python/enc_dec_benchmark.py +++ b/benchmarks/python/enc_dec_benchmark.py @@ -115,7 +115,7 @@ def read_config(component): self.max_batch_size = config["builder_config"]["max_batch_size"] self.max_input_len = config["builder_config"][ "max_encoder_input_len"] - self.max_output_len = config["builder_config"]["max_output_len"] + self.max_seq_len = config["builder_config"]["max_seq_len"] self.n_mels = config["builder_config"][ 'n_mels'] if 'whisper' in self.model_name else 0 @@ -180,8 +180,8 @@ def read_config(component): if args.max_batch_size is None else args.max_batch_size self.max_input_len = build_config['max_encoder_input_len'] \ if args.max_input_len is None else args.max_input_len - self.max_output_len = build_config['max_output_len'] \ - if args.max_output_len is None else args.max_output_len + self.max_seq_len = build_config['max_seq_len'] \ + if args.max_seq_len is None else args.max_seq_len self.n_mels = build_config[ 'n_mels'] if 'whisper' in self.model_name else 0 
# Build engine @@ -218,10 +218,11 @@ def get_config(self): f"[WARNING] whisper benchmark is input_len=1500, no text prompt, output_len=arbitrary" ) for inlen, outlen in self.in_out_lens: - if (inlen > self.max_input_len or outlen > self.max_output_len): + if (inlen > self.max_input_len + or inlen + outlen > self.max_seq_len): print( f"[WARNING] check inlen({inlen}) <= max_inlen({self.max_input_len}) and " - f"outlen({outlen}) <= max_outlen({self.max_output_len}) failed, skipping." + f"inlen({inlen}) + outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping." ) continue for batch_size in self.batch_sizes: diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py index 7616dea4f..d0dea2855 100644 --- a/benchmarks/python/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -88,8 +88,8 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents, self.max_batch_size = args.max_batch_size if args.max_input_len is not None: self.max_input_len = args.max_input_len - if args.max_output_len is not None: - self.max_output_len = args.max_output_len + if args.max_seq_len is not None: + self.max_seq_len = args.max_seq_len self.quant_config = get_quant_config(args.quantization) self.quant_mode = self.quant_config.quant_mode @@ -209,10 +209,10 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents, def get_config(self): for inlen, outlen in self.in_out_lens: - if inlen > self.max_input_len or outlen > self.max_output_len: + if inlen > self.max_input_len or inlen + outlen > self.max_seq_len: print( - f'[WARNING] check inlen({inlen}) <= max_inlen({self.max_input_len}) and ' - f'outlen({outlen}) <= max_outlen({self.max_output_len}) failed, skipping.' + f'[WARNING] check inlen({inlen}) <= max_inlen({self.max_input_len}) or ' + f'seqlen({inlen + outlen}) <= max_seq_len({self.max_seq_len}) failed, skipping.' ) continue for batch_size in self.batch_sizes: @@ -314,7 +314,7 @@ def check_memory(self, io_shapes: list, raise_exception=False): output_length=outlen, max_batch_size=self.build_config.max_batch_size, max_input_len=self.build_config.max_input_len, - max_output_len=self.build_config.max_output_len, + max_seq_len=self.build_config.max_seq_len, max_beam_width=self.build_config.max_beam_width) for k, v in build_args.items(): tensorrt_llm.logger.info(f"{prefix} {k}:{v}") diff --git a/benchmarks/suite/tensorrt_llm_bench/benchmarkers/static.py b/benchmarks/suite/tensorrt_llm_bench/benchmarkers/static.py index 0fbc89da4..b2e67d06f 100644 --- a/benchmarks/suite/tensorrt_llm_bench/benchmarkers/static.py +++ b/benchmarks/suite/tensorrt_llm_bench/benchmarkers/static.py @@ -84,8 +84,8 @@ def get_build_command(self) -> List[str]: max_batch_size, "--max_input_len", max_isl, - "--max_output_len", - max_osl, + "--max_seq_len", + max_osl + max_isl, "--context_fmha", "enable", # Set the attention plugin data type. diff --git a/benchmarks/suite/tensorrt_llm_bench/ifb.py b/benchmarks/suite/tensorrt_llm_bench/ifb.py index 4d7e30052..cb12c04ad 100644 --- a/benchmarks/suite/tensorrt_llm_bench/ifb.py +++ b/benchmarks/suite/tensorrt_llm_bench/ifb.py @@ -140,8 +140,8 @@ def get_trtllm_build_command(benchmark_cfg: BenchmarkConfig) -> List[str]: benchmark_cfg.world_size, "--max_input_len", max_isl, - "--max_output_len", - max_osl, + "--max_seq_len", + max_osl + max_isl, "--context_fmha", "enable", # Set the attention plugin data type. 
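All of the benchmark changes above follow one convention: the per-engine generation budget (`max_output_len`) is folded into a total sequence budget, `max_seq_len = max_input_len + max_output_len` (hence 512 + 200 = 712 in `allowed_configs.py`, `max_osl + max_isl` in the build commands, and `2*${MAX_LEN}` in the README where input and output budgets are equal). A minimal sketch of that fallback, using a hypothetical stand-alone helper rather than the scripts' real argument objects:

```python
import warnings


def resolve_max_seq_len(max_input_len, max_output_len=None, max_seq_len=None):
    """Hypothetical helper mirroring the --max_output_len deprecation path.

    When the deprecated generation budget is given, the total sequence budget
    is derived as prompt budget + generation budget, overriding any explicitly
    passed max_seq_len (as in the benchmark.py / build.py changes above).
    """
    if max_output_len is not None:
        warnings.warn("max_output_len is deprecated in favor of max_seq_len")
        if max_input_len is None:
            raise ValueError("max_output_len requires max_input_len")
        if max_seq_len is not None:
            warnings.warn("max_seq_len is overridden by max_input_len + max_output_len")
        return max_input_len + max_output_len
    return max_seq_len


# The pre-defined GPT configs (max_input_len=512, max_output_len=200) map to 712.
assert resolve_max_seq_len(512, max_output_len=200) == 712
```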
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index a28ab7269..589cd280b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -82,6 +82,9 @@ struct KvCacheStats SizeType32 freeNumBlocks; SizeType32 usedNumBlocks; SizeType32 toksPerBlock; + SizeType32 allocTotalBlocks; + SizeType32 allocNewBlocks; + SizeType32 reusedBlocks; }; // Basic building block of a paged KV cache - a single @@ -329,6 +332,16 @@ class BlockManager return mFreePrimaryBlocks.size(); } + [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const + { + return mAllocTotalBlocks; + } + + [[nodiscard]] SizeType32 getNumAllocNewBlocks() const + { + return mAllocNewBlocks; + } + [[nodiscard]] SizeType32 getNumReusedBlocks() const noexcept { return mReusedBlocks; @@ -496,6 +509,21 @@ class KVCacheManager return mBlockManager.getNumFreeBlocks(); } + [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const + { + return mBlockManager.getNumAllocTotalBlocks(); + } + + [[nodiscard]] SizeType32 getNumAllocNewBlocks() const + { + return mBlockManager.getNumAllocNewBlocks(); + } + + [[nodiscard]] SizeType32 getNumReusedBlocks() const noexcept + { + return mBlockManager.getNumReusedBlocks(); + } + [[nodiscard]] KvCacheStats getKvCacheStats() const { KvCacheStats kvCacheStats; @@ -503,6 +531,9 @@ class KVCacheManager kvCacheStats.freeNumBlocks = getNumFreeBlocks(); kvCacheStats.usedNumBlocks = getUsedNumBlocks(); kvCacheStats.toksPerBlock = getTokensPerBlock(); + kvCacheStats.allocTotalBlocks = getNumAllocTotalBlocks(); + kvCacheStats.allocNewBlocks = getNumAllocNewBlocks(); + kvCacheStats.reusedBlocks = getNumReusedBlocks(); return kvCacheStats; } diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 7289afbf2..be4d5fa57 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -62,7 +62,7 @@ class GenericLlmRequest using VecLogProbs = std::vector; using BeamTokens = std::vector; using TensorPtr = TTensor; - using LogitsPostProcessor = std::function; + using LogitsPostProcessor = std::function; GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr inputTokens, runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt, @@ -76,6 +76,7 @@ class GenericLlmRequest std::optional> draftTokens = std::nullopt, std::optional draftLogits = std::nullopt, bool excludeInputFromOutput = false, std::optional logitsPostProcessor = std::nullopt, + bool applyLogitsPostProcessorBatched = false, std::optional> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false) : mRequestId(requestId) , mPromptLen(inputTokens->size()) @@ -86,6 +87,7 @@ class GenericLlmRequest , mEndId(endId) , mPadId(padId) , mLogitsPostProcessor(logitsPostProcessor) + , mApplyLogitsPostProcessorBatched(applyLogitsPostProcessorBatched) , mOrigPromptLen(mPromptLen) , mMaxSentTokenPos(mPromptLen - 1) , mEmbeddingBias(std::move(embeddingBias)) @@ -679,7 +681,7 @@ class GenericLlmRequest void allocContextLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType) { - mContextLogitsHost = runtime::BufferManager::pinned( + mContextLogitsHost = runtime::BufferManager::pinnedPool( runtime::ITensor::makeShape({mPromptLen, vocabSizePadded}), logitsDataType); } @@ -695,13 +697,13 @@ class GenericLlmRequest 
void allocGenerationLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType) { - mGenerationLogitsHost = runtime::BufferManager::pinned( + mGenerationLogitsHost = runtime::BufferManager::pinnedPool( runtime::ITensor::makeShape({mSamplingConfig.beamWidth, mMaxNewTokens, vocabSizePadded}), logitsDataType); } void allocTargetModelAcceptedTokenLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType) { - mGenerationLogitsHost = runtime::BufferManager::pinned( + mGenerationLogitsHost = runtime::BufferManager::pinnedPool( runtime::ITensor::makeShape({getNumDraftTokens() + 1, vocabSizePadded}), logitsDataType); } @@ -948,6 +950,7 @@ class GenericLlmRequest std::optional mPadId; std::optional mSeqSlot; std::optional mLogitsPostProcessor; + bool mApplyLogitsPostProcessorBatched; protected: BeamTokens mTokens; @@ -1073,20 +1076,24 @@ class LlmRequest : public GenericLlmRequest std::optional> draftTokens = std::nullopt, std::optional draftLogits = std::nullopt, bool excludeInputFromOutput = false, std::optional logitsPostProcessor = std::nullopt, + bool applyLogitsPostProcessorBatched = false, std::optional> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false) : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig), returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits), - excludeInputFromOutput, std::move(logitsPostProcessor), std::move(encoderInputTokens), returnEncoderOutput) + excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched, + std::move(encoderInputTokens), returnEncoderOutput) { } LlmRequest(RequestIdType requestId, executor::Request const& Request, - std::optional logitsPostProcessor = std::nullopt) + std::optional logitsPostProcessor = std::nullopt, + bool applyLogitsPostProcessorBatched = false) : Base(requestId, Request) { mLogitsPostProcessor = std::move(logitsPostProcessor); + mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched; } void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager) diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h index bda295d22..77cd6d673 100644 --- a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h +++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h @@ -41,7 +41,7 @@ class TrtGptModelOptionalParams bool normalizeLogProbs = true, bool enableChunkedContext = false, PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{}, executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1, - std::optional maxBeamWidth = std::nullopt, + std::optional maxBeamWidth = std::nullopt, std::optional maxBatchSize = std::nullopt, executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{}) : kvCacheConfig{kvCacheConfig} , enableTrtOverlap{enableTrtOverlap} @@ -52,6 +52,7 @@ class TrtGptModelOptionalParams , decodingConfig(std::move(decodingConfig)) , gpuWeightsPercent(gpuWeightsPercent) , maxBeamWidth(maxBeamWidth) + , maxBatchSize(maxBatchSize) , schedulerConfig{schedulerConfig} { } @@ -62,7 +63,7 @@ class TrtGptModelOptionalParams executorConfig.getNormalizeLogProbs(), 
executorConfig.getEnableChunkedContext(), PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())), executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}), - executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), + executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getSchedulerConfig()) { } @@ -87,6 +88,7 @@ class TrtGptModelOptionalParams // Percentage of weights on the gpu at runtime float gpuWeightsPercent; std::optional maxBeamWidth; + std::optional maxBatchSize; executor::SchedulerConfig schedulerConfig; }; diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index 748e4ba80..e398e6d08 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -263,6 +263,9 @@ class Request std::optional logitsPostProcessorName = std::nullopt, std::optional encoderInputTokenIds = std::nullopt); + /// @brief This logits postprocessor name will dispatch to the batched logits postprocessor + static auto constexpr kBatchedPostProcessorName = "batched"; + Request(Request const& other); Request(Request&& other) noexcept; Request& operator=(Request const& other); @@ -403,6 +406,14 @@ class KvCacheConfig [[nodiscard]] std::optional getHostCacheSize() const; [[nodiscard]] bool getOnboardBlocks() const; + void setEnableBlockReuse(bool enableBlockReuse); + void setMaxTokens(SizeType32 maxTokens); + void setMaxAttentionWindow(SizeType32 maxAttentionWindow); + void setSinkTokenLength(SizeType32 sinkTokenLength); + void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction); + void setHostCacheSize(size_t hostCacheSize); + void setOnboardBlocks(bool onboardBlocks); + private: friend class Serialization; @@ -557,49 +568,39 @@ class PeftCacheConfig std::optional mHostCacheSize; }; -/// @brief Configuration class for Lookahead decoding. -class LookaheadDecodingConfig +struct LookaheadDecodingConfig { -public: - explicit LookaheadDecodingConfig( - SizeType32 maxNgramSize, SizeType32 maxWindowSize, SizeType32 maxVerificationSetSize); - - bool operator==(LookaheadDecodingConfig const& other) const; + LookaheadDecodingConfig(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize); - void setMaxNgramSize(SizeType32); - void setMaxWindowSize(SizeType32); - void setMaxVerificationSetSize(SizeType32); - [[nodiscard]] SizeType32 getMaxNgramSize() const; - [[nodiscard]] SizeType32 getMaxWindowSize() const; - [[nodiscard]] SizeType32 getMaxVerificationSetSize() const; + explicit LookaheadDecodingConfig() + : LookaheadDecodingConfig(1, 1, 0) + { + } -private: - friend class Serialization; - - // Number of tokens per NGram. - SizeType32 mMaxNgramSize; - // Number of NGrams in lookahead branch per step. - SizeType32 mMaxWindowSize; - // Number of NGrams in verification branch per step. - SizeType32 mMaxVerificationSetSize; -}; + bool operator==(LookaheadDecodingConfig const& other) const; + [[nodiscard]] std::tuple get() const; + [[nodiscard]] SizeType32 getWindowSize() const; + [[nodiscard]] SizeType32 getNgramSize() const; + [[nodiscard]] SizeType32 getVerificationSetSize() const; -/// @brief Configuration class for explicit draft tokens decoding. 
-class ExplicitDraftTokensConfig -{ -public: - explicit ExplicitDraftTokensConfig(float temperature); + /// @brief return + std::tuple calculateSpeculativeResource() const; - bool operator==(ExplicitDraftTokensConfig const& other) const; + /// @brief return true when `this` can be executed on resources defined by `that` + bool isLE(LookaheadDecodingConfig const& that) const; - void setTemperature(float); - [[nodiscard]] float getTemperature() const; + /// @brief return true when the parameter combination is valid. + static bool isLegal(SizeType32 windowSize, SizeType32 ngramSize, SizeType32 verificationSetSize) noexcept; private: friend class Serialization; - // Sampling temperature. - float mTemperature; + // Number of NGrams in lookahead branch per step. + SizeType32 mWindowSize; + // Number of tokens per NGram. + SizeType32 mNgramSize; + // Number of NGrams in verification branch per step. + SizeType32 mVerificationSetSize; }; /// @brief Configuration class for the speculative decoding. @@ -608,8 +609,7 @@ class DecodingConfig public: explicit DecodingConfig(std::optional decodingMode = std::nullopt, std::optional lookaheadDecodingConfig = std::nullopt, - std::optional medusaChoices = std::nullopt, - std::optional explicitDraftTokensConfig = std::nullopt); + std::optional medusaChoices = std::nullopt); bool operator==(DecodingConfig const& other) const; @@ -620,7 +620,7 @@ class DecodingConfig // Lookahead methods. /// @brief Sets lookahead decoding mode and config. - void setLookaheadDecoding(LookaheadDecodingConfig const&); + void setLookaheadDecoding(LookaheadDecodingConfig const& lookaheadDecodingConfig); [[nodiscard]] std::optional getLookaheadDecodingConfig() const; // Medusa methods. @@ -628,11 +628,6 @@ class DecodingConfig void setMedusaChoices(MedusaChoices const&); [[nodiscard]] std::optional getMedusaChoices() const; - // ExplicitDraftTokens decoding methods. - /// @brief Sets explicit draft tokens decoding mode and config. - void setExplicitDraftTokens(ExplicitDraftTokensConfig const&); - [[nodiscard]] std::optional getExplicitDraftTokensConfig() const; - private: friend class Serialization; @@ -642,8 +637,6 @@ class DecodingConfig std::optional mLookaheadDecodingConfig; // Medusa params. std::optional mMedusaChoices; - // Explicit draft tokens params. 
- std::optional mExplicitDraftTokensConfig; }; /// @brief Configuration class for the model executor @@ -654,10 +647,11 @@ class ExecutorConfig KvCacheConfig const& kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = false, bool normalizeLogProbs = true, SizeType32 iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType32 requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, - BatchingType batchingType = BatchingType::kINFLIGHT, + BatchingType batchingType = BatchingType::kINFLIGHT, std::optional maxBatchSize = std::nullopt, std::optional parallelConfig = std::nullopt, std::optional const& peftCacheConfig = std::nullopt, std::optional logitsPostProcessorMap = std::nullopt, + std::optional logitsPostProcessorBatched = std::nullopt, std::optional decodingConfig = std::nullopt, float gpuWeightsPercent = 1); [[nodiscard]] SizeType32 getMaxBeamWidth() const; @@ -668,13 +662,16 @@ class ExecutorConfig [[nodiscard]] SizeType32 getIterStatsMaxIterations() const; [[nodiscard]] SizeType32 getRequestStatsMaxIterations() const; [[nodiscard]] BatchingType getBatchingType() const; + [[nodiscard]] std::optional getMaxBatchSize() const; [[nodiscard]] std::optional getParallelConfig() const; [[nodiscard]] std::optional getPeftCacheConfig() const; [[nodiscard]] std::optional getLogitsPostProcessorMap() const; + [[nodiscard]] std::optional getLogitsPostProcessorBatched() const; [[nodiscard]] std::optional getDecodingConfig() const; [[nodiscard]] float getGpuWeightsPercent() const; void setMaxBeamWidth(SizeType32 maxBeamWidth); + void setMaxBatchSize(SizeType32 maxBatchSize); void setSchedulerConfig(SchedulerConfig const& schedulerConfig); void setKvCacheConfig(KvCacheConfig const& kvCacheConfig); void setEnableChunkedContext(bool enableChunkedContext); @@ -685,6 +682,7 @@ class ExecutorConfig void setParallelConfig(ParallelConfig const& parallelConfig); void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig); void setLogitsPostProcessorMap(LogitsPostProcessorMap const& logitsPostProcessorMap); + void setLogitsPostProcessorBatched(LogitsPostProcessorBatched const& logitsPostProcessorBatched); void setDecodingConfig(DecodingConfig const& decodingConfig); void setGpuWeightsPercent(float const& gpuWeightsPercent); @@ -715,10 +713,14 @@ class ExecutorConfig /// @brief The type of batching strategy to use. See BatchingType. BatchingType mBatchingType; + /// @brief The max batch size of requests + std::optional mMaxBatchSize; + /// @brief The parallel execution configuration. std::optional mParallelConfig; std::optional mPeftCacheConfig; std::optional mLogitsPostProcessorMap; + std::optional mLogitsPostProcessorBatched; /// @brief Decoding configuration. 
std::optional mDecodingConfig; float mGpuWeightsPercent; diff --git a/cpp/include/tensorrt_llm/executor/serialization.h b/cpp/include/tensorrt_llm/executor/serialization.h index d3f40d342..7f3314c2e 100644 --- a/cpp/include/tensorrt_llm/executor/serialization.h +++ b/cpp/include/tensorrt_llm/executor/serialization.h @@ -112,11 +112,6 @@ class Serialization static void serialize(LookaheadDecodingConfig const& lookaheadDecodingConfig, std::ostream& os); static size_t serializedSize(LookaheadDecodingConfig const& lookaheadDecodingConfig); - // ExplicitDraftTokensConfig - static ExplicitDraftTokensConfig deserializeExplicitDraftTokensConfig(std::istream& is); - static void serialize(ExplicitDraftTokensConfig const& ExplicitDraftTokensConfig, std::ostream& os); - static size_t serializedSize(ExplicitDraftTokensConfig const& ExplicitDraftTokensConfig); - // DecodingConfig static DecodingConfig deserializeDecodingConfig(std::istream& is); static void serialize(DecodingConfig const& decodingConfig, std::ostream& os); diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h index afb0e3a80..009707c56 100644 --- a/cpp/include/tensorrt_llm/executor/types.h +++ b/cpp/include/tensorrt_llm/executor/types.h @@ -53,8 +53,10 @@ using IterationType = std::uint64_t; using RandomSeedType = std::uint64_t; using VecLogProbs = std::vector; using StreamPtr = std::shared_ptr; -using LogitsPostProcessor = std::function; +using LogitsPostProcessor = std::function; using LogitsPostProcessorMap = std::unordered_map; +using LogitsPostProcessorBatched = std::function const&, std::vector&, + std::vector> const&, StreamPtr const&)>; using MedusaChoices = std::vector>; enum class DataType @@ -224,6 +226,12 @@ struct KvCacheStats SizeType32 usedNumBlocks; /// @brief Number of tokens per block SizeType32 tokensPerBlock; + /// @brief Number of total allocated block + SizeType32 allocTotalBlocks; + /// @brief Number of newly allocated block + SizeType32 allocNewBlocks; + /// @brief Number of reused block + SizeType32 reusedBlocks; }; /// @brief Struct that holds the stats of static batching models for a single iteration @@ -267,6 +275,8 @@ struct IterationStats std::string timestamp; /// @brief Iteration id IterationType iter; + /// @brief Iteration latency (ms) + double iterLatencyMS; /// @brief Number of active requests SizeType32 numActiveRequests; /// @brief Number of max active requests @@ -717,6 +727,8 @@ static_assert(!DecodingMode::Lookahead().isBeamSearch()); static_assert(!DecodingMode::Lookahead().isMedusa()); static_assert(!DecodingMode::Lookahead().isExplicitDraftTokens()); static_assert(DecodingMode::Lookahead().isUseStopCriteria()); +static_assert(DecodingMode::Lookahead().isUseStopWords()); +static_assert(DecodingMode::Lookahead().isUseExplicitEosStop()); static_assert(DecodingMode::Lookahead().isLookahead()); static_assert(!DecodingMode::ExplicitDraftTokens().isAuto()); diff --git a/cpp/include/tensorrt_llm/runtime/decodingInput.h b/cpp/include/tensorrt_llm/runtime/decodingInput.h index 70700491c..613a7da1c 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingInput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingInput.h @@ -29,13 +29,13 @@ class DecodingInput public: using TensorPtr = std::shared_ptr; - DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, - SizeType32 maxBatchSize, TensorPtr logits, TensorPtr endIds) + DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 
batchSize, + TensorPtr logits, TensorPtr endIds) : step{maxLength} , maxLength{maxLength} , maxAttentionWindow{maxAttentionWindow} , sinkTokenLength{sinkTokenLength} - , maxBatchSize{maxBatchSize} + , batchSize{batchSize} , maxStopWordsLen{0} , maxBadWordsLen{0} , logits{std::move(logits)} @@ -50,46 +50,68 @@ class DecodingInput SizeType32 maxLength; SizeType32 maxAttentionWindow; SizeType32 sinkTokenLength; - SizeType32 maxBatchSize; + SizeType32 batchSize; SizeType32 maxStopWordsLen; // The maximum value in the `stopWordsLens` tensor SizeType32 maxBadWordsLen; // The maximum value in the `badWordsLens` tensor TensorPtr logits; // [batchSize, beamWidth, vocabSizePadded], on gpu std::optional> logitsVec; // vector of size [batchSize] contains logits of size [beamWidth, vocabSizePadded], on gpu - TensorPtr endIds; // [maxBatchSize * beamWidth], on gpu + TensorPtr endIds; // [batchSize * beamWidth], on gpu // optional parameters - TensorPtr finished; // [maxBatchSize, beamWidth], finished states at current iteration. + TensorPtr finished; // [batchSize, beamWidth], finished states at current iteration. // If true for some request, the decoding step of it is skipped, on gpu - TensorPtr sequenceLimitLength; // [maxBatchSize], on gpu - TensorPtr embeddingBias; // [maxBatchSize, vocabSizePadded], on gpu - TensorPtr lengths; // [maxBatchSize, beamWidth], on gpu - TensorPtr badWordsList; // [2, badWordsLength] or [maxBatchSize, 2, badWordsLength], on gpu - TensorPtr badWordsPtrs; // [maxBatchSize][2, badWordsLength], on gpu - TensorPtr badWordsLens; // [maxBatchSize], on gpu - TensorPtr stopWordsList; // [maxBatchSize, 2, stopWordsLength], on gpu - TensorPtr stopWordsPtrs; // [maxBatchSize][2, stopWordsLength], on gpu - TensorPtr stopWordsLens; // [maxBatchSize], on gpu - TensorPtr noRepeatNgramSize; // [maxBatchSize], on gpu + TensorPtr sequenceLimitLength; // [batchSize], on gpu + TensorPtr embeddingBias; // [batchSize, vocabSizePadded], on gpu + TensorPtr lengths; // [batchSize, beamWidth], on gpu + TensorPtr badWordsList; // [2, badWordsLength] or [batchSize, 2, badWordsLength], on gpu + TensorPtr badWordsPtrs; // [batchSize][2, badWordsLength], on gpu + TensorPtr badWordsLens; // [batchSize], on gpu + TensorPtr stopWordsList; // [batchSize, 2, stopWordsLength], on gpu + TensorPtr stopWordsPtrs; // [batchSize][2, stopWordsLength], on gpu + TensorPtr stopWordsLens; // [batchSize], on gpu + TensorPtr noRepeatNgramSize; // [batchSize], on gpu TensorPtr batchSlots; // [batchSize], optional, address map of the linear batch id to to the seq slots, int32_t, pinned // parameters for beam search - TensorPtr cacheIndirection; // [maxBatchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu + TensorPtr cacheIndirection; // [batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu // Medusa class MedusaInputs { public: - TensorPtr medusaPaths; // [maxBatchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu - TensorPtr medusaTreeIds; // [maxBatchSize, maxTokensPerStep], on gpu + TensorPtr medusaPaths; // [batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu + TensorPtr medusaTreeIds; // [batchSize, maxTokensPerStep], on gpu std::vector> - medusaLogits; // [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu - TensorPtr medusaCurTokensPerStep; // [maxBatchSize], on gpu - TensorPtr medusaTargetTokensPerStep; // [maxBatchSize], on gpu + medusaLogits; // [batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, 
vocabSizePadded], on gpu + TensorPtr medusaCurTokensPerStep; // [batchSize], on gpu + TensorPtr medusaTargetTokensPerStep; // [batchSize], on gpu + }; + + class ExplicitDraftTokensInputs + { + public: + TensorPtr nextDraftTokens; // [batchSize, maxNumPaths, maxPathLen] + TensorPtr nextFlatTokens; // [batchSize * maxDecodingTokens] + TensorPtr nextDraftIndices; // [batchSize, maxNumPaths, maxPathLen] + TensorPtr nextDraftProbs; // [batchSize, maxNumPaths, maxDraftPathLen, vocabSize] + TensorPtr lastDraftTokens; // [batchSize, maxNumPaths, maxPathLen] + TensorPtr lastDraftIndices; // [batchSize, maxNumPaths, maxPathLen] + TensorPtr masks; // [batchSize, maxDecodingTokens, maxDecodingTokens], bool + TensorPtr packedPositionIds; // [batchSize * maxDecodingTokens] + TensorPtr bestPathLengths; // [batchSize] + TensorPtr bestPathIndices; // [batchSize] + TensorPtr nextGenerationLengths; // [batchSize] + TensorPtr lastPositionIdsBase; // [batchSize] + TensorPtr lastGenerationLengths; // [batchSize] + TensorPtr maxGenLengthDevice; // [1] + TensorPtr seqSlots; // [batchSize] }; std::optional medusaInputs; + + std::optional explicitDraftTokensInputs; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/decodingOutput.h b/cpp/include/tensorrt_llm/runtime/decodingOutput.h index 252466671..8298b07a2 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingOutput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingOutput.h @@ -18,6 +18,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h" #include "tensorrt_llm/runtime/iTensor.h" #include #include @@ -94,12 +95,15 @@ class DecodingOutput public: TensorPtr nextDraftTokens; // [maxBatchSize, maxDraftTokens] TensorPtr nextDraftTokensLen; // [maxBatchSize] + TensorPtr prevDraftTokensLen; // [maxBatchSize] TensorPtr acceptedTokensLen; // [maxBatchSize] TensorPtr acceptedLengthsCumSum; // [maxBatchSize + 1] TensorPtr pathsOffsets; // [maxBatchSize, maxAcceptedDraftTokensPerStep] }; std::optional speculativeDecodingOutputs; + + std::optional explicitDraftTokensBuffers; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/explicitDraftTokensBuffers.h b/cpp/include/tensorrt_llm/runtime/explicitDraftTokensBuffers.h new file mode 100644 index 000000000..983c13912 --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/explicitDraftTokensBuffers.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/runtime/explicitDraftTokensModule.h" +#include "tensorrt_llm/runtime/iBuffer.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/modelConfig.h" +#include "tensorrt_llm/runtime/tllmRuntime.h" +#include "tensorrt_llm/runtime/worldConfig.h" + +#include + +namespace tensorrt_llm::runtime +{ + +class ExplicitDraftTokensBuffers +{ +public: + using SizeType32 = runtime::SizeType32; + using ITensor = runtime::ITensor; + using BufferPtr = runtime::IBuffer::SharedPtr; + using TensorPtr = runtime::ITensor::SharedPtr; + using TensorMap = runtime::StringPtrMap; + + class Inputs + { + public: + //! [batchSize] + TensorPtr temperatures; + //! [batchSize] + TensorPtr positionIdsBase; + //! [batchSize] or [numGenSequences] + TensorPtr generationLengths; + //! [batchSize] + TensorPtr randomDataSample; + //! [batchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen] + TensorPtr randomDataValidation; + //! [batchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen] + TensorPtr draftTokens; + //! [batchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen] + TensorPtr draftIndices; + //! [batchSize, maxNumPaths, maxPathDraftLen, vocabSize] + //! or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize] + TensorPtr draftProbs; + //! [batchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] + //! or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)] + TensorPtr packedMasks; + //! [batchSize] or [numGenSequences] + TensorPtr positionIds; + // [1], on pinned + TensorPtr maxGenLengthHost; + + void create(SizeType32 maxNumSequences, runtime::TllmRuntime const& runtime, + runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig); + }; + + class EngineInputs : public Inputs + { + public: + //! [numSequences], on gpu + TensorPtr requestTypesDevice; + //! [numGenSequences] + TensorPtr positionOffsets; + } engineInputs; + + class EngineOutputs + { + public: + //! [batchSize] + TensorPtr nextGenerationLengths; + //! [batchSize] + TensorPtr nextPositionOffsets; + //! [batchSize, maxDecodingTokens, maxDecodingTokens], bool + TensorPtr masks; + + //! [batchSize, maxNumPaths, maxPathLen] + TensorPtr nextDraftTokens; + //! [batchSize, maxNumPaths, maxPathLen] + TensorPtr nextDraftIndices; + //! [batchSize, maxNumPaths, maxDraftPathLen, vocabSize] + TensorPtr nextDraftProbs; + + //! [batchSize * maxDecodingTokens] + TensorPtr nextFlatTokens; + //! [batchSize] + TensorPtr bestPathLengths; + //! [batchSize] + TensorPtr bestPathIndices; + //! [1] + TensorPtr maxGenToken; + //! [1] + TensorPtr totalGenToken; + //! 
[batchSize * maxDecodingTokens] + TensorPtr packedPositionIds; + } engineOutputs; + +public: + ExplicitDraftTokensBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager, + runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, + executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime); + + void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig); + + void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const& requestTypes, + ITensor const& seqSlots, ExplicitDraftTokensBuffers::Inputs const& decoderBuffers, + ITensor const& contextPositionIds, runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig, + runtime::WorldConfig const& worldConfig) const; + + void insertInputTensors( + TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& worldConfig) const; + +private: + template + void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded, + ITensor const& seqSlots, ExplicitDraftTokensBuffers::Inputs const& draftBuffers, + ITensor const& contextPositionIds, runtime::ExplicitDraftTokensModule const& explicitDraftTokensModule, + runtime::CudaStream const& stream) const; + +public: + // helper tensors + std::size_t scanTempStorageBytes{0}; + BufferPtr scanTempStorage; + TensorPtr cumSumGenerationLengths; +}; + +} // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h index 5d37a65f3..136f8f32b 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoder.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoder.h @@ -18,18 +18,15 @@ #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/runtime/bufferManager.h" -#include "tensorrt_llm/runtime/cudaStream.h" #include "tensorrt_llm/runtime/decodingInput.h" #include "tensorrt_llm/runtime/decodingOutput.h" -#include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/samplingConfig.h" -#include "tensorrt_llm/runtime/worldConfig.h" + +#include #include #include -#include - namespace tensorrt_llm { @@ -43,6 +40,8 @@ class DynamicDecodeLayer; namespace runtime { +class SpeculativeDecodingModule; + class IGptDecoder { public: @@ -51,7 +50,8 @@ class IGptDecoder virtual ~IGptDecoder() = default; virtual void setup(SamplingConfig const& samplingConfig, size_t batchSize, - std::optional const& batchSlots = std::nullopt) + std::optional const& batchSlots = std::nullopt, + std::optional const& output = std::nullopt) = 0; virtual void forwardAsync(DecodingOutput& output, DecodingInput const& input) = 0; @@ -93,7 +93,8 @@ class GptDecoder : public virtual IGptDecoder std::shared_ptr speculativeDecodingModule = nullptr); void setup(SamplingConfig const& samplingConfig, size_t batchSize, - std::optional const& batchSlots = std::nullopt) override; + std::optional const& batchSlots = std::nullopt, + std::optional const& output = std::nullopt) override; void forwardAsync(DecodingOutput& output, DecodingInput const& input) override; @@ -133,7 +134,9 @@ inline std::unique_ptr IGptDecoder::create(executor::DecodingMode c case nvinfer1::DataType::kHALF: return std::make_unique>(mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded, maxSequenceLength, stream, speculativeDecodingModule); - default: TLLM_THROW("Unsupported decoder data type. 
Use either kFLOAT or kHALF."); return nullptr; + default: + TLLM_THROW("Unsupported decoder data type: %d. Use either kFLOAT or kHALF.", static_cast(dtype)); + return nullptr; } } } // namespace runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h index 4fb884509..cf230a649 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h @@ -38,6 +38,12 @@ class GptDecoderBatch : public IGptDecoderBatch using TensorPtr = ITensor::SharedPtr; using SharedConstPtr = ITensor::SharedConstPtr; + enum class ForwardType + { + kASYNC, + kSYNC + }; + GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream, SpeculativeDecodingMode const& speculativeDecodingMode); @@ -47,6 +53,8 @@ class GptDecoderBatch : public IGptDecoderBatch SizeType32 maxTokensPerStep, bool fusedDecoder, nvinfer1::DataType dtype, ModelConfig const& modelConfig) override; + void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override; + void newBatch( GenerationInput const& inputs, GenerationOutput const& outputs, SamplingConfig const& samplingConfig) override; @@ -164,6 +172,12 @@ class GptDecoderBatch : public IGptDecoderBatch return mJointDecodingOutput->speculativeDecodingOutputs->nextDraftTokens; } + //! @returns [batchSize], predicted draft tokens lengths for previous step, on gpu + [[nodiscard]] TensorPtr getPrevDraftTokensLengths() const override + { + return mJointDecodingOutput->speculativeDecodingOutputs->prevDraftTokensLen; + } + //! @returns [batchSize], predicted draft tokens lengths for next step, on gpu [[nodiscard]] TensorPtr getNextDraftTokensLengths() const override { @@ -171,13 +185,13 @@ class GptDecoderBatch : public IGptDecoderBatch } //! @returns [batchSize + 1], exclusive sum of accepted draft token lengths, on gpu - [[nodiscard]] TensorPtr getSpecDecodingAcceptedLengthsCumSum() const override + [[nodiscard]] TensorPtr getAcceptedLengthsCumSum() const override { return mJointDecodingOutput->speculativeDecodingOutputs->acceptedLengthsCumSum; } //! @returns [batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu - [[nodiscard]] TensorPtr getSpecDecodingAcceptedPackedPaths() const override + [[nodiscard]] TensorPtr getAcceptedPackedPaths() const override { return mJointDecodingOutput->speculativeDecodingOutputs->pathsOffsets; } @@ -215,17 +229,19 @@ class GptDecoderBatch : public IGptDecoderBatch //! @brief Updates finished state on host for all active requests void updateFinished(decoder_batch::Token const& token); + //! @brief Sets inputs for explicit draft tokens. + void setExplicitDraftTokensInputs(decoder_batch::Input const& input); + //! @brief Calls unfused or fused decoders for tokens per engine step - void forwardDispatch( - decoder_batch::Output& output, decoder_batch::Input const& input, std::optional const& eventStart); + void forwardDispatch(decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType); //! @brief Calls unfused decoder for whole batch in loop - void forwardUnfusedDecoder(SizeType32 step, decoder_batch::Output& output, decoder_batch::Input const& input, - std::optional const& eventStart); + void forwardUnfusedDecoder( + SizeType32 step, decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType); //! 
@brief Calls fused decoder for whole batch - void forwardFusedDecoder(SizeType32 step, decoder_batch::Output& output, decoder_batch::Input const& input, - std::optional const& eventStart); + void forwardFusedDecoder( + SizeType32 step, decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType); private: std::size_t const mVocabSize; diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index 41caad418..d21ba2982 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -32,6 +32,7 @@ #include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/modelConfig.h" +#include "tensorrt_llm/runtime/rawEngine.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" @@ -158,28 +159,34 @@ class [[deprecated("Use the executor API instead.")]] GptSession //! @param sessionConfig Configuration of the session, //! @param modelConfig Description of the model, //! @param worldConfig Description of the environment, - //! @param engineBuffer The compiled TensorRT engine (const void*), - //! @param engineSize The size in bytes of the TensorRT engine (size_t), + //! @param rawEngine The compiled TensorRT engine, //! @param logger The optional logger. GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig, - void const* engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr); + RawEngine const& rawEngine, LoggerPtr logger = nullptr); + + GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig, + void const* engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr) + : GptSession(sessionConfig, modelConfig, worldConfig, RawEngine(engineBuffer, engineSize), std::move(logger)) + { + } GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig, std::vector const& engineBuffer, LoggerPtr logger = nullptr) - : GptSession( - sessionConfig, modelConfig, worldConfig, engineBuffer.data(), engineBuffer.size(), std::move(logger)) + : GptSession(sessionConfig, modelConfig, worldConfig, RawEngine(engineBuffer.data(), engineBuffer.size()), + std::move(logger)) { } GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig, std::string const& engineFile, LoggerPtr logger = nullptr) - : GptSession(sessionConfig, modelConfig, worldConfig, utils::loadEngine(engineFile), std::move(logger)) + : GptSession(sessionConfig, modelConfig, worldConfig, RawEngine(engineFile), std::move(logger)) { } [[nodiscard]] nvinfer1::ILogger& getLogger() const; [[nodiscard]] BufferManager const& getBufferManager() const; + [[nodiscard]] BufferManager::CudaStreamPtr getRuntimeStreamPtr() const; [[nodiscard]] ModelConfig const& getModelConfig() const { diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h index c934117ce..5e7702e7f 100644 --- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h +++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h @@ -18,8 +18,10 @@ #include "tensorrt_llm/runtime/cudaEvent.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h" #include "tensorrt_llm/runtime/iStatefulGptDecoder.h" #include "tensorrt_llm/runtime/iTensor.h" +#include 
"tensorrt_llm/runtime/request.h" #include "tensorrt_llm/runtime/utils/sessionUtils.h" #include @@ -31,41 +33,6 @@ namespace tensorrt_llm::runtime namespace decoder_batch { -class Request -{ -public: - using ConstTensorPtr = ITensor::SharedConstPtr; - using TensorPtr = ITensor::SharedPtr; - using BufferPtr = IBuffer::SharedPtr; - - explicit Request(ConstTensorPtr ids, SizeType32 inputLen, std::optional maxNewTokens = std::nullopt, - std::optional endId = std::nullopt) - : ids{std::move(ids)} - , inputLen(inputLen) - , maxNewTokens{maxNewTokens} - , endId{endId} - , generatedTokensPerEngineStep(1) - { - } - - // mandatory parameters - ConstTensorPtr ids; // [inputSeqLen], the input sequence of token ids, on gpu - SizeType32 inputLen; // the input length without draft tokens - - // optional parameters - std::optional maxNewTokens; // maximum number of tokens to generate for this request - std::optional endId; // end token id - BufferPtr draftTokens; // [generatedTokensPerStep - 1], on gpu, draft tokens from speculative decoding - std::optional - draftLogits; // [generatedTokensPerStep - 1, vocabSize], on gpu, draft tokens from speculative decoding - TensorPtr embeddingBias; // [vocabSizePadded], on gpu - TensorPtr badWordsList; // [2, badWordsLength], on gpu - TensorPtr stopWordsList; // [2, stopWordsLength], on gpu - - SizeType32 generatedTokensPerEngineStep; - TensorPtr medusaPaths; // [maxDraftTokens + 1, maxAcceptedDraftTokensPerStep + 1], on gpu - TensorPtr medusaTreeIds; // [maxDraftTokens + 1], on gpu -}; class Input { @@ -109,6 +76,11 @@ class Input // within one beam for beam search, on gpu std::vector> predictedDraftLogits; // [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded] + TensorConstPtr seqSlots; // [batchSize] + + // explicit draft tokens data. + std::optional explicitDraftTokensInputs; + std::optional explicitDraftTokensLastInputs; }; using Output = decoder::Output; @@ -136,6 +108,9 @@ class IGptDecoderBatch : public virtual IStatefulGptDecoder using TensorPtr = std::shared_ptr; using TokenPtr = std::unique_ptr; + //! @brief Setup buffers for ExplicitDraftTokens decoding. + virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0; + //! @brief Run one step for all requests without blocking the host process and return the token for synchronization. virtual TokenPtr forwardAsync(decoder_batch::Output& output, decoder_batch::Input const& input) = 0; @@ -189,14 +164,17 @@ class IGptDecoderBatch : public virtual IStatefulGptDecoder //! @returns [batchSize, maxTokensPerStep-1], predicted draft tokens for next step, on gpu virtual TensorPtr getNextDraftTokens() const = 0; + //! @returns [batchSize], predicted draft tokens lengths for previous step, on gpu + virtual TensorPtr getPrevDraftTokensLengths() const = 0; + //! @returns [batchSize], predicted draft tokens lengths for next step, on gpu virtual TensorPtr getNextDraftTokensLengths() const = 0; //! @returns [batchSize + 1], exclusive sum of accepted draft token lengths, on gpu - virtual TensorPtr getSpecDecodingAcceptedLengthsCumSum() const = 0; + virtual TensorPtr getAcceptedLengthsCumSum() const = 0; //! 
@returns [batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu - virtual TensorPtr getSpecDecodingAcceptedPackedPaths() const = 0; + virtual TensorPtr getAcceptedPackedPaths() const = 0; protected: IGptDecoderBatch() = default; diff --git a/cpp/include/tensorrt_llm/runtime/lookaheadModule.h b/cpp/include/tensorrt_llm/runtime/lookaheadModule.h index cc9a1c18f..db261b83c 100644 --- a/cpp/include/tensorrt_llm/runtime/lookaheadModule.h +++ b/cpp/include/tensorrt_llm/runtime/lookaheadModule.h @@ -16,7 +16,12 @@ #pragma once +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/modelConfig.h" +#include "tensorrt_llm/runtime/request.h" #include "tensorrt_llm/runtime/speculativeDecodingModule.h" +#include namespace tensorrt_llm::runtime { @@ -24,8 +29,9 @@ namespace tensorrt_llm::runtime class LookaheadModule : public SpeculativeDecodingModule { public: - explicit LookaheadModule(SizeType32 maxAcceptedTokens, SizeType32 maxDraftTokens) noexcept - : SpeculativeDecodingModule(maxAcceptedTokens, maxDraftTokens, maxDraftTokens) + explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept + : SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens) + , mExecutionConfig() { } @@ -33,5 +39,19 @@ class LookaheadModule : public SpeculativeDecodingModule : LookaheadModule(0, 0) { } + + void setExecutionConfig(executor::LookaheadDecodingConfig const& config) + { + mExecutionConfig = config; + } + + executor::LookaheadDecodingConfig const getExecutionConfig() const + { + return mExecutionConfig; + } + +private: + executor::LookaheadDecodingConfig mExecutionConfig; }; + } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/modelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h index 2be879e98..fd675075c 100644 --- a/cpp/include/tensorrt_llm/runtime/modelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h @@ -21,7 +21,9 @@ #include "tensorrt_llm/runtime/loraModule.h" #include "tensorrt_llm/runtime/speculativeDecodingMode.h" #include "tensorrt_llm/runtime/speculativeDecodingModule.h" + #include +#include namespace tensorrt_llm::runtime { @@ -29,6 +31,12 @@ namespace tensorrt_llm::runtime class ModelConfig { public: + // See `split_point` defined in `tensorrt_llm/models/generation_mixin.py`. + // The split points are tuned to get better perf, if we need to let + // users tune that, we can support that by writing and reading the + // points in `config.json`. 
+ static constexpr std::array kOPT_PROFILES_SPLIT_POINTS{64, 128, 256, 512, 1024}; + enum class ModelVariant : std::int32_t { kGpt = 0, @@ -88,7 +96,14 @@ class ModelConfig , mUsePositionEmbedding(false) , mUseTokenTypeEmbedding(false) , mSpeculativeDecodingMode(SpeculativeDecodingMode::None()) + , mLogitsDtype(nvinfer1::DataType::kFLOAT) + , mUseShapeInference(true) + { + } + + [[nodiscard]] static std::vector getOptProfilesSplitPoints() noexcept { + return {kOPT_PROFILES_SPLIT_POINTS.begin(), kOPT_PROFILES_SPLIT_POINTS.end()}; } [[nodiscard]] SizeType32 constexpr getVocabSize() const noexcept @@ -555,6 +570,26 @@ class ModelConfig return mSpeculativeDecodingMode; } + void setLogitsDtype(nvinfer1::DataType inputDtype) noexcept + { + mLogitsDtype = inputDtype; + } + + [[nodiscard]] nvinfer1::DataType constexpr getLogitsDtype() const noexcept + { + return mLogitsDtype; + } + + void setUseShapeInference(bool useShapeInference) noexcept + { + mUseShapeInference = useShapeInference; + } + + [[nodiscard]] bool useShapeInference() const noexcept + { + return mUseShapeInference; + } + private: SizeType32 mVocabSize; SizeType32 mNbAttentionLayers; @@ -608,6 +643,10 @@ class ModelConfig // Speculative decoding members std::shared_ptr mSpeculativeDecodingModule; SpeculativeDecodingMode mSpeculativeDecodingMode; + + // Logits datatype + nvinfer1::DataType mLogitsDtype; + bool mUseShapeInference; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/rawEngine.h b/cpp/include/tensorrt_llm/runtime/rawEngine.h new file mode 100644 index 000000000..6a6e7eb5a --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/rawEngine.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/common/assert.h" + +#include +#include + +namespace tensorrt_llm::runtime +{ + +class RawEngine +{ +public: + enum Type + { + FilePath, + AddressWithSize, + HostMemory + }; + + explicit RawEngine(std::filesystem::path enginePath) noexcept + : mType(FilePath) + , mEnginePath(std::move(enginePath)) + { + } + + explicit RawEngine(void const* engineAddr, std::size_t engineSize) noexcept + : mType(AddressWithSize) + , mEngineAddr(engineAddr) + , mEngineSize(engineSize) + { + } + + explicit RawEngine(nvinfer1::IHostMemory const* engineBuffer) noexcept + : mType(HostMemory) + , mEngineBuffer(engineBuffer) + { + } + + [[nodiscard]] Type getType() const + { + return mType; + } + + [[nodiscard]] std::filesystem::path getPath() const + { + TLLM_CHECK(mType == FilePath); + return mEnginePath; + } + + [[nodiscard]] void const* getAddress() const + { + TLLM_CHECK(mType == AddressWithSize); + return mEngineAddr; + } + + [[nodiscard]] std::size_t getSize() const + { + TLLM_CHECK(mType == AddressWithSize); + return mEngineSize; + } + + [[nodiscard]] nvinfer1::IHostMemory const* getHostMemory() const + { + TLLM_CHECK(mType == HostMemory); + return mEngineBuffer; + } + +private: + Type mType; + std::filesystem::path mEnginePath; + + struct + { + void const* mEngineAddr{}; + std::size_t mEngineSize{}; + }; + + nvinfer1::IHostMemory const* mEngineBuffer{}; +}; + +} // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/request.h b/cpp/include/tensorrt_llm/runtime/request.h new file mode 100644 index 000000000..374b879e2 --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/request.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/runtime/cudaEvent.h" +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/iStatefulGptDecoder.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/utils/sessionUtils.h" +#include + +namespace tensorrt_llm::runtime::decoder_batch +{ + +class Request +{ +public: + using ConstTensorPtr = ITensor::SharedConstPtr; + using TensorPtr = ITensor::SharedPtr; + using BufferPtr = IBuffer::SharedPtr; + + explicit Request(ConstTensorPtr ids, SizeType32 inputLen, std::optional maxNewTokens = std::nullopt, + std::optional endId = std::nullopt) + : ids{std::move(ids)} + , inputLen(inputLen) + , maxNewTokens{maxNewTokens} + , endId{endId} + , generatedTokensPerEngineStep(1) + { + } + + // mandatory parameters + ConstTensorPtr ids; // [inputSeqLen], the input sequence of token ids, on gpu + SizeType32 inputLen; // the input length without draft tokens + + // optional parameters + std::optional maxNewTokens; // maximum number of tokens to generate for this request + std::optional endId; // end token id + BufferPtr draftTokens; // [generatedTokensPerStep - 1], on gpu, draft tokens from speculative decoding + std::optional + draftLogits; // [generatedTokensPerStep - 1, vocabSize], on gpu, draft tokens from speculative decoding + TensorPtr embeddingBias; // [vocabSizePadded], on gpu + TensorPtr badWordsList; // [2, badWordsLength], on gpu + TensorPtr stopWordsList; // [2, stopWordsLength], on gpu + + SizeType32 generatedTokensPerEngineStep; + TensorPtr medusaPaths; // [maxDraftTokens + 1, maxAcceptedDraftTokensPerStep + 1], on gpu + TensorPtr medusaTreeIds; // [maxDraftTokens + 1], on gpu + std::optional lookaheadRuntimeConfig; +}; + +} // namespace tensorrt_llm::runtime::decoder_batch diff --git a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h index 612f162c2..e3103ea91 100644 --- a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h +++ b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h @@ -75,6 +75,11 @@ class SpeculativeDecodingMode return anyBitSet(kExplicitDraftTokens); } + [[nodiscard]] bool constexpr updatesPositionIds() const + { + return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens); + } + [[nodiscard]] bool constexpr requiresAttentionMask() const { return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens); @@ -101,6 +106,12 @@ class SpeculativeDecodingMode return anyBitSet(kMedusa); } + [[nodiscard]] bool constexpr needsDecoderPrologue() const + { + // Potentially lookahead should require it too. 
+ return anyBitSet(kExplicitDraftTokens); + } + using UnderlyingType = std::uint8_t; bool operator==(SpeculativeDecodingMode const& other) const diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index b64b933a6..4c834e01a 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3769cb4ad108cb9898a03b25e91781bcb5576b85397fbd7f673843abba27272e -size 3977112 +oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1 +size 3931604 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index b64b933a6..4c834e01a 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3769cb4ad108cb9898a03b25e91781bcb5576b85397fbd7f673843abba27272e -size 3977112 +oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1 +size 3931604 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 97739f174..a23bfa496 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -359da6357b9948425f249c226166408a libtensorrt_llm_batch_manager_static.a -359da6357b9948425f249c226166408a libtensorrt_llm_batch_manager_static.pre_cxx11.a -8d4b145290d5984494a1fa6e380d01456534dc62 commit \ No newline at end of file +93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a +93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a +c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index b69c7cd69..2438054cd 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3841fcf17899aa8cb75a01a5d0ee8c99e4e078399e4bb8a1201f9d53445d09cf -size 3869232 +oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4 +size 3812158 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index c78e733e0..a88fe4a7f 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99d1e58c95ea4267129b7a3ac95b65dc72b5e006b3168d07b213a1f9712930de -size 3835982 +oid 
sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439 +size 3773896 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index 51fd59f45..2710c6005 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e61e4199962b639502aba50adca548e79d6332e658c10ab717b2ec019d28ed45 -size 22213850 +oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03 +size 22381766 diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h index d76f95f56..b55dae2d4 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h @@ -162,7 +162,7 @@ struct CutlassGemmConfig { } - std::string toString() + std::string toString() const { std::stringstream tactic; tactic << "Cutlass GEMM Tactic"; diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index b0c960f78..69936aaee 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a00e1d3526af9fe7877c5e3362b32244309ccfac8fd720d1020c966d13b71c9 -size 1372862 +oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe +size 1396404 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index b0c960f78..69936aaee 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a00e1d3526af9fe7877c5e3362b32244309ccfac8fd720d1020c966d13b71c9 -size 1372862 +oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe +size 1396404 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index f4ea4f186..698fe4058 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -a35a65a41062edf23a898ad42cdce31c libtensorrt_llm_executor_static.a -a35a65a41062edf23a898ad42cdce31c libtensorrt_llm_executor_static.pre_cxx11.a -8d4b145290d5984494a1fa6e380d01456534dc62 commit \ No newline at end of file +7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a +7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a +c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index e48bba593..eee7ed862 100644 --- 
a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e37e2b2f28ac1ae37c22fac7c93394c6fba6e94e27403c0904e47eeb6cd4bf5c -size 1412454 +oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218 +size 1448772 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 7cdb45cd6..b6e842a0f 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b801135ba31f7ea63de5deb1880a45b68b2bc9fa45403e7204f6b7a153bd3ee -size 1346882 +oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50 +size 1375430 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index 2102f5d33..f20e66567 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9334de5c0a470731f8dd63f68e60ef320268d838547e5e6cbf537bf5c231eb6f -size 12962386 +oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532 +size 14031364 diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu index 049b1994d..cfa8fd9ce 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu @@ -97,6 +97,7 @@ struct PackedOn16Bytes<__nv_bfloat16> { using Type = PackedBFloat16; }; + #endif // add two 128b data @@ -600,7 +601,7 @@ static __global__ void oneShotAllReduceKernel(AllReduceParams params) template -static __global__ void twoShotAllReduceKernel(AllReduceParams params) +static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduceParams params) { // Suppose that two GPUs participate in the AR exchange, and we start two blocks. 
// The message is partitioned into chunks as detailed below: @@ -674,7 +675,7 @@ static __global__ void twoShotAllReduceKernel(AllReduceParams params) #pragma unroll for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { - size_t offset_rank = ii * params.elts_per_rank + local_offset; + size_t offset_rank = ranks[ii] * params.elts_per_rank + local_offset; if (offset_rank >= params.elts_total) { continue; @@ -829,7 +830,6 @@ std::tuple kernelLaunchConfig(AllReduceStrategyType algo, AllReducePar threads_per_block = std::min(DEFAULT_BLOCK_SIZE, total_threads); blocks_per_grid = std::min(static_cast(MAX_ALL_REDUCE_BLOCKS), divUp(total_threads, threads_per_block)); */ - while (total_threads % blocks_per_grid != 0 || total_threads / blocks_per_grid > DEFAULT_BLOCK_SIZE) { blocks_per_grid += 1; @@ -863,7 +863,8 @@ template (fusionOp)); if (algo == AllReduceStrategyType::ONESHOT) { reduce_fusion::one_shot_all_reduce_norm_kernel_launcher(params, stream); @@ -1019,7 +1020,6 @@ AllReduceParams AllReduceParams::deserialize(int32_t const* buffer, size_t tpSiz } params.barrier_flag = flag_value; params.ranks_per_node = tpSize; - params.rank = tpRank; params.local_rank = tpRank; return params; diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h index 432226d34..fcc9e287c 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h @@ -30,7 +30,7 @@ namespace tensorrt_llm::kernels constexpr size_t WARP_SIZE = 32; constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24; constexpr size_t MAX_RANKS_PER_NODE = 8; -constexpr size_t DEFAULT_BLOCK_SIZE = 1024; +constexpr size_t DEFAULT_BLOCK_SIZE = 512; // Warning: python definition is in tensorrt_llm/functional.py // they must be kept in sync @@ -82,7 +82,7 @@ struct AllReduceParams size_t elts_per_rank; size_t elts_per_block; size_t rank_offset; - size_t ranks_per_node, rank, local_rank; + size_t ranks_per_node, local_rank; uint32_t barrier_flag; uint32_t* peer_barrier_ptrs_in[MAX_RANKS_PER_NODE]; uint32_t* peer_barrier_ptrs_out[MAX_RANKS_PER_NODE]; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 6fe7b7fae..8ffdfdad1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -173,6 +173,10 @@ std::vector get_candidate_tiles( std::vector get_candidate_tiles_sm90( int const sm, CutlassGemmConfig::CandidateConfigTypeParam const config) { +#ifdef FAST_BUILD + // Fast build disables all configs except this one for SM90 + return {CutlassTileConfigSM90::CtaShape128x128x128B}; +#else if (config & CutlassGemmConfig::GROUPED_GEMM) { return {CutlassTileConfigSM90::CtaShape128x16x128B, CutlassTileConfigSM90::CtaShape128x32x128B, @@ -187,26 +191,35 @@ std::vector get_candidate_tiles_sm90( CutlassTileConfigSM90::CtaShape128x32x128B, CutlassTileConfigSM90::CtaShape128x64x128B, CutlassTileConfigSM90::CtaShape128x128x128B, CutlassTileConfigSM90::CtaShape128x256x128B}; } +#endif } // We only compile CUTLASS kernels with multi-cast along M if the M tile is >= 128. This is purely to improve // compilation speed. 
bool supports_mcast_along_m(const CutlassTileConfigSM90 tile) { +#ifdef FAST_BUILD + return false; +#else std::set valid_tiles{CutlassTileConfigSM90::CtaShape128x16x128B, CutlassTileConfigSM90::CtaShape128x32x128B, CutlassTileConfigSM90::CtaShape128x64x128B, CutlassTileConfigSM90::CtaShape128x128x128B, CutlassTileConfigSM90::CtaShape128x256x128B}; return valid_tiles.count(tile) == 1; +#endif } // We only compile CUTLASS kernels with multi-cast along N if the N tile is >= 128. This is purely to improve // compilation speed. bool supports_mcast_along_n(const CutlassTileConfigSM90 tile) { +#ifdef FAST_BUILD + return false; +#else std::set valid_tiles{CutlassTileConfigSM90::CtaShape64x128x128B, CutlassTileConfigSM90::CtaShape64x256x128B, CutlassTileConfigSM90::CtaShape128x128x128B, CutlassTileConfigSM90::CtaShape128x256x128B}; return valid_tiles.count(tile) == 1; +#endif } std::vector get_candidate_configs( diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h index b7d2a230b..f7b20ea41 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h @@ -16,6 +16,7 @@ #pragma once +#include "cute/tensor.hpp" #include "cutlass_extensions/gemm_configs.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -26,13 +27,31 @@ namespace kernels namespace cutlass_kernels { +template +struct should_filter_sm90_gemm_problem_shape +{ +#ifdef FAST_BUILD + constexpr static int TILE_K = 128 * 8 / cutlass::sizeof_bits::value; + using SupportedCtaShape = cute::Shape>; + using SupportedCgaShape = cute::Shape; + + constexpr static bool value + = !cute::is_same_v || !cute::is_same_v; +#else + constexpr static bool value = false; +#endif +}; +template +constexpr static bool should_filter_sm90_gemm_problem_shape_v + = should_filter_sm90_gemm_problem_shape::value; + std::vector get_candidate_configs( int sm, int const max_split_k, tensorrt_llm::cutlass_extensions::CutlassGemmConfig::CandidateConfigTypeParam const); tensorrt_llm::cutlass_extensions::CutlassGemmConfig estimate_best_config_from_occupancies( std::vector const& candidate_configs, - std::vector const& occupancies, const int64_t m, const int64_t n, const int64_t k, const int64_t num_experts, - int const split_k_limit, const size_t workspace_bytes, int const multi_processor_count, int const is_weight_only); + std::vector const& occupancies, int64_t const m, int64_t const n, int64_t const k, int64_t const num_experts, + int const split_k_limit, size_t const workspace_bytes, int const multi_processor_count, int const is_weight_only); } // namespace cutlass_kernels } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl index 7447ced6f..5d8210332 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl @@ -41,6 +41,7 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h" @@ -69,15 +70,8 @@ void 
sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType #ifdef COMPILE_HOPPER_TMA_GEMMS using CutlassActivationType = typename TllmToCutlassTypeAdapter::type; -// For FAST_BUILD, only instantiate kernels with 128x128x128B with 1x1x1 cluster shape. -#ifdef FAST_BUILD - constexpr int TILE_K = 128 * 8 / cutlass::sizeof_bits::value; - using SupportedCtaShape = Shape<_128, _128, cute::Int>; - using SupportedCgaShape = Shape<_1, _1, _1>; - - if constexpr (cute::is_same_v && cute::is_same_v) + if constexpr (!should_filter_sm90_gemm_problem_shape_v) { -#endif // FAST_BUILD using CutlassWeightType__ = typename TllmToCutlassTypeAdapter::type; // We need to remap this since SM90 uses a different layout for the weight matrix. using CutlassWeightType_ = std::conditional_t, @@ -278,13 +272,17 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType = "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status)); throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); } -#ifdef FAST_BUILD } else { - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] Config not compiled with FAST_BUILD."); + std::stringstream ss; + ss << "[TensorRT-LLm Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << "," + << (int64_t) cute::size<1>(CTAShape{}) << "," << (int64_t) cute::size<2>(CTAShape{}) << ") (" + << (int64_t) cute::size<0>(ClusterShape{}) << "," << (int64_t) cute::size<1>(ClusterShape{}) << "," + << (int64_t) cute::size<2>(ClusterShape{}) << ") not compiled with FAST_BUILD."; + + throw std::runtime_error(ss.str()); } -#endif // FAST_BUILD #else // COMPILE_HOPPER_TMA_GEMMS throw std::runtime_error( diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl index 4e5baf306..18480682a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl @@ -197,14 +197,7 @@ void sm90_generic_moe_gemm_kernelLauncher(HopperGroupedGemmInput hopper_input, i { #ifdef COMPILE_HOPPER_TMA_GEMMS using namespace cute; - // For FAST_BUILD, only instantiate kernels with 128x128x128B with 1x1x1 cluster shape. -#ifdef FAST_BUILD - constexpr int TILE_K = 128 * 8 / cutlass::sizeof_bits::value; - using SupportedCtaShape = Shape<_128, _128, cute::Int>; - using SupportedCgaShape = Shape<_1, _1, _1>; - - if constexpr (cute::is_same_v && cute::is_same_v) -#endif // FAST_BUILD + if constexpr (!should_filter_sm90_gemm_problem_shape_v) { using GemmInfo = HopperGroupedGemmInfo; @@ -287,12 +280,10 @@ void sm90_generic_moe_gemm_kernelLauncher(HopperGroupedGemmInput hopper_input, i "Failed to run cutlass variable batched gemm. 
Error: " + std::string(cutlassGetStatusString(run_status))); sync_check_cuda_error(); } -#ifdef FAST_BUILD else { TLLM_THROW("Configuration was disabled by FAST_BUILD"); } -#endif #else // COMPILE_HOPPER_TMA_GEMMS TLLM_THROW("Please recompile with support for hopper by passing 90-real as an arch to build_wheel.py."); diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels.h index ad07b9f88..aa0b57c5d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels.h @@ -30,7 +30,6 @@ namespace tensorrt_llm struct HopperGroupedGemmInput { - template using TransposeLayoutTag = std::conditional_t, cutlass::layout::ColumnMajor, cutlass::layout::RowMajor>; @@ -180,7 +179,7 @@ class MoeGemmRunner bool supportsHopperSpecialisation() const; [[nodiscard]] bool isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const; - size_t calcMaxWorkspaceSize(int num_experts) const; + size_t getMaxWorkspaceSize(int num_experts) const; [[nodiscard]] int getSM() const; @@ -197,9 +196,12 @@ class MoeGemmRunner int64_t gemm_k, int num_experts, bool use_fused_moe, cudaStream_t stream); private: - int sm_; - int multi_processor_count_; + int sm_{}; + int multi_processor_count_{}; + mutable int num_experts_ = 0; + mutable size_t gemm_workspace_size_ = 0; std::optional best_config_{}; + size_t calcMaxWorkspaceSize(int num_experts) const; }; } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h index 3b1302358..32fefcd22 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h @@ -48,6 +48,8 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" + #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" @@ -533,6 +535,18 @@ void MoeGemmRunner::dispatchToArch(T const* A, Weigh } } +template +size_t MoeGemmRunner::getMaxWorkspaceSize(int num_experts) const +{ + if (num_experts != num_experts_) + { + TLLM_LOG_TRACE("Calling getMaxWorkspaceSize() with a new expert count %d vs %d", num_experts, num_experts_); + num_experts_ = num_experts; + gemm_workspace_size_ = calcMaxWorkspaceSize(num_experts); + } + return gemm_workspace_size_; +} + template size_t MoeGemmRunner::calcMaxWorkspaceSize(int num_experts) const { diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp new file mode 100644 index 000000000..77a53bebd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Common utils to be shared between Precompiled and JIT implementation. + */ +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParams) +{ + unsigned int head_size = xqaParams.head_size; + unsigned int num_q_heads = xqaParams.num_q_heads; + unsigned int num_kv_heads = xqaParams.num_kv_heads; + TLLM_CHECK_WITH_INFO(num_q_heads % num_kv_heads == 0, "numQHeads should be multiple of numKVHeads."); + unsigned int num_q_heads_over_kv = num_q_heads / num_kv_heads; + unsigned int beam_width = xqaParams.beam_width; + + // Use mTileSize = 16 kernels when qSeqLen <= 16. + unsigned int qSeqLen = static_cast(xqaParams.generation_input_length); + unsigned int mTileSize = qSeqLen <= 16 ? 16 : 32; + // MultiQueryToken kernels can support any num_q_heads_over_kv that is power of 2. + unsigned int kernel_num_q_heads_over_kv = xqaParams.multi_query_tokens ? 0 : num_q_heads_over_kv; + // MultiQueryToken kernels can handle either 16/32 for M direction per CTA. + unsigned int kernel_m_tilesize = xqaParams.multi_query_tokens ? mTileSize : num_q_heads_over_kv; + + return {xqaParams.kv_cache_data_type, head_size, beam_width, kernel_num_q_heads_over_kv, kernel_m_tilesize, + xqaParams.paged_kv_cache ? 
static_cast(xqaParams.tokens_per_block) : 0, xqaParams.paged_kv_cache, + xqaParams.multi_query_tokens}; +} + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h index 6aaf43760..a6fc3209b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h @@ -17,6 +17,7 @@ */ #pragma once #include "decoderXQAConstants.h" +#include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/workspace.h" @@ -73,6 +74,8 @@ struct XQAKernelRuntimeHashKey } }; +XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParams); + struct XQAKernelRuntimeHasher { size_t operator()(XQAKernelRuntimeHashKey const& s) const diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp index 4f748dc39..e974b75ee 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp @@ -29,7 +29,7 @@ namespace jit { CubinObj::CubinObj(void const* buffer_, size_t buffer_size) - : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) + : mInitialized(false) { uint8_t const* buffer = static_cast(buffer_); size_t remaining_buffer_size = buffer_size; @@ -37,8 +37,37 @@ CubinObj::CubinObj(void const* buffer_, size_t buffer_size) mContent.resize(len); TLLM_CHECK(len <= remaining_buffer_size); memcpy(mContent.data(), buffer, len); +} + +CubinObj::CubinObj(std::string const& content) + : mContent(content) + , mInitialized(false) +{ +} + +CubinObj::CubinObj(CubinObj const& other) +{ + // Only uninitialized CubinObj can be copy-constructed. + TLLM_CHECK(!other.mInitialized); - initialize(mContent.c_str(), "kernel_mha"); + this->mContent = other.mContent; + this->mInitialized = false; +} + +CubinObj& CubinObj::operator=(CubinObj const& other) +{ + if (this == &other) + { + return *this; + } + + // Only uninitialized CubinObj can be copy-assigned. 
+ TLLM_CHECK(!other.mInitialized); + + this->mContent = other.mContent; + this->mInitialized = false; + + return *this; } size_t CubinObj::getSerializationSize() const noexcept @@ -59,47 +88,45 @@ void CubinObj::serialize(void* buffer_, size_t buffer_size) const noexcept memcpy(buffer, mContent.c_str(), len); } -CubinObj::CubinObj(std::string const& content) - : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) - , mContent(content) - , mModule(nullptr) - , mFunction(nullptr) - , mSharedMemBytes(0) -{ - initialize(mContent.c_str(), "kernel_mha"); -} - void CubinObj::launch(dim3 gridDim, dim3 blockDim, CUstream hStream, void** kernelParams) { + TLLM_CHECK(mInitialized); cuErrCheck(mDriver->cuLaunchKernel(mFunction, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, mSharedMemBytes, hStream, kernelParams, /*extra=*/nullptr), mDriver); } -void CubinObj::initialize(char const* content, char const* funcName) +void CubinObj::initialize() { - cuErrCheck(mDriver->cuModuleLoadData(&mModule, content), mDriver); - TLLM_CHECK(mModule != nullptr); - cuErrCheck(mDriver->cuModuleGetFunction(&mFunction, mModule, funcName), mDriver); - TLLM_CHECK(mFunction != nullptr); + if (!mInitialized) + { + mDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance(); + mModule = nullptr; + cuErrCheck(mDriver->cuModuleLoadData(&mModule, mContent.c_str()), mDriver); + TLLM_CHECK(mModule != nullptr); + mFunction = nullptr; + cuErrCheck(mDriver->cuModuleGetFunction(&mFunction, mModule, kFuncName), mDriver); + TLLM_CHECK(mFunction != nullptr); - // Populate mSharedMemBytes. - CUdeviceptr shmem_dev_ptr = 0; - cuErrCheck(mDriver->cuModuleGetGlobal(&shmem_dev_ptr, nullptr, mModule, "smemSize"), mDriver); - TLLM_CHECK(shmem_dev_ptr != 0); - cuErrCheck(mDriver->cuMemcpyDtoH(&mSharedMemBytes, shmem_dev_ptr, sizeof(unsigned int)), mDriver); + // Populate mSharedMemBytes. + CUdeviceptr shmem_dev_ptr = 0; + cuErrCheck(mDriver->cuModuleGetGlobal(&shmem_dev_ptr, nullptr, mModule, kSmemName), mDriver); + TLLM_CHECK(shmem_dev_ptr != 0); + cuErrCheck(mDriver->cuMemcpyDtoH(&mSharedMemBytes, shmem_dev_ptr, sizeof(unsigned int)), mDriver); - TLLM_CHECK(mSharedMemBytes > 0); + TLLM_CHECK(mSharedMemBytes > 0); - /* Set 46KB threshold here because we have to take static/driver shared memory into consideration. */ - if (mSharedMemBytes >= 46 * 1024) - { - cuErrCheck( - mDriver->cuFuncSetAttribute(mFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, mSharedMemBytes), - mDriver); - } + /* Set 46KB threshold here because we have to take static/driver shared memory into consideration. */ + if (mSharedMemBytes >= 46 * 1024) + { + cuErrCheck(mDriver->cuFuncSetAttribute( + mFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, mSharedMemBytes), + mDriver); + } - sync_check_cuda_error(); + sync_check_cuda_error(); + mInitialized = true; + } } } // namespace jit diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h index 2706bf1cc..e5742112a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h @@ -31,20 +31,35 @@ class CubinObj public: // Default constructor constructs an empty unusable CubinObj instance. CubinObj() = default; - CubinObj(std::string const& content); + // Constructs from raw cubin content. 
+ explicit CubinObj(std::string const& content); + // Deserializes from a serialization buffer. CubinObj(void const* buffer, size_t buffer_size); + + CubinObj(CubinObj const& other); + CubinObj& operator=(CubinObj const& other); + + // CubinObj can be move-constructed/assigned. + CubinObj(CubinObj&& other) = default; + CubinObj& operator=(CubinObj&& other) = default; + + // Should be called at least once before calling launch(). + void initialize(); void launch(dim3 gridDim, dim3 blockDim, CUstream hStream, void** kernelParams); + // It is safe to call getSerializeSize()/serialize() before calling initialize(). size_t getSerializationSize() const noexcept; void serialize(void* buffer, size_t buffer_size) const noexcept; private: - void initialize(char const* content, char const* funcName); - - std::shared_ptr mDriver; - + static constexpr char const* kFuncName = "kernel_mha"; + static constexpr char const* kSmemName = "smemSize"; + // Constructors should populate mContent. std::string mContent; + // Fields below are undefined prior to initialize() call. + bool mInitialized; + std::shared_ptr mDriver; CUmodule mModule; CUfunction mFunction; unsigned int mSharedMemBytes; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h index 28bdc969a..843ed381a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h @@ -29,7 +29,7 @@ namespace kernels namespace jit { -// A collection of CubinObjs, with caching functionality. +// A thread-safe collection of CubinObjs, with caching functionality. template > class CubinObjRegistryTemplate { @@ -64,6 +64,7 @@ class CubinObjRegistryTemplate std::unique_ptr> clone() const noexcept { + std::lock_guard lock(mMutex); auto result = std::make_unique>(); for (auto const& p : mMap) { @@ -74,6 +75,7 @@ class CubinObjRegistryTemplate size_t getSerializationSize() const noexcept { + std::lock_guard lock(mMutex); size_t result = sizeof(uint32_t); for (auto&& p : mMap) { @@ -85,6 +87,7 @@ class CubinObjRegistryTemplate void serialize(void* buffer_, size_t buffer_size) const noexcept { + std::lock_guard lock(mMutex); size_t remaining_buffer_size = buffer_size; uint8_t* buffer = static_cast(buffer_); uint32_t n = mMap.size(); @@ -108,31 +111,61 @@ class CubinObjRegistryTemplate TLLM_CHECK(remaining_buffer_size == 0); } - // Returns directly if the Cubin already exists in the registry, otherwise call compileEngine to compile it. - // - // compileEngine may be nullptr. - CubinObj* getCubin(Key const& key, CompileEngine* compileEngine) + // Compiles and inserts the cubin if not found in mMap. Does nothing otherwise. 
+ void insertCubinIfNotExists(Key const& key, CompileEngine* compileEngine) { + TLLM_CHECK(compileEngine != nullptr); + + std::lock_guard lock(mMutex); + auto iter = mMap.find(key); if (iter != mMap.end()) { - return &(iter->second); + return; } - TLLM_CHECK_WITH_INFO(compileEngine != nullptr, "Key not found; compileEngine shouldn't be nullptr."); - CubinObj obj = compileEngine->compile(); - auto insertResultIter = mMap.insert({key, std::move(obj)}).first; - return &(insertResultIter->second); + mMap.insert({key, std::move(obj)}); + return; + } + + void insertCubin(Key const& key, CubinObj&& obj) + { + std::lock_guard lock(mMutex); + mMap.insert({key, std::forward(obj)}); + } + + CubinObj* getCubin(Key const& key) + { + std::lock_guard lock(mMutex); + auto iter = mMap.find(key); + if (iter != mMap.end()) + { + return &iter->second; + } + else + { + return nullptr; + } + } + + void merge(CubinObjRegistryTemplate const& other) + { + for (auto&& p : other.mMap) + { + mMap.insert(p); + } } void clear() { + std::lock_guard lock(mMutex); mMap.clear(); } private: std::unordered_map mMap; + mutable std::mutex mMutex; }; using CubinObjKey = XQAKernelFullHashKey; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index 7a375a929..fc7f1897d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -36,24 +36,6 @@ XQAKernelRuntimeHashKey getRuntimeHashKeyFromKernelMeta(XQAKernelMetaInfo const& kernelMeta.mMTileSize, kernelMeta.mTokensPerPage, kernelMeta.mPagedKVCache, kernelMeta.mMultiQueryTokens}; } -XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParams) -{ - unsigned int head_size = xqaParams.head_size; - int num_q_heads = xqaParams.num_q_heads; - int num_kv_heads = xqaParams.num_kv_heads; - TLLM_CHECK_WITH_INFO(num_q_heads % num_kv_heads == 0, "numQHeads should be multiple of numKVHeads."); - unsigned int num_q_heads_over_kv = num_q_heads / num_kv_heads; - unsigned int beam_width = xqaParams.beam_width; - // MultiQueryToken kernels can support any num_q_heads_over_kv that is power of 2. - unsigned int kernel_num_q_heads_over_kv = xqaParams.multi_query_tokens ? 0 : num_q_heads_over_kv; - // MultiQueryToken kernels can handle either 16/32 for M direction per CTA. - unsigned int m_tilesize = xqaParams.multi_query_tokens ? 16 : num_q_heads_over_kv; - - return {xqaParams.kv_cache_data_type, head_size, beam_width, kernel_num_q_heads_over_kv, m_tilesize, - xqaParams.paged_kv_cache ? static_cast(xqaParams.tokens_per_block) : 0, xqaParams.paged_kv_cache, - xqaParams.multi_query_tokens}; -} - } // anonymous namespace namespace tensorrt_llm @@ -66,7 +48,6 @@ DecoderXQAImplJIT::DecoderXQAImplJIT(DecoderXQARunner* runner) , mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) , mForceXQA(tensorrt_llm::common::forceXQAKernels()) , mSM(tensorrt_llm::common::getSMVersion()) - , mCubinObjRegistry(runner->mResource->getCubinObjRegistry()) { initSupportedConfigs(); } @@ -140,8 +121,24 @@ void DecoderXQAImplJIT::prepare(XQAParams const& xqaParams) jit::CompileEngine compileEngine(mSM, xqaParams); - // Discard getCubin() result. 
- mCubinObjRegistry->getCubin(key, &compileEngine); + auto registryGlobal = DecoderXQARunner::getResourceGlobal()->getCubinObjRegistry(); + jit::CubinObj* uninitializedCubin = registryGlobal->getCubin(key); + if (uninitializedCubin != nullptr) + { + // Inference time. Prepare for the inference. + if (mInitializedCubinObjRegistry.getCubin(key) == nullptr) + { + // Make a copy and initialize it. + jit::CubinObj initializedCubin = *uninitializedCubin; + initializedCubin.initialize(); + mInitializedCubinObjRegistry.insertCubin(key, std::move(initializedCubin)); + } + } + else + { + // Engine-build time. Compile the cubin and place it into CubinObjRegistry. + registryGlobal->insertCubinIfNotExists(key, &compileEngine); + } } void DecoderXQAImplJIT::runWithKVLinearBuffer( @@ -204,9 +201,13 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& BuildDecoderInfoParams decoder_params; memset(&decoder_params, 0, sizeof(decoder_params)); decoder_params.seqQOffsets = launchParams.cu_seq_lens; + decoder_params.seqQLengths = xqaParams.spec_decoding_generation_lengths; decoder_params.seqKVLengths = xqaParams.sequence_lengths; decoder_params.batchSize = int(batch_beam_size); decoder_params.maxQSeqLength = xqaParams.generation_input_length; + decoder_params.removePadding = xqaParams.multi_query_tokens; + TLLM_CHECK_WITH_INFO(!xqaParams.multi_query_tokens || xqaParams.spec_decoding_generation_lengths != nullptr, + "Spec_decoding_generation_lengths must be provided."); // Rotary embedding inv_freq buffer. decoder_params.rotaryEmbeddingScale = xqaParams.rotary_embedding_scale; decoder_params.rotaryEmbeddingBase = xqaParams.rotary_embedding_base; @@ -222,16 +223,18 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& // NOTE: MHA kernels should read kv cache that has already been appended with new tokens' kv cache. void* xqa_q_input_ptr = ioScratch; QKVPreprocessingParams preprocessingParms{static_cast(const_cast(xqaParams.qkv)), - nullptr, static_cast(xqa_q_input_ptr), kv_cache_buffer, static_cast(xqaParams.qkv_bias), nullptr, - xqaParams.sequence_lengths, nullptr, launchParams.rotary_inv_freq_buf, (float2 const*) nullptr, - xqaParams.kv_scale_orig_quant, xqaParams.spec_decoding_position_offsets, int(batch_beam_size), - xqaParams.generation_input_length, xqaParams.timestep, xqaParams.cyclic_attention_window_size, - xqaParams.sink_token_length, int(xqaParams.batch_size * beam_width * xqaParams.generation_input_length), - xqaParams.num_q_heads, xqaParams.num_kv_heads, xqaParams.num_q_heads / xqaParams.num_kv_heads, - xqaParams.head_size, xqaParams.rotary_embedding_dim, xqaParams.rotary_embedding_base, - xqaParams.rotary_embedding_scale_type, xqaParams.rotary_embedding_scale, - xqaParams.rotary_embedding_max_positions, xqaParams.position_embedding_type, xqaParams.position_shift_enabled, - cache_type, true, false, multiprocessor_count}; + nullptr, static_cast(xqa_q_input_ptr), kv_cache_buffer, static_cast(xqaParams.qkv_bias), + xqaParams.spec_decoding_generation_lengths, xqaParams.sequence_lengths, + xqaParams.multi_query_tokens ? 
launchParams.cu_seq_lens : nullptr, launchParams.rotary_inv_freq_buf, + (float2 const*) nullptr, xqaParams.kv_scale_orig_quant, xqaParams.spec_decoding_position_offsets, + int(batch_beam_size), xqaParams.generation_input_length, xqaParams.timestep, + xqaParams.cyclic_attention_window_size, xqaParams.sink_token_length, + int(xqaParams.batch_size * beam_width * xqaParams.generation_input_length), xqaParams.num_q_heads, + xqaParams.num_kv_heads, xqaParams.num_q_heads / xqaParams.num_kv_heads, xqaParams.head_size, + xqaParams.rotary_embedding_dim, xqaParams.rotary_embedding_base, xqaParams.rotary_embedding_scale_type, + xqaParams.rotary_embedding_scale, xqaParams.rotary_embedding_max_positions, xqaParams.position_embedding_type, + xqaParams.position_shift_enabled, cache_type, true, false, multiprocessor_count, xqaParams.rotary_vision_start, + xqaParams.rotary_vision_length}; invokeQKVPreprocessing(preprocessingParms, stream); sync_check_cuda_error(); @@ -245,7 +248,8 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& unsigned int kernel_m_tilesize = xqaParams.multi_query_tokens ? mTileSize : num_q_heads_over_kv; jit::CubinObjKey key = getCubinObjKeyFromXQAParams(xqaParams); - jit::CubinObj* cubinObj = mCubinObjRegistry->getCubin(key, /*compileEngine=*/nullptr); + jit::CubinObj* cubinObj = mInitializedCubinObjRegistry.getCubin(key); + TLLM_CHECK(cubinObj != nullptr); if (xqaParams.multi_query_tokens) { @@ -275,8 +279,8 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& } else { - // mha_sm90.cu kernels. Default to false because it is not available in JIT path for now. - bool const isGmmaKernel = false; + bool const isGmmaKernel = (mSM == kSM_90 && xqaParams.kv_cache_data_type == XQADataType::DATA_TYPE_E4M3 + && xqaParams.beam_width == 1); constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 11; uint32_t const maxNbKernelParams = (isGmmaKernel ? 11 : 10); uint32_t idxNextParam = 0; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h index a7f1d9fdd..ff10a44c6 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h @@ -60,7 +60,8 @@ class DecoderXQAImplJIT : public DecoderXQAImpl bool mForceXQA; int mSM; - jit::CubinObjRegistry* mCubinObjRegistry; + jit::CubinObjRegistry mInitializedCubinObjRegistry; + jit::CubinObjKey getCubinObjKeyFromXQAParams(XQAParams const& xqaParams) const; //! The first prototype just takes whatever available from the Precompiled cubins. 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 98774b85f..5c8c2ebf1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f51307e90efbdd3dadc404efafb3b8a96ddbdb89a9068eba0b9676656be7d46d -size 80202640 +oid sha256:8de0cd3bd46925e008f263b3f6c78c17f198578f74e23bc90661bec5a9acfbb1 +size 80250768 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index 41596da36..236936439 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -b3823dd8e1d7f154019fb7dc24172ff4 libtensorrt_llm_nvrtc_wrapper.so -8d4b145290d5984494a1fa6e380d01456534dc62 commit \ No newline at end of file +5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so +c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index ae86a44eb..967003354 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08384c1d7a80a86d888f6f23a5687ccb102b1a510b66db8dbcc3169127e4e88a -size 83472488 +oid sha256:bbf358364915d5b023a6d0574cde0f602c104d24efe0bf5c04eeee4610a2413e +size 83541760 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll index ad8bee56e..964f3f9b0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22867facd7d8dfa699618884d2e6912b1a2a7afedc299aa91e14b36353d6b8bd -size 1011200 +oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3 +size 1080832 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 42c6d5e6b..dfca83ddf 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -213,14 +213,7 @@ class XQAKernelList // Use mTileSize = 16 kernels when qSeqLen <= 16. unsigned int qSeqLen = static_cast(xqaParams.generation_input_length); unsigned int mTileSize = qSeqLen <= 16 ? 16 : 32; - // MultiQueryToken kernels can support any num_q_heads_over_kv that is power of 2. - unsigned int kernel_num_q_heads_over_kv = xqaParams.multi_query_tokens ? 0 : num_q_heads_over_kv; - // MultiQueryToken kernels can handle either 16/32 for M direction per CTA. - unsigned int kernel_m_tilesize = xqaParams.multi_query_tokens ? mTileSize : num_q_heads_over_kv; - XQAKernelRuntimeHashKey hash_key{xqaParams.kv_cache_data_type, head_size, beam_width, - kernel_num_q_heads_over_kv, kernel_m_tilesize, - xqaParams.paged_kv_cache ? static_cast(xqaParams.tokens_per_block) : 0, - xqaParams.paged_kv_cache, xqaParams.multi_query_tokens}; + XQAKernelRuntimeHashKey hash_key = getRuntimeHashKeyFromXQAParams(xqaParams); auto const findIter = mFunctions.find(hash_key); TLLM_CHECK_WITH_INFO(findIter != mFunctions.end(), "XQAKernelFunc not found."); @@ -310,28 +303,6 @@ class XQAKernelList } } -private: - static uint32_t getElemBytes(CUtensorMapDataType_enum dataType) - { - switch (dataType) - { - case CU_TENSOR_MAP_DATA_TYPE_UINT8: return 1; - case CU_TENSOR_MAP_DATA_TYPE_UINT16: return 2; - case CU_TENSOR_MAP_DATA_TYPE_UINT32: return 4; - case CU_TENSOR_MAP_DATA_TYPE_INT32: return 4; - case CU_TENSOR_MAP_DATA_TYPE_UINT64: return 8; - case CU_TENSOR_MAP_DATA_TYPE_INT64: return 8; - case CU_TENSOR_MAP_DATA_TYPE_FLOAT16: return 2; - case CU_TENSOR_MAP_DATA_TYPE_FLOAT32: return 4; - case CU_TENSOR_MAP_DATA_TYPE_FLOAT64: return 8; - case CU_TENSOR_MAP_DATA_TYPE_BFLOAT16: return 2; - case CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ: return 4; - case CU_TENSOR_MAP_DATA_TYPE_TFLOAT32: return 4; - case CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ: return 4; - } - throw std::runtime_error("unsupported data type"); - } - protected: std::shared_ptr mDriver; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp index 76ccd2e03..f5bf359f4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp @@ -36,10 +36,9 @@ namespace tensorrt_llm namespace kernels { -DecoderXQARunner::DecoderXQARunner(Resource* resource, const XQADataType data_type, int num_heads, int num_kv_heads, - int head_size, bool multi_block_mode) - : mResource(resource) - , mDataType(data_type) +DecoderXQARunner::DecoderXQARunner( + const XQADataType data_type, int num_heads, int num_kv_heads, int head_size, bool multi_block_mode) + : mDataType(data_type) , mNumHeads(num_heads) , mNumKVHeads(num_kv_heads) , mHeadSize(head_size) @@ -104,18 +103,12 @@ size_t DecoderXQARunner::getWorkspaceSize(int max_batch_beam_size, int max_num_t DecoderXQAImpl* DecoderXQARunner::getImplFromXQAParams(XQAParams const& xqaParams) { - if (tensorrt_llm::common::getSMVersion() == kSM_90) - { - // Always use Precompiled impl for sm90 until Hopper XQA source gets integrated to JIT codepath. 
- return mPrecompiledImpl.get(); - } if (xqaParams.multi_query_tokens) { // Use precompiled cubin for medusa, because medusa cubins are generated from a different CUDA source file than // non-medusa. return mPrecompiledImpl.get(); } - if (tensorrt_llm::common::getEnvEnableXQAJIT()) { return mJITImpl.get(); @@ -143,6 +136,12 @@ void DecoderXQARunner::run( return getImplFromXQAParams(xqa_params)->run(xqa_params, kv_cache_buffer, stream); } +DecoderXQARunner::Resource* DecoderXQARunner::getResourceGlobal() +{ + static DecoderXQARunner::Resource sResource; + return &sResource; +} + template void DecoderXQARunner::run( XQAParams const& xqa_params, KVLinearBuffer const& kv_linear_buffer, cudaStream_t const& stream); template void DecoderXQARunner::run( diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h index da79fda3b..dfe53d903 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h @@ -77,10 +77,8 @@ struct XQADispatchHelper<__nv_bfloat16, KVBlockArray> class DecoderXQARunner { public: - // Resources for constructing a DecoderXQARunner object. - class Resource; - DecoderXQARunner(Resource* resource, const XQADataType data_type, int num_heads, int num_kv_heads, int head_size, - bool multi_block_mode); + DecoderXQARunner( + const XQADataType data_type, int num_heads, int num_kv_heads, int head_size, bool multi_block_mode); ~DecoderXQARunner(); /** @@ -169,6 +167,9 @@ class DecoderXQARunner this->run(xqa_params, kv_cache_buffer, stream); } + class Resource; + static Resource* getResourceGlobal(); + private: bool shouldUseImpl(XQAParams const& xqa_params, bool for_configure_plugin); void prepareForRun(XQAParams const& xqa_params); @@ -178,8 +179,6 @@ class DecoderXQARunner static constexpr int kMaxBeamWidth = 4; - Resource* mResource; - XQADataType mDataType; int mNumHeads; int mNumKVHeads; @@ -206,11 +205,21 @@ class DecoderXQARunner::Resource Resource(void const* buffer, size_t buffer_size); ~Resource() = default; + void merge(Resource const& other) + { + getCubinObjRegistry()->merge(*other.getCubinObjRegistry()); + } + jit::CubinObjRegistry* getCubinObjRegistry() { return mCubinObjRegistry.get(); } + jit::CubinObjRegistry const* getCubinObjRegistry() const + { + return mCubinObjRegistry.get(); + } + size_t getSerializationSize() const noexcept; void serialize(void* buffer, size_t buffer_size) const noexcept; diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index 6fce38e48..e27534d62 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -339,7 +339,7 @@ __global__ void insertUnfinishedPathKernel(BeamHypotheses bh) // Other parameters bh.sequenceLengthsCBA[dstBeam] = bh.sequenceLengths[srcBeam]; bh.normedScoresCBA[dstBeam] - = applyLengthPenalty(bh.cumLogProbs[srcBeam], step - bh.inputLengths[srcBeam], bh.lengthPenalties[bid]); + = applyLengthPenalty(bh.cumLogProbs[srcBeam], step - bh.inputLengths[srcBeam] + 1, bh.lengthPenalties[bid]); bh.cumLogProbsCBA[dstBeam] = bh.cumLogProbs[srcBeam]; bh.numBeamsCBA[bid]++; } diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu index 4557e5e44..7b37b25ea 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu +++ 
b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu @@ -1070,7 +1070,7 @@ std::vector CutlassMoeFCRunner::getWo size_t const sorter_size = CubKeyValueSorter::getWorkspaceSize(num_rows, num_experts); size_t const fc2_result_size = permuted_elems * gemm_output_dtype; // May be an intermediate type for quantization size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0; - size_t const gemm_workspace_size = moe_gemm_runner_.calcMaxWorkspaceSize(num_experts_per_node); + size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node); std::vector workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size, total_rows_before_expert_size, softmax_out_size, glu_inter_size, @@ -1085,7 +1085,7 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(i ActivationType activation_type, MOEParallelismConfig parallelism_config) const { int const ep_size = parallelism_config.ep_size; - TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of tp size"); + TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size"); auto workspace = getWorkspaceBufferSizes( num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type); return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size()); diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h index 4e618c157..675398448 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h @@ -52,15 +52,6 @@ class CubKeyValueSorter int num_bits_; }; -enum class MOEParallelismMode : int -{ - NONE = 0, //!< Ignore parallelism and duplicate the work across all nodes - EXPERT_PARALLELISM, //!< Divide the experts between each node. The number of experts must be a multiple of - //!< parallelism - TENSOR_PARALLELISM, //!< Divide the weight matrices between the nodes. 
The hidden dimension must be a multiple of - //!< parallelism -}; - enum class MOEExpertScaleNormalizationMode : int { NONE = 0, //!< Run the softmax on all scales and select the topk @@ -91,20 +82,23 @@ enum class MOEExpertScaleNormalizationMode : int */ struct MOEParallelismConfig { - constexpr static MOEParallelismConfig TensorParallelism(int tp_size, int tp_rank) + int tp_size = 1; + int tp_rank = 0; + int ep_size = 1; + int ep_rank = 0; + + bool operator==(MOEParallelismConfig const& other) const { - return {tp_size, tp_rank, 1, 0}; + return tp_size == other.tp_size && tp_rank == other.tp_rank && ep_size == other.ep_size + && ep_rank == other.ep_rank; } - constexpr static MOEParallelismConfig ExpertParallelism(int ep_size, int ep_rank) + friend std::ostream& operator<<(std::ostream& os, MOEParallelismConfig const& config) { - return {1, 0, ep_size, ep_rank}; + os << "tp_size: " << config.tp_size << ", tp_rank: " << config.tp_rank << ", ep_size: " << config.ep_size + << ", ep_rank: " << config.ep_rank; + return os; } - - int const tp_size = 1; - int const tp_rank = 0; - int const ep_size = 1; - int const ep_rank = 0; }; struct QuantParams diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu index ddea2eca5..ea8a106e3 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu @@ -14,11 +14,7 @@ * limitations under the License. */ -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/cudaTypeUtils.cuh" -#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! 
@@ -33,35 +29,34 @@ using namespace tensorrt_llm::runtime; namespace tensorrt_llm::kernels::speculative_decoding { -size_t invokeScanSpecDecodingGenerationLengths(void* __restrict__ scanTempStorage, size_t scanTempStorageBytes, - SizeType32 const* __restrict__ specDecodingGenerationLengths, - SizeType32* __restrict__ maxSpecDecodingGenerationLengths, SizeType32 batchSize, cudaStream_t stream) +size_t invokeScanGenerationLengths(void* __restrict__ scanTempStorage, size_t scanTempStorageBytes, + SizeType32 const* __restrict__ generationLengths, SizeType32* __restrict__ scannedGenerationLengths, + SizeType32 batchSize, cudaStream_t stream) { - cub::DeviceScan::InclusiveSum(scanTempStorage, scanTempStorageBytes, specDecodingGenerationLengths, - maxSpecDecodingGenerationLengths, batchSize, stream); + cub::DeviceScan::InclusiveSum( + scanTempStorage, scanTempStorageBytes, generationLengths, scannedGenerationLengths, batchSize, stream); return scanTempStorageBytes; } -size_t invokeReduceMaxSpecDecodingGenerationLengths(void* __restrict__ reduceMaxTempStorage, - size_t reduceTempStorageBytes, SizeType32 const* __restrict__ specDecodingGenerationLengths, - SizeType32* __restrict__ scannedSpecDecodingGenerationLengths, SizeType32 batchSize, cudaStream_t stream) +size_t invokeReduceMaxGenerationLengths(void* __restrict__ reduceMaxTempStorage, size_t reduceTempStorageBytes, + SizeType32 const* __restrict__ generationLengths, SizeType32* __restrict__ maxGenerationLengths, + SizeType32 batchSize, cudaStream_t stream) { - cub::DeviceReduce::Max(reduceMaxTempStorage, reduceTempStorageBytes, specDecodingGenerationLengths, - scannedSpecDecodingGenerationLengths, batchSize, stream); + cub::DeviceReduce::Max( + reduceMaxTempStorage, reduceTempStorageBytes, generationLengths, maxGenerationLengths, batchSize, stream); return reduceTempStorageBytes; } -// inclusive prefix sum specDecodingGenerationLengths and reduce max specDecodingGenerationLengths -void invokeScanReduceSpecDecodingGenerationLengths(SizeType32 batchSize, - SizeType32 const* __restrict__ specDecodingGenerationLengths, void* __restrict__ scanTempStorage, - size_t scanTempStorageBytes, SizeType32* __restrict__ scanedSpecDecodingGenerationLengths, - void* __restrict__ reduceMaxTempStorage, size_t reduceMaxTempStorageBytes, - SizeType32* maxSpecDecodingGenerationLengths, cudaStream_t stream) +// inclusive prefix sum generationLengths and reduce max generationLengths +void invokeScanReduceGenerationLengths(SizeType32 batchSize, SizeType32 const* __restrict__ generationLengths, + void* __restrict__ scanTempStorage, size_t scanTempStorageBytes, SizeType32* __restrict__ scanedGenerationLengths, + void* __restrict__ reduceMaxTempStorage, size_t reduceMaxTempStorageBytes, SizeType32* maxGenerationLengths, + cudaStream_t stream) { - invokeScanSpecDecodingGenerationLengths(scanTempStorage, scanTempStorageBytes, specDecodingGenerationLengths, - scanedSpecDecodingGenerationLengths, batchSize, stream); - invokeReduceMaxSpecDecodingGenerationLengths(reduceMaxTempStorage, reduceMaxTempStorageBytes, - specDecodingGenerationLengths, maxSpecDecodingGenerationLengths, batchSize, stream); + invokeScanGenerationLengths( + scanTempStorage, scanTempStorageBytes, generationLengths, scanedGenerationLengths, batchSize, stream); + invokeReduceMaxGenerationLengths( + reduceMaxTempStorage, reduceMaxTempStorageBytes, generationLengths, maxGenerationLengths, batchSize, stream); } //////////////////////// @@ -100,27 +95,25 @@ __device__ SizeType32 positivePowerOfTwo(SizeType32 
n) return res; } -__global__ void getSpecDecodingPackedMask(SizeType32 const* __restrict__ specDecodingCumGenerationLengths, - SizeType32 const* __restrict__ specDecodingMaxGenerationLengths, bool const* __restrict__ specDecodingMask, - SizeType32 const* __restrict__ batchSlots, SizeType32 maxDraftTokens, - SizeType32* __restrict__ specDecodingPackedMask) +__global__ void getPackedMask(SizeType32 const* __restrict__ cumGenerationLengths, + SizeType32 const* __restrict__ maxGenerationLengths, bool const* __restrict__ mask, + SizeType32 const* __restrict__ batchSlots, SizeType32 maxDraftTokens, SizeType32* __restrict__ packedMask) { auto const batchIdx = static_cast(blockIdx.y); auto const tokenIdx = static_cast(blockIdx.x); - auto const numTokens = (batchIdx == 0) - ? specDecodingCumGenerationLengths[0] - : specDecodingCumGenerationLengths[batchIdx] - specDecodingCumGenerationLengths[batchIdx - 1]; + auto const numTokens = (batchIdx == 0) ? cumGenerationLengths[0] + : cumGenerationLengths[batchIdx] - cumGenerationLengths[batchIdx - 1]; if (tokenIdx >= numTokens) { return; } - auto const maxGenerationLength = specDecodingMaxGenerationLengths[0]; + auto const maxGenerationLength = maxGenerationLengths[0]; auto const numPackedMasks = divUp(maxDraftTokens, 32); auto const outputStartId = batchSlots ? (batchSlots[batchIdx] * (maxDraftTokens + 1)) - : ((batchIdx == 0) ? 0 : specDecodingCumGenerationLengths[batchIdx - 1]); - auto* outputPtr = specDecodingPackedMask + (outputStartId + tokenIdx) * numPackedMasks; + : ((batchIdx == 0) ? 0 : cumGenerationLengths[batchIdx - 1]); + auto* outputPtr = packedMask + (outputStartId + tokenIdx) * numPackedMasks; if (tokenIdx == 0) { for (auto maskId = static_cast(threadIdx.x); maskId < numPackedMasks; @@ -132,18 +125,18 @@ __global__ void getSpecDecodingPackedMask(SizeType32 const* __restrict__ specDec } else { - bool const* specDecodingMaskPtr = specDecodingMask + batchIdx * maxGenerationLength * maxGenerationLength - + tokenIdx * maxGenerationLength + 1; - extern __shared__ char shSpecDecodingMask[]; + bool const* maskPtr + = mask + batchIdx * maxGenerationLength * maxGenerationLength + tokenIdx * maxGenerationLength + 1; + extern __shared__ char shMask[]; if (threadIdx.x == 0) { - shSpecDecodingMask[maxGenerationLength - 1] = '1'; + shMask[maxGenerationLength - 1] = '1'; } for (auto ti = static_cast(threadIdx.x); ti < maxGenerationLength - 1; ti += static_cast(blockDim.x)) { auto const shIndex = maxGenerationLength - 1 - ti - 1; - shSpecDecodingMask[shIndex] = specDecodingMaskPtr[ti] ? '1' : '0'; + shMask[shIndex] = maskPtr[ti] ? '1' : '0'; } __syncthreads(); for (auto maskId = static_cast(threadIdx.x); maskId < numPackedMasks; @@ -156,19 +149,19 @@ __global__ void getSpecDecodingPackedMask(SizeType32 const* __restrict__ specDec } else { - auto const shSpecDecodingMaskIndexStart + auto const shMaskIndexStart = ((maxGenerationLength - (maskId + 1) * 32) < 0) ? 0 : (maxGenerationLength - (maskId + 1) * 32); - auto const shSpecDecodingMaskIndexEnd = maxGenerationLength - (maskId * 32 + 1) + 1; + auto const shMaskIndexEnd = maxGenerationLength - (maskId * 32 + 1) + 1; - auto const validNumBits = shSpecDecodingMaskIndexEnd - shSpecDecodingMaskIndexStart; - auto const firstBit1 = (shSpecDecodingMask[shSpecDecodingMaskIndexStart] == '1') ? true : false; + auto const validNumBits = shMaskIndexEnd - shMaskIndexStart; + auto const firstBit1 = (shMask[shMaskIndexStart] == '1') ? 
true : false; SizeType32 mask31bits = 0; if (validNumBits != 1) { - for (auto i = shSpecDecodingMaskIndexStart + 1; i < shSpecDecodingMaskIndexEnd; i++) + for (auto i = shMaskIndexStart + 1; i < shMaskIndexEnd; i++) { - auto const index = (validNumBits - 1) - (i - shSpecDecodingMaskIndexStart - 1) - 1; - mask31bits += (shSpecDecodingMask[i] == '1') ? positivePowerOfTwo(index) : 0; + auto const index = (validNumBits - 1) - (i - shMaskIndexStart - 1) - 1; + mask31bits += (shMask[i] == '1') ? positivePowerOfTwo(index) : 0; } } SizeType32 mask32bits; @@ -187,19 +180,47 @@ __global__ void getSpecDecodingPackedMask(SizeType32 const* __restrict__ specDec } } // namespace -void invokeConvertSpecDecodingMaskToPackedMask(SizeType32 batchSize, - SizeType32 const* __restrict__ specDecodingCumGenerationLengths, - SizeType32 const* __restrict__ specDecodingMaxGenerationLengths, bool const* __restrict__ specDecodingMask, +void invokeConvertMaskToPackedMask(SizeType32 batchSize, SizeType32 const* __restrict__ cumGenerationLengths, + SizeType32 const* __restrict__ maxGenerationLengths, bool const* __restrict__ mask, SizeType32 const* __restrict__ batchSlots, SizeType32 maxDraftTokens, SizeType32 maxGenerationLength, - SizeType32* __restrict__ specDecodingPackedMask, cudaStream_t stream) + SizeType32* __restrict__ packedMask, cudaStream_t stream) { dim3 block(32); dim3 grid(maxGenerationLength, batchSize); size_t shmSize = maxGenerationLength * sizeof(char); - getSpecDecodingPackedMask<<>>(specDecodingCumGenerationLengths, - specDecodingMaxGenerationLengths, specDecodingMask, batchSlots, maxDraftTokens, specDecodingPackedMask); + getPackedMask<<>>( + cumGenerationLengths, maxGenerationLengths, mask, batchSlots, maxDraftTokens, packedMask); } +namespace +{ +template +__global__ void fillContextBuffers(FillContextExplicitDraftTokensParams params) +{ + auto const bid = static_cast(blockIdx.x); + auto const batchSlot = params.batchSlots ? params.batchSlots[bid] : bid; + + if (threadIdx.x == 0) + { + // Generate new random data for sampling. + params.randDataSample[batchSlot] = static_cast(curand_uniform(params.curandState + batchSlot)); + + // Copy temperature. + params.outputTemperatures[batchSlot] = __frcp_rn(params.inputTemperatures[batchSlot]); + } +} +} // namespace + +template +void invokeFillContextBuffers(FillContextExplicitDraftTokensParams const& params, cudaStream_t stream) +{ + SizeType32 constexpr BLOCK_SIZE = 32; + fillContextBuffers<<>>(params); +} + +template void invokeFillContextBuffers(FillContextExplicitDraftTokensParams const& params, cudaStream_t stream); +template void invokeFillContextBuffers(FillContextExplicitDraftTokensParams const& params, cudaStream_t stream); + namespace { template @@ -216,6 +237,8 @@ __global__ void extractExplicitDraftTokens(ExtractExplicitDraftTokensParams p auto const bestPathIdx = params.bestPathIndices[bid]; // Get current seq len (w/o newly accepted tokens). auto const curSeqLen = params.sequenceLengths[batchSlot]; + // `last*` tensors do not have data for context requests. + auto const lastTensorBid = bid - params.numContextRequests; // Get output ids. auto* outputIdsRequest = params.outputIds + batchSlot * params.maxSeqLen; @@ -237,7 +260,8 @@ __global__ void extractExplicitDraftTokens(ExtractExplicitDraftTokensParams p { // Read 1:bestPathLength slice of last draft tokens at best path idx. // This tensor comes directly from engine and has linear batch index. 
- auto const pathOffset = flat_index3(bid, bestPathIdx, ti + 1, params.numPaths, params.maxPathLength); + auto const pathOffset + = flat_index3(lastTensorBid, bestPathIdx, ti + 1, params.numPaths, params.maxPathLength); // Read accepted token from last draft tokens. acceptedToken = params.lastDraftTokens[pathOffset]; } @@ -253,6 +277,11 @@ __global__ void extractExplicitDraftTokens(ExtractExplicitDraftTokensParams p = params.nextDraftTokens[bid * params.numPaths * params.maxPathLength + ti]; params.unpackedNextDraftIndices[batchSlot * params.numPaths * params.maxPathLength + ti] = params.inputUnpackedNextDraftIndices[bid * params.numPaths * params.maxPathLength + ti]; + if (lastTensorBid >= 0) + { + params.outputLastDraftIndices[batchSlot * params.numPaths * params.maxPathLength + ti] + = params.lastDraftIndices[lastTensorBid * params.numPaths * params.maxPathLength + ti]; + } } auto const numNextDraftTokens = (bid == 0) @@ -274,14 +303,13 @@ __global__ void extractExplicitDraftTokens(ExtractExplicitDraftTokensParams p for (auto ti = static_cast(threadIdx.x); ti < numNextDraftTokens; ti += static_cast(blockDim.x)) { - params.outputPositionIds[batchSlot * maxDecodingTokens + ti] = params.packedPositionIds[startId + ti]; + params.outputPositionIds[batchSlot * maxDecodingTokens + ti] = params.packedPositionIds[startId + ti] - 1; } for (auto ti = static_cast(threadIdx.x); ti < params.numPaths * (params.maxPathLength - 1); ti += static_cast(blockDim.x)) { // Generate new random data for token verification. - // This tensor goes directly to engine and has linear batch index. auto const offset = flat_index2(batchSlot, ti, params.numPaths * (params.maxPathLength - 1)); params.randDataVerification[offset] = static_cast(curand_uniform(params.curandState + batchSlot)); } @@ -291,23 +319,31 @@ __global__ void extractExplicitDraftTokens(ExtractExplicitDraftTokensParams p if (threadIdx.x == 0) { // Update pos id base. - // This tensor goes directly to engine and has linear batch index. params.outputPositionIdsBase[batchSlot] = params.inputPositionIdsBase[bid] + bestPathLength; // Set number of accepted tokens at this iteration. params.acceptedLengths[batchSlot] = bestPathLength; + // Set number of draft tokens for the next iteration. + params.prevDraftLengths[batchSlot] = params.nextDraftLengths[batchSlot]; + // Set number of draft tokens for the next iteration. params.nextDraftLengths[batchSlot] = numNextDraftTokens - 1; + // Set number of tokens passed to the engine per request for the next iteration. + params.outputGenerationLengths[batchSlot] = numNextDraftTokens; + // Generate new random data for sampling. - // This tensor goes directly to engine and has linear batch index. params.randDataSample[batchSlot] = static_cast(curand_uniform(params.curandState + batchSlot)); + // Increase seqLen by accepted len. params.sequenceLengths[batchSlot] = curSeqLen + bestPathLength; // Copy temperature. - params.outputTemperatures[batchSlot] = params.inputTemperatures[batchSlot]; + params.outputTemperatures[batchSlot] = __frcp_rn(params.inputTemperatures[batchSlot]); + + // Copy best path index. 
+ params.outputBestPathIndices[batchSlot] = bestPathIdx; } } } // namespace @@ -328,12 +364,13 @@ namespace { template __global__ void copyProbs(uint8_t const* srcData, uint8_t* dstData, SizeType32 const* inputBatchSlots, - SizeType32 const* outputBatchSlots, SizeType32 sizeInBytes) + SizeType32 const* outputBatchSlots, SizeType32 sizeInBytes, SizeType32 inputBatchIdxOffset) { auto constexpr VEC_ELTS = static_cast(sizeof(VecT)); - auto const bid = static_cast(blockIdx.y); - auto const intputBatchSlot = inputBatchSlots ? inputBatchSlots[bid] : bid; - auto const outputBatchSlot = outputBatchSlots ? outputBatchSlots[bid] : bid; + auto const inputBid = static_cast(blockIdx.y) + inputBatchIdxOffset; + auto const outputBid = static_cast(blockIdx.y); + auto const intputBatchSlot = inputBatchSlots ? inputBatchSlots[inputBid] : inputBid; + auto const outputBatchSlot = outputBatchSlots ? outputBatchSlots[outputBid] : outputBid; auto const srcStartIdx = intputBatchSlot * sizeInBytes; auto const dstStartIdx = outputBatchSlot * sizeInBytes; auto const tidx = (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VEC_ELTS; @@ -351,7 +388,8 @@ __global__ void copyProbs(uint8_t const* srcData, uint8_t* dstData, SizeType32 c } // namespace void invokeCopyProbs(uint8_t const* srcDataPtr, uint8_t* dstDataPtr, SizeType32 const* inputBatchSlots, - SizeType32 const* outputBatchSlots, SizeType32 batchSize, SizeType32 copyRowSizeInBytes, cudaStream_t stream) + SizeType32 const* outputBatchSlots, SizeType32 batchSize, SizeType32 inputBatchIdxOffset, + SizeType32 copyRowSizeInBytes, cudaStream_t stream) { auto copyProbsInvocation = copyProbs; if (copyRowSizeInBytes % 16 == 0) @@ -375,7 +413,7 @@ void invokeCopyProbs(uint8_t const* srcDataPtr, uint8_t* dstDataPtr, SizeType32 SizeType32 constexpr BLOCKS_PER_ROW{32}; dim3 const gridSize{BLOCKS_PER_ROW, static_cast(batchSize)}; copyProbsInvocation<<>>( - srcDataPtr, dstDataPtr, inputBatchSlots, outputBatchSlots, copyRowSizeInBytes); + srcDataPtr, dstDataPtr, inputBatchSlots, outputBatchSlots, copyRowSizeInBytes, inputBatchIdxOffset); } template @@ -386,12 +424,41 @@ void invokeCopyProbs(ExtractExplicitDraftTokensParams const& params, cudaStre auto const numCopyElems = params.numPaths * (params.maxPathLength - 1) * params.vocabSize; auto const copyRowSizeInBytes = numCopyElems * sizeof(T); - invokeCopyProbs(srcDataPtr, dstDataPtr, nullptr, params.batchSlots, params.batchSize, copyRowSizeInBytes, stream); + invokeCopyProbs( + srcDataPtr, dstDataPtr, nullptr, params.batchSlots, params.batchSize, 0, copyRowSizeInBytes, stream); } template void invokeCopyProbs(ExtractExplicitDraftTokensParams const& params, cudaStream_t stream); template void invokeCopyProbs(ExtractExplicitDraftTokensParams const& params, cudaStream_t stream); +namespace +{ +template +__global__ void packGenerationLengths(PackExplicitDraftTokensParams params) +{ + auto const batchIdx = static_cast(blockIdx.x); + auto const batchSlot = params.batchSlots ? 
params.batchSlots[batchIdx] : batchIdx; + + auto const isGenerationRequest = batchIdx >= params.numContextRequests; + auto const genIdx = batchIdx - params.numContextRequests; + + if (threadIdx.x == 0 && isGenerationRequest) + { + params.outputGenerationLengths[genIdx] = params.inputGenerationLengths[batchSlot]; + } +} +} // namespace + +template +void invokePackGenerationLengths(PackExplicitDraftTokensParams const& params, cudaStream_t stream) +{ + SizeType32 constexpr BLOCK_SIZE = 32; + packGenerationLengths<<>>(params); +} + +template void invokePackGenerationLengths(PackExplicitDraftTokensParams const& params, cudaStream_t stream); +template void invokePackGenerationLengths(PackExplicitDraftTokensParams const& params, cudaStream_t stream); + namespace { template @@ -400,61 +467,79 @@ __global__ void packExplicitDraftTokens(PackExplicitDraftTokensParams params) auto const batchIdx = static_cast(blockIdx.x); auto const batchSlot = params.batchSlots ? params.batchSlots[batchIdx] : batchIdx; + auto const isGenerationRequest = batchIdx >= params.numContextRequests; + auto const genIdx = batchIdx - params.numContextRequests; + if (threadIdx.x == 0) { params.outputPositionIdsBase[batchIdx] = params.inputPositionIdsBase[batchSlot]; - params.outputGenerationLengths[batchIdx] = params.inputGenerationLengths[batchSlot]; params.outputRandomDataSample[batchIdx] = params.inputRandomDataSample[batchSlot]; params.outputTemperatures[batchIdx] = params.inputTemperatures[batchSlot]; } // Copy random validation data. auto const numDecodingDraftTokens = params.numPaths * (params.maxPathLength - 1); - auto outputRandomDataValidation = params.outputRandomDataValidation + batchIdx * numDecodingDraftTokens; - auto inputRandomDataValidation = params.inputRandomDataValidation + batchSlot * numDecodingDraftTokens; - for (auto ti = static_cast(threadIdx.x); ti < numDecodingDraftTokens; - ti += static_cast(blockDim.x)) + if (isGenerationRequest) { - outputRandomDataValidation[ti] = inputRandomDataValidation[ti]; + auto outputRandomDataValidation = params.outputRandomDataValidation + genIdx * numDecodingDraftTokens; + auto const inputRandomDataValidation = params.inputRandomDataValidation + batchSlot * numDecodingDraftTokens; + for (auto ti = static_cast(threadIdx.x); ti < numDecodingDraftTokens; + ti += static_cast(blockDim.x)) + { + outputRandomDataValidation[ti] = inputRandomDataValidation[ti]; + } } // Copy draft tokens and indices - auto const numUnpackedTokens = numDecodingDraftTokens + params.numPaths; - auto outputNextDraftTokens = params.outputNextDraftTokens + batchIdx * numUnpackedTokens; - auto outputNextDraftIndices = params.outputNextDraftIndices + batchIdx * numUnpackedTokens; - auto const inputNextDraftTokens = params.inputNextDraftTokens + batchSlot * numUnpackedTokens; - auto const inputNextDraftIndices = params.inputNextDraftIndices + batchSlot * numUnpackedTokens; - for (auto ti = static_cast(threadIdx.x); ti < numUnpackedTokens; - ti += static_cast(blockDim.x)) + if (isGenerationRequest) { - outputNextDraftTokens[ti] = inputNextDraftTokens[ti]; - outputNextDraftIndices[ti] = inputNextDraftIndices[ti]; + auto const numUnpackedTokens = numDecodingDraftTokens + params.numPaths; + auto outputNextDraftTokens = params.outputNextDraftTokens + genIdx * numUnpackedTokens; + auto outputNextDraftIndices = params.outputNextDraftIndices + genIdx * numUnpackedTokens; + auto const inputNextDraftTokens = params.inputNextDraftTokens + batchSlot * numUnpackedTokens; + auto const inputNextDraftIndices = 
params.inputNextDraftIndices + batchSlot * numUnpackedTokens; + for (auto ti = static_cast(threadIdx.x); ti < numUnpackedTokens; + ti += static_cast(blockDim.x)) + { + outputNextDraftTokens[ti] = inputNextDraftTokens[ti]; + outputNextDraftIndices[ti] = inputNextDraftIndices[ti]; + } } auto const maxGenerationLength = params.maxGenerationLength[0]; auto const maxDecodingTokens = numDecodingDraftTokens + 1; auto const numPackedMasks = divUp(maxGenerationLength, 32); - auto const outputMaskStartId = (batchIdx == 0) ? 0 : params.cumSumGenerationLengths[batchIdx - 1]; - auto const numTokens = (batchIdx == 0) + auto const outputMaskStartId = (genIdx == 0) ? 0 : params.cumSumGenerationLengths[genIdx - 1]; + auto const numTokens = (genIdx == 0) ? params.cumSumGenerationLengths[0] - : params.cumSumGenerationLengths[batchIdx] - params.cumSumGenerationLengths[batchIdx - 1]; + : params.cumSumGenerationLengths[genIdx] - params.cumSumGenerationLengths[genIdx - 1]; // Copy packed masks. // Masks are placed next to each other with offsets of cumSumGenerationLengths[bi-1] - auto const inputPackedMask = params.inputPackedMask + batchSlot * numPackedMasks * maxDecodingTokens; - auto outputPackedMask = params.outputPackedMask + outputMaskStartId * numPackedMasks; - for (auto ti = static_cast(threadIdx.x); ti < numTokens * numPackedMasks; - ti += static_cast(blockDim.x)) + if (isGenerationRequest) { - outputPackedMask[ti] = inputPackedMask[ti]; + auto const inputPackedMask = params.inputPackedMask + batchSlot * numPackedMasks * maxDecodingTokens; + auto outputPackedMask = params.outputPackedMask + outputMaskStartId * numPackedMasks; + for (auto ti = static_cast(threadIdx.x); ti < numTokens * numPackedMasks; + ti += static_cast(blockDim.x)) + { + outputPackedMask[ti] = inputPackedMask[ti]; + } } // Copy pos offsets. 
Copy only for maxGenerationLength - auto outputPositionOffsets = params.outputPositionOffsets + batchIdx * maxGenerationLength; - auto const inputPositionOffsets = params.inputPositionOffsets + batchSlot * maxDecodingTokens; - for (auto ti = static_cast(threadIdx.x); ti < maxGenerationLength; - ti += static_cast(blockDim.x)) + if (isGenerationRequest) { - outputPositionOffsets[ti] = inputPositionOffsets[ti]; + auto const basePosId = params.outputPositionIdsBase[batchIdx]; + auto outputPositionOffsets = params.outputPositionOffsets + genIdx * maxGenerationLength; + auto outputPositionIds = params.outputPositionIds + genIdx * maxGenerationLength; + auto const inputPositionIds = params.inputPositionIds + batchSlot * maxDecodingTokens; + for (auto ti = static_cast(threadIdx.x); ti < maxGenerationLength; + ti += static_cast(blockDim.x)) + { + auto const posId = inputPositionIds[ti]; + outputPositionIds[params.numContextTokens + ti] = posId; + outputPositionOffsets[ti] = posId - basePosId + 1; + } } } } // namespace @@ -477,7 +562,8 @@ void invokeCopyProbs(PackExplicitDraftTokensParams const& params, cudaStream_ auto const numCopyElems = params.numPaths * (params.maxPathLength - 1) * params.vocabSize; auto const copyRowSizeInBytes = numCopyElems * sizeof(T); - invokeCopyProbs(srcDataPtr, dstDataPtr, params.batchSlots, nullptr, params.batchSize, copyRowSizeInBytes, stream); + invokeCopyProbs(srcDataPtr, dstDataPtr, params.batchSlots, nullptr, params.numGenerationRequests, + params.numContextRequests, copyRowSizeInBytes, stream); } template void invokeCopyProbs(PackExplicitDraftTokensParams const& params, cudaStream_t stream); diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h index 8f08b67f3..a3556068d 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" #include @@ -25,6 +26,38 @@ namespace tensorrt_llm::kernels::speculative_decoding { +template +struct FillContextExplicitDraftTokensParams +{ + //! [maxBatchSize] + T* randDataSample{nullptr}; + //! [maxBatchSize] + T* outputTemperatures{nullptr}; + //! [maxBatchSize] + float const* inputTemperatures{nullptr}; + //! [maxBatchSize] + curandState_t* curandState{nullptr}; + //! [forwardBatchSize] + runtime::SizeType32 const* batchSlots{nullptr}; + + runtime::SizeType32 batchSize{0}; + + void checkParams() const + { + TLLM_CHECK(randDataSample); + TLLM_CHECK(outputTemperatures); + TLLM_CHECK(inputTemperatures); + TLLM_CHECK(curandState); + TLLM_CHECK(batchSlots); + + TLLM_CHECK(batchSize > 0); + } +}; + +//! @brief Sets temperature and generates random variable for sampling. +template +void invokeFillContextBuffers(FillContextExplicitDraftTokensParams const& params, cudaStream_t stream); + template struct ExtractExplicitDraftTokensParams { @@ -43,10 +76,18 @@ struct ExtractExplicitDraftTokensParams //! [maxBatchSize] runtime::SizeType32* acceptedLengths{nullptr}; //! [maxBatchSize] + runtime::SizeType32* prevDraftLengths{nullptr}; + //! [maxBatchSize] runtime::SizeType32* nextDraftLengths{nullptr}; //! [maxBatchSize] runtime::SizeType32* sequenceLengths{nullptr}; //! [maxBatchSize] + runtime::SizeType32* outputGenerationLengths{nullptr}; + //! 
[maxBatchSize] + runtime::SizeType32* outputBestPathIndices{nullptr}; + //! [maxBatchSize, maxNumPaths, maxPathLength] + runtime::SizeType32* outputLastDraftIndices{nullptr}; + //! [maxBatchSize] T* randDataSample{nullptr}; //! [maxBatchSize, maxNumPaths, maxPathDraftLength] T* randDataVerification{nullptr}; @@ -58,7 +99,7 @@ struct ExtractExplicitDraftTokensParams runtime::SizeType32 const* batchSlots{nullptr}; //! [forwardBatchSize, maxNumPaths, maxPathLength] runtime::TokenIdType const* nextDraftTokens{nullptr}; - //! [forwardBatchSize, maxNumPaths, maxPathLength] + //! [forwardBatchSize, maxNumPaths, maxPathLength], optional runtime::TokenIdType const* lastDraftTokens{nullptr}; //! [forwardBatchSize, maxNumPaths, maxPathLength] runtime::SizeType32 const* inputUnpackedNextDraftIndices{nullptr}; @@ -74,17 +115,75 @@ struct ExtractExplicitDraftTokensParams runtime::TokenIdType const* nextFlatTokens{nullptr}; //! [forwardBatchSize] runtime::SizeType32 const* generationLengthInclusiveSum{nullptr}; + //! [forwardBatchSize] + runtime::SizeType32 const* lastGenerationLengths{nullptr}; + //! [maxBatchSize, maxNumPaths, maxPathLength] + runtime::SizeType32 const* lastDraftIndices{nullptr}; //! [forwardBatchSize, maxNumPaths, maxPathDraftLength, maxVocabSize] T const* nextDraftProbs{nullptr}; //! [maxBatchSize] float const* inputTemperatures{nullptr}; //! [maxBatchSize] curandState_t* curandState{nullptr}; - runtime::SizeType32 batchSize; - runtime::SizeType32 numPaths; - runtime::SizeType32 maxPathLength; - runtime::SizeType32 maxSeqLen; - runtime::SizeType32 vocabSize; + runtime::SizeType32 batchSize{0}; + runtime::SizeType32 numPaths{0}; + runtime::SizeType32 maxPathLength{0}; + runtime::SizeType32 maxSeqLen{0}; + runtime::SizeType32 vocabSize{0}; + runtime::SizeType32 numContextRequests{0}; + runtime::SizeType32 numGenerationRequests{0}; + + void checkParams() const + { + TLLM_CHECK(outputIds); + + TLLM_CHECK(outputPositionIdsBase); + TLLM_CHECK(inputPositionIdsBase); + + TLLM_CHECK(outputPositionIds); + TLLM_CHECK(packedPositionIds); + + TLLM_CHECK(outputTemperatures); + TLLM_CHECK(inputTemperatures); + + TLLM_CHECK(outputDraftProbs); + TLLM_CHECK(nextDraftProbs); + + TLLM_CHECK(outputNextDraftTokens); + TLLM_CHECK(unpackedNextDraftTokens); + + TLLM_CHECK(unpackedNextDraftIndices); + TLLM_CHECK(inputUnpackedNextDraftIndices); + + TLLM_CHECK(outputLastDraftIndices); + + TLLM_CHECK(bestPathIndices); + TLLM_CHECK(outputBestPathIndices); + + TLLM_CHECK(curandState); + TLLM_CHECK(batchSlots); + TLLM_CHECK(nextDraftTokens); + TLLM_CHECK(nextFlatTokens); + TLLM_CHECK(generationLengthInclusiveSum); + TLLM_CHECK(bestPathLengths); + + TLLM_CHECK(randDataSample); + TLLM_CHECK(randDataVerification); + TLLM_CHECK(acceptedLengths); + TLLM_CHECK(nextDraftLengths); + TLLM_CHECK(prevDraftLengths); + TLLM_CHECK(sequenceLengths); + TLLM_CHECK(outputGenerationLengths); + + TLLM_CHECK(batchSize > 0); + TLLM_CHECK(numPaths > 0); + TLLM_CHECK(maxPathLength > 0); + TLLM_CHECK(maxSeqLen > 0); + TLLM_CHECK(vocabSize > 0); + TLLM_CHECK(numContextRequests >= 0); + TLLM_CHECK(numGenerationRequests >= 0); + TLLM_CHECK(numContextRequests + numGenerationRequests != 0); + } }; //! @brief Modifies `outputIds` and `sequenceLengths` according to the accepted tokens @@ -146,10 +245,12 @@ struct PackExplicitDraftTokensParams //! [forwardBatchSize, maxGenerationLength, divUp(maxGenerationLength, 32)] int32_t const* inputPackedMask{nullptr}; + //! 
[forwardBatchSize, maxGenerationLength] + runtime::SizeType32* outputPositionIds{nullptr}; //! [forwardBatchSize, maxGenerationLength] runtime::SizeType32* outputPositionOffsets{nullptr}; //! [maxBatchSize, maxGenerationLength] - runtime::SizeType32 const* inputPositionOffsets{nullptr}; + runtime::SizeType32 const* inputPositionIds{nullptr}; //! [forwardBatchSize, maxNumPaths, maxPathDraftLength, maxVocabSize] T* outputDraftProbs{nullptr}; @@ -161,12 +262,59 @@ struct PackExplicitDraftTokensParams //! [maxBatchSize] T const* inputTemperatures{nullptr}; - runtime::SizeType32 batchSize; - runtime::SizeType32 numPaths; - runtime::SizeType32 maxPathLength; - runtime::SizeType32 vocabSize; + runtime::SizeType32 batchSize{0}; + runtime::SizeType32 numPaths{0}; + runtime::SizeType32 maxPathLength{0}; + runtime::SizeType32 vocabSize{0}; + runtime::SizeType32 numContextTokens{0}; + runtime::SizeType32 numContextRequests{0}; + runtime::SizeType32 numGenerationRequests{0}; + + void checkParams() const + { + TLLM_CHECK(batchSlots); + TLLM_CHECK(cumSumGenerationLengths); + TLLM_CHECK(maxGenerationLength); + + TLLM_CHECK(inputPositionIdsBase); + + TLLM_CHECK(inputGenerationLengths); + + TLLM_CHECK(outputRandomDataSample); + TLLM_CHECK(inputRandomDataSample); + + TLLM_CHECK(inputRandomDataValidation); + + TLLM_CHECK(inputNextDraftTokens); + + TLLM_CHECK(inputNextDraftIndices); + + TLLM_CHECK(inputPackedMask); + + TLLM_CHECK(inputPositionIds); + + TLLM_CHECK(inputDraftProbs); + + TLLM_CHECK(outputTemperatures); + TLLM_CHECK(inputTemperatures); + + TLLM_CHECK(batchSize > 0); + TLLM_CHECK(numPaths > 0); + TLLM_CHECK(maxPathLength > 0); + TLLM_CHECK(vocabSize > 0); + TLLM_CHECK(numContextRequests >= 0); + TLLM_CHECK(numGenerationRequests >= 0); + TLLM_CHECK( + (numContextTokens == 0 && numContextRequests == 0) || (numContextTokens > 0 && numContextRequests > 0)); + TLLM_CHECK(numContextRequests + numGenerationRequests != 0); + } }; +//! @brief Copy all rows at `batchSlots[batchIdx]` from `inputGenerationLengths` tensors to `batchIdx` rows at +//! `outputGenerationLengths` tensor. +template +void invokePackGenerationLengths(PackExplicitDraftTokensParams const& params, cudaStream_t stream); + //! @brief Copy all rows at `batchSlots[batchIdx]` from `input*` tensors to `batchIdx` rows at `output*` tensor. 
template void invokePackExplicitDraftTokens(PackExplicitDraftTokensParams const& params, cudaStream_t stream); @@ -175,27 +323,24 @@ void invokePackExplicitDraftTokens(PackExplicitDraftTokensParams const& param template void invokeCopyProbs(PackExplicitDraftTokensParams const& params, cudaStream_t stream); -size_t invokeScanSpecDecodingGenerationLengths(void* __restrict__ reduceMaxTempStorage, size_t reduceTempStorageBytes, - runtime::SizeType32 const* __restrict__ specDecodingGenerationLengths, - runtime::SizeType32* __restrict__ scannedSpecDecodingGenerationLengths, runtime::SizeType32 batchSize, - cudaStream_t stream); -size_t invokeReduceMaxSpecDecodingGenerationLengths(void* __restrict__ reduceMaxTempStorage, - size_t reduceTempStorageBytes, runtime::SizeType32 const* __restrict__ specDecodingGenerationLengths, - runtime::SizeType32* __restrict__ scannedSpecDecodingGenerationLengths, runtime::SizeType32 batchSize, - cudaStream_t stream); - -// inclusive prefix sum specDecodingGenerationLengths -void invokeScanReduceSpecDecodingGenerationLengths(runtime::SizeType32 batchSize, - runtime::SizeType32 const* __restrict__ specDecodingGenerationLengths, void* __restrict__ scanTempStorage, - size_t scanTempStorageBytes, runtime::SizeType32* __restrict__ scanedSpecDecodingGenerationLengths, +size_t invokeScanGenerationLengths(void* __restrict__ scanTempStorage, size_t scanTempStorageBytes, + runtime::SizeType32 const* __restrict__ generationLengths, + runtime::SizeType32* __restrict__ scannedGenerationLengths, runtime::SizeType32 batchSize, cudaStream_t stream); +size_t invokeReduceMaxGenerationLengths(void* __restrict__ reduceMaxTempStorage, size_t reduceTempStorageBytes, + runtime::SizeType32 const* __restrict__ generationLengths, runtime::SizeType32* __restrict__ maxGenerationLengths, + runtime::SizeType32 batchSize, cudaStream_t stream); + +// inclusive prefix sum generationLengths +void invokeScanReduceGenerationLengths(runtime::SizeType32 batchSize, + runtime::SizeType32 const* __restrict__ generationLengths, void* __restrict__ scanTempStorage, + size_t scanTempStorageBytes, runtime::SizeType32* __restrict__ scanedGenerationLengths, void* __restrict__ reduceMaxTempStorage, size_t reduceMaxTempStorageBytes, - runtime::SizeType32* maxSpecDecodingGenerationLengths, cudaStream_t stream); + runtime::SizeType32* maxGenerationLengths, cudaStream_t stream); -void invokeConvertSpecDecodingMaskToPackedMask(runtime::SizeType32 batchSize, - runtime::SizeType32 const* __restrict__ specDecodingCumGenerationLengths, - runtime::SizeType32 const* __restrict__ specDecodingMaxGenerationLengths, bool const* __restrict__ specDecodingMask, +void invokeConvertMaskToPackedMask(runtime::SizeType32 batchSize, + runtime::SizeType32 const* __restrict__ cumGenerationLengths, + runtime::SizeType32 const* __restrict__ maxGenerationLengths, bool const* __restrict__ mask, runtime::SizeType32 const* __restrict__ batchSlots, runtime::SizeType32 maxDraftTokens, - runtime::SizeType32 maxGenerationLength, runtime::SizeType32* __restrict__ specDecodingPackedMask, - cudaStream_t stream); + runtime::SizeType32 maxGenerationLength, runtime::SizeType32* __restrict__ packedMask, cudaStream_t stream); } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu index a6fa77fd6..c0e6127de 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu +++ 
b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu @@ -25,99 +25,107 @@ namespace tensorrt_llm::kernels::speculative_decoding { -static constexpr int kUpdateKVCacheKernelShmSize = 16384; +using namespace tensorrt_llm::runtime; +static constexpr SizeType32 kUpdateKVCacheKernelShmSize = 16384; namespace { -template +template __global__ void updateKVCacheDraftTokenLocationBatchedKernel(std::array kvCacheBuffers, - int const* seqAcceptedDraftTokenOffsets, IndexType const* packedAcceptedDraftTokensIndices, - int32_t const* pastKeyValueLengths, int rewindDraftTokenCommonCount, int const* rewindDraftTokenSeparateAdjustments, - int const* seqSlotRemapping, int eltCountPerHead) + SizeType32 const* seqAcceptedDraftTokenOffsets, IndexType const* packedAcceptedDraftTokensIndices, + SizeType32 const* pastKeyValueLengths, SizeType32 rewindDraftTokenCommonCount, + SizeType32 const* rewindDraftTokenSeparateAdjustments, SizeType32 const* seqSlotRemapping, + SizeType32 const* batchSlots, SizeType32 eltCountPerHead) { - int seqIdx = blockIdx.x; - int headIdx = blockIdx.y; - int layerIdx = blockIdx.z; - int warpIdx = threadIdx.x / 32; - int warpCount = blockDim.x / 32; - int laneIdx = threadIdx.x & 0x1f; - int seqDraftTokenStart = seqAcceptedDraftTokenOffsets[seqIdx]; - int seqDraftTokenEnd = seqAcceptedDraftTokenOffsets[seqIdx + 1]; + auto const seqIdx = static_cast(blockIdx.x); + auto const headIdx = static_cast(blockIdx.y); + auto const layerIdx = static_cast(blockIdx.z); + auto const warpIdx = static_cast(threadIdx.x / 32); + auto const warpCount = static_cast(blockDim.x / 32); + auto const laneIdx = static_cast(threadIdx.x & 0x1f); + auto const seqDraftTokenStart = seqAcceptedDraftTokenOffsets[seqIdx]; + auto const seqDraftTokenEnd = seqAcceptedDraftTokenOffsets[seqIdx + 1]; auto const seqSlot = seqSlotRemapping == nullptr ? seqIdx : seqSlotRemapping[seqIdx]; - int seqDraftCount = seqDraftTokenEnd - seqDraftTokenStart; - int maxEltCountPerMove = kUpdateKVCacheKernelShmSize / sizeof(MoveEltType) / seqDraftCount; - int eltCountPerMove = min(maxEltCountPerMove, eltCountPerHead); + auto const seqDraftCount = seqDraftTokenEnd - seqDraftTokenStart; + auto const maxEltCountPerMove + = static_cast(kUpdateKVCacheKernelShmSize / sizeof(MoveEltType) / seqDraftCount); + auto const eltCountPerMove = min(maxEltCountPerMove, eltCountPerHead); if (seqDraftCount == 0 || eltCountPerMove == 0) { return; } KVCacheBuffer& kvCacheBuffer = kvCacheBuffers[layerIdx]; - int tokenStartIdx = pastKeyValueLengths[seqSlot] - rewindDraftTokenCommonCount; + auto tokenStartIdx = pastKeyValueLengths[seqSlot] - rewindDraftTokenCommonCount; if (rewindDraftTokenSeparateAdjustments != nullptr) { - tokenStartIdx -= rewindDraftTokenSeparateAdjustments[seqSlot]; + auto const batchSlot = batchSlots == nullptr ? 
seqIdx : batchSlots[seqIdx]; + tokenStartIdx -= rewindDraftTokenSeparateAdjustments[batchSlot]; } __shared__ char loadSmemBuffer[kUpdateKVCacheKernelShmSize]; auto* eltLoadSmemBuffer = reinterpret_cast(&loadSmemBuffer[0]); - for (int startChannelOffset = 0; startChannelOffset < eltCountPerHead; startChannelOffset += eltCountPerMove) + for (SizeType32 startChannelOffset = 0; startChannelOffset < eltCountPerHead; startChannelOffset += eltCountPerMove) { - int eltCountCurrentMove = min(eltCountPerMove, eltCountPerHead - startChannelOffset); + SizeType32 eltCountCurrentMove = min(eltCountPerMove, eltCountPerHead - startChannelOffset); // load K - for (int tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) + for (SizeType32 tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) { - int tokenPos = packedAcceptedDraftTokensIndices[seqDraftTokenStart + tokenIdx]; + auto const tokenPos = packedAcceptedDraftTokensIndices[seqDraftTokenStart + tokenIdx]; auto* tokenSmemBuffer = eltLoadSmemBuffer + tokenIdx * eltCountCurrentMove; - int tokenKVPosition = tokenStartIdx + tokenPos; + auto const tokenKVPosition = tokenStartIdx + tokenPos; auto* kPtr = reinterpret_cast(kvCacheBuffer.getKBlockPtr(seqSlot, tokenKVPosition)); - for (int loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) + for (SizeType32 loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) { - int channelIdx = loadChannelIdx + startChannelOffset; - int kvLocationIdx = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); + auto const channelIdx = loadChannelIdx + startChannelOffset; + auto const kvLocationIdx + = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); tokenSmemBuffer[loadChannelIdx] = kPtr[kvLocationIdx]; } } __syncthreads(); // store K - for (int tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) + for (SizeType32 tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) { - int tokenPos = tokenIdx; + auto const tokenPos = tokenIdx; auto* tokenSmemBuffer = eltLoadSmemBuffer + tokenIdx * eltCountCurrentMove; - int tokenKVPosition = tokenStartIdx + tokenPos; + auto const tokenKVPosition = tokenStartIdx + tokenPos; auto* kPtr = reinterpret_cast(kvCacheBuffer.getKBlockPtr(seqSlot, tokenKVPosition)); - for (int loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) + for (SizeType32 loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) { - int channelIdx = loadChannelIdx + startChannelOffset; - int kvLocationIdx = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); + auto const channelIdx = loadChannelIdx + startChannelOffset; + auto const kvLocationIdx + = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); kPtr[kvLocationIdx] = tokenSmemBuffer[loadChannelIdx]; } } __syncthreads(); // load V - for (int tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) + for (SizeType32 tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) { - int tokenPos = packedAcceptedDraftTokensIndices[seqDraftTokenStart + tokenIdx]; + auto const tokenPos = packedAcceptedDraftTokensIndices[seqDraftTokenStart + tokenIdx]; auto* tokenSmemBuffer = eltLoadSmemBuffer + tokenIdx * eltCountCurrentMove; - int tokenKVPosition = tokenStartIdx + tokenPos; + auto const tokenKVPosition = tokenStartIdx + tokenPos; auto* vPtr = 
reinterpret_cast(kvCacheBuffer.getVBlockPtr(seqSlot, tokenKVPosition)); - for (int loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) + for (SizeType32 loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) { - int channelIdx = loadChannelIdx + startChannelOffset; - int kvLocationIdx = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); + auto const channelIdx = loadChannelIdx + startChannelOffset; + auto const kvLocationIdx + = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); tokenSmemBuffer[loadChannelIdx] = vPtr[kvLocationIdx]; } } __syncthreads(); // store V - for (int tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) + for (SizeType32 tokenIdx = warpIdx; tokenIdx < seqDraftCount; tokenIdx += warpCount) { - int tokenPos = tokenIdx; + auto const tokenPos = tokenIdx; auto* tokenSmemBuffer = eltLoadSmemBuffer + tokenPos * eltCountCurrentMove; - int tokenKVPosition = tokenStartIdx + tokenPos; + auto const tokenKVPosition = tokenStartIdx + tokenPos; auto* vPtr = reinterpret_cast(kvCacheBuffer.getVBlockPtr(seqSlot, tokenKVPosition)); - for (int loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) + for (SizeType32 loadChannelIdx = laneIdx; loadChannelIdx < eltCountCurrentMove; loadChannelIdx += 32) { - int channelIdx = loadChannelIdx + startChannelOffset; - int kvLocationIdx = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); + auto const channelIdx = loadChannelIdx + startChannelOffset; + auto const kvLocationIdx + = kvCacheBuffer.getKVLocalIdx(tokenKVPosition, headIdx, eltCountPerHead, channelIdx); vPtr[kvLocationIdx] = tokenSmemBuffer[loadChannelIdx]; } } @@ -126,12 +134,13 @@ __global__ void updateKVCacheDraftTokenLocationBatchedKernel(std::array +template void updateKVCacheDraftTokenLocationBatched(KVCacheBuffer const* kvCacheBuffers, - int const* seqAcceptedDraftTokenOffsets, IndexType const* packedAcceptedDraftTokensIndices, - int32_t const* pastKeyValueLengths, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCommonCount, int* rewindDraftTokenSeparateAdjustments, int const* seqSlotRemapping, - cudaStream_t stream) + SizeType32 const* seqAcceptedDraftTokenOffsets, IndexType const* packedAcceptedDraftTokensIndices, + SizeType32 const* pastKeyValueLengths, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCommonCount, + SizeType32 const* rewindDraftTokenSeparateAdjustments, SizeType32 const* seqSlotRemapping, + SizeType32 const* batchSlots, cudaStream_t stream) { // make sure launch buffer is enough static_assert(MaxLayerCount * sizeof(KVCacheBuffer) <= 3072); @@ -139,22 +148,22 @@ void updateKVCacheDraftTokenLocationBatched(KVCacheBuffer const* kvCacheBuffers, { return; } - int alignedBytes = 16; + SizeType32 alignedBytes = 16; while (alignedBytes > 0 && (sizeInBytesPerKVHead % alignedBytes != 0)) { alignedBytes >>= 1; } TLLM_CHECK_WITH_INFO(alignedBytes > 0, "alignedByte should be positive"); - int eltCountPerHead = sizeInBytesPerKVHead / alignedBytes; + SizeType32 eltCountPerHead = sizeInBytesPerKVHead / alignedBytes; dim3 grid(seqCount, numKVHeads, layerCount); dim3 block(128, 1, 1); std::array kvCacheBufferArray; - for (int i = 0; i < layerCount; i++) + for (SizeType32 i = 0; i < layerCount; i++) { kvCacheBufferArray[i] = kvCacheBuffers[i]; } - 
void (*pKernelFunc)(std::array, int const*, IndexType const*, int32_t const*, int, - int const*, int const*, int) + void (*pKernelFunc)(std::array, SizeType32 const*, IndexType const*, + SizeType32 const*, SizeType32, SizeType32 const*, SizeType32 const*, SizeType32 const*, SizeType32) = nullptr; switch (alignedBytes) { @@ -170,7 +179,7 @@ void updateKVCacheDraftTokenLocationBatched(KVCacheBuffer const* kvCacheBuffers, } case 4: { - pKernelFunc = &updateKVCacheDraftTokenLocationBatchedKernel; + pKernelFunc = &updateKVCacheDraftTokenLocationBatchedKernel; break; } case 2: @@ -187,7 +196,7 @@ void updateKVCacheDraftTokenLocationBatched(KVCacheBuffer const* kvCacheBuffers, } pKernelFunc<<>>(kvCacheBufferArray, seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, rewindDraftTokenCommonCount, - rewindDraftTokenSeparateAdjustments, seqSlotRemapping, eltCountPerHead); + rewindDraftTokenSeparateAdjustments, seqSlotRemapping, batchSlots, eltCountPerHead); TLLM_CUDA_CHECK(cudaGetLastError()); } @@ -209,54 +218,59 @@ void updateKVCacheDraftTokenLocationBatched(KVCacheBuffer const* kvCacheBuffers, * @param stream : CUDA stream to use. */ template -void updateKVCacheDraftTokenLocation(KVCacheBuffer const* kvCacheBuffers, int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, int layerCount, int seqCount, - int numKVHeads, int sizeInBytesPerKVHead, int rewindDraftTokenCommonCount, int* rewindDraftTokenSeparateAdjustments, - int const* seqSlotRemapping, cudaStream_t stream) +void updateKVCacheDraftTokenLocation(KVCacheBuffer const* kvCacheBuffers, + SizeType32 const* seqAcceptedDraftTokenOffsets, IndexType const* packedAcceptedDraftTokensIndices, + SizeType32 const* pastKeyValueLengths, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCommonCount, + SizeType32 const* rewindDraftTokenSeparateAdjustments, SizeType32 const* seqSlotRemapping, + SizeType32 const* batchSlots, cudaStream_t stream) { - int startLayer = 0; - static constexpr int kMaxLayersPerIter = 32; + SizeType32 startLayer = 0; + static constexpr SizeType32 kMaxLayersPerIter = 32; while (startLayer < layerCount) { - int microBatchLayerCount = std::min(layerCount - startLayer, kMaxLayersPerIter); + SizeType32 microBatchLayerCount = std::min(layerCount - startLayer, kMaxLayersPerIter); updateKVCacheDraftTokenLocationBatched(kvCacheBuffers + startLayer, seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, microBatchLayerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, rewindDraftTokenCommonCount, - rewindDraftTokenSeparateAdjustments, seqSlotRemapping, stream); + rewindDraftTokenSeparateAdjustments, seqSlotRemapping, batchSlots, stream); startLayer += microBatchLayerCount; } } -void updateLinearKVCacheDraftTokenLocation(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, - int8_t* const* pastKeyValueList, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCommonCount, int* rewindDraftTokenSeparateAdjustments, int const* seqSlotRemapping, - int maxKVCacheLen, cudaStream_t stream) +void updateLinearKVCacheDraftTokenLocation(SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, SizeType32 const* pastKeyValueLengths, + int8_t* const* pastKeyValueList, SizeType32 
layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCommonCount, + SizeType32 const* rewindDraftTokenSeparateAdjustments, SizeType32 const* seqSlotRemapping, SizeType32 maxKVCacheLen, + cudaStream_t stream) { std::vector kvLinearBuffers; kvLinearBuffers.reserve(layerCount); auto const sizePerToken = numKVHeads * sizeInBytesPerKVHead; - for (int i = 0; i < layerCount; i++) + for (SizeType32 i = 0; i < layerCount; i++) { kvLinearBuffers.emplace_back( seqCount, maxKVCacheLen, sizePerToken, maxKVCacheLen, 0, false, pastKeyValueList[i]); } updateKVCacheDraftTokenLocation(kvLinearBuffers.data(), seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, - rewindDraftTokenCommonCount, rewindDraftTokenSeparateAdjustments, seqSlotRemapping, stream); + rewindDraftTokenCommonCount, rewindDraftTokenSeparateAdjustments, seqSlotRemapping, nullptr, stream); } -void updateKVBlockArrayDraftTokenLocation(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, void* const* pointerArray, - KVBlockArray::DataType* offsetArray, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCommonCount, int* rewindDraftTokenSeparateAdjustments, int const* seqSlotRemapping, - int maxKVCacheLen, int maxBlocksPerSeq, int tokensPerBlock, cudaStream_t stream) +void updateKVBlockArrayDraftTokenLocation(SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, SizeType32 const* pastKeyValueLengths, void* const* pointerArray, + KVBlockArray::DataType* offsetArray, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCommonCount, + SizeType32 const* rewindDraftTokenSeparateAdjustments, SizeType32 const* seqSlotRemapping, + SizeType32 const* batchSlots, SizeType32 maxKVCacheLen, SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, + cudaStream_t stream) { std::vector kvBlockArrays; kvBlockArrays.reserve(layerCount); auto const bytesPerToken = numKVHeads * sizeInBytesPerKVHead; auto const bytesPerBlock = tokensPerBlock * bytesPerToken; - for (int layerIdx = 0; layerIdx < layerCount; layerIdx++) + for (SizeType32 layerIdx = 0; layerIdx < layerCount; layerIdx++) { auto const layerOffset = layerIdx * 2 * bytesPerBlock; auto* const primaryPoolPointer @@ -269,49 +283,52 @@ void updateKVBlockArrayDraftTokenLocation(int const* seqAcceptedDraftTokenOffset } updateKVCacheDraftTokenLocation(kvBlockArrays.data(), seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, - rewindDraftTokenCommonCount, rewindDraftTokenSeparateAdjustments, seqSlotRemapping, stream); + rewindDraftTokenCommonCount, rewindDraftTokenSeparateAdjustments, seqSlotRemapping, batchSlots, stream); } -void updateLinearKVCacheDraftTokenLocationCommonRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, - int8_t* const* pastKeyValueList, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCount, int const* seqSlotRemapping, int maxKVCacheLen, cudaStream_t stream) +void updateLinearKVCacheDraftTokenLocationCommonRewind(SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* 
packedAcceptedDraftTokensIndices, SizeType32 const* pastKeyValueLengths, + int8_t* const* pastKeyValueList, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCount, SizeType32 const* seqSlotRemapping, + SizeType32 maxKVCacheLen, cudaStream_t stream) { updateLinearKVCacheDraftTokenLocation(seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, pastKeyValueList, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, rewindDraftTokenCount, nullptr, seqSlotRemapping, maxKVCacheLen, stream); } -void updateKVBlockArrayDraftTokenLocationCommonRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, void* const* pointerArray, - KVBlockArray::DataType* offsetArray, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCount, int const* seqSlotRemapping, int maxKVCacheLen, int maxBlocksPerSeq, int tokensPerBlock, - cudaStream_t stream) +void updateKVBlockArrayDraftTokenLocationCommonRewind(SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, SizeType32 const* pastKeyValueLengths, void* const* pointerArray, + KVBlockArray::DataType* offsetArray, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCount, SizeType32 const* seqSlotRemapping, + SizeType32 maxKVCacheLen, SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, cudaStream_t stream) { updateKVBlockArrayDraftTokenLocation(seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, pointerArray, offsetArray, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, - rewindDraftTokenCount, nullptr, seqSlotRemapping, maxKVCacheLen, maxBlocksPerSeq, tokensPerBlock, stream); + rewindDraftTokenCount, nullptr, seqSlotRemapping, nullptr, maxKVCacheLen, maxBlocksPerSeq, tokensPerBlock, + stream); } -void updateLinearKVCacheDraftTokenLocationSeparateRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, - int8_t* const* pastKeyValueList, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int* rewindDraftTokenCounts, int const* seqSlotRemapping, int maxKVCacheLen, cudaStream_t stream) +void updateLinearKVCacheDraftTokenLocationSeparateRewind(SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, SizeType32 const* pastKeyValueLengths, + int8_t* const* pastKeyValueList, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32* rewindDraftTokenCounts, SizeType32 const* seqSlotRemapping, + SizeType32 maxKVCacheLen, cudaStream_t stream) { updateLinearKVCacheDraftTokenLocation(seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, pastKeyValueList, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, 0, rewindDraftTokenCounts, seqSlotRemapping, maxKVCacheLen, stream); } -void updateKVBlockArrayDraftTokenLocationSeparateRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, void* const* pointerArray, - KVBlockArray::DataType* offsetArray, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int* rewindDraftTokenCounts, int const* seqSlotRemapping, int 
maxKVCacheLen, int maxBlocksPerSeq, - int tokensPerBlock, cudaStream_t stream) +void updateKVBlockArrayDraftTokenLocationSeparateRewind(SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, SizeType32 const* pastKeyValueLengths, void* const* pointerArray, + KVBlockArray::DataType* offsetArray, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, + SizeType32 sizeInBytesPerKVHead, SizeType32 const* rewindDraftTokenCounts, SizeType32 const* seqSlotRemapping, + SizeType32 maxKVCacheLen, SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, cudaStream_t stream) { updateKVBlockArrayDraftTokenLocation(seqAcceptedDraftTokenOffsets, packedAcceptedDraftTokensIndices, pastKeyValueLengths, pointerArray, offsetArray, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, 0, - rewindDraftTokenCounts, seqSlotRemapping, maxKVCacheLen, maxBlocksPerSeq, tokensPerBlock, stream); + rewindDraftTokenCounts, seqSlotRemapping, nullptr, maxKVCacheLen, maxBlocksPerSeq, tokensPerBlock, stream); } } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h index e24215a10..cebe06075 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h @@ -17,6 +17,7 @@ #pragma once #include "tensorrt_llm/kernels/kvCacheUtils.h" +#include "tensorrt_llm/runtime/common.h" #include #include @@ -44,11 +45,11 @@ using IndexType = int; * @param maxKVCacheLen : Maximum length of each KV cache * @param stream : CUDA stream to use. */ -void updateLinearKVCacheDraftTokenLocationCommonRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, - KVLinearBuffer::DataType* const* pastKeyValueList, int layerCount, int seqCount, int numKVHeads, - int sizeInBytesPerKVHead, int rewindDraftTokenCount, int const* seqSlotRemapping, int maxKVCacheLen, - cudaStream_t stream); +void updateLinearKVCacheDraftTokenLocationCommonRewind(runtime::SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, runtime::SizeType32 const* pastKeyValueLengths, + KVLinearBuffer::DataType* const* pastKeyValueList, runtime::SizeType32 layerCount, runtime::SizeType32 seqCount, + runtime::SizeType32 numKVHeads, runtime::SizeType32 sizeInBytesPerKVHead, runtime::SizeType32 rewindDraftTokenCount, + runtime::SizeType32 const* seqSlotRemapping, runtime::SizeType32 maxKVCacheLen, cudaStream_t stream); /*! * Update Block KV cache using common rewind count. @@ -72,10 +73,12 @@ void updateLinearKVCacheDraftTokenLocationCommonRewind(int const* seqAcceptedDra * @param tokensPerBlock : Tokens per block of Block KV cache * @param stream : CUDA stream to use. 
*/ -void updateKVBlockArrayDraftTokenLocationCommonRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, void* const* pointerArray, - KVBlockArray::DataType* offsetArray, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCount, int const* seqSlotRemapping, int maxKVCacheLen, int maxBlocksPerSeq, int tokensPerBlock, +void updateKVBlockArrayDraftTokenLocationCommonRewind(runtime::SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, runtime::SizeType32 const* pastKeyValueLengths, + void* const* pointerArray, KVBlockArray::DataType* offsetArray, runtime::SizeType32 layerCount, + runtime::SizeType32 seqCount, runtime::SizeType32 numKVHeads, runtime::SizeType32 sizeInBytesPerKVHead, + runtime::SizeType32 rewindDraftTokenCount, runtime::SizeType32 const* seqSlotRemapping, + runtime::SizeType32 maxKVCacheLen, runtime::SizeType32 maxBlocksPerSeq, runtime::SizeType32 tokensPerBlock, cudaStream_t stream); /*! @@ -98,11 +101,12 @@ void updateKVBlockArrayDraftTokenLocationCommonRewind(int const* seqAcceptedDraf * @param maxKVCacheLen : Maximum length of each KV cache * @param stream : CUDA stream to use. */ -void updateLinearKVCacheDraftTokenLocationSeparateRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, - KVLinearBuffer::DataType* const* pastKeyValueList, int layerCount, int seqCount, int numKVHeads, - int sizeInBytesPerKVHead, int* rewindDraftTokenCounts, int const* seqSlotRemapping, int maxKVCacheLen, - cudaStream_t stream); +void updateLinearKVCacheDraftTokenLocationSeparateRewind(runtime::SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, runtime::SizeType32 const* pastKeyValueLengths, + KVLinearBuffer::DataType* const* pastKeyValueList, runtime::SizeType32 layerCount, runtime::SizeType32 seqCount, + runtime::SizeType32 numKVHeads, runtime::SizeType32 sizeInBytesPerKVHead, + runtime::SizeType32* rewindDraftTokenCounts, runtime::SizeType32 const* seqSlotRemapping, + runtime::SizeType32 maxKVCacheLen, cudaStream_t stream); /*! * Update Block KV cache using separate rewind count for each sequence. @@ -127,11 +131,13 @@ void updateLinearKVCacheDraftTokenLocationSeparateRewind(int const* seqAcceptedD * @param tokensPerBlock : Tokens per block of Block KV cache * @param stream : CUDA stream to use. 
*/ -void updateKVBlockArrayDraftTokenLocationSeparateRewind(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, void* const* pointerArray, - KVBlockArray::DataType* offsetArray, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int* rewindDraftTokenCounts, int const* seqSlotRemapping, int maxKVCacheLen, int maxBlocksPerSeq, - int tokensPerBlock, cudaStream_t stream); +void updateKVBlockArrayDraftTokenLocationSeparateRewind(runtime::SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, runtime::SizeType32 const* pastKeyValueLengths, + void* const* pointerArray, KVBlockArray::DataType* offsetArray, runtime::SizeType32 layerCount, + runtime::SizeType32 seqCount, runtime::SizeType32 numKVHeads, runtime::SizeType32 sizeInBytesPerKVHead, + runtime::SizeType32* rewindDraftTokenCounts, runtime::SizeType32 const* seqSlotRemapping, + runtime::SizeType32 maxKVCacheLen, runtime::SizeType32 maxBlocksPerSeq, runtime::SizeType32 tokensPerBlock, + cudaStream_t stream); /*! * Update Linear KV cache using both common rewind and separate rewind count for each sequence. The common @@ -156,11 +162,12 @@ void updateKVBlockArrayDraftTokenLocationSeparateRewind(int const* seqAcceptedDr * @param maxKVCacheLen : Maximum length of each KV cache * @param stream : CUDA stream to use. */ -void updateLinearKVCacheDraftTokenLocation(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, - KVLinearBuffer::DataType* const* pastKeyValueList, int layerCount, int seqCount, int numKVHeads, - int sizeInBytesPerKVHead, int rewindDraftTokenCommonCount, int* rewindDraftTokenSeparateAdjustments, - int const* seqSlotRemapping, int maxKVCacheLen, cudaStream_t stream); +void updateLinearKVCacheDraftTokenLocation(runtime::SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, runtime::SizeType32 const* pastKeyValueLengths, + KVLinearBuffer::DataType* const* pastKeyValueList, runtime::SizeType32 layerCount, runtime::SizeType32 seqCount, + runtime::SizeType32 numKVHeads, runtime::SizeType32 sizeInBytesPerKVHead, + runtime::SizeType32 rewindDraftTokenCommonCount, runtime::SizeType32 const* rewindDraftTokenSeparateAdjustments, + runtime::SizeType32 const* seqSlotRemapping, runtime::SizeType32 maxKVCacheLen, cudaStream_t stream); /*! * Update Block KV cache using both common rewind and separate rewind count for each sequence. The common @@ -178,20 +185,24 @@ void updateLinearKVCacheDraftTokenLocation(int const* seqAcceptedDraftTokenOffse * @param sizeInBytesPerKVHead : Size of each KV head * @param rewindDraftTokenCommonCount : Common token count to rewind * @param rewindDraftTokenSeparateAdjustments : Pointer to an array of length seqCount, each element indicated the - * rewind adjustment for one sequence. + * rewind adjustment for one sequence, indexed through batchSlots. * @param seqSlotRemapping mapping from batch index to index of the seqSlot in the sorted seqSlot buffer * e.g. for requests [0, 1, 2] with seqSlots [5, 3, 4], seqSlotRemapping is [1, 2, 0] * Required to match seqAcceptedDraftTokenOffsets and packedAcceptedDraftTokensIndices from gptDecoderBatch * and pointerArray and pastKeyValueLengths from runtimeBuffers. + * @param batchSlots : [seqCount] indices of sequences in the seq slots. 
* @param maxKVCacheLen : Maximum length of each KV cache * @param maxBlocksPerSeq : Maximum blocks per sequence of Block KV cache. * @param tokensPerBlock : Tokens per block of Block KV cache * @param stream : CUDA stream to use. */ -void updateKVBlockArrayDraftTokenLocation(int const* seqAcceptedDraftTokenOffsets, - IndexType const* packedAcceptedDraftTokensIndices, int32_t const* pastKeyValueLengths, void* const* pointerArray, - KVBlockArray::DataType* offsetArray, int layerCount, int seqCount, int numKVHeads, int sizeInBytesPerKVHead, - int rewindDraftTokenCommonCount, int* rewindDraftTokenSeparateAdjustments, int const* seqSlotRemapping, - int maxKVCacheLen, int maxBlocksPerSeq, int tokensPerBlock, cudaStream_t stream); +void updateKVBlockArrayDraftTokenLocation(runtime::SizeType32 const* seqAcceptedDraftTokenOffsets, + IndexType const* packedAcceptedDraftTokensIndices, runtime::SizeType32 const* pastKeyValueLengths, + void* const* pointerArray, KVBlockArray::DataType* offsetArray, runtime::SizeType32 layerCount, + runtime::SizeType32 seqCount, runtime::SizeType32 numKVHeads, runtime::SizeType32 sizeInBytesPerKVHead, + runtime::SizeType32 rewindDraftTokenCommonCount, runtime::SizeType32 const* rewindDraftTokenSeparateAdjustments, + runtime::SizeType32 const* seqSlotRemapping, runtime::SizeType32 const* batchSlots, + runtime::SizeType32 maxKVCacheLen, runtime::SizeType32 maxBlocksPerSeq, runtime::SizeType32 tokensPerBlock, + cudaStream_t stream); } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu index b79c5af45..1983c2262 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu @@ -131,7 +131,8 @@ void invokeStopWordsCriterion(TokenIdType const** outputIds, SizeType32 const** } __global__ void lengthCriterion(FinishedState* finished, SizeType32* finishedSum, SizeType32 const* sequenceLimitLength, - SizeType32* sequenceLengths, SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 beamWidth) + SizeType32* sequenceLengths, SizeType32* numNewTokens, SizeType32 const* batchSlots, SizeType32 batchSize, + SizeType32 beamWidth) { SizeType32 threadFinishedCount = 0; auto const batchIdx = blockIdx.x; @@ -144,10 +145,15 @@ __global__ void lengthCriterion(FinishedState* finished, SizeType32* finishedSum auto finishState = finished[batchSlotBeamWidthIdx]; - if (sequenceLengths[batchSlotBeamWidthIdx] >= sequenceLimitLength[batchSlot]) + auto const numTokensToLimit = sequenceLimitLength[batchSlot] - sequenceLengths[batchSlotBeamWidthIdx]; + if (numTokensToLimit <= 0) { finishState.setFinishedMaxLength(); sequenceLengths[batchSlotBeamWidthIdx] = sequenceLimitLength[batchSlot]; + if (numNewTokens) + { + numNewTokens[batchSlot] = numNewTokens[batchSlot] + numTokensToLimit; + } } threadFinishedCount += finishState.isFinished() ? 
1 : 0; finished[batchSlotBeamWidthIdx] = finishState; @@ -174,8 +180,8 @@ __global__ void lengthCriterion(FinishedState* finished, SizeType32* finishedSum } void invokeLengthCriterion(FinishedState* finished, SizeType32* finishedSum, SizeType32 const* sequenceLimitLength, - SizeType32* sequenceLengths, SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 beamWidth, - cudaStream_t stream) + SizeType32* sequenceLengths, SizeType32* numNewTokens, SizeType32 const* batchSlots, SizeType32 batchSize, + SizeType32 beamWidth, cudaStream_t stream) { // Check if we have attained the sequence length limit. If so, stop the // sequence. In addition, check if all sequences are stopped and return the @@ -184,12 +190,12 @@ void invokeLengthCriterion(FinishedState* finished, SizeType32* finishedSum, Siz dim3 grid{static_cast(batchSize)}; lengthCriterion<<>>( - finished, finishedSum, sequenceLimitLength, sequenceLengths, batchSlots, batchSize, beamWidth); + finished, finishedSum, sequenceLimitLength, sequenceLengths, numNewTokens, batchSlots, batchSize, beamWidth); sync_check_cuda_error(); } __global__ void explicitEOSCriterion(TokenIdType const** outputIds, TokenIdType const* endIds, FinishedState* finished, - SizeType32* sequenceLengths, SizeType32 const* tokensPerStep, SizeType32 const* batchSlots, SizeType32 batchSize, + SizeType32* sequenceLengths, SizeType32* numNewTokens, SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxTokensPerStep) { auto const batchIdx = blockIdx.x * blockDim.x + threadIdx.x; @@ -204,7 +210,7 @@ __global__ void explicitEOSCriterion(TokenIdType const** outputIds, TokenIdType return; } - auto const numTokens = tokensPerStep != nullptr ? tokensPerStep[batchSlot] : maxTokensPerStep; + auto const numTokens = numNewTokens != nullptr ? numNewTokens[batchSlot] : maxTokensPerStep; auto const endId = endIds[batchSlot]; auto const sequenceLength = sequenceLengths[batchSlot]; @@ -217,12 +223,17 @@ __global__ void explicitEOSCriterion(TokenIdType const** outputIds, TokenIdType { finished[batchSlot].setFinishedEOS(); sequenceLengths[batchSlot] = max(0, pos); + if (numNewTokens) + { + numNewTokens[batchSlot] = pos - posStart; + } + return; } } } void invokeExplicitEOSCriterion(TokenIdType const** outputIds, TokenIdType const* endIds, FinishedState* finished, - SizeType32* sequenceLengths, SizeType32 const* tokensPerStep, SizeType32 const* batchSlots, SizeType32 batchSize, + SizeType32* sequenceLengths, SizeType32* numNewTokens, SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxTokensPerStep, cudaStream_t stream) { TLLM_CHECK_WITH_INFO(beamWidth == 1, "Explicit EOS criterion does not support beam search"); @@ -233,7 +244,7 @@ void invokeExplicitEOSCriterion(TokenIdType const** outputIds, TokenIdType const grid.x = divUp(batchSize, blockSize); explicitEOSCriterion<<>>( - outputIds, endIds, finished, sequenceLengths, tokensPerStep, batchSlots, batchSize, maxTokensPerStep); + outputIds, endIds, finished, sequenceLengths, numNewTokens, batchSlots, batchSize, maxTokensPerStep); sync_check_cuda_error(); } diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h index c77b5a029..7d0ca4d8f 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h @@ -62,14 +62,16 @@ void invokeStopWordsCriterion(runtime::TokenIdType const** outputIds, runtime::S //! \param sequenceLimitLength input buffer [maxBatchSize]. Maximum sequence length. //! 
\param sequenceLengths input/output buffer [maxBatchSize, beamWidth]. //! Current sequence lengths of the request tokens. +//! \param numNewTokens output buffer [maxBatchSize], optional. Number of tokens per step for each request. +//! It is assumed that all requests have maxTokensPerStep tokens per step if nullptr. //! \param batchSlots input buffer[batchSize], optional. Indices of rows of data in memory pool //! \param batchSize batch size //! \param beamWidth beam width //! \param stream stream void invokeLengthCriterion(FinishedState* finished, runtime::SizeType32* finishedSum, runtime::SizeType32 const* sequenceLimitLength, runtime::SizeType32* sequenceLengths, - runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, - cudaStream_t stream); + runtime::SizeType32* numNewTokens, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, + runtime::SizeType32 beamWidth, cudaStream_t stream); //! \brief Sets finished states based on the endIds and adjusts sequence length to length before the first EOS token. //! Does not support beamWidth > 1 for now. @@ -81,7 +83,7 @@ void invokeLengthCriterion(FinishedState* finishe //! [maxBatchSize, beamWidth]. Finished states. Set to FinishedState::FINISHED_EOS if any new tokens contain EOS. //! \param sequenceLengths input/output buffer [maxBatchSize, beamWidth]. //! Current sequence lengths of the request tokens. -//! \param tokensPerStep input buffer [maxBatchSize], optional. Number of tokens per step for each request. +//! \param numNewTokens input/output buffer [maxBatchSize], optional. Number of tokens per step for each request. //! It is assumed that all requests have maxTokensPerStep tokens per step if nullptr. //! \param batchSlots input buffer[batchSize], optional. Indices of rows of data in memory pool //! \param batchSize batch size @@ -89,7 +91,7 @@ void invokeLengthCriterion(FinishedState* finishe //! \param maxTokensPerStep maximum number of tokens decoded per step //!
\param stream stream void invokeExplicitEOSCriterion(runtime::TokenIdType const** outputIds, runtime::TokenIdType const* endIds, - FinishedState* finished, runtime::SizeType32* sequenceLengths, runtime::SizeType32 const* tokensPerStep, + FinishedState* finished, runtime::SizeType32* sequenceLengths, runtime::SizeType32* numNewTokens, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxTokensPerStep, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/layers/banWordsLayer.cpp b/cpp/tensorrt_llm/layers/banWordsLayer.cpp index c4fa2bc80..2816f2bb6 100644 --- a/cpp/tensorrt_llm/layers/banWordsLayer.cpp +++ b/cpp/tensorrt_llm/layers/banWordsLayer.cpp @@ -16,22 +16,16 @@ */ #include "tensorrt_llm/layers/banWordsLayer.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/banBadWords.h" #include "tensorrt_llm/kernels/banRepeatNgram.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/layers/layerUtils.h" -#include - using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template @@ -97,30 +91,31 @@ void BanWordsLayer::freeBuffer() template void BanWordsLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto setupParams = std::dynamic_pointer_cast(baseSetupParams); std::vector batchSlotsVec(batchSize); std::iota(batchSlotsVec.begin(), batchSlotsVec.end(), 0); auto batchSlotsHost = batchSlots ? 
batchSlots : batchSlotsVec.data(); - auto const& penaltyParams = setupParams->penaltyParams; + auto const& banWordsParams = setupParams->banWordsParams; + TLLM_CHECK_WITH_INFO(banWordsParams, "banWordsParams for setup is not set"); bool const useNoRepeatNgramSize - = mDecodingMode.isUseNoRepeatNgramSize() && penaltyParams.noRepeatNgramSize.has_value(); + = mDecodingMode.isUseNoRepeatNgramSize() && banWordsParams->noRepeatNgramSize.has_value(); FillBuffers const fillBuffers{batchSize, mDecoderDomain.getBatchSize(), mStream}; mUseNoRepeatNgramSize |= useNoRepeatNgramSize; if (mUseNoRepeatNgramSize) { - fillBuffers(penaltyParams.noRepeatNgramSize, DefaultDecodingParams::getNoRepeatNgramSize(), mNoRepeatNgramSize, - mNoRepeatNgramSizeDevice, batchSlotsHost, std::make_pair(0.f, std::numeric_limits::max()), - "no_repeat_ngram_size"); + fillBuffers(banWordsParams->noRepeatNgramSize, DefaultDecodingParams::getNoRepeatNgramSize(), + mNoRepeatNgramSize, mNoRepeatNgramSizeDevice, batchSlotsHost, + std::make_pair(0.f, std::numeric_limits::max()), "no_repeat_ngram_size"); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template -void BanWordsLayer::banRepeatNGrams(Tensor& logits, std::shared_ptr const& outputs, - std::shared_ptr const& inputs, SizeType32 const* batchSlots, +void BanWordsLayer::banRepeatNGrams(Tensor& logits, std::shared_ptr const& outputs, + std::shared_ptr const& inputs, SizeType32 const* batchSlots, SizeType32 const* noRepeatNgramSizeDevice, DecoderDomain const& decoderDomain, SizeType32 maxSeqLen, bool useNoRepeatNgramSize, cudaStream_t stream) { @@ -129,11 +124,11 @@ void BanWordsLayer::banRepeatNGrams(Tensor& logits, std::shared_ptr(), outputs->output_ids_ptr.template getPtr(), + invokeBanRepeatNgram(logits.template getPtr(), outputs->outputIdsPtr.template getPtr(), reinterpret_cast( inputs->finished.value_or(Tensor{}).template getPtr()), - outputs->parent_ids_ptr.template getPtr(), batchSlots, - outputs->sequence_length->template getPtr(), decoderDomain.getBatchSize(), + outputs->parentIdsPtr.template getPtr(), batchSlots, + outputs->sequenceLength->template getPtr(), decoderDomain.getBatchSize(), decoderDomain.getBeamWidth(), maxSeqLen, noRepeatNgramSizeDevice, decoderDomain.getVocabSizePadded(), maxStep, stream); } @@ -141,39 +136,40 @@ void BanWordsLayer::banRepeatNGrams(Tensor& logits, std::shared_ptr -void BanWordsLayer::banBadWords(Tensor& logits, std::shared_ptr const& outputs, - std::shared_ptr const& inputs, SizeType32 const* batchSlots, - DecoderDomain const& decoderDomain, SizeType32 maxSeqLen, cudaStream_t stream) +void BanWordsLayer::banBadWords(Tensor& logits, std::shared_ptr const& outputs, + std::shared_ptr const& inputs, SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, + SizeType32 maxSeqLen, cudaStream_t stream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const maxBadWordsLength = inputs->max_bad_words_len; + auto const maxBadWordsLength = inputs->banWordsInputs->maxBadWordsLen; if (maxBadWordsLength) { - auto const** badWordsPtr = inputs->bad_words_ptr->template getPtr(); - auto const* badWordsLens = inputs->bad_words_lengths->template getPtr(); + auto const** badWordsPtr = inputs->banWordsInputs->badWordsPtr->template getPtr(); + auto const* badWordsLens = inputs->banWordsInputs->badWordsLengths->template getPtr(); - invokeBanBadWords((T*) logits.template getPtr(), - outputs->output_ids_ptr.template getPtr(), - decoderDomain.getBeamWidth() > 1 ? 
outputs->parent_ids_ptr.template getPtr() : nullptr, + invokeBanBadWords((T*) logits.template getPtr(), outputs->outputIdsPtr.template getPtr(), + decoderDomain.getBeamWidth() > 1 ? outputs->parentIdsPtr.template getPtr() : nullptr, batchSlots, decoderDomain.getBatchSize(), decoderDomain.getBeamWidth(), badWordsPtr, badWordsLens, maxBadWordsLength, decoderDomain.getVocabSizePadded(), - outputs->sequence_length->template getPtr(), maxSeqLen, stream); + outputs->sequenceLength->template getPtr(), maxSeqLen, stream); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template void BanWordsLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto inputs = std::dynamic_pointer_cast(baseInputs); - auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto outputs = std::dynamic_pointer_cast(baseOutputs); + + TLLM_CHECK_WITH_INFO(inputs->banWordsInputs, "banWordsInputs for forward is not set"); auto const localDecoderDomain = getLocalDecoderDomain(inputs, mDecoderDomain); - auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() - 1]; - auto batchSlots = inputs->batch_slots ? inputs->batch_slots->template getPtr() : nullptr; + auto const maxSeqLen = outputs->outputIds.shape[outputs->outputIds.shape.size() - 1]; + auto batchSlots = inputs->batchSlots ? inputs->batchSlots->template getPtr() : nullptr; banRepeatNGrams(inputs->logits.value(), outputs, inputs, batchSlots, mNoRepeatNgramSizeDevice, localDecoderDomain, maxSeqLen, mUseNoRepeatNgramSize, mStream); @@ -185,5 +181,4 @@ void BanWordsLayer::forwardAsync( template class BanWordsLayer; template class BanWordsLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/banWordsLayer.h b/cpp/tensorrt_llm/layers/banWordsLayer.h index 4630e03f1..f30288e7e 100644 --- a/cpp/tensorrt_llm/layers/banWordsLayer.h +++ b/cpp/tensorrt_llm/layers/banWordsLayer.h @@ -17,17 +17,14 @@ #pragma once -#include - #include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/layers/baseLayer.h" #include "tensorrt_llm/layers/decodingParams.h" -#include "tensorrt_llm/runtime/iTensor.h" -namespace tensorrt_llm -{ -namespace layers +#include + +namespace tensorrt_llm::layers { //! \brief Layer to ban specific words from being sampled. @@ -45,20 +42,21 @@ class BanWordsLayer : public BaseLayer ~BanWordsLayer() override; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) override; + std::shared_ptr const& baseSetupParams) override; //! 
\brief Modifies 'outputs->logits' in-place with -INF for banned words - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: void initialize(); void allocateBuffer(); void freeBuffer(); - static void banBadWords(tc::Tensor& logits, std::shared_ptr const& outputs, - std::shared_ptr const& params, runtime::SizeType32 const* batchSlots, + static void banBadWords(tc::Tensor& logits, std::shared_ptr const& outputs, + std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, runtime::SizeType32 maxSeqLen, cudaStream_t stream); - static void banRepeatNGrams(tc::Tensor& logits, std::shared_ptr const& outputs, - std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, + static void banRepeatNGrams(tc::Tensor& logits, std::shared_ptr const& outputs, + std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, runtime::SizeType32 const* noRepeatNgramSizeDevice, DecoderDomain const& decoderDomain, runtime::SizeType32 maxSeqLen, bool useNoRepeatNgramSize, cudaStream_t stream); @@ -76,5 +74,4 @@ class BanWordsLayer : public BaseLayer bool mUseNoRepeatNgramSize{false}; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/baseLayer.h b/cpp/tensorrt_llm/layers/baseLayer.h index 366299b42..b1ccea495 100644 --- a/cpp/tensorrt_llm/layers/baseLayer.h +++ b/cpp/tensorrt_llm/layers/baseLayer.h @@ -17,14 +17,14 @@ #pragma once #include "tensorrt_llm/common/allocator.h" -#include "tensorrt_llm/common/tensor.h" -#include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/common/cudaAllocator.h" #include "tensorrt_llm/layers/decodingParams.h" +#include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ -namespace layers +#include + +namespace tensorrt_llm::layers { class BaseLayer @@ -35,8 +35,17 @@ class BaseLayer BaseLayer(DecoderDomain const& decoderDomain, cudaStream_t stream, std::shared_ptr allocator) - : mStream(stream) + : mBufferManager(nullptr) + , mStream(stream) , mAllocator(std::move(allocator)) + , mDecoderDomain(std::move(decoderDomain)) + { + } + + BaseLayer(DecoderDomain const& decoderDomain, std::shared_ptr const& bufferManager) + : mBufferManager(bufferManager) + , mStream(mBufferManager->getStream().get()) + , mAllocator(std::make_shared(*mBufferManager)) , mDecoderDomain(decoderDomain) { } @@ -79,29 +88,36 @@ class BaseLayer //! \param setupParams shared pointer to params inherited from BaseSetupParams // clang-format on virtual void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, - runtime::SizeType32 const* batchSlots, std::shared_ptr setupParams) + runtime::SizeType32 const* batchSlots, std::shared_ptr const& setupParams) = 0; // clang-format off //! \brief Virtual function to execute layer async on GPU. //! There must be no stream synchronization inside this function. //! - //! \param outputs shared pointer to params inherited from BaseOutputParams + //! \param outputs shared pointer to params inherited from BaseDecodingOutputs //! \param inputs shared pointer to params inherited from BaseForwardParams // clang-format on - virtual void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) = 0; + virtual void forwardAsync( + std::shared_ptr const& outputs, std::shared_ptr const& inputs) + = 0; // clang-format off //! 
\brief Virtual function to execute layer synchronously on CPU / GPU. //! It is allowed (but not necessary) to synchronize on stream inside this function. //! It is targeted mainly for prototyping. //! - //! \param outputs shared pointer to params inherited from BaseOutputParams + //! \param outputs shared pointer to params inherited from BaseDecodingOutputs //! \param inputs shared pointer to params inherited from BaseForwardParams // clang-format on - virtual void forwardSync(std::shared_ptr outputs, std::shared_ptr inputs) {} + virtual void forwardSync( + std::shared_ptr const& outputs, std::shared_ptr const& inputs) + { + } protected: + // Buffer Manager + std::shared_ptr mBufferManager; // Cuda stream cudaStream_t mStream; // Memory allocator @@ -119,5 +135,4 @@ class BaseLayer bool mIsAllocateBuffer{false}; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/beamSearchLayer.cu b/cpp/tensorrt_llm/layers/beamSearchLayer.cu index 5600bfb16..5aaf77aad 100644 --- a/cpp/tensorrt_llm/layers/beamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/beamSearchLayer.cu @@ -14,19 +14,17 @@ * limitations under the License. */ -#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" #include "tensorrt_llm/layers/beamSearchLayer.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/layers/layerUtils.h" + #include using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template @@ -47,7 +45,7 @@ BeamSearchLayer::~BeamSearchLayer() template void BeamSearchLayer::setup(runtime::SizeType32 const batchSize, runtime::SizeType32 const beamWidth, - runtime::SizeType32 const* batchSlots, std::shared_ptr baseSetupParams) + runtime::SizeType32 const* batchSlots, std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK_WITH_INFO( @@ -65,12 +63,12 @@ void BeamSearchLayer::setup(runtime::SizeType32 const batchSize, runtime::Siz auto constexpr fltEpsilon = std::numeric_limits::epsilon(); FillBuffers const fillBuffers{batchSize, batchSize, mStream}; - fillBuffers(setupParams->beam_search_diversity_rate, DefaultDecodingParams::getBeamSearchDiversity(), + fillBuffers(setupParams->beamSearchDiversityRate, DefaultDecodingParams::getBeamSearchDiversity(), mDiversityRateHost, mDiversityRateDevice, (int*) nullptr, std::make_pair(-fltEpsilon, fltMax), "diveristy rate"); - fillBuffers(setupParams->length_penalty, DefaultDecodingParams::getLengthPenalty(), mLengthPenaltyHost, + fillBuffers(setupParams->lengthPenalty, DefaultDecodingParams::getLengthPenalty(), mLengthPenaltyHost, mLengthPenaltyDevice, (int*) nullptr, std::make_pair(fltMin, fltMax), "length penalty"); - fillBuffers(setupParams->early_stopping, DefaultDecodingParams::getEarlyStopping(), mEarlyStoppingHost, + fillBuffers(setupParams->earlyStopping, DefaultDecodingParams::getEarlyStopping(), mEarlyStoppingHost, mEarlyStoppingDevice, (int*) nullptr, std::make_pair(fltMin, fltMax), "early stopping"); mHasDiffRuntimeArgs = setupParams->hasDiffRuntimeArgs; @@ -80,7 +78,7 @@ void BeamSearchLayer::setup(runtime::SizeType32 const batchSize, runtime::Siz __global__ void updateCacheIndirectionKernel( int* tgtCI, int const* srcCI, BeamHypotheses bh, int const nMaxAttentionWindow, int const nSinkTokenLength) { - // Update indirections from steps `bh.inputLength[indexBatchBeam]` to step
`sequence_lengths[indexBatchBeam]` + // Update indirections from steps `bh.inputLength[indexBatchBeam]` to step `sequenceLengths[indexBatchBeam]` int const step = threadIdx.x + blockIdx.x * blockDim.x; int const indexBatchBeam = blockIdx.y; int const nBS{bh.nBatchSize}; @@ -88,7 +86,7 @@ __global__ void updateCacheIndirectionKernel( int const nMSL{bh.nMaxSeqLen}; int const indexBatch = indexBatchBeam / nBM; int const indexBeam = indexBatchBeam % nBM; - int const lastStep{bh.sequenceLengths[indexBatchBeam] - 1}; // the sequence_lengths is updated, need to minus 1 + int const lastStep{bh.sequenceLengths[indexBatchBeam] - 1}; // the sequenceLengths is updated, need to minus 1 // Return early when the indexBatchBeam or step is out of the bound // No update for the indices of context part since KV Cache is shared @@ -111,38 +109,38 @@ __global__ void updateCacheIndirectionKernel( template void BeamSearchLayer::forwardAsyncSingleRequest( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto ip = std::dynamic_pointer_cast(baseInputs); - auto op = std::dynamic_pointer_cast(baseOutputs); + auto op = std::dynamic_pointer_cast(baseOutputs); TLLM_CHECK_WITH_INFO(op->beamHypotheses, std::string("Output BeamHypotheses is not set.")); - TLLM_CHECK_WITH_INFO(op->sequence_length->template getPtr() != nullptr || mLengthPenaltyDevice == nullptr, + TLLM_CHECK_WITH_INFO(op->sequenceLength->template getPtr() != nullptr || mLengthPenaltyDevice == nullptr, std::string("Current sequence lengths must be set for length penalty computation.")); TLLM_CHECK_WITH_INFO(ip->ite == 0, "Pipeline Parallelism is not supported yet !"); BeamHypotheses& bh{*op->beamHypotheses}; // bh's members already initialized in op: *CBA, batchDones // bh's members not used in function: outputIds, logProbs, outputIdsUnfinish, parentIdsUnfinish - bh.nMaxBatchSize = static_cast(op->output_ids_ptr.shape[0]); - bh.nBatchSize = ip->logits.shape[0]; - bh.nBeamWidth = static_cast(op->output_ids_ptr.shape[1]); + bh.nMaxBatchSize = static_cast(op->outputIdsPtr.shape[0]); + bh.nBatchSize = ip->localBatchSize; + bh.nBeamWidth = static_cast(op->outputIdsPtr.shape[1]); bh.nIte = ip->ite; - bh.nMaxSeqLen = static_cast(op->output_ids_ptr.shape[2]); + bh.nMaxSeqLen = static_cast(op->outputIdsPtr.shape[2]); bh.nVocabSize = mVocabSizePadded; bh.diversityRates = mDiversityRateDevice; bh.lengthPenalties = mLengthPenaltyDevice; bh.earlyStoppings = mEarlyStoppingDevice; - bh.inputLengths = ip->input_lengths->template getPtr(); - bh.endIds = ip->end_ids.template getPtr(); - bh.logProbsTiled = (op->output_log_probs) ? op->output_log_probs->template getPtr() : nullptr; - bh.sequenceLengths = op->sequence_length->template getPtr(); - bh.cumLogProbs = op->cum_log_probs->template getPtr(); + bh.inputLengths = ip->inputLengths->template getPtr(); + bh.endIds = ip->endIds.template getPtr(); + bh.logProbsTiled = (op->outputLogProbs) ? 
op->outputLogProbs->template getPtr() : nullptr; + bh.sequenceLengths = op->sequenceLength->template getPtr(); + bh.cumLogProbs = op->cumLogProbs->template getPtr(); bh.finished = reinterpret_cast(op->finished->template getPtr()); - bh.outputIdsPtr = op->output_ids_ptr.template getPtr(); - bh.parentIdsPtr = op->parent_ids_ptr.template getPtr(); + bh.outputIdsPtr = op->outputIdsPtr.template getPtr(); + bh.parentIdsPtr = op->parentIdsPtr.template getPtr(); T const* logits = ip->logits.template getPtr(); T const* bias = static_cast(nullptr); @@ -155,11 +153,11 @@ void BeamSearchLayer::forwardAsyncSingleRequest( if (bh.nBeamWidth > 1) { - auto tgtCI = op->tgt_cache_indirection.template getPtr(); - auto srcCI = ip->src_cache_indirection.template getPtr(); + auto tgtCI = op->tgtCacheIndirection.template getPtr(); + auto srcCI = ip->srcCacheIndirection.template getPtr(); dim3 const grid(roundUp(bh.nMaxSeqLen, 32), bh.nBatchSize * bh.nBeamWidth); updateCacheIndirectionKernel<<>>( - tgtCI, srcCI, bh, ip->max_attention_window, ip->sink_token_length); + tgtCI, srcCI, bh, ip->maxAttentionWindow, ip->sinkTokenLength); sync_check_cuda_error(); } @@ -168,32 +166,29 @@ void BeamSearchLayer::forwardAsyncSingleRequest( template void BeamSearchLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto outputs = std::dynamic_pointer_cast(baseOutputs); - auto params = std::dynamic_pointer_cast(baseInputs); + auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto params = std::dynamic_pointer_cast(baseInputs); auto const localDecoderDomain = getLocalDecoderDomain(params, mDecoderDomain); - auto batchSlots = params->batch_slots ? params->batch_slots->template getPtr() : nullptr; - auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() - 1]; + auto batchSlots = params->batchSlots ? 
params->batchSlots->template getPtr() : nullptr; + auto const maxSeqLen = outputs->outputIds.shape[outputs->outputIds.shape.size() - 1]; auto const ite = params->ite; auto const step = params->step; // common inputs - auto const& endIds = params->end_ids; - auto const localBatchSize = static_cast(params->local_batch_size); + auto const& endIds = params->endIds; + auto const localBatchSize = static_cast(params->localBatchSize); TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() > 1, "Decoding mode is beam search, but beamWidth <= 1 (%d <= 1)", localDecoderDomain.getBeamWidth()); - TLLM_CHECK_WITH_INFO( - params->src_cache_indirection.has_value(), "src_cache_indirection is mandatory in beam search."); - TLLM_CHECK_WITH_INFO( - outputs->tgt_cache_indirection.has_value(), "tgt_cache_indirection is mandatory in beam search."); - TLLM_CHECK_WITH_INFO(outputs->parent_ids.has_value(), "parent_ids tensor is mandatory in beam search."); + TLLM_CHECK_WITH_INFO(params->srcCacheIndirection.has_value(), "srcCacheIndirection is mandatory in beam search."); + TLLM_CHECK_WITH_INFO(outputs->parentIds.has_value(), "parentIds tensor is mandatory in beam search."); TLLM_CHECK_WITH_INFO(outputs->finished.has_value(), "finished tensor is mandatory in beam search."); - TLLM_CHECK_WITH_INFO(outputs->cum_log_probs.has_value(), "cum_log_probs tensor is mandatory in beam search."); + TLLM_CHECK_WITH_INFO(outputs->cumLogProbs.has_value(), "cumLogProbs tensor is mandatory in beam search."); // Compute one by one if there are different runtime arguments // due to Batch-Beam-Search is not supported yet, so we need to compute @@ -211,30 +206,32 @@ void BeamSearchLayer::forwardAsync( auto const end_id_offset = endIds.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size); auto forwardParams = std::make_shared(step, ite, logits_offset, end_id_offset, - *params->src_cache_indirection, static_cast(params->max_attention_window), - static_cast(params->sink_token_length), static_cast(maxSeqLen)); + *params->srcCacheIndirection, static_cast(params->maxAttentionWindow), + static_cast(params->sinkTokenLength), static_cast(maxSeqLen), + dynamic_decode_batch_size); - if (params->input_lengths) + if (params->inputLengths) { - forwardParams->input_lengths = params->input_lengths->slice( + forwardParams->inputLengths = params->inputLengths->slice( {dynamic_decode_batch_size * localDecoderDomain.getBeamWidth()}, dynamic_id_offset); } - auto outputParams = std::make_shared( - outputs->output_ids, outputs->parent_ids.value(), outputs->tgt_cache_indirection.value()); + auto outputParams = std::make_shared(outputs->outputIds); - outputParams->output_ids_ptr = std::move(outputs->output_ids_ptr); - outputParams->parent_ids_ptr = std::move(outputs->parent_ids_ptr); - outputParams->sequence_length = outputs->sequence_length->slice( + outputParams->parentIds = std::move(outputs->parentIds); + outputParams->tgtCacheIndirection = std::move(outputs->tgtCacheIndirection); + outputParams->outputIdsPtr = std::move(outputs->outputIdsPtr); + outputParams->parentIdsPtr = std::move(outputs->parentIdsPtr); + outputParams->sequenceLength = outputs->sequenceLength->slice( {dynamic_decode_batch_size * localDecoderDomain.getBeamWidth()}, dynamic_id_offset); outputParams->finished = outputs->finished->slice( {dynamic_decode_batch_size * localDecoderDomain.getBeamWidth()}, dynamic_id_offset); - outputParams->cum_log_probs = outputs->cum_log_probs->slice( + outputParams->cumLogProbs = outputs->cumLogProbs->slice( {dynamic_decode_batch_size 
* localDecoderDomain.getBeamWidth()}, dynamic_id_offset); - outputParams->output_log_probs = outputs->output_log_probs_tiled; // notice: use tiled tensor + outputParams->outputLogProbs = outputs->outputLogProbsTiled; // notice: use tiled tensor outputParams->beamHypotheses = std::move(outputs->beamHypotheses); - // beam_search_diversity_rate is only supported when using BeamHypotheses + // beamSearchDiversityRate is only supported when using BeamHypotheses forwardAsyncSingleRequest(outputParams, forwardParams); } // end of dynamic_ite TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -275,5 +272,4 @@ void BeamSearchLayer::freeBuffer() template class BeamSearchLayer; template class BeamSearchLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/beamSearchLayer.h b/cpp/tensorrt_llm/layers/beamSearchLayer.h index 6bd9527a9..fbecb1305 100644 --- a/cpp/tensorrt_llm/layers/beamSearchLayer.h +++ b/cpp/tensorrt_llm/layers/beamSearchLayer.h @@ -17,71 +17,40 @@ #pragma once #include "tensorrt_llm/common/tensor.h" -#include "tensorrt_llm/kernels/beamSearchKernels.h" -#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/layers/baseLayer.h" #include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/runtime/common.h" -#include - #include +#include namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ -namespace layers -{ - -// BS: batch_size, lBS: local_batch_size, BM: beam_width, mSL: max_seq_length -class BeamSearchSetupParams : public BaseSetupParams +namespace tensorrt_llm::layers { -public: - std::optional> beam_search_diversity_rate; // [BS] on cpu - std::optional> length_penalty; // [BS] on cpu - std::optional> early_stopping; // [BS] on cpu - bool hasDiffRuntimeArgs{false}; -}; -class BeamSearchInputParams : public BaseInputParams +class BeamSearchInputParams : public DecodingInputs { public: explicit BeamSearchInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor logits, - tc::Tensor endIds, tc::Tensor src_cache_indirection, runtime::SizeType32 max_attention_window, - runtime::SizeType32 sink_token_length, runtime::SizeType32 max_seq_len) - : BaseInputParams(step, ite, std::move(endIds)) + tc::Tensor endIds, tc::Tensor srcCacheIndirection, runtime::SizeType32 maxAttentionWindow, + runtime::SizeType32 sinkTokenLength, runtime::SizeType32 maxSeqLen, runtime::SizeType32 localBatchSize) + : DecodingInputs(std::move(endIds), step, ite, localBatchSize) , logits{std::move(logits)} - , max_attention_window{max_attention_window} - , sink_token_length{sink_token_length} - , max_seq_len{max_seq_len} - , src_cache_indirection{std::move(src_cache_indirection)} + , maxAttentionWindow{maxAttentionWindow} + , sinkTokenLength{sinkTokenLength} + , maxSeqLen{maxSeqLen} + , srcCacheIndirection{std::move(srcCacheIndirection)} { } // mandatory parameters tc::Tensor logits; // [maxBatchSize, beamWidth, vocabSizePadded] - runtime::SizeType32 max_attention_window; - runtime::SizeType32 sink_token_length; - runtime::SizeType32 max_seq_len; - tc::Tensor src_cache_indirection; // [BS, BM, mSL] - std::optional input_lengths; // [BS, BM] -}; - -class BeamSearchOutputParams : public BaseOutputParams -{ -public: - explicit BeamSearchOutputParams(tc::Tensor outputIds, tc::Tensor parentIds, tc::Tensor tgt_cache_indirection) - : BaseOutputParams{std::move(outputIds)} - , parent_ids{std::move(parentIds)} - , tgt_cache_indirection{std::move(tgt_cache_indirection)} - { - } - - std::shared_ptr 
beamHypotheses; - tc::Tensor parent_ids; // [BS, BM, mSL] - tc::Tensor tgt_cache_indirection; // [BS, BM, mSL] - tc::Tensor parent_ids_ptr; // [BS][BM, mSL] + runtime::SizeType32 maxAttentionWindow; + runtime::SizeType32 sinkTokenLength; + runtime::SizeType32 maxSeqLen; + tc::Tensor srcCacheIndirection; // [BS, BM, mSL] + std::optional inputLengths; // [BS, BM] }; template @@ -94,15 +63,17 @@ class BeamSearchLayer : public BaseLayer ~BeamSearchLayer() override; - void setup(runtime::SizeType32 const batch_size, runtime::SizeType32 const beamWidth, - runtime::SizeType32 const* batchSlots, std::shared_ptr setupParams) override; + void setup(runtime::SizeType32 const batchSize, runtime::SizeType32 const beamWidth, + runtime::SizeType32 const* batchSlots, std::shared_ptr const& setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: - void forwardAsyncSingleRequest(std::shared_ptr outputs, std::shared_ptr inputs); + void forwardAsyncSingleRequest( + std::shared_ptr const& outputs, std::shared_ptr const& inputs); - void allocateBuffer(runtime::SizeType32 const batch_size, runtime::SizeType32 const beam_width); + void allocateBuffer(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth); void freeBuffer(); private: @@ -126,5 +97,4 @@ class BeamSearchLayer : public BaseLayer bool mHasDiffRuntimeArgs{false}; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/decodingLayer.cpp b/cpp/tensorrt_llm/layers/decodingLayer.cpp index 4b8db1e6f..402dc887a 100644 --- a/cpp/tensorrt_llm/layers/decodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/decodingLayer.cpp @@ -16,16 +16,13 @@ */ #include "tensorrt_llm/layers/decodingLayer.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/kernels/decodingCommon.h" -#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/layers/beamSearchLayer.h" #include "tensorrt_llm/layers/decodingParams.h" +#include "tensorrt_llm/layers/explicitDraftTokensLayer.h" #include "tensorrt_llm/layers/layerUtils.h" +#include "tensorrt_llm/layers/medusaDecodingLayer.h" #include "tensorrt_llm/layers/samplingLayer.h" -#include - using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; @@ -59,15 +56,14 @@ bool allSame(std::optional> const& vOpt) bool hasDiffRuntimeArgs(std::shared_ptr const& params) { - return !allSame(params->penaltyParams.frequencyPenalty) || !allSame(params->penaltyParams.presencePenalty) - || !allSame(params->penaltyParams.repetitionPenalty) || !allSame(params->penaltyParams.temperature) - || !allSame(params->penaltyParams.minLength) || !allSame(params->penaltyParams.noRepeatNgramSize); + // return !allSame(params->penaltyParams.frequencyPenalty) || !allSame(params->penaltyParams.presencePenalty) + // || !allSame(params->penaltyParams.repetitionPenalty) || !allSame(params->penaltyParams.temperature) + // || !allSame(params->penaltyParams.minLength) || !allSame(params->banWordsInputs.noRepeatNgramSize); + return false; } } // namespace -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template DecodingLayer::DecodingLayer(executor::DecodingMode const& mode, DecoderDomain const& decoderDomain, @@ -110,61 +106,40 @@ DecodingLayer::DecodingLayer(executor::DecodingMode const& 
mode, DecoderDomai template void DecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto setupParams = std::dynamic_pointer_cast(baseSetupParams); + TLLM_CHECK_WITH_INFO(setupParams->decodingParams, "decodingParams for setup is not set"); + if (mDecodingMode.isTopKorTopP()) { // sampling layers TLLM_CHECK_WITH_INFO( beamWidth == 1, "Decoding mode is TopK and/or TopP, but beamWidth != 1 (%d != 1)", beamWidth); - auto samplingParams = std::make_shared(); - - samplingParams->runtime_top_k = setupParams->samplingParams.runtime_top_k; - samplingParams->runtime_top_p = setupParams->samplingParams.runtime_top_p; - samplingParams->randomSeed = setupParams->randomSeed; - - samplingParams->top_p_decay = setupParams->samplingParams.top_p_decay; - samplingParams->top_p_min = setupParams->samplingParams.top_p_min; - samplingParams->top_p_reset_ids = setupParams->samplingParams.top_p_reset_ids; - samplingParams->normalize_log_probs = setupParams->samplingParams.normalize_log_probs; - samplingParams->outputLogProbs = setupParams->samplingParams.outputLogProbs; - samplingParams->cumLogProbs = setupParams->samplingParams.cumLogProbs; - - mDecodingLayer->setup(batchSize, beamWidth, batchSlots, samplingParams); + mDecodingLayer->setup(batchSize, beamWidth, batchSlots, setupParams->decodingParams); } else if (mDecodingMode.isBeamSearch()) { // beam search layer TLLM_CHECK_WITH_INFO(beamWidth > 1, "Decoding mode is beam search, but beamWidth <= 1 (%d <= 1)", beamWidth); - auto beamSearchParams = std::make_shared(); - - beamSearchParams->beam_search_diversity_rate = setupParams->beamSearchParams.beam_search_diversity_rate; - beamSearchParams->length_penalty = setupParams->beamSearchParams.length_penalty; - beamSearchParams->early_stopping = setupParams->beamSearchParams.early_stopping; - beamSearchParams->hasDiffRuntimeArgs = hasDiffRuntimeArgs(setupParams); - - mDecodingLayer->setup(batchSize, beamWidth, nullptr, beamSearchParams); + mDecodingLayer->setup(batchSize, beamWidth, nullptr, setupParams->decodingParams); } else if (mDecodingMode.isMedusa()) { - auto medusaSetupParams = std::make_shared(); - medusaSetupParams->runtimeTopK = setupParams->samplingParams.runtime_top_k; - medusaSetupParams->runtimeHeadsTopK = setupParams->medusaParams.topKMedusaHeads; - medusaSetupParams->randomSeed = setupParams->randomSeed; - mDecodingLayer->setup(batchSize, beamWidth, batchSlots, medusaSetupParams); + TLLM_CHECK_WITH_INFO(beamWidth == 1, "Decoding mode is Medusa, but beamWidth != 1 (%d != 1)", beamWidth); + mDecodingLayer->setup(batchSize, beamWidth, batchSlots, setupParams->decodingParams); } else if (mDecodingMode.isLookahead()) { + TLLM_CHECK_WITH_INFO(beamWidth == 1, "Decoding mode is Lookahead, but beamWidth != 1 (%d != 1)", beamWidth); // TODO(nkorobov) add lookahead layer } else if (mDecodingMode.isExplicitDraftTokens()) { - auto explicitDraftTokensSetupParams = std::make_shared(); - explicitDraftTokensSetupParams->temperature = setupParams->penaltyParams.temperature; - explicitDraftTokensSetupParams->randomSeed = setupParams->randomSeed; - mDecodingLayer->setup(batchSize, /* beamWidth */ 1, batchSlots, explicitDraftTokensSetupParams); + TLLM_CHECK_WITH_INFO( + beamWidth == 1, "Decoding mode is ExplicitDraftTokens, but beamWidth != 1 (%d != 1)", beamWidth); + mDecodingLayer->setup(batchSize, beamWidth, batchSlots, 
setupParams->decodingParams); } else { @@ -178,7 +153,7 @@ void DecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeTyp template void DecodingLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto [outputParams, inputParams] = prepareParams(baseOutputs, baseInputs); @@ -188,7 +163,7 @@ void DecodingLayer::forwardAsync( template void DecodingLayer::forwardSync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto [outputParams, inputParams] = prepareParams(baseOutputs, baseInputs); @@ -197,31 +172,30 @@ void DecodingLayer::forwardSync( } template -std::tuple, std::shared_ptr> DecodingLayer::prepareParams( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) const +std::tuple, std::shared_ptr> DecodingLayer::prepareParams( + std::shared_ptr const& baseOutputs, + std::shared_ptr const& baseInputs) const { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto outputs = std::dynamic_pointer_cast(baseOutputs); - auto params = std::dynamic_pointer_cast(baseInputs); + auto params = std::dynamic_pointer_cast(baseInputs); auto const localDecoderDomain = getLocalDecoderDomain(params, mDecoderDomain); - auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() - 1]; - auto const& endIds = params->end_ids; + auto const maxSeqLen = baseOutputs->outputIds.shape[baseOutputs->outputIds.shape.size() - 1]; + auto const& endIds = params->endIds; - std::shared_ptr preparedOutputs; - std::shared_ptr preparedInputs; + std::shared_ptr preparedOutputs; + std::shared_ptr preparedInputs; - // dynamic decode GPT if (mDecodingMode.isBeamSearch()) { preparedInputs = baseInputs; preparedOutputs = baseOutputs; } else if (mDecodingMode.isTopKorTopP()) - { // beamWidth == 1 + { auto const ite = params->ite; auto const step = params->step; - auto const localBatchSize = static_cast(params->local_batch_size); + auto const localBatchSize = static_cast(params->localBatchSize); TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() == 1, "Decoding mode is TopK and/or TopP, but beamWidth != 1 (%d != 1)", localDecoderDomain.getBeamWidth()); @@ -231,60 +205,29 @@ std::tuple, std::shared_ptr> Tensor const logitsSlice{params->logits->slice( {localBatchSize, static_cast(localDecoderDomain.getBeamWidth()), params->logits->shape[2]}, 0)}; Tensor const endIdSlice{endIds.slice({localBatchSize}, 0)}; - auto decodeInputs = std::make_shared( - step, ite, logitsSlice, endIdSlice, static_cast(maxSeqLen)); + auto decodeInputs = std::make_shared(endIdSlice, step, ite, localBatchSize); decodeInputs->finished = params->finished; - if (params->input_lengths) - { - auto& inputLengths = params->input_lengths.value(); - decodeInputs->input_lengths - = inputLengths.slice({localBatchSize, static_cast(localDecoderDomain.getBeamWidth())}, 0); - } - decodeInputs->batch_slots = params->batch_slots; + decodeInputs->logits = logitsSlice; - auto decodeOutputs = std::make_shared(outputs->output_ids); - decodeOutputs->output_ids_ptr = std::move(outputs->output_ids_ptr); - if (outputs->sequence_length) - { - decodeOutputs->sequence_length - = outputs->sequence_length->slice({localBatchSize * localDecoderDomain.getBeamWidth()}, 0); - } - if (outputs->finished) + if (params->inputLengths) { - decodeOutputs->finished = 
outputs->finished->slice({localBatchSize * localDecoderDomain.getBeamWidth()}, 0); - } - if (outputs->cum_log_probs) - { - decodeOutputs->cum_log_probs - = outputs->cum_log_probs->slice({localBatchSize * localDecoderDomain.getBeamWidth()}, 0); - } - if (outputs->output_log_probs_tiled) - { - Tensor& output_log_probs = outputs->output_log_probs_tiled.value(); - decodeOutputs->output_log_probs - = output_log_probs.slice({1, localBatchSize * localDecoderDomain.getBeamWidth()}, 0); + auto& inputLengths = params->inputLengths.value(); + decodeInputs->inputLengths + = inputLengths.slice({localBatchSize, static_cast(localDecoderDomain.getBeamWidth())}, 0); } + decodeInputs->batchSlots = params->batchSlots; preparedInputs = decodeInputs; - preparedOutputs = decodeOutputs; + preparedOutputs = baseOutputs; } else if (mDecodingMode.isMedusa()) { TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() == 1, "Decoding mode is Medusa, but beamWidth != 1 (%d != 1)", localDecoderDomain.getBeamWidth()); - auto medusaInputParams = std::make_shared(params->logits.value(), endIds); - medusaInputParams->finished = outputs->finished.value(); - medusaInputParams->batch_slots = params->batch_slots; - medusaInputParams->paths = params->medusaInputs->medusaPaths; - medusaInputParams->medusaLogits = params->medusaInputs->medusaLogits; - medusaInputParams->medusaCurTokensPerStep = params->medusaInputs->medusaCurTokensPerStep; - medusaInputParams->medusaTargetTokensPerStep = params->medusaInputs->medusaTargetTokensPerStep; - medusaInputParams->treeIds = params->medusaInputs->medusaTreeIds; - - preparedInputs = medusaInputParams; + preparedInputs = baseInputs; preparedOutputs = baseOutputs; } else if (mDecodingMode.isLookahead()) @@ -311,5 +254,4 @@ std::tuple, std::shared_ptr> template class DecodingLayer; template class DecodingLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/decodingLayer.h b/cpp/tensorrt_llm/layers/decodingLayer.h index 58473b12e..9828235ab 100644 --- a/cpp/tensorrt_llm/layers/decodingLayer.h +++ b/cpp/tensorrt_llm/layers/decodingLayer.h @@ -17,22 +17,13 @@ #pragma once -#include - -#include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/layers/baseLayer.h" -#include "tensorrt_llm/layers/beamSearchLayer.h" #include "tensorrt_llm/layers/decodingParams.h" -#include "tensorrt_llm/layers/explicitDraftTokensLayer.h" -#include "tensorrt_llm/layers/medusaDecodingLayer.h" -#include "tensorrt_llm/layers/samplingLayer.h" -namespace tc = tensorrt_llm::common; +#include -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { //! \brief Layer performs token decoding using sampling (beamWidth=1), beam search (beamWidth>1) or Medusa. @@ -46,19 +37,21 @@ class DecodingLayer : public BaseLayer ~DecodingLayer() override = default; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; //! \brief Calls single SamplingLayer::forwardAsync or MedusaDecodingLayer::forwardAsync in batched mode //! or runs BeamSearchLayer::forwardAsync in the loop for each request. //! Modifies outputs->logits in-place. - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; //! 
\brief Calls forwardSync of configired decoding layer. - void forwardSync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardSync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: - std::tuple, std::shared_ptr> prepareParams( - std::shared_ptr outputs, std::shared_ptr inputs) const; + [[nodiscard]] std::tuple, std::shared_ptr> prepareParams( + std::shared_ptr const& outputs, std::shared_ptr const& inputs) const; private: using BaseLayer::mWorkspaceSize; @@ -74,5 +67,4 @@ class DecodingLayer : public BaseLayer std::unique_ptr mDecodingLayer; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/decodingParams.h b/cpp/tensorrt_llm/layers/decodingParams.h index f1727d261..bf671fccd 100644 --- a/cpp/tensorrt_llm/layers/decodingParams.h +++ b/cpp/tensorrt_llm/layers/decodingParams.h @@ -16,12 +16,16 @@ #pragma once +#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/request.h" #include #include #include #include +#include #include namespace tc = tensorrt_llm::common; @@ -42,10 +46,11 @@ namespace tensorrt_llm::layers //! It is passed through `setup` method. //! 3. `forwardBatchSize` for layers forwarding for a batch of existing active requests. //! it is passed through `forwardAsync` and `forwardSync` methods. -//! `setup` and `forward` always provide `batch_slots` indexed by +//! `setup` and `forward` always provide `batchSlots` indexed by //! local batch index ranging in [0, setupBatchSize) or [0, forwardBatchSize), //! holding the global batch index ranging in [0, maxBatchSize). //! In case of beam search, maxBatchSize = forwardBatchSize = 1. 
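//!
//! A minimal sketch of the slot convention above (illustrative only, not part of this patch;
//! the host-side buffer names are assumptions): per-request state lives in buffers sized for
//! maxBatchSize, while a forward pass visits only the forwardBatchSize active requests listed
//! in batchSlots.
//!
//!     std::vector<runtime::SizeType32> batchSlotsHost{0, 2, 5}; // forwardBatchSize == 3
//!     std::vector<float> temperatureHost(/* maxBatchSize */ 8, 1.0f);
//!     for (runtime::SizeType32 bi = 0; bi < 3; ++bi)
//!     {
//!         auto const batchSlot = batchSlotsHost[bi]; // local index bi -> global slot
//!         temperatureHost[batchSlot] = 0.7f;         // per-request state is addressed by slot
//!     }
//!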
+ class DecoderDomain { public: @@ -56,7 +61,7 @@ class DecoderDomain , mBeamWidth(beamWidth) , mVocabSize(vocabSize) , mVocabSizePadded(vocabSizePadded.value_or(vocabSize)) - , mSpeculativeDecodingModule(speculativeDecodingModule) + , mSpeculativeDecodingModule(std::move(speculativeDecodingModule)) { } @@ -91,6 +96,11 @@ class DecoderDomain return mSpeculativeDecodingModule; } + [[nodiscard]] std::shared_ptr getSpeculativeDecodingModulePtr() const + { + return mSpeculativeDecodingModule; + } + private: runtime::SizeType32 mBatchSize; runtime::SizeType32 mBeamWidth; @@ -102,245 +112,444 @@ class DecoderDomain class BaseSetupParams { public: - virtual ~BaseSetupParams() {} + virtual ~BaseSetupParams() = default; +}; + +// Penalty layer +class PenaltySetupParams : public BaseSetupParams +{ +public: + std::optional> temperature; // [1] or [setupBatchSize] on cpu + std::optional> minLength; // [1] or [setupBatchSize] on cpu + std::optional> repetitionPenalty; // [1] or [setupBatchSize] on cpu + std::optional> presencePenalty; // [1] or [setupBatchSize] on cpu + std::optional> frequencyPenalty; // [1] or [setupBatchSize] on cpu +}; + +// Ban words layer +class BanWordsSetupParams : public BaseSetupParams +{ +public: + std::optional> noRepeatNgramSize; // [1] or [setupBatchSize] on cpu +}; + +class DecodingSetupParams : public BaseSetupParams +{ +public: + virtual ~DecodingSetupParams() = default; + + std::optional> randomSeed; // [1] or [setupBatchSize] on cpu + std::optional> outputLogProbs; // [setupBatchSize] + std::optional> cumLogProbs; // [setupBatchSize] +}; + +class SamplingSetupParams : public DecodingSetupParams +{ +public: + // baseSamplingLayer + std::optional> runtimeTopK; // [1] or [setupBatchSize] on cpu + std::optional> runtimeTopP; // [1] or [setupBatchSize] on cpu + + // topPSamplingLayer + std::optional> topPDecay; // [setupBatchSize], must between [0, 1] + std::optional> topPMin; // [setupBatchSize], must between [0, 1] + std::optional> topPResetIds; // [setupBatchSize] + std::optional normalizeLogProbs; +}; + +class BeamSearchSetupParams : public DecodingSetupParams +{ +public: + // BeamSearchLayer + std::optional> beamSearchDiversityRate; // [setupBatchSize] on cpu + std::optional> lengthPenalty; // [setupBatchSize] on cpu + std::optional> earlyStopping; // [setupBatchSize] on cpu + bool hasDiffRuntimeArgs{false}; +}; + +class MedusaSetupParams : public DecodingSetupParams +{ +public: + // Medusa params + std::optional> runtimeTopK; // [setupBatchSize] on cpu + std::optional>> runtimeHeadsTopK; // [setupBatchSize, maxMedusaHeads] +}; + +class ExplicitDraftTokensSetupParams : public DecodingSetupParams +{ +public: + std::optional> temperature; // [setupBatchSize] on cpu + // Hack to init some data for the context phase in the setup. 
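// (As the explicitDraftTokensLayer.cpp hunk further down shows, ExplicitDraftTokensLayer::setup
// passes these two tensors to invokeFillContextBuffers via fillContextBuffers, together with the
// layer's curand states and its per-request temperature buffer.)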
+ tc::Tensor randomDataSample; // [maxBatchSize], on gpu + tc::Tensor temperatures; // [maxBatchSize], on gpu }; class DynamicDecodeSetupParams : public BaseSetupParams { public: - // Penalty layer - struct PenaltyParams - { - std::optional> temperature; // [1] or [setupBatchSize] on cpu - std::optional> minLength; // [1] or [setupBatchSize] on cpu - std::optional> repetitionPenalty; // [1] or [setupBatchSize] on cpu - std::optional> presencePenalty; // [1] or [setupBatchSize] on cpu - std::optional> frequencyPenalty; // [1] or [setupBatchSize] on cpu - std::optional> noRepeatNgramSize; // [1] or [setupBatchSize] on cpu - }; - - struct SamplingParams - { - // baseSamplingLayer - std::optional> runtime_top_k; // [1] or [setupBatchSize] on cpu - std::optional> runtime_top_p; // [1] or [setupBatchSize] on cpu - - // topPSamplingLayer - std::optional> top_p_decay; // [setupBatchSize], must between [0, 1] - std::optional> top_p_min; // [setupBatchSize], must between [0, 1] - std::optional> top_p_reset_ids; // [setupBatchSize] - std::optional normalize_log_probs; - std::optional> outputLogProbs; // [setupBatchSize] - std::optional> cumLogProbs; // [setupBatchSize] - }; - - struct BeamSearchParams - { - // BeamSearchLayer - std::optional> beam_search_diversity_rate; // [setupBatchSize] on cpu - std::optional> length_penalty; // [setupBatchSize] on cpu - std::optional> early_stopping; // [setupBatchSize] on cpu - }; + std::shared_ptr penaltyParams; + + std::shared_ptr banWordsParams; - struct MedusaParams + std::shared_ptr decodingParams; +}; + +class LookaheadSetupParams : public DecodingSetupParams +{ +public: + std::vector prompt; // [batchSize][maxSeqLen] on cpu + std::optional> randomSeed; // [1] or [batchSize] on cpu + std::vector algoConfigs; // [1 or batchSize] on cpu +}; + +class BaseDecodingInputs +{ +public: + BaseDecodingInputs(runtime::SizeType32 localBatchSize) + : localBatchSize(localBatchSize) { - // Medusa params - std::optional>> - topKMedusaHeads; // [setupBatchSize, maxMedusaHeads] - }; + } - std::optional> randomSeed; // [1] or [setupBatchSize] on cpu + virtual ~BaseDecodingInputs() = default; - PenaltyParams penaltyParams; + runtime::SizeType32 localBatchSize; +}; + +// Ban words inputs +class BanWordsDecodingInputs : public BaseDecodingInputs +{ +public: + BanWordsDecodingInputs(runtime::SizeType32 localBatchSize) + : BaseDecodingInputs(localBatchSize) + { + } - SamplingParams samplingParams; + runtime::SizeType32 maxBadWordsLen{0}; + //! [maxBatchSize][2, bad_words_length], on gpu + std::optional badWordsPtr; + //! [maxBatchSize], on gpu + std::optional badWordsLengths; +}; - BeamSearchParams beamSearchParams; +// Stop criteria inputs +class StopCriteriaDecodingInputs : public BaseDecodingInputs +{ +public: + StopCriteriaDecodingInputs(runtime::SizeType32 localBatchSize) + : BaseDecodingInputs(localBatchSize) + { + } - MedusaParams medusaParams; + runtime::SizeType32 maxStopWordsLen{0}; + //! [maxBatchSize], on gpu + std::optional sequenceLimitLength; + //! [maxBatchSize][2, stop_words_length], on gpu + std::optional stopWordsPtr; + //! 
[maxBatchSize], on gpu + std::optional stopWordsLengths; }; -class BaseInputParams +class DecodingInputs : public BaseDecodingInputs { public: - explicit BaseInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor endIds) - : step{step} + DecodingInputs(tc::Tensor endIds, runtime::SizeType32 step = 0, runtime::SizeType32 ite = 0, + runtime::SizeType32 localBatchSize = 0, runtime::SizeType32 maxAttentionWindow = 0, + runtime::SizeType32 sinkTokenLength = 0) + : BaseDecodingInputs(localBatchSize) + , endIds{std::move(endIds)} + , step{step} , ite{ite} - , end_ids{std::move(endIds)} + , maxAttentionWindow{maxAttentionWindow} + , sinkTokenLength{sinkTokenLength} { } - virtual ~BaseInputParams() {} + //! [maxBatchSize] + tc::Tensor endIds; - // mandatory parameters + // used only for python runtime runtime::SizeType32 step; runtime::SizeType32 ite; - tc::Tensor end_ids; // [maxBatchSize] - std::optional batch_slots; // [forwardBatchSize], on pinned memory - std::optional finished; // [maxBatchSize, maxBeamWidth] + + // mandatory parameters + runtime::SizeType32 maxAttentionWindow; + runtime::SizeType32 sinkTokenLength; + + //! One of these two fields has to be set + //! DynamicDecodeLayer::forward checks for it + //! Need both of these fields to support legacy code during transition period to the batched decoder + //! [forwardBatchSize, beamWidth, vocabSizePadded] + std::optional logits; + //! [forwardBatchSize][beamWidth, vocabSizePadded], on gpu + std::optional> logitsVec; + + // optional parameters + //! the indices of the selected beams, mandatory for beam search, on gpu + //! [forwardBatchSize, maxBeamWidth, maxSeqLen] + std::optional srcCacheIndirection; + //! [vocabSizePadded], on gpu + std::optional embeddingBias; + //! [maxBatchSize, maxBeamWidth], on gpu + std::optional inputLengths; + //! [forwardBatchSize], on pinned memory + std::optional batchSlots; + //! [maxBatchSize, maxBeamWidth] + std::optional finished; + //! 
[maxBatchSize], on gpu + std::optional curTokensPerStep; + + std::shared_ptr banWordsInputs; + + std::shared_ptr stopCriteriaInputs; }; -class DynamicDecodeInputParams : public BaseInputParams +class SamplingInputs : public DecodingInputs { public: - DynamicDecodeInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, runtime::SizeType32 maxInputLength, - runtime::SizeType32 maxAttentionWindow, runtime::SizeType32 sinkTokenLength, runtime::SizeType32 localBatchSize, - tc::Tensor endIds) - : BaseInputParams(step, ite, std::move(endIds)) - , max_input_length{maxInputLength} - , max_attention_window{maxAttentionWindow} - , sink_token_length{sinkTokenLength} - , local_batch_size{localBatchSize} - , max_stop_words_len{0} - , max_bad_words_len{0} + explicit SamplingInputs( + tc::Tensor endIds, runtime::SizeType32 step, runtime::SizeType32 ite, runtime::SizeType32 localBatchSize) + : DecodingInputs{std::move(endIds), step, ite, localBatchSize} { } - // mandatory parameters - runtime::SizeType32 max_input_length; - runtime::SizeType32 max_attention_window; - runtime::SizeType32 sink_token_length; - runtime::SizeType32 local_batch_size; - runtime::SizeType32 max_stop_words_len; - runtime::SizeType32 max_bad_words_len; - - // One of these two fields has to be set - // DynamicDecodeLayer::forward checks for it - // Need both of these fields to support legacy code during transition period to the batched decoder - std::optional logits; // [maxBatchSize, beamWidth, vocabSizePadded] - std::optional> logits_vec; // [forwardBatchSize][beamWidth, vocabSizePadded], on gpu + //! optional parameters + //! [localBatchSize] + curandState_t* curandStates{}; + //! Pointer to the workspace for sampling computation + void* samplingWorkspace{}; + //! Flag to mark that logits tensor contains probabilities + bool probsComputed{}; +}; - // optional parameters - std::optional src_cache_indirection; // [forwardBatchSize, maxBeamWidth, maxSeqLen] - the k/v cache - // index for beam search, mandatory for beam search, on gpu - std::optional sequence_limit_length; // [maxBatchSize], on gpu - std::optional embedding_bias; // [vocabSizePadded], on gpu - std::optional input_lengths; // [maxBatchSize, maxBeamWidth], on gpu - std::optional bad_words_ptr; // [maxBatchSize][2, bad_words_length], on gpu - std::optional bad_words_lengths; // [maxBatchSize], on gpu - std::optional stop_words_ptr; // [maxBatchSize][2, stop_words_length], on gpu - std::optional stop_words_lengths; // [maxBatchSize], on gpu - - // Medusa inputs - class MedusaInputs +// Medusa inputs +class MedusaDecodingInputs : public DecodingInputs +{ +public: + explicit MedusaDecodingInputs(tc::Tensor endIds, runtime::SizeType32 localBatchSize) + : DecodingInputs(std::move(endIds), 0, 0, localBatchSize) { - public: - tc::Tensor medusaCurTokensPerStep; // [maxBatchSize], optional, on gpu - tc::Tensor medusaTargetTokensPerStep; // [maxBatchSize], optional, on gpu - tc::Tensor medusaPaths; // [maxBatchSize, maxPathLen, maxPathLen] - // optional, on gpu - tc::Tensor medusaTreeIds; // [maxBatchSize, maxDecodingTokens], optional, on gpu - std::vector> medusaLogits; // [maxBatchSize][maxDraftPathLen] - // [maxDecodingTokens, vocabSizePadded], optional, on gpu - }; - - // Explicit draft tokens inputs - // FIXME(nkorobov): this should be ExplicitDraftTokensBuffers? - class ExplicitDraftTokensInputs + } + + //! [maxBatchSize], on gpu + tc::Tensor targetTokensPerStep; + //! [maxBatchSize, maxPathLen, maxPathLen], on gpu + tc::Tensor paths; + //! 
[maxBatchSize, maxDecodingTokens], on gpu + tc::Tensor treeIds; + //! [maxBatchSize][maxDraftPathLen][maxDecodingTokens, vocabSizePadded], on gpu + std::vector> medusaLogits; +}; + +// Explicit draft tokens inputs +class ExplicitDraftTokensInputs : public DecodingInputs +{ +public: + explicit ExplicitDraftTokensInputs(tc::Tensor endIds, runtime::SizeType32 batchSize) + : DecodingInputs(std::move(endIds), 0, 0, batchSize) { - public: - }; + } - std::optional medusaInputs; + //! Draft tokens for the next iteration. The first token in each path is the last accepted token at current + //! iteration. E.g. if forwardBatchSize == 1, maxNumPaths == 2, maxPathLen== 3, [[[0, 1, 2], [0, 1, 10]]] + tc::Tensor nextDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu + //! Compressed form of `nextDraftTokens`, where common prefixes and collapsed. + //! Using example above [0, 1, 2, 10] + tc::Tensor nextFlatTokens; // [forwardBatchSize * maxDecodingTokens], gpu + //! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration. + //! Using example above, [[[0, 1, 2], [0, 1, 3]]] + tc::Tensor nextDraftIndices; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu + //! Probabilities of the next draft tokens. + tc::Tensor nextDraftProbs; // [forwardBatchSize, maxNumPaths, maxDraftPathLen, vocabSize], gpu + //! Same as `nextDraftTokens`, but for current iteration. + //! Current accepted tokens obtained as `lastDraftTokens[bi][bestPathIndices[bi]][1:bestPathLengths[bi]]`. + tc::Tensor lastDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu + //! Same as `nextDraftIndices`, but for current iteration. + tc::Tensor lastDraftIndices; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu + //! Boolean attention masks. + //! maxDecodingTokens' = generationLengths.max() + tc::Tensor masks; // [forwardBatchSize, maxDecodingTokens', maxDecodingTokens'], gpu + //! Relative to `positionIdsBase` position ids. Same as `nextFlatTokens` for next draft indices. + //! Using example above, [0, 1, 2, 3] + tc::Tensor packedPosIds; // [forwardBatchSize * maxDecodingTokens], gpu + //! Lengths of the accepted paths for each request. It is 1 for context phase (Only 1 primary tokens is accepted). + tc::Tensor bestPathLengths; // [forwardBatchSize], gpu + //! Indices of the accepted paths for each request. It is 0 for context phase. + tc::Tensor bestPathIndices; // [forwardBatchSize], gpu + //! Number of the draft tokens for the next iteration. + tc::Tensor generationLengths; // [forwardBatchSize], gpu + //! Baseline for the position ids. + tc::Tensor positionIdsBase; // [forwardBatchSize], gpu + //! Generation length for the previous stage. + tc::Tensor lastGenerationLengths; // [forwardBatchSize], gpu + //! Maximum number of generated tokens for the next step across whole batch + tc::Tensor maxGenLengthDevice; // [1], on gpu + //! Address map to map from linear indices of the engine outputs to seqSlot. + //! It is not the same as batchSlots because it maps the ordered engine outputs to the respective seqSlot, + //! while batchSlots is just a a list of active seqSlots. + tc::Tensor seqSlots; // [forwardBatchSize], on gpu +}; - std::optional explicitDraftTokensInputs; +class LookaheadDecodingInputs : public DecodingInputs +{ + using TensorConstPtr = runtime::ITensor::SharedConstPtr; + +public: + explicit LookaheadDecodingInputs(tc::Tensor endIds) + : DecodingInputs{std::move(endIds)} + //, logits{logits} + { + } + // TODO(liweim) reuse base logits and curTokensPerStep. 
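// (The base DecodingInputs above already declares optional `logits` and `curTokensPerStep`;
// the TODO presumably means reading those instead of the commented-out members below.)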
+ // TensorConstPtr logits; // [batchSize, maxTokensPerStep, vocabSizePadded] on gpu + // TensorConstPtr tokensPerStep; // [maxBatchSize] on gpu }; -class BaseOutputParams +class BaseDecodingOutputs { public: - explicit BaseOutputParams(tc::Tensor outputIds) - : output_ids{std::move(outputIds)} + explicit BaseDecodingOutputs(tc::Tensor outputIds) + : outputIds{std::move(outputIds)} { } - virtual ~BaseOutputParams() {} + virtual ~BaseDecodingOutputs() = default; // mandatory parameters - tc::Tensor output_ids; // [maxBatchSize, maxSeqLen] + tc::Tensor outputIds; // [maxBatchSize, maxSeqLen] // optional parameters - std::optional finished; // [maxBatchSize * maxBeamWidth], optional - std::optional sequence_length; // [maxBatchSize * maxBeamWidth], optional - std::optional cum_log_probs; // [maxBatchSize * maxBeamWidth], necessary in beam search - std::optional output_log_probs; // [maxBatchSize, maxBeamWidth, maxSeqLen], must be float*, optional - std::optional parent_ids; // [maxBatchSize, maxBeamWidth, maxSeqLen], necessary in beam search - - tc::Tensor output_ids_ptr; // [maxBatchSize] int* (2-d array), each int* has [maxBeamWidth, maxSeqLen] - - //! - //! \brief SpeculativeDecodingOutputs outputs. - //! - //! For one example sequence [a, b] [c] , where, [a, b, c] is the accepted sequence, - //! [c] is the last accepted token, and is the draft tokens from `nextDraftTokens` saved by last step. - //! [c]'s position id is known, only position ids for need to be provided in `nextDraftPosIds`. - //! LLM inputs {c, x, y, z} and generates {c', x', y', z'}. - //! - //! {c'} is always accepted and {x', z'} is supposed to be accepted. - //! The accepted tokens [c', x', z'] is saved in `output_ids` in-place, starting from `sequence_length`. - //! The `acceptedLength` is 3, and the accepted draft tokens length is 2. - //! `sequence_length` is also increaded by `acceptedLength` in-place. - //! The pathsOffset is {0, 1, 3} for {c', x', z'}. - //! [] for accepted, <> for draft, {} for input/output. - //! - //! For a batchSlots {1, 3}, `acceptedLengthsCumSum` is an exclusive sum of `acceptedLength` over the batch, - //! the `acceptedLengths` may be {3, 5}, `acceptedLengthsCumSum` is {0, 3, 8}. - class SpeculativeDecodingOutputs - { - public: - tc::Tensor nextDraftTokens; // [maxBatchSize, maxDecodingDraftTokens], draft tokens for the next step - tc::Tensor nextDraftPosIds; // [maxBatchSize, maxDecodingDraftTokens], draft token position IDs - tc::Tensor nextDraftLengths; // [maxBatchSize], next step draft tokens lengths - tc::Tensor acceptedLengths; // [maxBatchSize], lengths of the accepted draft tokens + 1. - tc::Tensor acceptedLengthsCumSum; // [maxBatchSize + 1] accumulative sum along batchSlots. - tc::Tensor pathsOffsets; // [maxBatchSize, maxPathLen] - tc::Tensor packedMasks; // [maxBatchSize, maxDecodingTokens, divUp(maxDecodingTokens, 32)] - }; - - class ExplicitDraftTokensOutputs : public SpeculativeDecodingOutputs + //! [maxBatchSize * maxBeamWidth], optional + std::optional finished; + //! [maxBatchSize * maxBeamWidth], optional + std::optional sequenceLength; + //! [maxBatchSize * maxBeamWidth], necessary in beam search + std::optional cumLogProbs; + //! [maxBatchSize, maxBeamWidth, maxSeqLen], must be float*, optional + std::optional outputLogProbs; + //! [maxBatchSize, maxBeamWidth, maxSeqLen], necessary in beam search + std::optional parentIds; + + //! [maxBatchSize] int* (2-d array), each int* has [maxBeamWidth, maxSeqLen] + tc::Tensor outputIdsPtr; + //! 
[maxBatchSize] int* (2-d array), each int* has [maxBeamWidth, maxSeqLen] + tc::Tensor parentIdsPtr; + + // Tokens predicted at current iteration. + tc::Tensor newTokens; // [maxBatchSize, maxBeamWidth] + + // optional parameters + //! Number of tokens predicted at current iteration. + //! [maxBatchSize] + std::optional numNewTokens; + //! [1] in pinned host memory + std::optional finishedSum; + //! [maxSeqLen, maxBatchSize, maxBeamWidth], must be float* + std::optional outputLogProbsTiled; +}; + +class BeamSearchOutputs : public BaseDecodingOutputs +{ +public: + explicit BeamSearchOutputs(tc::Tensor outputIds) + : BaseDecodingOutputs{std::move(outputIds)} { - public: - //! Draft tokens for the next iteration. The first token in each path is the last accepted token at current - //! iteration. E.g. if batchSize == 1, maxNumPaths == 2, maxPathLen== 3, [[[0, 1, 2], [0, 1, 10]]] - tc::Tensor unpackedNextDraftTokens; // [maxBatchSize, maxNumPaths, maxPathLen] on gpu - //! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration. - //! Using example above, [[[0, 1, 2], [0, 1, 3]]] - tc::Tensor unpackedNextDraftIndices; // [maxBatchSize, maxNumPaths, maxPathLen] on gpu - //! Probabilities of the next draft tokens. - tc::Tensor nextDraftProbs; // [maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] on gpu - //! Baseline for the position ids. - tc::Tensor positionIdsBase; // [maxBatchSize] on gpu - //! Randomly sampled data (between 0.f and 1.f) - tc::Tensor randomDataSample; // [maxBatchSize] on gpu - //! Randomly sampled data (between 0.f and 1.f) - tc::Tensor randomDataValidation; // [maxBatchSize, maxNumPaths, maxDraftPathLen] on gpu - //! Sampling temperature. - tc::Tensor temperatures; // [maxBatchSize] on gpu - }; - - std::optional speculativeDecodingOutputs; - - std::optional explicitDraftTokensOutputs; + } + + //! the k/v cache index for beam search + //! [forwardBatchSize, maxBeamWidth, maxSeqLen] + tc::Tensor tgtCacheIndirection; + //! structure maintains some pointers of beam search + std::unique_ptr beamHypotheses; }; -class DynamicDecodeOutputParams : public BaseOutputParams +//! +//! \brief SpeculativeDecodingOutputs outputs. +//! +//! For one example sequence [a, b] [c] , where, [a, b, c] is the accepted sequence, +//! [c] is the last accepted token, and is the draft tokens from `nextDraftTokens` saved by last step. +//! [c]'s position id is known, only position ids for need to be provided in `nextDraftPosIds`. +//! LLM inputs {c, x, y, z} and generates {c', x', y', z'}. +//! +//! {c'} is always accepted and {x', z'} is supposed to be accepted. +//! The accepted tokens [c', x', z'] is saved in `outputIds` in-place, starting from `sequenceLength`. +//! The `acceptedLength` is 3, and the accepted draft tokens length is 2. +//! `sequenceLength` is also increaded by `acceptedLength` in-place. +//! The pathsOffset is {0, 1, 3} for {c', x', z'}. +//! [] for accepted, <> for draft, {} for input/output. +//! +//! For a batchSlots {1, 3}, `numNewTokensCumSum` is an exclusive sum of `numNewTokens` over the batch, +//! the `numNewTokens` may be {3, 5}, `numNewTokensCumSum` is {0, 3, 8}. +//! +//! `nextDraftLengths` and `prevDraftLengths` are needed for methods that support if variable +//! draft length. `nextDraftLengths` must contain the number of draft tokens per request for the next iteration. +//! `prevDraftLengths` must contain the number of draft tokens used in the current iteraiton. +//! +//! `pathsOffsets` is needed for KV cache rewind. 
It contains the positions of the accepted draft tokens in the +//! flattened tensor of draft tokens. E.g. if for sequence {c, x, y, z} only `y` and `z` were accepted, +//! `pathsOffsets` contains [1, 2]. `pathsOffsets` is flattened tensor for whole batch. +//! +//! The order of `pathsOffsets` and `numNewTokensCumSum` must be aligned. Such that +//! `pathsOffset[numNewTokensCumSum[bi]:numNewTokensCumSum[bi+1]]` is the slice of offsets for `bi`th request. +//! Furthermore, the order of requests is important and must be aligned with sorted `RuntimeBuffers::seqSlots` +//! such that the request with smaller `seqSlot` stays earlier in the tensors. +//! However, this condition usually holds if method does not expect from the engine anything else, but logits. +class SpeculativeDecodingOutputs : public BaseDecodingOutputs { public: - explicit DynamicDecodeOutputParams(tc::Tensor outputIds) - : BaseOutputParams{std::move(outputIds)} + explicit SpeculativeDecodingOutputs(tc::Tensor outputIds) + : BaseDecodingOutputs{std::move(outputIds)} { } - // mandatory parameters - tc::Tensor newTokens; // [maxBatchSize, maxBeamWidth] - // optional parameters - std::optional finished_sum; // [1] in pinned host memory - std::optional output_log_probs_tiled; // [maxSeqLen, maxBatchSize, maxBeamWidth], must be float* - std::optional - tgt_cache_indirection; // [forwardBatchSize, maxBeamWidth, maxSeqLen], the k/v cache index for beam search - std::unique_ptr beamHypotheses; // structure maintains some pointers of beam search + //! Draft tokens for the next step + // [maxBatchSize, maxDecodingDraftTokens] + tc::Tensor nextDraftTokens; + //! Draft token position IDs + //! [maxBatchSize, maxDecodingDraftTokens] + tc::Tensor nextDraftPosIds; + //! Prev step draft tokens lengths, should be filled only for variable draft length speculative decoding mode + //! [maxBatchSize] + tc::Tensor prevDraftLengths; + //! Next step draft tokens lengths, should be filled only for variable draft length speculative decoding mode + //! [maxBatchSize] + tc::Tensor nextDraftLengths; + //! Accumulative sum along batchSlots. + //! [maxBatchSize + 1] + tc::Tensor numNewTokensCumSum; + //! [maxBatchSize * maxPathLen] + tc::Tensor pathsOffsets; + //! [maxBatchSize, maxDecodingTokens, divUp(maxDecodingTokens, 32)] + tc::Tensor packedMasks; +}; + +class ExplicitDraftTokensOutputs : public SpeculativeDecodingOutputs +{ +public: + explicit ExplicitDraftTokensOutputs(tc::Tensor outputIds) + : SpeculativeDecodingOutputs{std::move(outputIds)} + { + } - tc::Tensor parent_ids_ptr; // [maxBatchSize] int* (2-d array), each int* has [maxBeamWidth, maxSeqLen] + //! Draft tokens for the next iteration. The first token in each path is the last accepted token at current + //! iteration. E.g. if batchSize == 1, maxNumPaths == 2, maxPathLen== 3, [[[0, 1, 2], [0, 1, 10]]] + tc::Tensor unpackedNextDraftTokens; // [maxBatchSize, maxNumPaths, maxPathLen] on gpu + //! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration. + //! Using example above, [[[0, 1, 2], [0, 1, 3]]] + tc::Tensor unpackedNextDraftIndices; // [maxBatchSize, maxNumPaths, maxPathLen] on gpu + //! Probabilities of the next draft tokens. + tc::Tensor nextDraftProbs; // [maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] on gpu + //! Baseline for the position ids. + tc::Tensor positionIdsBase; // [maxBatchSize] on gpu + //! Randomly sampled data (between 0.f and 1.f) + tc::Tensor randomDataSample; // [maxBatchSize] on gpu + //! 
Randomly sampled data (between 0.f and 1.f) + tc::Tensor randomDataValidation; // [maxBatchSize, maxNumPaths, maxDraftPathLen] on gpu + //! Sampling temperature. + tc::Tensor temperatures; // [maxBatchSize] on gpu + //! Next generation lengths. + tc::Tensor generationLengths; // [maxBatchSize] on gpu + //! Maximum number of generated tokens for the next step across whole batch + tc::Tensor maxGenLengthHost; // [1] on pinned }; } // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp index c50762fff..0dd41361d 100644 --- a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp +++ b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp @@ -15,22 +15,20 @@ */ #include "tensorrt_llm/layers/dynamicDecodeLayer.h" -#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/kernels/decodingKernels.h" -#include "tensorrt_llm/layers/beamSearchLayer.h" -#include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/layers/layerUtils.h" #include "tensorrt_llm/layers/layersFactory.h" #include "tensorrt_llm/runtime/bufferManager.h" -#include "tensorrt_llm/runtime/cudaStream.h" + +#include +#include using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template @@ -111,17 +109,18 @@ void DynamicDecodeLayer::initializeLayers() template void DynamicDecodeLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto setupParams = std::dynamic_pointer_cast(baseSetupParams); - if (setupParams->samplingParams.outputLogProbs) + TLLM_CHECK_WITH_INFO(setupParams->decodingParams, "decodingParams for setup is not set"); + if (setupParams->decodingParams->outputLogProbs) { // FIXME(nkorobov): monotonically growing - mOutputLogProbs = std::any_of(setupParams->samplingParams.outputLogProbs->begin(), - setupParams->samplingParams.outputLogProbs->end(), + mOutputLogProbs = std::any_of(setupParams->decodingParams->outputLogProbs->begin(), + setupParams->decodingParams->outputLogProbs->end(), [this](bool outputLogProbs) { return this->mOutputLogProbs | outputLogProbs; }); } @@ -153,20 +152,19 @@ void DynamicDecodeLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, Si template void DynamicDecodeLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto params = std::dynamic_pointer_cast(baseInputs); - auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto params = std::dynamic_pointer_cast(baseInputs); - TLLM_CHECK_WITH_INFO(mDecodingMode.isExplicitDraftTokens() || params->logits || params->logits_vec, - "If not explicit Draft Tokens mode, either logits or logits_vec have to be specified."); + TLLM_CHECK_WITH_INFO(mDecodingMode.isExplicitDraftTokens() || params->logits || params->logitsVec, + "If not explicit Draft Tokens mode, either logits or logitsVec have to be specified."); TLLM_CHECK_WITH_INFO( - outputs->sequence_length.has_value(), "sequence_length tensor is required in DynamicDecoderLayer."); + baseOutputs->sequenceLength.has_value(), "sequenceLength tensor is required in DynamicDecoderLayer."); auto const 
localDecoderDomain = getLocalDecoderDomain(params, mDecoderDomain); - auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() - 1]; + auto const maxSeqLen = baseOutputs->outputIds.shape[baseOutputs->outputIds.shape.size() - 1]; TLLM_CHECK_WITH_INFO((mConfiguredBeamWidth == 1 && localDecoderDomain.getBeamWidth() == 1) || (mConfiguredBeamWidth > 1 && localDecoderDomain.getBeamWidth() > 1 @@ -185,12 +183,12 @@ void DynamicDecodeLayer::forwardAsync( std::vector batchSlotsVec(localDecoderDomain.getBatchSize()); std::iota(batchSlotsVec.begin(), batchSlotsVec.end(), 0); auto batchSlotsHost - = params->batch_slots ? params->batch_slots->template getPtr() : batchSlotsVec.data(); - auto batchSlots = params->batch_slots ? params->batch_slots->template getPtr() : nullptr; + = params->batchSlots ? params->batchSlots->template getPtr() : batchSlotsVec.data(); + auto batchSlots = params->batchSlots ? params->batchSlots->template getPtr() : nullptr; mCyclicStep = mCyclicStep % mRuntimeMaxSeqLen; prepareIdsPtrs( - outputs, batchSlotsHost, localDecoderDomain.getBatchSize(), localDecoderDomain.getBeamWidth(), maxSeqLen); + baseOutputs, batchSlotsHost, localDecoderDomain.getBatchSize(), localDecoderDomain.getBeamWidth(), maxSeqLen); for (auto& layer : mLayers) { @@ -198,7 +196,7 @@ void DynamicDecodeLayer::forwardAsync( } // Copy nextIds and transpose logits when needed - prepareOutputData(outputs, params, mIdsPtrHost, batchSlots, localDecoderDomain.getBatchSize(), + prepareOutputData(baseOutputs, params, mIdsPtrHost, batchSlots, localDecoderDomain.getBatchSize(), mDecoderDomain.getBatchSize(), localDecoderDomain.getBeamWidth(), maxSeqLen, mDecoderDomain.getMaxDecodingTokens(), mCyclicStep, mOutputLogProbs, mStream); @@ -210,7 +208,7 @@ void DynamicDecodeLayer::forwardAsync( template void DynamicDecodeLayer::forwardSync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); for (auto& layer : mLayers) @@ -221,7 +219,7 @@ void DynamicDecodeLayer::forwardSync( } template -void DynamicDecodeLayer::prepareIdsPtrs(std::shared_ptr const& outputs, +void DynamicDecodeLayer::prepareIdsPtrs(std::shared_ptr const& outputs, SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 beamWidth, SizeType32 maxSeqLen) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -231,7 +229,7 @@ void DynamicDecodeLayer::prepareIdsPtrs(std::shared_ptroutput_ids.template getPtrWithOffset(batchSlot * beamWidth * maxSeqLen); + = outputs->outputIds.template getPtrWithOffset(batchSlot * beamWidth * maxSeqLen); } for (SizeType32 bi = 0; bi < batchSize; bi++) { @@ -239,7 +237,7 @@ void DynamicDecodeLayer::prepareIdsPtrs(std::shared_ptr 1) { idsPtrHost[mDecoderDomain.getBatchSize() + batchSlot] - = outputs->parent_ids.value().template getPtrWithOffset(bi * beamWidth * maxSeqLen); + = outputs->parentIds.value().template getPtrWithOffset(bi * beamWidth * maxSeqLen); } else { @@ -247,11 +245,11 @@ void DynamicDecodeLayer::prepareIdsPtrs(std::shared_ptroutput_ids_ptr = Tensor(MEMORY_GPU, DataType::TYPE_INT32_PTR, + outputs->outputIdsPtr = Tensor(MEMORY_GPU, DataType::TYPE_INT32_PTR, {static_cast(mDecoderDomain.getBatchSize()), static_cast(beamWidth), static_cast(maxSeqLen)}, idsPtrHost); - outputs->parent_ids_ptr = Tensor(MEMORY_GPU, DataType::TYPE_INT32_PTR, + outputs->parentIdsPtr = Tensor(MEMORY_GPU, DataType::TYPE_INT32_PTR, {static_cast(mDecoderDomain.getBatchSize()), 
static_cast(beamWidth), static_cast(maxSeqLen)}, idsPtrHost + mDecoderDomain.getBatchSize()); @@ -259,29 +257,28 @@ void DynamicDecodeLayer::prepareIdsPtrs(std::shared_ptr -void DynamicDecodeLayer::prepareOutputData(std::shared_ptr const& outputs, - std::shared_ptr const& params, runtime::ITensor::SharedPtr const& idsPtrsHost, +void DynamicDecodeLayer::prepareOutputData(std::shared_ptr const& outputs, + std::shared_ptr const& params, runtime::ITensor::SharedPtr const& idsPtrsHost, SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, SizeType32 cyclicStep, bool outputLogProbs, cudaStream_t stream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto idsPtrHostSlice = ITensor::slice(idsPtrsHost, cyclicStep, 1); auto idsPtrHost = reinterpret_cast(runtime::bufferCast(*idsPtrHostSlice)); - auto const numNewTokens = outputs->speculativeDecodingOutputs - ? outputs->speculativeDecodingOutputs->acceptedLengths.template getPtr() - : nullptr; + auto const numNewTokens + = outputs->numNewTokens ? outputs->numNewTokens->template getPtr() : nullptr; invokeCopyNextStepIds(outputs->newTokens.template getPtr(), idsPtrHost, - outputs->sequence_length->template getPtr(), numNewTokens, batchSlots, batchSize, maxBatchSize, + outputs->sequenceLength->template getPtr(), numNewTokens, batchSlots, batchSize, maxBatchSize, beamWidth, maxSeqLen, maxTokensPerStep, stream); // Transpose output log probs from [maxSeqLen, batchSize, beamWidth] to [batchSize, beamWidth, maxSeqLen] - if (outputLogProbs && outputs->output_log_probs_tiled) + if (outputLogProbs && outputs->outputLogProbsTiled) { - auto logProbsMaxSeqLen = outputs->output_log_probs_tiled.value().shape[0]; + auto logProbsMaxSeqLen = outputs->outputLogProbsTiled.value().shape[0]; - invokeTransposeLogProbs(outputs->output_log_probs.value().template getPtr(), - outputs->output_log_probs_tiled.value().template getPtr(), - outputs->sequence_length->template getPtr(), batchSlots, batchSize, maxBatchSize, beamWidth, + invokeTransposeLogProbs(outputs->outputLogProbs.value().template getPtr(), + outputs->outputLogProbsTiled.value().template getPtr(), + outputs->sequenceLength->template getPtr(), batchSlots, batchSize, maxBatchSize, beamWidth, logProbsMaxSeqLen, stream); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -290,5 +287,4 @@ void DynamicDecodeLayer::prepareOutputData(std::shared_ptr; template class DynamicDecodeLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.h b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.h index 797295999..d7ee79039 100644 --- a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.h +++ b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.h @@ -16,35 +16,14 @@ #pragma once -#include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/executor/types.h" -#include "tensorrt_llm/layers/banWordsLayer.h" #include "tensorrt_llm/layers/baseLayer.h" -#include "tensorrt_llm/layers/beamSearchLayer.h" -#include "tensorrt_llm/layers/decodingLayer.h" -#include "tensorrt_llm/layers/layerUtils.h" -#include "tensorrt_llm/layers/medusaDecodingLayer.h" #include "tensorrt_llm/layers/penaltyLayer.h" -#include "tensorrt_llm/layers/samplingLayer.h" -#include "tensorrt_llm/layers/stopCriteriaLayer.h" -#include "tensorrt_llm/runtime/cudaStream.h" #include "tensorrt_llm/runtime/iTensor.h" -#include -#include -#include -#include - namespace tc = tensorrt_llm::common; -namespace 
tensorrt_llm -{ -namespace kernels -{ -struct BeamHypotheses; -} - -namespace layers +namespace tensorrt_llm::layers { template @@ -59,11 +38,13 @@ class DynamicDecodeLayer : public BaseLayer ~DynamicDecodeLayer() override; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; - void forwardSync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardSync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; // Function is only used by test. // It is guaranteed by LayersFactory that the first layer is the Penalty layer. @@ -79,11 +60,10 @@ class DynamicDecodeLayer : public BaseLayer void initialize(); void initializeLayers(); - void prepareIdsPtrs(std::shared_ptr const& outputs, - runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, - runtime::SizeType32 maxSeqLen); - static void prepareOutputData(std::shared_ptr const& outputs, - std::shared_ptr const& params, runtime::ITensor::SharedPtr const& idsPtrsHost, + void prepareIdsPtrs(std::shared_ptr const& outputs, runtime::SizeType32 const* batchSlots, + runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxSeqLen); + static void prepareOutputData(std::shared_ptr const& outputs, + std::shared_ptr const& params, runtime::ITensor::SharedPtr const& idsPtrsHost, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 maxBatchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxSeqLen, runtime::SizeType32 maxTokensPerStep, runtime::SizeType32 cyclicStep, bool outputLogProbs, cudaStream_t stream); @@ -109,5 +89,4 @@ class DynamicDecodeLayer : public BaseLayer runtime::SizeType32 mConfiguredBeamWidth{-1}; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.cpp b/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.cpp index 3d160cd78..6f13529a4 100644 --- a/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.cpp +++ b/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.cpp @@ -18,13 +18,10 @@ #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" -#include "tensorrt_llm/kernels/decodingKernels.h" -#include "tensorrt_llm/kernels/penaltyKernels.h" +#include "tensorrt_llm/kernels/penaltyTypes.h" #include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/layers/layerUtils.h" -#include "tensorrt_llm/runtime/bufferManager.h" -#include "tensorrt_llm/runtime/iBuffer.h" #include @@ -67,23 +64,28 @@ void ExplicitDraftTokensLayer::allocateBuffer() mTemperature.resize(mDecoderDomain.getBatchSize()); - mScanWorkspaceSizeInBytes = invokeScanSpecDecodingGenerationLengths( + mScanWorkspaceSizeInBytes = invokeScanGenerationLengths( nullptr, mScanWorkspaceSizeInBytes, nullptr, nullptr, mDecoderDomain.getBatchSize(), mStream); - mReduceWorkspaceSizeInBytes = invokeReduceMaxSpecDecodingGenerationLengths( + mReduceWorkspaceSizeInBytes = invokeReduceMaxGenerationLengths( nullptr, mReduceWorkspaceSizeInBytes, nullptr, nullptr, 
mDecoderDomain.getBatchSize(), mStream); mWorkspaceSizeInBytes = std::max(mScanWorkspaceSizeInBytes, mReduceWorkspaceSizeInBytes); - std::array deviceBufferSizes + std::array deviceBufferSizes = {sizeof(curandState_t) * mDecoderDomain.getBatchSize(), sizeof(uint64_t) * mDecoderDomain.getBatchSize(), mWorkspaceSizeInBytes, sizeof(SizeType32) * mDecoderDomain.getBatchSize(), sizeof(SizeType32), - sizeof(float) * mDecoderDomain.getBatchSize()}; + sizeof(float) * mDecoderDomain.getBatchSize(), sizeof(SizeType32) * mDecoderDomain.getBatchSize(), + sizeof(SizeType32) * mDecoderDomain.getBatchSize() + * mDecoderDomain.getSpeculativeDecodingModule()->getMaxNumPaths() + * mDecoderDomain.getSpeculativeDecodingModule()->getMaxPathLen()}; mCurandStatesDevice = mAllocator->reMalloc(mCurandStatesDevice, deviceBufferSizes[0], false); mRandomSeedsDevice = mAllocator->reMalloc(mRandomSeedsDevice, deviceBufferSizes[1], false); mWorkspaceDevice = mAllocator->reMalloc(mWorkspaceDevice, deviceBufferSizes[2], false); mGenerationLengthInclusiveSum = mAllocator->reMalloc(mGenerationLengthInclusiveSum, deviceBufferSizes[3], false); mMaxGenerationLength = mAllocator->reMalloc(mMaxGenerationLength, deviceBufferSizes[4], false); mTemperatureDevice = mAllocator->reMalloc(mTemperatureDevice, deviceBufferSizes[5], false); + mBestPathIndicesSlots = mAllocator->reMalloc(mBestPathIndicesSlots, deviceBufferSizes[6], false); + mLastDraftIndicesSlots = mAllocator->reMalloc(mLastDraftIndicesSlots, deviceBufferSizes[7], false); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -99,13 +101,15 @@ void ExplicitDraftTokensLayer::freeBuffer() mAllocator->free((void**) (&mGenerationLengthInclusiveSum)); mAllocator->free((void**) (&mMaxGenerationLength)); mAllocator->free((void**) (&mTemperatureDevice)); + mAllocator->free((void**) (&mBestPathIndicesSlots)); + mAllocator->free((void**) (&mLastDraftIndicesSlots)); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template void ExplicitDraftTokensLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -139,17 +143,40 @@ void ExplicitDraftTokensLayer::setup(SizeType32 batchSize, SizeType32 beamWid fillBuffers(setupParams->temperature, DefaultDecodingParams::getTemperature(), mTemperature, mTemperatureDevice, batchSlots, getLimitsPenalty(DecodingPenaltyType::Temperature), "temperature penalty"); + fillContextBuffers(batchSize, batchSlots, *setupParams); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExplicitDraftTokensLayer::fillContextBuffers( + SizeType32 batchSize, SizeType32 const* batchSlots, ExplicitDraftTokensSetupParams const& setupParams) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + FillContextExplicitDraftTokensParams params; + params.randDataSample = setupParams.randomDataSample.template getPtr(); + params.outputTemperatures = setupParams.temperatures.template getPtr(); + params.inputTemperatures = mTemperatureDevice; + params.curandState = mCurandStatesDevice; + params.batchSlots = batchSlots; + params.batchSize = batchSize; + + params.checkParams(); + + invokeFillContextBuffers(params, mStream); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template void ExplicitDraftTokensLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", 
__PRETTY_FUNCTION__); - auto inputs = std::dynamic_pointer_cast(baseInputs); - auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto outputs = std::dynamic_pointer_cast(baseOutputs); // DO NOT CHANGE THE ORDER. @@ -167,23 +194,22 @@ void ExplicitDraftTokensLayer::forwardAsync( template void ExplicitDraftTokensLayer::convertPackedMask( - DynamicDecodeOutputParams const& outputs, ExplicitDraftTokensInputParams const& inputs) + ExplicitDraftTokensOutputs const& outputs, ExplicitDraftTokensInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto batchSlots = inputs.batch_slots->template getPtr(); + auto batchSlots = inputs.seqSlots.template getPtr(); auto masksDevice = inputs.masks.template getPtr(); - auto specDecodingGenerationLengths = inputs.specDecodingGenerationLengths.template getPtr(); - auto packedMasksDevice = outputs.explicitDraftTokensOutputs->packedMasks.template getPtr(); + auto generationLengths = inputs.generationLengths.template getPtr(); + auto packedMasksDevice = outputs.packedMasks.template getPtr(); - auto const batchSize = inputs.batch_slots->shape[0]; + auto const batchSize = inputs.localBatchSize; - invokeScanReduceSpecDecodingGenerationLengths(batchSize, specDecodingGenerationLengths, mWorkspaceDevice, - mScanWorkspaceSizeInBytes, mGenerationLengthInclusiveSum, mWorkspaceDevice, mReduceWorkspaceSizeInBytes, - mMaxGenerationLength, mStream); + invokeScanReduceGenerationLengths(batchSize, generationLengths, mWorkspaceDevice, mScanWorkspaceSizeInBytes, + mGenerationLengthInclusiveSum, mWorkspaceDevice, mReduceWorkspaceSizeInBytes, mMaxGenerationLength, mStream); - invokeConvertSpecDecodingMaskToPackedMask(batchSize, mGenerationLengthInclusiveSum, mMaxGenerationLength, - masksDevice, batchSlots, mDecoderDomain.getSpeculativeDecodingModule()->getMaxDecodingDraftTokens(), + invokeConvertMaskToPackedMask(batchSize, mGenerationLengthInclusiveSum, mMaxGenerationLength, masksDevice, + batchSlots, mDecoderDomain.getSpeculativeDecodingModule()->getMaxDecodingDraftTokens(), mDecoderDomain.getSpeculativeDecodingModule()->getMaxDecodingTokens(), packedMasksDevice, mStream); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -191,32 +217,34 @@ void ExplicitDraftTokensLayer::convertPackedMask( template void ExplicitDraftTokensLayer::splitInputDataToBatchSlots( - DynamicDecodeOutputParams const& outputs, ExplicitDraftTokensInputParams const& inputs) + ExplicitDraftTokensOutputs const& outputs, ExplicitDraftTokensInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const batchSize = inputs.batch_slots->shape[0]; - auto const maxSeqLen = outputs.output_ids.shape[outputs.output_ids.shape.size() - 1]; + auto const batchSize = inputs.localBatchSize; + auto const maxSeqLen = outputs.outputIds.shape[outputs.outputIds.shape.size() - 1]; ExtractExplicitDraftTokensParams params; - params.outputIds = outputs.output_ids.template getPtr(); - params.outputPositionIdsBase = outputs.explicitDraftTokensOutputs->positionIdsBase.template getPtr(); - params.outputPositionIds = outputs.explicitDraftTokensOutputs->nextDraftPosIds.template getPtr(); - params.outputNextDraftTokens = outputs.explicitDraftTokensOutputs->nextDraftTokens.template getPtr(); - params.unpackedNextDraftTokens - = outputs.explicitDraftTokensOutputs->unpackedNextDraftTokens.template getPtr(); - params.unpackedNextDraftIndices - = outputs.explicitDraftTokensOutputs->unpackedNextDraftIndices.template getPtr(); - 
params.acceptedLengths = outputs.explicitDraftTokensOutputs->acceptedLengths.template getPtr();
-    params.nextDraftLengths = outputs.explicitDraftTokensOutputs->nextDraftLengths.template getPtr();
-    params.sequenceLengths = outputs.sequence_length->template getPtr();
-    params.randDataSample = outputs.explicitDraftTokensOutputs->randomDataSample.template getPtr();
-    params.randDataVerification = outputs.explicitDraftTokensOutputs->randomDataValidation.template getPtr();
-    params.outputDraftProbs = outputs.explicitDraftTokensOutputs->nextDraftProbs.template getPtr();
-    params.outputTemperatures = outputs.explicitDraftTokensOutputs->temperatures.template getPtr();
-
-    params.batchSlots = inputs.batch_slots->template getPtr();
+    params.outputIds = outputs.outputIds.template getPtr();
+    params.outputPositionIdsBase = outputs.positionIdsBase.template getPtr();
+    params.outputPositionIds = outputs.nextDraftPosIds.template getPtr();
+    params.outputNextDraftTokens = outputs.nextDraftTokens.template getPtr();
+    params.unpackedNextDraftTokens = outputs.unpackedNextDraftTokens.template getPtr();
+    params.unpackedNextDraftIndices = outputs.unpackedNextDraftIndices.template getPtr();
+    params.acceptedLengths = outputs.numNewTokens->template getPtr();
+    params.nextDraftLengths = outputs.nextDraftLengths.template getPtr();
+    params.prevDraftLengths = outputs.prevDraftLengths.template getPtr();
+    params.sequenceLengths = outputs.sequenceLength->template getPtr();
+    params.randDataSample = outputs.randomDataSample.template getPtr();
+    params.randDataVerification = outputs.randomDataValidation.template getPtr();
+    params.outputDraftProbs = outputs.nextDraftProbs.template getPtr();
+    params.outputTemperatures = outputs.temperatures.template getPtr();
+    params.outputGenerationLengths = outputs.generationLengths.template getPtr();
+    params.outputBestPathIndices = mBestPathIndicesSlots;
+    params.outputLastDraftIndices = mLastDraftIndicesSlots;
+
+    params.batchSlots = inputs.seqSlots.template getPtr();
     params.nextDraftTokens = inputs.nextDraftTokens.template getPtr();
     params.lastDraftTokens = inputs.lastDraftTokens.template getPtr();
     params.inputUnpackedNextDraftIndices = inputs.nextDraftIndices.template getPtr();
@@ -226,15 +254,25 @@ void ExplicitDraftTokensLayer::splitInputDataToBatchSlots(
     params.packedPositionIds = inputs.packedPosIds.template getPtr();
     params.nextFlatTokens = inputs.nextFlatTokens.template getPtr();
     params.nextDraftProbs = inputs.nextDraftProbs.template getPtr();
+    params.lastGenerationLengths = inputs.lastGenerationLengths.template getPtr();
     params.generationLengthInclusiveSum = mGenerationLengthInclusiveSum;
+    params.lastDraftIndices = inputs.lastDraftIndices.template getPtr();
     params.inputTemperatures = mTemperatureDevice;
     params.curandState = mCurandStatesDevice;
-    params.curandState = mCurandStatesDevice;
     params.batchSize = batchSize;
     params.numPaths = mDecoderDomain.getSpeculativeDecodingModule()->getMaxNumPaths();
     params.maxPathLength = mDecoderDomain.getSpeculativeDecodingModule()->getMaxPathLen();
     params.maxSeqLen = maxSeqLen;
     params.vocabSize = mDecoderDomain.getVocabSizePadded();
+    params.numContextRequests = batchSize - inputs.lastDraftTokens.shape[0];
+    params.numGenerationRequests = inputs.lastDraftTokens.shape[0];
+
+    params.checkParams();
+
+    // Copy max generation length
+    cudaMemcpyAsync(outputs.maxGenLengthHost.template getPtr(),
+        inputs.maxGenLengthDevice.template getPtr(), sizeof(SizeType32), cudaMemcpyDeviceToHost,
+        mStream);

     invokeExtractExplicitDraftTokens(params, mStream);

@@ -245,28 +283,25 @@
 template
 void ExplicitDraftTokensLayer::packAcceptedPaths(
-    DynamicDecodeOutputParams const& outputs, ExplicitDraftTokensInputParams const& inputs)
+    ExplicitDraftTokensOutputs const& outputs, ExplicitDraftTokensInputs const& inputs)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto const batchSize = inputs.batch_slots->shape[0];
+    auto const batchSize = inputs.localBatchSize;

-    auto paths = inputs.lastDraftIndices.template getPtr();
-    auto batchSlots = inputs.batch_slots ? inputs.batch_slots->template getPtr() : nullptr;
-    auto acceptedLengths = outputs.explicitDraftTokensOutputs->acceptedLengths.template getPtr();
-    auto acceptedLengthsCumSum
-        = outputs.explicitDraftTokensOutputs->acceptedLengthsCumSum.template getPtr();
-    auto pathsOffsets = outputs.explicitDraftTokensOutputs->pathsOffsets.template getPtr();
-    auto bestPathIndices = inputs.bestPathIndices.template getPtr();
+    auto numNewTokens = outputs.numNewTokens->template getPtr();
+    auto numNewTokensCumSum = outputs.numNewTokensCumSum.template getPtr();
+    auto pathsOffsets = outputs.pathsOffsets.template getPtr();
+    auto batchSlots = inputs.batchSlots->template getPtr();

     TLLM_CHECK_WITH_INFO(batchSlots != nullptr, "Batch slots must be provided for ExplicitDraftTokensLayer");
-    TLLM_CHECK_WITH_INFO(acceptedLengths != nullptr, "Accepted lengths must be provided for ExplicitDraftTokensLayer");
+    TLLM_CHECK_WITH_INFO(numNewTokens != nullptr, "Accepted lengths must be provided for ExplicitDraftTokensLayer");
     TLLM_CHECK_WITH_INFO(
-        acceptedLengthsCumSum != nullptr, "acceptedLengthsCumSum must be provided for ExplicitDraftTokensLayer");
+        numNewTokensCumSum != nullptr, "numNewTokensCumSum must be provided for ExplicitDraftTokensLayer");
     TLLM_CHECK_WITH_INFO(pathsOffsets != nullptr, "pathsOffsets must be provided for ExplicitDraftTokensLayer");

-    invokePackAcceptedPaths(acceptedLengthsCumSum, pathsOffsets, acceptedLengths, bestPathIndices, paths, batchSlots,
-        batchSize, mDecoderDomain.getSpeculativeDecodingModule()->getMaxNumPaths(),
-        mDecoderDomain.getSpeculativeDecodingModule()->getMaxPathLen(), true, mStream);
+    invokePackAcceptedPaths(numNewTokensCumSum, pathsOffsets, numNewTokens, mBestPathIndicesSlots,
+        mLastDraftIndicesSlots, batchSlots, batchSize, mDecoderDomain.getSpeculativeDecodingModule()->getMaxNumPaths(),
+        mDecoderDomain.getSpeculativeDecodingModule()->getMaxPathLen(), false, mStream);

     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }

diff --git a/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.h b/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.h
index c80c913f7..9a49a14d6 100644
--- a/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.h
+++ b/cpp/tensorrt_llm/layers/explicitDraftTokensLayer.h
@@ -16,68 +16,14 @@
 #pragma once

-#include
-
-#include "tensorrt_llm/common/tensor.h"
-#include "tensorrt_llm/executor/types.h"
 #include "tensorrt_llm/layers/baseLayer.h"
 #include "tensorrt_llm/layers/decodingParams.h"
 #include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/iTensor.h"

-namespace tc = tensorrt_llm::common;
-
-namespace tensorrt_llm
-{
-namespace layers
-{
-
-class ExplicitDraftTokensSetupParams : public BaseSetupParams
-{
-public:
-    std::optional> temperature; // [setupBatchSize]
on cpu - std::optional> randomSeed; // [1] or [setupBatchSize] on cpu -}; +#include -class ExplicitDraftTokensInputParams : public BaseInputParams +namespace tensorrt_llm::layers { -public: - explicit ExplicitDraftTokensInputParams() - : BaseInputParams{0, 0, tc::Tensor()} - { - } - - //! Draft tokens for the next iteration. The first token in each path is the last accepted token at current - //! iteration. E.g. if forwardBatchSize == 1, maxNumPaths == 2, maxPathLen== 3, [[[0, 1, 2], [0, 1, 10]]] - tc::Tensor nextDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu - //! Compressed form of `nextDraftTokens`, where common prefixes and collapsed. - //! Using example above [0, 1, 2, 10] - tc::Tensor nextFlatTokens; // [forwardBatchSize * maxDecodingTokens], gpu - //! Indices of draft tokens in the compressed `nextFlatTokens` for the next iteration. - //! Using example above, [[[0, 1, 2], [0, 1, 3]]] - tc::Tensor nextDraftIndices; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu - //! Probabilities of the next draft tokens. - tc::Tensor nextDraftProbs; // [forwardBatchSize, maxNumPaths, maxDraftPathLen, vocabSize], gpu - //! Same as `nextDraftTokens`, but for current iteration. - //! Current accepted tokens obtained as `lastDraftTokens[bi][bestPathIndices[bi]][1:bestPathLengths[bi]]`. - tc::Tensor lastDraftTokens; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu - //! Same as `nextDraftIndices`, but for current iteration. - tc::Tensor lastDraftIndices; // [forwardBatchSize, maxNumPaths, maxPathLen], gpu - //! Boolean attention masks. - //! maxDecodingTokens' = specDecodingGenerationLengths.max() - tc::Tensor masks; // [forwardBatchSize, maxDecodingTokens', maxDecodingTokens'], gpu - //! Relative to `positionIdsBase` position ids. Same as `nextFlatTokens` for next draft indices. - //! Using example above, [0, 1, 2, 3] - tc::Tensor packedPosIds; // [forwardBatchSize * maxDecodingTokens], gpu - //! Lengths of the accepted paths for each request. It is 1 for context phase (Only 1 primary tokens is accepted). - tc::Tensor bestPathLengths; // [forwardBatchSize], gpu - //! Indices of the accepted paths for each request. It is 0 for context phase. - tc::Tensor bestPathIndices; // [forwardBatchSize], gpu - //! Number of the draft tokens for the next iteration. - tc::Tensor specDecodingGenerationLengths; // [forwardBatchSize], gpu - //! Baseline for the position ids. - tc::Tensor positionIdsBase; // [forwardBatchSize], gpu -}; //! \brief Decoding layer for speculative decoding technique, when all tokens are generated, decoded and accepted in the //! engine. 
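+//! Accordingly, the layer performs no sampling of its own: setup() fills per-request temperatures and the
+//! context-phase buffers (fillContextBuffers()), and the forward pass repacks engine-produced tensors per
+//! batch slot; see convertPackedMask(), splitInputDataToBatchSlots() and packAcceptedPaths() below.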
@@ -94,20 +40,23 @@ class ExplicitDraftTokensLayer : public BaseLayer ~ExplicitDraftTokensLayer() override; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: void allocateBuffer(); void freeBuffer(); - void convertPackedMask(DynamicDecodeOutputParams const& outputs, ExplicitDraftTokensInputParams const& inputs); + void fillContextBuffers( + SizeType32 batchSize, SizeType32 const* batchSlots, ExplicitDraftTokensSetupParams const& params); + + void convertPackedMask(ExplicitDraftTokensOutputs const& outputs, ExplicitDraftTokensInputs const& inputs); - void splitInputDataToBatchSlots( - DynamicDecodeOutputParams const& outputs, ExplicitDraftTokensInputParams const& inputs); + void splitInputDataToBatchSlots(ExplicitDraftTokensOutputs const& outputs, ExplicitDraftTokensInputs const& inputs); - void packAcceptedPaths(DynamicDecodeOutputParams const& outputs, ExplicitDraftTokensInputParams const& inputs); + void packAcceptedPaths(ExplicitDraftTokensOutputs const& outputs, ExplicitDraftTokensInputs const& inputs); private: using Base::mStream; @@ -129,9 +78,10 @@ class ExplicitDraftTokensLayer : public BaseLayer SizeType32* mGenerationLengthInclusiveSum{nullptr}; SizeType32* mMaxGenerationLength{nullptr}; float* mTemperatureDevice{nullptr}; + SizeType32* mBestPathIndicesSlots{nullptr}; + SizeType32* mLastDraftIndicesSlots{nullptr}; std::vector mTemperature; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/layerUtils.h b/cpp/tensorrt_llm/layers/layerUtils.h index 68c108a66..1d4d69da0 100644 --- a/cpp/tensorrt_llm/layers/layerUtils.h +++ b/cpp/tensorrt_llm/layers/layerUtils.h @@ -92,35 +92,32 @@ inline bool allOfBatchSlots( } inline DecoderDomain getLocalDecoderDomain( - std::shared_ptr baseInputs, DecoderDomain const& globalDecoderDomain) + std::shared_ptr baseInputs, DecoderDomain const& globalDecoderDomain) { - auto inputs = std::dynamic_pointer_cast(baseInputs); - runtime::SizeType32 batchSize{0}; + auto inputs = std::dynamic_pointer_cast(baseInputs); + runtime::SizeType32 batchSize{baseInputs->localBatchSize}; runtime::SizeType32 beamWidth{0}; runtime::SizeType32 vocabSize{0}; if (inputs->logits) { auto const& logitsShape = inputs->logits->shape; TLLM_CHECK(logitsShape.size() == 3 || logitsShape.size() == 4); - batchSize = logitsShape[0]; auto const idxOffset = logitsShape.size() - 3; beamWidth = logitsShape[idxOffset + 1]; vocabSize = logitsShape[idxOffset + 2]; } - else if (inputs->logits_vec) + else if (inputs->logitsVec) { - TLLM_CHECK(inputs->logits_vec->size()); - auto const& logitsShape = inputs->logits_vec.value()[0].shape; + TLLM_CHECK(inputs->logitsVec->size()); + auto const& logitsShape = inputs->logitsVec.value()[0].shape; TLLM_CHECK(logitsShape.size() == 3 || logitsShape.size() == 4); auto const idxOffset = logitsShape.size() - 3; - batchSize = inputs->logits_vec->size(); beamWidth = logitsShape[idxOffset + 1]; vocabSize = logitsShape[idxOffset + 2]; } - else if (inputs->batch_slots) + else if (inputs->batchSlots) { - auto const& batchSlotsShape = inputs->batch_slots->shape; - batchSize = batchSlotsShape[0]; + auto const& batchSlotsShape = inputs->batchSlots->shape; 
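+        // batchSize is already initialized from baseInputs->localBatchSize above, so only the beam width and
+        // vocab size fall back to the global decoder domain here.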
beamWidth = globalDecoderDomain.getBeamWidth(); vocabSize = globalDecoderDomain.getVocabSize(); } diff --git a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp index b7f71d0a5..7392c4e8d 100644 --- a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp +++ b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp @@ -15,7 +15,12 @@ */ #include "tensorrt_llm/layers/lookaheadAlgorithm.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/lookaheadModule.h" +#include namespace tensorrt_llm::layers { @@ -31,10 +36,13 @@ void LookaheadAlgorithm::setup(TensorConstPtr const& prompt, SizeType32 w, SizeT mW = w; mN = n; mG = g; + std::tie(std::ignore, std::ignore, mRuntimeMaxDraftLen, std::ignore) + = executor::LookaheadDecodingConfig(mW, mN, mG).calculateSpeculativeResource(); + mPoolManager.setup(mG); mPoolManager.accept(prompt, mN); mGoldenTokens = ITensor::slice(mGoldenTokensMax, 0, mN * 2 - 1); - mPrefills = ITensor::slice(mPrefillsMax, 0, mN - 2); + mPrefills = ITensor::slice(mPrefillsMax, 0, mN <= 1 ? 0 : mN - 2); mKeyTokens = ITensor::slice(mKeyTokensMax, 0, mW); mPastTokens = ITensor::slice(mPastTokensMax, 0, mW * (mN - 1)); mPastTokens->reshape(ITensor::makeShape({mW, mN - 1})); @@ -48,10 +56,14 @@ void LookaheadAlgorithm::setup(TensorConstPtr const& prompt, SizeType32 w, SizeT std::for_each(pastRange.begin(), pastRange.end(), [](auto& a) { a = -1; }); for (SizeType32 i = 0; i < mW; i++) { - randToken(pastRange[i * (mN - 1)]); + if (mN - 1 > 0) + { + randToken(pastRange[i * (mN - 1)]); + } } std::copy(std::prev(promptRange.end(), mN - 1), promptRange.end(), goldRange.begin()); - mFilling = 1; + mGuessTokens = ITensor::slice(mGuessTokensMax, 0, 0); + mFilling = (mN - 1) > 0 ? 
1 : 0; PRINT_TOKENS(prompt); PRINT_TOKENS(mPrefills); PRINT_TOKENS(mPastTokens); @@ -75,6 +87,8 @@ void LookaheadAlgorithm::accept(TensorConstPtr const& generatedTokens) runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, TensorPtr const& positionIds, TensorPtr const& samplingMask, runtime::SizeType32 offset) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + SizeType32 prefill = mN - 2 - mFilling; SizeType32 len = prefill + mFilling * mW; TLLM_CHECK(len <= ITensor::volume(draftTokens->getShape())); @@ -132,8 +146,9 @@ runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, samplingMaskRange[wj * mFilling + mFilling - 1 - 1] = true; } } - TLLM_LOG_DEBUG("prefill=%d, offset=%d", prefill, offset); PRINT_VALUES(positionIds); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); return len; } @@ -171,10 +186,19 @@ void LookaheadAlgorithm::prepare(TensorPtr const& draftTokens, TensorPtr const& TensorPtr const& samplingMask, TensorPtr const& length, TensorConstPtr const& offsetPtr, TensorConstPtr const& lastTokenPtr) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + if (mRuntimeMaxDraftLen == 0) + { + (BufferRange(*length))[0] = 0; + return; + } + auto lastToken = BufferRange(*lastTokenPtr)[0]; auto offset = BufferRange(*offsetPtr)[0]; SizeType32 inputLen = ITensor::volume(draftTokens->getShape()); + TLLM_CHECK(inputLen >= mRuntimeMaxDraftLen); BufferRange draftRange(*draftTokens); BufferRange positionRange(*positionIds); @@ -182,33 +206,39 @@ void LookaheadAlgorithm::prepare(TensorPtr const& draftTokens, TensorPtr const& SizeType32 filledLen = 0; - filledLen += lookahead(ITensor::slice(draftTokens, filledLen, inputLen - filledLen), - ITensor::slice(positionIds, filledLen, inputLen - filledLen), - ITensor::slice(samplingMask, filledLen, inputLen - filledLen), offset); + filledLen += lookahead(ITensor::slice(draftTokens, filledLen, mRuntimeMaxDraftLen - filledLen), + ITensor::slice(positionIds, filledLen, mRuntimeMaxDraftLen - filledLen), + ITensor::slice(samplingMask, filledLen, mRuntimeMaxDraftLen - filledLen), offset); auto guessStart = filledLen; - filledLen += guess(ITensor::slice(draftTokens, filledLen, inputLen - filledLen), - ITensor::slice(positionIds, filledLen, inputLen - filledLen), - ITensor::slice(samplingMask, filledLen, inputLen - filledLen), offset, lastToken); + filledLen += guess(ITensor::slice(draftTokens, filledLen, mRuntimeMaxDraftLen - filledLen), + ITensor::slice(positionIds, filledLen, mRuntimeMaxDraftLen - filledLen), + ITensor::slice(samplingMask, filledLen, mRuntimeMaxDraftLen - filledLen), offset, lastToken); auto guessEnd = filledLen; mGuessTokens = ITensor::slice(mGuessTokensMax, 0, guessEnd - guessStart); + std::copy(draftRange.begin() + guessStart, draftRange.begin() + guessEnd, BufferRange(*mGuessTokens).begin()); (BufferRange(*length))[0] = filledLen; + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } void LookaheadAlgorithm::verify(TensorPtr const& accepted, TensorPtr const& acceptedOffsets, TensorPtr const& acceptedLength, TokenIdType newLastToken, TensorConstPtr const& goldenTokens, TensorConstPtr const& endToken) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + TLLM_CHECK(ITensor::volume(goldenTokens->getShape()) == ITensor::volume(mGuessTokens->getShape())); BufferRange goldRange(*goldenTokens); BufferRange guessTokensRange(*mGuessTokens); auto guessSize = ITensor::volume(mGuessTokens->getShape()); - SizeType32 guesses = guessSize / (mN - 1), hit = 0, maxHit = 0, hitIdx = 0; + SizeType32 
guesses = (mN - 1 > 0) ? (guessSize / (mN - 1)) : 0; + SizeType32 hit = 0, maxHit = 0, hitIdx = 0; for (SizeType32 i = 0; i < guesses; i++) { SizeType32 hit = 0; @@ -248,6 +278,8 @@ void LookaheadAlgorithm::verify(TensorPtr const& accepted, TensorPtr const& acce } *BufferRange(*acceptedLength).begin() = maxHit + 1; + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } //! lookahead Jacobi matrix has prefilling phase and maintenance phase. @@ -293,6 +325,8 @@ void LookaheadAlgorithm::verify(TensorPtr const& accepted, TensorPtr const& acce void LookaheadAlgorithm::update(TensorPtr const& acceptedTokens, TensorPtr const& acceptedOffsets, TensorPtr const& acceptedLength, TensorConstPtr const& sampledTokens, TensorConstPtr const& endToken) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + TLLM_CHECK(ITensor::volume(acceptedTokens->getShape()) >= mN); BufferRange sampledRange(*sampledTokens); BufferRange keyRange(*mKeyTokens); @@ -312,7 +346,7 @@ void LookaheadAlgorithm::update(TensorPtr const& acceptedTokens, TensorPtr const pastRange[i * (mN - 1) + mFilling] = keyRange[i]; } } - else + else if (mN > 1) { for (SizeType32 i = 0; i < mW; i++) { @@ -329,8 +363,9 @@ void LookaheadAlgorithm::update(TensorPtr const& acceptedTokens, TensorPtr const auto guessSize = ITensor::volume(mGuessTokens->getShape()); auto outputSize = ITensor::volume(sampledTokens->getShape()); - auto lookSize = 1 + mN - 2 - mFilling + mFilling * mW; + auto lookSize = 1 + (mN > 1 ? mN - 2 : 0) - mFilling + mFilling * mW; TLLM_CHECK(guessSize + lookSize == outputSize); + TensorConstPtr goldenTokens = ITensor::slice(sampledTokens, lookSize, guessSize); verify(acceptedTokens, acceptedOffsets, acceptedLength, newLastToken, goldenTokens, endToken); @@ -341,6 +376,8 @@ void LookaheadAlgorithm::update(TensorPtr const& acceptedTokens, TensorPtr const { mFilling++; } + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } } // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h index 89e22df45..99df44128 100644 --- a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h +++ b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h @@ -44,7 +44,8 @@ class LookaheadAlgorithm , mId(id) , mGoldenTokensMax( runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxN * 2 - 1}), nvinfer1::DataType::kINT32)) - , mPrefillsMax(runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxN - 2}), nvinfer1::DataType::kINT32)) + , mPrefillsMax(runtime::BufferManager::cpu( + runtime::ITensor::makeShape({(maxN <= 1 ? 0 : maxN - 2)}), nvinfer1::DataType::kINT32)) , mKeyTokensMax(runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxW}), nvinfer1::DataType::kINT32)) , mPastTokensMax( runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxW * (maxN - 1)}), nvinfer1::DataType::kINT32)) @@ -125,6 +126,7 @@ class LookaheadAlgorithm runtime::SizeType32 mW{0}; runtime::SizeType32 mN{0}; runtime::SizeType32 mG{0}; + runtime::SizeType32 mRuntimeMaxDraftLen{0}; //! in prefilling mode when mFilling < mN-1. runtime::SizeType32 mFilling; diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp new file mode 100644 index 000000000..f4d40577b --- /dev/null +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/layers/lookaheadDecodingLayer.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/kernels/decodingKernels.h" +#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/layers/decodingParams.h" +#include "tensorrt_llm/layers/defaultDecodingParams.h" +#include "tensorrt_llm/layers/lookaheadAlgorithm.h" +#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iBuffer.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include +#include +#include +#include + +namespace tensorrt_llm::layers +{ + +using namespace tensorrt_llm::common; +using namespace tensorrt_llm::kernels; +using namespace tensorrt_llm::runtime; + +template +LookaheadDecodingLayer::CpuAlgorithmResources::CpuAlgorithmResources(DecoderDomain const& decoderDomain) +{ + auto maxBatchSize = decoderDomain.getBatchSize(); + auto lookaheadModule + = std::dynamic_pointer_cast(decoderDomain.getSpeculativeDecodingModule()); + auto const [maxW, maxN, maxG] = lookaheadModule->getExecutionConfig().get(); + + for (runtime::SizeType32 id = 0; id < maxBatchSize; id++) + { + mAlgos.emplace_back(maxW, maxN, maxG, id); + } + + SizeType32 maxTokensPerStep, maxNumNewTokens, maxDraftLen; + std::tie(maxTokensPerStep, maxNumNewTokens, maxDraftLen, std::ignore) + = executor::LookaheadDecodingConfig(maxW, maxN, maxG).calculateSpeculativeResource(); + + auto const maxBatchShape1D = ITensor::makeShape({maxBatchSize}); + mBatchSlots = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); + mTargetTokens + = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxTokensPerStep}), nvinfer1::DataType::kINT32); + mTokensPerStep = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); + mEndIds = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); + + mOutputIds = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxNumNewTokens}), nvinfer1::DataType::kINT32); + mPathsOffsets = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxNumNewTokens}), nvinfer1::DataType::kINT32); + mNumNewTokens = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); + mNumNewTokensCumSum = BufferManager::cpu(ITensor::makeShape({maxBatchSize + 1}), nvinfer1::DataType::kINT32); + mNextDraftTokens = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); + mNextDraftPosIds = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); + auto divUp32 = [](SizeType32 x) { return x / 32 + ((x % 32) ? 
1 : 0); }; + mPackedMasks = BufferManager::cpu( + ITensor::makeShape({maxBatchSize, maxTokensPerStep, divUp32(maxTokensPerStep)}), nvinfer1::DataType::kINT32); + mSamplingMask = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kBOOL); + mNextDraftLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); + mSequenceLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); +} + +template +LookaheadDecodingLayer::LookaheadDecodingLayer( + DecoderDomain const& decoderDomain, std::shared_ptr const& bufferManager) + : BaseLayer(decoderDomain, bufferManager) + , mCpuAlgo(std::make_optional(decoderDomain)) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto const maxBatchSize = mDecoderDomain.getBatchSize(); + auto const maxTokensPerStep = mDecoderDomain.getMaxDecodingTokens(); + auto const vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + auto const maxTopK = 1; + auto const maxBatchShape1D = ITensor::makeShape({maxBatchSize}); + auto const maxBatchShape2D = ITensor::makeShape({maxBatchSize, maxTokensPerStep}); + + mWorkspaceSize = getTopKWorkspaceSize(maxBatchSize, maxTokensPerStep, maxTopK, vocabSizePadded); + TLLM_LOG_DEBUG("mWorkspaceSize=%d", mWorkspaceSize); + + mSamplingWorkspaceDevice + = mBufferManager->gpu(ITensor::makeShape({static_cast(mWorkspaceSize)}), nvinfer1::DataType::kINT8); + mTargetTokensDevice = mBufferManager->gpu(maxBatchShape2D, nvinfer1::DataType::kINT32); + mRandomSeedsDevice = mBufferManager->gpu(maxBatchShape1D, nvinfer1::DataType::kINT64); + mSamplingMaskDevice = mBufferManager->gpu(maxBatchShape2D, nvinfer1::DataType::kBOOL); + mCurandStatesDevice = mBufferManager->gpu(maxBatchShape1D, nvinfer1::DataType::kINT8); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void LookaheadDecodingLayer::setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, + runtime::SizeType32 const* batchSlots, std::shared_ptr const& baseSetupParams) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto setupParams = std::dynamic_pointer_cast(baseSetupParams); + + if (mCpuAlgo) + { + auto& algoConfigs = setupParams->algoConfigs; + TLLM_CHECK_WITH_INFO(algoConfigs.size() == 1 || algoConfigs.size() == batchSize, + "Lookahead runtime configuration size should be either 1 or batchSize"); + for (runtime::SizeType32 bi = 0; bi < batchSize; bi++) + { + SizeType32 gbi = batchSlots[bi]; + SizeType32 bi1orN = (algoConfigs.size() == 1) ? 
0 : bi; + TLLM_LOG_DEBUG("CPU ALGO [ %d ] setup", gbi); + PRINT_TOKENS(setupParams->prompt[bi]); + auto [w, n, g] = algoConfigs[bi1orN].get(); + SizeType32 runtimeTokensPerStep; + std::tie(runtimeTokensPerStep, std::ignore, std::ignore, std::ignore) + = executor::LookaheadDecodingConfig(w, n, g).calculateSpeculativeResource(); + TLLM_CHECK_WITH_INFO(runtimeTokensPerStep <= mDecoderDomain.getMaxDecodingTokens(), + "runtime w(%d) n(%d) g(%d) exceeds maxTokensPerStep(%d)", w, n, g, + mDecoderDomain.getMaxDecodingTokens()); + mCpuAlgo->mAlgos[gbi].setup(setupParams->prompt[bi], w, n, g); + } + } + + auto curandStatesDevicePtr = reinterpret_cast(bufferCast(*mCurandStatesDevice)); + if (setupParams->randomSeed) + { + auto& randomSeed = setupParams->randomSeed.value(); + if (randomSeed.size() == 1) + { + invokeCurandInitialize(curandStatesDevicePtr, batchSlots, batchSize, randomSeed.front(), mStream); + sync_check_cuda_error(); + } + else + { + TLLM_CHECK_WITH_INFO(randomSeed.size() == batchSize, "Random seed vector size mismatch."); + cudaAutoCpy(bufferCast(*mRandomSeedsDevice), randomSeed.data(), batchSize, mStream); + invokeCurandBatchInitialize( + curandStatesDevicePtr, batchSlots, batchSize, bufferCast(*mRandomSeedsDevice), mStream); + sync_check_cuda_error(); + } + } + else + { + invokeCurandInitialize(curandStatesDevicePtr, batchSlots, batchSize, DefaultDecodingParams::getSeed(), mStream); + } + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void LookaheadDecodingLayer::forwardAsync( + std::shared_ptr const& outputParams, std::shared_ptr const& inputParams) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(inputParams); + auto outputs = std::dynamic_pointer_cast(outputParams); + auto batchSize = inputs->localBatchSize; + + TLLM_CHECK_WITH_INFO(inputs->batchSlots, "Batch slots must be provided for LookaheadDecoding"); + TLLM_CHECK_WITH_INFO(inputs->curTokensPerStep, "curTokensPerStep must be provided for LookaheadDecoding"); + TLLM_CHECK_WITH_INFO(outputs->sequenceLength, "sequenceLength must be provided for LookaheadDecoding"); + // TODO(liweim) to be confirmed. + TLLM_CHECK(inputs->logits); + + mBufferManager->copy( + inputs->batchSlots->template getPtr(), *mCpuAlgo->mBatchSlots, runtime::MemoryType::kGPU); + mBufferManager->copy(inputs->curTokensPerStep->template getPtr(), *mCpuAlgo->mTokensPerStep, + runtime::MemoryType::kGPU); + mBufferManager->copy( + inputs->endIds.template getPtr(), *mCpuAlgo->mEndIds, runtime::MemoryType::kGPU); + mBufferManager->copy(outputs->sequenceLength->template getPtr(), *mCpuAlgo->mSequenceLengths, + runtime::MemoryType::kGPU); + + TopKSamplingKernelParams params; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.batchSize = batchSize; + params.maxTopK = 1; + params.returnAllTopK = true; + params.maxTokensPerStep = mDecoderDomain.getMaxDecodingTokens(); + params.maxSeqLen = mDecoderDomain.getMaxDecodingTokens(); + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + params.batchSlots = inputs->batchSlots->template getPtr(); + TLLM_LOG_DEBUG("batchSize = %d", batchSize); + params.logProbs = inputs->logits ? 
inputs->logits->template getPtr() : nullptr; + params.outputIds = bufferCast(*mTargetTokensDevice); + params.workspace = bufferCast(*mSamplingWorkspaceDevice); + params.curandState = reinterpret_cast(bufferCast(*mCurandStatesDevice)); + params.tokensPerStep = inputs->curTokensPerStep->template getPtr(); + + TLLM_LOG_DEBUG( + "invokeBatchTopKSampling: maxBatchSize=%d, batchSize=%d, maxTopK=%d, maxTokensPerStep=%d, maxSeqLen=%d, " + "vocabSizePadded=%d", + params.maxBatchSize, params.batchSize, params.maxTopK, params.maxTokensPerStep, params.maxSeqLen, + params.vocabSizePadded); + + // Sample multiple tokens per request and store them to separate to be accepted/rejected later + // Sequence length is not modified, endIds is not checked, outputLogProbs are not supported. + // Finished state is not set. + invokeBatchTopKSampling(params, mStream); + + mBufferManager->copy(*mTargetTokensDevice, *mCpuAlgo->mTargetTokens); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void LookaheadDecodingLayer::forwardSync( + std::shared_ptr const& outputParams, std::shared_ptr const& inputParams) +{ + if (mCpuAlgo) + { + forwardSyncCPU(outputParams, inputParams); + } +} + +template +void LookaheadDecodingLayer::posIdsToMask(TensorPtr mask, TensorConstPtr posIds) +{ + auto len = ITensor::volume(posIds->getShape()); + TLLM_CHECK(mask->getShape().d[0] > len); + TLLM_CHECK(mask->getShape().d[1] * 32 > len); + auto posIdsRange = BufferRange(*posIds); + auto maskLocation = BufferLocation(*mask); + + for (auto i = 0; i < maskLocation.size(); i++) + { + maskLocation[i] = 0; + } + maskLocation.at(0, 0) = 1; + + auto setBit = [](SizeType32& x, SizeType32 idx) { x |= (1 << idx); }; + if (len > 0) + { + std::vector> stack; + stack.push_back(std::make_pair(0, posIdsRange[0] - 1)); + for (auto i = 1; i < len + 1; i++) + { + auto cur = posIdsRange[i - 1]; + while (stack.size() > 0 && cur <= stack.back().second) + { + stack.pop_back(); + } + TLLM_CHECK(stack.size() > 0 ? cur == stack.back().second + 1 : true); + stack.push_back(std::make_pair(i, cur)); + for (auto prev : stack) + { + setBit(maskLocation.at(i, prev.first / 32), prev.first % 32); + } + } + } +} + +template +void LookaheadDecodingLayer::forwardSyncCPU( + std::shared_ptr const& outputParams, std::shared_ptr const& inputParams) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(inputParams); + auto outputs = std::dynamic_pointer_cast(outputParams); + auto const batchSize = inputs->localBatchSize; + + TensorPtr outputIds(wrap(outputs->outputIds)); + BufferRange tokensPerStepRange(*mCpuAlgo->mTokensPerStep); + BufferRange numNewTokensRange(*mCpuAlgo->mNumNewTokens); + BufferRange numNewTokensCumSumRange(*mCpuAlgo->mNumNewTokensCumSum); + BufferRange batchSlotsRange(*mCpuAlgo->mBatchSlots); + BufferRange nextDraftLengthsRange(*mCpuAlgo->mNextDraftLengths); + BufferRange sequenceLengthsRange(*mCpuAlgo->mSequenceLengths); + + for (SizeType32 bi = 0; bi < batchSize; bi++) + { + SizeType32 gbi = batchSlotsRange[bi]; + LookaheadAlgorithm& theAlgo(mCpuAlgo->mAlgos[gbi]); + + SizeType32 const tokensPerStep = tokensPerStepRange[gbi]; + TensorPtr sampledTokens = ITensor::slice(mCpuAlgo->mTargetTokens, {gbi, 0}, tokensPerStep); + + if (tokensPerStep == 1) + { // The first step in generation phase has no draft tokens. 
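+            // Nothing to verify on this step: accept the single sampled token as-is, record a zero path offset
+            // and leave the next draft length at zero.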
+ theAlgo.accept(sampledTokens); + mBufferManager->copy(*sampledTokens, *ITensor::slice(mCpuAlgo->mOutputIds, {gbi, 0}, tokensPerStep)); + BufferLocation(*mCpuAlgo->mPathsOffsets).at(gbi, 0) = 0; + numNewTokensRange[gbi] = tokensPerStep; + BufferLocation(*mCpuAlgo->mNextDraftLengths).at(gbi) = 0; + } + else + { + theAlgo.update( // + ITensor::at(mCpuAlgo->mOutputIds, {gbi}), // + ITensor::at(mCpuAlgo->mPathsOffsets, {gbi}), // + ITensor::at(mCpuAlgo->mNumNewTokens, {gbi}), // + sampledTokens, // + ITensor::at(mCpuAlgo->mEndIds, {gbi})); + } + + auto maxNumNewTokens = mCpuAlgo->mOutputIds->getShape().d[1]; + mBufferManager->copy(*ITensor::at(mCpuAlgo->mOutputIds, {gbi}), + *ITensor::slice(outputIds, {gbi, sequenceLengthsRange[gbi]}, maxNumNewTokens)); + + sequenceLengthsRange[gbi] += numNewTokensRange[gbi]; + + theAlgo.prepare( // + ITensor::at(mCpuAlgo->mNextDraftTokens, {gbi}), // + ITensor::at(mCpuAlgo->mNextDraftPosIds, {gbi}), // + ITensor::at(mCpuAlgo->mSamplingMask, {gbi}), // + ITensor::at(mCpuAlgo->mNextDraftLengths, {gbi}), // + ITensor::at(mCpuAlgo->mSequenceLengths, {gbi}), // + ITensor::at(mCpuAlgo->mOutputIds, {gbi, numNewTokensRange[gbi] - 1})); + + posIdsToMask( // + ITensor::at(mCpuAlgo->mPackedMasks, {gbi}), // + ITensor::slice(mCpuAlgo->mNextDraftPosIds, {gbi, 0}, nextDraftLengthsRange[gbi])); + } + + numNewTokensCumSumRange[0] = 0; + for (SizeType32 i = 0; i < numNewTokensRange.size(); i++) + { + numNewTokensCumSumRange[i + 1] = numNewTokensCumSumRange[i] + numNewTokensRange[i]; + } + + TLLM_CHECK(outputs->numNewTokens); + + mBufferManager->copy(*mCpuAlgo->mSequenceLengths, // + const_cast(outputs->sequenceLength.value().data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mPathsOffsets, // + const_cast(outputs->pathsOffsets.data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mNumNewTokens, // + const_cast(outputs->numNewTokens->data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mNumNewTokensCumSum, // + const_cast(outputs->numNewTokensCumSum.data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mNextDraftTokens, // + const_cast(outputs->nextDraftTokens.data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mNextDraftPosIds, // + const_cast(outputs->nextDraftPosIds.data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mPackedMasks, // + const_cast(outputs->packedMasks.data), runtime::MemoryType::kGPU); + mBufferManager->copy(*mCpuAlgo->mNextDraftLengths, // + const_cast(outputs->nextDraftLengths.data), runtime::MemoryType::kGPU); + + // TODO(liweim) do we need this? + // mBufferManager->getStream().synchronize(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} +template class LookaheadDecodingLayer; +template class LookaheadDecodingLayer; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h new file mode 100644 index 000000000..5d3988c36 --- /dev/null +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "lookaheadAlgorithm.h" +#include "tensorrt_llm/common/cudaAllocator.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/layers/baseLayer.h" +#include "tensorrt_llm/layers/decodingParams.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/lookaheadModule.h" + +namespace tensorrt_llm::layers +{ + +//! \brief LookaheadDecodingLayer +template +class LookaheadDecodingLayer : public BaseLayer +{ +public: + using Base = BaseLayer; + using TensorPtr = runtime::ITensor::SharedPtr; + using TensorConstPtr = runtime::ITensor::SharedConstPtr; + using Base::mBufferManager; + + LookaheadDecodingLayer( + DecoderDomain const& decoderDomain, std::shared_ptr const& bufferManager); + + ~LookaheadDecodingLayer() override {} + + void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, + std::shared_ptr const& baseSetupParams) override; + + void forwardAsync(std::shared_ptr const& outputParams, + std::shared_ptr const& inputParams) override; + + void forwardSync(std::shared_ptr const& outputParams, + std::shared_ptr const& inputParams) override; + +private: + void forwardSyncCPU(std::shared_ptr const& outputParams, + std::shared_ptr const& inputParams); + void posIdsToMask(TensorPtr mask, TensorConstPtr posIds); + +private: + using Base::mStream; + using Base::mAllocator; + using Base::mWorkspaceSize; + using Base::mDecoderDomain; + + TensorPtr mCurandStatesDevice; + TensorPtr mSamplingWorkspaceDevice; + TensorPtr mTargetTokensDevice; + TensorPtr mRandomSeedsDevice; + TensorPtr mSamplingMaskDevice; + + struct CpuAlgorithmResources + { + explicit CpuAlgorithmResources(DecoderDomain const& decoderDomain); + + std::vector mAlgos; + TensorPtr mBatchSlots; + TensorPtr mTargetTokens; + TensorPtr mTokensPerStep; + TensorPtr mEndIds; + + TensorPtr mOutputIds; + TensorPtr mPathsOffsets; + TensorPtr mNumNewTokens; + TensorPtr mNumNewTokensCumSum; + + TensorPtr mNextDraftTokens; + TensorPtr mNextDraftPosIds; + TensorPtr mPackedMasks; + TensorPtr mSamplingMask; + TensorPtr mNextDraftLengths; + TensorPtr mSequenceLengths; + }; + + std::optional mCpuAlgo; +}; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp deleted file mode 100644 index e10827c3c..000000000 --- a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" - -namespace tensorrt_llm::layers -{ - -using namespace tensorrt_llm::runtime; -using TensorPtr = ITensor::SharedPtr; - -ITensor::UniquePtr slice( - ITensor::SharedPtr tensor, std::initializer_list const& offsetDims, size_t const sizeDim) -{ - auto shape = tensor->getShape(); - TLLM_CHECK(offsetDims.size() > 0); - TLLM_CHECK(shape.nbDims >= offsetDims.size()); - std::vector volumes(shape.nbDims); - - int i; - volumes[shape.nbDims - 1] = 1; - for (i = shape.nbDims - 2; i >= 0; i--) - { - volumes[i] = shape.d[i + 1] * volumes[i + 1]; - } - - size_t offset = 0; - i = 0; - for (auto itd = offsetDims.begin(); itd != offsetDims.end(); itd++) - { - TLLM_CHECK(0 <= (*itd) && (*itd) < shape.d[i]); - offset += (*itd) * volumes[i++]; - } - - ITensor::Shape dims; - dims.nbDims = shape.nbDims - offsetDims.size() + 1; - dims.d[0] = sizeDim; - for (i = 1; i < dims.nbDims; i++) - { - dims.d[i] = shape.d[i - 1 + offsetDims.size()]; - } - - size_t size = ITensor::volume(dims); - - return std::make_unique(std::move(tensor), offset, size, dims); -} - -ITensor::UniquePtr slice(ITensor::SharedPtr tensor, std::initializer_list const& offsetDims) -{ - auto result = slice(tensor, offsetDims, 1); - if (result->getShape().nbDims > 1) - { - result->squeeze(0); - } - return result; -} - -} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h index 41acff784..f1c5476d6 100644 --- a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h @@ -240,10 +240,18 @@ class DebugTensor { buf << token; } - buf << (i == size - 1 ? ']' : ','); + if (i != size - 1) + { + buf << ','; + } } + buf << ']'; }; - if (shape.nbDims == 1) + if (shape.nbDims == 0) + { + buf << "[]"; + } + else if (shape.nbDims == 1) { line(tensorRange.begin(), shape.d[0]); } @@ -277,10 +285,19 @@ class DebugTensor buf << '['; for (SizeType32 i = 0; i < size; i++) { - buf << array[i] << (i == size - 1 ? 
']' : ','); + buf << array[i]; + if (i != size - 1) + { + buf << ','; + } } + buf << ']'; }; - if (shape.nbDims == 1) + if (shape.nbDims == 0) + { + buf << "[]"; + } + else if (shape.nbDims == 1) { line(tensorRange.begin(), shape.d[0]); } @@ -305,15 +322,24 @@ class DebugTensor { switch (mTensor.getDataType()) { + case nvinfer1::DataType::kBOOL: return values(); case nvinfer1::DataType::kFLOAT: return values(); case nvinfer1::DataType::kINT8: return values(); case nvinfer1::DataType::kINT32: return values(); case nvinfer1::DataType::kINT64: return values(); case nvinfer1::DataType::kUINT8: return values(); - default: return std::string("Unsupported data type"); + default: return std::string(mName + ": Unsupported data type"); } } + std::string shape(void) + { + using namespace tensorrt_llm::runtime; + std::ostringstream buf; + buf << mName << ": " << mTensor.getShape(); + return buf.str(); + } + void print_tokens(void) { TLLM_LOG_DEBUG(tokens()); @@ -324,6 +350,11 @@ class DebugTensor TLLM_LOG_DEBUG(values()); } + void print_shape(void) + { + TLLM_LOG_DEBUG(shape()); + } + private: runtime::ITensor const& mTensor; std::string mName; @@ -332,5 +363,6 @@ class DebugTensor #define D(x) tensorrt_llm::layers::DebugTensor(x, #x) #define PRINT_TOKENS(x) D(x).print_tokens() #define PRINT_VALUES(x) D(x).print_values() +#define PRINT_SHAPE(x) D(x).print_shape() } // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp b/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp index 04a934280..4af17c099 100644 --- a/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp +++ b/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/layers/lookaheadPoolManager.h" +#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" namespace tensorrt_llm::layers @@ -24,13 +25,18 @@ using namespace tensorrt_llm::runtime; void LookaheadPoolManager::setup(SizeType32 guessSetSize) { - TLLM_CHECK(guessSetSize > 0 && guessSetSize <= mGuessSetSizeMax); + TLLM_CHECK(guessSetSize >= 0 && guessSetSize <= mGuessSetSizeMax); mGuessSetSize = guessSetSize; mTokenMap.clear(); } void LookaheadPoolManager::insertOne(Key key, TensorConstPtr const& ngram) { + if (TLLM_UNLIKELY(ITensor::volume(ngram->getShape()) == 0 || mGuessSetSize == 0)) + { + return; + } + auto search = mTokenMap.find(key); if (search != mTokenMap.end()) { @@ -41,7 +47,7 @@ void LookaheadPoolManager::insertOne(Key key, TensorConstPtr const& ngram) BufferRange itemRange(*item); return std::equal(ngramRange.begin(), ngramRange.end(), itemRange.begin()); }); - if (mGuessSetSize >= 0 && search->second.size() >= mGuessSetSize) + if (mGuessSetSize > 0 && search->second.size() >= mGuessSetSize) { search->second.pop_front(); } @@ -104,7 +110,6 @@ void LookaheadPoolManager::update(TensorConstPtr const& keyTokens, TensorConstPt BufferRange sourceRange(*source); BufferRange ngramRange(*ngram); std::copy(sourceRange.begin(), sourceRange.end(), ngramRange.begin()); - insertOne(keyRange[wi], ngram); } } diff --git a/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp b/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp index 4221ae980..c8fe35dbe 100644 --- a/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp @@ -31,9 +31,7 @@ using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::kernels::speculative_decoding; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace 
tensorrt_llm::layers { template @@ -144,7 +142,7 @@ void MedusaDecodingLayer::freeBuffer() template void MedusaDecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -272,12 +270,12 @@ void MedusaDecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, S template void MedusaDecodingLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto inputs = std::dynamic_pointer_cast(baseInputs); - auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto outputs = std::dynamic_pointer_cast(baseOutputs); samplePrimeHeadTokens(*outputs, *inputs); @@ -294,16 +292,16 @@ void MedusaDecodingLayer::forwardAsync( template void MedusaDecodingLayer::samplePrimeHeadTokens( - DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs) + SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const batchSize = inputs.logits.shape[0]; + auto const batchSize = inputs.logits->shape[0]; - auto logits = inputs.logits.template getPtr(); - auto batchSlots = inputs.batch_slots ? inputs.batch_slots->template getPtr() : nullptr; - auto sequenceLengths = outputs.sequence_length ? outputs.sequence_length->template getPtr() : nullptr; - auto tokensPerStepDevice = inputs.medusaCurTokensPerStep.template getPtr(); + auto logits = inputs.logits->template getPtr(); + auto batchSlots = inputs.batchSlots ? inputs.batchSlots->template getPtr() : nullptr; + auto sequenceLengths = outputs.sequenceLength ? outputs.sequenceLength->template getPtr() : nullptr; + auto tokensPerStepDevice = inputs.curTokensPerStep->template getPtr(); TLLM_CHECK_WITH_INFO(batchSlots != nullptr, "Batch slots must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO(sequenceLengths != nullptr, "Sequence lengths must be provided for MedusaDecoding"); @@ -333,22 +331,22 @@ void MedusaDecodingLayer::samplePrimeHeadTokens( template void MedusaDecodingLayer::acceptDraftTokens( - DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs) + SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const batchSize = inputs.logits.shape[0]; - auto const maxSeqLen = outputs.output_ids.shape[outputs.output_ids.shape.size() - 1]; + auto const batchSize = inputs.logits->shape[0]; + auto const maxSeqLen = outputs.outputIds.shape[outputs.outputIds.shape.size() - 1]; - auto outputIds = outputs.output_ids.template getPtr(); - auto endIds = inputs.end_ids.template getPtr(); + auto outputIds = outputs.outputIds.template getPtr(); + auto endIds = inputs.endIds.template getPtr(); auto paths = inputs.paths.template getPtr(); - auto batchSlots = inputs.batch_slots ? inputs.batch_slots->template getPtr() : nullptr; - auto sequenceLengths = outputs.sequence_length ? 
outputs.sequence_length->template getPtr() : nullptr; - auto acceptedLengths = outputs.speculativeDecodingOutputs->acceptedLengths.template getPtr(); - auto curTokensPerStepDevice = inputs.medusaCurTokensPerStep.template getPtr(); - auto targetTokensPerStepDevice = inputs.medusaTargetTokensPerStep.template getPtr(); + auto batchSlots = inputs.batchSlots ? inputs.batchSlots->template getPtr() : nullptr; + auto sequenceLengths = outputs.sequenceLength ? outputs.sequenceLength->template getPtr() : nullptr; + auto numNewTokens = outputs.numNewTokens->template getPtr(); + auto curTokensPerStepDevice = inputs.curTokensPerStep->template getPtr(); + auto targetTokensPerStepDevice = inputs.targetTokensPerStep.template getPtr(); auto const maxDraftPathLen = mDecoderDomain.getSpeculativeDecodingModule()->getMaxDraftPathLen(); @@ -362,12 +360,12 @@ void MedusaDecodingLayer::acceptDraftTokens( } } - auto draftIds = outputs.speculativeDecodingOutputs->nextDraftTokens.template getPtr(); + auto draftIds = outputs.nextDraftTokens.template getPtr(); TLLM_CHECK_WITH_INFO(draftIds != nullptr, "Draft ids must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO(batchSlots != nullptr, "Batch slots must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO(sequenceLengths != nullptr, "Sequence lengths must be provided for MedusaDecoding"); - TLLM_CHECK_WITH_INFO(acceptedLengths != nullptr, "Accepted lengths must be provided for MedusaDecoding"); + TLLM_CHECK_WITH_INFO(numNewTokens != nullptr, "Accepted lengths must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO( curTokensPerStepDevice != nullptr, "Current tokens per step must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO( @@ -379,7 +377,7 @@ void MedusaDecodingLayer::acceptDraftTokens( // Compare draft tokens from outputIds with sampled target tokens at mTargetTokensDevice using paths. // Select the longest accepted path, modify outputIds in-place, increment sequenceLengths accordingly. // Fill mMedusaSelectedLogitsPtrsDevice with respective Medusa logits - acceptDraftTokensByIdsWithPaths(outputIds, draftIds, mTargetTokensDevice, sequenceLengths, acceptedLengths, + acceptDraftTokensByIdsWithPaths(outputIds, draftIds, mTargetTokensDevice, sequenceLengths, numNewTokens, finishedStates, batchSlots, paths, endIds, reinterpret_cast(bufferCast(*mMedusaInputLogitsPtrs)), const_cast(mMedusaSelectedLogitsPtrsDevice), curTokensPerStepDevice, targetTokensPerStepDevice, @@ -391,13 +389,13 @@ void MedusaDecodingLayer::acceptDraftTokens( template void MedusaDecodingLayer::sampleNewDraftTokens( - DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs) + SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const batchSize = inputs.logits.shape[0]; - auto batchSlots = inputs.batch_slots ? inputs.batch_slots->template getPtr() : nullptr; - auto sequenceLengths = (outputs.sequence_length) ? outputs.sequence_length->template getPtr() : nullptr; + auto const batchSize = inputs.logits->shape[0]; + auto batchSlots = inputs.batchSlots ? inputs.batchSlots->template getPtr() : nullptr; + auto sequenceLengths = (outputs.sequenceLength) ? 
outputs.sequenceLength->template getPtr() : nullptr; TLLM_CHECK_WITH_INFO(batchSlots != nullptr, "Batch slots must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO(sequenceLengths != nullptr, "Sequence lengths must be provided for MedusaDecoding"); @@ -449,18 +447,18 @@ void MedusaDecodingLayer::sampleNewDraftTokens( template void MedusaDecodingLayer::scatterNewDraftTokens( - DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs) + SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const batchSize = inputs.logits.shape[0]; - auto batchSlots = inputs.batch_slots ? inputs.batch_slots->template getPtr() - : static_cast(nullptr); + auto const batchSize = inputs.logits->shape[0]; + auto batchSlots = inputs.batchSlots ? inputs.batchSlots->template getPtr() + : static_cast(nullptr); TLLM_CHECK_WITH_INFO(batchSlots != nullptr, "Batch slots must be provided for MedusaDecoding"); - auto draftIds = outputs.speculativeDecodingOutputs->nextDraftTokens.template getPtr(); - auto tokensPerStepDevice = inputs.medusaCurTokensPerStep.template getPtr(); + auto draftIds = outputs.nextDraftTokens.template getPtr(); + auto tokensPerStepDevice = inputs.curTokensPerStep->template getPtr(); auto treeIds = inputs.treeIds.template getPtr(); TLLM_CHECK_WITH_INFO(draftIds != nullptr, "Draft ids must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO(tokensPerStepDevice != nullptr, "Tokens per step must be provided for MedusaDecoding"); @@ -474,23 +472,22 @@ void MedusaDecodingLayer::scatterNewDraftTokens( template void MedusaDecodingLayer::packAcceptedPaths( - DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs) + SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const batchSize = inputs.logits.shape[0]; + auto const batchSize = inputs.logits->shape[0]; auto paths = inputs.paths.template getPtr(); - auto batchSlots = inputs.batch_slots ? inputs.batch_slots->template getPtr() : nullptr; - auto acceptedLengths = outputs.speculativeDecodingOutputs->acceptedLengths.template getPtr(); - auto acceptedLengthsCumSum - = outputs.speculativeDecodingOutputs->acceptedLengthsCumSum.template getPtr(); - auto pathsOffsets = outputs.speculativeDecodingOutputs->pathsOffsets.template getPtr(); + auto batchSlots = inputs.batchSlots ? 
inputs.batchSlots->template getPtr() : nullptr; + auto numNewTokens = outputs.numNewTokens->template getPtr(); + auto numNewTokensCumSum = outputs.numNewTokensCumSum.template getPtr(); + auto pathsOffsets = outputs.pathsOffsets.template getPtr(); TLLM_CHECK_WITH_INFO(batchSlots != nullptr, "Batch slots must be provided for MedusaDecoding"); - TLLM_CHECK_WITH_INFO(acceptedLengths != nullptr, "Accepted lengths must be provided for MedusaDecoding"); - TLLM_CHECK_WITH_INFO(acceptedLengthsCumSum != nullptr, "acceptedLengthsCumSum must be provided for MedusaDecoding"); + TLLM_CHECK_WITH_INFO(numNewTokens != nullptr, "Accepted lengths must be provided for MedusaDecoding"); + TLLM_CHECK_WITH_INFO(numNewTokensCumSum != nullptr, "numNewTokensCumSum must be provided for MedusaDecoding"); TLLM_CHECK_WITH_INFO(pathsOffsets != nullptr, "pathsOffsets must be provided for MedusaDecoding"); - invokePackAcceptedPaths(acceptedLengthsCumSum, pathsOffsets, acceptedLengths, mBestPathIdsDevice, paths, batchSlots, + invokePackAcceptedPaths(numNewTokensCumSum, pathsOffsets, numNewTokens, mBestPathIdsDevice, paths, batchSlots, batchSize, mDecoderDomain.getMaxDecodingTokens(), mDecoderDomain.getSpeculativeDecodingModule()->getMaxPathLen(), false, mStream); @@ -500,5 +497,4 @@ void MedusaDecodingLayer::packAcceptedPaths( template class MedusaDecodingLayer; template class MedusaDecodingLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/medusaDecodingLayer.h b/cpp/tensorrt_llm/layers/medusaDecodingLayer.h index f66c93e51..d1c0013fc 100644 --- a/cpp/tensorrt_llm/layers/medusaDecodingLayer.h +++ b/cpp/tensorrt_llm/layers/medusaDecodingLayer.h @@ -19,46 +19,13 @@ #include -#include "tensorrt_llm/common/tensor.h" -#include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/layers/baseLayer.h" #include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" -namespace tc = tensorrt_llm::common; - -namespace tensorrt_llm -{ -namespace layers -{ - -class MedusaSetupParams : public BaseSetupParams -{ -public: - std::optional> runtimeTopK; // [1] or [setupBatchSize] on cpu - std::optional>> runtimeHeadsTopK; // [setupBatchSize, maxDraftPathLen] on cpu - std::optional> randomSeed; // [1] or [setupBatchSize] on cpu -}; - -class MedusaInputParams : public BaseInputParams +namespace tensorrt_llm::layers { -public: - explicit MedusaInputParams(tc::Tensor logits, tc::Tensor endIds) - : BaseInputParams{0, 0, std::move(endIds)} - , logits{std::move(logits)} - { - } - - tc::Tensor logits; // [maxBatchSize, beamWidth, vocabSizePadded] - - tc::Tensor paths; // [maxBatchSize, maxDecodingTokens, maxPathLen] on gpu - std::vector> - medusaLogits; // [maxBatchSize][maxDraftPathLen][maxDecodingTokens, vocabSize] on gpu - tc::Tensor medusaCurTokensPerStep; // [maxBatchSize] on gpu - tc::Tensor medusaTargetTokensPerStep; // [maxBatchSize] on gpu - tc::Tensor treeIds; // [maxBatchSize, maxDecodingTokens] on gpu -}; //! 
\brief template @@ -74,19 +41,20 @@ class MedusaDecodingLayer : public BaseLayer ~MedusaDecodingLayer() override; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: void allocateBuffer(); void freeBuffer(); - void samplePrimeHeadTokens(DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs); - void acceptDraftTokens(DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs); - void sampleNewDraftTokens(DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs); - void scatterNewDraftTokens(DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs); - void packAcceptedPaths(DynamicDecodeOutputParams const& outputs, MedusaInputParams const& inputs); + void samplePrimeHeadTokens(SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs); + void acceptDraftTokens(SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs); + void sampleNewDraftTokens(SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs); + void scatterNewDraftTokens(SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs); + void packAcceptedPaths(SpeculativeDecodingOutputs const& outputs, MedusaDecodingInputs const& inputs); private: using Base::mStream; @@ -118,5 +86,4 @@ class MedusaDecodingLayer : public BaseLayer std::vector mCummulativeTopK; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/penaltyLayer.cpp b/cpp/tensorrt_llm/layers/penaltyLayer.cpp index fcd9d2cfe..f70900d1a 100644 --- a/cpp/tensorrt_llm/layers/penaltyLayer.cpp +++ b/cpp/tensorrt_llm/layers/penaltyLayer.cpp @@ -17,9 +17,11 @@ #include "tensorrt_llm/layers/penaltyLayer.h" #include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/kernels/penaltyKernels.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" +#include "tensorrt_llm/layers/layerUtils.h" +#include "tensorrt_llm/runtime/bufferManager.h" #include @@ -27,9 +29,7 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template @@ -182,7 +182,7 @@ void PenaltyLayer::freeBuffer() template void PenaltyLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -207,14 +207,15 @@ void PenaltyLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType FillBuffers const fillBuffers{batchSize, mDecoderDomain.getBatchSize(), mStream}; auto const& penaltyParams = setupParams->penaltyParams; + TLLM_CHECK_WITH_INFO(penaltyParams, "penaltyParams for setup is not set"); - bool const useTemperature = mDecodingMode.isUseTemperature() && penaltyParams.temperature.has_value(); + bool const useTemperature = mDecodingMode.isUseTemperature() && penaltyParams->temperature.has_value(); bool const useRepetitionPenalty - = mDecodingMode.isUseRepetitionPenalty() && 
penaltyParams.repetitionPenalty.has_value(); - bool const usePresencePenalty = mDecodingMode.isUsePresencePenalty() && penaltyParams.presencePenalty.has_value(); + = mDecodingMode.isUseRepetitionPenalty() && penaltyParams->repetitionPenalty.has_value(); + bool const usePresencePenalty = mDecodingMode.isUsePresencePenalty() && penaltyParams->presencePenalty.has_value(); bool const useFrequencyPenalty - = mDecodingMode.isUseFrequencyPenalty() && penaltyParams.frequencyPenalty.has_value(); - bool const useMinLength = mDecodingMode.isUseMinLength() && penaltyParams.minLength.has_value(); + = mDecodingMode.isUseFrequencyPenalty() && penaltyParams->frequencyPenalty.has_value(); + bool const useMinLength = mDecodingMode.isUseMinLength() && penaltyParams->minLength.has_value(); // FIXME(nkorobov): once one of the requests has some penalty, we will always have to compute it. // To avoid that we need to scan through all active requests at each iteration. mUseTemperature |= useTemperature; @@ -225,31 +226,31 @@ void PenaltyLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType if (mUseTemperature) { - fillBuffers(penaltyParams.temperature, DefaultDecodingParams::getTemperature(), mTemperature, + fillBuffers(penaltyParams->temperature, DefaultDecodingParams::getTemperature(), mTemperature, mTemperatureDevice, batchSlotsHost, getLimitsPenalty(DecodingPenaltyType::Temperature), "temperature penalty"); } if (mUseRepetitionPenalty) { - fillBuffers(penaltyParams.repetitionPenalty, DefaultDecodingParams::getRepetitionPenalty(), mRepetitionPenalty, + fillBuffers(penaltyParams->repetitionPenalty, DefaultDecodingParams::getRepetitionPenalty(), mRepetitionPenalty, mRepetitionPenaltyDevice, batchSlotsHost, getLimitsPenalty(DecodingPenaltyType::Repetition), "repetition penalty"); } if (mUsePresencePenalty) { - fillBuffers(penaltyParams.presencePenalty, DefaultDecodingParams::getPresencePenalty(), mPresencePenalty, + fillBuffers(penaltyParams->presencePenalty, DefaultDecodingParams::getPresencePenalty(), mPresencePenalty, mPresencePenaltyDevice, batchSlotsHost, getLimitsPenalty(DecodingPenaltyType::Presence), "presence penalty"); } if (mUseFrequencyPenalty) { - fillBuffers(penaltyParams.frequencyPenalty, DefaultDecodingParams::getFrequencyPenalty(), mFrequencyPenalty, + fillBuffers(penaltyParams->frequencyPenalty, DefaultDecodingParams::getFrequencyPenalty(), mFrequencyPenalty, mFrequencyPenaltyDevice, batchSlotsHost, getLimitsPenalty(DecodingPenaltyType::Frequency), "frequency penalty"); } if (mUseMinLength) { - fillBuffers(penaltyParams.minLength, DefaultDecodingParams::getMinLength(), mMinLength, mMinLengthDevice, + fillBuffers(penaltyParams->minLength, DefaultDecodingParams::getMinLength(), mMinLength, mMinLengthDevice, batchSlotsHost, getLimitsPenalty(DecodingPenaltyType::MinLength), "min length"); } @@ -258,21 +259,21 @@ void PenaltyLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType template void PenaltyLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto outputs = std::dynamic_pointer_cast(baseOutputs); - auto params = std::dynamic_pointer_cast(baseInputs); + auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto params = std::dynamic_pointer_cast(baseInputs); auto const localDecoderDomain = getLocalDecoderDomain(params, mDecoderDomain); - auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() 
- 1]; - auto batchSlots = params->batch_slots ? params->batch_slots->template getPtr() : nullptr; + auto const maxSeqLen = outputs->outputIds.shape[outputs->outputIds.shape.size() - 1]; + auto batchSlots = params->batchSlots ? params->batchSlots->template getPtr() : nullptr; std::vector batchSlotsVec(localDecoderDomain.getBatchSize()); std::iota(batchSlotsVec.begin(), batchSlotsVec.end(), 0); auto batchSlotsHost - = params->batch_slots ? params->batch_slots->template getPtr() : batchSlotsVec.data(); + = params->batchSlots ? params->batchSlots->template getPtr() : batchSlotsVec.data(); if (!mLogitsPtrsHost->data()) { @@ -288,12 +289,12 @@ void PenaltyLayer::forwardAsync( auto logitsPtrsHostData = reinterpret_cast(runtime::bufferCast(*logitsPtrsHost)); for (SizeType32 bi = 0; bi < localDecoderDomain.getBatchSize(); bi++) { - if (params->logits_vec) + if (params->logitsVec) { - TLLM_CHECK_WITH_INFO(params->logits_vec->size() == localDecoderDomain.getBatchSize(), - "Logits vector size (%lu) is not equal to the batchSize (%d)", params->logits_vec->size(), + TLLM_CHECK_WITH_INFO(params->logitsVec->size() == localDecoderDomain.getBatchSize(), + "Logits vector size (%lu) is not equal to the batchSize (%d)", params->logitsVec->size(), localDecoderDomain.getBatchSize()); - logitsPtrsHostData[bi] = params->logits_vec.value()[bi].template getPtr(); + logitsPtrsHostData[bi] = params->logitsVec.value()[bi].template getPtr(); } else { @@ -303,12 +304,11 @@ void PenaltyLayer::forwardAsync( } SizeType32 const* inputLengths = nullptr; - if (params->input_lengths) + if (params->inputLengths) { - auto& input_lengths = params->input_lengths.value(); - inputLengths = input_lengths.template getPtr(); + inputLengths = params->inputLengths->template getPtr(); } - auto* embeddingBias = params->embedding_bias ? params->embedding_bias->template getPtr() : nullptr; + auto* embeddingBias = params->embeddingBias ? params->embeddingBias->template getPtr() : nullptr; #define GET_PENALTIES(capital_name, type) \ (mUse##capital_name \ && !allOfBatchSlots(batchSlotsHost, m##capital_name.data(), localDecoderDomain.getBatchSize(), \ @@ -324,9 +324,8 @@ void PenaltyLayer::forwardAsync( #undef GET_PENALTIES - auto const tokensPerStep = params->medusaInputs - ? params->medusaInputs->medusaCurTokensPerStep.template getPtr() - : nullptr; + auto const tokensPerStep + = params->curTokensPerStep ? 
params->curTokensPerStep->template getPtr() : nullptr; InvokeBatchApplyPenaltyParams penaltyParams; penaltyParams.inputLogits = reinterpret_cast(logitsPtrsHostData); @@ -343,12 +342,12 @@ void PenaltyLayer::forwardAsync( penaltyParams.maxSeqLen = maxSeqLen; penaltyParams.vocabSize = mDecoderDomain.getVocabSize(); penaltyParams.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); - penaltyParams.outputIdsPtr = outputs->output_ids_ptr.template getPtr(); - penaltyParams.parentIdsPtr = outputs->parent_ids_ptr.template getPtr(); + penaltyParams.outputIdsPtr = outputs->outputIdsPtr.template getPtr(); + penaltyParams.parentIdsPtr = outputs->parentIdsPtr.template getPtr(); penaltyParams.inputLengths = inputLengths; - penaltyParams.sequenceLengths = outputs->sequence_length->template getPtr(); + penaltyParams.sequenceLengths = outputs->sequenceLength->template getPtr(); penaltyParams.minLengths = minLengths; - penaltyParams.endIds = params->end_ids.template getPtr(); + penaltyParams.endIds = params->endIds.template getPtr(); penaltyParams.batchSlots = batchSlots; penaltyParams.maxTokensPerStep = mDecoderDomain.getMaxDecodingTokens(); penaltyParams.tokensPerStep = tokensPerStep; @@ -376,5 +375,4 @@ void PenaltyLayer::forwardAsync( template class PenaltyLayer; template class PenaltyLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/penaltyLayer.h b/cpp/tensorrt_llm/layers/penaltyLayer.h index ec6c24f27..257f0316d 100644 --- a/cpp/tensorrt_llm/layers/penaltyLayer.h +++ b/cpp/tensorrt_llm/layers/penaltyLayer.h @@ -19,17 +19,12 @@ #include -#include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/layers/baseLayer.h" #include "tensorrt_llm/layers/decodingParams.h" -#include "tensorrt_llm/layers/layerUtils.h" -#include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/iTensor.h" -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { //! \brief Layer applies penalties to the logits. Supports: @@ -48,10 +43,11 @@ class PenaltyLayer : public BaseLayer ~PenaltyLayer() override; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; //! 
\brief Modifies 'outputs->logits' in-place with -INF for banned words - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; T* getRuntimeLogitsDevice() { @@ -103,5 +99,4 @@ class PenaltyLayer : public BaseLayer runtime::ITensor::SharedPtr mLogitsPtrsHost; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/samplingLayer.cpp b/cpp/tensorrt_llm/layers/samplingLayer.cpp index e6963f154..7c67a73bf 100644 --- a/cpp/tensorrt_llm/layers/samplingLayer.cpp +++ b/cpp/tensorrt_llm/layers/samplingLayer.cpp @@ -19,7 +19,8 @@ #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" -#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/layers/topKSamplingLayer.h" +#include "tensorrt_llm/layers/topPSamplingLayer.h" #include @@ -27,9 +28,7 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template SamplingLayer::SamplingLayer(executor::DecodingMode const& mode, DecoderDomain const& decoderDomain, @@ -111,7 +110,7 @@ void SamplingLayer::freeBuffer() template void SamplingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -168,20 +167,17 @@ void SamplingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeTyp template void SamplingLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& outputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto inputs = std::dynamic_pointer_cast(baseInputs); - auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); - auto const batchSize = inputs->logits.shape[0]; + auto const batchSize = inputs->logits->shape[0]; - auto logits = inputs->logits.template getPtr(); - auto endIds = inputs->end_ids.template getPtr(); - auto batchSlots = inputs->batch_slots ? inputs->batch_slots->template getPtr() : nullptr; - float* cumLogProbs = (outputs->cum_log_probs) ? outputs->cum_log_probs->template getPtr() : nullptr; - float* outputLogProbs = (outputs->output_log_probs) ? outputs->output_log_probs->template getPtr() : nullptr; + auto logits = inputs->logits->template getPtr(); + auto endIds = inputs->endIds.template getPtr(); + auto batchSlots = inputs->batchSlots ? inputs->batchSlots->template getPtr() : nullptr; FinishedState* finishedInput = (inputs->finished) ? 
reinterpret_cast(inputs->finished->template getPtr()) @@ -192,9 +188,9 @@ void SamplingLayer::forwardAsync( // Compute probabilities either for TopP or if cumLogProbs or outputLogProbs are specified bool const skipSoftMax = skipTopP && !mOutputLogProbs && !mCumLogProbs; - inputs->curand_states = mCurandStatesDevice; - inputs->sampling_workspace = mSamplingWorkspaceDevice; - inputs->probs_computed = !skipSoftMax; + inputs->curandStates = mCurandStatesDevice; + inputs->samplingWorkspace = mSamplingWorkspaceDevice; + inputs->probsComputed = !skipSoftMax; if (!skipSoftMax) { invokeAddBiasSoftMax(logits, (T**) nullptr, logits, (T*) (nullptr), endIds, finishedInput, batchSlots, @@ -205,7 +201,7 @@ void SamplingLayer::forwardAsync( for (auto&& layer : mSamplingLayers) { - layer->forwardAsync(baseOutputs, baseInputs); + layer->forwardAsync(outputs, baseInputs); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -214,5 +210,4 @@ void SamplingLayer::forwardAsync( template class SamplingLayer; template class SamplingLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/samplingLayer.h b/cpp/tensorrt_llm/layers/samplingLayer.h index b1dcd955b..ee64f50a6 100644 --- a/cpp/tensorrt_llm/layers/samplingLayer.h +++ b/cpp/tensorrt_llm/layers/samplingLayer.h @@ -17,21 +17,14 @@ #pragma once -#include - -#include "tensorrt_llm/common/tensor.h" +#include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/layers/baseLayer.h" #include "tensorrt_llm/layers/decodingParams.h" -#include "tensorrt_llm/layers/samplingParams.h" -#include "tensorrt_llm/layers/topKSamplingLayer.h" -#include "tensorrt_llm/layers/topPSamplingLayer.h" #include "tensorrt_llm/runtime/common.h" -namespace tc = tensorrt_llm::common; +#include -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { //! \brief Top class for sampling layers. @@ -48,9 +41,10 @@ class SamplingLayer : public BaseLayer ~SamplingLayer() override = default; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: using Base::mWorkspaceSize; @@ -82,5 +76,4 @@ class SamplingLayer : public BaseLayer void freeBuffer(); }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/samplingParams.h b/cpp/tensorrt_llm/layers/samplingParams.h deleted file mode 100644 index 169184bd3..000000000 --- a/cpp/tensorrt_llm/layers/samplingParams.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "tensorrt_llm/layers/decodingParams.h" -#include -#include - -#include -#include - -namespace tc = tensorrt_llm::common; - -namespace tensorrt_llm::layers -{ - -class SamplingSetupParams : public BaseSetupParams -{ -public: - std::optional> runtime_top_k; // [1] or [batchSize] on cpu - std::optional> runtime_top_p; // [1] or [batchSize] on cpu - std::optional> randomSeed; // [1] or [batchSize] on cpu - std::optional> top_p_decay; // [batchSize], must between [0, 1] - std::optional> top_p_min; // [batchSize], must between [0, 1] - std::optional> top_p_reset_ids; // [batchSize] - std::optional> outputLogProbs; // [batchSize] - std::optional> cumLogProbs; // [batchSize] - std::optional normalize_log_probs; -}; - -class SamplingInputParams : public BaseInputParams -{ -public: - explicit SamplingInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor logits, - tc::Tensor end_ids, runtime::SizeType32 max_seq_len) - : BaseInputParams{step, ite, std::move(end_ids)} - , logits{std::move(logits)} - , max_seq_len{max_seq_len} - { - } - - // mandatory parameters - tc::Tensor logits; // [local_batch_size, beam_width, vocab_size_padded] - runtime::SizeType32 max_seq_len; - - // optional parameters - std::optional input_lengths; // [localBatchSize] - curandState_t* curand_states; // [localBatchSize] - // Pointer to the workspace for sampling computation - void* sampling_workspace; - // Flag to mark that logits tensor contains probabilities - bool probs_computed; -}; - -class SamplingOutputParams : public BaseOutputParams -{ -public: - explicit SamplingOutputParams(tc::Tensor outputIds) - : BaseOutputParams{std::move(outputIds)} - { - } -}; - -} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/stopCriteriaLayer.cpp b/cpp/tensorrt_llm/layers/stopCriteriaLayer.cpp index f51655ff7..317858fc2 100644 --- a/cpp/tensorrt_llm/layers/stopCriteriaLayer.cpp +++ b/cpp/tensorrt_llm/layers/stopCriteriaLayer.cpp @@ -17,19 +17,14 @@ #include "tensorrt_llm/layers/stopCriteriaLayer.h" #include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/stopCriteriaKernels.h" #include "tensorrt_llm/layers/layerUtils.h" -#include - using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template @@ -44,7 +39,7 @@ StopCriteriaLayer::StopCriteriaLayer(executor::DecodingMode const& mode, Deco template void StopCriteriaLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr setupParams) + std::shared_ptr const& setupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -52,16 +47,18 @@ void StopCriteriaLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, Siz template void StopCriteriaLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& baseOutputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto inputs = std::dynamic_pointer_cast(baseInputs); - auto outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto outputs = std::dynamic_pointer_cast(baseOutputs); auto const localDecoderDomain = getLocalDecoderDomain(inputs, mDecoderDomain); - auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() - 
1]; - auto batchSlots = inputs->batch_slots ? inputs->batch_slots->template getPtr() : nullptr; + auto const maxSeqLen = outputs->outputIds.shape[outputs->outputIds.shape.size() - 1]; + auto batchSlots = inputs->batchSlots ? inputs->batchSlots->template getPtr() : nullptr; + + TLLM_CHECK_WITH_INFO(inputs->stopCriteriaInputs, "stopCriteriaInputs for forward is not set"); if (mDecodingMode.isUseStopWords()) { @@ -80,60 +77,61 @@ void StopCriteriaLayer::forwardAsync( } template -void StopCriteriaLayer::checkStopWordsStopCriteria(std::shared_ptr& outputs, - std::shared_ptr const& inputs, SizeType32 const* batchSlots, - DecoderDomain const& decoderDomain, SizeType32 maxSeqLen, cudaStream_t stream) +void StopCriteriaLayer::checkStopWordsStopCriteria(std::shared_ptr& outputs, + std::shared_ptr const& inputs, SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, + SizeType32 maxSeqLen, cudaStream_t stream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const maxStopWordsLength = inputs->max_stop_words_len; + auto const maxStopWordsLength = inputs->stopCriteriaInputs->maxStopWordsLen; if (maxStopWordsLength) { - auto numNewTokens = outputs->speculativeDecodingOutputs - ? outputs->speculativeDecodingOutputs->acceptedLengths.template getPtr() - : nullptr; - invokeStopWordsCriterion(outputs->output_ids_ptr.template getPtr(), - outputs->parent_ids_ptr.template getPtr(), - inputs->stop_words_ptr->template getPtr(), + auto numNewTokens = outputs->numNewTokens ? outputs->numNewTokens->template getPtr() : nullptr; + invokeStopWordsCriterion(outputs->outputIdsPtr.template getPtr(), + outputs->parentIdsPtr.template getPtr(), + inputs->stopCriteriaInputs->stopWordsPtr->template getPtr(), reinterpret_cast(outputs->finished->template getPtr()), - outputs->sequence_length->template getPtr(), batchSlots, - inputs->stop_words_lengths->template getPtr(), numNewTokens, maxStopWordsLength, - decoderDomain.getBatchSize(), decoderDomain.getBeamWidth(), maxSeqLen, stream); + outputs->sequenceLength->template getPtr(), batchSlots, + inputs->stopCriteriaInputs->stopWordsLengths->template getPtr(), numNewTokens, + maxStopWordsLength, decoderDomain.getBatchSize(), decoderDomain.getBeamWidth(), maxSeqLen, stream); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template -void StopCriteriaLayer::checkMaxLengthStopCriteria(std::shared_ptr& outputs, - std::shared_ptr const& inputs, SizeType32 const* batchSlots, - DecoderDomain const& decoderDomain, SizeType32 maxSeqLen, cudaStream_t stream) +void StopCriteriaLayer::checkMaxLengthStopCriteria(std::shared_ptr& outputs, + std::shared_ptr const& inputs, SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, + SizeType32 maxSeqLen, cudaStream_t stream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - if (inputs->sequence_limit_length) + if (inputs->stopCriteriaInputs->sequenceLimitLength) { + auto numNewTokens = outputs->numNewTokens ? outputs->numNewTokens->template getPtr() : nullptr; + invokeLengthCriterion( reinterpret_cast(outputs->finished->template getPtr()), - outputs->finished_sum ? outputs->finished_sum->template getPtr() : nullptr, - inputs->sequence_limit_length->template getPtr(), - outputs->sequence_length->template getPtr(), batchSlots, decoderDomain.getBatchSize(), - decoderDomain.getBeamWidth(), stream); + outputs->finishedSum ? 
outputs->finishedSum->template getPtr() : nullptr, + inputs->stopCriteriaInputs->sequenceLimitLength->template getPtr(), + outputs->sequenceLength->template getPtr(), numNewTokens, batchSlots, + decoderDomain.getBatchSize(), decoderDomain.getBeamWidth(), stream); sync_check_cuda_error(); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template -void StopCriteriaLayer::checkEosToken(std::shared_ptr& outputs, - std::shared_ptr const& inputs, SizeType32 const* batchSlots, - DecoderDomain const& decoderDomain, SizeType32 maxSeqLen, cudaStream_t stream) +void StopCriteriaLayer::checkEosToken(std::shared_ptr& outputs, + std::shared_ptr const& inputs, SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, + SizeType32 maxSeqLen, cudaStream_t stream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - invokeExplicitEOSCriterion(outputs->output_ids_ptr.template getPtr(), - inputs->end_ids.template getPtr(), + + auto numNewTokens = outputs->numNewTokens ? outputs->numNewTokens->template getPtr() : nullptr; + + invokeExplicitEOSCriterion(outputs->outputIdsPtr.template getPtr(), + inputs->endIds.template getPtr(), reinterpret_cast(outputs->finished->template getPtr()), - outputs->sequence_length->template getPtr(), - // FIXME(nkorobov): add tokens per step tensor when necessary - /* tokensPerStep */ nullptr, batchSlots, decoderDomain.getBatchSize(), decoderDomain.getBeamWidth(), - decoderDomain.getMaxDecodingTokens(), stream); + outputs->sequenceLength->template getPtr(), numNewTokens, batchSlots, decoderDomain.getBatchSize(), + decoderDomain.getBeamWidth(), decoderDomain.getMaxDecodingTokens(), stream); sync_check_cuda_error(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -141,5 +139,4 @@ void StopCriteriaLayer::checkEosToken(std::shared_ptr; template class StopCriteriaLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/stopCriteriaLayer.h b/cpp/tensorrt_llm/layers/stopCriteriaLayer.h index 3aabbfbdf..6364f2507 100644 --- a/cpp/tensorrt_llm/layers/stopCriteriaLayer.h +++ b/cpp/tensorrt_llm/layers/stopCriteriaLayer.h @@ -17,17 +17,13 @@ #pragma once -#include - -#include "tensorrt_llm/common/tensor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/layers/baseLayer.h" #include "tensorrt_llm/layers/decodingParams.h" -#include "tensorrt_llm/runtime/iTensor.h" -namespace tensorrt_llm -{ -namespace layers +#include + +namespace tensorrt_llm::layers { //! \brief Layer to process stop criteria. 
Supports: @@ -43,19 +39,20 @@ class StopCriteriaLayer : public BaseLayer ~StopCriteriaLayer() override = default; void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; + std::shared_ptr const& setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; private: - static void checkMaxLengthStopCriteria(std::shared_ptr& outputs, - std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, + static void checkMaxLengthStopCriteria(std::shared_ptr& outputs, + std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, runtime::SizeType32 maxSeqLen, cudaStream_t stream); - static void checkStopWordsStopCriteria(std::shared_ptr& outputs, - std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, + static void checkStopWordsStopCriteria(std::shared_ptr& outputs, + std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, runtime::SizeType32 maxSeqLen, cudaStream_t stream); - static void checkEosToken(std::shared_ptr& outputs, - std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, + static void checkEosToken(std::shared_ptr& outputs, + std::shared_ptr const& inputs, runtime::SizeType32 const* batchSlots, DecoderDomain const& decoderDomain, runtime::SizeType32 maxSeqLen, cudaStream_t stream); private: @@ -70,5 +67,4 @@ class StopCriteriaLayer : public BaseLayer executor::DecodingMode mDecodingMode; }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/topKSamplingLayer.cu b/cpp/tensorrt_llm/layers/topKSamplingLayer.cu index f98aae5b3..75e4cd15c 100644 --- a/cpp/tensorrt_llm/layers/topKSamplingLayer.cu +++ b/cpp/tensorrt_llm/layers/topKSamplingLayer.cu @@ -19,11 +19,9 @@ #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/samplingTopKKernels.h" -#include "tensorrt_llm/kernels/samplingTopPKernels.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/layers/layerUtils.h" #include "tensorrt_llm/layers/topKSamplingLayer.h" -#include "tensorrt_llm/runtime/iTensor.h" #include #include @@ -32,9 +30,7 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { template @@ -135,19 +131,19 @@ void TopKSamplingLayer::freeBuffer() template void TopKSamplingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto setupParams = std::dynamic_pointer_cast(baseSetupParams); auto const defaultTopK = DefaultDecodingParams::getTopK(); - auto runtimeTopK = setupParams->runtime_top_k.value_or(std::vector(batchSize, defaultTopK)); - auto runtimeTopP = setupParams->runtime_top_p.value_or(std::vector{}); + auto runtimeTopK = setupParams->runtimeTopK.value_or(std::vector(batchSize, defaultTopK)); + auto runtimeTopP = setupParams->runtimeTopP.value_or(std::vector{}); auto const runtimeTopKSize = runtimeTopK.size(); auto const runtimeTopPSize = runtimeTopP.size(); - mNormalizeLogProbs = 
setupParams->normalize_log_probs.has_value() && setupParams->normalize_log_probs.value(); + mNormalizeLogProbs = setupParams->normalizeLogProbs.has_value() && setupParams->normalizeLogProbs.value(); for (auto& topP : runtimeTopP) { @@ -218,26 +214,24 @@ void TopKSamplingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, Siz template void TopKSamplingLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& outputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - std::shared_ptr inputs = std::dynamic_pointer_cast(baseInputs); - std::shared_ptr outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); - auto const batchSize = inputs->logits.shape[0]; + auto const batchSize = inputs->logits->shape[0]; - auto logits = inputs->logits.template getPtr(); - auto endIds = inputs->end_ids.template getPtr(); - auto batchSlots = inputs->batch_slots ? inputs->batch_slots->template getPtr() : nullptr; - auto curandStatesDevice = inputs->curand_states; - auto samplingWorkspaceDevice = inputs->sampling_workspace; - auto const probsComputed = inputs->probs_computed; + auto logits = inputs->logits->template getPtr(); + auto endIds = inputs->endIds.template getPtr(); + auto batchSlots = inputs->batchSlots ? inputs->batchSlots->template getPtr() : nullptr; + auto curandStatesDevice = inputs->curandStates; + auto samplingWorkspaceDevice = inputs->samplingWorkspace; + auto const probsComputed = inputs->probsComputed; std::vector batchSlotsVec(batchSize); std::iota(batchSlotsVec.begin(), batchSlotsVec.end(), 0); - auto batchSlotsHost - = inputs->batch_slots ? inputs->batch_slots->template getPtr() : batchSlotsVec.data(); + auto batchSlotsHost = inputs->batchSlots ? inputs->batchSlots->template getPtr() : batchSlotsVec.data(); auto const skip = allOfBatchSlots(batchSlotsHost, mSkipDecodeHost, batchSize, true); if (skip) { @@ -254,14 +248,14 @@ void TopKSamplingLayer::forwardAsync( ? reinterpret_cast(outputs->finished->template getPtr()) : nullptr; - auto cumLogProbs = (outputs->cum_log_probs) ? outputs->cum_log_probs->template getPtr() : nullptr; - auto outputLogProbs = (outputs->output_log_probs) ? outputs->output_log_probs->template getPtr() : nullptr; - auto sequenceLengths - = (outputs->sequence_length) ? outputs->sequence_length->template getPtr() : nullptr; + auto cumLogProbs = (outputs->cumLogProbs) ? outputs->cumLogProbs->template getPtr() : nullptr; + auto outputLogProbs + = (outputs->outputLogProbsTiled) ? outputs->outputLogProbsTiled->template getPtr() : nullptr; + auto sequenceLengths = (outputs->sequenceLength) ? 
outputs->sequenceLength->template getPtr() : nullptr; TopKSamplingKernelParams params; params.logProbs = logits; - params.outputIdsPtrs = outputs->output_ids_ptr.template getPtr(); + params.outputIdsPtrs = outputs->outputIdsPtr.template getPtr(); params.workspace = samplingWorkspaceDevice; params.maxTopP = 1.0f; params.topPs = mRuntimeTopPDevice; @@ -292,5 +286,4 @@ void TopKSamplingLayer::forwardAsync( template class TopKSamplingLayer; template class TopKSamplingLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/topKSamplingLayer.h b/cpp/tensorrt_llm/layers/topKSamplingLayer.h index 306c5b7e5..4b770a9f7 100644 --- a/cpp/tensorrt_llm/layers/topKSamplingLayer.h +++ b/cpp/tensorrt_llm/layers/topKSamplingLayer.h @@ -17,16 +17,10 @@ #pragma once -#include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/common/tensor.h" -#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/layers/baseLayer.h" -#include "tensorrt_llm/layers/samplingParams.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { //! \brief Layer to randomly sample tokens from TopK logits. @@ -43,8 +37,9 @@ class TopKSamplingLayer : public BaseLayer ~TopKSamplingLayer(); void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + std::shared_ptr const& setupParams) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; bool const* getSkipDecodeHost() const { @@ -72,5 +67,4 @@ class TopKSamplingLayer : public BaseLayer void freeBuffer(); }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/topPSamplingLayer.cu b/cpp/tensorrt_llm/layers/topPSamplingLayer.cu index d5ebbc735..470e3be9c 100644 --- a/cpp/tensorrt_llm/layers/topPSamplingLayer.cu +++ b/cpp/tensorrt_llm/layers/topPSamplingLayer.cu @@ -17,9 +17,7 @@ #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/decodingCommon.h" -#include "tensorrt_llm/kernels/samplingTopKKernels.h" #include "tensorrt_llm/kernels/samplingTopPKernels.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/layers/layerUtils.h" @@ -32,9 +30,7 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { static __global__ void setTopPRuntimeArgs(SizeType32 batchSize, SizeType32 topK, SizeType32* topKs, @@ -152,28 +148,27 @@ void TopPSamplingLayer::freeBuffer() template void TopPSamplingLayer::setup(SizeType32 const batchSize, SizeType32 const beamWidth, SizeType32 const* batchSlots, - std::shared_ptr baseSetupParams) + std::shared_ptr const& baseSetupParams) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto setupParams = std::dynamic_pointer_cast(baseSetupParams); auto const defaultTopK = DefaultDecodingParams::getTopK(); - auto runtimeTopK = setupParams->runtime_top_k.value_or(std::vector(batchSize, defaultTopK)); - auto runtimeTopP = setupParams->runtime_top_p.value_or(std::vector{}); + auto runtimeTopK = 
setupParams->runtimeTopK.value_or(std::vector(batchSize, defaultTopK)); + auto runtimeTopP = setupParams->runtimeTopP.value_or(std::vector{}); auto const runtimeTopKSize = runtimeTopK.size(); auto const runtimeTopPSize = runtimeTopP.size(); auto const defaultTopPDecay = DefaultDecodingParams::getTopPDecay(); - auto decayVec = setupParams->top_p_decay.value_or(std::vector(batchSize, defaultTopPDecay)); + auto decayVec = setupParams->topPDecay.value_or(std::vector(batchSize, defaultTopPDecay)); auto const defaultTopPMin = DefaultDecodingParams::getTopPMin(); // prevent TopP becoming 0.0 - auto topPMinVec = setupParams->top_p_min.value_or(std::vector(batchSize, defaultTopPMin)); + auto topPMinVec = setupParams->topPMin.value_or(std::vector(batchSize, defaultTopPMin)); auto const defaultTopPResetId = DefaultDecodingParams::getTopPResetId(); - auto topPResetIdsVec - = setupParams->top_p_reset_ids.value_or(std::vector(batchSize, defaultTopPResetId)); + auto topPResetIdsVec = setupParams->topPResetIds.value_or(std::vector(batchSize, defaultTopPResetId)); if (runtimeTopPSize == 0) { @@ -233,7 +228,7 @@ void TopPSamplingLayer::setup(SizeType32 const batchSize, SizeType32 const be if (runtimeTopPSize > 1) { TLLM_CHECK_WITH_INFO(static_cast(runtimeTopP.size()) == batchSize, - fmtstr("runtime_top_p.size() (%lu) == batchSize (%d) is not satisfied!", runtimeTopP.size(), batchSize)); + fmtstr("runtimeTopP.size() (%lu) == batchSize (%d) is not satisfied!", runtimeTopP.size(), batchSize)); cudaAutoCpy(reinterpret_cast(mSetupWorkspaceDevice), runtimeTopP.data(), batchSize, mStream); invokeScatterDecodingParams( reinterpret_cast(mSetupWorkspaceDevice), mRuntimeTopPDevice, batchSlots, batchSize, mStream); @@ -248,12 +243,12 @@ void TopPSamplingLayer::setup(SizeType32 const batchSize, SizeType32 const be invokeScatterDecodingParams(deviceTmpBuffer, deviceBuffer, batchSlots, batchSize, mStream); }; - fillBuffers("top_p_decay", decayVec, reinterpret_cast(mSetupWorkspaceDevice), mTopPDecayDevice); + fillBuffers("topPDecay", decayVec, reinterpret_cast(mSetupWorkspaceDevice), mTopPDecayDevice); - fillBuffers("top_p_min", topPMinVec, reinterpret_cast(mSetupWorkspaceDevice), mTopPMinDevice); + fillBuffers("topPMin", topPMinVec, reinterpret_cast(mSetupWorkspaceDevice), mTopPMinDevice); fillBuffers( - "top_p_reset_ids", topPResetIdsVec, reinterpret_cast(mSetupWorkspaceDevice), mTopPResetIdsDevice); + "topPResetIds", topPResetIdsVec, reinterpret_cast(mSetupWorkspaceDevice), mTopPResetIdsDevice); { dim3 block(std::min(static_cast(batchSize), 256u)); @@ -296,19 +291,17 @@ void TopPSamplingLayer::setup(SizeType32 const batchSize, SizeType32 const be template void TopPSamplingLayer::forwardAsync( - std::shared_ptr baseOutputs, std::shared_ptr baseInputs) + std::shared_ptr const& outputs, std::shared_ptr const& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - std::shared_ptr inputs = std::dynamic_pointer_cast(baseInputs); - std::shared_ptr outputs = std::dynamic_pointer_cast(baseOutputs); + auto inputs = std::dynamic_pointer_cast(baseInputs); - auto const batchSize = inputs->logits.shape[0]; + auto const batchSize = inputs->logits->shape[0]; std::vector batchSlotsVec(batchSize); std::iota(batchSlotsVec.begin(), batchSlotsVec.end(), 0); - auto batchSlotsHost - = inputs->batch_slots ? inputs->batch_slots->template getPtr() : batchSlotsVec.data(); + auto batchSlotsHost = inputs->batchSlots ? 
inputs->batchSlots->template getPtr() : batchSlotsVec.data(); auto const skip = allOfBatchSlots(batchSlotsHost, mSkipDecodeHost, batchSize, true); if (skip) { @@ -316,11 +309,11 @@ void TopPSamplingLayer::forwardAsync( } // Probabilities must be already computed instead of logits - auto probs = inputs->logits.template getPtr(); - auto endIds = inputs->end_ids.template getPtr(); - auto batchSlots = inputs->batch_slots ? inputs->batch_slots->template getPtr() : nullptr; - auto curandStatesDevice = inputs->curand_states; - auto samplingWorkspaceDevice = inputs->sampling_workspace; + auto probs = inputs->logits->template getPtr(); + auto endIds = inputs->endIds.template getPtr(); + auto batchSlots = inputs->batchSlots ? inputs->batchSlots->template getPtr() : nullptr; + auto curandStatesDevice = inputs->curandStates; + auto samplingWorkspaceDevice = inputs->samplingWorkspace; TLLM_CHECK_WITH_INFO(curandStatesDevice, "No curand states provided"); TLLM_CHECK_WITH_INFO(samplingWorkspaceDevice, "No sampling workspace provided"); @@ -332,14 +325,14 @@ void TopPSamplingLayer::forwardAsync( ? reinterpret_cast(outputs->finished->template getPtr()) : nullptr; - auto cumLogProbs = (outputs->cum_log_probs) ? outputs->cum_log_probs->template getPtr() : nullptr; - auto outputLogProbs = (outputs->output_log_probs) ? outputs->output_log_probs->template getPtr() : nullptr; - auto sequenceLength - = (outputs->sequence_length) ? outputs->sequence_length->template getPtr() : nullptr; + auto cumLogProbs = (outputs->cumLogProbs) ? outputs->cumLogProbs->template getPtr() : nullptr; + auto outputLogProbs + = (outputs->outputLogProbsTiled) ? outputs->outputLogProbsTiled->template getPtr() : nullptr; + auto sequenceLength = (outputs->sequenceLength) ? outputs->sequenceLength->template getPtr() : nullptr; TopPSamplingKernelParams params; params.probs = probs; - params.outputIds = outputs->output_ids_ptr.template getPtr(); + params.outputIds = outputs->outputIdsPtr.template getPtr(); params.workspace = samplingWorkspaceDevice; params.topPs = mRuntimeTopPDevice; params.sequenceLength = sequenceLength; @@ -370,7 +363,7 @@ void TopPSamplingLayer::forwardAsync( sync_check_cuda_error(); invokeComputeToppDecay(mRuntimeTopPDevice, mInitialTopPDevice, - outputs->output_ids_ptr.template getPtr(), mTopPDecayDevice, mTopPMinDevice, + outputs->outputIdsPtr.template getPtr(), mTopPDecayDevice, mTopPMinDevice, mTopPResetIdsDevice, sequenceLength, batchSlots, batchSize, mStream); sync_check_cuda_error(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -379,5 +372,4 @@ void TopPSamplingLayer::forwardAsync( template class TopPSamplingLayer; template class TopPSamplingLayer; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/topPSamplingLayer.h b/cpp/tensorrt_llm/layers/topPSamplingLayer.h index 76dc821cc..cc683bb18 100644 --- a/cpp/tensorrt_llm/layers/topPSamplingLayer.h +++ b/cpp/tensorrt_llm/layers/topPSamplingLayer.h @@ -17,16 +17,10 @@ #pragma once -#include "tensorrt_llm/common/tensor.h" -#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/layers/baseLayer.h" -#include "tensorrt_llm/layers/samplingParams.h" #include "tensorrt_llm/runtime/common.h" -namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ -namespace layers +namespace tensorrt_llm::layers { //! \brief Layer to randomly sample tokens from TopP logits. 
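The header hunks above and below this point all collapse the two-level `namespace tensorrt_llm { namespace layers {` wrapper (and its two matching closing braces) into a single C++17 nested-namespace definition. The two spellings declare exactly the same entities; a minimal standalone sketch of the equivalence follows, where `ExampleLayer` is an illustrative placeholder and not a class from this patch:

    // Old two-level spelling, as removed by these hunks:
    namespace tensorrt_llm
    {
    namespace layers
    {
    class ExampleLayer
    {
    };
    } // namespace layers
    } // namespace tensorrt_llm

    // C++17 nested-namespace spelling, as introduced by these hunks.
    // Either form declares the same tensorrt_llm::layers::ExampleLayer;
    // only one of the two spellings would appear in a given header.
    namespace tensorrt_llm::layers
    {
    class ExampleLayer
    {
    };
    } // namespace tensorrt_llm::layers

The nested form removes one level of brace bookkeeping and leaves a single closing-namespace comment, which is the convention the rest of the patch applies uniformly across the layers/ headers and sources.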
@@ -43,10 +37,11 @@ class TopPSamplingLayer : public BaseLayer ~TopPSamplingLayer(); void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 const* batchSlots, - std::shared_ptr setupParams) override; - void forwardAsync(std::shared_ptr outputs, std::shared_ptr inputs) override; + std::shared_ptr const& setupParams) override; + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs) override; - bool const* getSkipDecodeHost() const + [[nodiscard]] bool const* getSkipDecodeHost() const { return mSkipDecodeHost; } @@ -82,5 +77,4 @@ class TopPSamplingLayer : public BaseLayer void freeBuffer(); }; -} // namespace layers -} // namespace tensorrt_llm +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.cpp b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.cpp index a3aeb0c68..3c8075bed 100644 --- a/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.cpp +++ b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.cpp @@ -232,7 +232,12 @@ std::optional GemmPluginProfiler) + { + msg << ": " << candidateConfig.toString(); + } + msg << "\n (for" << " m=" << m << ", n=" << n << ", k=" << k << ")" << ", reason: \"" << e.what() << "\". Skipped"; TLLM_LOG_TRACE(msg.str()); diff --git a/cpp/tensorrt_llm/plugins/common/plugin.cpp b/cpp/tensorrt_llm/plugins/common/plugin.cpp index d58cd2015..4156a4ee0 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.cpp +++ b/cpp/tensorrt_llm/plugins/common/plugin.cpp @@ -41,50 +41,60 @@ std::unordered_map* getDtypeMap() return &dtypeMap; } -std::map, ncclComm_t>* getCommMap() +namespace { - static std::map, ncclComm_t> commMap; - return &commMap; -} -void initCommMap(std::set const& group) +// Get NCCL unique ID for a group of ranks. 
+ncclUniqueId getUniqueId(std::set const& group) noexcept { - auto& commMap = *getCommMap(); - // [] operator inserts T() if it does not exist - if (isBuilding() || commMap[group] != nullptr) - { - return; - } - auto& comm = COMM_SESSION; - auto const myRank = comm.getRank(); - - int groupRank = 0; - for (int it : group) - { - if (it == myRank) - { - break; - } - ++groupRank; - } - + auto const rank = COMM_SESSION.getRank(); ncclUniqueId id; - if (myRank == *group.begin()) + if (rank == *group.begin()) { - ncclGetUniqueId(&id); + NCCLCHECK(ncclGetUniqueId(&id)); for (auto it = std::next(std::begin(group), 1); it != group.end(); ++it) { - comm.sendValue(id, *it, 0); + COMM_SESSION.sendValue(id, *it, 0); } } else { - comm.recvValue(id, *group.begin(), 0); + COMM_SESSION.recvValue(id, *group.begin(), 0); } + return id; +} +} // namespace - commMap[group] = nullptr; - NCCLCHECK(ncclCommInitRank(&commMap[group], group.size(), id, groupRank)); +std::shared_ptr getComm(std::set const& group) +{ + static std::map, std::weak_ptr> commMap; + static std::mutex mutex; + std::lock_guard lock(mutex); + auto it = commMap.find(group); + if (it != commMap.end()) + { + // If the weak_ptr can be locked, return the shared_ptr + auto ncclComm = it->second.lock(); + if (ncclComm) + { + return ncclComm; + } + } + + ncclUniqueId id = getUniqueId(group); + auto const rank = COMM_SESSION.getRank(); + auto const groupRank = rank % group.size(); + std::shared_ptr ncclComm(new ncclComm_t, + [](ncclComm_t* comm) + { + ncclCommDestroy(*comm); + delete comm; + }); + NCCLCHECK(ncclCommInitRank(ncclComm.get(), group.size(), id, groupRank)); + commMap[group] = ncclComm; + return ncclComm; } +#endif // ENABLE_MULTI_DEVICE void* tensorrt_llm::plugins::getCommSessionHandle() { @@ -95,8 +105,6 @@ void* tensorrt_llm::plugins::getCommSessionHandle() #endif // ENABLE_MULTI_DEVICE } -#endif // ENABLE_MULTI_DEVICE - namespace { diff --git a/cpp/tensorrt_llm/plugins/common/plugin.h b/cpp/tensorrt_llm/plugins/common/plugin.h index 5c14eb024..bc20b6634 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.h +++ b/cpp/tensorrt_llm/plugins/common/plugin.h @@ -171,9 +171,7 @@ inline bool isBuilding() std::unordered_map* getDtypeMap(); -std::map, ncclComm_t>* getCommMap(); - -void initCommMap(std::set const& group); +std::shared_ptr getComm(std::set const& group); #endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp index 5afaeb410..045eced23 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp @@ -565,7 +565,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng uint32_t decoderXQARunnerResourceSerializedSize; read(d, decoderXQARunnerResourceSerializedSize); - mDecoderXQARunnerResource = DecoderXQARunner::Resource(d, decoderXQARunnerResourceSerializedSize); + DecoderXQARunner::getResourceGlobal()->merge(DecoderXQARunner::Resource(d, decoderXQARunnerResourceSerializedSize)); d += decoderXQARunnerResourceSerializedSize; TLLM_CHECK_WITH_INFO(d == a + length, @@ -1620,8 +1620,8 @@ int GPTAttentionPluginCommon::initialize() noexcept TLLM_CHECK_WITH_INFO(!mMultiBlockMode, "Medusa doesn't support multi-block mode."); } - mDecoderXQARunner.reset(new DecoderXQARunner( - &mDecoderXQARunnerResource, xqa_runner_data_type, mNumHeads, mNumKVHeads, mHeadSize, mMultiBlockMode)); + 
mDecoderXQARunner.reset( + new DecoderXQARunner(xqa_runner_data_type, mNumHeads, mNumKVHeads, mHeadSize, mMultiBlockMode)); } else if (mIsSpecDecodingEnabled) { @@ -1656,8 +1656,8 @@ size_t GPTAttentionPluginCommon::getCommonSerializationSize() const noexcept + sizeof(mCrossAttention) + sizeof(mMaxDistance) + sizeof(mPosShiftEnabled) + sizeof(mDenseContextFMHA) + sizeof(mPagedContextFMHA) + sizeof(mFP8ContextFMHA) + sizeof(mUseKVCache) + sizeof(mUnfuseQkvGemm) + sizeof(mIsSpecDecodingEnabled) + sizeof(mNbMultiBlockSemaphores) - + sizeof(uint32_t) // size of mDecoderXQARunnerResource buffer. - + mDecoderXQARunnerResource.getSerializationSize(); + + sizeof(uint32_t) // size of DecoderXQARunnerResource buffer. + + DecoderXQARunner::getResourceGlobal()->getSerializationSize(); } void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept @@ -1708,9 +1708,9 @@ void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept write(d, mNbMultiBlockSemaphores); // An uint32_t that specifies the size of the serialized buffer, followed by the actual content. - uint32_t decoderXQARunnerResourceSerializedSize = mDecoderXQARunnerResource.getSerializationSize(); + uint32_t decoderXQARunnerResourceSerializedSize = DecoderXQARunner::getResourceGlobal()->getSerializationSize(); write(d, decoderXQARunnerResourceSerializedSize); - mDecoderXQARunnerResource.serialize(d, decoderXQARunnerResourceSerializedSize); + DecoderXQARunner::getResourceGlobal()->serialize(d, decoderXQARunnerResourceSerializedSize); d += decoderXQARunnerResourceSerializedSize; assert(d == a + getCommonSerializationSize()); diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h index af22db368..cfe9f870e 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h @@ -339,7 +339,6 @@ class GPTAttentionPluginCommon : public BasePlugin // The default copy constructor will leave it as nullptr. clone() shall initialize it. std::shared_ptr mDriver; UniqPtrWNullCopy mFMHARunner; - tensorrt_llm::kernels::DecoderXQARunner::Resource mDecoderXQARunnerResource; UniqPtrWNullCopy mDecoderXQARunner; bool mMultiBlockMode; diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp index b6fedf8c9..a96a35834 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp @@ -621,8 +621,10 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 host_secondary_pool_pointer = reinterpret_cast(typed_host_pool_pointers[1] + layerOffset); } - T* context_buf_ - = (T*) (outputs[0]) + outputDesc[0].dims.d[getPackedTensorHiddenDimIndex(mRemovePadding)] * tokenIdxBeg; + // FP8 output when fp8_context_fmha is enabled. + auto const outputElemSize = (mFP8ContextFMHA ? 
1 : sizeof(T)); + T* context_buf_ = reinterpret_cast(static_cast(outputs[0]) + + outputDesc[0].dims.d[getPackedTensorHiddenDimIndex(mRemovePadding)] * tokenIdxBeg * outputElemSize); void* key_value_cache = nullptr; if (useKVCache() && !mPagedKVCache) { @@ -643,28 +645,30 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 int const* spec_decoding_packed_mask = nullptr; int const* spec_decoding_position_offsets = nullptr; int const* spec_decoding_generation_lengths = nullptr; - int num_spec_decoding_tokens = 0; + int num_decoding_draft_tokens = 0; if (mIsSpecDecodingEnabled) { - // Second dimension of spec_decoding_position_offsets is num_spec_decoding_tokens + 1. - // [batch_size, num_spec_decoding_tokens + 1] - num_spec_decoding_tokens = inputDesc[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)].dims.d[1] - 1; - if (num_spec_decoding_tokens > 0) + // Second dimension of spec_decoding_position_offsets is num_decoding_draft_tokens + 1. + // [batch_size, num_decoding_draft_tokens + 1] + num_decoding_draft_tokens = inputDesc[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)].dims.d[1] - 1; + if (num_decoding_draft_tokens > 0) { + // spec_decoding_* tensors are not filled for context requests. Hence, always strting from 0th index + int32_t constexpr genSeqIdx = 0; spec_decoding_packed_mask = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)]) - + seqIdxBeg * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)].dims, 0); + + genSeqIdx * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)].dims, 0); // Packed as [num_tokens, packed_mask_size] - // Use seqIdxBeg * (num_spec_decoding_tokens + 1) here as only generation tokens have the packed_mask + // Use seqIdxBeg * (num_decoding_draft_tokens + 1) here as only generation tokens have the packed_mask // buffer. // TODO: support variable sequence length based on generationTokenIdxBeg. spec_decoding_packed_mask = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)]) - + seqIdxBeg * (num_spec_decoding_tokens + 1) + + genSeqIdx * (num_decoding_draft_tokens + 1) * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)].dims, 0); spec_decoding_position_offsets = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)]) - + seqIdxBeg * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)].dims, 0); + + genSeqIdx * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)].dims, 0); spec_decoding_generation_lengths - = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_GENERATION_LENGTHS)]) + seqIdxBeg; + = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_GENERATION_LENGTHS)]) + genSeqIdx; } } @@ -747,7 +751,7 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 input_seq_length, mIsSpecDecodingEnabled ? 
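The context_buf_ change above amounts to computing the token offset in bytes instead of in T-sized elements, because with fp8_context_fmha enabled the output buffer holds 1-byte values even though the plugin is instantiated for a wider T. A hedged sketch of that arithmetic (tokenOutputPtr is a hypothetical helper, not plugin code):

#include <cstddef>
#include <cstdint>

// Returns the first output element of token `tokenIdxBeg` when the buffer may hold
// either FP8 (1-byte) or sizeof(T)-byte elements.
template <typename T>
T* tokenOutputPtr(void* outputBase, std::size_t hiddenSize, std::size_t tokenIdxBeg, bool fp8Output)
{
    std::size_t const elemSize = fp8Output ? 1 : sizeof(T);
    auto* bytes = static_cast<std::uint8_t*>(outputBase);
    return reinterpret_cast<T*>(bytes + hiddenSize * tokenIdxBeg * elemSize);
}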
"true" : "false", qkvDims.nbDims, qkvDims.d[0], qkvDims.d[1], qkvDims.d[2]); TLLM_CHECK_WITH_INFO( - input_seq_length == num_spec_decoding_tokens + 1, "The generation input length is not expected."); + input_seq_length == num_decoding_draft_tokens + 1, "The generation input length is not expected."); EnqueueGenerationParams enqueue_params{attention_input, qkv_bias, input_seq_length, sequence_kv_length, max_context_kv_len, beamWidth, context_q_lengths, kv_scale_orig_quant, kv_scale_quant_orig, attention_output_orig_quant, rotary_embedding_scaling_factors, alibi_slopes, diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp index f3e23d6b7..cf9e40b89 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp @@ -39,7 +39,7 @@ std::vector MixtureOfExpertsPluginCreator::mPluginAttribu MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(int number_of_experts, int top_k, int expert_hidden_size, int expert_inter_size, tensorrt_llm::ActivationType activation_type, nvinfer1::DataType type, nvinfer1::DataType weight_type, nvinfer1::DataType output_type, QuantMode quant_mode, bool use_finished, - bool use_bias, int tp_size, int tp_rank, MOEParallelismMode parallelism_mode, + bool use_bias, int tp_size, int tp_rank, int ep_size, int ep_rank, MOEExpertScaleNormalizationMode normalization_mode, MixtureOfExpertsPluginProfilerPtr plugin_profiler_ptr) : mNumExperts(number_of_experts) , mK(top_k) @@ -52,9 +52,7 @@ MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(int number_of_experts, int top_k, , mQuantMode(quant_mode) , mUseFinished(use_finished) , mUseBias(use_bias) - , mTPSize(tp_size) - , mTPRank(tp_rank) - , mParallelismMode(parallelism_mode) + , mParallelismConfig(MOEParallelismConfig{tp_size, tp_rank, ep_size, ep_rank}) , mNormalizationMode(normalization_mode) , mPluginProfiler(std::move(plugin_profiler_ptr)) { @@ -74,9 +72,7 @@ tensorrt_llm::plugins::MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(MixtureOfE , mQuantMode(other.mQuantMode) , mUseFinished(other.mUseFinished) , mUseBias(other.mUseBias) - , mTPSize(other.mTPSize) - , mTPRank(other.mTPRank) - , mParallelismMode(other.mParallelismMode) + , mParallelismConfig(other.mParallelismConfig) , mNormalizationMode(other.mNormalizationMode) , mDims(other.mDims) , mGemmId(other.mGemmId) @@ -91,9 +87,8 @@ size_t MixtureOfExpertsPlugin::getSerializationSize() const noexcept { return sizeof(mNumExperts) + sizeof(mK) + sizeof(mExpertHiddenSize) + sizeof(mExpertInterSize) + sizeof(mActivationType) + sizeof(mType) + sizeof(mWeightType) + sizeof(mOutputType) - + sizeof(QuantMode::BaseType) + sizeof(mUseFinished) + sizeof(mUseBias) + sizeof(mTPSize) + sizeof(mTPRank) - + sizeof(mParallelismMode) + sizeof(mNormalizationMode) + sizeof(mDims) - + mPluginProfiler->getSerializationSize(mGemmId); + + sizeof(QuantMode::BaseType) + sizeof(mUseFinished) + sizeof(mUseBias) + sizeof(mParallelismConfig) + + sizeof(mNormalizationMode) + sizeof(mDims) + mPluginProfiler->getSerializationSize(mGemmId); } MixtureOfExpertsPlugin::MixtureOfExpertsPlugin( @@ -115,9 +110,7 @@ MixtureOfExpertsPlugin::MixtureOfExpertsPlugin( mQuantMode = QuantMode{quant_mode}; read(d, mUseFinished); read(d, mUseBias); - read(d, mTPSize); - read(d, mTPRank); - read(d, mParallelismMode); + read(d, mParallelismConfig); read(d, mNormalizationMode); read(d, mDims); @@ -146,9 +139,7 @@ void 
MixtureOfExpertsPlugin::serialize(void* buffer) const noexcept write(d, mQuantMode.value()); write(d, mUseFinished); write(d, mUseBias); - write(d, mTPSize); - write(d, mTPRank); - write(d, mParallelismMode); + write(d, mParallelismConfig); write(d, mNormalizationMode); write(d, mDims); @@ -226,8 +217,8 @@ void MixtureOfExpertsPlugin::init() static_cast(mType), static_cast(mWeightType)); } - mGemmId = GemmIDMoe{mNumExperts, mK, mExpertHiddenSize, mExpertInterSize, mActivationType, mType, mWeightType, - mQuantMode, mParallelismMode}; + mGemmId = GemmIDMoe{mNumExperts, mK, mParallelismConfig, mExpertHiddenSize, mExpertInterSize, mActivationType, + mType, mWeightType, mQuantMode}; } // IPluginV2DynamicExt Methods @@ -316,8 +307,8 @@ void MixtureOfExpertsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc c { mDims = {minM, maxM, maxN, maxK}; } - mGemmId = GemmIDMoe{mNumExperts, mK, mExpertHiddenSize, mExpertInterSize, mActivationType, mType, mWeightType, - mQuantMode, mParallelismMode}; + mGemmId = GemmIDMoe{mNumExperts, mK, mParallelismConfig, mExpertHiddenSize, mExpertInterSize, mActivationType, + mType, mWeightType, mQuantMode}; } auto MixtureOfExpertsPlugin::setupWorkspace(void* base_ptr, int64_t num_tokens) const -> WorkspaceInfo @@ -325,7 +316,7 @@ auto MixtureOfExpertsPlugin::setupWorkspace(void* base_ptr, int64_t num_tokens) size_t dtype_size = tensorrt_llm::common::getDTypeSize(mType); size_t moe_workspace_size = mMOERunner->getWorkspaceSize( - num_tokens, mExpertHiddenSize, mExpertInterSize, mNumExperts, mK, mActivationType, getParallelismConfig()); + num_tokens, mExpertHiddenSize, mExpertInterSize, mNumExperts, mK, mActivationType, mParallelismConfig); // Output of post-softmax routing probabilities size_t scale_probabilities_size = num_tokens * mNumExperts * sizeof(float); @@ -382,16 +373,7 @@ size_t MixtureOfExpertsPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const MOEParallelismConfig MixtureOfExpertsPlugin::getParallelismConfig() const { - switch (mParallelismMode) - { - case kernels::MOEParallelismMode::NONE: return {}; - case kernels::MOEParallelismMode::EXPERT_PARALLELISM: - return MOEParallelismConfig::ExpertParallelism(mTPSize, mTPRank); - case kernels::MOEParallelismMode::TENSOR_PARALLELISM: - return MOEParallelismConfig::TensorParallelism(mTPSize, mTPRank); - } - assert(false); - return {}; + return mParallelismConfig; } QuantParams tensorrt_llm::plugins::MixtureOfExpertsPlugin::getQuantParams( @@ -418,14 +400,13 @@ int MixtureOfExpertsPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, { int64_t const num_tokens = getNumTokens(inputDesc); int64_t const num_not_finished = num_tokens; // TODO Take this as an input - auto parallelism_config = getParallelismConfig(); auto workspace = setupWorkspace(workspace_ptr, num_tokens); auto w1_desc = inputDesc[getExpertWeights1Index()]; auto w2_desc = inputDesc[getExpertWeights2Index()]; TLLM_CHECK(w1_desc.dims.nbDims == 3); - size_t experts_per_node = mNumExperts / parallelism_config.ep_size; + size_t experts_per_node = mNumExperts / mParallelismConfig.ep_size; TLLM_CHECK(w1_desc.dims.d[0] == experts_per_node); TLLM_CHECK(w2_desc.dims.nbDims == 3); TLLM_CHECK(w2_desc.dims.d[0] == experts_per_node); @@ -469,7 +450,7 @@ int MixtureOfExpertsPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, outputs[getOutputTensorIndex()], hasFinishedTensor() ? 
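Collapsing mTPSize/mTPRank/mParallelismMode into a single mParallelismConfig works because the config is written and read as one trivially copyable block. A standalone sketch of that write/read pattern, with hypothetical writePod/readPod helpers standing in for the plugin's write()/read():

#include <cstring>

struct ParallelismConfig       // stand-in for kernels::MOEParallelismConfig
{
    int tp_size, tp_rank, ep_size, ep_rank;
};

template <typename T>
void writePod(char*& dst, T const& value)
{
    std::memcpy(dst, &value, sizeof(T));
    dst += sizeof(T);
}

template <typename T>
void readPod(char const*& src, T& value)
{
    std::memcpy(&value, src, sizeof(T));
    src += sizeof(T);
}

// A single sizeof(ParallelismConfig) entry in getSerializationSize() now covers what used to be
// three separate fields; serialize() and the deserializing constructor round-trip it unchanged.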
static_cast(inputs[getFinishedTensorIndex()]) : nullptr, num_not_finished, workspace.scale_probs, static_cast(workspace.src_to_dest_map), - static_cast(workspace.selected_experts), parallelism_config, mNormalizationMode, stream); + static_cast(workspace.selected_experts), mParallelismConfig, mNormalizationMode, stream); return 0; } @@ -556,8 +537,8 @@ MixtureOfExpertsPluginCreator::MixtureOfExpertsPluginCreator() mPluginAttributes.emplace_back(nvinfer1::PluginField("use_bias", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(nvinfer1::PluginField("tp_size", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(nvinfer1::PluginField("tp_rank", nullptr, PluginFieldType::kINT32, 0)); - mPluginAttributes.emplace_back(nvinfer1::PluginField( - "parallelism_mode", nullptr, PluginFieldType::kINT32, static_cast(MOEParallelismMode::NONE))); + mPluginAttributes.emplace_back(nvinfer1::PluginField("ep_size", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(nvinfer1::PluginField("ep_rank", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(nvinfer1::PluginField("normalization_mode", nullptr, PluginFieldType::kINT32, static_cast(MOEExpertScaleNormalizationMode::NONE))); mFC.nbFields = mPluginAttributes.size(); @@ -581,7 +562,8 @@ IPluginV2* MixtureOfExpertsPluginCreator::createPlugin( int mUseBias{0}; int mTPSize{}; int mTPRank{}; - int mParallelismMode{}; + int mEPSize{}; + int mEPRank{}; int mNormalizationMode{}; // Read configurations from each fields @@ -604,7 +586,8 @@ IPluginV2* MixtureOfExpertsPluginCreator::createPlugin( MapPair{"quant_mode", std::ref(mQuantMode)}, MapPair{"tp_size", std::ref(mTPSize)}, MapPair{"tp_rank", std::ref(mTPRank)}, - MapPair{"parallelism_mode", std::ref(mParallelismMode)}, + MapPair{"ep_size", std::ref(mEPSize)}, + MapPair{"ep_rank", std::ref(mEPRank)}, MapPair{"normalization_mode", std::ref(mNormalizationMode)}, // Optional @@ -646,8 +629,7 @@ IPluginV2* MixtureOfExpertsPluginCreator::createPlugin( mNumExperts, mK, mExpertHiddenSize, mExpertInterSize, static_cast(mActivationType), static_cast(mType), static_cast(mWeightType), static_cast(mOutputType), - QuantMode(mQuantMode), mUseFinished != 0, mUseBias != 0, mTPSize, mTPRank, - static_cast(mParallelismMode), + QuantMode(mQuantMode), mUseFinished != 0, mUseBias != 0, mTPSize, mTPRank, mEPSize, mEPRank, static_cast(mNormalizationMode), pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h index 3f5fc70d6..a0aa6e202 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h @@ -29,33 +29,35 @@ namespace tensorrt_llm::plugins { class MixtureOfExpertsGemmProfiler; +using MOEParallelismConfig = tensorrt_llm::kernels::MOEParallelismConfig; using MixtureOfExpertsPluginProfilerPtr = std::shared_ptr; struct GemmIDMoe { int num_experts{}; int moe_k{}; + MOEParallelismConfig parallelism_config{}; int64_t hidden{}; int64_t inter{}; tensorrt_llm::ActivationType actfn{}; nvinfer1::DataType dtype{}; nvinfer1::DataType wdtype{}; tensorrt_llm::common::QuantMode quant_mode; - tensorrt_llm::kernels::MOEParallelismMode parallelism_mode{}; bool operator==(GemmIDMoe const& id) const { - return id.num_experts == num_experts && id.moe_k == moe_k && id.hidden == hidden && id.inter == inter - && id.actfn == 
actfn && id.dtype == dtype && id.wdtype == wdtype && id.quant_mode == quant_mode - && id.parallelism_mode == parallelism_mode; + return id.num_experts == num_experts && id.moe_k == moe_k && id.parallelism_config == parallelism_config + && id.hidden == hidden && id.inter == inter && id.actfn == actfn && id.dtype == dtype && id.wdtype == wdtype + && id.quant_mode == quant_mode; } friend std::ostream& operator<<(std::ostream& out, GemmIDMoe const& id) { - out << "experts, k, hidden, inter, actfn, dtype, weight type, parallelism mode=" << id.num_experts << "," - << id.moe_k << "," << id.hidden << "," << id.inter << "," << static_cast(id.actfn) << "," - << static_cast(id.dtype) << "," << static_cast(id.wdtype) << "," << id.quant_mode.value() << "," - << static_cast(id.parallelism_mode); + out << "experts, k, parallelism_config, hidden, inter, actfn, dtype, weight " + "type, parallelism mode=" + << id.num_experts << "," << id.moe_k << "," << id.parallelism_config << "," << id.hidden << "," << id.inter + << "," << static_cast(id.actfn) << "," << static_cast(id.dtype) << "," + << static_cast(id.wdtype) << "," << id.quant_mode.value(); return out; } }; @@ -67,13 +69,16 @@ struct GemmIDMoeHash { size_t hash = std::hash{}(id.num_experts); hash ^= std::hash{}(id.moe_k); + hash ^= std::hash{}(id.parallelism_config.tp_size); + hash ^= std::hash{}(id.parallelism_config.ep_size); + hash ^= std::hash{}(id.parallelism_config.tp_rank); + hash ^= std::hash{}(id.parallelism_config.ep_rank); hash ^= std::hash{}(id.hidden); hash ^= std::hash{}(id.inter); hash ^= std::hash{}(static_cast(id.actfn)); hash ^= std::hash{}(static_cast(id.dtype)); hash ^= std::hash{}(static_cast(id.wdtype)); hash ^= std::hash{}(static_cast(id.quant_mode.value())); - hash ^= std::hash{}(static_cast(id.parallelism_mode)); return hash; } }; @@ -81,15 +86,15 @@ struct GemmIDMoeHash class MixtureOfExpertsPlugin : public nvinfer1::IPluginV2DynamicExt { public: - using MOEParallelismMode = tensorrt_llm::kernels::MOEParallelismMode; + using MOEParallelismConfig = tensorrt_llm::kernels::MOEParallelismConfig; using MOEExpertScaleNormalizationMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode; MixtureOfExpertsPlugin() = delete; MixtureOfExpertsPlugin(int number_of_experts, int top_k, int expert_hidden_size, int expert_inter_size, tensorrt_llm::ActivationType activation_type, nvinfer1::DataType type, nvinfer1::DataType weight_type, nvinfer1::DataType output_type, tensorrt_llm::common::QuantMode quant_mode, bool use_finished, bool use_bias, - int tp_size, int tp_rank, MOEParallelismMode parallelism_mode, - MOEExpertScaleNormalizationMode normalization_mode, MixtureOfExpertsPluginProfilerPtr plugin_profiler_ptr); + int tp_size, int tp_rank, int ep_size, int ep_rank, MOEExpertScaleNormalizationMode normalization_mode, + MixtureOfExpertsPluginProfilerPtr plugin_profiler_ptr); MixtureOfExpertsPlugin(void const* data, size_t length, MixtureOfExpertsPluginProfilerPtr plugin_profiler_ptr); MixtureOfExpertsPlugin(MixtureOfExpertsPlugin const&); @@ -145,9 +150,7 @@ class MixtureOfExpertsPlugin : public nvinfer1::IPluginV2DynamicExt tensorrt_llm::common::QuantMode mQuantMode; bool mUseFinished{}; bool mUseBias{}; - int mTPSize{}; - int mTPRank{}; - MOEParallelismMode mParallelismMode{}; + MOEParallelismConfig mParallelismConfig{}; MOEExpertScaleNormalizationMode mNormalizationMode{}; GemmDims mDims{}; diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp index 
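GemmIDMoe now carries the parallelism config, so its companion GemmIDMoeHash has to fold the new fields into the hash; it does so by XOR-ing std::hash of each field, as in this standalone sketch:

#include <cstddef>
#include <functional>

struct ParallelismConfig { int tp_size, tp_rank, ep_size, ep_rank; };

std::size_t hashParallelismConfig(ParallelismConfig const& c)
{
    std::size_t h = std::hash<int>{}(c.tp_size);
    h ^= std::hash<int>{}(c.tp_rank);
    h ^= std::hash<int>{}(c.ep_size);
    h ^= std::hash<int>{}(c.ep_rank);
    return h;
}

Plain XOR is symmetric (swapping tp_rank and ep_rank hashes identically), which is tolerable here because lookups still compare keys field by field through the operator== shown above.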
921dd73f8..2a08a7de4 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp @@ -100,8 +100,8 @@ int AllgatherPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe size *= inputDesc[0].dims.d[i]; } - NCCLCHECK(ncclAllGather( - inputs[0], outputs[0], size, (*getDtypeMap())[inputDesc[0].type], (*getCommMap())[mGroup], stream)); + TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used"); + NCCLCHECK(ncclAllGather(inputs[0], outputs[0], size, (*getDtypeMap())[inputDesc[0].type], *mNcclComm, stream)); return 0; } @@ -133,22 +133,16 @@ int AllgatherPlugin::getNbOutputs() const noexcept int AllgatherPlugin::initialize() noexcept { - initCommMap(mGroup); - return 0; -} - -void AllgatherPlugin::terminate() noexcept -{ - auto* commMap = getCommMap(); - // [] operator inserts T() if it does not exist - if (isBuilding() || (*commMap)[mGroup] == nullptr) + if (isBuilding()) { - return; + return 0; } - NCCLCHECK(ncclCommDestroy((*commMap)[mGroup])); - (*commMap)[mGroup] = nullptr; + mNcclComm = getComm(mGroup); + return 0; } +void AllgatherPlugin::terminate() noexcept {} + size_t AllgatherPlugin::getSerializationSize() const noexcept { return sizeof(int) * mGroup.size() + sizeof(mType); diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h index ac8e723f7..3d7810e6b 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h @@ -65,6 +65,7 @@ class AllgatherPlugin : public BasePlugin const std::string mLayerName; std::set mGroup; nvinfer1::DataType mType; + std::shared_ptr mNcclComm; }; class AllgatherPluginCreator : public BaseCreator diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index 6ff7f6dcb..eef20e8f5 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -249,21 +249,22 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe } // Log runtime strategy + auto const rank = COMM_SESSION.getRank(); switch (runtimeStrategy) { case AllReduceStrategyType::NCCL: { - TLLM_LOG_DEBUG("AllReducePlugin strategy: AllReduceStrategyType::NCCL"); + TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d layer %d: NCCL", rank, mCounter); break; } case AllReduceStrategyType::ONESHOT: { - TLLM_LOG_DEBUG("AllReducePlugin strategy: AllReduceStrategyType::ONESHOT"); + TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d layer %d: ONESHOT", rank, mCounter); break; } case AllReduceStrategyType::TWOSHOT: { - TLLM_LOG_DEBUG("AllReducePlugin strategy: AllReduceStrategyType::TWOSHOT"); + TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d layer %d: TWOSHOT", rank, mCounter); break; } default: break; @@ -273,8 +274,7 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe { if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM) { - NCCLCHECK(ncclAllReduce( - inputs[0], outputs[1], size, (*getDtypeMap())[mType], ncclSum, (*getCommMap())[mGroup], stream)); + NCCLCHECK(ncclAllReduce(inputs[0], outputs[1], size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream)); tensorrt_llm::kernels::AllReduceParams params; int fusion_ptr_idx = 0; if (mStrategy == AllReduceStrategyType::NCCL) @@ -297,19 +297,16 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* 
inputDesc, nvinfe } else { - NCCLCHECK(ncclAllReduce( - inputs[0], outputs[0], size, (*getDtypeMap())[mType], ncclSum, (*getCommMap())[mGroup], stream)); + NCCLCHECK(ncclAllReduce(inputs[0], outputs[0], size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream)); } } else { - auto myRank = COMM_SESSION.getRank(); - int nRanks = inputDesc[1].dims.d[0] / utils::customAllReduceUtils::NUM_POINTERS_PER_RANK; - // FIXME: pass world config here - myRank = myRank % nRanks; + auto const tpSize = mGroup.size(); + auto const tpRank = rank % tpSize; auto params = tensorrt_llm::kernels::AllReduceParams::deserialize( - reinterpret_cast(inputs[1]), nRanks, myRank, mCounter); + reinterpret_cast(inputs[1]), tpSize, tpRank, mCounter); params.local_output_buffer_ptr = outputs[0]; params.local_input_buffer_ptr = inputs[0]; @@ -578,7 +575,7 @@ int AllreducePlugin::initialize() noexcept return 0; } - initCommMap(mGroup); + mNcclComm = getComm(mGroup); if (mStrategy != AllReduceStrategyType::NCCL) { initGroupTopology(); @@ -587,20 +584,7 @@ int AllreducePlugin::initialize() noexcept return 0; } -void AllreducePlugin::terminate() noexcept -{ - if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::AUTO) - { - auto* commMap = getCommMap(); - // [] operator inserts T() if it does not exist - if (isBuilding() || (*commMap)[mGroup] == nullptr) - { - return; - } - NCCLCHECK(ncclCommDestroy((*commMap)[mGroup])); - (*commMap)[mGroup] = nullptr; - } -} +void AllreducePlugin::terminate() noexcept {} size_t AllreducePlugin::getSerializationSize() const noexcept { diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h index 593261439..93a9b0651 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h @@ -85,6 +85,7 @@ class AllreducePlugin : public BasePlugin kernels::AllReduceFusionOp mOp; float mEps; int32_t mCounter; + std::shared_ptr mNcclComm; int8_t mAffine; int8_t mBias; }; diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp index 9a8fe3852..b87ccdf2a 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp @@ -101,8 +101,9 @@ int ReduceScatterPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, size *= outputDesc[0].dims.d[i]; } + TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used"); NCCLCHECK(ncclReduceScatter( - inputs[0], outputs[0], size, (*getDtypeMap())[inputDesc[0].type], ncclSum, (*getCommMap())[mGroup], stream)); + inputs[0], outputs[0], size, (*getDtypeMap())[inputDesc[0].type], ncclSum, *mNcclComm, stream)); return 0; } @@ -134,22 +135,16 @@ int ReduceScatterPlugin::getNbOutputs() const noexcept int ReduceScatterPlugin::initialize() noexcept { - initCommMap(mGroup); - return 0; -} - -void ReduceScatterPlugin::terminate() noexcept -{ - auto* commMap = getCommMap(); - // [] operator inserts T() if it does not exist - if (isBuilding() || (*commMap)[mGroup] == nullptr) + if (isBuilding()) { - return; + return 0; } - NCCLCHECK(ncclCommDestroy((*commMap)[mGroup])); - (*commMap)[mGroup] = nullptr; + mNcclComm = getComm(mGroup); + return 0; } +void ReduceScatterPlugin::terminate() noexcept {} + size_t ReduceScatterPlugin::getSerializationSize() const noexcept { return sizeof(int) * mGroup.size() + sizeof(mType); diff --git 
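The custom all-reduce path above stops inferring the rank count from the workspace tensor shape and instead derives it from the plugin's own group: tpSize is the group size and tpRank is the session rank modulo tpSize. A tiny illustrative sketch of that mapping (it assumes, as the change does, that the group position is recovered by taking the global rank modulo the group size):

#include <set>

int tpRankInGroup(int globalRank, std::set<int> const& group)
{
    auto const tpSize = static_cast<int>(group.size());
    return globalRank % tpSize;    // e.g. global rank 5 in a group of size 4 -> TP rank 1
}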
a/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.h index 10f28d2e9..c630b57a2 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.h @@ -64,6 +64,7 @@ class ReduceScatterPlugin : public BasePlugin const std::string mLayerName; std::set mGroup; nvinfer1::DataType mType; + std::shared_ptr mNcclComm; }; class ReduceScatterPluginCreator : public BaseCreator diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index 0f32653b7..65f54c0c3 100644 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -31,9 +31,7 @@ set(SRCS batch_manager/inferenceRequest.cpp batch_manager/namedTensor.cpp executor/bindings.cpp - executor/executor.cpp - runtime/generationInput.cpp - runtime/generationOutput.cpp) + executor/executor.cpp) pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS}) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp index 7441dc807..4ef0e179a 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp @@ -17,18 +17,17 @@ #include "llmRequest.h" #include "tensorrt_llm/batch_manager/llmRequest.h" -#include "tensorrt_llm/runtime/cudaStream.h" -#include "tensorrt_llm/runtime/generationInput.h" #include "tensorrt_llm/runtime/torch.h" #include "tensorrt_llm/runtime/torchUtils.h" #include "tensorrt_llm/runtime/torchView.h" -#include #include #include #include #include +#include + namespace tb = tensorrt_llm::batch_manager; namespace tr = tensorrt_llm::runtime; diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp index ad87393ac..ebaa53b24 100644 --- a/cpp/tensorrt_llm/pybind/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -28,8 +28,6 @@ #include "tensorrt_llm/pybind/batch_manager/llmRequest.h" #include "tensorrt_llm/pybind/batch_manager/namedTensor.h" #include "tensorrt_llm/pybind/executor/bindings.h" -#include "tensorrt_llm/pybind/runtime/generationInput.h" -#include "tensorrt_llm/pybind/runtime/generationOutput.h" #include "tensorrt_llm/pybind/utils/pathCaster.h" #include "tensorrt_llm/batch_manager/BatchManager.h" @@ -39,7 +37,6 @@ #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/gptSession.h" #include "tensorrt_llm/runtime/memoryCounters.h" #include "tensorrt_llm/runtime/samplingConfig.h" @@ -49,8 +46,6 @@ namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager; namespace tpb = tensorrt_llm::pybind::batch_manager; namespace tc = tensorrt_llm::common; namespace tr = tensorrt_llm::runtime; -namespace texec = tensorrt_llm::executor; -namespace tpr = tensorrt_llm::pybind::runtime; using SizeType32 = tr::SizeType32; using TokenIdType = tr::TokenIdType; template @@ -68,10 +63,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) py::module_ executor_submodule = m.def_submodule("executor", "Executor bindings"); tensorrt_llm::pybind::executor::InitBindings(executor_submodule); - tpr::PromptTuningParams::initBindings(m); - tpr::GenerationInput::initBindings(m); - tpr::GenerationOutput::initBindings(m); - auto buildInfo = m.def_submodule("BuildInfo"); buildInfo.attr("ENABLE_MULTI_DEVICE") = py::int_(ENABLE_MULTI_DEVICE); @@ -120,19 +111,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) 
.def_readwrite("device_cache_percent", &tb::PeftCacheManagerConfig::deviceCachePercent) .def_readwrite("host_cache_size", &tb::PeftCacheManagerConfig::hostCacheSize); - py::class_(m, "GptSessionConfig") - .def(py::init(), py::arg("max_batch_size"), - py::arg("max_beam_width"), py::arg("max_sequence_length"), py::arg("gpu_weights_percent") = 1.0) - .def_readwrite("max_batch_size", &tr::GptSession::Config::maxBatchSize) - .def_readwrite("max_beam_width", &tr::GptSession::Config::maxBeamWidth) - .def_readwrite("max_sequence_length", &tr::GptSession::Config::maxSequenceLength) - .def_readwrite("gpu_weights_percent", &tr::GptSession::Config::gpuWeightsPercent) - .def_readwrite("decoder_per_request", &tr::GptSession::Config::decoderPerRequest) - .def_readwrite("cuda_graph_mode", &tr::GptSession::Config::cudaGraphMode) - .def_readwrite("ctx_micro_batch_size", &tr::GptSession::Config::ctxMicroBatchSize) - .def_readwrite("gen_micro_batch_size", &tr::GptSession::Config::genMicroBatchSize) - .def_readwrite("kv_cache_config", &tr::GptSession::Config::kvCacheConfig); - py::enum_(m, "DataType") .value("FLOAT", nvinfer1::DataType::kFLOAT) .value("HALF", nvinfer1::DataType::kHALF) @@ -332,38 +310,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) py::overload_cast(&tr::GptJsonConfig::engineFilename, py::const_), py::arg("world_config")); - py::class_(m, "GptSession") - .def(py::init( - [](tr::GptSession::Config const& config, tr::ModelConfig const& modelConfig, - tr::WorldConfig const& worldConfig, py::bytearray const& bytes) - { - PyErr_WarnEx( - PyExc_DeprecationWarning, "GptSession is deprecated use the executor API instead.", 1); - - auto buf = static_cast(bytes); - return tr::GptSession{config, modelConfig, worldConfig, buf.data(), buf.size()}; - }), - py::arg("config"), py::arg("model_config"), py::arg("world_config"), py::arg("engine_buffer")) - .def(py::init( - [](tr::GptSession::Config const& config, tr::ModelConfig const& modelConfig, - tr::WorldConfig const& worldConfig, std::string const& engineFile) - { - PyErr_WarnEx( - PyExc_DeprecationWarning, "GptSession is deprecated use the executor API instead.", 1); - - return tr::GptSession{config, modelConfig, worldConfig, engineFile}; - }), - py::arg("config"), py::arg("model_config"), py::arg("world_config"), py::arg("engine_file")) - .def_property_readonly("model_config", &tr::GptSession::getModelConfig) - .def_property_readonly("world_config", &tr::GptSession::getWorldConfig) - .def_property_readonly("device", &tr::GptSession::getDevice) - .def( - "generate", - [](tr::GptSession& self, tpr::GenerationOutput& outputs, tpr::GenerationInput const& inputs, - tr::SamplingConfig const& samplingConfig) - { self.generate(*outputs.toTrtLlm(), *inputs.toTrtLlm(), samplingConfig); }, - py::arg("outputs"), py::arg("inputs"), py::arg("sampling_config")); - py::enum_(m, "LlmRequestState") .value("REQUEST_STATE_UNKNOWN", tb::LlmRequestState_t::REQUEST_STATE_UNKNOWN) .value("REQUEST_STATE_ENCODER_INIT", tb::LlmRequestState_t::REQUEST_STATE_ENCODER_INIT) diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index fa6defc8c..9f7799dee 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -89,7 +89,10 @@ void InitBindings(pybind11::module_& m) .def_readwrite("max_num_blocks", &tle::KvCacheStats::maxNumBlocks) .def_readwrite("free_num_blocks", &tle::KvCacheStats::freeNumBlocks) .def_readwrite("used_num_blocks", &tle::KvCacheStats::usedNumBlocks) - 
.def_readwrite("tokens_per_block", &tle::KvCacheStats::tokensPerBlock); + .def_readwrite("tokens_per_block", &tle::KvCacheStats::tokensPerBlock) + .def_readwrite("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks) + .def_readwrite("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks) + .def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks); py::class_(m, "StaticBatchingStats") .def(py::init<>()) @@ -113,6 +116,7 @@ void InitBindings(pybind11::module_& m) .def(py::init<>()) .def_readwrite("timestamp", &tle::IterationStats::timestamp) .def_readwrite("iter", &tle::IterationStats::iter) + .def_readwrite("iter_latency_ms", &tle::IterationStats::iterLatencyMS) .def_readwrite("num_active_requests", &tle::IterationStats::numActiveRequests) .def_readwrite("max_num_active_requests", &tle::IterationStats::maxNumActiveRequests) .def_readwrite("gpu_mem_usage", &tle::IterationStats::gpuMemUsage) @@ -217,7 +221,8 @@ void InitBindings(pybind11::module_& m) .def_property_readonly("weights", &tle::LoraConfig::getWeights) .def_property_readonly("config", &tle::LoraConfig::getConfig); - py::class_(m, "Request") + py::class_ request(m, "Request"); + request .def(py::init const&, std::optional const&, std::optional>, std::optional>, std::optional, @@ -249,6 +254,7 @@ void InitBindings(pybind11::module_& m) &tle::Request::setLogitsPostProcessorName) .def_property( "encoder_input_token_ids", &tle::Request::getEncoderInputTokenIds, &tle::Request::setEncoderInputTokenIds); + request.attr("BATCHED_POST_PROCESSOR_NAME") = tle::Request::kBatchedPostProcessorName; py::class_(m, "Result") .def(py::init<>()) @@ -291,6 +297,22 @@ void InitBindings(pybind11::module_& m) .def_property_readonly("context_chunking_policy", &tle::SchedulerConfig::getContextChunkingPolicy) .def(py::pickle(schedulerConfigGetstate, schedulerConfigSetstate)); + auto kvCacheConfigGetstate = [](tle::KvCacheConfig const& self) + { + return py::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindow(), + self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(), + self.getOnboardBlocks()); + }; + auto kvCacheConfigSetstate = [](py::tuple state) + { + if (state.size() != 7) + { + throw std::runtime_error("Invalid state!"); + } + return tle::KvCacheConfig(state[0].cast(), state[1].cast>(), + state[2].cast>(), state[3].cast>(), + state[4].cast>(), state[5].cast>(), state[6].cast()); + }; py::class_(m, "KvCacheConfig") .def(py::init const&, std::optional const&, std::optional const&, std::optional const&, std::optional const&, bool>(), @@ -298,13 +320,18 @@ void InitBindings(pybind11::module_& m) py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(), py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(), py::arg("onboard_blocks") = true) - .def_property_readonly("enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse) - .def_property_readonly("max_tokens", &tle::KvCacheConfig::getMaxTokens) - .def_property_readonly("max_attention_window", &tle::KvCacheConfig::getMaxAttentionWindow) - .def_property_readonly("sink_token_length", &tle::KvCacheConfig::getSinkTokenLength) - .def_property_readonly("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction) - .def_property_readonly("host_cache_size", &tle::KvCacheConfig::getHostCacheSize) - .def_property_readonly("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks); + .def_property( + "enable_block_reuse", 
&tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse) + .def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens) + .def_property("max_attention_window", &tle::KvCacheConfig::getMaxAttentionWindow, + &tle::KvCacheConfig::setMaxAttentionWindow) + .def_property( + "sink_token_length", &tle::KvCacheConfig::getSinkTokenLength, &tle::KvCacheConfig::setSinkTokenLength) + .def_property("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction, + &tle::KvCacheConfig::setFreeGpuMemoryFraction) + .def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize) + .def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks) + .def(py::pickle(kvCacheConfigGetstate, kvCacheConfigSetstate)); py::class_(m, "OrchestratorConfig") .def(py::init(), py::arg("is_orchestrator") = true, py::arg("worker_executable_path") = "") @@ -313,6 +340,22 @@ void InitBindings(pybind11::module_& m) .def_property("worker_executable_path", &tle::OrchestratorConfig::getWorkerExecutablePath, &tle::OrchestratorConfig::setWorkerExecutablePath); + auto parallelConfigGetstate = [](tle::ParallelConfig const& self) + { + return py::make_tuple(self.getCommunicationType(), self.getCommunicationMode(), self.getDeviceIds(), + self.getParticipantIds(), self.getOrchestratorConfig()); + }; + auto parallelConfigSetstate = [](py::tuple state) + { + if (state.size() != 5) + { + throw std::runtime_error("Invalid state!"); + } + return tle::ParallelConfig(state[0].cast(), state[1].cast(), + state[2].cast>>(), + state[3].cast>>(), + state[4].cast>()); + }; py::class_(m, "ParallelConfig") .def(py::init> const&, std::optional> const&, std::optional const&>(), @@ -328,8 +371,28 @@ void InitBindings(pybind11::module_& m) .def_property( "participant_ids", &tle::ParallelConfig::getParticipantIds, &tle::ParallelConfig::setParticipantIds) .def_property("orchestrator_config", &tle::ParallelConfig::getOrchestratorConfig, - &tle::ParallelConfig::setOrchestratorConfig); + &tle::ParallelConfig::setOrchestratorConfig) + .def(py::pickle(parallelConfigGetstate, parallelConfigSetstate)); + auto peftCacheConfigSetstate = [](py::tuple state) + { + if (state.size() != 11) + { + throw std::runtime_error("Invalid state!"); + } + return tle::PeftCacheConfig(state[0].cast(), state[1].cast(), + state[2].cast(), state[3].cast(), state[4].cast(), + state[5].cast(), state[6].cast(), state[7].cast(), + state[8].cast(), state[9].cast>(), + state[10].cast>()); + }; + auto peftCacheConfigGetstate = [](tle::PeftCacheConfig const& self) + { + return py::make_tuple(self.getNumHostModuleLayer(), self.getNumDeviceModuleLayer(), + self.getOptimalAdapterSize(), self.getMaxAdapterSize(), self.getNumPutWorkers(), self.getNumEnsureWorkers(), + self.getNumCopyStreams(), self.getMaxPagesPerBlockHost(), self.getMaxPagesPerBlockDevice(), + self.getDeviceCachePercent(), self.getHostCacheSize()); + }; py::class_(m, "PeftCacheConfig") .def(py::init const&, std::optional const&>(), @@ -348,17 +411,15 @@ void InitBindings(pybind11::module_& m) .def_property_readonly("max_pages_per_block_host", &tle::PeftCacheConfig::getMaxPagesPerBlockHost) .def_property_readonly("max_pages_per_block_device", &tle::PeftCacheConfig::getMaxPagesPerBlockDevice) .def_property_readonly("device_cache_percent", &tle::PeftCacheConfig::getDeviceCachePercent) - .def_property_readonly("host_cache_size", &tle::PeftCacheConfig::getHostCacheSize); + 
.def_property_readonly("host_cache_size", &tle::PeftCacheConfig::getHostCacheSize) + .def(py::pickle(peftCacheConfigGetstate, peftCacheConfigSetstate)); py::class_(m, "LookaheadDecodingConfig") - .def(py::init(), py::arg("max_ngram_size"), py::arg("max_window_size"), + .def(py::init(), py::arg("max_window_size"), py::arg("max_ngram_size"), py::arg("max_verification_set_size")) - .def_property("max_ngram_size", &tle::LookaheadDecodingConfig::getMaxNgramSize, - &tle::LookaheadDecodingConfig::setMaxNgramSize) - .def_property("max_window_size", &tle::LookaheadDecodingConfig::getMaxWindowSize, - &tle::LookaheadDecodingConfig::setMaxWindowSize) - .def_property("max_verification_set_size", &tle::LookaheadDecodingConfig::getMaxVerificationSetSize, - &tle::LookaheadDecodingConfig::setMaxVerificationSetSize); + .def_property_readonly("max_window_size", &tle::LookaheadDecodingConfig::getWindowSize) + .def_property_readonly("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize) + .def_property_readonly("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize); py::class_(m, "DecodingConfig") .def(py::init, std::optional, @@ -370,19 +431,70 @@ void InitBindings(pybind11::module_& m) &tle::DecodingConfig::setLookaheadDecoding) .def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices); + auto executorConfigGetState = [&](tle::ExecutorConfig const& self) + { + py::object peftCacheConfigState = py::none(); + + if (self.getPeftCacheConfig().has_value()) + { + peftCacheConfigState = peftCacheConfigGetstate(self.getPeftCacheConfig().value()); + } + auto kvCacheConfigState = kvCacheConfigGetstate(self.getKvCacheConfig()); + auto schedulerConfigState = schedulerConfigGetstate(self.getSchedulerConfig()); + py::object parallelConfigState = py::none(); + if (self.getParallelConfig().has_value()) + { + parallelConfigState = parallelConfigGetstate(self.getParallelConfig().value()); + } + + return py::make_tuple(self.getMaxBeamWidth(), schedulerConfigState, kvCacheConfigState, + self.getEnableChunkedContext(), self.getNormalizeLogProbs(), self.getIterStatsMaxIterations(), + self.getRequestStatsMaxIterations(), self.getBatchingType(), self.getMaxBatchSize(), parallelConfigState, + peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(), + self.getDecodingConfig(), self.getGpuWeightsPercent()); + }; + auto executorConfigSetState = [&](py::tuple state) + { + if (state.size() != 15) + { + throw std::runtime_error("Invalid state!"); + } + auto kvCacheConfig = kvCacheConfigSetstate(state[2].cast()); + auto schedulerConfig = schedulerConfigSetstate(state[1].cast()); + + std::optional peftCacheConfig; + if (state[10].cast() != py::none()) + { + peftCacheConfig = peftCacheConfigSetstate(state[10].cast()); + } + std::optional parallelConfig; + if (state[9].cast() != py::none()) + { + parallelConfig = parallelConfigSetstate(state[9].cast()); + } + + return tle::ExecutorConfig(state[0].cast(), schedulerConfig, kvCacheConfig, state[3].cast(), + state[4].cast(), state[5].cast(), state[6].cast(), + state[7].cast(), state[8].cast>(), parallelConfig, + peftCacheConfig, state[11].cast>(), + state[12].cast>(), + state[13].cast>(), state[14].cast()); + }; py::class_(m, "ExecutorConfig") .def(py::init, tle::PeftCacheConfig const&, - std::optional, std::optional>(), + SizeType32, tle::BatchingType, std::optional, std::optional, + tle::PeftCacheConfig const&, std::optional, + std::optional, std::optional, float>(), 
py::arg("max_beam_width") = 1, py::arg_v("scheduler_config", tle::SchedulerConfig(), "SchedulerConfig()"), py::arg_v("kv_cache_config", tle::KvCacheConfig(), "KvCacheConfig()"), py::arg("enable_chunked_context") = false, py::arg("normalize_log_probs") = true, py::arg("iter_stats_max_iterations") = tle::kDefaultIterStatsMaxIterations, py::arg("request_stats_max_iterations") = tle::kDefaultRequestStatsMaxIterations, py::arg_v("batching_type", tle::BatchingType::kINFLIGHT, "BatchingType.INFLIGHT"), - py::arg("parallel_config") = py::none(), + py::arg("max_batch_size") = py::none(), py::arg("parallel_config") = py::none(), py::arg_v("peft_cache_config", tle::PeftCacheConfig(), "PeftCacheConfig()"), - py::arg("logits_post_processor_map") = py::none(), py::arg("decoding_config") = py::none()) + py::arg("logits_post_processor_map") = py::none(), py::arg("logits_post_processor_batched") = py::none(), + py::arg("decoding_config") = py::none(), py::arg("gpu_weights_percent") = 1.0) .def_property("max_beam_width", &tle::ExecutorConfig::getMaxBeamWidth, &tle::ExecutorConfig::setMaxBeamWidth) .def_property( "scheduler_config", &tle::ExecutorConfig::getSchedulerConfig, &tle::ExecutorConfig::setSchedulerConfig) @@ -402,8 +514,13 @@ void InitBindings(pybind11::module_& m) "peft_cache_config", &tle::ExecutorConfig::getPeftCacheConfig, &tle::ExecutorConfig::setPeftCacheConfig) .def_property("logits_post_processor_map", &tle::ExecutorConfig::getLogitsPostProcessorMap, &tle::ExecutorConfig::setLogitsPostProcessorMap) + .def_property("logits_post_processor_batched", &tle::ExecutorConfig::getLogitsPostProcessorBatched, + &tle::ExecutorConfig::setLogitsPostProcessorBatched) .def_property( - "decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig); + "decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig) + .def_property("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent, + &tle::ExecutorConfig::setGpuWeightsPercent) + .def(py::pickle(executorConfigGetState, executorConfigSetState)); tensorrt_llm::pybind::executor::Executor::initBindings(m); } diff --git a/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp b/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp deleted file mode 100644 index 2b96eb1e2..000000000 --- a/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "generationInput.h" - -#include "tensorrt_llm/runtime/generationInput.h" -#include "tensorrt_llm/runtime/torchView.h" - -#include -#include -#include -#include - -namespace tr = tensorrt_llm::runtime; - -using namespace tensorrt_llm::pybind::runtime; - -std::shared_ptr PromptTuningParams::toTrtLlm() const -{ - auto ptt = std::make_shared(); - if (embeddingTable) - ptt->embeddingTable = tr::TorchView::of(embeddingTable.value()); - if (tasks) - ptt->tasks = tr::TorchView::of(tasks.value()); - if (vocabSize) - ptt->vocabSize = tr::TorchView::of(vocabSize.value()); - ptt->promptTuningEnabled = promptTuningEnabled; - return ptt; -} - -void PromptTuningParams::initBindings(pybind11::module_& m) -{ - py::class_(m, "PromptTuningParams") - .def(py::init(), - py::arg("embedding_table") = py::none(), py::arg("tasks") = py::none(), py::arg("vocab_size") = py::none()) - .def_readwrite("embedding_table", &PromptTuningParams::embeddingTable) - .def_readwrite("tasks", &PromptTuningParams::tasks) - .def_readwrite("vocab_size", &PromptTuningParams::vocabSize) - .def_readwrite("prompt_tuning_enabled", &PromptTuningParams::promptTuningEnabled); -} - -std::shared_ptr GenerationInput::toTrtLlm() const -{ - auto input = std::make_shared( - endId, padId, tr::TorchView::of(ids.value()), tr::TorchView::of(lengths.value()), packed); - if (embeddingBias) - input->embeddingBias = tr::TorchView::of(embeddingBias.value()); - if (badWordsList) - input->badWordsList = tr::TorchView::of(badWordsList.value()); - if (stopWordsList) - input->stopWordsList = tr::TorchView::of(stopWordsList.value()); - input->maxNewTokens = maxNewTokens; - input->promptTuningParams = *promptTuningParams.toTrtLlm(); - return input; - - return input; -} - -void GenerationInput::initBindings(pybind11::module_& m) -{ - py::class_(m, "GenerationInput") - .def(py::init(), - py::arg("end_id"), py::arg("pad_id"), py::arg("ids"), py::arg("lengths"), py::arg("packed") = false) - .def_readwrite("end_id", &GenerationInput::endId) - .def_readwrite("pad_id", &GenerationInput::padId) - .def_readwrite("ids", &GenerationInput::ids) - .def_readwrite("lengths", &GenerationInput::lengths) - .def_readwrite("packed", &GenerationInput::packed) - .def_readwrite("embedding_bias", &GenerationInput::embeddingBias) - .def_readwrite("bad_words_list", &GenerationInput::badWordsList) - .def_readwrite("stop_words_list", &GenerationInput::stopWordsList) - .def_readwrite("max_new_tokens", &GenerationInput::maxNewTokens) - .def_readwrite("prompt_tuning_params", &GenerationInput::promptTuningParams); -} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationInput.h b/cpp/tensorrt_llm/pybind/runtime/generationInput.h deleted file mode 100644 index a99fd0227..000000000 --- a/cpp/tensorrt_llm/pybind/runtime/generationInput.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "tensorrt_llm/runtime/common.h" -#include "tensorrt_llm/runtime/generationInput.h" - -#include -#include -#include -#include -#include - -namespace tensorrt_llm::pybind::runtime -{ - -using SizeType32 = tensorrt_llm::runtime::SizeType32; - -class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams> -{ -public: - using Base = tensorrt_llm::runtime::GenericPromptTuningParams>; - using TensorPtr = Base::TensorPtr; - using SizeType32 = Base::SizeType32; - - explicit PromptTuningParams( - TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr()) - : GenericPromptTuningParams(std::move(embeddingTable), std::move(tasks), std::move(vocabSize)) - { - } - - [[nodiscard]] std::shared_ptr toTrtLlm() const; - static void initBindings(pybind11::module_& m); -}; - -class GenerationInput - : public tensorrt_llm::runtime::GenericGenerationInput, PromptTuningParams> -{ -public: - using Base = tensorrt_llm::runtime::GenericGenerationInput, PromptTuningParams>; - using TensorPtr = Base::TensorPtr; - - explicit GenerationInput( - SizeType32 const endId, SizeType32 const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) - : GenericGenerationInput(endId, padId, std::move(ids), std::move(lengths), packed) - { - } - - [[nodiscard]] std::shared_ptr toTrtLlm() const; - static void initBindings(pybind11::module_& m); -}; -} // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp b/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp deleted file mode 100644 index 4f955ae0f..000000000 --- a/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "generationOutput.h" - -#include "tensorrt_llm/runtime/torch.h" -#include "tensorrt_llm/runtime/torchView.h" - -#include -#include -#include -#include - -namespace tr = tensorrt_llm::runtime; - -using namespace tensorrt_llm::pybind::runtime; - -std::shared_ptr GenerationOutput::toTrtLlm() const -{ - auto output - = std::make_shared(tr::TorchView::of(ids.value()), tr::TorchView::of(lengths.value())); - if (cumLogProbs) - { - output->cumLogProbs = tr::TorchView::of(cumLogProbs.value()); - } - if (logProbs) - { - output->logProbs = tr::TorchView::of(logProbs.value()); - } - if (contextLogits) - { - output->contextLogits = tr::TorchView::of(contextLogits.value()); - } - if (generationLogits) - { - output->generationLogits = tr::TorchView::of(generationLogits.value()); - } - - if (onTokenGenerated) - { - output->onTokenGenerated = [delegate = onTokenGenerated]( - tr::GenerationOutput::TensorPtr const& ids, tr::SizeType32 step, bool finished) - { delegate(tr::Torch::tensor(ids), step, finished); }; - } - return output; -} - -void GenerationOutput::initBindings(py::module_& m) -{ - py::class_(m, "GenerationOutput") - .def(py::init(), py::arg("ids"), py::arg("lengths")) - .def_readwrite("ids", &GenerationOutput::ids) - .def_readwrite("lengths", &GenerationOutput::lengths) - .def_readwrite("cum_log_probs", &GenerationOutput::cumLogProbs) - .def_readwrite("log_probs", &GenerationOutput::logProbs) - .def_readwrite("context_logits", &GenerationOutput::contextLogits) - .def_readwrite("generation_logits", &GenerationOutput::generationLogits) - .def_readwrite("on_token_generated", &GenerationOutput::onTokenGenerated); -} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationOutput.h b/cpp/tensorrt_llm/pybind/runtime/generationOutput.h deleted file mode 100644 index ef6943516..000000000 --- a/cpp/tensorrt_llm/pybind/runtime/generationOutput.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "tensorrt_llm/runtime/generationOutput.h" - -#include -#include -#include - -namespace tensorrt_llm::pybind::runtime -{ - -class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput> -{ -public: - using Base = tensorrt_llm::runtime::GenericGenerationOutput>; - using TensorPtr = Base::TensorPtr; - - explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) - : GenericGenerationOutput(std::move(ids), std::move(lengths)) - { - } - - [[nodiscard]] std::shared_ptr toTrtLlm() const; - static void initBindings(pybind11::module_& m); -}; - -} // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index b4fd7333b..d92bd588a 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -19,6 +19,7 @@ set(SRCS utils/sessionUtils.cpp utils/debugUtils.cu bufferManager.cpp + explicitDraftTokensBuffers.cpp layerProfiler.cpp loraManager.cpp loraUtils.cpp diff --git a/cpp/tensorrt_llm/runtime/explicitDraftTokensBuffers.cpp b/cpp/tensorrt_llm/runtime/explicitDraftTokensBuffers.cpp new file mode 100644 index 000000000..52da8b854 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/explicitDraftTokensBuffers.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h" + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iBuffer.h" + +namespace tksd = tensorrt_llm::kernels::speculative_decoding; + +namespace tensorrt_llm::runtime +{ + +void ExplicitDraftTokensBuffers::Inputs::create(SizeType32 maxNumSequences, TllmRuntime const& runtime, + ModelConfig const& modelConfig, WorldConfig const& worldConfig) +{ + auto const& manager = runtime.getBufferManager(); + + auto const& speculativeDecodingModule = modelConfig.getSpeculativeDecodingModule(); + auto const maxNumPaths = speculativeDecodingModule.getMaxNumPaths(); + auto const maxDraftPathLen = speculativeDecodingModule.getMaxDraftPathLen(); + auto const maxPathLen = speculativeDecodingModule.getMaxPathLen(); + auto const maxDecodingTokens = speculativeDecodingModule.getMaxDecodingTokens(); + auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize()); + + auto constexpr TRTTokenIdType = runtime::TRTDataType::value; + auto const dtype = modelConfig.getDataType(); + + maxGenLengthHost = manager.pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + temperatures = manager.gpu(ITensor::makeShape({maxNumSequences}), dtype); + positionIdsBase = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32); + generationLengths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32); + randomDataSample = manager.gpu(ITensor::makeShape({maxNumSequences}), dtype); + randomDataValidation = manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxDraftPathLen}), dtype); + draftTokens = manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxPathLen}), TRTTokenIdType); + draftIndices + = manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxPathLen}), nvinfer1::DataType::kINT32); + draftProbs + = manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxDraftPathLen, vocabSizePadded}), dtype); + packedMasks + = manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}), + nvinfer1::DataType::kINT32); + positionIds = manager.gpu(ITensor::makeShape({maxNumSequences * maxDecodingTokens}), nvinfer1::DataType::kINT32); +} + +ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, + runtime::BufferManager const& manager, runtime::ModelConfig const& modelConfig, + runtime::WorldConfig const& worldConfig, executor::DecodingConfig const& decodingConfig, + runtime::TllmRuntime const& runtime) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + TLLM_CHECK_WITH_INFO(maxBeamWidth == 1, "Explicit draft tokens does not support beam search"); + + auto const maxNumSequences = maxBatchSize; + auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize()); + + auto const explicitDraftTokensModule + = std::dynamic_pointer_cast( + modelConfig.getSpeculativeDecodingModulePtr()); + + auto const numBeams = explicitDraftTokensModule->getMaxNumPaths(); + auto const beamDraftLength = explicitDraftTokensModule->getMaxDraftPathLen(); + auto const beamLength = explicitDraftTokensModule->getMaxPathLen(); // beamDraftLength + 1 + + auto constexpr TRTTokenIdType = runtime::TRTDataType::value; + auto const dtype = modelConfig.getDataType(); + + // input tensors + 
engineInputs.requestTypesDevice = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineInputs.temperatures = manager.emptyTensor(runtime::MemoryType::kGPU, dtype); + + engineInputs.draftTokens = manager.gpu(ITensor::makeShape({maxNumSequences, numBeams, beamLength}), TRTTokenIdType); + engineInputs.draftIndices + = manager.gpu(ITensor::makeShape({maxNumSequences, numBeams, beamLength}), nvinfer1::DataType::kINT32); + engineInputs.draftProbs + = manager.gpu(ITensor::makeShape({maxNumSequences, numBeams, beamDraftLength, vocabSizePadded}), dtype); + + engineInputs.generationLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineInputs.positionIds = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineInputs.positionOffsets = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineInputs.packedMasks = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + + engineInputs.randomDataSample = manager.emptyTensor(runtime::MemoryType::kGPU, dtype); + engineInputs.randomDataValidation = manager.emptyTensor(runtime::MemoryType::kGPU, dtype); + engineInputs.positionIdsBase = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + + // output tensors + engineOutputs.nextDraftTokens + = manager.gpu(ITensor::makeShape({maxNumSequences, numBeams, beamLength}), TRTTokenIdType); + engineOutputs.nextDraftIndices + = manager.gpu(ITensor::makeShape({maxNumSequences, numBeams, beamLength}), nvinfer1::DataType::kINT32); + engineOutputs.nextDraftProbs + = manager.gpu(ITensor::makeShape({maxNumSequences, numBeams, beamDraftLength, vocabSizePadded}), dtype); + + engineOutputs.maxGenToken = manager.gpu(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + engineOutputs.totalGenToken = manager.gpu(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + + engineOutputs.nextGenerationLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineOutputs.nextPositionOffsets = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineOutputs.masks = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kBOOL); + + engineOutputs.nextFlatTokens = manager.emptyTensor(runtime::MemoryType::kGPU, TRTTokenIdType); + engineOutputs.bestPathLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineOutputs.bestPathIndices = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + engineOutputs.packedPositionIds = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + + // helper tensors + auto const& stream = manager.getStream(); + scanTempStorageBytes + = tksd::invokeScanGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get()); + scanTempStorage = manager.gpu(scanTempStorageBytes); + cumSumGenerationLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32); + + // pre-allocate empty tensors + reshape(0, maxNumSequences, modelConfig); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void ExplicitDraftTokensBuffers::reshape( + SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto const numSequences = numCtxSequences + numGenSequences; + + auto const explicitDraftTokensModule + = std::dynamic_pointer_cast( + 
modelConfig.getSpeculativeDecodingModulePtr()); + + auto const numBeams = explicitDraftTokensModule->getMaxNumPaths(); + auto const beamDraftLength = explicitDraftTokensModule->getMaxDraftPathLen(); + auto const maxDecodingTokens = explicitDraftTokensModule->getMaxDecodingTokens(); + + // input tensors + engineInputs.requestTypesDevice->reshape(ITensor::makeShape({numSequences})); + engineInputs.temperatures->reshape(ITensor::makeShape({numSequences})); + + auto draftTokensShape = engineInputs.draftTokens->getShape(); + draftTokensShape.d[0] = numGenSequences; + engineInputs.draftTokens->reshape(draftTokensShape); + auto draftIndicesShape = engineInputs.draftIndices->getShape(); + draftIndicesShape.d[0] = numGenSequences; + engineInputs.draftIndices->reshape(draftIndicesShape); + auto draftProbsShape = engineInputs.draftProbs->getShape(); + draftProbsShape.d[0] = numGenSequences; + engineInputs.draftProbs->reshape(draftProbsShape); + + engineInputs.generationLengths->reshape(ITensor::makeShape({numGenSequences})); + engineInputs.positionIds->reshape(ITensor::makeShape({numSequences * maxDecodingTokens})); + engineInputs.positionOffsets->reshape(ITensor::makeShape({numGenSequences, maxDecodingTokens})); + engineInputs.packedMasks->reshape( + ITensor::makeShape({numGenSequences * maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)})); + + engineInputs.randomDataSample->reshape(ITensor::makeShape({numSequences})); + engineInputs.randomDataValidation->reshape(ITensor::makeShape({numGenSequences, numBeams, beamDraftLength})); + engineInputs.positionIdsBase->reshape(ITensor::makeShape({numSequences})); + + // output tensors + engineOutputs.nextGenerationLengths->reshape(ITensor::makeShape({numSequences})); + engineOutputs.nextPositionOffsets->reshape(ITensor::makeShape({numSequences, maxDecodingTokens})); + engineOutputs.masks->reshape(ITensor::makeShape({numSequences, maxDecodingTokens, maxDecodingTokens})); + + engineOutputs.nextFlatTokens->reshape(ITensor::makeShape({numSequences * maxDecodingTokens})); + engineOutputs.bestPathLengths->reshape(ITensor::makeShape({numSequences})); + engineOutputs.bestPathIndices->reshape(ITensor::makeShape({numSequences})); + engineOutputs.packedPositionIds->reshape(ITensor::makeShape({numSequences * maxDecodingTokens})); + + cumSumGenerationLengths->reshape(ITensor::makeShape({numSequences})); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExplicitDraftTokensBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, + SizeType32 vocabSizePadded, ITensor const& seqSlots, ExplicitDraftTokensBuffers::Inputs const& draftBuffers, + ITensor const& contextPositionIds, runtime::ExplicitDraftTokensModule const& explicitDraftTokensModule, + runtime::CudaStream const& stream) const +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + using runtime::bufferCast; + + tksd::PackExplicitDraftTokensParams params; + params.batchSize = numCtxSequences + numGenSequences; + params.numPaths = explicitDraftTokensModule.getMaxNumPaths(); + params.maxPathLength = explicitDraftTokensModule.getMaxPathLen(); + params.vocabSize = vocabSizePadded; + params.numContextRequests = numCtxSequences; + params.numGenerationRequests = numGenSequences; + params.numContextTokens = contextPositionIds.getShape().d[0]; + + params.batchSlots = bufferCast(seqSlots); + + params.maxGenerationLength = bufferCast(*engineOutputs.maxGenToken); + + params.inputTemperatures = bufferCast(*draftBuffers.temperatures); + params.inputPositionIdsBase = 
bufferCast(*draftBuffers.positionIdsBase); + params.inputGenerationLengths = bufferCast(*draftBuffers.generationLengths); + params.inputRandomDataSample = bufferCast(*draftBuffers.randomDataSample); + params.inputRandomDataValidation = bufferCast(*draftBuffers.randomDataValidation); + params.inputNextDraftTokens = bufferCast(*draftBuffers.draftTokens); + params.inputNextDraftIndices = bufferCast(*draftBuffers.draftIndices); + params.inputDraftProbs = bufferCast(*draftBuffers.draftProbs); + params.inputPackedMask = bufferCast(*draftBuffers.packedMasks); + params.inputPositionIds = bufferCast(*draftBuffers.positionIds); + + params.outputTemperatures = bufferCast(*engineInputs.temperatures); + params.outputPositionIdsBase = bufferCast(*engineInputs.positionIdsBase); + params.outputGenerationLengths = bufferCast(*engineInputs.generationLengths); + params.outputRandomDataSample = bufferCast(*engineInputs.randomDataSample); + params.outputRandomDataValidation = bufferCast(*engineInputs.randomDataValidation); + params.outputNextDraftTokens = bufferCast(*engineInputs.draftTokens); + params.outputNextDraftIndices = bufferCast(*engineInputs.draftIndices); + params.outputDraftProbs = bufferCast(*engineInputs.draftProbs); + params.outputPackedMask = bufferCast(*engineInputs.packedMasks); + params.outputPositionOffsets = bufferCast(*engineInputs.positionOffsets); + params.outputPositionIds = bufferCast(*engineInputs.positionIds); + + params.cumSumGenerationLengths = bufferCast(*cumSumGenerationLengths); + + params.checkParams(); + + // Pack tensors from batch slot position to continuous array + tksd::invokePackGenerationLengths(params, stream.get()); + + if (numGenSequences) + { + // Compute inclusive sum + tksd::invokeScanGenerationLengths(bufferCast(*scanTempStorage), scanTempStorageBytes, + bufferCast(*engineInputs.generationLengths), bufferCast(*cumSumGenerationLengths), + numGenSequences, stream.get()); + } + + // Pack tensors from batch slot position to continuous array + tksd::invokePackExplicitDraftTokens(params, stream.get()); + + if (numGenSequences) + { + // Copy draft probs + tksd::invokeCopyProbs(params, stream.get()); + } + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void ExplicitDraftTokensBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, + ITensor const& requestTypes, ITensor const& seqSlots, ExplicitDraftTokensBuffers::Inputs const& draftBuffers, + ITensor const& contextPositionIds, runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig, + runtime::WorldConfig const& worldConfig) const +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto const& manager = runtime.getBufferManager(); + auto const& stream = runtime.getStream(); + + // Copy position ids -- hacky solution to avoid filling them for the context requests. 
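+    // Context requests occupy the first contextPositionIds.getShape().d[0] entries of
+    // engineInputs.positionIds; the entries for generation requests are filled on the GPU by
+    // invokePackExplicitDraftTokens inside the typed setFromInputs call further down.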
+ TensorPtr posIdsSlice = ITensor::slice(engineInputs.positionIds, 0, contextPositionIds.getShape().d[0]); + manager.copy(contextPositionIds, *posIdsSlice); + + manager.copy(requestTypes, *engineInputs.requestTypesDevice); + + auto const numSequences = numCtxSequences + numGenSequences; + auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize()); + + auto const explicitDraftTokensModule = std::dynamic_pointer_cast( + modelConfig.getSpeculativeDecodingModulePtr()); + + auto const dtype = modelConfig.getDataType(); + + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + setFromInputs(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers, + contextPositionIds, *explicitDraftTokensModule, stream); + break; + case nvinfer1::DataType::kHALF: + setFromInputs(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers, + contextPositionIds, *explicitDraftTokensModule, stream); + break; + default: + TLLM_THROW("DataType %d not supported in ExplicitDraftTokensBuffers", static_cast(dtype)); + break; + } + + // reshape outputs + auto draftTokensShape = engineOutputs.nextDraftTokens->getShape(); + draftTokensShape.d[0] = numSequences; + engineOutputs.nextDraftTokens->reshape(draftTokensShape); + auto draftIndicesShape = engineOutputs.nextDraftIndices->getShape(); + draftIndicesShape.d[0] = numSequences; + engineOutputs.nextDraftIndices->reshape(draftIndicesShape); + auto draftProbsShape = engineOutputs.nextDraftProbs->getShape(); + draftProbsShape.d[0] = numSequences; + engineOutputs.nextDraftProbs->reshape(draftProbsShape); + + auto maxGenLength = bufferCast(*draftBuffers.maxGenLengthHost)[0]; + if (maxGenLength == 0) + { + maxGenLength = explicitDraftTokensModule->getMaxDecodingTokens(); + } + auto positionOffsetsShape = engineInputs.positionOffsets->getShape(); + positionOffsetsShape.d[1] = maxGenLength; + engineInputs.positionOffsets->reshape(positionOffsetsShape); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void ExplicitDraftTokensBuffers::insertInputTensors( + TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& /* worldConfig */) const +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + // inputs + inputBuffers.insert_or_assign("explicit_inverted_temperature", engineInputs.temperatures); + inputBuffers.insert_or_assign("device_request_types", engineInputs.requestTypesDevice); + + inputBuffers.insert_or_assign("spec_decoding_generation_lengths", engineInputs.generationLengths); + inputBuffers.insert_or_assign("spec_decoding_position_offsets", engineInputs.positionOffsets); + inputBuffers.insert_or_assign("spec_decoding_packed_mask", engineInputs.packedMasks); + + inputBuffers.insert_or_assign("draft_tokens", engineInputs.draftTokens); + inputBuffers.insert_or_assign("draft_indices", engineInputs.draftIndices); + inputBuffers.insert_or_assign("draft_probs", engineInputs.draftProbs); + + inputBuffers.insert_or_assign("rand_data_sample", engineInputs.randomDataSample); + inputBuffers.insert_or_assign("rand_data_validation", engineInputs.randomDataValidation); + inputBuffers.insert_or_assign("position_ids_base", engineInputs.positionIdsBase); + inputBuffers.insert_or_assign("position_ids", engineInputs.positionIds); + + // outputs + outputBuffers.insert_or_assign("next_spec_decoding_generation_lengths", engineOutputs.nextGenerationLengths); + outputBuffers.insert_or_assign("next_spec_decoding_position_offsets", engineOutputs.nextPositionOffsets); + 
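+    // The remaining output bindings expose the engine's draft predictions (next_draft_*), the
+    // flattened accepted tokens, and the per-request acceptance results consumed by the next
+    // decoding iteration.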
outputBuffers.insert_or_assign("spec_decoding_mask", engineOutputs.masks); + + outputBuffers.insert_or_assign("next_draft_tokens", engineOutputs.nextDraftTokens); + outputBuffers.insert_or_assign("next_draft_indices", engineOutputs.nextDraftIndices); + outputBuffers.insert_or_assign("next_draft_probs", engineOutputs.nextDraftProbs); + outputBuffers.insert_or_assign("next_flat_tokens", engineOutputs.nextFlatTokens); + + outputBuffers.insert_or_assign("num_accepted_tokens", engineOutputs.bestPathLengths); + outputBuffers.insert_or_assign("accepted_beam_index", engineOutputs.bestPathIndices); + outputBuffers.insert_or_assign("max_gen_token", engineOutputs.maxGenToken); + outputBuffers.insert_or_assign("total_gen_token", engineOutputs.totalGenToken); + outputBuffers.insert_or_assign("packed_position_ids", engineOutputs.packedPositionIds); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/gptDecoder.cpp b/cpp/tensorrt_llm/runtime/gptDecoder.cpp index 5fd99fd47..e650a2c54 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoder.cpp @@ -20,6 +20,7 @@ #include "tensorrt_llm/common/tensorConversion.h" #include "tensorrt_llm/kernels/decodingKernels.h" #include "tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h" +#include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/layers/dynamicDecodeLayer.h" #include @@ -56,43 +57,83 @@ GptDecoder::GptDecoder(executor::DecodingMode const& mode, size_t maxBatchSiz } template -void GptDecoder::setup( - SamplingConfig const& samplingConfig, size_t batchSize, std::optional const& batchSlots) +void GptDecoder::setup(SamplingConfig const& samplingConfig, size_t batchSize, + std::optional const& batchSlots, std::optional const& output) { mSamplingConfig = samplingConfig; auto setupParams = std::make_shared(); TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Sampling config is invalid"); - setupParams->penaltyParams.repetitionPenalty = mSamplingConfig.repetitionPenalty; - setupParams->penaltyParams.presencePenalty = mSamplingConfig.presencePenalty; - setupParams->penaltyParams.frequencyPenalty = mSamplingConfig.frequencyPenalty; - setupParams->penaltyParams.temperature = mSamplingConfig.temperature; - setupParams->penaltyParams.minLength = mSamplingConfig.minLength; - setupParams->penaltyParams.noRepeatNgramSize = mSamplingConfig.noRepeatNgramSize; + auto penaltyParams = std::make_shared(); + penaltyParams->repetitionPenalty = mSamplingConfig.repetitionPenalty; + penaltyParams->presencePenalty = mSamplingConfig.presencePenalty; + penaltyParams->frequencyPenalty = mSamplingConfig.frequencyPenalty; + penaltyParams->temperature = mSamplingConfig.temperature; + penaltyParams->minLength = mSamplingConfig.minLength; - setupParams->randomSeed = mSamplingConfig.randomSeed; + setupParams->penaltyParams = std::move(penaltyParams); - setupParams->samplingParams.normalize_log_probs = mSamplingConfig.normalizeLogProbs; - // signed to unsigned - if (mSamplingConfig.topK) + auto banWordsParams = std::make_shared(); + banWordsParams->noRepeatNgramSize = mSamplingConfig.noRepeatNgramSize; + + setupParams->banWordsParams = std::move(banWordsParams); + + if (mDecodingMode.isTopKorTopP()) { - auto const& topK = mSamplingConfig.topK.value(); - setupParams->samplingParams.runtime_top_k = std::vector(std::begin(topK), std::end(topK)); + auto samplingParams = std::make_shared(); + samplingParams->normalizeLogProbs = 
mSamplingConfig.normalizeLogProbs; + // signed to unsigned + if (mSamplingConfig.topK) + { + auto const& topK = mSamplingConfig.topK.value(); + samplingParams->runtimeTopK = std::vector(std::begin(topK), std::end(topK)); + } + + samplingParams->runtimeTopP = mSamplingConfig.topP; + samplingParams->topPDecay = mSamplingConfig.topPDecay; + samplingParams->topPMin = mSamplingConfig.topPMin; + samplingParams->topPResetIds = mSamplingConfig.topPResetIds; + samplingParams->outputLogProbs = mSamplingConfig.outputLogProbs; + samplingParams->cumLogProbs = mSamplingConfig.cumLogProbs; + + setupParams->decodingParams = std::move(samplingParams); } + else if (mDecodingMode.isBeamSearch()) + { + auto beamSearchParams = std::make_shared(); + beamSearchParams->beamSearchDiversityRate = mSamplingConfig.beamSearchDiversityRate; + beamSearchParams->lengthPenalty = mSamplingConfig.lengthPenalty; + beamSearchParams->earlyStopping = mSamplingConfig.earlyStopping; - setupParams->samplingParams.runtime_top_p = mSamplingConfig.topP; - setupParams->samplingParams.top_p_decay = mSamplingConfig.topPDecay; - setupParams->samplingParams.top_p_min = mSamplingConfig.topPMin; - setupParams->samplingParams.top_p_reset_ids = mSamplingConfig.topPResetIds; - setupParams->samplingParams.outputLogProbs = mSamplingConfig.outputLogProbs; - setupParams->samplingParams.cumLogProbs = mSamplingConfig.cumLogProbs; + setupParams->decodingParams = std::move(beamSearchParams); + } + else if (mDecodingMode.isMedusa()) + { + auto medusaParams = std::make_shared(); + // signed to unsigned + if (mSamplingConfig.topK) + { + auto const& topK = mSamplingConfig.topK.value(); + medusaParams->runtimeTopK = std::vector(std::begin(topK), std::end(topK)); + } + medusaParams->runtimeHeadsTopK = mSamplingConfig.topKMedusaHeads; - setupParams->beamSearchParams.beam_search_diversity_rate = mSamplingConfig.beamSearchDiversityRate; - setupParams->beamSearchParams.length_penalty = mSamplingConfig.lengthPenalty; - setupParams->beamSearchParams.early_stopping = mSamplingConfig.earlyStopping; + setupParams->decodingParams = std::move(medusaParams); + } + else if (mDecodingMode.isExplicitDraftTokens()) + { + TLLM_CHECK_WITH_INFO(output.has_value(), "Output tensors must be provided for ExplicitDraftTokens"); + auto explicitDraftTokensParams = std::make_shared(); + explicitDraftTokensParams->temperature = mSamplingConfig.temperature; + explicitDraftTokensParams->randomDataSample + = tcc::toTllmTensor(*output->explicitDraftTokensBuffers->randomDataSample); + explicitDraftTokensParams->temperatures = tcc::toTllmTensor(*output->explicitDraftTokensBuffers->temperatures); - setupParams->medusaParams.topKMedusaHeads = mSamplingConfig.topKMedusaHeads; + setupParams->decodingParams = explicitDraftTokensParams; + } + + setupParams->decodingParams->randomSeed = mSamplingConfig.randomSeed; auto const batchSlotsPtr = batchSlots.has_value() ? 
bufferCast(*(batchSlots.value())) : nullptr; mDynamicDecodeLayer->setup(batchSize, mSamplingConfig.beamWidth, batchSlotsPtr, setupParams); @@ -109,18 +150,53 @@ void safeInsert(tc::TensorMap& map, std::string const& key, DecodingOutput::Tens } } -template -tl::DynamicDecodeInputParams::MedusaInputs prepareMedusaInputs(DecodingInput const& inputs, size_t maxBatchSize) +std::shared_ptr prepareBanWordsInputs(DecodingInput const& input) +{ + auto banWordsParams = std::make_shared(input.batchSize); + if (input.badWordsPtrs) + { + TLLM_CHECK_WITH_INFO(input.badWordsPtrs, "Bad word lengths must be provided when badWordsPtrs is given"); + banWordsParams->badWordsPtr = tcc::toTllmTensor(*input.badWordsPtrs); + banWordsParams->badWordsLengths = tcc::toTllmTensor(*input.badWordsLens); + banWordsParams->maxBadWordsLen = input.maxBadWordsLen; + } + + return banWordsParams; +} + +std::shared_ptr prepareStopCriteriaInputs(DecodingInput const& input) +{ + auto stopCriteriaParams = std::make_shared(input.batchSize); + if (input.stopWordsPtrs) + { + TLLM_CHECK_WITH_INFO(input.stopWordsLens, "Stop word lengths must be provided when stopWordsPtrs is given"); + + stopCriteriaParams->stopWordsPtr = tcc::toTllmTensor(*input.stopWordsPtrs); + stopCriteriaParams->stopWordsLengths = tcc::toTllmTensor(*input.stopWordsLens); + stopCriteriaParams->maxStopWordsLen = input.maxStopWordsLen; + } + + if (input.sequenceLimitLength) + { + stopCriteriaParams->sequenceLimitLength = tcc::toTllmTensor(*input.sequenceLimitLength); + } + + return stopCriteriaParams; +} + +void prepareMedusaInputs( + DecodingInput const& inputs, size_t maxBatchSize, std::shared_ptr& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputParams = std::dynamic_pointer_cast(baseInputs); + auto const& medusaInputs = inputs.medusaInputs.value(); - tl::DynamicDecodeInputParams::MedusaInputs medusaDecodingInputs; - medusaDecodingInputs.medusaCurTokensPerStep = tcc::toTllmTensor(*medusaInputs.medusaCurTokensPerStep); - medusaDecodingInputs.medusaTargetTokensPerStep = tcc::toTllmTensor(*medusaInputs.medusaTargetTokensPerStep); - medusaDecodingInputs.medusaPaths = tcc::toTllmTensor(*medusaInputs.medusaPaths); - medusaDecodingInputs.medusaTreeIds = tcc::toTllmTensor(*medusaInputs.medusaTreeIds); + inputParams->curTokensPerStep = tcc::toTllmTensor(*medusaInputs.medusaCurTokensPerStep); + inputParams->targetTokensPerStep = tcc::toTllmTensor(*medusaInputs.medusaTargetTokensPerStep); + inputParams->paths = tcc::toTllmTensor(*medusaInputs.medusaPaths); + inputParams->treeIds = tcc::toTllmTensor(*medusaInputs.medusaTreeIds); auto const batchSlots = bufferCast(*inputs.batchSlots); if (medusaInputs.medusaLogits.size()) { @@ -141,81 +217,111 @@ tl::DynamicDecodeInputParams::MedusaInputs prepareMedusaInputs(DecodingInput con } } } - medusaDecodingInputs.medusaLogits = medusaLogits; + inputParams->medusaLogits = medusaLogits; } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return medusaDecodingInputs; } -template -tl::DynamicDecodeInputParams::ExplicitDraftTokensInputs prepareExplicitDraftTokensInput( - DecodingInput const& inputs, size_t maxBatchSize) +void prepareExplicitDraftTokensInput( + DecodingInput const& inputs, size_t maxBatchSize, std::shared_ptr& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto inputParams = std::dynamic_pointer_cast(baseInputs); + + auto& explicitDraftTokensInputs = inputs.explicitDraftTokensInputs; + + TLLM_CHECK_WITH_INFO(explicitDraftTokensInputs.has_value(), "ExplicitDraftTokensInputs are 
not set"); + + inputParams->nextDraftTokens = tcc::toTllmTensor(*explicitDraftTokensInputs->nextDraftTokens); + inputParams->nextFlatTokens = tcc::toTllmTensor(*explicitDraftTokensInputs->nextFlatTokens); + inputParams->nextDraftIndices = tcc::toTllmTensor(*explicitDraftTokensInputs->nextDraftIndices); + inputParams->nextDraftProbs = tcc::toTllmTensor(*explicitDraftTokensInputs->nextDraftProbs); + inputParams->lastDraftTokens = tcc::toTllmTensor(*explicitDraftTokensInputs->lastDraftTokens); + inputParams->lastDraftIndices = tcc::toTllmTensor(*explicitDraftTokensInputs->lastDraftIndices); + inputParams->masks = tcc::toTllmTensor(*explicitDraftTokensInputs->masks); + inputParams->packedPosIds = tcc::toTllmTensor(*explicitDraftTokensInputs->packedPositionIds); + inputParams->bestPathLengths = tcc::toTllmTensor(*explicitDraftTokensInputs->bestPathLengths); + inputParams->bestPathIndices = tcc::toTllmTensor(*explicitDraftTokensInputs->bestPathIndices); + inputParams->generationLengths = tcc::toTllmTensor(*explicitDraftTokensInputs->nextGenerationLengths); + inputParams->positionIdsBase = tcc::toTllmTensor(*explicitDraftTokensInputs->lastPositionIdsBase); + inputParams->lastGenerationLengths = tcc::toTllmTensor(*explicitDraftTokensInputs->lastGenerationLengths); + inputParams->maxGenLengthDevice = tcc::toTllmTensor(*explicitDraftTokensInputs->maxGenLengthDevice); + inputParams->seqSlots = tcc::toTllmTensor(*explicitDraftTokensInputs->seqSlots); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return tl::DynamicDecodeInputParams::ExplicitDraftTokensInputs{}; } template -std::shared_ptr prepareInputs( +std::shared_ptr prepareInputs( DecodingInput const& input, size_t maxBatchSize, tle::DecodingMode const& decodingMode) { - auto constexpr ite = 0; // no pipeline parallelism - auto forwardParams = std::make_shared(input.step, ite, input.maxLength, - input.maxAttentionWindow, input.sinkTokenLength, input.maxBatchSize, tcc::toTllmTensor(*input.endIds)); + auto constexpr ite = 0; - if (input.logitsVec) + std::shared_ptr forwardParams; + if (decodingMode.isTopKorTopP()) { - std::vector logitsVec; - for (auto const& logits : input.logitsVec.value()) - { - TLLM_CHECK(logits->getDataType() == TRTDataType::value); - logitsVec.push_back(tcc::toTllmTensor(*logits)); - } - forwardParams->logits_vec = logitsVec; + forwardParams + = std::make_shared(tcc::toTllmTensor(*input.endIds), input.step, ite, input.batchSize); } - else + else if (decodingMode.isBeamSearch()) { - TLLM_CHECK(input.logits->getDataType() == TRTDataType::value); - forwardParams->logits = tcc::toTllmTensor(*input.logits); + forwardParams = std::make_shared(tcc::toTllmTensor(*input.endIds), input.step, ite, + input.batchSize, input.maxAttentionWindow, input.sinkTokenLength); } - - if (input.cacheIndirection) + else if (decodingMode.isMedusa()) { - forwardParams->src_cache_indirection = tcc::toTllmTensor(*input.cacheIndirection); + forwardParams = std::make_shared(tcc::toTllmTensor(*input.endIds), input.batchSize); } - - if (input.sequenceLimitLength) + else if (decodingMode.isLookahead()) + { + // TODO add lookahead inputs + } + else if (decodingMode.isExplicitDraftTokens()) { - forwardParams->sequence_limit_length = tcc::toTllmTensor(*input.sequenceLimitLength); + forwardParams + = std::make_shared(tcc::toTllmTensor(*input.endIds), input.batchSize); } - if (input.embeddingBias) + // No logits for explicit draft tokens + if (!decodingMode.isExplicitDraftTokens()) { - forwardParams->embedding_bias = tcc::toTllmTensor(*input.embeddingBias); + 
if (input.logitsVec) + { + std::vector logitsVec; + for (auto const& logits : input.logitsVec.value()) + { + TLLM_CHECK(logits->getDataType() == TRTDataType::value); + logitsVec.push_back(tcc::toTllmTensor(*logits)); + } + forwardParams->logitsVec = logitsVec; + } + else if (input.logits) + { + TLLM_CHECK(input.logits->getDataType() == TRTDataType::value); + forwardParams->logits = tcc::toTllmTensor(*input.logits); + } } - if (input.lengths) + if (input.cacheIndirection) { - forwardParams->input_lengths = tcc::toTllmTensor(*input.lengths); + forwardParams->srcCacheIndirection = tcc::toTllmTensor(*input.cacheIndirection); } - if (input.badWordsPtrs) + if (input.embeddingBias) { - TLLM_CHECK_WITH_INFO(input.badWordsPtrs, "Bad word lengths must be provided when badWordsPtrs is given"); - forwardParams->bad_words_ptr = tcc::toTllmTensor(*input.badWordsPtrs); - forwardParams->bad_words_lengths = tcc::toTllmTensor(*input.badWordsLens); - forwardParams->max_bad_words_len = input.maxBadWordsLen; + forwardParams->embeddingBias = tcc::toTllmTensor(*input.embeddingBias); } - if (input.stopWordsPtrs) + if (input.lengths) { - TLLM_CHECK_WITH_INFO(input.stopWordsLens, "Stop word lengths must be provided when stopWordsPtrs is given"); - forwardParams->stop_words_ptr = tcc::toTllmTensor(*input.stopWordsPtrs); - forwardParams->stop_words_lengths = tcc::toTllmTensor(*input.stopWordsLens); - forwardParams->max_stop_words_len = input.maxStopWordsLen; + forwardParams->inputLengths = tcc::toTllmTensor(*input.lengths); } + forwardParams->banWordsInputs = prepareBanWordsInputs(input); + + forwardParams->stopCriteriaInputs = prepareStopCriteriaInputs(input); + if (input.finished) { forwardParams->finished = tcc::toTllmTensor(*input.finished); @@ -223,19 +329,19 @@ std::shared_ptr prepareInputs( if (input.batchSlots) { - forwardParams->batch_slots = tcc::toTllmTensor(*input.batchSlots); + forwardParams->batchSlots = tcc::toTllmTensor(*input.batchSlots); } // Medusa if (decodingMode.isMedusa()) { - forwardParams->medusaInputs = prepareMedusaInputs(input, maxBatchSize); + prepareMedusaInputs(input, maxBatchSize, forwardParams); } // Explicit draft tokens if (decodingMode.isExplicitDraftTokens()) { - forwardParams->explicitDraftTokensInputs = prepareExplicitDraftTokensInput(input, maxBatchSize); + prepareExplicitDraftTokensInput(input, maxBatchSize, forwardParams); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -243,106 +349,165 @@ std::shared_ptr prepareInputs( return forwardParams; } -template -tl::DynamicDecodeOutputParams::SpeculativeDecodingOutputs prepareSpeculativeDecodingOutputs( - DecodingOutput::SpeculativeDecodingOutputs& output) +void prepareBeamSearchOutputs(DecodingOutput& output, std::shared_ptr& baseOutputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - tl::DynamicDecodeOutputParams::SpeculativeDecodingOutputs speculativeDecodingOutputs; - speculativeDecodingOutputs.nextDraftTokens = tcc::toTllmTensor(*output.nextDraftTokens); - speculativeDecodingOutputs.acceptedLengths = tcc::toTllmTensor(*output.acceptedTokensLen); - speculativeDecodingOutputs.acceptedLengthsCumSum = tcc::toTllmTensor(*output.acceptedLengthsCumSum); - speculativeDecodingOutputs.pathsOffsets = tcc::toTllmTensor(*output.pathsOffsets); + auto outputParams = std::dynamic_pointer_cast(baseOutputs); + outputParams->beamHypotheses = std::make_unique(); + if (output.beamHypotheses.outputIdsCBA) + { + outputParams->beamHypotheses->outputIdsCBA = bufferCast(*output.beamHypotheses.outputIdsCBA); + } + if 
(output.beamHypotheses.logProbsCBA) + { + outputParams->beamHypotheses->logProbsCBA = bufferCast(*output.beamHypotheses.logProbsCBA); + } + if (output.beamHypotheses.sequenceLengthsCBA) + { + outputParams->beamHypotheses->sequenceLengthsCBA = bufferCast(*output.beamHypotheses.sequenceLengthsCBA); + } + if (output.beamHypotheses.cumLogProbsCBA) + { + outputParams->beamHypotheses->cumLogProbsCBA = bufferCast(*output.beamHypotheses.cumLogProbsCBA); + } + if (output.beamHypotheses.normedScoresCBA) + { + outputParams->beamHypotheses->normedScoresCBA = bufferCast(*output.beamHypotheses.normedScoresCBA); + } + if (output.beamHypotheses.numBeamsCBA) + { + outputParams->beamHypotheses->numBeamsCBA = bufferCast(*output.beamHypotheses.numBeamsCBA); + } + if (output.beamHypotheses.minNormedScoresCBA) + { + outputParams->beamHypotheses->minNormedScoresCBA = bufferCast(*output.beamHypotheses.minNormedScoresCBA); + } + if (output.beamHypotheses.batchDones) + { + outputParams->beamHypotheses->batchDones = bufferCast(*output.beamHypotheses.batchDones); + } + + if (output.cacheIndirection) + { + outputParams->tgtCacheIndirection = tcc::toTllmTensor(*output.cacheIndirection); + } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return speculativeDecodingOutputs; } -template -std::shared_ptr prepareOutputs( - DecodingOutput& output, DecodingOutput::TensorPtr& logProbsTiled, tle::DecodingMode const& decodingMode) +void prepareSpeculativeDecodingOutputs(DecodingOutput& output, std::shared_ptr& baseOutputs, + tle::DecodingMode const& decodingMode) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto outputParams = std::make_shared(tcc::toTllmTensor(*output.ids)); - outputParams->newTokens = tcc::toTllmTensor(*output.newTokens); + auto outputParams = std::dynamic_pointer_cast(baseOutputs); - if (output.cumLogProbs) - { - outputParams->cum_log_probs = tcc::toTllmTensor(*output.cumLogProbs); - } + auto const& speculativeDecodingOutputs = output.speculativeDecodingOutputs; + TLLM_CHECK_WITH_INFO(speculativeDecodingOutputs.has_value(), "speculativeDecodingOutputs is not set"); - if (output.parentIds) + outputParams->nextDraftTokens = tcc::toTllmTensor(*speculativeDecodingOutputs->nextDraftTokens); + outputParams->numNewTokens = tcc::toTllmTensor(*speculativeDecodingOutputs->acceptedTokensLen); + outputParams->numNewTokensCumSum = tcc::toTllmTensor(*speculativeDecodingOutputs->acceptedLengthsCumSum); + outputParams->pathsOffsets = tcc::toTllmTensor(*speculativeDecodingOutputs->pathsOffsets); + if (speculativeDecodingOutputs->nextDraftTokensLen) { - outputParams->parent_ids = tcc::toTllmTensor(*output.parentIds); + outputParams->nextDraftLengths = tcc::toTllmTensor(*speculativeDecodingOutputs->nextDraftTokensLen); } - - if (output.cacheIndirection) + if (speculativeDecodingOutputs->prevDraftTokensLen) { - outputParams->tgt_cache_indirection = tcc::toTllmTensor(*output.cacheIndirection); + outputParams->prevDraftLengths = tcc::toTllmTensor(*speculativeDecodingOutputs->prevDraftTokensLen); } - if (output.finished) + if (decodingMode.isExplicitDraftTokens()) { - outputParams->finished = tcc::toTllmTensor(*output.finished); + auto outputParams = std::dynamic_pointer_cast(baseOutputs); + auto const& explicitDraftTokensBuffers = output.explicitDraftTokensBuffers; + TLLM_CHECK_WITH_INFO(explicitDraftTokensBuffers.has_value(), "explicitDraftTokensBuffers is not set"); + outputParams->packedMasks = tcc::toTllmTensor(*explicitDraftTokensBuffers->packedMasks); + outputParams->nextDraftPosIds = 
tcc::toTllmTensor(*explicitDraftTokensBuffers->positionIds); + + outputParams->unpackedNextDraftTokens = tcc::toTllmTensor(*explicitDraftTokensBuffers->draftTokens); + outputParams->unpackedNextDraftIndices = tcc::toTllmTensor(*explicitDraftTokensBuffers->draftIndices); + outputParams->nextDraftProbs = tcc::toTllmTensor(*explicitDraftTokensBuffers->draftProbs); + outputParams->positionIdsBase = tcc::toTllmTensor(*explicitDraftTokensBuffers->positionIdsBase); + outputParams->randomDataSample = tcc::toTllmTensor(*explicitDraftTokensBuffers->randomDataSample); + outputParams->randomDataValidation = tcc::toTllmTensor(*explicitDraftTokensBuffers->randomDataValidation); + outputParams->temperatures = tcc::toTllmTensor(*explicitDraftTokensBuffers->temperatures); + outputParams->generationLengths = tcc::toTllmTensor(*explicitDraftTokensBuffers->generationLengths); + outputParams->maxGenLengthHost = tcc::toTllmTensor(*explicitDraftTokensBuffers->maxGenLengthHost); } - if (output.finishedSum) + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +std::shared_ptr prepareOutputs( + DecodingOutput& output, DecodingOutput::TensorPtr& logProbsTiled, tle::DecodingMode const& decodingMode) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + std::shared_ptr outputParams; + + if (decodingMode.isBeamSearch()) { - outputParams->finished_sum = tcc::toTllmTensor(*output.finishedSum); + outputParams = std::make_shared(tcc::toTllmTensor(*output.ids)); } - - if (output.lengths) + else if (decodingMode.isMedusa() || decodingMode.isLookahead()) { - outputParams->sequence_length = tcc::toTllmTensor(*output.lengths); + outputParams = std::make_shared(tcc::toTllmTensor(*output.ids)); } - - if (output.logProbs) + else if (decodingMode.isExplicitDraftTokens()) { - outputParams->output_log_probs = tcc::toTllmTensor(*output.logProbs); - outputParams->output_log_probs_tiled = tcc::toTllmTensor(*logProbsTiled); + outputParams = std::make_shared(tcc::toTllmTensor(*output.ids)); } - - outputParams->beamHypotheses = std::make_unique(); - if (output.beamHypotheses.outputIdsCBA) + else { - outputParams->beamHypotheses->outputIdsCBA = bufferCast(*output.beamHypotheses.outputIdsCBA); + outputParams = std::make_shared(tcc::toTllmTensor(*output.ids)); } - if (output.beamHypotheses.logProbsCBA) + + // Common outputs + outputParams->newTokens = tcc::toTllmTensor(*output.newTokens); + + if (output.cumLogProbs) { - outputParams->beamHypotheses->logProbsCBA = bufferCast(*output.beamHypotheses.logProbsCBA); + outputParams->cumLogProbs = tcc::toTllmTensor(*output.cumLogProbs); } - if (output.beamHypotheses.sequenceLengthsCBA) + + if (output.parentIds) { - outputParams->beamHypotheses->sequenceLengthsCBA = bufferCast(*output.beamHypotheses.sequenceLengthsCBA); + outputParams->parentIds = tcc::toTllmTensor(*output.parentIds); } - if (output.beamHypotheses.cumLogProbsCBA) + + if (output.finished) { - outputParams->beamHypotheses->cumLogProbsCBA = bufferCast(*output.beamHypotheses.cumLogProbsCBA); + outputParams->finished = tcc::toTllmTensor(*output.finished); } - if (output.beamHypotheses.normedScoresCBA) + + if (output.finishedSum) { - outputParams->beamHypotheses->normedScoresCBA = bufferCast(*output.beamHypotheses.normedScoresCBA); + outputParams->finishedSum = tcc::toTllmTensor(*output.finishedSum); } - if (output.beamHypotheses.numBeamsCBA) + + if (output.lengths) { - outputParams->beamHypotheses->numBeamsCBA = bufferCast(*output.beamHypotheses.numBeamsCBA); + outputParams->sequenceLength = tcc::toTllmTensor(*output.lengths); } - if 
(output.beamHypotheses.minNormedScoresCBA) + + if (output.logProbs) { - outputParams->beamHypotheses->minNormedScoresCBA = bufferCast(*output.beamHypotheses.minNormedScoresCBA); + outputParams->outputLogProbs = tcc::toTllmTensor(*output.logProbs); + outputParams->outputLogProbsTiled = tcc::toTllmTensor(*logProbsTiled); } - if (output.beamHypotheses.batchDones) + + // Beam search outputs + if (decodingMode.isBeamSearch()) { - outputParams->beamHypotheses->batchDones = bufferCast(*output.beamHypotheses.batchDones); + prepareBeamSearchOutputs(output, outputParams); } - // Speculative decoding + // Speculative decoding outputs if (decodingMode.isMedusa() || decodingMode.isLookahead() || decodingMode.isExplicitDraftTokens()) { - outputParams->speculativeDecodingOutputs - = prepareSpeculativeDecodingOutputs(output.speculativeDecodingOutputs.value()); + prepareSpeculativeDecodingOutputs(output, outputParams, decodingMode); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -356,7 +521,7 @@ void GptDecoder::forwardAsync(DecodingOutput& output, DecodingInput const& in { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto forwardParams = prepareInputs(input, mMaxBatchSize, mDecodingMode); - auto outputParams = prepareOutputs(output, mLogProbsTiled, mDecodingMode); + auto outputParams = prepareOutputs(output, mLogProbsTiled, mDecodingMode); mDynamicDecodeLayer->forwardAsync(outputParams, forwardParams); @@ -368,7 +533,7 @@ void GptDecoder::forwardSync(DecodingOutput& output, DecodingInput const& inp { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto forwardParams = prepareInputs(input, mMaxBatchSize, mDecodingMode); - auto outputParams = prepareOutputs(output, mLogProbsTiled, mDecodingMode); + auto outputParams = prepareOutputs(output, mLogProbsTiled, mDecodingMode); mDynamicDecodeLayer->forwardSync(outputParams, forwardParams); diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp index 6783da07c..1c5c22ba6 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaEvent.h" +#include "tensorrt_llm/runtime/memoryCounters.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include @@ -165,6 +166,8 @@ void GptDecoderBatch::allocateSpeculativeDecodingBuffers() { speculativeDecodingOutputs.nextDraftTokensLen = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + speculativeDecodingOutputs.prevDraftTokensLen + = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); } } if (mSpeculativeDecodingMode.needsKVCacheRewind()) @@ -177,6 +180,17 @@ void GptDecoderBatch::allocateSpeculativeDecodingBuffers() = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); } dOutput->speculativeDecodingOutputs = speculativeDecodingOutputs; + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void GptDecoderBatch::setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + TLLM_CHECK(mSpeculativeDecodingMode.isExplicitDraftTokens()); + mJointDecodingOutput->explicitDraftTokensBuffers = std::move(explicitDraftTokensBuffers); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -190,7 +204,6 @@ void GptDecoderBatch::setup(executor::DecodingMode const& mode, SizeType32 maxBa TLLM_CHECK(maxTokensPerEngineStep > 0); 
TLLM_CHECK(maxSequenceLength > 0); mActualBatchSize = maxBatchSize; - mNumDecodingEngineTokens.resize(maxBatchSize); mMaxSequenceLength = maxSequenceLength; mMaxAttentionWindow = maxAttentionWindow; mSinkTokenLength = sinkTokenLength; @@ -294,34 +307,44 @@ void GptDecoderBatch::setup(executor::DecodingMode const& mode, SizeType32 maxBa mMaxDecodingDecoderTokens = 1; } - auto const numOfDecoders = fusedDecoder ? 1 : maxBatchSize; + if (!mFusedDecoder) + { + mStreams.resize(maxBatchSize); + auto const device = mStream->getDevice(); + for (SizeType32 i = 0; i < maxBatchSize; ++i) + { + mStreams[i] = std::make_shared(); + TLLM_CHECK(mStreams[i]->getDevice() == device); + } + } - mStreams.resize(maxBatchSize); + auto const numOfDecoders = mFusedDecoder ? 1 : maxBatchSize; + auto const maxBatchSizePerDecoder = mFusedDecoder ? maxBatchSize : 1; mDecoders.resize(numOfDecoders); + for (SizeType32 i = 0; i < numOfDecoders; ++i) + { + auto& stream = mFusedDecoder ? mStream : mStreams.at(i); + mDecoders[i] = IGptDecoder::create(mode, dtype, maxBatchSizePerDecoder, maxBeamWidth, mVocabSize, + mVocabSizePadded, mMaxSequenceLength, stream, speculativeDecodingModulePtr); + } + + mNbSteps.clear(); + mNbSteps.resize(maxBatchSize, 0); + mFinished.clear(); + mFinished.resize(maxBatchSize, true); + mMaxNewTokens.clear(); + mMaxNewTokens.resize(maxBatchSize, 0); + mBeamWidths.clear(); + mBeamWidths.resize(maxBatchSize, 0); + mNumDecodingEngineTokens.clear(); + mNumDecodingEngineTokens.resize(maxBatchSize, 0); + mDecodingInputs.resize(maxBatchSize); mDecodingOutputs.resize(maxBatchSize); - mNbSteps.resize(maxBatchSize); - mFinished.resize(maxBatchSize); - mMaxNewTokens.resize(maxBatchSize); - mBeamWidths.resize(maxBatchSize); - auto const device = mStream->getDevice(); for (SizeType32 i = 0; i < maxBatchSize; ++i) { - mStreams[i] = std::make_shared(); - TLLM_CHECK(mStreams[i]->getDevice() == device); - if (i < numOfDecoders) - { - auto maxBatchSizePerDecoder = fusedDecoder ? maxBatchSize : 1; - mDecoders[i] = IGptDecoder::create(mode, dtype, maxBatchSizePerDecoder, maxBeamWidth, mVocabSize, - mVocabSizePadded, mMaxSequenceLength, mStreams[i], speculativeDecodingModulePtr); - } mDecodingInputs[i].reset(); mDecodingOutputs[i].reset(); - mNbSteps[i] = 0; - mFinished[i] = true; - mMaxNewTokens[i] = 0; - mBeamWidths[i] = 0; - mNumDecodingEngineTokens[i] = 0; } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -360,6 +383,7 @@ void GptDecoderBatch::setupSpeculativeDecoding(ModelConfig const& modelConfig) if (mSpeculativeDecodingMode.variableDraftLength()) { dOutput.speculativeDecodingOutputs->nextDraftTokensLen->reshape(ITensor::makeShape({mActualBatchSize})); + dOutput.speculativeDecodingOutputs->prevDraftTokensLen->reshape(ITensor::makeShape({mActualBatchSize})); } } if (mSpeculativeDecodingMode.needsKVCacheRewind()) @@ -405,7 +429,7 @@ void GptDecoderBatch::newRequest( auto constexpr localBatchSize = 1; auto const decoderIdx = mFusedDecoder ? 0 : batchIdx; - auto& stream = mStreams[decoderIdx]; + auto& stream = mFusedDecoder ? mStream : mStreams[decoderIdx]; BufferManager manager{stream}; // input @@ -575,7 +599,7 @@ void GptDecoderBatch::newRequestSpeculativeDecoding( if (mSpeculativeDecodingMode.predictsDraftTokens()) { auto constexpr decoderIdx = 0; - auto& stream = mStreams[decoderIdx]; + auto& stream = mFusedDecoder ? 
mStream : mStreams[decoderIdx]; BufferManager manager{stream}; auto& dJointOutput = *mJointDecodingOutput; @@ -585,6 +609,12 @@ void GptDecoderBatch::newRequestSpeculativeDecoding( = ITensor::slice(dJointOutput.speculativeDecodingOutputs->nextDraftTokens, batchIdx, localBatchSize); // FIXME(nkorobov): can we skip this? manager.setZero(*nextDraftTokens); + if (mSpeculativeDecodingMode.variableDraftLength()) + { + TensorPtr nextDraftTokensLen + = ITensor::slice(dJointOutput.speculativeDecodingOutputs->nextDraftTokensLen, batchIdx, localBatchSize); + manager.setZero(*nextDraftTokensLen); + } } if (mSpeculativeDecodingMode.isDraftTokensExternal()) @@ -613,7 +643,7 @@ void GptDecoderBatch::newRequestDraftTokensExternal( TLLM_CHECK_WITH_INFO(mFusedDecoder, "Speculative decoding requires fused decoder"); auto constexpr decoderIdx = 0; - auto& stream = mStreams[decoderIdx]; + auto& stream = mFusedDecoder ? mStream : mStreams[decoderIdx]; BufferManager manager{stream}; auto constexpr localBatchSize = 1; @@ -658,7 +688,7 @@ void GptDecoderBatch::newRequestMedusa(SizeType32 batchIdx, decoder_batch::Reque TLLM_CHECK_WITH_INFO(mFusedDecoder, "Medusa requires fused decoder"); auto constexpr decoderIdx = 0; - auto& stream = mStreams[decoderIdx]; + auto& stream = mFusedDecoder ? mStream : mStreams[decoderIdx]; BufferManager manager{stream}; auto& dJointInput = *mJointDecodingInput; @@ -704,8 +734,42 @@ void GptDecoderBatch::newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK_WITH_INFO(mFusedDecoder, "Explicit draft tokens decoding requires fused decoder"); - // TODO(nkorobov) add explicit draft tokens - TLLM_LOG_WARNING("Explicit draft tokens is not supported yet."); + TLLM_CHECK(mJointDecodingOutput->explicitDraftTokensBuffers); + + auto constexpr localBatchSize = 1; + auto& stream = mStream; + + TensorPtr positionIdsBaseSlice + = ITensor::slice(mJointDecodingOutput->explicitDraftTokensBuffers->positionIdsBase, batchIdx, localBatchSize); + kernels::invokeFill(*positionIdsBaseSlice, request.inputLen, *stream); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void GptDecoderBatch::setExplicitDraftTokensInputs(decoder_batch::Input const& input) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto explicitDraftTokensInputs = DecodingInput::ExplicitDraftTokensInputs(); + TLLM_CHECK(input.explicitDraftTokensInputs.has_value()); + TLLM_CHECK(input.explicitDraftTokensLastInputs.has_value()); + + explicitDraftTokensInputs.nextDraftTokens = input.explicitDraftTokensInputs->nextDraftTokens; + explicitDraftTokensInputs.nextFlatTokens = input.explicitDraftTokensInputs->nextFlatTokens; + explicitDraftTokensInputs.nextDraftIndices = input.explicitDraftTokensInputs->nextDraftIndices; + explicitDraftTokensInputs.nextDraftProbs = input.explicitDraftTokensInputs->nextDraftProbs; + explicitDraftTokensInputs.lastDraftTokens = input.explicitDraftTokensLastInputs->draftTokens; + explicitDraftTokensInputs.lastDraftIndices = input.explicitDraftTokensLastInputs->draftIndices; + explicitDraftTokensInputs.lastPositionIdsBase = input.explicitDraftTokensLastInputs->positionIdsBase; + explicitDraftTokensInputs.masks = input.explicitDraftTokensInputs->masks; + explicitDraftTokensInputs.packedPositionIds = input.explicitDraftTokensInputs->packedPositionIds; + explicitDraftTokensInputs.bestPathLengths = input.explicitDraftTokensInputs->bestPathLengths; + explicitDraftTokensInputs.bestPathIndices = input.explicitDraftTokensInputs->bestPathIndices; + 
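+    // The generation lengths below follow the same split: "next*" values come from this step's
+    // engine outputs, while "last*" values are what was fed to the engine, letting the decoder
+    // relate the previous draft to the accepted path reported in bestPathIndices/bestPathLengths.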
explicitDraftTokensInputs.nextGenerationLengths = input.explicitDraftTokensInputs->nextGenerationLengths; + explicitDraftTokensInputs.lastGenerationLengths = input.explicitDraftTokensLastInputs->generationLengths; + explicitDraftTokensInputs.maxGenLengthDevice = input.explicitDraftTokensInputs->maxGenToken; + explicitDraftTokensInputs.seqSlots = input.seqSlots; + mJointDecodingInput->explicitDraftTokensInputs = explicitDraftTokensInputs; TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -729,14 +793,14 @@ void GptDecoderBatch::newRequests(std::vector const& seqSlots, { TensorPtr batchSlotsView = std::move(ITensor::slice(mBatchSlotsSetup, 0, localBatchSize)); auto fusedSamplingConfig = SamplingConfig(samplingConfigs); - mDecoders[0]->setup(fusedSamplingConfig, localBatchSize, {batchSlotsView}); + mDecoders[0]->setup(fusedSamplingConfig, localBatchSize, {batchSlotsView}, {*mJointDecodingOutput}); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::forwardDispatch( - decoder_batch::Output& output, decoder_batch::Input const& input, std::optional const& eventStart) + decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType) { auto const maxDecodingEngineTokens = *std::max_element(std::begin(mNumDecodingEngineTokens), std::end(mNumDecodingEngineTokens)); @@ -745,11 +809,12 @@ void GptDecoderBatch::forwardDispatch( { if (!mFusedDecoder) { - forwardUnfusedDecoder(si, output, input, eventStart); + TLLM_CHECK_WITH_INFO(forwardType == ForwardType::kASYNC, "Unfused decoder supports only async forward"); + forwardUnfusedDecoder(si, output, input, forwardType); } else { - forwardFusedDecoder(si, output, input, eventStart); + forwardFusedDecoder(si, output, input, forwardType); } } } @@ -759,10 +824,7 @@ GptDecoderBatch::TokenPtr GptDecoderBatch::forwardAsync( { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - std::optional eventStart = CudaEvent{}; - mStream->record(eventStart.value()); - - forwardDispatch(output, input, eventStart); + forwardDispatch(output, input, ForwardType::kASYNC); CudaEvent eventStop{}; mStream->record(eventStop); @@ -770,11 +832,14 @@ GptDecoderBatch::TokenPtr GptDecoderBatch::forwardAsync( return std::make_unique(std::move(eventStop), input.active); } -void GptDecoderBatch::forwardUnfusedDecoder(SizeType32 step, decoder_batch::Output& output, - decoder_batch::Input const& input, std::optional const& eventStart) +void GptDecoderBatch::forwardUnfusedDecoder( + SizeType32 step, decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto eventStart = CudaEvent{}; + mStream->record(eventStart); + auto& allTargetLogits = input.logits; auto const& jointOutputIdsShape = mJointDecodingOutput->ids->getShape(); auto const maxBeamWidth = jointOutputIdsShape.d[1]; @@ -792,7 +857,7 @@ void GptDecoderBatch::forwardUnfusedDecoder(SizeType32 step, decoder_batch::Outp = ITensor::view(output.sequenceLengths, ITensor::makeShape({mActualBatchSize, maxBeamWidth})); TLLM_CHECK(sequenceLengths); - bool async{eventStart.has_value()}; + bool const async = forwardType == ForwardType::kASYNC; auto constexpr singleRequest = 1; @@ -803,10 +868,10 @@ void GptDecoderBatch::forwardUnfusedDecoder(SizeType32 step, decoder_batch::Outp continue; } - auto& stream = mStreams[bi]; + auto& stream = mFusedDecoder ? 
mStream : mStreams[bi]; if (async) { - stream->wait(eventStart->get()); + stream->wait(eventStart); } auto& targetLogits = allTargetLogits[bi]; @@ -857,7 +922,7 @@ void GptDecoderBatch::forwardUnfusedDecoder(SizeType32 step, decoder_batch::Outp { if (step == mNumDecodingEngineTokens[bi] - 1) { - auto& stream = mStreams[bi]; + auto& stream = mFusedDecoder ? mStream : mStreams[bi]; CudaEvent event{}; stream->record(event); mStream->wait(event); @@ -868,8 +933,8 @@ void GptDecoderBatch::forwardUnfusedDecoder(SizeType32 step, decoder_batch::Outp TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void GptDecoderBatch::forwardFusedDecoder(SizeType32 step, decoder_batch::Output& output, - decoder_batch::Input const& input, std::optional const& eventStart) +void GptDecoderBatch::forwardFusedDecoder( + SizeType32 step, decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -890,20 +955,17 @@ void GptDecoderBatch::forwardFusedDecoder(SizeType32 step, decoder_batch::Output auto batchSlotsDecoderPtr = bufferCast(*mBatchSlotsDecoder); auto batchSlotsAcceptTokensPtr = bufferCast(*mBatchSlotsAcceptTokens); auto batchSlotsAcceptLogitsPtr = bufferCast(*mBatchSlotsAcceptLogits); - auto& dInput = *mJointDecodingInput; auto& dOutput = *mJointDecodingOutput; auto& decoder = *mDecoders[0]; - auto& stream = mStreams[0]; + auto& stream = mFusedDecoder ? mStream : mStreams[0]; - bool async{eventStart.has_value()}; - - if (async) + if (mSpeculativeDecodingMode.isExplicitDraftTokens()) { - stream->wait(eventStart->get()); + setExplicitDraftTokensInputs(input); } - BufferManager manager{stream}; + bool const async = forwardType == ForwardType::kASYNC; SizeType32 localBatchDecoderIdx = 0; SizeType32 localBatchAcceptTokensIdx = 0; @@ -989,7 +1051,7 @@ void GptDecoderBatch::forwardFusedDecoder(SizeType32 step, decoder_batch::Output TensorPtr batchSlotsDecoderSlice = std::move(ITensor::slice(mBatchSlotsDecoder, step, 1)); batchSlotsDecoderSlice->squeeze(0); dInput.batchSlots = batchSlotsDecoderSlice; - dInput.maxBatchSize = localBatchDecoderIdx; + dInput.batchSize = localBatchDecoderIdx; if (mSpeculativeDecodingMode.isMedusa()) { dInput.medusaInputs->medusaLogits = input.predictedDraftLogits; @@ -1001,14 +1063,18 @@ void GptDecoderBatch::forwardFusedDecoder(SizeType32 step, decoder_batch::Output if (localBatchDecoderIdx > 0) { - if (async) + if (forwardType == ForwardType::kASYNC) { decoder.forwardAsync(dOutput, dInput); } - else + else if (forwardType == ForwardType::kSYNC) { decoder.forwardSync(dOutput, dInput); } + else + { + TLLM_THROW("Unknown ForwardType"); + } } for (SizeType32 bi = 0; bi < mActualBatchSize; ++bi) @@ -1046,7 +1112,6 @@ void GptDecoderBatch::forwardFusedDecoder(SizeType32 step, decoder_batch::Output { CudaEvent event{}; stream->record(event); - mStream->wait(event); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -1084,7 +1149,7 @@ void GptDecoderBatch::forwardSync( TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); token.event.synchronize(); - forwardDispatch(output, input, std::nullopt); + forwardDispatch(output, input, ForwardType::kSYNC); updateFinished(token); @@ -1095,7 +1160,7 @@ void GptDecoderBatch::forwardSync( CudaEvent GptDecoderBatch::postProcessRequest(SizeType32 batchIdx) const { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto& stream = mStreams[batchIdx]; + auto& stream = mFusedDecoder ? 
mStream : mStreams[batchIdx]; auto manager = BufferManager{stream}; auto& decoder = *mDecoders[batchIdx]; diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index 06788b263..f90bf004a 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -136,6 +136,9 @@ ModelConfig createModelConfig( auto const hiddenSize = config.at("hidden_size").template get() / tensorParallelism; auto const sizePerHead = parseJsonFieldOr(config, "head_size", hiddenSize / numHeads); + // Logits datatype + auto const logitsDtypeStr = parseJsonFieldOr(config, "logits_dtype", std::string("float32")); + // TODO: // Code crashes when numKvHeads <= 0. Clamping downwards to 1 prevents that, make sure this is best fix. auto const numKvHeads @@ -148,6 +151,16 @@ ModelConfig createModelConfig( modelConfig.setNbKvHeads(numKvHeads); modelConfig.setLayerTypes(layerTypes); + // Set logits datatype + auto logitsDtype = nvinfer1::DataType::kFLOAT; + if (logitsDtypeStr == "float32") + logitsDtype = nvinfer1::DataType::kFLOAT; + else if (logitsDtypeStr == "float16") + logitsDtype = nvinfer1::DataType::kHALF; + else + TLLM_THROW("Unsupported logits data type"); + modelConfig.setLogitsDtype(logitsDtype); + // only enable cross attention for the decoder in encoder-decoder model // TODO: add cross_attention and has_token_type_embedding as fields in pretrained config auto const useCrossAttention = arch == std::string("DecoderModel") ? true : false; @@ -170,7 +183,7 @@ void parseBuilderConfig(ModelConfig& modelConfig, Json const& builderConfig) auto const maxBatchSize = parseJsonFieldOr(builderConfig, "max_batch_size", 0); auto const maxBeamWidth = parseJsonFieldOr(builderConfig, "max_beam_width", 0); auto const maxInputLen = parseJsonFieldOr(builderConfig, "max_input_len", 0); - auto const maxSequenceLen = maxInputLen + parseJsonFieldOr(builderConfig, "max_output_len", 0); + auto const maxSequenceLen = parseJsonFieldOr(builderConfig, "max_seq_len", 0); auto const maxDraftLen = parseJsonFieldOr(builderConfig, "max_draft_len", 0); auto const maxNumTokens = parseJsonFieldOptional(builderConfig, "max_num_tokens"); auto const maxPromptEmbeddingTableSize @@ -366,6 +379,7 @@ GptJsonConfig parseJson(InputType&& input) auto explicitDraftTokensModule = std::make_shared(maxDraftPathLen, maxDraftLen, maxNumPaths); modelConfig.setSpeculativeDecodingModule(explicitDraftTokensModule); + modelConfig.setUseShapeInference(false); } else if (modelConfig.getSpeculativeDecodingMode().isMedusa()) { diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index a961d8266..98db487a4 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -74,12 +74,12 @@ auto const kProfileMbIdxs = populateMicrobatchIndexes(); } // namespace GptSession::GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig, - void const* engineBuffer, std::size_t engineSize, LoggerPtr logger) + RawEngine const& rawEngine, LoggerPtr logger) : mModelConfig{modelConfig} , mWorldConfig{worldConfig} , mDevice{utils::initDevice(worldConfig)} , mLogger{logger ? std::move(logger) : std::make_shared()} - , mRuntime{std::make_shared(engineBuffer, engineSize, sessionConfig.gpuWeightsPercent, *mLogger)} + , mRuntime{std::make_shared(rawEngine, mLogger.get(), sessionConfig.gpuWeightsPercent)} { TLLM_LOG_WARNING( "GptSession is deprecated and will be removed in a future release." 
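// Illustrative sketch, not taken from this patch: how a caller might construct a GptSession with the
// new RawEngine-based constructor. The GptSession::Config values and the helper names used here
// (engineFilename, getModelConfig) are assumptions based on surrounding usage and may differ.
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/gptSession.h"
#include "tensorrt_llm/runtime/rawEngine.h"

#include <filesystem>

void buildSessionSketch(std::filesystem::path const& engineDir)
{
    using namespace tensorrt_llm::runtime;

    // Parse the engine's config.json to recover model and world configuration.
    auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
    auto const worldConfig = WorldConfig::mpi();
    auto const enginePath = engineDir / jsonConfig.engineFilename(worldConfig);

    // Example limits; RawEngine now wraps the engine path instead of a pre-loaded (data, size) blob.
    GptSession::Config sessionConfig{/*maxBatchSize=*/8, /*maxBeamWidth=*/1, /*maxSequenceLength=*/2048};
    GptSession session{sessionConfig, jsonConfig.getModelConfig(), worldConfig, RawEngine(enginePath),
        /*logger=*/nullptr};
}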
@@ -109,6 +109,11 @@ BufferManager const& GptSession::getBufferManager() const return mRuntime->getBufferManager(); } +BufferManager::CudaStreamPtr GptSession::getRuntimeStreamPtr() const +{ + return mRuntime->getStreamPtr(); +} + nvinfer1::DataType GptSession::getLogitDataType() const { return mRuntime->getEngine().getTensorDataType("logits"); diff --git a/cpp/tensorrt_llm/runtime/iTensor.cpp b/cpp/tensorrt_llm/runtime/iTensor.cpp index 52923a7e4..c18bd6550 100644 --- a/cpp/tensorrt_llm/runtime/iTensor.cpp +++ b/cpp/tensorrt_llm/runtime/iTensor.cpp @@ -37,17 +37,28 @@ ITensor::UniquePtr ITensor::slice(SharedPtr tensor, std::size_t offset, std::siz ITensor::UniquePtr ITensor::slice(SharedPtr tensor, Shape const& offsetDims, ITensor::DimType64 size) { auto shape = tensor->getShape(); - TLLM_CHECK(offsetDims.nbDims > 0); + TLLM_CHECK(offsetDims.nbDims >= 0); TLLM_CHECK(shape.nbDims >= offsetDims.nbDims); + TLLM_CHECK(size >= 0); Shape strides = ITensor::strides(shape); DimType64 offset{0}; - for (SizeType32 di = 0; di < offsetDims.nbDims; di++) + for (SizeType32 di = 0; di < offsetDims.nbDims - 1; di++) { TLLM_CHECK(0 <= offsetDims.d[di] && offsetDims.d[di] < shape.d[di]); offset += offsetDims.d[di] * strides.d[di]; } - TLLM_CHECK(offsetDims.d[offsetDims.nbDims - 1] + size <= shape.d[offsetDims.nbDims - 1]); + + if (TLLM_LIKELY(offsetDims.nbDims > 0)) + { + TLLM_CHECK(offsetDims.d[offsetDims.nbDims - 1] + size <= shape.d[offsetDims.nbDims - 1]); + offset += offsetDims.d[offsetDims.nbDims - 1] * strides.d[offsetDims.nbDims - 1]; + } + else + { + TLLM_CHECK(size >= 0 && size <= 1); + TLLM_CHECK(shape.nbDims == 0 ? size == 0 : true); + } Shape dims; dims.nbDims = shape.nbDims - offsetDims.nbDims + 1; diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu index 42d2824b2..36f03ed1e 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu @@ -1222,14 +1222,15 @@ void invokeUpdateKVBlockArrayDraftTokenLocation(ITensor const& seqAcceptedDraftT ITensor const& packedAcceptedDraftTokensIndices, ITensor const& pastKeyValueLengths, void* const* pointerArray, ::tensorrt_llm::kernels::KVCacheIndex const* offsetArray, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCommonCount, - int* rewindDraftTokenSeparateAdjustments, ITensor const& seqSlotRemapping, SizeType32 maxKVCacheLen, - SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, cudaStream_t stream) + SizeType32 const* rewindDraftTokenSeparateAdjustments, ITensor const& seqSlotRemapping, ITensor const& batchSlots, + SizeType32 maxKVCacheLen, SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, cudaStream_t stream) { tensorrt_llm::kernels::speculative_decoding::updateKVBlockArrayDraftTokenLocation( bufferCast(seqAcceptedDraftTokenOffsets), bufferCast(packedAcceptedDraftTokensIndices), bufferCast(pastKeyValueLengths), pointerArray, offsetArray, layerCount, seqCount, numKVHeads, sizeInBytesPerKVHead, rewindDraftTokenCommonCount, rewindDraftTokenSeparateAdjustments, - bufferCast(seqSlotRemapping), maxKVCacheLen, maxBlocksPerSeq, tokensPerBlock, stream); + bufferCast(seqSlotRemapping), bufferCast(batchSlots), maxKVCacheLen, maxBlocksPerSeq, + tokensPerBlock, stream); } } // namespace tensorrt_llm::runtime::kernels diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.h b/cpp/tensorrt_llm/runtime/runtimeKernels.h index 4a44ef7af..7bda33bd3 100644 --- 
a/cpp/tensorrt_llm/runtime/runtimeKernels.h +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.h @@ -99,6 +99,6 @@ void invokeUpdateKVBlockArrayDraftTokenLocation(ITensor const& seqAcceptedDraftT ITensor const& packedAcceptedDraftTokensIndices, ITensor const& pastKeyValueLengths, void* const* pointerArray, ::tensorrt_llm::kernels::KVCacheIndex const* offsetArray, SizeType32 layerCount, SizeType32 seqCount, SizeType32 numKVHeads, SizeType32 sizeInBytesPerKVHead, SizeType32 rewindDraftTokenCommonCount, - int* rewindDraftTokenSeparateAdjustments, ITensor const& seqSlotRemapping, SizeType32 maxKVCacheLen, - SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, cudaStream_t stream); + SizeType32 const* rewindDraftTokenSeparateAdjustments, ITensor const& seqSlotRemapping, ITensor const& batchSlots, + SizeType32 maxKVCacheLen, SizeType32 maxBlocksPerSeq, SizeType32 tokensPerBlock, cudaStream_t stream); } // namespace tensorrt_llm::runtime::kernels diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp index 82279b3a5..a1c76abb2 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp @@ -185,7 +185,7 @@ void StatefulGptDecoder::newBatch( dInput.maxLength = maxInputLength; dInput.maxAttentionWindow = mMaxAttentionWindow; dInput.sinkTokenLength = mSinkTokenLength; - dInput.maxBatchSize = batchSize; + dInput.batchSize = batchSize; kernels::invokeFill(const_cast(*dInput.endIds), endId, *stream); dInput.embeddingBias = inputs.embeddingBias; diff --git a/cpp/tensorrt_llm/runtime/tensorView.h b/cpp/tensorrt_llm/runtime/tensorView.h index b11335387..17e7fb719 100644 --- a/cpp/tensorrt_llm/runtime/tensorView.h +++ b/cpp/tensorrt_llm/runtime/tensorView.h @@ -78,7 +78,7 @@ class TensorView : virtual public ITensor, public BufferView static std::size_t sizeDim0(ITensor const& tensor) { auto& shape = tensor.getShape(); - return shape.nbDims >= 0 ? ITensor::volume(shape) / shape.d[0] : 0; + return shape.nbDims > 0 && shape.d[0] > 0 ? ITensor::volume(shape) / shape.d[0] : 0; } nvinfer1::Dims mDims{}; diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index fe5493572..011d108b7 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -57,30 +57,80 @@ std::vector dimsToShape(nvinfer1::Dims const& dims) tensorrt_llm::runtime::TllmLogger defaultLogger{}; -} // namespace +class StreamReader final : public nvinfer1::IStreamReader +{ +public: + StreamReader(std::filesystem::path fp) + { + mFile.open(fp.string()); + TLLM_CHECK_WITH_INFO(mFile.good(), std::string("Error opening engine file: " + fp.string())); + } -TllmRuntime::TllmRuntime( - void const* engineData, std::size_t engineSize, float const gpuWeightsPercent, nvinfer1::ILogger& logger) - : mStream(std::make_shared()) - , mBufferManager{mStream, true} // Ensure to trim the memory pool on destruction. 
- , mRuntime{nvinfer1::createInferRuntime(logger)} - , mEngine{mRuntime->deserializeCudaEngine(engineData, engineSize)} - , mEngineInspector{mEngine->createEngineInspector()} + virtual ~StreamReader() + { + if (mFile.is_open()) + { + mFile.close(); + } + } + + int64_t read(void* destination, int64_t nbBytes) final + { + if (!mFile.good()) + { + return -1; + } + mFile.read(static_cast(destination), nbBytes); + return mFile.gcount(); + } + + std::ifstream mFile; +}; + +void setWeightStreaming(nvinfer1::ICudaEngine& engine, float const gpuWeightsPercent) { - TLLM_CHECK_WITH_INFO(mEngine != nullptr, "Failed to deserialize cuda engine"); if (gpuWeightsPercent < 1) { -#if NV_TENSORRT_MAJOR >= 10 - int64_t min = mEngine->getMinimumWeightStreamingBudget(); - int64_t max = mEngine->getStreamableWeightsSize(); + int64_t min = engine.getMinimumWeightStreamingBudget(); + int64_t max = engine.getStreamableWeightsSize(); int64_t budget = min + gpuWeightsPercent * (max - min); TLLM_LOG_INFO("Set gpu weights percent to %f, which is %lld bytes. Valid range: %lld bytes - %lld bytes.", gpuWeightsPercent, budget, min, max); - mEngine->setWeightStreamingBudget(budget); -#else - TLLM_THROW("Weight streaming is only supported with TensorRT 10.0 or later."); -#endif // NV_TENSORRT_MAJOR >= 10 + engine.setWeightStreamingBudget(budget); } +} +} // namespace + +TllmRuntime::TllmRuntime( + RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent, bool useShapeInference) + : mStream(std::make_shared()) + , mBufferManager{mStream, true} // Ensure to trim the memory pool on destruction. + , mRuntime{nvinfer1::createInferRuntime(logger ? *logger : defaultLogger)} + , mUseShapeInference{useShapeInference} +{ + switch (rawEngine.getType()) + { + case RawEngine::Type::FilePath: + { + auto reader = StreamReader(rawEngine.getPath()); + mEngine.reset(mRuntime->deserializeCudaEngine(reader)); + break; + } + case RawEngine::Type::AddressWithSize: + mEngine.reset(mRuntime->deserializeCudaEngine(rawEngine.getAddress(), rawEngine.getSize())); + break; + case RawEngine::Type::HostMemory: + mEngine.reset( + mRuntime->deserializeCudaEngine(rawEngine.getHostMemory()->data(), rawEngine.getHostMemory()->size())); + break; + default: TLLM_THROW("Unsupported raw engine type."); + } + + TLLM_CHECK_WITH_INFO(mEngine != nullptr, "Failed to deserialize cuda engine."); + mEngineInspector.reset(mEngine->createEngineInspector()); + + setWeightStreaming(getEngine(), gpuWeightsPercent); + auto const devMemorySize = mEngine->getDeviceMemorySize(); mEngineBuffer = mBufferManager.gpu(devMemorySize); @@ -89,24 +139,17 @@ TllmRuntime::TllmRuntime( static_cast(devMemorySize) / 1048576.0); } -TllmRuntime::TllmRuntime(void const* engineData, std::size_t engineSize, float const gpuWeightsPercent = 1.0F) - : TllmRuntime{engineData, engineSize, gpuWeightsPercent, defaultLogger} -{ -} - nvinfer1::IExecutionContext& TllmRuntime::addContext(std::int32_t profileIndex) { TLLM_CHECK(0 <= profileIndex && profileIndex < mEngine->getNbOptimizationProfiles()); mContexts.emplace_back(mEngine->createExecutionContextWithoutDeviceMemory()); if (!mContexts.back()) { -#if NV_TENSORRT_MAJOR >= 10 if (mEngine->getStreamableWeightsSize() > 0) { TLLM_THROW("Failed to allocate memory for weights. 
Please try reducing --gpu_weights_percent."); } else -#endif // NV_TENSORRT_MAJOR >= 10 { TLLM_THROW("Internal Error: Failed to create an execution context."); } @@ -193,6 +236,7 @@ void TllmRuntime::setInputTensors(SizeType32 contextIndex, TensorMap const& tens } } + if (mUseShapeInference) { NVTX3_SCOPED_RANGE(infer_shapes); char const* missing; @@ -223,7 +267,6 @@ void TllmRuntime::setOutputTensors(SizeType32 contextIndex, TensorMap& tensorMap auto const name = mEngine->getIOTensorName(i); if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) { - auto const dims = context.getTensorShape(name); auto const engineDtype = mEngine->getTensorDataType(name); auto pos = tensorMap.find(name); if (pos != tensorMap.end()) @@ -236,15 +279,24 @@ void TllmRuntime::setOutputTensors(SizeType32 contextIndex, TensorMap& tensorMap "%s: expected type %d, provided type %d", name, static_cast(engineDtype), static_cast(tensorDtype)); - tensor->reshape(dims); + if (mUseShapeInference) + { + auto const dims = context.getTensorShape(name); + tensor->reshape(dims); + } context.setTensorAddress(name, tensor->data()); } - else + else if (mUseShapeInference) { + auto const dims = context.getTensorShape(name); auto tensor = ITensor::SharedPtr(mBufferManager.gpu(dims, engineDtype)); tensorMap.insert(pos, std::make_pair(name, tensor)); context.setTensorAddress(name, tensor->data()); } + else + { + TLLM_THROW("Tensor %s is not found in tensorMap and shape inference is not allowed", name); + } } } } diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index 75b34f671..7afd3dcb2 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -19,6 +19,7 @@ #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/layerProfiler.h" +#include "tensorrt_llm/runtime/rawEngine.h" #include #include @@ -32,26 +33,8 @@ class TllmRuntime public: using TensorMap = StringPtrMap; - explicit TllmRuntime( - void const* engineData, std::size_t engineSize, float const gpuWeightsPercent, nvinfer1::ILogger& logger); - - explicit TllmRuntime(void const* engineData, std::size_t engineSize, nvinfer1::ILogger& logger) - : TllmRuntime{engineData, engineSize, 1, logger} - { - } - - explicit TllmRuntime( - nvinfer1::IHostMemory const& engineBuffer, float const gpuWeightsPercent, nvinfer1::ILogger& logger) - : TllmRuntime{engineBuffer.data(), engineBuffer.size(), gpuWeightsPercent, logger} - { - } - - explicit TllmRuntime(void const* engineData, std::size_t engineSize, float const gpuWeightsPercent); - - explicit TllmRuntime(nvinfer1::IHostMemory const& engineBuffer, float const gpuWeightsPercent) - : TllmRuntime{engineBuffer.data(), engineBuffer.size(), gpuWeightsPercent} - { - } + explicit TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger, float gpuWeightsPercent = 1.0f, + bool useShapeInference = true); SizeType32 getNbContexts() const { @@ -73,14 +56,14 @@ class TllmRuntime /// multiple profiles on the num_tokens dimension, hence the profile index is selected based on which profile /// handles the actual num_tokens /// @return The index of the selected TensorRT optimization profile - [[nodiscard]] SizeType32 getOptProfileId(int numTokens, std::vector const& splitPoint) const + [[nodiscard]] SizeType32 getOptProfileId(int numTokens, std::vector const& splitPoints) const { if (getNbProfiles() == 1) { return 0; } - auto const it = std::lower_bound(splitPoint.begin(), splitPoint.end(), 
numTokens); - auto const optProfileId = std::distance(splitPoint.begin(), it); + auto const it = std::lower_bound(splitPoints.begin(), splitPoints.end(), numTokens); + auto const optProfileId = std::distance(splitPoints.begin(), it); return optProfileId; } @@ -146,5 +129,6 @@ class TllmRuntime std::unique_ptr mDummyTensor; std::unique_ptr mEngineInspector; std::unique_ptr mLayerProfiler; + bool mUseShapeInference; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/utils/debugUtils.cu b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu index 4ec3d4d8a..ce0b7ef20 100644 --- a/cpp/tensorrt_llm/runtime/utils/debugUtils.cu +++ b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu @@ -129,7 +129,7 @@ template bool tensorHasNan(ITensor const& tensor, BufferManager const& manager, std::string const& infoStr) { printLogitsKeyInfo(tensor, infoStr); - auto foundNan = BufferManager::pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + auto foundNan = BufferManager::pinnedPool(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); auto foundNanPtr = bufferCast(*foundNan); foundNanPtr[0] = 0; auto const size = tensor.getSize(); diff --git a/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp b/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp index e8fed8374..e868caa4f 100644 --- a/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp +++ b/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp @@ -49,11 +49,10 @@ void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLen int batchSize = specDecodingGenerationLengthsTensor.size(0); - int64_t scanTempMemoryBytes = tensorrt_llm::kernels::speculative_decoding::invokeScanSpecDecodingGenerationLengths( + int64_t scanTempMemoryBytes = tensorrt_llm::kernels::speculative_decoding::invokeScanGenerationLengths( + nullptr, 0, nullptr, nullptr, batchSize, stream); + int64_t reduceMaxTempMemoryBytes = tensorrt_llm::kernels::speculative_decoding::invokeReduceMaxGenerationLengths( nullptr, 0, nullptr, nullptr, batchSize, stream); - int64_t reduceMaxTempMemoryBytes - = tensorrt_llm::kernels::speculative_decoding::invokeReduceMaxSpecDecodingGenerationLengths( - nullptr, 0, nullptr, nullptr, batchSize, stream); torch::Tensor scanTempMemoryStorage = torch::empty( { @@ -76,7 +75,7 @@ void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLen }, torch::dtype(torch::kInt).device(torch::kCUDA).requires_grad(false)); - tensorrt_llm::kernels::speculative_decoding::invokeScanReduceSpecDecodingGenerationLengths(batchSize, + tensorrt_llm::kernels::speculative_decoding::invokeScanReduceGenerationLengths(batchSize, specDecodingGenerationLengthsTensor.data_ptr(), reinterpret_cast(scanTempMemoryStorage.data_ptr()), scanTempMemoryBytes, scanedSpecDecodingGenerationLengths.data_ptr(), @@ -86,7 +85,7 @@ void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLen int hostMaxSpecDecodingGenerationLengths; cudaMemcpyAsync(&hostMaxSpecDecodingGenerationLengths, maxSpecDecodingGenerationLengths.data_ptr(), sizeof(int), cudaMemcpyDeviceToHost, stream); - tensorrt_llm::kernels::speculative_decoding::invokeConvertSpecDecodingMaskToPackedMask(batchSize, + tensorrt_llm::kernels::speculative_decoding::invokeConvertMaskToPackedMask(batchSize, scanedSpecDecodingGenerationLengths.data_ptr(), maxSpecDecodingGenerationLengths.data_ptr(), specDecodingMaskTensor.data_ptr(), nullptr, maxSpecDecodingTokens, maxSpecDecodingTokens + 1, specDecodingPackedMaskTensor.data_ptr(), 
stream); diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp index b03863ebe..cb2c19f2f 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp @@ -27,27 +27,27 @@ namespace th = torch; namespace tle = tensorrt_llm::executor; namespace tr = tensorrt_llm::runtime; namespace tcc = tensorrt_llm::common::conversion; +namespace tl = tensorrt_llm::layers; namespace torch_ext { template -FtDynamicDecode::FtDynamicDecode(size_t const max_batch_size, size_t const max_beam_width, size_t const vocab_size, - size_t const vocab_size_padded, int const tensor_para_size, int const pipeline_para_size) - : finished_sum_(tr::BufferManager::pinned( - tr::ITensor::makeShape({static_cast(max_batch_size)}), nvinfer1::DataType::kINT32)) +FtDynamicDecode::FtDynamicDecode(size_t const maxBatchSize, size_t const maxBeamWidth, size_t const vocabSize, + size_t const vocabSizePadded, int const tensorParaSize, int const pipelineParaSize) + : mFinishedSum(tr::BufferManager::pinned( + tr::ITensor::makeShape({static_cast(maxBatchSize)}), nvinfer1::DataType::kINT32)) { - TLLM_CHECK_WITH_INFO(vocab_size_padded % tensor_para_size == 0, + TLLM_CHECK_WITH_INFO(vocabSizePadded % tensorParaSize == 0, tensorrt_llm::common::fmtstr( - "vocab_size (%ld) is not multiple of tensor_para_size (%d).", vocab_size_padded, tensor_para_size)); + "vocabSize (%ld) is not multiple of tensorParaSize (%d).", vocabSizePadded, tensorParaSize)); auto stream = at::cuda::getCurrentCUDAStream().stream(); auto allocator = std::make_shared(stream); - auto const decodingDomain - = tensorrt_llm::layers::DecoderDomain(max_batch_size, max_beam_width, vocab_size, vocab_size_padded); + auto const decodingDomain = tl::DecoderDomain(maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded); - dynamic_decode_layer_ = std::make_shared>( + mDynamicDecodeLayer = std::make_shared>( tle::DecodingMode::Auto(), decodingDomain, stream, std::move(allocator)); } @@ -57,13 +57,13 @@ namespace template void safeInsert(th::optional& tensor, std::optional>& arg) { - using value_type = T; + using valueType = T; if (tensor.has_value()) { - auto ptr = get_ptr(tensor.value()); + auto ptr = get_ptr(tensor.value()); auto shape = convert_shape(tensor.value()); size_t const size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>()); - arg = std::vector(ptr, ptr + size); + arg = std::vector(ptr, ptr + size); } } @@ -99,140 +99,192 @@ void safeUpdatePtr(th::optional& tensor, T*& ptr) } // namespace template -void FtDynamicDecode::setup(size_t const batch_size, size_t const beam_width, - th::optional runtime_top_k_opt, th::optional runtime_top_p_opt, - th::optional temperature_opt, th::optional repetition_penalty_opt, - th::optional presence_penalty_opt, th::optional frequency_penalty_opt, - th::optional min_length_opt, th::optional length_penalty_opt, - th::optional early_stopping_opt, th::optional beam_search_diversity_rate_opt, - th::optional random_seed_opt, th::optional top_p_decay_opt, - th::optional top_p_min_opt, th::optional top_p_reset_ids_opt, - th::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs) +void FtDynamicDecode::setup(size_t const batchSize, size_t const beamWidth, th::optional runtimeTopKOpt, + th::optional runtimeTopPOpt, th::optional temperatureOpt, + th::optional repetitionPenaltyOpt, th::optional presencePenaltyOpt, + th::optional frequencyPenaltyOpt, th::optional minLengthOpt, + th::optional lengthPenaltyOpt, th::optional 
earlyStoppingOpt, + th::optional beamSearchDiversityRateOpt, th::optional randomSeedOpt, + th::optional topPDecayOpt, th::optional topPMinOpt, + th::optional topPResetIdsOpt, th::optional noRepeatNgramSizeOpt, bool outputLogProbs, + bool cumLogProbs) { + mBeamWidth = beamWidth; + auto stream = at::cuda::getCurrentCUDAStream().stream(); - dynamic_decode_layer_->setStream(stream); - - auto setupParams = std::make_shared(); - safeInsert(temperature_opt, setupParams->penaltyParams.temperature); - safeInsert(repetition_penalty_opt, setupParams->penaltyParams.repetitionPenalty); - safeInsert(presence_penalty_opt, setupParams->penaltyParams.presencePenalty); - safeInsert(frequency_penalty_opt, setupParams->penaltyParams.frequencyPenalty); - safeInsert(min_length_opt, setupParams->penaltyParams.minLength); - safeInsert(no_repeat_ngram_size_opt, setupParams->penaltyParams.noRepeatNgramSize); - safeInsert(runtime_top_k_opt, setupParams->samplingParams.runtime_top_k); - safeInsert(runtime_top_p_opt, setupParams->samplingParams.runtime_top_p); - safeInsert(random_seed_opt, setupParams->randomSeed); - safeInsert(top_p_decay_opt, setupParams->samplingParams.top_p_decay); - safeInsert(top_p_min_opt, setupParams->samplingParams.top_p_min); - safeInsert(top_p_reset_ids_opt, setupParams->samplingParams.top_p_reset_ids); - safeInsert(beam_search_diversity_rate_opt, setupParams->beamSearchParams.beam_search_diversity_rate); - safeInsert(length_penalty_opt, setupParams->beamSearchParams.length_penalty); - safeInsert(early_stopping_opt, setupParams->beamSearchParams.early_stopping); - setupParams->samplingParams.outputLogProbs = std::vector({output_log_probs}); - setupParams->samplingParams.cumLogProbs = std::vector({cum_log_probs}); - // TODO: insert "normalize_log_probs" and "topKMedusaHeads" - - dynamic_decode_layer_->setup(batch_size, beam_width, nullptr, setupParams); + mDynamicDecodeLayer->setStream(stream); + + auto setupParams = std::make_shared(); + auto penaltyParams = std::make_shared(); + auto banWordsParams = std::make_shared(); + safeInsert(temperatureOpt, penaltyParams->temperature); + safeInsert(repetitionPenaltyOpt, penaltyParams->repetitionPenalty); + safeInsert(presencePenaltyOpt, penaltyParams->presencePenalty); + safeInsert(frequencyPenaltyOpt, penaltyParams->frequencyPenalty); + safeInsert(minLengthOpt, penaltyParams->minLength); + + safeInsert(noRepeatNgramSizeOpt, banWordsParams->noRepeatNgramSize); + + if (beamWidth == 1) + { + auto decodingParams = std::make_shared(); + safeInsert(runtimeTopKOpt, decodingParams->runtimeTopK); + safeInsert(runtimeTopPOpt, decodingParams->runtimeTopP); + safeInsert(topPDecayOpt, decodingParams->topPDecay); + safeInsert(topPMinOpt, decodingParams->topPMin); + safeInsert(topPResetIdsOpt, decodingParams->topPResetIds); + decodingParams->outputLogProbs = std::vector({outputLogProbs}); + decodingParams->cumLogProbs = std::vector({cumLogProbs}); + safeInsert(randomSeedOpt, decodingParams->randomSeed); + + setupParams->decodingParams = decodingParams; + } + else + { + auto decodingParams = std::make_shared(); + safeInsert(beamSearchDiversityRateOpt, decodingParams->beamSearchDiversityRate); + safeInsert(lengthPenaltyOpt, decodingParams->lengthPenalty); + safeInsert(earlyStoppingOpt, decodingParams->earlyStopping); + decodingParams->outputLogProbs = std::vector({outputLogProbs}); + decodingParams->cumLogProbs = std::vector({cumLogProbs}); + safeInsert(randomSeedOpt, decodingParams->randomSeed); + + setupParams->decodingParams = decodingParams; + } + + // TODO: 
insert "normalizeLogProbs" and "topKMedusaHeads" + + setupParams->penaltyParams = penaltyParams; + setupParams->banWordsParams = banWordsParams; + mDynamicDecodeLayer->setup(batchSize, beamWidth, nullptr, setupParams); } template -void FtDynamicDecode::forward(th::Tensor const& logits, int const step, int const max_input_length, - int const max_attention_window, int const sink_token_length, uint64_t const ite, int const local_batch_size, - th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, - th::optional sequence_limit_length_opt, th::optional stop_words_list_ptrs_opt, - th::optional stop_words_lens_opt, int32_t const max_stop_words_len, - th::optional bad_words_list_ptrs_opt, th::optional bad_words_lens_opt, - int32_t const max_bad_words_len, th::optional src_cache_indirection_opt, th::Tensor& output_token_ids, - th::Tensor& newTokens, th::Tensor& should_stop, th::optional finished_input, - th::optional finished_output, th::optional sequence_lengths_opt, - th::optional cum_log_probs_opt, th::optional output_log_probs_opt, - th::optional output_log_probs_tiled_opt, th::optional parent_ids_opt, - th::optional tgt_cache_indirection_opt, th::optional beam_hyps_output_ids_cba_opt, - th::optional beam_hyps_seq_len_cba_opt, th::optional beam_hyps_cum_log_probs_cba_opt, - th::optional beam_hyps_normed_scores_cba_opt, th::optional beam_hyps_log_probs_cba_opt, - th::optional beam_hyps_min_normed_scores_opt, th::optional beam_hyps_num_beams_opt, - th::optional beam_hyps_is_done_opt, bool const use_beam_hyps) +void FtDynamicDecode::forward(th::Tensor const& logits, int const step, int const maxInputLength, + int const maxAttentionWindow, int const sinkTokenLength, uint64_t const ite, int const localBatchSize, + th::Tensor endId, th::optional embeddingBiasOpt, th::optional inputLengthsOpt, + th::optional sequenceLimitLengthOpt, th::optional stopWordsListPtrsOpt, + th::optional stopWordsLensOpt, int32_t const maxStopWordsLen, + th::optional badWordsListPtrsOpt, th::optional badWordsLensOpt, + int32_t const maxBadWordsLen, th::optional srcCacheIndirectionOpt, th::Tensor& outputTokenIds, + th::Tensor& newTokens, th::Tensor& shouldStop, th::optional finishedInput, + th::optional finishedOutput, th::optional sequenceLengthsOpt, + th::optional cumLogProbsOpt, th::optional outputLogProbsOpt, + th::optional outputLogProbsTiledOpt, th::optional parentIdsOpt, + th::optional tgtCacheIndirectionOpt, th::optional beamHypsOutputIdsCbaOpt, + th::optional beamHypsSeqLenCbaOpt, th::optional beamHypsCumLogProbsCbaOpt, + th::optional beamHypsNormedScoresCbaOpt, th::optional beamHypsLogProbsCbaOpt, + th::optional beamHypsMinNormedScoresOpt, th::optional beamHypsNumBeamsOpt, + th::optional beamHypsIsDoneOpt, bool const useBeamHyps) { - auto forwardParams = std::make_shared(step, static_cast(ite), - max_input_length, max_attention_window, sink_token_length, local_batch_size, convert_tensor(end_id)); + TLLM_CHECK_WITH_INFO(mBeamWidth.has_value(), "Beam width is not set. 
setup() must be called before forward()"); + auto const isBeamSearch = mBeamWidth.value() > 1; - forwardParams->logits = convert_tensor(logits); + std::shared_ptr forwardParams; + if (isBeamSearch) + { + forwardParams = std::make_shared(convert_tensor(endId), step, static_cast(ite), + localBatchSize, maxAttentionWindow, sinkTokenLength); + } + else + { + forwardParams = std::make_shared( + convert_tensor(endId), step, static_cast(ite), localBatchSize); + } - safeUpdate(embedding_bias_opt, forwardParams->embedding_bias); - safeUpdate(input_lengths_opt, forwardParams->input_lengths); - safeUpdate(sequence_limit_length_opt, forwardParams->sequence_limit_length); - safeUpdate(stop_words_list_ptrs_opt, forwardParams->stop_words_ptr); - safeUpdate(stop_words_lens_opt, forwardParams->stop_words_lengths); - forwardParams->max_stop_words_len = max_stop_words_len; - safeUpdate(bad_words_list_ptrs_opt, forwardParams->bad_words_ptr); - safeUpdate(bad_words_lens_opt, forwardParams->bad_words_lengths); - forwardParams->max_bad_words_len = max_bad_words_len; - safeUpdate(src_cache_indirection_opt, forwardParams->src_cache_indirection); - - auto const& output_ids_converted = convert_tensor(output_token_ids); - auto outputParams = std::make_shared(output_ids_converted); + forwardParams->logits = convert_tensor(logits); + forwardParams->stopCriteriaInputs = std::make_shared(localBatchSize); + forwardParams->banWordsInputs = std::make_shared(localBatchSize); + + safeUpdate(embeddingBiasOpt, forwardParams->embeddingBias); + safeUpdate(inputLengthsOpt, forwardParams->inputLengths); + safeUpdate(sequenceLimitLengthOpt, forwardParams->stopCriteriaInputs->sequenceLimitLength); + safeUpdate(stopWordsListPtrsOpt, forwardParams->stopCriteriaInputs->stopWordsPtr); + safeUpdate(stopWordsLensOpt, forwardParams->stopCriteriaInputs->stopWordsLengths); + forwardParams->stopCriteriaInputs->maxStopWordsLen = maxStopWordsLen; + safeUpdate(badWordsListPtrsOpt, forwardParams->banWordsInputs->badWordsPtr); + safeUpdate(badWordsLensOpt, forwardParams->banWordsInputs->badWordsLengths); + forwardParams->banWordsInputs->maxBadWordsLen = maxBadWordsLen; + safeUpdate(srcCacheIndirectionOpt, forwardParams->srcCacheIndirection); + + auto const& outputIdsConverted = convert_tensor(outputTokenIds); + + std::shared_ptr outputParams; + if (isBeamSearch) + { + outputParams = std::make_shared(outputIdsConverted); + } + else + { + outputParams = std::make_shared(outputIdsConverted); + } outputParams->newTokens = std::move(convert_tensor(newTokens)); - safeUpdate(finished_input, forwardParams->finished); - safeUpdate(finished_output, outputParams->finished); - safeUpdate(sequence_lengths_opt, outputParams->sequence_length); - safeUpdate(cum_log_probs_opt, outputParams->cum_log_probs); - safeUpdate(output_log_probs_opt, outputParams->output_log_probs); - safeUpdate(output_log_probs_tiled_opt, outputParams->output_log_probs_tiled); - safeUpdate(parent_ids_opt, outputParams->parent_ids); - safeUpdate(tgt_cache_indirection_opt, outputParams->tgt_cache_indirection); - - std::int32_t* finished_sum_host = nullptr; - if (forwardParams->sequence_limit_length && outputParams->finished.has_value()) + safeUpdate(finishedInput, forwardParams->finished); + safeUpdate(finishedOutput, outputParams->finished); + safeUpdate(sequenceLengthsOpt, outputParams->sequenceLength); + safeUpdate(cumLogProbsOpt, outputParams->cumLogProbs); + safeUpdate(outputLogProbsOpt, outputParams->outputLogProbs); + safeUpdate(outputLogProbsTiledOpt, 
outputParams->outputLogProbsTiled); + safeUpdate(parentIdsOpt, outputParams->parentIds); + + std::int32_t* finishedSumHost = nullptr; + if (forwardParams->stopCriteriaInputs->sequenceLimitLength && outputParams->finished.has_value()) { // Skip the initialization and later calculation if there is no limit of sequence length or no finished beam - outputParams->finished_sum = tcc::toTllmTensor(*finished_sum_); - finished_sum_host = tr::bufferCast(*finished_sum_); - for (int32_t bi = 0; bi < local_batch_size; ++bi) + outputParams->finishedSum = tcc::toTllmTensor(*mFinishedSum); + finishedSumHost = tr::bufferCast(*mFinishedSum); + for (int32_t bi = 0; bi < localBatchSize; ++bi) { - finished_sum_host[bi] = 0; + finishedSumHost[bi] = 0; } } - if (use_beam_hyps) + if (isBeamSearch) { - // Additional parameters for beam search - outputParams->beamHypotheses = std::make_unique(); - safeUpdatePtr(beam_hyps_is_done_opt, outputParams->beamHypotheses->batchDones); - safeUpdatePtr(beam_hyps_cum_log_probs_cba_opt, outputParams->beamHypotheses->cumLogProbsCBA); - safeUpdatePtr(beam_hyps_log_probs_cba_opt, outputParams->beamHypotheses->logProbsCBA); - safeUpdatePtr(beam_hyps_min_normed_scores_opt, outputParams->beamHypotheses->minNormedScoresCBA); - safeUpdatePtr(beam_hyps_normed_scores_cba_opt, outputParams->beamHypotheses->normedScoresCBA); - safeUpdatePtr(beam_hyps_num_beams_opt, outputParams->beamHypotheses->numBeamsCBA); - safeUpdatePtr(beam_hyps_output_ids_cba_opt, outputParams->beamHypotheses->outputIdsCBA); - safeUpdatePtr(beam_hyps_seq_len_cba_opt, outputParams->beamHypotheses->sequenceLengthsCBA); + auto outputsBeamSearch = std::dynamic_pointer_cast(outputParams); + TLLM_CHECK_WITH_INFO(tgtCacheIndirectionOpt.has_value(), "tgtCacheIndirection must be set for beam search"); + outputsBeamSearch->tgtCacheIndirection = std::move(convert_tensor(tgtCacheIndirectionOpt.value())); + if (useBeamHyps) + { + // Additional parameters for beam search + outputsBeamSearch->beamHypotheses = std::make_unique(); + safeUpdatePtr(beamHypsIsDoneOpt, outputsBeamSearch->beamHypotheses->batchDones); + safeUpdatePtr(beamHypsCumLogProbsCbaOpt, outputsBeamSearch->beamHypotheses->cumLogProbsCBA); + safeUpdatePtr(beamHypsLogProbsCbaOpt, outputsBeamSearch->beamHypotheses->logProbsCBA); + safeUpdatePtr(beamHypsMinNormedScoresOpt, outputsBeamSearch->beamHypotheses->minNormedScoresCBA); + safeUpdatePtr(beamHypsNormedScoresCbaOpt, outputsBeamSearch->beamHypotheses->normedScoresCBA); + safeUpdatePtr(beamHypsNumBeamsOpt, outputsBeamSearch->beamHypotheses->numBeamsCBA); + safeUpdatePtr(beamHypsOutputIdsCbaOpt, outputsBeamSearch->beamHypotheses->outputIdsCBA); + safeUpdatePtr(beamHypsSeqLenCbaOpt, outputsBeamSearch->beamHypotheses->sequenceLengthsCBA); + } } - dynamic_decode_layer_->forwardAsync(outputParams, forwardParams); + mDynamicDecodeLayer->forwardAsync(outputParams, forwardParams); - if (finished_sum_host) + if (finishedSumHost) { - TLLM_CUDA_CHECK(::cudaStreamSynchronize(dynamic_decode_layer_->getStream())); + TLLM_CUDA_CHECK(::cudaStreamSynchronize(mDynamicDecodeLayer->getStream())); int32_t numRealFinished = 0; - for (int32_t bi = 0; bi < local_batch_size; ++bi) + for (int32_t bi = 0; bi < localBatchSize; ++bi) { - numRealFinished += finished_sum_host[bi]; + numRealFinished += finishedSumHost[bi]; } auto const numToFinish = outputParams->finished->size(); - auto should_stop_accessor = should_stop.accessor(); - should_stop_accessor[0] = numToFinish == numRealFinished; + auto shouldStopAccessor = shouldStop.accessor(); + 
shouldStopAccessor[0] = numToFinish == numRealFinished; } } -DynamicDecodeOp::DynamicDecodeOp(int64_t const max_batch_size, int64_t const max_beam_width, int64_t const vocab_size, - int64_t const vocab_size_padded, int64_t const tensor_para_size, int64_t const pipeline_para_size, - at::ScalarType const scalar_type) - : max_batch_size_(static_cast(max_batch_size)) - , max_beam_width_(static_cast(max_beam_width)) - , vocab_size_(static_cast(vocab_size)) - , vocab_size_padded_(static_cast(vocab_size_padded)) - , tensor_para_size_(static_cast(tensor_para_size)) - , pipeline_para_size_(static_cast(pipeline_para_size)) - , scalar_type_(scalar_type) +DynamicDecodeOp::DynamicDecodeOp(int64_t const maxBatchSize, int64_t const maxBeamWidth, int64_t const vocabSize, + int64_t const vocabSizePadded, int64_t const tensorParaSize, int64_t const pipelineParaSize, + at::ScalarType const scalarType) + : maxBatchSize_(static_cast(maxBatchSize)) + , maxBeamWidth_(static_cast(maxBeamWidth)) + , vocabSize_(static_cast(vocabSize)) + , vocabSizePadded_(static_cast(vocabSizePadded)) + , tensorParaSize_(static_cast(tensorParaSize)) + , pipelineParaSize_(static_cast(pipelineParaSize)) + , scalarType_(scalarType) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); createInstance(); @@ -240,141 +292,140 @@ DynamicDecodeOp::DynamicDecodeOp(int64_t const max_batch_size, int64_t const max void DynamicDecodeOp::createInstance() { - dynamic_decode_.reset(); - switch (scalar_type_) + dynamicDecode_.reset(); + switch (scalarType_) { case at::ScalarType::Float: - dynamic_decode_ = std::make_unique>( - max_batch_size_, max_beam_width_, vocab_size_, vocab_size_padded_, tensor_para_size_, pipeline_para_size_); + dynamicDecode_ = std::make_unique>( + maxBatchSize_, maxBeamWidth_, vocabSize_, vocabSizePadded_, tensorParaSize_, pipelineParaSize_); break; case at::ScalarType::Half: - dynamic_decode_ = std::make_unique>( - max_batch_size_, max_beam_width_, vocab_size_, vocab_size_padded_, tensor_para_size_, pipeline_para_size_); + dynamicDecode_ = std::make_unique>( + maxBatchSize_, maxBeamWidth_, vocabSize_, vocabSizePadded_, tensorParaSize_, pipelineParaSize_); break; default: throw std::runtime_error("Wrong tensor type."); } } -void DynamicDecodeOp::setup(int64_t const batch_size, int64_t const beam_width, - th::optional runtime_top_k_opt, th::optional runtime_top_p_opt, - th::optional temperature_opt, th::optional repetition_penalty_opt, - th::optional presence_penalty_opt, th::optional frequency_penalty_opt, - th::optional min_length_opt, th::optional length_penalty_opt, - th::optional early_stopping_opt, th::optional beam_search_diversity_rate_opt, - th::optional random_seed_opt, th::optional top_p_decay_opt, - th::optional top_p_min_opt, th::optional top_p_reset_ids_opt, - th::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs) +void DynamicDecodeOp::setup(int64_t const batchSize, int64_t const beamWidth, th::optional runtimeTopKOpt, + th::optional runtimeTopPOpt, th::optional temperatureOpt, + th::optional repetitionPenaltyOpt, th::optional presencePenaltyOpt, + th::optional frequencyPenaltyOpt, th::optional minLengthOpt, + th::optional lengthPenaltyOpt, th::optional earlyStoppingOpt, + th::optional beamSearchDiversityRateOpt, th::optional randomSeedOpt, + th::optional topPDecayOpt, th::optional topPMinOpt, + th::optional topPResetIdsOpt, th::optional noRepeatNgramSizeOpt, bool outputLogProbs, + bool cumLogProbs) { // TODO: Revise DynamicDecodeLayer and make the decode arguments consistent. 
- // TODO: add parameters "normalize_log_probs" and "topKMedusaHeads" - CHECK_OPTIONAL_CPU_INPUT(runtime_top_k_opt, torch::kInt32); - CHECK_OPTIONAL_CPU_INPUT(runtime_top_p_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(temperature_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(repetition_penalty_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(presence_penalty_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(frequency_penalty_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(min_length_opt, torch::kInt32); - CHECK_OPTIONAL_CPU_INPUT(length_penalty_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(early_stopping_opt, torch::kInt32); - CHECK_OPTIONAL_CPU_INPUT(no_repeat_ngram_size_opt, torch::kInt32); - CHECK_OPTIONAL_CPU_INPUT(beam_search_diversity_rate_opt, torch::kFloat); - CHECK_OPTIONAL_CPU_INPUT(random_seed_opt, torch::kInt64); - CHECK_OPTIONAL_INPUT(top_p_decay_opt, torch::kFloat); - CHECK_OPTIONAL_INPUT(top_p_min_opt, torch::kFloat); - CHECK_OPTIONAL_INPUT(top_p_reset_ids_opt, torch::kInt32); - - dynamic_decode_->setup(static_cast(batch_size), static_cast(beam_width), runtime_top_k_opt, - runtime_top_p_opt, temperature_opt, repetition_penalty_opt, presence_penalty_opt, frequency_penalty_opt, - min_length_opt, length_penalty_opt, early_stopping_opt, beam_search_diversity_rate_opt, random_seed_opt, - top_p_decay_opt, top_p_min_opt, top_p_reset_ids_opt, no_repeat_ngram_size_opt, output_log_probs, cum_log_probs); + // TODO: add parameters "normalizeLogProbs" and "topKMedusaHeads" + CHECK_OPTIONAL_CPU_INPUT(runtimeTopKOpt, torch::kInt32); + CHECK_OPTIONAL_CPU_INPUT(runtimeTopPOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(temperatureOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(repetitionPenaltyOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(presencePenaltyOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(frequencyPenaltyOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(minLengthOpt, torch::kInt32); + CHECK_OPTIONAL_CPU_INPUT(lengthPenaltyOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(earlyStoppingOpt, torch::kInt32); + CHECK_OPTIONAL_CPU_INPUT(noRepeatNgramSizeOpt, torch::kInt32); + CHECK_OPTIONAL_CPU_INPUT(beamSearchDiversityRateOpt, torch::kFloat); + CHECK_OPTIONAL_CPU_INPUT(randomSeedOpt, torch::kInt64); + CHECK_OPTIONAL_INPUT(topPDecayOpt, torch::kFloat); + CHECK_OPTIONAL_INPUT(topPMinOpt, torch::kFloat); + CHECK_OPTIONAL_INPUT(topPResetIdsOpt, torch::kInt32); + + dynamicDecode_->setup(static_cast(batchSize), static_cast(beamWidth), runtimeTopKOpt, + runtimeTopPOpt, temperatureOpt, repetitionPenaltyOpt, presencePenaltyOpt, frequencyPenaltyOpt, minLengthOpt, + lengthPenaltyOpt, earlyStoppingOpt, beamSearchDiversityRateOpt, randomSeedOpt, topPDecayOpt, topPMinOpt, + topPResetIdsOpt, noRepeatNgramSizeOpt, outputLogProbs, cumLogProbs); } th::Tensor DynamicDecodeOp::forward( - // Inputs BS: batch_size, BM: beam_width, MSL: max_seq_length, V: vocab_size, VP: vocab_size_padded - th::Tensor const& logits, // [BS, BM, VP], T, variables for input - int64_t const step, // - int64_t const max_input_length, // - int64_t const max_attention_window, // - int64_t const sink_token_length, // - int64_t const ite, // - int64_t const local_batch_size, // - th::Tensor const end_id, // [BS*BM], int - th::optional embedding_bias_opt, // [VP], T - th::optional input_lengths_opt, // [BS*BM], int, length of input contexts - th::optional sequence_limit_length_opt, // [BS, 1], int - th::optional stop_words_list_ptrs_opt, // [BS][2, stop_words_length], int64 - th::optional stop_words_lens_opt, // [BS], int - 
int64_t const max_stop_words_len, // - th::optional bad_words_list_ptrs_opt, // [BS][2, bad_words_length], int64 - th::optional bad_words_lens_opt, // [BS], int - int64_t const max_bad_words_len, // - th::optional src_cache_indirection_opt, // [local_BS, BM, MSL], int + // Inputs BS: batchSize, BM: beamWidth, MSL: maxSeqLength, V: vocabSize, VP: vocabSizePadded + th::Tensor const& logits, // [BS, BM, VP], T, variables for input + int64_t const step, // + int64_t const maxInputLength, // + int64_t const maxAttentionWindow, // + int64_t const sinkTokenLength, // + int64_t const ite, // + int64_t const localBatchSize, // + th::Tensor const endId, // [BS*BM], int + th::optional embeddingBiasOpt, // [VP], T + th::optional inputLengthsOpt, // [BS*BM], int, length of input contexts + th::optional sequenceLimitLengthOpt, // [BS, 1], int + th::optional stopWordsListPtrsOpt, // [BS][2, stopWordsLength], int64 + th::optional stopWordsLensOpt, // [BS], int + int64_t const maxStopWordsLen, // + th::optional badWordsListPtrsOpt, // [BS][2, badWordsLength], int64 + th::optional badWordsLensOpt, // [BS], int + int64_t const maxBadWordsLen, // + th::optional srcCacheIndirectionOpt, // [localBS, BM, MSL], int // Outputs - th::Tensor output_token_ids, // [BS, BM, MSL], variables for output - th::Tensor newTokens, // [BS, BM, 1], int - th::optional finished_input, // [BS, BM], uint8 - th::optional finished_output, // [BS, BM], uint8 - th::optional sequence_lengths_opt, // [BS*BM], int, length of the current sequences - th::optional cum_log_probs_opt, // [BS, BM], float - th::optional output_log_probs_opt, // [BS, BM, MSL], float - th::optional output_log_probs_tiled_opt, // [MSL, BS, BM], float, transpose of output_log_probs_opt - th::optional parent_ids_opt, // [BS, BM, MSL], int - th::optional tgt_cache_indirection_opt, // [local_BS, BM, MSL], int - th::optional beam_hyps_output_ids_cba_opt, // [BS, BM*2, MSL], int - th::optional beam_hyps_seq_len_cba_opt, // [BS, BM*2], int - th::optional beam_hyps_cum_log_probs_cba_opt, // [BS, BM*2], float - th::optional beam_hyps_normed_scores_cba_opt, // [BS, BM*2], float - th::optional beam_hyps_log_probs_cba_opt, // [BS, BM*2, MSL], float - th::optional beam_hyps_min_normed_scores_opt, // [BS], float - th::optional beam_hyps_num_beams_opt, // [BS], int - th::optional beam_hyps_is_done_opt, // [BS], bool - bool const use_beam_hyps // + th::Tensor outputTokenIds, // [BS, BM, MSL], variables for output + th::Tensor newTokens, // [BS, BM, 1], int + th::optional finishedInput, // [BS, BM], uint8 + th::optional finishedOutput, // [BS, BM], uint8 + th::optional sequenceLengthsOpt, // [BS*BM], int, length of the current sequences + th::optional cumLogProbsOpt, // [BS, BM], float + th::optional outputLogProbsOpt, // [BS, BM, MSL], float + th::optional outputLogProbsTiledOpt, // [MSL, BS, BM], float, transpose of outputLogProbsOpt + th::optional parentIdsOpt, // [BS, BM, MSL], int + th::optional tgtCacheIndirectionOpt, // [localBS, BM, MSL], int + th::optional beamHypsOutputIdsCbaOpt, // [BS, BM*2, MSL], int + th::optional beamHypsSeqLenCbaOpt, // [BS, BM*2], int + th::optional beamHypsCumLogProbsCbaOpt, // [BS, BM*2], float + th::optional beamHypsNormedScoresCbaOpt, // [BS, BM*2], float + th::optional beamHypsLogProbsCbaOpt, // [BS, BM*2, MSL], float + th::optional beamHypsMinNormedScoresOpt, // [BS], float + th::optional beamHypsNumBeamsOpt, // [BS], int + th::optional beamHypsIsDoneOpt, // [BS], bool + bool const useBeamHyps // ) { - CHECK_INPUT(logits, scalar_type_); + 
CHECK_INPUT(logits, scalarType_); TLLM_CHECK_WITH_INFO(logits.dim() == 3, - "logits is of shape (batch_size, beam_width, vocab_size_padded), but got dim=%d shape=%s", (int) logits.dim(), + "logits is of shape (batchSize, beamWidth, vocabSizePadded), but got dim=%d shape=%s", (int) logits.dim(), tensorrt_llm::common::vec2str(convert_shape(logits)).c_str()); - TLLM_CHECK_WITH_INFO(static_cast(logits.size(2)) == vocab_size_padded_, - "logits is of shape (batch_size, beam_width, vocab_size(%ld)), but got the last dim=%ld.", vocab_size_padded_, + TLLM_CHECK_WITH_INFO(static_cast(logits.size(2)) == vocabSizePadded_, + "logits is of shape (batchSize, beamWidth, vocabSize(%ld)), but got the last dim=%ld.", vocabSizePadded_, static_cast(logits.size(2))); - CHECK_INPUT(end_id, torch::kInt32); - CHECK_OPTIONAL_INPUT(embedding_bias_opt, scalar_type_); - CHECK_OPTIONAL_INPUT(input_lengths_opt, torch::kInt32); - CHECK_OPTIONAL_INPUT(sequence_limit_length_opt, torch::kInt32); - CHECK_OPTIONAL_INPUT(stop_words_list_ptrs_opt, torch::kInt64); - CHECK_OPTIONAL_INPUT(stop_words_lens_opt, torch::kInt32); - CHECK_OPTIONAL_INPUT(bad_words_list_ptrs_opt, torch::kInt64); - CHECK_OPTIONAL_INPUT(bad_words_lens_opt, torch::kInt32); - CHECK_OPTIONAL_INPUT(src_cache_indirection_opt, torch::kInt32); - CHECK_INPUT(output_token_ids, torch::kInt32); + CHECK_INPUT(endId, torch::kInt32); + CHECK_OPTIONAL_INPUT(embeddingBiasOpt, scalarType_); + CHECK_OPTIONAL_INPUT(inputLengthsOpt, torch::kInt32); + CHECK_OPTIONAL_INPUT(sequenceLimitLengthOpt, torch::kInt32); + CHECK_OPTIONAL_INPUT(stopWordsListPtrsOpt, torch::kInt64); + CHECK_OPTIONAL_INPUT(stopWordsLensOpt, torch::kInt32); + CHECK_OPTIONAL_INPUT(badWordsListPtrsOpt, torch::kInt64); + CHECK_OPTIONAL_INPUT(badWordsLensOpt, torch::kInt32); + CHECK_OPTIONAL_INPUT(srcCacheIndirectionOpt, torch::kInt32); + CHECK_INPUT(outputTokenIds, torch::kInt32); CHECK_INPUT(newTokens, torch::kInt32); - CHECK_OPTIONAL_INPUT(finished_input, torch::kUInt8); - CHECK_OPTIONAL_INPUT(finished_output, torch::kUInt8); - CHECK_OPTIONAL_INPUT(sequence_lengths_opt, torch::kInt32); - CHECK_OPTIONAL_INPUT(cum_log_probs_opt, torch::kFloat32); - CHECK_OPTIONAL_INPUT(output_log_probs_opt, torch::kFloat32); - CHECK_OPTIONAL_INPUT(output_log_probs_tiled_opt, torch::kFloat32); - CHECK_OPTIONAL_INPUT(parent_ids_opt, torch::kInt32); - CHECK_OPTIONAL_INPUT(tgt_cache_indirection_opt, torch::kInt32); - - th::Tensor should_stop = torch::zeros({1}, torch::dtype(torch::kBool).requires_grad(false)); - - dynamic_decode_->forward( + CHECK_OPTIONAL_INPUT(finishedInput, torch::kUInt8); + CHECK_OPTIONAL_INPUT(finishedOutput, torch::kUInt8); + CHECK_OPTIONAL_INPUT(sequenceLengthsOpt, torch::kInt32); + CHECK_OPTIONAL_INPUT(cumLogProbsOpt, torch::kFloat32); + CHECK_OPTIONAL_INPUT(outputLogProbsOpt, torch::kFloat32); + CHECK_OPTIONAL_INPUT(outputLogProbsTiledOpt, torch::kFloat32); + CHECK_OPTIONAL_INPUT(parentIdsOpt, torch::kInt32); + CHECK_OPTIONAL_INPUT(tgtCacheIndirectionOpt, torch::kInt32); + + th::Tensor shouldStop = torch::zeros({1}, torch::dtype(torch::kBool).requires_grad(false)); + + dynamicDecode_->forward( // Inputs - logits, static_cast(step), static_cast(max_input_length), static_cast(max_attention_window), - static_cast(sink_token_length), static_cast(ite), static_cast(local_batch_size), end_id, - embedding_bias_opt, input_lengths_opt, sequence_limit_length_opt, stop_words_list_ptrs_opt, stop_words_lens_opt, - static_cast(max_stop_words_len), bad_words_list_ptrs_opt, bad_words_lens_opt, - 
static_cast(max_bad_words_len), src_cache_indirection_opt, + logits, static_cast(step), static_cast(maxInputLength), static_cast(maxAttentionWindow), + static_cast(sinkTokenLength), static_cast(ite), static_cast(localBatchSize), endId, + embeddingBiasOpt, inputLengthsOpt, sequenceLimitLengthOpt, stopWordsListPtrsOpt, stopWordsLensOpt, + static_cast(maxStopWordsLen), badWordsListPtrsOpt, badWordsLensOpt, + static_cast(maxBadWordsLen), srcCacheIndirectionOpt, // Outputs - output_token_ids, newTokens, should_stop, finished_input, finished_output, sequence_lengths_opt, - cum_log_probs_opt, output_log_probs_opt, output_log_probs_tiled_opt, parent_ids_opt, tgt_cache_indirection_opt, - beam_hyps_output_ids_cba_opt, beam_hyps_seq_len_cba_opt, beam_hyps_cum_log_probs_cba_opt, - beam_hyps_normed_scores_cba_opt, beam_hyps_log_probs_cba_opt, beam_hyps_min_normed_scores_opt, - beam_hyps_num_beams_opt, beam_hyps_is_done_opt, use_beam_hyps); + outputTokenIds, newTokens, shouldStop, finishedInput, finishedOutput, sequenceLengthsOpt, cumLogProbsOpt, + outputLogProbsOpt, outputLogProbsTiledOpt, parentIdsOpt, tgtCacheIndirectionOpt, beamHypsOutputIdsCbaOpt, + beamHypsSeqLenCbaOpt, beamHypsCumLogProbsCbaOpt, beamHypsNormedScoresCbaOpt, beamHypsLogProbsCbaOpt, + beamHypsMinNormedScoresOpt, beamHypsNumBeamsOpt, beamHypsIsDoneOpt, useBeamHyps); - return should_stop; + return shouldStop; } } // namespace torch_ext diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h index 805370d71..2cc3cdf8f 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h @@ -94,8 +94,9 @@ class FtDynamicDecode : public IFtDynamicDecode th::optional beam_hyps_is_done_opt, bool const use_beam_hyps) override; private: - tensorrt_llm::runtime::ITensor::SharedPtr finished_sum_; // [batch_size] pinned - std::shared_ptr> dynamic_decode_layer_; + tensorrt_llm::runtime::ITensor::SharedPtr mFinishedSum; // [batch_size] pinned + std::shared_ptr> mDynamicDecodeLayer; + std::optional mBeamWidth; }; class DynamicDecodeOp : public th::jit::CustomClassHolder @@ -135,14 +136,14 @@ class DynamicDecodeOp : public th::jit::CustomClassHolder private: // Members initialized in constructor and used in call of createInstance() - size_t const max_batch_size_; - size_t const max_beam_width_; - size_t const vocab_size_; - size_t const vocab_size_padded_; - int const tensor_para_size_; - int const pipeline_para_size_; - at::ScalarType const scalar_type_; // Data type of expected input logits - std::unique_ptr dynamic_decode_; // FT Dynamic decode layer wrapper instance + size_t const maxBatchSize_; + size_t const maxBeamWidth_; + size_t const vocabSize_; + size_t const vocabSizePadded_; + int const tensorParaSize_; + int const pipelineParaSize_; + at::ScalarType const scalarType_; // Data type of expected input logits + std::unique_ptr dynamicDecode_; // FT Dynamic decode layer wrapper instance void createInstance(); }; diff --git a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp index 0ae48ada6..aaa87e8cf 100644 --- a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp +++ b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp @@ -87,7 +87,7 @@ void updateKVCacheDraftTokenLocation(torch::Tensor seqAcceptedDraftTokenOffsetsT reinterpret_cast( offsetArray.data_ptr()), layerCount, seqCount, numKVHeads, headSizeInBytes, rewindDraftTokenCount, rewindDraftTokenTensorPtr, - nullptr, maxKVCacheLen, 
maxBlocksPerSeqOpt.value(), tokensPerBlockOpt.value(), stream); + nullptr, nullptr, maxKVCacheLen, maxBlocksPerSeqOpt.value(), tokensPerBlockOpt.value(), stream); } else { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e392f3abd..84b4fa939 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -136,6 +136,9 @@ set(LOOKAHEAD_RANDOMLLM_TEST_SRC layers/randomLlm.cpp layers/lookaheadRandomLlmTest.cpp) add_gtest(lookaheadRandomLlmTest "${LOOKAHEAD_RANDOMLLM_TEST_SRC}") add_gtest(explicitDraftTokensLayerTest layers/explicitDraftTokensLayerTest.cpp) +set(LOOKAHEAD_DECODING_TEST_SRC layers/randomLlm.cpp + layers/lookaheadDecodingLayerTest.cpp) +add_gtest(lookaheadDecodingLayerTest "${LOOKAHEAD_DECODING_TEST_SRC}") add_gtest( gemmSwigluRunnerTest diff --git a/cpp/tests/README.md b/cpp/tests/README.md index 36239de12..4d491aa76 100644 --- a/cpp/tests/README.md +++ b/cpp/tests/README.md @@ -31,7 +31,7 @@ From the top-level directory call: ```bash CPP_BUILD_DIR=cpp/build python3 scripts/build_wheel.py -a "80-real;86-real" --build_dir ${CPP_BUILD_DIR} --trt_root /usr/local/tensorrt -pip install -r requirements-dev.txt --extra-index-url https://pypi.ngc.nvidia.com +pip install -r requirements-dev.txt pip install build/tensorrt_llm*.whl cd $CPP_BUILD_DIR && make -j$(nproc) google-tests ``` diff --git a/cpp/tests/kernels/allReduce/allReduceKernelTest.cu b/cpp/tests/kernels/allReduce/allReduceKernelTest.cu index 9f1d9651f..a0f9233df 100644 --- a/cpp/tests/kernels/allReduce/allReduceKernelTest.cu +++ b/cpp/tests/kernels/allReduce/allReduceKernelTest.cu @@ -231,7 +231,6 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa } params.barrier_flag = 0; params.ranks_per_node = world_size; - params.rank = rank; params.local_rank = rank; params.local_output_buffer_ptr = out.data(); params.local_input_buffer_ptr = in.data(); diff --git a/cpp/tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/kernels/mixtureOfExpertsTest.cu index a02e0ab2d..a23ce3d3e 100644 --- a/cpp/tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/kernels/mixtureOfExpertsTest.cu @@ -318,15 +318,18 @@ protected: mRawExpertWeight1 = allocBuffer(expert_matrix_size * mGatedMultiplier); mRawExpertWeight2 = allocBuffer(expert_matrix_size); - mTpExpertScratchSize = expert_matrix_size * mGatedMultiplier / parallelism_config.tp_size; - mTpExpertScratchSize += expert_matrix_size / parallelism_config.tp_size; + size_t const experts_per_node = mNumExperts / parallelism_config.ep_size; + int const moe_parallel_size = parallelism_config.tp_size * parallelism_config.ep_size; + + mTpExpertScratchSize = expert_matrix_size * mGatedMultiplier / moe_parallel_size; + mTpExpertScratchSize += expert_matrix_size / moe_parallel_size; mExpertBias1 = nullptr; mExpertBias2 = nullptr; if (mUseBias) { // Allow space for the slice of bias1 in the scratch - mTpExpertScratchSize += mNumExperts * gated_inter / parallelism_config.tp_size; + mTpExpertScratchSize += experts_per_node * gated_inter / parallelism_config.tp_size; mExpertBias1 = allocBuffer(mNumExperts * gated_inter); mExpertBias2 = allocBuffer(mNumExperts * mHiddenSize); @@ -339,7 +342,7 @@ protected: mExpertWeight1 = allocBuffer(expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE); mExpertWeight2 = allocBuffer(expert_matrix_size / WEIGHT_ELEM_PER_BYTE); - mTpExpertScratchSize += mNumExperts * gated_inter / parallelism_config.tp_size; + mTpExpertScratchSize += experts_per_node * gated_inter / parallelism_config.tp_size; mExpertIntScale1 = 
allocBuffer(mNumExperts * gated_inter); mExpertIntScale2 = allocBuffer(mNumExperts * mHiddenSize); } @@ -359,11 +362,7 @@ protected: initFP8Scales(mMaxInput); } - mTpExpertScratch = nullptr; - if (parallelism_config.tp_size > 1) - { - mTpExpertScratch = allocBuffer(mTpExpertScratchSize); - } + mTpExpertScratch = allocBuffer(mTpExpertScratchSize); mActiveRows = mTotalTokens; mFinished = nullptr; @@ -558,86 +557,82 @@ protected: auto getWeights(MOEParallelismConfig parallelism_config) { - void* scale_1 = FP8 ? (void*) mExpertFP8Scale1 : (void*) mExpertIntScale1; - void* scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2; - void* scale_3 = FP8 ? mExpertFP8Scale3 : nullptr; - - if (parallelism_config.tp_size > 1) + void* ep_scale_1 = FP8 ? (void*) mExpertFP8Scale1 : (void*) mExpertIntScale1; + void* ep_scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2; + void* ep_scale_3 = FP8 ? mExpertFP8Scale3 : nullptr; + + // Slice weights for EP + size_t const gated_inter = mInterSize * mGatedMultiplier; + size_t const experts_per_node = mNumExperts / parallelism_config.ep_size; + size_t const weight_matrix_size = mHiddenSize * mInterSize * experts_per_node / WEIGHT_ELEM_PER_BYTE; + size_t const bias_fc1_size = gated_inter * experts_per_node; + size_t const bias_fc2_size = mHiddenSize * experts_per_node; + size_t const scale1_size = gated_inter * experts_per_node; + size_t const scale2_size = mHiddenSize * experts_per_node; + auto* weight1_ptr = mExpertWeight1 + weight_matrix_size * mGatedMultiplier * parallelism_config.ep_rank; + auto* weight2_ptr = mExpertWeight2 + weight_matrix_size * parallelism_config.ep_rank; + auto* bias1_ptr = mUseBias ? mExpertBias1 + bias_fc1_size * parallelism_config.ep_rank : nullptr; + auto* bias2_ptr = mUseBias ? mExpertBias2 + bias_fc2_size * parallelism_config.ep_rank : nullptr; + + if (INT_QUANT) { - int const tp_size = parallelism_config.tp_size; - int const tp_rank = parallelism_config.tp_rank; - - size_t const matrix_size = mHiddenSize * mInterSize / tp_size; - size_t const gated_matrix_size = mHiddenSize * mInterSize * mGatedMultiplier / tp_size; - size_t const row_size_inter = mInterSize / tp_size; - size_t const gated_row_size_inter = mInterSize * mGatedMultiplier / tp_size; - size_t const gated_bias_size = mUseBias ? 
gated_row_size_inter : 0; - - auto* weight_1 = reinterpret_cast(mTpExpertScratch); - auto* weight_2 = weight_1 + mNumExperts * gated_matrix_size; - auto* bias_1 = reinterpret_cast(weight_2 + mNumExperts * matrix_size); - auto* int_scale_1 = bias_1 + mNumExperts * gated_bias_size; - - // 2D memcpy just the slices we care about - // TODO Re-quantize here with matrices divided - size_t const row_size_1 = matrix_size * sizeof(WeightStorage) / WEIGHT_ELEM_PER_BYTE; - check_cuda_error(cudaMemcpy2DAsync(weight_1, row_size_1, (uint8_t*) mExpertWeight1 + row_size_1 * tp_rank, - row_size_1 * tp_size, row_size_1, mNumExperts * mGatedMultiplier, cudaMemcpyDeviceToDevice, - mStream->get())); + ep_scale_1 = mExpertIntScale1 + scale1_size * parallelism_config.ep_rank; + ep_scale_2 = mExpertIntScale2 + scale2_size * parallelism_config.ep_rank; + } + if constexpr (FP8) + { + ep_scale_1 = mExpertFP8Scale1 + experts_per_node * parallelism_config.ep_rank; + ep_scale_3 = mExpertFP8Scale3 + experts_per_node * parallelism_config.ep_rank; + } - size_t const row_size_2 = row_size_inter * sizeof(WeightStorage) / WEIGHT_ELEM_PER_BYTE; - check_cuda_error(cudaMemcpy2DAsync(weight_2, row_size_2, (uint8_t*) mExpertWeight2 + row_size_2 * tp_rank, - row_size_2 * tp_size, row_size_2, mNumExperts * mHiddenSize, cudaMemcpyDeviceToDevice, mStream->get())); + // Slice weights for TP + void* scale_1 = ep_scale_1; + void* scale_2 = ep_scale_2; + void* scale_3 = ep_scale_3; - if (mUseBias) - { - size_t const row_size_bias = row_size_inter * sizeof(DataType); - check_cuda_error(cudaMemcpy2DAsync(bias_1, row_size_bias, - (uint8_t*) mExpertBias1 + row_size_bias * tp_rank, row_size_bias * tp_size, row_size_bias, - mNumExperts * mGatedMultiplier, cudaMemcpyDeviceToDevice, mStream->get())); - } + int const tp_size = parallelism_config.tp_size; + int const tp_rank = parallelism_config.tp_rank; - if constexpr (INT_QUANT) - { - scale_2 = mExpertIntScale2; - size_t const row_size_scale = row_size_inter * sizeof(DataType); - check_cuda_error(cudaMemcpy2DAsync(scale_1, row_size_scale, - (uint8_t*) mExpertIntScale1 + row_size_scale * tp_rank, row_size_scale * tp_size, row_size_scale, - mNumExperts * mGatedMultiplier, cudaMemcpyDeviceToDevice, mStream->get())); - } + size_t const matrix_size = mHiddenSize * mInterSize / tp_size; + size_t const gated_matrix_size = mHiddenSize * mInterSize * mGatedMultiplier / tp_size; + size_t const row_size_inter = mInterSize / tp_size; - bias_1 = mUseBias ? 
bias_1 : nullptr; - return std::tuple{weight_1, weight_2, bias_1, mExpertBias2, scale_1, scale_2, scale_3}; - } - else if (parallelism_config.ep_size > 1) + auto* weight_1 = reinterpret_cast(mTpExpertScratch); + auto* weight_2 = weight_1 + experts_per_node * gated_matrix_size; + auto* bias_1 = reinterpret_cast(weight_2 + experts_per_node * matrix_size); + + // 2D memcpy just the slices we care about + // TODO Re-quantize here with matrices divided + size_t const row_size_1 = matrix_size * sizeof(WeightStorage) / WEIGHT_ELEM_PER_BYTE; + check_cuda_error( + cudaMemcpy2DAsync(weight_1, row_size_1, (uint8_t*) weight1_ptr + row_size_1 * tp_rank, row_size_1 * tp_size, + row_size_1, experts_per_node * mGatedMultiplier, cudaMemcpyDeviceToDevice, mStream->get())); + + size_t const row_size_2 = row_size_inter * sizeof(WeightStorage) / WEIGHT_ELEM_PER_BYTE; + check_cuda_error( + cudaMemcpy2DAsync(weight_2, row_size_2, (uint8_t*) weight2_ptr + row_size_2 * tp_rank, row_size_2 * tp_size, + row_size_2, experts_per_node * mHiddenSize, cudaMemcpyDeviceToDevice, mStream->get())); + + if (mUseBias) { - size_t const gated_inter = mInterSize * mGatedMultiplier; - size_t const experts_per_node = mNumExperts / parallelism_config.ep_size; - size_t const weight_matrix_size = mHiddenSize * mInterSize * experts_per_node / WEIGHT_ELEM_PER_BYTE; - size_t const bias_fc1_size = gated_inter * experts_per_node; - size_t const bias_fc2_size = mHiddenSize * experts_per_node; - size_t const scale1_size = gated_inter * experts_per_node; - size_t const scale2_size = mHiddenSize * experts_per_node; - auto* weight1_ptr = mExpertWeight1 + weight_matrix_size * mGatedMultiplier * parallelism_config.ep_rank; - auto* weight2_ptr = mExpertWeight2 + weight_matrix_size * parallelism_config.ep_rank; - auto* bias1_ptr = mUseBias ? mExpertBias1 + bias_fc1_size * parallelism_config.ep_rank : nullptr; - auto* bias2_ptr = mUseBias ? mExpertBias2 + bias_fc2_size * parallelism_config.ep_rank : nullptr; - - if (INT_QUANT) - { - scale_1 = mExpertIntScale1 + scale1_size * parallelism_config.ep_rank; - scale_2 = mExpertIntScale2 + scale2_size * parallelism_config.ep_rank; - } - if constexpr (FP8) - { - scale_1 = mExpertFP8Scale1 + experts_per_node * parallelism_config.ep_rank; - scale_3 = mExpertFP8Scale3 + experts_per_node * parallelism_config.ep_rank; - } + size_t const row_size_bias = row_size_inter * sizeof(DataType); + check_cuda_error(cudaMemcpy2DAsync(bias_1, row_size_bias, (uint8_t*) bias1_ptr + row_size_bias * tp_rank, + row_size_bias * tp_size, row_size_bias, experts_per_node * mGatedMultiplier, cudaMemcpyDeviceToDevice, + mStream->get())); + } - return std::tuple{weight1_ptr, weight2_ptr, bias1_ptr, bias2_ptr, scale_1, scale_2, scale_3}; + if constexpr (INT_QUANT) + { + scale_2 = ep_scale_2; + size_t const row_size_scale = row_size_inter * sizeof(DataType); + check_cuda_error(cudaMemcpy2DAsync(scale_1, row_size_scale, + (uint8_t*) ep_scale_1 + row_size_scale * tp_rank, row_size_scale * tp_size, row_size_scale, + experts_per_node * mGatedMultiplier, cudaMemcpyDeviceToDevice, mStream->get())); } - return std::tuple{mExpertWeight1, mExpertWeight2, mExpertBias1, mExpertBias2, scale_1, scale_2, scale_3}; + bias_1 = mUseBias ? 
bias_1 : nullptr; + + return std::tuple{weight_1, weight_2, bias_1, bias2_ptr, scale_1, scale_2, scale_3}; } void runMoEPermute(MOEParallelismConfig parallelism_config) @@ -941,6 +936,8 @@ protected: void ExpertParallelTest(int k = 1); void TensorParallelTest(int k = 1); + + void MixedParallelTest(int k = 1); }; template @@ -1154,7 +1151,7 @@ void MixtureOfExpertsTest::ExpertParallelTest(int k) } int64_t hidden_size = DEFAULT_HIDDEN_SIZE; - int64_t parallelism = 2; + int parallelism = 2; int64_t num_experts = 4; int64_t num_tokens = 3; @@ -1178,12 +1175,12 @@ void MixtureOfExpertsTest::ExpertParallelTest(int k) if (i == 0) { // Only need to init the inputs on the first iteration - runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {}, - MOEParallelismConfig::ExpertParallelism(parallelism, i)); + runMoEPermute( + {hidden_states}, {probs}, hidden_size, num_experts, k, {}, MOEParallelismConfig{1, 0, parallelism, i}); } else { - runMoEPermute(MOEParallelismConfig::ExpertParallelism(parallelism, i)); + runMoEPermute(MOEParallelismConfig{1, 0, parallelism, i}); } auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k); @@ -1256,7 +1253,7 @@ void MixtureOfExpertsTest::TensorParallelTest(int k) } int64_t hidden_size = DEFAULT_HIDDEN_SIZE; - int64_t parallelism = 8; + int parallelism = 8; int64_t num_experts = 4; int64_t num_tokens = 3; @@ -1280,12 +1277,12 @@ void MixtureOfExpertsTest::TensorParallelTest(int k) if (i == 0) { // Only need to init the inputs on the first iteration - runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {}, - MOEParallelismConfig::TensorParallelism(parallelism, i)); + runMoEPermute( + {hidden_states}, {probs}, hidden_size, num_experts, k, {}, MOEParallelismConfig{parallelism, i, 1, 0}); } else { - runMoEPermute(MOEParallelismConfig::TensorParallelism(parallelism, i)); + runMoEPermute(MOEParallelismConfig{parallelism, i, 1, 0}); } auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k); @@ -1355,6 +1352,113 @@ TYPED_TEST(MixtureOfExpertsTest, TensorParallelSwiglu) this->TensorParallelTest(3); } +template +void MixtureOfExpertsTest::MixedParallelTest(int k) +{ + if (FP8) + { + // TODO Remove this when bias + FP8 is supported + mUseBias = false; + } + + int64_t hidden_size = DEFAULT_HIDDEN_SIZE; + int tp_parallelism = 2; + int ep_parallelism = 2; + int64_t num_experts = 4; + int64_t num_tokens = 3; + + std::vector hidden_states(hidden_size * num_tokens); + auto raw_unquant_input = populateTokens(hidden_states); + + std::vector probs = { + 0.5, 0.1, 0.25, 0.15, // + 0.03, 0.2, 0.07, 0.7, // + 0.25, 0.21, 0.35, 0.19, // + }; + + std::vector expected_experts{0, 3, 2}; + if (k == 2) + expected_experts = {0, 2, 3, 1, 2, 0}; + else if (k == 3) + expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1}; + std::vector results(hidden_states.size(), 0); + for (int i = 0; i < tp_parallelism; i++) + { + for (int j = 0; j < ep_parallelism; j++) + { + if (i == 0 && j == 0) + { + // Only need to init the inputs on the first iteration + runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {}, + MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j}); + } + else + { + runMoEPermute(MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j}); + } + + auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k); + // Experts should only be selected when we are on the right node + // Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node + int const 
start_expert = j * (mNumExperts / ep_parallelism); + std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(), + [&](int val) { return val == mNumExperts ? mNumExperts : val + start_expert; }); + auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_parallelism, j); + ASSERT_EQ(selected_expert, masked_expected_experts); + + auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k); + auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts); + ASSERT_EQ(permute_map, proj_map) << "Iteration " << i << " " << j; + compareSoftmax(expected_experts, probs); + + // Do the final reduce + auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size); + std::transform( + iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{}); + } + } + + compareFinal(expected_experts, probs, raw_unquant_input, results); +} + +TYPED_TEST(MixtureOfExpertsTest, MixedParallel) +{ + this->MixedParallelTest(); +} + +TYPED_TEST(MixtureOfExpertsTest, MixedParallelK2) +{ + this->MixedParallelTest(2); +} + +TYPED_TEST(MixtureOfExpertsTest, MixedParallelNoBias) +{ + this->mUseBias = false; + this->MixedParallelTest(); + this->MixedParallelTest(2); +} + +TYPED_TEST(MixtureOfExpertsTest, MixedParallelRenorm) +{ + this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; + this->MixedParallelTest(); + this->MixedParallelTest(2); +} + +TYPED_TEST(MixtureOfExpertsTest, MixedParallelGeglu) +{ + this->mActType = tensorrt_llm::ActivationType::Geglu; + this->MixedParallelTest(); + this->MixedParallelTest(2); +} + +TYPED_TEST(MixtureOfExpertsTest, MixedParallelSwiglu) +{ + this->mActType = tensorrt_llm::ActivationType::Swiglu; + this->MixedParallelTest(); + this->MixedParallelTest(2); +} + TYPED_TEST(MixtureOfExpertsTest, ConfigSweep) { std::vector actiavtion_pool = { diff --git a/cpp/tests/kernels/stopCriteriaKernelsTest.cpp b/cpp/tests/kernels/stopCriteriaKernelsTest.cpp index 27bddd657..c66f63825 100644 --- a/cpp/tests/kernels/stopCriteriaKernelsTest.cpp +++ b/cpp/tests/kernels/stopCriteriaKernelsTest.cpp @@ -35,6 +35,8 @@ using namespace tensorrt_llm::runtime; namespace { +// TODO(nkorobov): add tests for numNewTokens for EOS and seqLenLimit + class StopCriteriaKernelsTest : public testing::Test { public: @@ -410,8 +412,8 @@ class StopCriteriaKernelsTest : public testing::Test reinterpret_cast(bufferCast(*mFinished)), bufferCast(*mFinishedSum), reinterpret_cast(bufferCast(*mSequenceLengthLimits)), - bufferCast(*mSequenceLengths), bufferCast(*mBatchSlots), batchSize, beamWidth, - mStream->get()); + bufferCast(*mSequenceLengths), /* numNewTokens */ nullptr, bufferCast(*mBatchSlots), + batchSize, beamWidth, mStream->get()); verifyMaxSeqLenStopCriteriaResults(seed, batchSize, beamWidth); } diff --git a/cpp/tests/layers/baseSamplingLayerTest.cpp b/cpp/tests/layers/baseSamplingLayerTest.cpp index 588f21a57..2504bca7e 100644 --- a/cpp/tests/layers/baseSamplingLayerTest.cpp +++ b/cpp/tests/layers/baseSamplingLayerTest.cpp @@ -93,15 +93,14 @@ void BaseSamplingLayerTest::setup(uint64_t seed, TestSamplingParams const& pa auto setupParams = std::make_shared(); setupParams->randomSeed = std::make_optional>({seed}); - setupParams->runtime_top_k + setupParams->runtimeTopK = params.topKs.size() ? std::make_optional>(params.topKs) : std::nullopt; - setupParams->runtime_top_p + setupParams->runtimeTopP = params.topPs.size() ? 
std::make_optional>(params.topPs) : std::nullopt; - setupParams->top_p_decay - = params.decay.size() ? std::make_optional>(params.decay) : std::nullopt; - setupParams->top_p_min + setupParams->topPDecay = params.decay.size() ? std::make_optional>(params.decay) : std::nullopt; + setupParams->topPMin = params.minTopP.size() ? std::make_optional>(params.minTopP) : std::nullopt; - setupParams->top_p_reset_ids + setupParams->topPResetIds = params.topPResetIds.size() ? std::make_optional>(params.topPResetIds) : std::nullopt; mSamplingLayer->setup(mBatchSize, mBeamWidth, batchSlotsPtr, setupParams); @@ -110,40 +109,42 @@ void BaseSamplingLayerTest::setup(uint64_t seed, TestSamplingParams const& pa } template -std::shared_ptr BaseSamplingLayerTest::createInputTensors(int32_t step) +std::shared_ptr BaseSamplingLayerTest::createInputTensors(int32_t step) { constexpr int32_t ite = 0; - auto decodeInputTensors = std::make_shared( - step, ite, tcc::toTllmTensor(*mLogitsDevice), tcc::toTllmTensor(*mEndIdsDevice), mMaxSeqLen); + auto decodeInputTensors + = std::make_shared(tcc::toTllmTensor(*mEndIdsDevice), step, ite, mBatchSize); - decodeInputTensors->input_lengths = tcc::toTllmTensor(*mContextLengthDevice); + decodeInputTensors->logits = tcc::toTllmTensor(*mLogitsDevice); + + decodeInputTensors->inputLengths = tcc::toTllmTensor(*mContextLengthDevice); decodeInputTensors->finished = tcc::toTllmTensor(*mFinishedDevice); - decodeInputTensors->batch_slots = tcc::toTllmTensor(*mBatchSlots); + decodeInputTensors->batchSlots = tcc::toTllmTensor(*mBatchSlots); - decodeInputTensors->probs_computed = mComputeProbs; + decodeInputTensors->probsComputed = mComputeProbs; - decodeInputTensors->curand_states = reinterpret_cast(bufferCast(*mCurandStatesDevice)); + decodeInputTensors->curandStates = reinterpret_cast(bufferCast(*mCurandStatesDevice)); - decodeInputTensors->sampling_workspace = reinterpret_cast(bufferCast(*mSamplingWorkspaceDevice)); + decodeInputTensors->samplingWorkspace = reinterpret_cast(bufferCast(*mSamplingWorkspaceDevice)); return decodeInputTensors; } template -std::shared_ptr BaseSamplingLayerTest::createOutputTensors() +std::shared_ptr BaseSamplingLayerTest::createOutputTensors() { - auto decodeOutputs = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); - decodeOutputs->output_ids_ptr = tcc::toTllmTensor(*mIdsPtrHost); + auto decodeOutputs = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); + decodeOutputs->outputIdsPtr = tcc::toTllmTensor(*mIdsPtrHost); - decodeOutputs->sequence_length = tcc::toTllmTensor(*mSeqLengthsDevice); + decodeOutputs->sequenceLength = tcc::toTllmTensor(*mSeqLengthsDevice); decodeOutputs->finished = tcc::toTllmTensor(*mFinishedDevice); - decodeOutputs->output_log_probs = tcc::toTllmTensor(*mOutputLogProbsDevice); + decodeOutputs->outputLogProbs = tcc::toTllmTensor(*mOutputLogProbsDevice); - decodeOutputs->cum_log_probs = tcc::toTllmTensor(*mCumLogProbsDevice); + decodeOutputs->cumLogProbs = tcc::toTllmTensor(*mCumLogProbsDevice); // TODO(nkorobov): check log probs and cum_log_probs return decodeOutputs; diff --git a/cpp/tests/layers/baseSamplingLayerTest.h b/cpp/tests/layers/baseSamplingLayerTest.h index f643d54bb..a4f86466e 100644 --- a/cpp/tests/layers/baseSamplingLayerTest.h +++ b/cpp/tests/layers/baseSamplingLayerTest.h @@ -146,9 +146,9 @@ class BaseSamplingLayerTest : public testing::Test virtual void initLayer(TestSamplingParams const& params) = 0; - std::shared_ptr createInputTensors(int32_t step); + std::shared_ptr createInputTensors(int32_t step); - 
std::shared_ptr createOutputTensors(); + std::shared_ptr createOutputTensors(); void batchCopy(int32_t step); bool checkResult(int32_t* outputIds, std::vector>& expectedIds); diff --git a/cpp/tests/layers/dynamicDecodeLayerTest.cpp b/cpp/tests/layers/dynamicDecodeLayerTest.cpp index 293e08fe8..8f96977ae 100644 --- a/cpp/tests/layers/dynamicDecodeLayerTest.cpp +++ b/cpp/tests/layers/dynamicDecodeLayerTest.cpp @@ -328,38 +328,57 @@ void DynamicDecodeLayerTest::setup(uint64_t seed, TestSamplingParams const& p } auto setupParams = std::make_shared(); - setupParams->penaltyParams.temperature + setupParams->penaltyParams = std::make_shared(); + setupParams->penaltyParams->temperature = params.temperatures.size() ? std::make_optional>(params.temperatures) : std::nullopt; - setupParams->penaltyParams.repetitionPenalty = params.repetitionPenalties.size() + setupParams->penaltyParams->repetitionPenalty = params.repetitionPenalties.size() ? std::make_optional>(params.repetitionPenalties) : std::nullopt; - setupParams->penaltyParams.presencePenalty = params.presencePenalties.size() + setupParams->penaltyParams->presencePenalty = params.presencePenalties.size() ? std::make_optional>(params.presencePenalties) : std::nullopt; - setupParams->penaltyParams.frequencyPenalty = params.frequencyPenalties.size() + setupParams->penaltyParams->frequencyPenalty = params.frequencyPenalties.size() ? std::make_optional>(params.frequencyPenalties) : std::nullopt; - setupParams->penaltyParams.minLength + setupParams->penaltyParams->minLength = params.minLengths.size() ? std::make_optional>(params.minLengths) : std::nullopt; - setupParams->randomSeed = std::make_optional>({seed}); - setupParams->samplingParams.runtime_top_k - = params.topKs.size() ? std::make_optional>(params.topKs) : std::nullopt; - setupParams->samplingParams.runtime_top_p - = params.topPs.size() ? std::make_optional>(params.topPs) : std::nullopt; - setupParams->samplingParams.top_p_decay - = params.decay.size() ? std::make_optional>(params.decay) : std::nullopt; - setupParams->samplingParams.top_p_min - = params.minTopP.size() ? std::make_optional>(params.minTopP) : std::nullopt; - setupParams->samplingParams.top_p_reset_ids - = params.topPResetIds.size() ? std::make_optional>(params.topPResetIds) : std::nullopt; - setupParams->samplingParams.normalize_log_probs = {false}; - setupParams->samplingParams.outputLogProbs = {true}; - setupParams->samplingParams.cumLogProbs = {true}; - setupParams->penaltyParams.noRepeatNgramSize = params.repeatNGramSizes.size() + + setupParams->banWordsParams = std::make_shared(); + setupParams->banWordsParams->noRepeatNgramSize = params.repeatNGramSizes.size() ? std::make_optional>(params.repeatNGramSizes) : std::nullopt; - setupParams->medusaParams.topKMedusaHeads = params.topKMedusaHeads; + if (mDecodingMode.isTopKorTopP()) + { + auto samplingParams = std::make_shared(); + samplingParams->randomSeed = std::make_optional>({seed}); + samplingParams->runtimeTopK + = params.topKs.size() ? std::make_optional>(params.topKs) : std::nullopt; + samplingParams->runtimeTopP + = params.topPs.size() ? std::make_optional>(params.topPs) : std::nullopt; + samplingParams->topPDecay + = params.decay.size() ? std::make_optional>(params.decay) : std::nullopt; + samplingParams->topPMin + = params.minTopP.size() ? std::make_optional>(params.minTopP) : std::nullopt; + samplingParams->topPResetIds = params.topPResetIds.size() + ? 
std::make_optional>(params.topPResetIds) + : std::nullopt; + samplingParams->normalizeLogProbs = {false}; + samplingParams->outputLogProbs = {true}; + samplingParams->cumLogProbs = {true}; + + setupParams->decodingParams = samplingParams; + } + else if (mDecodingMode.isMedusa()) + { + auto medusaParams = std::make_shared(); + medusaParams->runtimeHeadsTopK = params.topKMedusaHeads; + medusaParams->randomSeed = std::make_optional>({seed}); + medusaParams->runtimeTopK + = params.topKs.size() ? std::make_optional>(params.topKs) : std::nullopt; + + setupParams->decodingParams = medusaParams; + } initXWordsTensors(batchSlotsPtr, bufferCast(*mBadWords), reinterpret_cast(bufferCast(*mBadWordsPtrs)), bufferCast(*mBadWordsLens), @@ -431,9 +450,11 @@ void DynamicDecodeLayerTest::initXWordsTensors(SizeType32* batchSlotsPtr, Siz } template -DynamicDecodeInputParams::MedusaInputs DynamicDecodeLayerTest::createMedusaInputs() +void DynamicDecodeLayerTest::createMedusaInputs(std::shared_ptr& baseInputs) { - DynamicDecodeInputParams::MedusaInputs medusaInputs; + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto batchSlots = BufferRange(*mBatchSlots); std::vector> medusaLogits(mMaxBatchSize); auto const medusaLogitsPtr = bufferCast(*mMedusaLogitsDevice); @@ -452,57 +473,65 @@ DynamicDecodeInputParams::MedusaInputs DynamicDecodeLayerTest::createMedusaIn } } - medusaInputs.medusaPaths = tcc::toTllmTensor(*mPathsDevice); - medusaInputs.medusaTreeIds = tcc::toTllmTensor(*mTreeIdsDevice); - medusaInputs.medusaLogits = medusaLogits; - medusaInputs.medusaCurTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); - medusaInputs.medusaTargetTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); - return medusaInputs; + inputs->paths = tcc::toTllmTensor(*mPathsDevice); + inputs->treeIds = tcc::toTllmTensor(*mTreeIdsDevice); + inputs->medusaLogits = medusaLogits; + inputs->curTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); + inputs->targetTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template -std::shared_ptr DynamicDecodeLayerTest::createInputTensors(SizeType32 step) +std::shared_ptr DynamicDecodeLayerTest::createInputTensors(SizeType32 step) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - constexpr SizeType32 ite = 0; - auto forwardParams = std::make_shared( - step, ite, mMaxInputLen, mMaxSeqLen, mSinkTokenLength, mBatchSize, tcc::toTllmTensor(*mEndIdsDevice)); + SizeType32 constexpr ite = 0; + std::shared_ptr forwardParams; + if (mDecodingMode.isTopKorTopP()) + { + forwardParams = std::make_shared(tcc::toTllmTensor(*mEndIdsDevice), step, ite, mBatchSize); + } + else if (mDecodingMode.isMedusa()) + { + forwardParams = std::make_shared(tcc::toTllmTensor(*mEndIdsDevice), mBatchSize); + } - forwardParams->embedding_bias = tcc::toTllmTensor(*mEmbeddingBiasDevice); + forwardParams->embeddingBias = tcc::toTllmTensor(*mEmbeddingBiasDevice); forwardParams->finished = tcc::toTllmTensor(*mFinishedDevice); - forwardParams->batch_slots = tcc::toTllmTensor(*mBatchSlots); + forwardParams->batchSlots = tcc::toTllmTensor(*mBatchSlots); if (mUseLogitsVec) { - forwardParams->logits_vec = mLogitsVec; + forwardParams->logitsVec = mLogitsVec; } else { forwardParams->logits = tcc::toTllmTensor(*mLogitsDevice); } - forwardParams->bad_words_ptr = tcc::toTllmTensor(*mBadWordsPtrs); - forwardParams->bad_words_lengths = tcc::toTllmTensor(*mBadWordsLens); - forwardParams->max_bad_words_len = 
mMaxBadWordsLen; + forwardParams->banWordsInputs = std::make_shared(mBatchSize); + forwardParams->banWordsInputs->badWordsPtr = tcc::toTllmTensor(*mBadWordsPtrs); + forwardParams->banWordsInputs->badWordsLengths = tcc::toTllmTensor(*mBadWordsLens); + forwardParams->banWordsInputs->maxBadWordsLen = mMaxBadWordsLen; - forwardParams->stop_words_ptr = tcc::toTllmTensor(*mStopWordsPtrs); - forwardParams->stop_words_lengths = tcc::toTllmTensor(*mStopWordsLens); - forwardParams->max_stop_words_len = mMaxStopWordsLen; + forwardParams->stopCriteriaInputs = std::make_shared(mBatchSize); + forwardParams->stopCriteriaInputs->stopWordsPtr = tcc::toTllmTensor(*mStopWordsPtrs); + forwardParams->stopCriteriaInputs->stopWordsLengths = tcc::toTllmTensor(*mStopWordsLens); + forwardParams->stopCriteriaInputs->maxStopWordsLen = mMaxStopWordsLen; if (mDecodingMode.isMedusa()) { - forwardParams->medusaInputs = createMedusaInputs(); + createMedusaInputs(forwardParams); } // TODO(nkorobov): extend to // std::optional src_cache_indirection; // std::optional sequence_limit_length; // std::optional input_lengths; - // std::optional no_repeat_ngram_size; has move to sampling config - // std::optional> logits_vec; + // std::optional> logitsVec; TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -510,47 +539,59 @@ std::shared_ptr DynamicDecodeLayerTest::createInput } template -DynamicDecodeOutputParams::SpeculativeDecodingOutputs DynamicDecodeLayerTest::createMedusaOutputs() +void DynamicDecodeLayerTest::createMedusaOutputs(std::shared_ptr& baseOutputs) { - DynamicDecodeOutputParams::SpeculativeDecodingOutputs speculativeDecodingOutputs; - speculativeDecodingOutputs.nextDraftTokens = tcc::toTllmTensor(*mNextDraftTokensDevice); + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto outputs = std::dynamic_pointer_cast(baseOutputs); + outputs->nextDraftTokens = tcc::toTllmTensor(*mNextDraftTokensDevice); - speculativeDecodingOutputs.acceptedLengths = tcc::toTllmTensor(*mAcceptedLengths); + outputs->numNewTokens = tcc::toTllmTensor(*mAcceptedLengths); - speculativeDecodingOutputs.acceptedLengthsCumSum = tcc::toTllmTensor(*mAcceptedLengthCumSumDevice); + outputs->numNewTokensCumSum = tcc::toTllmTensor(*mAcceptedLengthCumSumDevice); - speculativeDecodingOutputs.pathsOffsets = tcc::toTllmTensor(*mPackedPathsDevice); - return speculativeDecodingOutputs; + outputs->pathsOffsets = tcc::toTllmTensor(*mPackedPathsDevice); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } template -std::shared_ptr DynamicDecodeLayerTest::createOutputTensors() +std::shared_ptr DynamicDecodeLayerTest::createOutputTensors() { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); + std::shared_ptr outputParams; + + if (mDecodingMode.isMedusa()) + { + outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); + } + else + { + outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); + } - outputParams->sequence_length = tcc::toTllmTensor(*mSeqLengthsDevice); + outputParams->sequenceLength = tcc::toTllmTensor(*mSeqLengthsDevice); outputParams->finished = tcc::toTllmTensor(*mFinishedDevice); - outputParams->finished_sum = tcc::toTllmTensor(*mFinishedSumDevice); + outputParams->finishedSum = tcc::toTllmTensor(*mFinishedSumDevice); outputParams->newTokens = tcc::toTllmTensor(*mNewTokens); if (!mDecodingMode.isMedusa()) { // Output log probs are not supported in Medusa - outputParams->cum_log_probs = tcc::toTllmTensor(*mCumLogProbsDevice); + 
outputParams->cumLogProbs = tcc::toTllmTensor(*mCumLogProbsDevice); - outputParams->output_log_probs = tcc::toTllmTensor(*mOutputLogProbsDevice); + outputParams->outputLogProbs = tcc::toTllmTensor(*mOutputLogProbsDevice); - outputParams->output_log_probs_tiled = tcc::toTllmTensor(*mOutputLogProbsTiledDevice); + outputParams->outputLogProbsTiled = tcc::toTllmTensor(*mOutputLogProbsTiledDevice); } if (mDecodingMode.isMedusa()) { - outputParams->speculativeDecodingOutputs = createMedusaOutputs(); + createMedusaOutputs(outputParams); } // TODO(nkorobov): extend to @@ -662,13 +703,13 @@ void DynamicDecodeLayerTest::runTestImpl( batchCopy(step); if (mUseLogitsVec) { - inputTensors->logits_vec = mLogitsVec; + inputTensors->logitsVec = mLogitsVec; inputTensors->logits = std::nullopt; } else { inputTensors->logits = tcc::toTllmTensor(*mLogitsDevice); - inputTensors->logits_vec = std::nullopt; + inputTensors->logitsVec = std::nullopt; } inputTensors->step = step; mDecodeLayer->forwardAsync(outputTensors, inputTensors); diff --git a/cpp/tests/layers/dynamicDecodeLayerTest.h b/cpp/tests/layers/dynamicDecodeLayerTest.h index c7db6eb1c..6a98968be 100644 --- a/cpp/tests/layers/dynamicDecodeLayerTest.h +++ b/cpp/tests/layers/dynamicDecodeLayerTest.h @@ -159,9 +159,9 @@ class DynamicDecodeLayerTest : public testing::Test runtime::TokenIdType** wordsPtr, runtime::SizeType32* wordsLenData, runtime::SizeType32 maxWordsLen, std::vector>> const& inputWords); - std::shared_ptr createInputTensors(runtime::SizeType32 step); + std::shared_ptr createInputTensors(runtime::SizeType32 step); - std::shared_ptr createOutputTensors(); + std::shared_ptr createOutputTensors(); void batchCopy(runtime::SizeType32 step); bool checkResult(runtime::TokenIdType* outputIds, std::vector> const& expectedIds, @@ -171,8 +171,8 @@ class DynamicDecodeLayerTest : public testing::Test void fillRefLogits(runtime::SizeType32 const* seqLenHost, std::vector> const& expectedOutputIds, runtime::SizeType32 step); - tensorrt_llm::layers::DynamicDecodeInputParams::MedusaInputs createMedusaInputs(); - tensorrt_llm::layers::DynamicDecodeOutputParams::SpeculativeDecodingOutputs createMedusaOutputs(); + void createMedusaInputs(std::shared_ptr& baseInputs); + void createMedusaOutputs(std::shared_ptr& baseOutputs); public: void runTest(std::vector> const& expectedOutputIds, TestSamplingParams const& params, diff --git a/cpp/tests/layers/explicitDraftTokensLayerTest.cpp b/cpp/tests/layers/explicitDraftTokensLayerTest.cpp index c996bf88c..eb44cacf6 100644 --- a/cpp/tests/layers/explicitDraftTokensLayerTest.cpp +++ b/cpp/tests/layers/explicitDraftTokensLayerTest.cpp @@ -25,6 +25,7 @@ namespace tensorrt_llm::tests::layers { +// TODO(nkorobov) verify context + gen mix using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::layers; @@ -660,6 +661,9 @@ void ExplicitDraftTokensLayerTest::allocateBuffers() mNextDraftLengths = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32); + mPrevDraftLengths + = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32); + mAcceptedLengthCumSum = BufferManager::pinned( ITensor::makeShape({mSamplingParams.getMaxBatchSize() + 1}), nvinfer1::DataType::kINT32); @@ -703,6 +707,11 @@ void ExplicitDraftTokensLayerTest::allocateBuffers() mOutputTemperatures = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), dataType); + mOutputGenerationLengths + = 
BufferManager::pinned(ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32); + + mMaxGenLengthHost = BufferManager::pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + // inputs mBatchSlots = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32); @@ -723,7 +732,7 @@ void ExplicitDraftTokensLayerTest::allocateBuffers() {mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxNumPaths(), mSamplingParams.getMaxPathLen()}), nvinfer1::DataType::kINT32); - mLastDraftTokens = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getMaxBatchSize(), + mLastDraftTokens = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxNumPaths(), mSamplingParams.getMaxPathLen()}), nvinfer1::DataType::kINT32); @@ -760,6 +769,11 @@ void ExplicitDraftTokensLayerTest::allocateBuffers() mSamplingParams.getMaxDraftPathLen(), mSamplingParams.getVocabSize()}), dataType); + mEndIds + = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32); + + mMaxGenLengthDevice = BufferManager::pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + // Packed inputs mMaxGenerationLength = BufferManager::pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); mCumSumGenerationLengths @@ -790,6 +804,9 @@ void ExplicitDraftTokensLayerTest::allocateBuffers() mPackedPositionOffsets = BufferManager::pinned( ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens()}), nvinfer1::DataType::kINT32); + mPackedPackedPosIds = BufferManager::pinned( + ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens()}), + nvinfer1::DataType::kINT32); mPackedDraftProbs = BufferManager::pinned(ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxNumPaths(), mSamplingParams.getMaxDraftPathLen(), mSamplingParams.getVocabSize()}), @@ -813,6 +830,7 @@ void ExplicitDraftTokensLayerTest::setup() trk::invokeFill(*mNextPosIds, SizeType32{0}, *mStream); trk::invokeFill(*mOutputUnpackedNextDraftTokens, TokenIdType{-1}, *mStream); trk::invokeFill(*mOutputUnpackedNextDraftIndices, SizeType32{0}, *mStream); + trk::invokeFill(*mEndIds, TokenIdType{-1}, *mStream); auto inDraftProbs = BufferRange(*mNextDraftProbs); @@ -839,6 +857,8 @@ void ExplicitDraftTokensLayerTest::setup() [&generator, &temperatureDistr]() { return temperatureDistr(generator); }); setupParams->randomSeed = mRandomSeeds; setupParams->temperature = mTemperatures; + setupParams->randomDataSample = tcc::toTllmTensor(*mRandomDataSample); + setupParams->temperatures = tcc::toTllmTensor(*mOutputTemperatures); mExplicitDraftTokensLayer->setup(mSamplingParams.getBatchSize(), 1, batchSlotsPtr, setupParams); @@ -913,11 +933,14 @@ void ExplicitDraftTokensLayerTest::setup() } template -std::shared_ptr ExplicitDraftTokensLayerTest::createInputTensors() +std::shared_ptr ExplicitDraftTokensLayerTest::createInputTensors() { - auto forwardParams = std::make_shared(); + auto forwardParams + = std::make_shared(tcc::toTllmTensor(*mEndIds), mSamplingParams.getBatchSize()); - forwardParams->batch_slots = tcc::toTllmTensor(*mBatchSlots); + forwardParams->batchSlots = tcc::toTllmTensor(*mBatchSlots); + + forwardParams->seqSlots = tcc::toTllmTensor(*mBatchSlots); forwardParams->masks = tcc::toTllmTensor(*mMasks); @@ -935,7 +958,7 @@ std::shared_ptr ExplicitDraftTokensLayerTest: forwardParams->bestPathIndices = 
tcc::toTllmTensor(*mBestPathIndices); - forwardParams->specDecodingGenerationLengths = tcc::toTllmTensor(*mSpecDecodingGenerationLengths); + forwardParams->generationLengths = tcc::toTllmTensor(*mSpecDecodingGenerationLengths); forwardParams->nextFlatTokens = tcc::toTllmTensor(*mNextFlatTokens); @@ -943,49 +966,53 @@ std::shared_ptr ExplicitDraftTokensLayerTest: forwardParams->nextDraftProbs = tcc::toTllmTensor(*mNextDraftProbs); + forwardParams->maxGenLengthDevice = tcc::toTllmTensor(*mMaxGenLengthDevice); + return forwardParams; } template -std::shared_ptr ExplicitDraftTokensLayerTest::createOutputTensors() +std::shared_ptr ExplicitDraftTokensLayerTest::createOutputTensors() { - auto outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIds)); + auto outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIds)); + + outputParams->sequenceLength = tcc::toTllmTensor(*mSeqLengths); - outputParams->sequence_length = tcc::toTllmTensor(*mSeqLengths); + outputParams->nextDraftTokens = tcc::toTllmTensor(*mOutputNextDraftTokens); - outputParams->explicitDraftTokensOutputs = BaseOutputParams::ExplicitDraftTokensOutputs(); + outputParams->numNewTokens = tcc::toTllmTensor(*mAcceptedLengths); - outputParams->explicitDraftTokensOutputs->nextDraftTokens = tcc::toTllmTensor(*mOutputNextDraftTokens); + outputParams->nextDraftLengths = tcc::toTllmTensor(*mNextDraftLengths); - outputParams->explicitDraftTokensOutputs->acceptedLengths = tcc::toTllmTensor(*mAcceptedLengths); + outputParams->prevDraftLengths = tcc::toTllmTensor(*mPrevDraftLengths); - outputParams->explicitDraftTokensOutputs->nextDraftLengths = tcc::toTllmTensor(*mNextDraftLengths); + outputParams->numNewTokensCumSum = tcc::toTllmTensor(*mAcceptedLengthCumSum); - outputParams->explicitDraftTokensOutputs->acceptedLengthsCumSum = tcc::toTllmTensor(*mAcceptedLengthCumSum); + outputParams->pathsOffsets = tcc::toTllmTensor(*mPathsOffsets); - outputParams->explicitDraftTokensOutputs->pathsOffsets = tcc::toTllmTensor(*mPathsOffsets); + outputParams->nextDraftPosIds = tcc::toTllmTensor(*mNextPosIds); - outputParams->explicitDraftTokensOutputs->nextDraftPosIds = tcc::toTllmTensor(*mNextPosIds); + outputParams->positionIdsBase = tcc::toTllmTensor(*mOutputPositionIdsBase); - outputParams->explicitDraftTokensOutputs->positionIdsBase = tcc::toTllmTensor(*mOutputPositionIdsBase); + outputParams->randomDataSample = tcc::toTllmTensor(*mRandomDataSample); - outputParams->explicitDraftTokensOutputs->randomDataSample = tcc::toTllmTensor(*mRandomDataSample); + outputParams->randomDataValidation = tcc::toTllmTensor(*mRandomDataValidation); - outputParams->explicitDraftTokensOutputs->randomDataValidation = tcc::toTllmTensor(*mRandomDataValidation); + outputParams->packedMasks = tcc::toTllmTensor(*mPackedMasks); - outputParams->explicitDraftTokensOutputs->packedMasks = tcc::toTllmTensor(*mPackedMasks); + outputParams->packedMasks = tcc::toTllmTensor(*mPackedMasks); - outputParams->explicitDraftTokensOutputs->packedMasks = tcc::toTllmTensor(*mPackedMasks); + outputParams->unpackedNextDraftTokens = tcc::toTllmTensor(*mOutputUnpackedNextDraftTokens); - outputParams->explicitDraftTokensOutputs->unpackedNextDraftTokens - = tcc::toTllmTensor(*mOutputUnpackedNextDraftTokens); + outputParams->unpackedNextDraftIndices = tcc::toTllmTensor(*mOutputUnpackedNextDraftIndices); - outputParams->explicitDraftTokensOutputs->unpackedNextDraftIndices - = tcc::toTllmTensor(*mOutputUnpackedNextDraftIndices); + outputParams->nextDraftProbs = tcc::toTllmTensor(*mOutputDraftProbs); - 
outputParams->explicitDraftTokensOutputs->nextDraftProbs = tcc::toTllmTensor(*mOutputDraftProbs); + outputParams->temperatures = tcc::toTllmTensor(*mOutputTemperatures); - outputParams->explicitDraftTokensOutputs->temperatures = tcc::toTllmTensor(*mOutputTemperatures); + outputParams->generationLengths = tcc::toTllmTensor(*mOutputGenerationLengths); + + outputParams->maxGenLengthHost = tcc::toTllmTensor(*mMaxGenLengthHost); return outputParams; } @@ -1117,7 +1144,8 @@ void ExplicitDraftTokensLayerTest::checkLayerResult() for (SizeType32 ti = 0; ti < generatedLength; ++ti) { auto const idx = tc::flat_index2(batchSlot, ti, mSamplingParams.getMaxDecodingTokens()); - EXPECT_EQ(nextPosIds[idx], packedPosIds[compressedIdx + ti]) << " bi: " << bi << " ti: " << ti; + // Minus -1 to account for context phase correction of pos ids + EXPECT_EQ(nextPosIds[idx], packedPosIds[compressedIdx + ti] - 1) << " bi: " << bi << " ti: " << ti; } compressedIdx += generatedLength; } @@ -1200,7 +1228,7 @@ void ExplicitDraftTokensLayerTest::checkLayerResult() for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi) { auto const batchSlot = batchSlots[bi]; - EXPECT_EQ(BufferRange(*mOutputTemperatures)[batchSlot], static_cast(mTemperatures[bi])) + EXPECT_EQ(BufferRange(*mOutputTemperatures)[batchSlot], static_cast(1.f / mTemperatures[bi])) << " bi: " << bi; } } @@ -1213,24 +1241,35 @@ void ExplicitDraftTokensLayerTest::packData() params.batchSlots = bufferCast(*mBatchSlots); params.cumSumGenerationLengths = bufferCast(*mCumSumGenerationLengths); params.maxGenerationLength = bufferCast(*mMaxGenerationLength); + params.outputPositionIdsBase = bufferCast(*mPackedPositionIdsBase); params.inputPositionIdsBase = bufferCast(*mOutputPositionIdsBase); + params.outputGenerationLengths = bufferCast(*mPackedGenerationLengths); params.inputGenerationLengths = bufferCast(*mSpecDecodingGenerationLengths); + params.outputRandomDataSample = bufferCast(*mPackedRandomDataSample); params.inputRandomDataSample = bufferCast(*mRandomDataSample); + params.outputRandomDataValidation = bufferCast(*mPackedRandomDataVerification); params.inputRandomDataValidation = bufferCast(*mRandomDataValidation); + params.outputNextDraftTokens = bufferCast(*mPackedNextDraftTokens); params.inputNextDraftTokens = bufferCast(*mOutputUnpackedNextDraftTokens); + params.outputNextDraftIndices = bufferCast(*mPackedNextDraftIndices); params.inputNextDraftIndices = bufferCast(*mOutputUnpackedNextDraftIndices); + params.outputPackedMask = bufferCast(*mPackedPackedMasks); params.inputPackedMask = bufferCast(*mPackedMasks); + + params.inputPositionIds = bufferCast(*mNextPosIds); params.outputPositionOffsets = bufferCast(*mPackedPositionOffsets); - params.inputPositionOffsets = bufferCast(*mNextPosIds); + params.outputPositionIds = bufferCast(*mPackedPackedPosIds); + params.outputDraftProbs = bufferCast(*mPackedDraftProbs); params.inputDraftProbs = bufferCast(*mOutputDraftProbs); + params.outputTemperatures = bufferCast(*mPackedTemperatures); params.inputTemperatures = bufferCast(*mOutputTemperatures); @@ -1238,12 +1277,18 @@ void ExplicitDraftTokensLayerTest::packData() params.numPaths = mSamplingParams.getMaxNumPaths(); params.maxPathLength = mSamplingParams.getMaxPathLen(); params.vocabSize = mSamplingParams.getVocabSize(); + params.numGenerationRequests = mSamplingParams.getBatchSize(); + params.numContextTokens = 0; + + params.checkParams(); + + tksd::invokePackGenerationLengths(params, mStream->get()); // Compute inclusive sum - auto 
reduceTempStorageBytes = tksd::invokeScanSpecDecodingGenerationLengths( + auto reduceTempStorageBytes = tksd::invokeScanGenerationLengths( nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get()); auto reduceMaxTempStorage = mBufferManager->gpu(reduceTempStorageBytes); - tksd::invokeScanSpecDecodingGenerationLengths(bufferCast(*reduceMaxTempStorage), reduceTempStorageBytes, + tksd::invokeScanGenerationLengths(bufferCast(*reduceMaxTempStorage), reduceTempStorageBytes, bufferCast(*mSpecDecodingGenerationLengths), bufferCast(*mCumSumGenerationLengths), mSamplingParams.getBatchSize(), mStream->get()); @@ -1298,12 +1343,13 @@ void ExplicitDraftTokensLayerTest::checkPackResult() << "bi: " << bi << " pi: " << pi << " ti: " << ti; } } + auto const basePosId = BufferRange(*mPackedPositionIdsBase)[bi]; for (SizeType32 ti = 0; ti < maxGenLength; ++ti) { auto const outPosOffsetIdx = tc::flat_index2(bi, ti, maxGenLength); auto const inPosOffsetIdx = tc::flat_index2(batchSlot, ti, mSamplingParams.getMaxDecodingTokens()); EXPECT_EQ(BufferRange(*mPackedPositionOffsets)[outPosOffsetIdx], - BufferRange(*mNextPosIds)[inPosOffsetIdx]) + BufferRange(*mNextPosIds)[inPosOffsetIdx] - basePosId + 1) << "bi: " << bi << " ti: " << ti; } auto const outputMaskStartId = (bi == 0) ? 0 : BufferRange(*mCumSumGenerationLengths)[bi - 1]; diff --git a/cpp/tests/layers/explicitDraftTokensLayerTest.h b/cpp/tests/layers/explicitDraftTokensLayerTest.h index 52bf69ef1..ef4560c62 100644 --- a/cpp/tests/layers/explicitDraftTokensLayerTest.h +++ b/cpp/tests/layers/explicitDraftTokensLayerTest.h @@ -267,10 +267,13 @@ class ExplicitDraftTokensLayerTest : public testing::Test TensorPtr mPathsOffsets; TensorPtr mNextPosIds; TensorPtr mNextDraftLengths; + TensorPtr mPrevDraftLengths; TensorPtr mOutputUnpackedNextDraftTokens; TensorPtr mOutputUnpackedNextDraftIndices; TensorPtr mOutputDraftProbs; TensorPtr mOutputTemperatures; + TensorPtr mOutputGenerationLengths; + TensorPtr mMaxGenLengthHost; // inputs TensorPtr mBatchSlots; @@ -287,6 +290,8 @@ class ExplicitDraftTokensLayerTest : public testing::Test TensorPtr mTokensPerStep; TensorPtr mNextFlatTokens; TensorPtr mInputPositionIdsBase; + TensorPtr mEndIds; + TensorPtr mMaxGenLengthDevice; // Packed inputs TensorPtr mMaxGenerationLength; @@ -301,6 +306,7 @@ class ExplicitDraftTokensLayerTest : public testing::Test TensorPtr mPackedNextDraftIndices; TensorPtr mPackedPackedMasks; TensorPtr mPackedPositionOffsets; + TensorPtr mPackedPackedPosIds; TensorPtr mPackedDraftProbs; TensorPtr mPackedTemperatures; @@ -320,9 +326,9 @@ class ExplicitDraftTokensLayerTest : public testing::Test void setup(); - std::shared_ptr createInputTensors(); + std::shared_ptr createInputTensors(); - std::shared_ptr createOutputTensors(); + std::shared_ptr createOutputTensors(); void checkLayerResult(); diff --git a/cpp/tests/layers/lookaheadAlgorithmTest.cpp b/cpp/tests/layers/lookaheadAlgorithmTest.cpp index 1686fcb44..d3075c8f5 100644 --- a/cpp/tests/layers/lookaheadAlgorithmTest.cpp +++ b/cpp/tests/layers/lookaheadAlgorithmTest.cpp @@ -14,9 +14,14 @@ * limitations under the License. 
*/ #include +#include +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/layers/lookaheadAlgorithm.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/lookaheadModule.h" #include "tests/layers/randomLlm.h" namespace tensorrt_llm::tests::layers @@ -45,11 +50,20 @@ bool verifyAcceptOffsets(TensorPtr output, TensorPtr accepted, TensorPtr accepte TEST_P(LookaheadAlgorithmTest, predict) { + srand(42); auto [Ww, Nn, Gg] = GetParam(); auto [W, w] = Ww; auto [N, n] = Nn; auto [G, g] = Gg; + if (!executor::LookaheadDecodingConfig::isLegal(W, N, G) || !executor::LookaheadDecodingConfig::isLegal(w, n, g)) + { + TLLM_LOG_DEBUG("Just Pass for illegal parameter combination"); + GTEST_SKIP() << "Algorithm does not support these parameters WNG=(" << W << ", " << N << ", " << G << "), wng=(" + << w << ", " << n << ", " << g; + } + TLLM_LOG_DEBUG("Test Parameters: WNG=(%d, %d, %d), wng=(%d, %d, %d)", W, N, G, w, n, g); + auto ascii = std::make_shared(); std::string oracle( @@ -63,8 +77,13 @@ TEST_P(LookaheadAlgorithmTest, predict) auto promptLen = ITensor::volume(prompt->getShape()); auto maxSeqLen = 1024; - auto maxDraftLen = (W + G) * (N - 1) - 1; - auto shape = ITensor::makeShape({1 + maxDraftLen}); + SizeType32 maxTokensPerStep, maxDraftLen; + SizeType32 maxDraftLenRuntime; + std::tie(maxTokensPerStep, std::ignore, maxDraftLen, std::ignore) + = executor::LookaheadDecodingConfig(W, N, G).calculateSpeculativeResource(); + std::tie(std::ignore, std::ignore, maxDraftLenRuntime, std::ignore) + = executor::LookaheadDecodingConfig(w, n, g).calculateSpeculativeResource(); + auto shape = ITensor::makeShape({maxTokensPerStep}); auto shapeSingle = ITensor::makeShape({1}); TensorPtr posidMax = BufferManager::cpu(shape, nvinfer1::DataType::kINT32); TensorPtr smaskMax = BufferManager::cpu(shape, nvinfer1::DataType::kBOOL); @@ -105,12 +124,12 @@ TEST_P(LookaheadAlgorithmTest, predict) TLLM_LOG_DEBUG("\noracle[%d] = '%c'", sequenceLength - 1, static_cast(sequenceRange[sequenceLength - 1])); bufferCast(*posidMax)[0] = sequenceLength - 1; bufferCast(*smaskMax)[0] = true; - algo.prepare( // - ITensor::slice(sequence, sequenceLength, maxDraftLen), // - ITensor::slice(posidMax, 1, maxDraftLen), // - ITensor::slice(smaskMax, 1, maxDraftLen), // - inputLengthPtr, // - sequenceLengthPtr, // + algo.prepare( // + ITensor::slice(sequence, sequenceLength, maxDraftLenRuntime), // + ITensor::slice(posidMax, 1, maxDraftLenRuntime), // + ITensor::slice(smaskMax, 1, maxDraftLenRuntime), // + inputLengthPtr, // + sequenceLengthPtr, // ITensor::slice(sequence, sequenceLength - 1, 1)); TensorPtr input = ITensor::slice(sequence, sequenceLength - 1, inputLength + 1); @@ -128,7 +147,7 @@ TEST_P(LookaheadAlgorithmTest, predict) // algo.update(acceptedMax, acceptedOffsetsMax, acceptedLengthPtr, output, endIdPtr); algo.update( - ITensor::slice(sequence, sequenceLength, N), acceptedOffsetsMax, acceptedLengthPtr, output, endIdPtr); + ITensor::slice(sequence, sequenceLength, n), acceptedOffsetsMax, acceptedLengthPtr, output, endIdPtr); TensorPtr accepted = ITensor::slice(sequence, sequenceLength, acceptedLength); TensorPtr acceptedOffsets = ITensor::slice(acceptedOffsetsMax, 0, acceptedLength); @@ -154,18 +173,38 @@ TEST_P(LookaheadAlgorithmTest, predict) INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTest, LookaheadAlgorithmTest, testing::Combine( // testing::Values(std::make_tuple(1, 1), std::make_tuple(3, 
3), std::make_tuple(5, 5), std::make_tuple(7, 7), - std::make_tuple(3, 2), std::make_tuple(5, 3), std::make_tuple(7, 4)), - testing::Values(std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(7, 7), std::make_tuple(3, 2), - std::make_tuple(5, 3), std::make_tuple(7, 4)), - testing::Values(std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(7, 7), std::make_tuple(3, 2), - std::make_tuple(5, 3), std::make_tuple(7, 4)))); + std::make_tuple(2, 1), std::make_tuple(3, 2), std::make_tuple(5, 3), std::make_tuple(7, 4)), + testing::Values(std::make_tuple(1, 1), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(7, 7), + std::make_tuple(2, 1), std::make_tuple(3, 2), std::make_tuple(5, 3), std::make_tuple(7, 4)), + testing::Values(std::make_tuple(0, 0), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(7, 7), + std::make_tuple(1, 0), std::make_tuple(3, 2), std::make_tuple(5, 3), std::make_tuple(7, 4)))); INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSingleMax, LookaheadAlgorithmTest, testing::Combine(testing::Values(std::make_tuple(5, 5)), testing::Values(std::make_tuple(5, 5)), testing::Values(std::make_tuple(5, 5)))); INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSingleDynamic, LookaheadAlgorithmTest, - testing::Combine(testing::Values(std::make_tuple(3, 2)), testing::Values(std::make_tuple(3, 2)), - testing::Values(std::make_tuple(3, 2)))); + testing::Combine(testing::Values(std::make_tuple(1, 1)), testing::Values(std::make_tuple(2, 1)), + testing::Values(std::make_tuple(1, 0)))); + +INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSmallest_110, LookaheadAlgorithmTest, + testing::Combine(testing::Values(std::make_tuple(1, 1)), testing::Values(std::make_tuple(1, 1)), + testing::Values(std::make_tuple(0, 0)))); + +INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSmall_120, LookaheadAlgorithmTest, + testing::Combine(testing::Values(std::make_tuple(1, 1)), testing::Values(std::make_tuple(2, 2)), + testing::Values(std::make_tuple(0, 0)))); + +INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSmall_220, LookaheadAlgorithmTest, + testing::Combine(testing::Values(std::make_tuple(2, 2)), testing::Values(std::make_tuple(2, 2)), + testing::Values(std::make_tuple(0, 0)))); + +INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSmall_121, LookaheadAlgorithmTest, + testing::Combine(testing::Values(std::make_tuple(1, 1)), testing::Values(std::make_tuple(2, 2)), + testing::Values(std::make_tuple(1, 1)))); + +INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSmall_222, LookaheadAlgorithmTest, + testing::Combine(testing::Values(std::make_tuple(2, 2)), testing::Values(std::make_tuple(2, 2)), + testing::Values(std::make_tuple(2, 2)))); } // namespace tensorrt_llm::tests::layers diff --git a/cpp/tests/layers/lookaheadDecodingLayerTest.cpp b/cpp/tests/layers/lookaheadDecodingLayerTest.cpp new file mode 100644 index 000000000..42e29252b --- /dev/null +++ b/cpp/tests/layers/lookaheadDecodingLayerTest.cpp @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/common/tensorConversion.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/layers/lookaheadDecodingLayer.h" +#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iBuffer.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/lookaheadModule.h" +#include "tensorrt_llm/runtime/modelConfig.h" +#include "tensorrt_llm/runtime/request.h" +#include "tensorrt_llm/runtime/runtimeKernels.h" +#include "tests/layers/randomLlm.h" + +namespace tensorrt_llm::tests::layers +{ +using namespace tensorrt_llm::runtime; +using namespace tensorrt_llm::layers; + +namespace tk = tensorrt_llm::kernels; +namespace tcc = tensorrt_llm::common::conversion; +namespace trk = tensorrt_llm::runtime::kernels; + +using TensorPtr = runtime::ITensor::SharedPtr; +using TensorConstPtr = runtime::ITensor::SharedConstPtr; + +struct TestParam +{ + SizeType32 maxBatchSize; + + enum BatchType + { + SINGLE_ONCE, + SINGLE_TWICE, + DYNAMIC + } batchType; + + SizeType32 maxW; + SizeType32 w; + SizeType32 maxN; + SizeType32 n; + SizeType32 maxG; + SizeType32 g; +}; + +class BatchSlotsManager +{ +public: + BatchSlotsManager(SizeType32 maxBatchSize, SizeType32 cases) + : mMaxBatchSize(maxBatchSize) + , mCases(cases) + { + } + + virtual std::vector alloc(void) = 0; + virtual void free(SizeType32 id) = 0; + + bool finished() + { + return mCases == 0; + } + +protected: + SizeType32 quota(void) + { + return mCases - mRunning; + } + + void consume(SizeType32 cases) + { + TLLM_CHECK(cases >= 0); + TLLM_CHECK_DEBUG_WITH_INFO(cases <= mCases, "cases=%d, mCases=%d", cases, mCases); + mRunning -= cases; + mCases -= cases; + } + +protected: + SizeType32 mMaxBatchSize{0}; + SizeType32 mCases{0}; + SizeType32 mRunning{0}; +}; + +class SingleBatchSlotsManager : public BatchSlotsManager +{ +public: + SingleBatchSlotsManager(SizeType32 maxBatchSize, SizeType32 cases, SizeType32 id) + : BatchSlotsManager(maxBatchSize, cases) + , mId(id) + { + TLLM_CHECK(id < maxBatchSize); + } + + virtual std::vector alloc(void) + { + if (mState == FREE && quota() > 0) + { + mState = BUSY; + mRunning += 1; + return std::vector({mId}); + } + else + { + return std::vector(); + } + } + + virtual void free(SizeType32 id) + { + TLLM_CHECK(id == mId); + mState = FREE; + consume(1); + } + +private: + enum + { + FREE, + BUSY + } mState{FREE}; + + SizeType32 mId; +}; + +class DynamicBatchSlotsManager : public BatchSlotsManager +{ +public: + DynamicBatchSlotsManager(SizeType32 maxBatchSize, SizeType32 cases) + : BatchSlotsManager(maxBatchSize, cases) + { + for (SizeType32 bi = 0; bi * 3 + 2 < maxBatchSize; bi++) + { + mFreeList.push(bi * 3 + 1); + mFreeList.push(bi * 3 + 2); + mFreeList.push(bi * 3); + } + } + + virtual std::vector alloc() + { + SizeType32 waterline = mMaxBatchSize / 4; + SizeType32 plan = mBusySet.size() < waterline ? 
rand() % (mMaxBatchSize / 4) : 0; + SizeType32 num = std::min(plan, quota()); + std::vector result; + for (SizeType32 i = 0; i < num && !mFreeList.empty(); i++) + { + SizeType32 id = mFreeList.front(); + result.push_back(id); + mBusySet.insert(id); + mFreeList.pop(); + } + mRunning += result.size(); + return result; + } + + virtual void free(SizeType32 id) + { + auto search = mBusySet.find(id); + TLLM_CHECK(search != mBusySet.end()); + mBusySet.erase(search); + mFreeList.push(id); + consume(1); + } + +private: + std::queue mFreeList; + std::set mBusySet; +}; + +class LookaheadDecodingLayerTest : public testing::Test +{ +public: + void SetUp() override; + void TearDown() override; + void runTest(TestParam const& param); + +private: + void allocateBuffers(); + + void setupBuffers(); + + void newRequests(std::vector requestIds); + + void manageBatch(void); + + void llmForward(void); + + void decodeForward(void); + + void verifyDecode(void); + +protected: + std::shared_ptr mBufferManager; + std::shared_ptr mStream; + + struct cudaDeviceProp mDeviceProp; + + TensorPtr mAlgoConfigBatch; + + TensorPtr mFinished; + TensorPtr mOutputIds; + TensorPtr mSequenceLengths; + TensorPtr mProbs; + TensorPtr mEndIds; + TensorPtr mTokensPerStep; + TensorPtr mGoldenSampledTokens; + TensorPtr mBatchSlots; + TensorPtr mBatchSlotsMax; + + TensorPtr mNumNewTokens; + TensorPtr mKNumNewTokensCumSum; + TensorPtr mPathsOffsets; + TensorPtr mDraftLengths; + TensorPtr mDraftTokens; + TensorPtr mDraftPosIds; + TensorPtr mPackedMasks; + TensorPtr mPackedMasksBool; + + TensorPtr mInputTokensBatch; + TensorPtr mPositionIdsBatch; + + int32_t mMaxTopK = 1; + static constexpr int32_t mMaxSeqLen = 512; + float mMaxTopP = 1.0; + std::shared_ptr mAscii; + std::vector mOracle; + std::vector mPrompt; + std::vector> mLlm; + std::shared_ptr> mDecoder; + SizeType32 mVocabSize; + SizeType32 mMaxTokensPerStep; + TestParam mTestParam; + std::shared_ptr mBatchSlotsManager; + std::vector mScoreBoard; + std::vector mHistogram; + std::list mReports; +}; + +void LookaheadDecodingLayerTest::SetUp() +{ + mStream = std::make_shared(); + mBufferManager = std::make_shared(mStream); + + int32_t device; + cudaGetDevice(&device); + cudaGetDeviceProperties(&mDeviceProp, device); + + mAscii = std::make_shared(); + mVocabSize = mAscii->getVocabSize(); +} + +void LookaheadDecodingLayerTest::TearDown() {} + +void LookaheadDecodingLayerTest::allocateBuffers() +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto const maxBatchSize = mTestParam.maxBatchSize; + auto const vocabSize = mAscii->getVocabSize(); + + SizeType32 maxNumNewTokens, maxDraftLen; + std::tie(mMaxTokensPerStep, maxNumNewTokens, maxDraftLen, std::ignore) + = executor::LookaheadDecodingConfig(mTestParam.maxW, mTestParam.maxN, mTestParam.maxG) + .calculateSpeculativeResource(); + // mMaxTokensPerStep = maxTokensPerStep; + + auto const vocabSizePadded = vocabSize; + auto const maxNumHeads = 1; + std::ostringstream buf; + + std::vector text({// + std::string("To be, or not to be: that is the question. " + "To Be, Or Not To Be: That Is The Question.&"), + std::string("Be not afraid of greatness. Some are born great, some achieve greatness, and others have " + "greatness thrust upon them. " + "Be Not Afraid Of Greatness. Some Are Born Great, Some Achieve Greatness, And Others Have " + "Greatness Thrust Upon Them.&"), + std::string("Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious " + "jewel in his head. 
" + "Sweet Are the Uses Of Adversity Which, Like The Toad, Ugly And Venomous, Wears Yet A Precious " + "Jewel In His Head.&"), + std::string("Talking isn't doing. It is a kind of good deed to say well; and yet words are not deeds. " + "Talking Isn't Doing. It Is A Kind Of Good Deed To Say Well; And Yet Words Are Not Deeds.&"), + std::string( + "Reputation is an idle and most false imposition; oft got without merit, and lost without deserving. " + "Reputation Is An Idle And Most False Imposition; Oft Got Without Merit, And Lost Without Deserving.&")}); + + mOracle.resize(maxBatchSize); + mLlm.resize(maxBatchSize); + mPrompt.resize(maxBatchSize); + mScoreBoard.resize(maxBatchSize); + mHistogram.resize(maxBatchSize); + for (SizeType32 gbi = 0; gbi < maxBatchSize; gbi++) + { + mOracle[gbi] = text[rand() % text.size()]; + mLlm[gbi] = std::make_shared(mAscii, mOracle[gbi], gbi); + + mScoreBoard[gbi] = std::ostringstream(); + mHistogram[gbi] = BufferManager::cpu(ITensor::makeShape({mTestParam.n + 1}), nvinfer1::DataType::kINT32); + } + switch (mTestParam.batchType) + { + case TestParam::SINGLE_ONCE: + mBatchSlotsManager = std::make_shared(maxBatchSize, 1, 1); + break; + case TestParam::SINGLE_TWICE: + mBatchSlotsManager = std::make_shared(maxBatchSize, 2, 1); + break; + case TestParam::DYNAMIC: + mBatchSlotsManager = std::make_shared(maxBatchSize, maxBatchSize * 2); + break; + } + + auto lookaheadModule = std::make_shared(mTestParam.maxN, mMaxTokensPerStep - 1); + + lookaheadModule->setExecutionConfig( + executor::LookaheadDecodingConfig(mTestParam.maxW, mTestParam.maxN, mTestParam.maxG)); + auto const decodingDomain + = tensorrt_llm::layers::DecoderDomain(maxBatchSize, 1, vocabSize, vocabSizePadded, lookaheadModule); + + mDecoder = std::make_shared>(decodingDomain, mBufferManager); + + TLLM_LOG_DEBUG("decoder ok"); + + auto maxBatchShape1D = ITensor::makeShape({maxBatchSize}); + + mAlgoConfigBatch = BufferManager::pinned(ITensor::makeShape({maxBatchSize, 3}), nvinfer1::DataType::kINT32); + + mFinished = BufferManager::pinned(maxBatchShape1D, TRTDataType::value); + mEndIds = BufferManager::pinned(maxBatchShape1D, nvinfer1::DataType::kINT32); + mTokensPerStep = BufferManager::pinned(maxBatchShape1D, nvinfer1::DataType::kINT32); + + mOutputIds = BufferManager::pinned( + ITensor::makeShape({maxBatchSize, mMaxSeqLen + mMaxTokensPerStep}), nvinfer1::DataType::kINT32); + mSequenceLengths = BufferManager::pinned(maxBatchShape1D, nvinfer1::DataType::kINT32); + + mProbs = BufferManager::pinned( + ITensor::makeShape({maxBatchSize, mMaxTokensPerStep, vocabSize}), nvinfer1::DataType::kFLOAT); + + mGoldenSampledTokens + = BufferManager::cpu(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32); + mInputTokensBatch + = BufferManager::pinned(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32); + mPositionIdsBatch + = BufferManager::pinned(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32); + + mNumNewTokens = BufferManager::pinned(maxBatchShape1D, nvinfer1::DataType::kINT32); + mDraftLengths = BufferManager::pinned(maxBatchShape1D, nvinfer1::DataType::kINT32); + mDraftTokens = BufferManager::pinned(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); + mDraftPosIds = BufferManager::pinned(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); + auto divUp32 = [](SizeType32 x) { return x / 32 + ((x % 32) ? 
1 : 0); }; + mPackedMasks = BufferManager::pinned( + ITensor::makeShape({maxBatchSize, mMaxTokensPerStep, divUp32(mMaxTokensPerStep)}), nvinfer1::DataType::kINT32); + mPackedMasksBool = BufferManager::pinned( + ITensor::makeShape({maxBatchSize, mMaxTokensPerStep, mMaxTokensPerStep}), nvinfer1::DataType::kBOOL); + mKNumNewTokensCumSum = BufferManager::pinned(ITensor::makeShape({maxBatchSize + 1}), nvinfer1::DataType::kINT32); + mPathsOffsets + = BufferManager::pinned(ITensor::makeShape({maxBatchSize, maxNumNewTokens}), nvinfer1::DataType::kINT32); + + mBatchSlotsMax = BufferManager::pinned(maxBatchShape1D, nvinfer1::DataType::kINT32); + + auto const batchSize = 0; + auto batchShape1D = ITensor::makeShape({batchSize}); + auto batchShape2D = ITensor::makeShape({batchSize, mMaxTokensPerStep}); + + mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, batchSize); + + trk::invokeFill(*mFinished, uint8_t{0}, *mStream); + trk::invokeFill(*mEndIds, mAscii->getEndToken(), *mStream); + trk::invokeFill(*mOutputIds, int32_t{0}, *mStream); + trk::invokeFill(*mSequenceLengths, int32_t{0}, *mStream); + // trk::invokeFill(*mGeneratedLengths, int32_t{0}, *mStream); + trk::invokeFill(*mTokensPerStep, mMaxTokensPerStep, *mStream); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void LookaheadDecodingLayerTest::setupBuffers() {} + +void LookaheadDecodingLayerTest::newRequests(std::vector requestIds) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto const requestSize = requestIds.size(); + + auto const beamSize = 1; + SizeType32 vocabSize = mAscii->getVocabSize(); + + //////////////////////////////// + for (auto gbi : requestIds) + { + auto len = 5 + rand() % 10; + auto prompt = mOracle[gbi].substr(0, len); + + TokenIdType contextToken = mOracle[gbi][len]; + SizeType32 contextLen = len + 1; + + BufferRange outputRange(*ITensor::at(mOutputIds, {gbi})); + for (auto& v : outputRange) + { + v = 0; + } + std::copy(prompt.begin(), prompt.end(), outputRange.begin()); + outputRange[len] = contextToken; + BufferLocation(*mSequenceLengths).at(gbi) = len + 1; + BufferLocation(*mDraftLengths).at(gbi) = 0; + BufferLocation(*mNumNewTokens).at(gbi) = 0; + + mPrompt[gbi] = ITensor::slice(mOutputIds, {gbi, 0}, len + 1); + + for (auto& v : BufferRange(*mHistogram[gbi])) + { + v = 0; + } + mScoreBoard[gbi] << "request id=[" << gbi << "] starts. 
prompt len=[" << len << "]."; + } + + TLLM_LOG_DEBUG("batch slots"); + //////////////////////////////// + auto batchSize = ITensor::volume(mBatchSlots->getShape()); + BufferRange batchSlotMaxRange(*mBatchSlotsMax); + std::copy(requestIds.begin(), requestIds.end(), batchSlotMaxRange.begin() + batchSize); + + //////////////////////////////// + auto setupParams = std::make_shared(); + setupParams->prompt.resize(0); + setupParams->algoConfigs.resize(0); + for (SizeType32 bi = 0; bi < requestSize; bi++) + { + SizeType32 gbi = requestIds[bi]; + setupParams->prompt.emplace_back(mPrompt[gbi]); + setupParams->algoConfigs.emplace_back(mTestParam.w, mTestParam.n, mTestParam.g); + PRINT_TOKENS(setupParams->prompt[bi]); + } + std::vector seed(requestIds.begin(), requestIds.end()); + setupParams->randomSeed = std::make_optional(seed); + TensorPtr newRequestSlots = ITensor::slice(mBatchSlotsMax, batchSize, requestSize); + PRINT_VALUES(newRequestSlots); + PRINT_VALUES(mBatchSlotsMax); + mDecoder->setup(requestSize, beamSize, bufferCast(*newRequestSlots), setupParams); + + batchSize += requestIds.size(); + mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, batchSize); + TLLM_LOG_DEBUG("newwRequests mBatchSlots %s", D(mBatchSlots).values().c_str()); + PRINT_VALUES(mSequenceLengths); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void LookaheadDecodingLayerTest::manageBatch(void) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto const maxBatchSize = mTestParam.maxBatchSize; + auto requests = mBatchSlotsManager->alloc(); + if (requests.size() > 0) + { + newRequests(requests); + } + PRINT_VALUES(mSequenceLengths); + + auto batchSize = ITensor::volume(mBatchSlots->getShape()); + BufferRange batchSlotsRange(*mBatchSlots); + auto batchShape1D = ITensor::makeShape({batchSize}); + auto batchShape2D = ITensor::makeShape({batchSize, mMaxTokensPerStep}); + auto newBatchSize = 0; + PRINT_VALUES(mBatchSlots); + for (SizeType32 bi = 0; bi < batchSize; bi++) + { + SizeType32 gbi = batchSlotsRange[bi]; + SizeType32 nbi = newBatchSize; + + TensorPtr theSequence = ITensor::at(mOutputIds, {gbi}); + BufferRange theSequenceRange(*theSequence); + auto theSequenceLength = BufferRange(*mSequenceLengths)[gbi]; + auto theNumNewTokens = BufferRange(*mNumNewTokens)[gbi]; + + TensorPtr generated = ITensor::slice(theSequence, 0, theSequenceLength); + + PRINT_TOKENS(generated); + EXPECT_TRUE(mLlm[gbi]->verify(0, generated)); + + BufferRange(*mHistogram[gbi])[theNumNewTokens] += 1; + + if (BufferLocation(*theSequence).at(theSequenceLength - 1) == mAscii->getEndToken()) + { + TLLM_LOG_DEBUG("request[%d] ends: '%s'", gbi, D(theSequence).string().c_str()); + mScoreBoard[gbi] << "[" << gbi << "] ends. 
" << D(mHistogram[gbi]).values(); + mReports.push_back(mScoreBoard[gbi].str()); + mScoreBoard[gbi].str(""); + mScoreBoard[gbi].clear(); + mBatchSlotsManager->free(gbi); + } + else + { + batchSlotsRange[newBatchSize++] = gbi; + } + + auto theDraftLen = BufferRange(*mDraftLengths)[gbi]; + BufferLocation(*mTokensPerStep).at(gbi) = 1 + theDraftLen; + + BufferLocation(*mPositionIdsBatch).at(nbi, 0) = theSequenceLength - 1; + BufferLocation(*mInputTokensBatch).at(nbi, 0) = theSequenceRange[theSequenceLength - 1]; + + TLLM_LOG_DEBUG("W=%d, N=%d, G=%d, w=%d, n=%d, g=%d, draftLen = %d", mTestParam.maxW, mTestParam.maxN, + mTestParam.maxG, mTestParam.w, mTestParam.n, mTestParam.g, theDraftLen); + PRINT_VALUES(mInputTokensBatch); + + mBufferManager->copy(*ITensor::slice(mDraftTokens, {gbi, 0}, theDraftLen), + *ITensor::slice(mInputTokensBatch, {nbi, 1}, theDraftLen)); + mBufferManager->copy(*ITensor::slice(mDraftPosIds, {gbi, 0}, theDraftLen), + *ITensor::slice(mPositionIdsBatch, {nbi, 1}, theDraftLen)); + + TLLM_LOG_DEBUG("W=%d, N=%d, G=%d, w=%d, n=%d, g=%d, draftLen = %d", mTestParam.maxW, mTestParam.maxN, + mTestParam.maxG, mTestParam.w, mTestParam.n, mTestParam.g, theDraftLen); + + auto len = BufferRange(*mTokensPerStep)[gbi]; + PRINT_TOKENS(ITensor::slice(mInputTokensBatch, {nbi, 0}, len)); + PRINT_VALUES(ITensor::slice(mPositionIdsBatch, {nbi, 0}, len)); + } + mBatchSlots = ITensor::slice(mBatchSlotsMax, 0, newBatchSize); + PRINT_VALUES(mBatchSlots); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void convertInt32ToBool(TensorPtr const& dst, TensorConstPtr const& src) +{ + auto dstShape = dst->getShape(); + auto srcShape = src->getShape(); + TLLM_CHECK(dstShape.d[0] == srcShape.d[0]); + TLLM_CHECK(dstShape.d[1] <= srcShape.d[1] * 32); + BufferLocation dstLocation(*dst); + BufferLocation srcLocation(*src); + auto testBit = [](SizeType32 x, SizeType32 idx) { return x & (1 << idx); }; + for (auto i = 0; i < dstShape.d[0]; i++) + { + for (auto j = 0; j < dstShape.d[1]; j++) + { + dstLocation.at(i, j) = testBit(srcLocation.at(i, j / 32), j % 32); + } + } +} + +void convertBoolToInt32(TensorPtr const& dst, TensorConstPtr const& src) +{ + auto dstShape = dst->getShape(); + auto srcShape = src->getShape(); + TLLM_CHECK(dstShape.d[0] == srcShape.d[0]); + TLLM_CHECK(dstShape.d[1] * 32 >= srcShape.d[1]); + BufferLocation dstLocation(*dst); + BufferLocation srcLocation(*src); + + for (auto i = 0; i < dstLocation.size(); i++) + { + dstLocation[i] = 0; + } + + auto setBit = [](SizeType32& x, SizeType32 idx, bool value) { x |= (value << idx); }; + for (auto i = 0; i < srcShape.d[0]; i++) + { + for (auto j = 0; j < srcShape.d[1]; j++) + { + setBit(dstLocation.at(i, j / 32), j % 32, srcLocation.at(i, j)); + } + } +} + +void LookaheadDecodingLayerTest::llmForward(void) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto batchSize = ITensor::volume(mBatchSlots->getShape()); + PRINT_VALUES(mBatchSlots); + + for (SizeType32 bi = 0; bi < batchSize; bi++) + { + auto gbi = BufferRange(*mBatchSlots)[bi]; + auto len = BufferRange(*mTokensPerStep)[gbi]; + TensorPtr output = ITensor::slice(mProbs, {bi, 0}, len); + TensorPtr golden = ITensor::slice(mGoldenSampledTokens, {gbi, 0}, len); + + convertInt32ToBool(ITensor::at(mPackedMasksBool, {gbi}), ITensor::at(mPackedMasks, {gbi})); + + mLlm[gbi]->forward(output, // + ITensor::slice(mInputTokensBatch, {bi, 0}, len), // + ITensor::slice(mPositionIdsBatch, {bi, 0}, len), // + ITensor::at(mPackedMasksBool, {gbi})); + + mAscii->logitsToTensor(golden, 
output); + TLLM_LOG_DEBUG("batch[%d] LLM golden: '%s'", gbi, D(golden).tokens().c_str()); + } + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void LookaheadDecodingLayerTest::decodeForward(void) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto batchSize = ITensor::volume(mBatchSlots->getShape()); + PRINT_VALUES(mBatchSlots); + + auto inputParams = std::make_shared(tcc::toTllmTensor(*mEndIds)); + inputParams->localBatchSize = batchSize; + inputParams->logits = tcc::toTllmTensor(*ITensor::slice(mProbs, 0, batchSize)); + inputParams->batchSlots = tcc::toTllmTensor(*mBatchSlots); + inputParams->finished = tcc::toTllmTensor(*mFinished); // TODO(liweim) ask finished protocol + inputParams->curTokensPerStep = tcc::toTllmTensor(*mTokensPerStep); + + auto outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIds)); + + PRINT_VALUES(mSequenceLengths); + outputParams->sequenceLength = tcc::toTllmTensor(*mSequenceLengths); + outputParams->finished = tcc::toTllmTensor(*mFinished); + outputParams->nextDraftLengths = tcc::toTllmTensor(*mDraftLengths); + outputParams->nextDraftTokens = tcc::toTllmTensor(*mDraftTokens); + outputParams->nextDraftPosIds = tcc::toTllmTensor(*mDraftPosIds); + outputParams->packedMasks = tcc::toTllmTensor(*mPackedMasks); + outputParams->numNewTokens = tcc::toTllmTensor(*mNumNewTokens); + outputParams->numNewTokensCumSum = tcc::toTllmTensor(*mKNumNewTokensCumSum); + outputParams->pathsOffsets = tcc::toTllmTensor(*mPathsOffsets); + + PRINT_VALUES(mTokensPerStep); + + mDecoder->forwardAsync(outputParams, inputParams); + + mStream->synchronize(); + + mDecoder->forwardSync(outputParams, inputParams); + + mStream->synchronize(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void LookaheadDecodingLayerTest::verifyDecode(void) +{ + auto batchSize = ITensor::volume(mBatchSlots->getShape()); + for (SizeType32 bi = 0; bi < batchSize; bi++) + { + auto gbi = BufferRange(*mBatchSlots)[bi]; + auto len = BufferRange(*mTokensPerStep)[gbi]; + TensorPtr golden = ITensor::slice(mGoldenSampledTokens, {gbi, 0}, len); + auto sequenceLength = BufferLocation(*mSequenceLengths).at(gbi); + auto numNewTokens = BufferLocation(*mNumNewTokens).at(gbi); + TensorPtr newTokens = ITensor::slice(mOutputIds, {gbi, sequenceLength - numNewTokens}, numNewTokens); + TensorPtr pathOffsets = ITensor::slice(mPathsOffsets, {gbi, 0}, numNewTokens); + + BufferRange goldenRange(*golden); + BufferRange newTokensRange(*newTokens); + BufferRange offsetsRange(*pathOffsets); + for (SizeType32 i = 0; i < newTokensRange.size(); i++) + { + TLLM_CHECK(goldenRange[offsetsRange[i]] == newTokensRange[i]); + } + } + BufferRange cumSumRange(*mKNumNewTokensCumSum); + SizeType32 sum = 0; + TLLM_CHECK(cumSumRange[0] == sum); + for (SizeType32 gbi = 0; gbi < mTestParam.maxBatchSize; gbi++) + { + auto numNewTokens = BufferLocation(*mNumNewTokens).at(gbi); + sum += numNewTokens; + TLLM_CHECK(cumSumRange[gbi + 1] == sum); + } +} + +void LookaheadDecodingLayerTest::runTest(TestParam const& param) +{ + TLLM_LOG_DEBUG("TEST BEGIN: maxBatchSize=%d, mode=%d, WNG=(%d, %d, %d), wng=(%d, %d, %d)", param.maxBatchSize, + param.batchType, param.maxW, param.maxN, param.maxG, param.w, param.n, param.g); + srand(42); + + mTestParam = param; + allocateBuffers(); + + int step = 0; + for (; !mBatchSlotsManager->finished() && step < 3000; step++) + { + TLLM_LOG_DEBUG("!!!!!!!!!!!!!!!! 
< %d > !!!!!!!!!!!!!!!!", step); + manageBatch(); + if (ITensor::volume(mBatchSlots->getShape())) + { + llmForward(); + mStream->synchronize(); + decodeForward(); + verifyDecode(); + } + } + + for (auto& r : mReports) + { + TLLM_LOG_DEBUG(r); + } + if (!mBatchSlotsManager->finished()) + { + TLLM_LOG_INFO("step=%d is not enough", step); + } +} + +TEST_F(LookaheadDecodingLayerTest, singleOnce) +{ + this->runTest(TestParam{16, TestParam::SINGLE_ONCE, 5, 3, 5, 3, 5, 3}); +} + +TEST_F(LookaheadDecodingLayerTest, singleTwice) +{ + this->runTest(TestParam{16, TestParam::SINGLE_TWICE, 7, 5, 7, 5, 7, 5}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamic) +{ + this->runTest(TestParam{16, TestParam::DYNAMIC, 5, 5, 5, 5, 5, 5}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamicLarge) +{ + this->runTest(TestParam{32, TestParam::DYNAMIC, 7, 6, 7, 6, 9, 8}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamicSmall_110) +{ + this->runTest(TestParam{16, TestParam::SINGLE_TWICE, 1, 1, 2, 2, 0, 0}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamicSmall_311) +{ + this->runTest(TestParam{32, TestParam::DYNAMIC, 3, 2, 2, 2, 1, 1}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamicSmall_131) +{ + this->runTest(TestParam{32, TestParam::DYNAMIC, 1, 1, 3, 2, 1, 1}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamicSmall_113) +{ + this->runTest(TestParam{32, TestParam::DYNAMIC, 1, 1, 2, 2, 3, 2}); +} + +TEST_F(LookaheadDecodingLayerTest, dynamicSmall_112110) +{ + this->runTest(TestParam{4, TestParam::SINGLE_TWICE, 1, 1, 2, 1, 1, 0}); +} + +using ParamType = std::tuple, + std::tuple, std::tuple>; + +static int g_id = 0; + +std::string generateTestName(testing::TestParamInfo const& info) +{ + auto [maxBatchSize, mode, Ww, Nn, Gg] = info.param; + auto [W, w] = Ww; + auto [N, n] = Nn; + auto [G, g] = Gg; + std::ostringstream buf; + buf << (g_id++) << "maxBatchSize_" << maxBatchSize << "__mode_" << mode << '_' << '_' << W << '_' << w << '_' << '_' + << N << '_' << n << '_' << '_' << G << '_' << g << '_'; + return buf.str(); +} + +class ParamTest : public LookaheadDecodingLayerTest, public ::testing::WithParamInterface +{ +}; + +TEST_P(ParamTest, Test) +{ + srand(42); + + auto [maxBatchSize, mode, Ww, Nn, Gg] = GetParam(); + auto [W, w] = Ww; + auto [N, n] = Nn; + auto [G, g] = Gg; + if (!executor::LookaheadDecodingConfig::isLegal(W, N, G) || !executor::LookaheadDecodingConfig::isLegal(w, n, g)) + { + TLLM_LOG_DEBUG("Just Pass for illegal parameter combination"); + GTEST_SKIP() << "Algorithm does not support these parameters WNG=(" << W << ", " << N << ", " << G << "), wng=(" + << w << ", " << n << ", " << g << ")"; + } + runTest(TestParam{maxBatchSize, mode, W, w, N, n, G, g}); +} + +INSTANTIATE_TEST_SUITE_P(LookaheadDecodingLayerParamTest, ParamTest, + testing::Combine( // + testing::Values(4, 16), testing::Values(TestParam::DYNAMIC), + testing::Values(std::make_tuple(1, 1), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(2, 1), + std::make_tuple(3, 2), std::make_tuple(5, 3)), + testing::Values(std::make_tuple(1, 1), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(2, 1), + std::make_tuple(3, 2), std::make_tuple(5, 3)), + testing::Values(std::make_tuple(0, 0), std::make_tuple(3, 3), std::make_tuple(5, 5), std::make_tuple(1, 0), + std::make_tuple(3, 2), std::make_tuple(5, 3))), + generateTestName); + +} // namespace tensorrt_llm::tests::layers diff --git a/cpp/tests/layers/medusaDecodeLayerTest.cpp b/cpp/tests/layers/medusaDecodeLayerTest.cpp index 10fa2888f..98a9f8c56 100644 --- 
a/cpp/tests/layers/medusaDecodeLayerTest.cpp +++ b/cpp/tests/layers/medusaDecodeLayerTest.cpp @@ -262,16 +262,17 @@ void MedusaDecodingLayerTest::setup(SamplingParams& params) } template -std::shared_ptr MedusaDecodingLayerTest::createInputTensors() +std::shared_ptr MedusaDecodingLayerTest::createInputTensors() { - auto forwardParams = std::make_shared( - tcc::toTllmTensor(*mTargetLogitsDevice), tcc::toTllmTensor(*mEndIdsDevice)); + auto forwardParams = std::make_shared(tcc::toTllmTensor(*mEndIdsDevice), mBatchSize); auto batchSlots = BufferRange(*mBatchSlots); + forwardParams->logits = tcc::toTllmTensor(*mTargetLogitsDevice); + forwardParams->finished = tcc::toTllmTensor(*mFinishedDevice); - forwardParams->batch_slots = tcc::toTllmTensor(*mBatchSlots); + forwardParams->batchSlots = tcc::toTllmTensor(*mBatchSlots); forwardParams->paths = tcc::toTllmTensor(*mPathsDevice); @@ -295,30 +296,29 @@ std::shared_ptr MedusaDecodingLayerTest::createInputTensor } forwardParams->medusaLogits = medusaLogits; - forwardParams->medusaCurTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); + forwardParams->curTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); - forwardParams->medusaTargetTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); + forwardParams->targetTokensPerStep = tcc::toTllmTensor(*mTokensPerStepDevice); return forwardParams; } template -std::shared_ptr MedusaDecodingLayerTest::createOutputTensors() +std::shared_ptr MedusaDecodingLayerTest::createOutputTensors() { - auto outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); + auto outputParams = std::make_shared(tcc::toTllmTensor(*mOutputIdsDevice)); - outputParams->sequence_length = tcc::toTllmTensor(*mSeqLengthsDevice); + outputParams->sequenceLength = tcc::toTllmTensor(*mSeqLengthsDevice); outputParams->finished = tcc::toTllmTensor(*mFinishedDevice); - outputParams->speculativeDecodingOutputs = DynamicDecodeOutputParams::SpeculativeDecodingOutputs(); - outputParams->speculativeDecodingOutputs->nextDraftTokens = tcc::toTllmTensor(*mNextDraftTokensDevice); + outputParams->nextDraftTokens = tcc::toTllmTensor(*mNextDraftTokensDevice); - outputParams->speculativeDecodingOutputs->acceptedLengths = tcc::toTllmTensor(*mAcceptedLengths); + outputParams->numNewTokens = tcc::toTllmTensor(*mAcceptedLengths); - outputParams->speculativeDecodingOutputs->acceptedLengthsCumSum = tcc::toTllmTensor(*mAcceptedLengthCumSumDevice); + outputParams->numNewTokensCumSum = tcc::toTllmTensor(*mAcceptedLengthCumSumDevice); - outputParams->speculativeDecodingOutputs->pathsOffsets = tcc::toTllmTensor(*mPackedPathsDevice); + outputParams->pathsOffsets = tcc::toTllmTensor(*mPackedPathsDevice); return outputParams; } diff --git a/cpp/tests/layers/medusaDecodeLayerTest.h b/cpp/tests/layers/medusaDecodeLayerTest.h index 615cfbd36..92128d1a2 100644 --- a/cpp/tests/layers/medusaDecodeLayerTest.h +++ b/cpp/tests/layers/medusaDecodeLayerTest.h @@ -104,9 +104,9 @@ class MedusaDecodingLayerTest : public testing::Test void setup(SamplingParams& params); - std::shared_ptr createInputTensors(); + std::shared_ptr createInputTensors(); - std::shared_ptr createOutputTensors(); + std::shared_ptr createOutputTensors(); void checkResult(std::vector>> const& expectedOutTokens, std::vector> const& expectedDraftTokens, std::vector const& finished, diff --git a/cpp/tests/layers/randomLlm.cpp b/cpp/tests/layers/randomLlm.cpp index 45de0c207..f644ae797 100644 --- a/cpp/tests/layers/randomLlm.cpp +++ b/cpp/tests/layers/randomLlm.cpp @@ -14,7 +14,13 @@ * 
limitations under the License. */ #include "tests/layers/randomLlm.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iBuffer.h" +#include "tensorrt_llm/runtime/iTensor.h" namespace tensorrt_llm::tests::layers { @@ -34,20 +40,20 @@ TensorPtr initTensor(std::string str, std::optional shape) return tensor; } -TensorPtr RandomTokenLogits::tokenToLogits(TokenIdType token) const +TensorConstPtr RandomTokenLogits::tokenToLogits(TokenIdType token) const { TensorPtr logits = BufferManager::cpu(mVocabulary->getShape(), nvinfer1::DataType::kFLOAT); tokenToLogits(logits, token); return logits; } -void RandomTokenLogits::tokenToLogits(TensorPtr logits, TokenIdType token) const +void RandomTokenLogits::tokenToLogits(TensorPtr const& logits, TokenIdType token) const { TLLM_CHECK_WITH_INFO(logits->shapeEquals({getVocabSize()}), "%s != {%d}", ITensor::toString(logits->getShape()).c_str(), getVocabSize()); auto logitsRange = BufferRange(*logits); - auto vocabRange = BufferRange(*mVocabulary); + auto vocabRange = BufferRange(*mVocabulary); auto itl = logitsRange.begin(); auto itv = vocabRange.begin(); for (; itl != logitsRange.end() && itv != vocabRange.end(); itl++, itv++) @@ -57,11 +63,11 @@ void RandomTokenLogits::tokenToLogits(TensorPtr logits, TokenIdType token) const } } -TokenIdType RandomTokenLogits::logitsToToken(TensorPtr logits) const +TokenIdType RandomTokenLogits::logitsToToken(TensorConstPtr const& logits) const { TLLM_CHECK(logits->shapeEquals({getVocabSize()})); - auto logitsRange = BufferRange(*logits); - auto vocabRange = BufferRange(*mVocabulary); + auto logitsRange = BufferRange(*logits); + auto vocabRange = BufferRange(*mVocabulary); float max = -FLT_MAX; TokenIdType result; auto itl = logitsRange.begin(); @@ -78,9 +84,9 @@ TokenIdType RandomTokenLogits::logitsToToken(TensorPtr logits) const return result; } -std::list RandomTokenLogits::stringToLogits(std::string tokens) const +std::list RandomTokenLogits::stringToLogits(std::string tokens) const { - std::list result; + std::list result; for (auto& token : tokens) { result.push_back(tokenToLogits(static_cast(token))); @@ -88,7 +94,7 @@ std::list RandomTokenLogits::stringToLogits(std::string tokens) const return result; } -void RandomTokenLogits::stringToLogits(TensorPtr logits, std::string tokens) const +void RandomTokenLogits::stringToLogits(TensorPtr const& logits, std::string tokens) const { TLLM_CHECK(logits->shapeEquals({static_cast(tokens.size()), getVocabSize()})); @@ -99,11 +105,11 @@ void RandomTokenLogits::stringToLogits(TensorPtr logits, std::string tokens) con } } -void RandomTokenLogits::tensorToLogits(TensorPtr logits, TensorPtr tokens) const +void RandomTokenLogits::tensorToLogits(TensorPtr const& logits, TensorConstPtr const& tokens) const { TLLM_CHECK(ITensor::volume(logits->getShape()) == ITensor::volume(tokens->getShape()) * getVocabSize()); // TLLM_CHECK(logits->shapeEquals({static_cast(tokens.size()), getVocabSize()})); - auto tokensRange = BufferRange(*tokens); + auto tokensRange = BufferRange(*tokens); auto i = 0; for (auto it = tokensRange.begin(); it != tokensRange.end(); it++) { @@ -111,7 +117,7 @@ void RandomTokenLogits::tensorToLogits(TensorPtr logits, TensorPtr tokens) const } } -std::string RandomTokenLogits::logitsToString(std::list logits) const +std::string 
RandomTokenLogits::logitsToString(std::list logits) const { std::string result; for (auto& token : logits) @@ -121,7 +127,7 @@ std::string RandomTokenLogits::logitsToString(std::list logits) const return result; } -std::string RandomTokenLogits::logitsToString(TensorPtr logits) const +std::string RandomTokenLogits::logitsToString(TensorConstPtr const& logits) const { auto len = logits->getShape().d[0]; std::string result; @@ -132,15 +138,22 @@ std::string RandomTokenLogits::logitsToString(TensorPtr logits) const return result; } -TensorPtr RandomTokenLogits::logitsToTensor(TensorPtr logits) const +void RandomTokenLogits::logitsToTensor(TensorPtr const& tokens, TensorConstPtr const& logits) const { auto len = logits->getShape().d[0]; - TensorPtr result = BufferManager::cpu(ITensor::makeShape({len}), nvinfer1::DataType::kINT32); - auto resultRange = BufferRange(*result); + TLLM_CHECK(tokens->getShape().d[0] >= len); + auto tokensRange = BufferRange(*tokens); for (auto i = 0; i < len; i++) { - resultRange[i] = logitsToToken(ITensor::at(logits, {i})); + tokensRange[i] = logitsToToken(ITensor::at(logits, {i})); } +} + +TensorConstPtr RandomTokenLogits::logitsToTensor(TensorConstPtr const& logits) const +{ + auto len = logits->getShape().d[0]; + TensorPtr result = BufferManager::cpu(ITensor::makeShape({len}), nvinfer1::DataType::kINT32); + logitsToTensor(result, logits); return result; } @@ -149,22 +162,22 @@ SizeType32 RandomTokenLogits::getVocabSize() const return ITensor::volume(mVocabulary->getShape()); } -TokenIdType RandomTokenLogits::getInvalidToken() const +TokenIdType const RandomTokenLogits::getInvalidToken() const { - return *(BufferRange(*mVocabulary).end() - 1); + return *(BufferRange(*mVocabulary).end() - 1); } -TokenIdType RandomTokenLogits::getEndToken() const +TokenIdType const RandomTokenLogits::getEndToken() const { - return *(BufferRange(*mVocabulary).end() - 2); + return *(BufferRange(*mVocabulary).end() - 2); } -void RandomLlm::sampleByMask(TensorPtr inout, TensorPtr mask) const +void RandomLlm::sampleByMask(TensorPtr const& inout, TensorConstPtr const& mask) const { auto len = ITensor::volume(mask->getShape()); TLLM_CHECK(len == ITensor::volume(mask->getShape())); auto inoutRange = BufferRange(*inout); - auto maskRange = BufferRange(*mask); + auto maskRange = BufferRange(*mask); auto invalid = mTable->getInvalidToken(); for (SizeType32 i = 0; i < len; i++) @@ -176,10 +189,10 @@ void RandomLlm::sampleByMask(TensorPtr inout, TensorPtr mask) const } } -bool RandomLlm::verify(SizeType32 const offset, TensorPtr const script) const +bool RandomLlm::verify(SizeType32 const offset, TensorConstPtr const& script) const { - auto oracleRange = BufferRange(*mOracle); - auto scriptRange = BufferRange(*script); + auto oracleRange = BufferRange(*mOracle); + auto scriptRange = BufferRange(*script); auto len = ITensor::volume(script->getShape()); auto result = std::equal(oracleRange.begin() + offset, oracleRange.begin() + offset + len, scriptRange.begin()); if (!result) @@ -193,53 +206,44 @@ bool RandomLlm::verify(SizeType32 const offset, TensorPtr const script) const return result; } -void RandomLlm::forward(TensorPtr output, TensorPtr const input, TensorPtr const position) const +void RandomLlm::forward(TensorPtr const& output, TensorConstPtr const& input, TensorConstPtr const& position, + TensorConstPtr const mask) const { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(ITensor::volume(input->getShape()) == ITensor::volume(position->getShape())); 
TLLM_CHECK(ITensor::volume(output->getShape()) == ITensor::volume(input->getShape()) * mTable->getVocabSize()); TensorPtr tokens = BufferManager::cpu(input->getShape(), nvinfer1::DataType::kINT32); - foretell(tokens, input, position); - if (mId == 4) - { - TLLM_LOG_DEBUG("batch[%d] DEBUG", mId); - PRINT_TOKENS(tokens); - PRINT_TOKENS(input); - PRINT_TOKENS(position); - } + foretell(tokens, input, position, mask); + // foretellOld(tokens, input, position); mTable->tensorToLogits(output, tokens); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void LookaheadRandomLlm::foretell(TensorPtr output, TensorPtr const input, TensorPtr const position) const +void LookaheadRandomLlm::foretell(TensorPtr const& output, TensorConstPtr const& input, TensorConstPtr const& position, + TensorConstPtr const mask) const { - TLLM_CHECK(ITensor::volume(input->getShape()) == ITensor::volume(position->getShape())); - TLLM_CHECK(ITensor::volume(output->getShape()) >= ITensor::volume(input->getShape())); + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto len = ITensor::volume(input->getShape()); + TLLM_CHECK(ITensor::volume(position->getShape()) == len); + TLLM_CHECK(ITensor::volume(output->getShape()) >= len); + if (mask) + { + TLLM_CHECK(ITensor::volume(mask->getShape()) >= len * len); + TLLM_CHECK(mask->getShape().d[0] >= len); + TLLM_CHECK(mask->getShape().d[1] >= len); + } + + TensorPtr maskRebuilt = BufferManager::cpu(ITensor::makeShape({len, len}), nvinfer1::DataType::kBOOL); + posIdsToMask(maskRebuilt, position); auto outputRange = BufferRange(*output); - auto inputRange = BufferRange(*input); - auto positionRange = BufferRange(*position); - auto oracleRange = BufferRange(*mOracle); - auto len = ITensor::volume(input->getShape()); + auto inputRange = BufferRange(*input); + auto positionRange = BufferRange(*position); + auto maskLocation = mask ? BufferLocation(*mask) : BufferLocation(*maskRebuilt); + auto oracleRange = BufferRange(*mOracle); auto olen = ITensor::volume(mOracle->getShape()); - std::vector> mask(len, std::vector(len, false)); - std::vector> stack; - stack.push_back(std::make_pair(0, positionRange[0])); - mask[0][0] = true; - for (auto i = 1; i < len; i++) - { - auto cur = positionRange[i]; - while (stack.size() > 0 && cur <= stack.back().second) - { - stack.pop_back(); - } - TLLM_CHECK(stack.size() > 0 ? cur == stack.back().second + 1 : true); - stack.push_back(std::make_pair(i, cur)); - for (auto prev : stack) - { - mask[i][prev.first] = true; - } - } auto verifyStart = 2; for (; verifyStart < len - 1; verifyStart++) { @@ -257,7 +261,7 @@ void LookaheadRandomLlm::foretell(TensorPtr output, TensorPtr const input, Tenso bool right = true; for (auto j = 0; j < i; j++) { - right &= mask[i][j] ? oracleRange[positionRange[j]] == inputRange[j] : true; + right &= maskLocation.at(i, j) ? 
oracleRange[positionRange[j]] == inputRange[j] : true; } if (i < verifyStart) { // lookahead might be right @@ -270,4 +274,51 @@ void LookaheadRandomLlm::foretell(TensorPtr output, TensorPtr const input, Tenso } } +void LookaheadRandomLlm::posIdsToMask(TensorPtr const& mask, TensorConstPtr const& posIds) const +{ + auto len = ITensor::volume(posIds->getShape()); + TLLM_CHECK(ITensor::volume(mask->getShape()) >= len * len); + auto posIdsRange = BufferRange(*posIds); + auto maskRange = BufferRange(*mask); + + for (auto i = 0; i < maskRange.size(); i++) + { + maskRange[i] = false; + } + + std::vector> stack; + stack.push_back(std::make_pair(0, posIdsRange[0])); + maskRange[0 * len + 0] = true; + for (auto i = 1; i < len; i++) + { + auto cur = posIdsRange[i]; + while (stack.size() > 0 && cur <= stack.back().second) + { + stack.pop_back(); + } + TLLM_CHECK(stack.size() > 0 ? cur == stack.back().second + 1 : true); + stack.push_back(std::make_pair(i, cur)); + for (auto prev : stack) + { + maskRange[i * len + prev.first] = true; + } + } +} + +void LookaheadRandomLlm::maskToPosIds(TensorPtr const& posIds, TensorConstPtr const& mask, SizeType32 start) const +{ + auto len = ITensor::volume(posIds->getShape()); + TLLM_CHECK(ITensor::volume(mask->getShape()) >= len * len); + auto posIdsRange = BufferRange(*posIds); + auto maskLocation = BufferLocation(*mask); + for (auto i = 0; i < len; i++) + { + posIdsRange[i] = start; + for (auto j = 0; j < i; j++) + { + posIdsRange[i] += maskLocation.at(i, j); + } + } +} + } // namespace tensorrt_llm::tests::layers diff --git a/cpp/tests/layers/randomLlm.h b/cpp/tests/layers/randomLlm.h index ad333df4f..191aa7d1e 100644 --- a/cpp/tests/layers/randomLlm.h +++ b/cpp/tests/layers/randomLlm.h @@ -19,12 +19,14 @@ #include #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/runtimeKernels.h" namespace tensorrt_llm::tests::layers { using namespace tensorrt_llm::runtime; using TensorPtr = runtime::ITensor::SharedPtr; +using TensorConstPtr = runtime::ITensor::SharedConstPtr; //! Initialize a tensor with data from string @param str. Shape {str.size} by default. 
TensorPtr initTensor(std::string str, std::optional shape = std::nullopt); @@ -33,7 +35,7 @@ TensorPtr initTensor(std::string str, std::optional shape = std: class RandomTokenLogits { public: - RandomTokenLogits(TensorPtr vocab) + RandomTokenLogits(TensorConstPtr const& vocab) : mVocabulary(vocab) { } @@ -43,27 +45,28 @@ class RandomTokenLogits { } - TensorPtr tokenToLogits(TokenIdType token) const; - void tokenToLogits(TensorPtr logits, TokenIdType token) const; + TensorConstPtr tokenToLogits(TokenIdType token) const; + void tokenToLogits(TensorPtr const& logits, TokenIdType token) const; - TokenIdType logitsToToken(TensorPtr logits) const; + TokenIdType logitsToToken(TensorConstPtr const& logits) const; - std::list stringToLogits(std::string tokens) const; - void stringToLogits(TensorPtr logits, std::string tokens) const; - void tensorToLogits(TensorPtr logits, TensorPtr tokens) const; + std::list stringToLogits(std::string tokens) const; + void stringToLogits(TensorPtr const& logits, std::string tokens) const; + void tensorToLogits(TensorPtr const& logits, TensorConstPtr const& tokens) const; - std::string logitsToString(std::list logits) const; - std::string logitsToString(TensorPtr logits) const; - TensorPtr logitsToTensor(TensorPtr logits) const; + std::string logitsToString(std::list logits) const; + std::string logitsToString(TensorConstPtr const& logits) const; + TensorConstPtr logitsToTensor(TensorConstPtr const& logits) const; + void logitsToTensor(TensorPtr const& tokens, TensorConstPtr const& logits) const; SizeType32 getVocabSize() const; //! @return the last token in mVocabulary as invalid token; - virtual TokenIdType getInvalidToken() const; + virtual TokenIdType const getInvalidToken() const; //! @return the second-to-last token in mVocabulary as end token; - virtual TokenIdType getEndToken() const; + virtual TokenIdType const getEndToken() const; private: - TensorPtr const mVocabulary; + TensorConstPtr const mVocabulary; }; //! vocabulary is ascii table from 0 to 127. tokenId == token. @@ -83,12 +86,12 @@ class AsciiRandomTokenLogits : public RandomTokenLogits { } - virtual TokenIdType getInvalidToken() const + virtual TokenIdType const getInvalidToken() const { return static_cast('#'); } - virtual TokenIdType getEndToken() const + virtual TokenIdType const getEndToken() const { return static_cast('&'); } @@ -98,7 +101,7 @@ class AsciiRandomTokenLogits : public RandomTokenLogits class RandomLlm { public: - RandomLlm(std::shared_ptr table, std::string oracle, runtime::SizeType32 id = 0) + RandomLlm(std::shared_ptr const table, std::string oracle, runtime::SizeType32 id = 0) : mTable(table) , mOracle(initTensor(oracle)) , mId(id) @@ -106,32 +109,41 @@ class RandomLlm } // simulate forward in a LLM. - void forward(TensorPtr output, TensorPtr const input, TensorPtr const position) const; + void forward(TensorPtr const& output, TensorConstPtr const& input, TensorConstPtr const& position, + TensorConstPtr const mask = nullptr) const; //! set inout[i] invalid if mask[i]==false; - void sampleByMask(TensorPtr inout, TensorPtr const mask) const; + void sampleByMask(TensorPtr const& inout, TensorConstPtr const& mask) const; //! @return true when @param script is a sub-string started from @param offset. - bool verify(SizeType32 const offset, TensorPtr const script) const; + bool verify(SizeType32 const offset, TensorConstPtr const& script) const; //! foretell @param output tokens from @param input tokens and @param position ids. //! It depends on different algorithms implementations. 
- virtual void foretell(TensorPtr output, TensorPtr const input, TensorPtr const position) const = 0; + virtual void foretell(TensorPtr const& output, TensorConstPtr const& input, TensorConstPtr const& position, + TensorConstPtr const mask = nullptr) const + = 0; protected: - std::shared_ptr mTable; - TensorPtr mOracle; - runtime::SizeType32 mId; + std::shared_ptr const mTable; + TensorConstPtr const mOracle; + runtime::SizeType32 const mId; }; //! a lookahead implementation for RandomLlm. class LookaheadRandomLlm : public RandomLlm { public: - LookaheadRandomLlm(std::shared_ptr table, std::string oracle, runtime::SizeType32 id = 0) + LookaheadRandomLlm( + std::shared_ptr const table, std::string oracle, runtime::SizeType32 id = 0) : RandomLlm(table, oracle, id) { } - void foretell(TensorPtr output, TensorPtr const input, TensorPtr const position) const override; + void foretell(TensorPtr const& output, TensorConstPtr const& input, TensorConstPtr const& position, + TensorConstPtr const mask = nullptr) const override; + +private: + void posIdsToMask(TensorPtr const& mask, TensorConstPtr const& posIds) const; + void maskToPosIds(TensorPtr const& posIds, TensorConstPtr const& mask, runtime::SizeType32 start) const; }; } // namespace tensorrt_llm::tests::layers diff --git a/cpp/tests/resources/data/test_model_lora_config.json b/cpp/tests/resources/data/test_model_lora_config.json index 28cfc8097..a592b302a 100644 --- a/cpp/tests/resources/data/test_model_lora_config.json +++ b/cpp/tests/resources/data/test_model_lora_config.json @@ -11,7 +11,6 @@ "moe": { "num_experts": 0, "top_k": 0, - "tp_mode": 2, "normalization_mode": 1 }, "architecture": "GPTForCausalLM", @@ -53,7 +52,7 @@ }, "build_config": { "max_input_len": 512, - "max_output_len": 50, + "max_seq_len": 562, "opt_batch_size": null, "max_batch_size": 4, "max_beam_width": 2, diff --git a/cpp/tests/resources/scripts/build_chatglm_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py index 0ee9ec2f7..20a697bb7 100644 --- a/cpp/tests/resources/scripts/build_chatglm_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -32,7 +32,7 @@ def convert_ckpt(model_dir: str, output_dir: str, world_size: int): convert_cmd = [ sys.executable, - str(chatglm_example_dir / "convert_checkpoint.py"), "--dtype=float32", + str(chatglm_example_dir / "convert_checkpoint.py"), "--dtype=float16", f"--model_dir={model_dir}", f"--output_dir={output_dir}", f"--tp_size={world_size}" ] @@ -51,9 +51,9 @@ def build_engine(ckpt_dir: str, "--max_batch_size=8", "--max_beam_width=2", "--max_input_len=256", - "--max_output_len=128", - "--gpt_attention_plugin=float32", - "--gemm_plugin=float32", + "--max_seq_len=384", + "--gpt_attention_plugin=float16", + "--gemm_plugin=float16", "--builder_opt=0", ] if is_ifb: @@ -125,7 +125,7 @@ def build_engines(model_cache: typing.Optional[str] = None, convert_ckpt(hf_dir, ckpt_dir, world_size) - for engine_kind in ["fp32-plugin", "fp32-plugin-packed-paged"]: + for engine_kind in ["fp16-plugin", "fp16-plugin-packed-paged"]: engine_dir = Path( model_dir ) / "rt_engine" / model_name / engine_kind / "tp1-pp1-gpu" diff --git a/cpp/tests/resources/scripts/build_enc_dec_engines.py b/cpp/tests/resources/scripts/build_enc_dec_engines.py index d40b24a1d..842d0d8cc 100644 --- a/cpp/tests/resources/scripts/build_enc_dec_engines.py +++ b/cpp/tests/resources/scripts/build_enc_dec_engines.py @@ -20,9 +20,8 @@ class Arguments: model_cache: str = '/llm-models' - # override by --only_multi_gpu, enforced by test_cpp.py - tp: 
int = 2 - pp: int = 2 + tp: int = 1 + pp: int = 1 beams: int = 1 gpus_per_node: int = 4 @@ -76,16 +75,9 @@ def __post_init__(self): else: parser.add_argument(f'--{k}', default=v, type=type(v)) - parser.add_argument('--only_multi_gpu', action='store_true') args = parser.parse_args() for k, v in args._get_kwargs(): setattr(self, k, v) - if args.only_multi_gpu: - self.tp = 2 - self.pp = 2 - else: - self.tp = 1 - self.pp = 1 @dataclass @@ -137,7 +129,7 @@ def command(self): f"--output_dir {join(engine_dir, 'encoder')}", f'--paged_kv_cache disable', f'--moe_plugin disable', f'--enable_xqa disable', f'--max_beam_width {args.beams}', - f'--max_batch_size 8', f'--max_output_len 200', + f'--max_batch_size 8', f'--max_seq_len 1224', f'--gemm_plugin {args.dtype}', f'--bert_attention_plugin {args.dtype}', f'--gpt_attention_plugin {args.dtype}', @@ -150,7 +142,7 @@ def command(self): f"--output_dir {join(engine_dir, 'decoder')}", f'--paged_kv_cache enable', f'--moe_plugin disable', f'--enable_xqa disable', f'--max_beam_width {args.beams}', - f'--max_batch_size 8', f'--max_output_len 200', + f'--max_batch_size 8', f'--max_seq_len 201', f'--gemm_plugin {args.dtype}', f'--bert_attention_plugin {args.dtype}', f'--gpt_attention_plugin {args.dtype}', diff --git a/cpp/tests/resources/scripts/build_engines_utils.py b/cpp/tests/resources/scripts/build_engines_utils.py index bc9453058..c0301af1c 100644 --- a/cpp/tests/resources/scripts/build_engines_utils.py +++ b/cpp/tests/resources/scripts/build_engines_utils.py @@ -14,16 +14,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging as _log +import os as _os import pathlib as _pl import subprocess as _sp import typing as _tp -def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: - print(f"Running: cd %s && %s" % - (str(cwd or _pl.Path.cwd()), " ".join(command)), - flush=True) - _sp.check_call(command, cwd=cwd, **kwargs) +def run_command(command: _tp.Sequence[str], + *, + cwd=None, + timeout=None, + **kwargs) -> None: + _log.info("Running: cd %s && %s", str(cwd), " ".join(command)) + override_timeout = int(_os.environ.get("CPP_TEST_TIMEOUT_OVERRIDDEN", "-1")) + if override_timeout > 0 and (timeout is None or override_timeout > timeout): + _log.info("Overriding the command timeout: %s (before) and %s (after)", + timeout, override_timeout) + timeout = override_timeout + _sp.check_call(command, cwd=cwd, timeout=timeout, **kwargs) # We can't use run_command() because robocopy (Robust Copy, rsync equivalent on Windows) diff --git a/cpp/tests/resources/scripts/build_gpt_engines.py b/cpp/tests/resources/scripts/build_gpt_engines.py index 1346569ec..1a78ed0c0 100755 --- a/cpp/tests/resources/scripts/build_gpt_engines.py +++ b/cpp/tests/resources/scripts/build_gpt_engines.py @@ -42,14 +42,18 @@ def build_engine( engine_dir: str, *args, max_input_len: int = 256, - max_output_len: int = 128, + max_seq_len: int = 384, ): build_cmd = [ - "trtllm-build", '--log_level=error', - f'--checkpoint_dir={checkpoint_dir}', f'--output_dir={engine_dir}', - '--max_batch_size=64', f'--max_input_len={max_input_len}', - f'--max_output_len={max_output_len}', '--max_beam_width=2', - '--builder_opt=0' + "trtllm-build", + '--log_level=error', + f'--checkpoint_dir={checkpoint_dir}', + f'--output_dir={engine_dir}', + '--max_batch_size=64', + f'--max_input_len={max_input_len}', + f'--max_seq_len={max_seq_len}', + '--max_beam_width=2', + '--builder_opt=0', ] legacy_args = [ "--gpt_attention_plugin=disable", @@ 
-219,8 +223,11 @@ def build_engines(model_cache: Optional[str] = None, world_size: int = 1): "--lora_target_modules=attn_qkv", '--lora_plugin=float16', *ifb_args) - llm_datasets_root = Path(model_cache) / "datasets" - calib_dataset = llm_datasets_root / "cimec/lambada/" + if model_cache: + llm_datasets_root = Path(model_cache) / "datasets" + calib_dataset = llm_datasets_root / "cimec/lambada/" + else: + calib_dataset = "lambada" print("\nConverting to fp16 SQ") fp16_sq_ckpt_dir = ckpt_dir / 'fp16-sq' / tp_dir convert_ckpt(str(hf_dir), diff --git a/cpp/tests/resources/scripts/build_gptj_engines.py b/cpp/tests/resources/scripts/build_gptj_engines.py index bcbaa0809..800982283 100755 --- a/cpp/tests/resources/scripts/build_gptj_engines.py +++ b/cpp/tests/resources/scripts/build_gptj_engines.py @@ -51,7 +51,7 @@ def build_engine(checkpoint_dir: _pl.Path, engine_dir: _pl.Path, *args): '--gemm_plugin=float16', '--max_batch_size=32', '--max_input_len=40', - '--max_output_len=20', + '--max_seq_len=60', '--max_beam_width=2', '--log_level=error', ] + list(args) diff --git a/cpp/tests/resources/scripts/build_llama_engines.py b/cpp/tests/resources/scripts/build_llama_engines.py index 3df91f075..ef8b3e662 100644 --- a/cpp/tests/resources/scripts/build_llama_engines.py +++ b/cpp/tests/resources/scripts/build_llama_engines.py @@ -43,7 +43,7 @@ def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, *args): '--gemm_plugin=float16', '--max_batch_size=32', '--max_input_len=40', - '--max_output_len=20', + '--max_seq_len=60', '--max_beam_width=2', '--log_level=error', '--paged_kv_cache=enable', diff --git a/cpp/tests/resources/scripts/build_mamba_engines.py b/cpp/tests/resources/scripts/build_mamba_engines.py index 63ead564d..d724cdeae 100644 --- a/cpp/tests/resources/scripts/build_mamba_engines.py +++ b/cpp/tests/resources/scripts/build_mamba_engines.py @@ -42,7 +42,7 @@ def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path, '--gemm_plugin=disable', '--max_batch_size=8', '--max_input_len=924', - '--max_output_len=100', + '--max_seq_len=1024', '--max_beam_width=1', ] + list(args) run_command(build_args) diff --git a/cpp/tests/resources/scripts/build_medusa_engines.py b/cpp/tests/resources/scripts/build_medusa_engines.py index 324cd2b45..18eccf932 100755 --- a/cpp/tests/resources/scripts/build_medusa_engines.py +++ b/cpp/tests/resources/scripts/build_medusa_engines.py @@ -40,7 +40,7 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path, '--gemm_plugin=float16', '--max_batch_size=8', '--max_input_len=12', - '--max_output_len=128', + '--max_seq_len=140', '--log_level=error', '--paged_kv_cache=enable', '--remove_input_padding=enable', diff --git a/cpp/tests/resources/scripts/build_recurrentgemma_engines.py b/cpp/tests/resources/scripts/build_recurrentgemma_engines.py index 21c10ef3f..47d0c58c9 100644 --- a/cpp/tests/resources/scripts/build_recurrentgemma_engines.py +++ b/cpp/tests/resources/scripts/build_recurrentgemma_engines.py @@ -44,7 +44,7 @@ def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path, '--gemm_plugin=float16', '--max_batch_size=8', '--max_input_len=924', - '--max_output_len=100', + '--max_seq_len=1024', '--max_beam_width=1', ] + list(args) run_command(build_args) diff --git a/cpp/tests/resources/scripts/generate_expected_chatglm_output.py b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py index 2d1bdf695..eb6cc3fa4 100755 --- a/cpp/tests/resources/scripts/generate_expected_chatglm_output.py +++ 
b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py @@ -43,7 +43,7 @@ def generate_output( output_dir = resources_dir / "data" / model_name / f"beam_search_{num_beams}" output_dir.mkdir(exist_ok=True, parents=True) - for engine_kind in ["fp32-plugin", "fp32-plugin-packed-paged"]: + for engine_kind in ["fp16-plugin", "fp16-plugin-packed-paged"]: engine_dir = model_path / 'rt_engine' / model_name / engine_kind / tp_pp_dir output_npy_file_name = output_dir / f"output_tokens_{engine_kind.replace('-', '_')}_tp{tp_size}_pp{pp_size}.npy" output_csv_file_name = output_dir / f"output_tokens_{engine_kind.replace('-', '_')}_tp{tp_size}_pp{pp_size}.csv" diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 0e265d2b6..82b19925f 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -328,6 +328,13 @@ def prepare_multi_gpu_model_tests(python_exe: str, model_cache_arg=model_cache_arg, only_multi_gpu_arg=only_multi_gpu_arg) + prepare_model_tests(model_name="t5", + python_exe=python_exe, + root_dir=root_dir, + resources_dir=resources_dir, + model_cache_arg=model_cache_arg, + only_multi_gpu_arg=['--tp', '4', '--pp', '1']) + def prepare_model_tests(model_name: str, python_exe: str, @@ -360,7 +367,7 @@ def prepare_model_tests(model_name: str, ] + only_fp8_arg + only_multi_gpu_arg + enc_dec_model_name_arg if "enc_dec" in model_name: generate_expected_output += model_cache_arg - if only_multi_gpu_arg: + if only_multi_gpu_arg and model_name != 'enc_dec': generate_expected_output = [ "mpirun", "-n", "4", "--allow-run-as-root", "--timeout", "600" ] + generate_expected_output @@ -438,6 +445,10 @@ def run_single_gpu_tests(build_dir: _pl.Path, included_tests.append("RecurrentGemma") if run_encoder: included_tests.append("EncoderModelTestSingleGPU") + if run_bart: + included_tests.append("BartBasicTest/EncDecParamsTest.Forward*") + if run_t5: + included_tests.append("T5BasicTest/EncDecParamsTest.Forward*") excluded_tests = [] if not run_fp8: @@ -449,23 +460,6 @@ def run_single_gpu_tests(build_dir: _pl.Path, ctest.extend(["-E", "|".join(excluded_tests)]) run_command(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) - def run_enc_dec_test_with_env(model: str): - enc_dec_test_command = [ - "tests/executor/executorTest", - "--gtest_filter=EncDecBasicTest/EncDecParamsTest.Forward*", - f"--gtest_output=xml:{str(build_dir)}/results-single-gpu-enc-dec.xml" - ] - run_command(enc_dec_test_command, - cwd=build_dir, - env={ - **cpp_env, 'ENC_DEC_MODEL': model - }) - - if run_bart: - run_enc_dec_test_with_env('bart') - if run_t5: - run_enc_dec_test_with_env('t5') - def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): build_tests(build_dir=build_dir) @@ -512,6 +506,16 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): ] run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + #EncDec test in leader mode + new_env = cpp_env + xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml" + trt_model_test = [ + "mpirun", "-n", "4", "--allow-run-as-root", "executor/executorTest", + "--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*", + f"--gtest_output=xml:{xml_output_file}" + ] + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + def run_benchmarks(python_exe: str, root_dir: _pl.Path, build_dir: _pl.Path, resources_dir: _pl.Path): diff --git a/cpp/tests/runtime/gptDecoderTest.cpp b/cpp/tests/runtime/gptDecoderTest.cpp index e7aa2001b..75928a229 100644 --- 
a/cpp/tests/runtime/gptDecoderTest.cpp +++ b/cpp/tests/runtime/gptDecoderTest.cpp @@ -35,7 +35,7 @@ bool forwardAndSync(std::unique_ptr const& decoder, DecodingOutput& std::shared_ptr stream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const maxBatchSize = input.maxBatchSize; + auto const maxBatchSize = input.batchSize; BufferManager::ITensorPtr finishedSum; std::int32_t* finishedSumHost = nullptr; diff --git a/cpp/tests/runtime/gptSessionTest.cpp b/cpp/tests/runtime/gptSessionTest.cpp index ef412d9d7..5ba2051db 100644 --- a/cpp/tests/runtime/gptSessionTest.cpp +++ b/cpp/tests/runtime/gptSessionTest.cpp @@ -705,7 +705,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, INSTANTIATE_TEST_SUITE_P(ChatGlmSessionTest, ParamTest, testing::Combine(testing::Values(ModelParams{CHATGLM_MODEL_DIR, {130005, 3}}), // end_id, pad_id - testing::Values(ModelSpec{FP32_GPT_ATTENTION_DIR, FP32_PLUGIN_RESULT_FILE, nvinfer1::DataType::kFLOAT} + testing::Values(ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() ), @@ -718,7 +718,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlmSessionTest, ParamTest, INSTANTIATE_TEST_SUITE_P(ChatGlm2SessionTest, ParamTest, testing::Combine(testing::Values(ModelParams{CHATGLM2_MODEL_DIR, {2, 0}}), // end_id, pad_id - testing::Values(ModelSpec{FP32_GPT_ATTENTION_DIR, FP32_PLUGIN_RESULT_FILE, nvinfer1::DataType::kFLOAT} + testing::Values(ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() ), @@ -731,7 +731,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm2SessionTest, ParamTest, INSTANTIATE_TEST_SUITE_P(ChatGlm3SessionTest, ParamTest, testing::Combine(testing::Values(ModelParams{CHATGLM3_MODEL_DIR, {2, 0}}), // end_id, pad_id - testing::Values(ModelSpec{FP32_GPT_ATTENTION_DIR, FP32_PLUGIN_RESULT_FILE, nvinfer1::DataType::kFLOAT} + testing::Values(ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() ), diff --git a/cpp/tests/runtime/iTensorTest.cpp b/cpp/tests/runtime/iTensorTest.cpp index 2bad82500..86f0bbe09 100644 --- a/cpp/tests/runtime/iTensorTest.cpp +++ b/cpp/tests/runtime/iTensorTest.cpp @@ -280,6 +280,87 @@ TEST(ITensorTest, TensorDimsSliceAtManual) EXPECT_TRUE(theConstOne->shapeEquals({1})); } +TEST(ITensorTest, TensorDimsSliceAtExtrame) +{ + auto constexpr dataType = nvinfer1::DataType::kFLOAT; + { + auto shape = ITensor::makeShape({5, 5, 5, 5, 5}); + ITensor::SharedPtr tensor(BufferManager::cpu(shape, dataType)); + + EXPECT_TRUE(ITensor::slice(tensor, {}, 0)->shapeEquals(ITensor::makeShape({0, 5, 5, 5, 5, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 1)->shapeEquals(ITensor::makeShape({1, 5, 5, 5, 5, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {5}, 0)->shapeEquals(ITensor::makeShape({0, 5, 5, 5, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {4, 5}, 0)->shapeEquals(ITensor::makeShape({0, 5, 5, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {4, 4, 5}, 0)->shapeEquals(ITensor::makeShape({0, 5, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {4, 4, 4, 5}, 0)->shapeEquals(ITensor::makeShape({0, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {4, 4, 4, 4, 5}, 0)->shapeEquals(ITensor::makeShape({0}))); + EXPECT_TRUE(ITensor::slice(tensor, {4, 4, 4, 4, 4}, 0)->shapeEquals(ITensor::makeShape({0}))); + EXPECT_TRUE(ITensor::slice(tensor, {4, 4, 4, 4, 4}, 1)->shapeEquals(ITensor::makeShape({1}))); + EXPECT_THROW(ITensor::slice(tensor, {}, 2), std::runtime_error); + + EXPECT_TRUE(ITensor::at(tensor, 
{})->shapeEquals(ITensor::makeShape({5, 5, 5, 5, 5}))); + EXPECT_TRUE(ITensor::at(tensor, {4})->shapeEquals(ITensor::makeShape({5, 5, 5, 5}))); + EXPECT_TRUE(ITensor::at(tensor, {4, 4})->shapeEquals(ITensor::makeShape({5, 5, 5}))); + EXPECT_TRUE(ITensor::at(tensor, {4, 4, 4})->shapeEquals(ITensor::makeShape({5, 5}))); + EXPECT_TRUE(ITensor::at(tensor, {4, 4, 4, 4})->shapeEquals(ITensor::makeShape({5}))); + EXPECT_TRUE(ITensor::at(tensor, {4, 4, 4, 4, 4})->shapeEquals(ITensor::makeShape({1}))); + } + + { + ITensor::SharedPtr tensor(BufferManager::cpu(ITensor::makeShape({}), dataType)); + + EXPECT_TRUE(ITensor::slice(tensor, 0, 0)->shapeEquals(ITensor::makeShape({}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 0)->shapeEquals(ITensor::makeShape({0}))); // {0,{}} ==> {0} + EXPECT_THROW(ITensor::slice(tensor, {}, 1), std::runtime_error); // (1,{}} /=> {1} + EXPECT_THROW(ITensor::slice(tensor, {}, 2), std::runtime_error); + EXPECT_THROW(ITensor::slice(tensor, {0}, 0), std::runtime_error); + + EXPECT_THROW(ITensor::at(tensor, {}), std::runtime_error); // due illegal slice(tensor, {}, 1) + EXPECT_THROW(ITensor::at(tensor, {0}), std::runtime_error); + } + { + ITensor::SharedPtr tensor(BufferManager::cpu(ITensor::makeShape({0}), dataType)); + + EXPECT_TRUE(ITensor::slice(tensor, 0, 0)->shapeEquals(ITensor::makeShape({0}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 0)->shapeEquals(ITensor::makeShape({0, 0}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 1)->shapeEquals(ITensor::makeShape({1, 0}))); + EXPECT_TRUE(ITensor::slice(tensor, {0}, 0)->shapeEquals(ITensor::makeShape({0}))); + EXPECT_THROW(ITensor::slice(tensor, {}, 2), std::runtime_error); + + EXPECT_TRUE(ITensor::at(tensor, {})->shapeEquals(ITensor::makeShape({0}))); + EXPECT_THROW(ITensor::at(tensor, {0}), std::runtime_error); + } + { + ITensor::SharedPtr tensor(BufferManager::cpu(ITensor::makeShape({0, 0}), dataType)); + + EXPECT_TRUE(ITensor::slice(tensor, 0, 0)->shapeEquals(ITensor::makeShape({0, 0}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 0)->shapeEquals(ITensor::makeShape({0, 0, 0}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 1)->shapeEquals(ITensor::makeShape({1, 0, 0}))); + EXPECT_TRUE(ITensor::slice(tensor, {0}, 0)->shapeEquals(ITensor::makeShape({0, 0}))); + EXPECT_THROW(ITensor::slice(tensor, {}, 2), std::runtime_error); + EXPECT_THROW(ITensor::slice(tensor, {0, 0}, 0), std::runtime_error); + + EXPECT_TRUE(ITensor::at(tensor, {})->shapeEquals(ITensor::makeShape({0, 0}))); + EXPECT_THROW(ITensor::at(tensor, {0}), std::runtime_error); + EXPECT_THROW(ITensor::at(tensor, {0, 0}), std::runtime_error); + } + { + ITensor::SharedPtr tensor(BufferManager::cpu(ITensor::makeShape({5, 0, 5}), dataType)); + + EXPECT_TRUE(ITensor::slice(tensor, 0, 0)->shapeEquals(ITensor::makeShape({0, 0, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 0)->shapeEquals(ITensor::makeShape({0, 5, 0, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {}, 1)->shapeEquals(ITensor::makeShape({1, 5, 0, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {0}, 0)->shapeEquals(ITensor::makeShape({0, 0, 5}))); + EXPECT_TRUE(ITensor::slice(tensor, {0, 0}, 0)->shapeEquals(ITensor::makeShape({0, 5}))); + EXPECT_THROW(ITensor::slice(tensor, {}, 2), std::runtime_error); + EXPECT_THROW(ITensor::slice(tensor, {0, 0, 0}, 0), std::runtime_error); + + EXPECT_TRUE(ITensor::at(tensor, {})->shapeEquals(ITensor::makeShape({5, 0, 5}))); + EXPECT_TRUE(ITensor::at(tensor, {0})->shapeEquals(ITensor::makeShape({0, 5}))); + EXPECT_THROW(ITensor::at(tensor, {0, 0}), std::runtime_error); + } +} + //! 
\brief Range shape in [begin, end). class ShapeRange { @@ -466,6 +547,7 @@ TEST(ITensorTest, TensorDimsSliceAt) { auto blockAt = ITensor::at(tensor, index); auto blockSliceRest = ITensor::slice(tensor, index); + auto blockSliceZero = ITensor::slice(tensor, index, 0); auto blockSliceOne = ITensor::slice(tensor, index, 1); auto blockSliceTwo = (shape.d[index.nbDims - 1] - index.d[index.nbDims - 1] >= 2) ? std::make_optional(ITensor::slice(tensor, index, 2)) @@ -516,6 +598,17 @@ TEST(ITensorTest, TensorDimsSliceAt) } EXPECT_TRUE(ITensor::shapeEquals(blockShape, goldenShape)); } + { + auto blockShape = blockSliceZero->getShape(); + ITensor::Shape goldenShape; + goldenShape.nbDims = shape.nbDims - index.nbDims + 1; + goldenShape.d[0] = 0; + for (SizeType32 i = 1; i < goldenShape.nbDims; i++) + { + goldenShape.d[i] = shape.d[i + index.nbDims - 1]; + } + EXPECT_TRUE(ITensor::shapeEquals(blockShape, goldenShape)); + } { auto blockShape = blockSliceOne->getShape(); ITensor::Shape goldenShape; @@ -556,7 +649,6 @@ TEST(ITensorTest, TensorDimsSliceAt) for (it++; it != range.end(); ++it) { EXPECT_THROW(ITensor::at(tensor, *it), std::runtime_error); - EXPECT_THROW(ITensor::slice(tensor, *it), std::runtime_error); EXPECT_THROW(ITensor::slice(tensor, *it, 1), std::runtime_error); } } diff --git a/cpp/tests/runtime/samplingTest.cpp b/cpp/tests/runtime/samplingTest.cpp index b42153663..4a26ad5b5 100644 --- a/cpp/tests/runtime/samplingTest.cpp +++ b/cpp/tests/runtime/samplingTest.cpp @@ -53,7 +53,7 @@ class SamplingTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type std::shared_ptr mLogger{}; }; -std::shared_ptr dynamicDecodeTest(BufferManager& manager, +std::shared_ptr dynamicDecodeTest(BufferManager& manager, std::shared_ptr allocator, size_t vocabSize, size_t vocabSizePadded, size_t batchSize, size_t beamWidth, int step, int ite, int maxInputLength, size_t maxSeqLength, size_t sinkTokenLength, int localBatchSize, std::vector& cpuOutputIds, std::vector cpuLogits, int noRepeatNgramSizeValue = 0) @@ -98,15 +98,23 @@ std::shared_ptr dynamicDecodeTest(BufferManager& auto ddLayer = tl::DynamicDecodeLayer(decodingMode, decodingDomain, manager.getStream().get(), allocator); auto setupParams = std::make_shared(); - setupParams->penaltyParams.noRepeatNgramSize = cpuNoRepeatNgramSize; + setupParams->banWordsParams = std::make_shared(); + setupParams->banWordsParams->noRepeatNgramSize = cpuNoRepeatNgramSize; + + setupParams->penaltyParams = std::make_shared(); + setupParams->decodingParams = std::make_shared(); + ddLayer.setup(batchSize, beamWidth, nullptr, setupParams); - auto forwardParams = std::make_shared( - step, ite, maxInputLength, static_cast(maxSeqLength), sinkTokenLength, localBatchSize, endIds); + auto forwardParams = std::make_shared(endIds, step, ite, localBatchSize); forwardParams->logits = logits; - auto outputParams = std::make_shared(outputIds); - outputParams->sequence_length = sequenceLengths; + forwardParams->banWordsInputs = std::make_shared(localBatchSize); + + forwardParams->stopCriteriaInputs = std::make_shared(localBatchSize); + + auto outputParams = std::make_shared(outputIds); + outputParams->sequenceLength = sequenceLengths; outputParams->newTokens = newTokens; outputParams->finished = finished; @@ -149,7 +157,7 @@ TEST_F(SamplingTest, SamplingWithNoRepeatNGramSize) auto outputParams = dynamicDecodeTest(manager, allocator, vocabSize, vocabSizePadded, batchSize, beamWidth, step, ite, maxInputLength, maxSeqLength, sinkTokenLength, localBatchSize, cpuOutputIds, cpuLogits, 
noRepeatNgramSize); - cudaMemcpy(cpuOutputIds.data(), outputParams->output_ids.getPtr(), cpuOutputIds.size() * sizeof(int), + cudaMemcpy(cpuOutputIds.data(), outputParams->outputIds.getPtr(), cpuOutputIds.size() * sizeof(int), cudaMemcpyDeviceToHost); EXPECT_EQ(cpuOutputIds[maxSeqLength - 1], 43); diff --git a/cpp/tests/runtime/tllmRuntimeTest.cpp b/cpp/tests/runtime/tllmRuntimeTest.cpp index e5b30852e..b9e694ed8 100644 --- a/cpp/tests/runtime/tllmRuntimeTest.cpp +++ b/cpp/tests/runtime/tllmRuntimeTest.cpp @@ -23,6 +23,7 @@ #include #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/runtime/rawEngine.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "tensorrt_llm/runtime/tllmRuntime.h" @@ -90,7 +91,7 @@ class TllmRuntimeTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-t TEST_F(TllmRuntimeTest, SinglePass) { EXPECT_TRUE(mSerializedEngine); - TllmRuntime rt{*mSerializedEngine, 1.0F, mLogger}; + TllmRuntime rt{RawEngine(mSerializedEngine.get()), &mLogger, 1.0F}; auto& engine = rt.getEngine(); EXPECT_FALSE(engine.hasImplicitBatchDimension()); EXPECT_EQ(rt.getNbProfiles(), engine.getNbOptimizationProfiles()); diff --git a/docs/source/advanced/batch-manager.md b/docs/source/advanced/batch-manager.md index d6ccccee9..64606b751 100644 --- a/docs/source/advanced/batch-manager.md +++ b/docs/source/advanced/batch-manager.md @@ -122,7 +122,7 @@ When using V1 batching, the following additional statistics are reported per V1 Users can alter the logits produced the network, with a callback attached to an `InferenceRequest`: ``` - using LogitsPostProcessor = std::function; + using LogitsPostProcessor = std::function; ``` The first argument is the request id, second is the logits tensor, third are the tokens produced by the request so far, and last one is the operation stream used by the logits tensor. @@ -171,7 +171,7 @@ The responses from `SendResponseCallback` are stored in a `std::shared_ptr Tensor Parallel evenly splits each expert’s weight and distributes them to different GPUs, which means each GPU holds partial weight of all experts, While Expert Parallel evenly distributes some of the experts’ full weight to different GPUs, which means each GPU holds part of the experts’ full weight. As a result, each GPU rank in the Tensor Parallel group receives all tokens’ hidden states for all experts, then computes using the partial weights, while for Expert Parallel, each GPU rank only receives part of tokens’ hidden states for experts on this rank, then computes using the full weights. +When both Tensor Parallel and Expert Parallel are enabled, each GPU handles a portion of the expert weight matrices (as in EP mode), and these weights are further sliced across multiple GPUs (as in TP mode). This hybrid approach aims to balance the workload more evenly across GPUs, enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone. + ## How to Enable -The default parallel pattern is Tensor Parallel. You can enable Expert Parallel by setting `--moe_tp_mode 1` when calling `convert_coneckpoint.py`, and `--tp_size` is used to set the Expert Parallel size. +The default parallel pattern is Tensor Parallel. You can enable Expert Parallel or hybrid parallelism by setting `--moe_tp_size` and `--moe_ep_size` when calling `convert_checkpoint.py`. If only `--moe_tp_size` is provided, TRT-LLM will use Tensor Parallel for the MoE model; if only `--moe_ep_size` is provided, TRT-LLM will use Expert Parallel; if both are provided, hybrid parallelism will be used.
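To make the EP-then-TP split concrete, the following is a minimal, illustrative Python sketch (not code from this repository; the shapes and the helper name `slice_moe_weight` are hypothetical) of how one rank would slice a stacked expert weight tensor: experts are first partitioned across `moe_ep_size` ranks along the expert dimension, and each retained expert matrix is then sliced across `moe_tp_size` ranks, mirroring the EP-then-TP `split_matrix` calls in the `convert_checkpoint.py` changes later in this patch.

```python
# Illustrative sketch only: hypothetical shapes and names, not TensorRT-LLM code.
import numpy as np


def slice_moe_weight(w, moe_tp_size, moe_tp_rank, moe_ep_size, moe_ep_rank):
    """Slice a stacked expert weight of shape [num_experts, rows, cols].

    Experts are split across EP ranks first (dim 0); each kept expert's
    matrix is then split across TP ranks (dim 1).
    """
    num_experts, rows, _ = w.shape
    assert num_experts % moe_ep_size == 0 and rows % moe_tp_size == 0
    experts_per_ep_rank = num_experts // moe_ep_size
    rows_per_tp_rank = rows // moe_tp_size
    # Keep this EP rank's contiguous block of experts.
    w_ep = w[moe_ep_rank * experts_per_ep_rank:(moe_ep_rank + 1) * experts_per_ep_rank]
    # Slice each kept expert along its output dimension for this TP rank.
    return w_ep[:, moe_tp_rank * rows_per_tp_rank:(moe_tp_rank + 1) * rows_per_tp_rank, :]


if __name__ == "__main__":
    # tp_size = moe_tp_size * moe_ep_size = 8 here: 2-way TP x 4-way EP for MoE.
    full = np.zeros((16, 4096, 1024))  # [num_experts, ffn_hidden, hidden]
    shard = slice_moe_weight(full, moe_tp_size=2, moe_tp_rank=0, moe_ep_size=4, moe_ep_rank=1)
    print(shard.shape)  # (4, 2048, 1024)
```

Setting `--moe_tp_size 8 --moe_ep_size 1` reduces this to the pure TP split described above, and `--moe_tp_size 1 --moe_ep_size 8` to the pure EP split.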
+ +Make sure the product of `moe_tp_size` and `moe_ep_size` equals `tp_size`, since the total MoE parallelism across all GPUs must match the total parallelism in the other parts of the model. The other parameters related to the MoE structure, such as `num_experts_per_tok` (the TopK mentioned above) and `num_local_experts`, can be found in the model’s configuration file, such as the one for the [Mixtral 8x7B model](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json). diff --git a/docs/source/advanced/lora.md b/docs/source/advanced/lora.md index 8bb53bb20..59efd300c 100644 --- a/docs/source/advanced/lora.md +++ b/docs/source/advanced/lora.md @@ -23,7 +23,7 @@ trtllm-build --checkpoint_dir /tmp/llama_7b/trt_ckpt/fp16/1-gpu/ \ --lora_plugin float16 \ --max_batch_size 128 \ --max_input_len 512 \ - --max_output_len 50 \ + --max_seq_len 562 \ --lora_dir Japanese-Alpaca-LoRA-7b-v0 \ --max_lora_rank 8 \ --lora_target_modules "attn_q" "attn_k" "attn_v" diff --git a/docs/source/advanced/weight-streaming.md b/docs/source/advanced/weight-streaming.md index 9e8078ce3..24bdf9594 100644 --- a/docs/source/advanced/weight-streaming.md +++ b/docs/source/advanced/weight-streaming.md @@ -25,7 +25,7 @@ trtllm-build \ --gemm_plugin disable \ --max_batch_size 128 \ --max_input_len 512 \ - --max_output_len 50 + --max_seq_len 562 # Run the engine with 20% weights in GPU memory. python3 examples/summarize.py \ @@ -47,7 +47,7 @@ python3 benchmarks/python/benchmark.py \ --max_batch_size "32" \ --input_output_len "256,32" \ --max_input_len 256\ - --max_output_len 32 \ + --max_seq_len 288 \ --gpu_weights_percent "0.0;0.3;0.6;1.0" \ --dtype float16 \ --csv \ diff --git a/docs/source/architecture/checkpoint.md b/docs/source/architecture/checkpoint.md index dc269c5f3..fb5620da7 100644 --- a/docs/source/architecture/checkpoint.md +++ b/docs/source/architecture/checkpoint.md @@ -229,7 +229,7 @@ trtllm-build --checkpoint_dir ./opt/125M/trt_ckpt/fp16/2-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/125M/trt_engines/fp16/2-gpu/ ``` diff --git a/docs/source/executor.md b/docs/source/executor.md index 5e2cef922..5d556590f 100644 --- a/docs/source/executor.md +++ b/docs/source/executor.md @@ -20,7 +20,7 @@ The `Executor` class is responsible for receiving requests from the client, and Users can alter the logits produced the network, by providing a map of named callbacks of the form: ``` -std::unordered_map> +std::unordered_map> ``` to the `ExecutorConfig`. The map key is the name associated with that logits post-processing callback. Each request can then specify the name of the logits post-processor to use for that particular request, if any. @@ -29,7 +29,16 @@ The first argument to the callback is the request id, second is the logits tenso Users *must* use the stream to access the logits tensor. For example, performing a addition with a bias tensor should be enqueued on that stream. Alternatively, users may call `stream->synchronize()`, however, that will slow down the entire execution pipeline. -Note: this feature isn't supported with the `STATIC` batching type for the moment. +We also provide a batched variant that allows altering the logits of multiple requests in a batch. This enables further optimizations and reduces callback overhead. + +``` +std::function const&, std::vector&, std::vector> const&, StreamPtr const&)> +``` + +A single batched callback can be specified in `ExecutorConfig`.
Each request can opt to apply this callback by specifying the name of the logits +post-processor as `Request::kBatchedPostProcessorName`. + +Note: Both callback variants are not supported with the `STATIC` batching type for the moment. ### The Request Class diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 31f8feaa1..100568cce 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -166,7 +166,7 @@ The following tables are references for commands that are used as part of the be | Stage | Description | Command | | :- | - | - | -| [Build](#engine-building) | Build a TensorRT-LLM engine | `trtllm-build --model_config $model_cfg --strongly_typed --output_dir $engine_dir --max_batch_size 2048 --max_input_len 2048 --max_output_len 4096 --workers $tp_size --max_num_tokens 2048 --use_paged_context_fmha enable --multiple_profiles enable` | +| [Build](#engine-building) | Build a TensorRT-LLM engine | `trtllm-build --model_config $model_cfg --strongly_typed --output_dir $engine_dir --max_batch_size 2048 --max_input_len 2048 --max_seq_len 6144 --workers $tp_size --max_num_tokens 2048 --use_paged_context_fmha enable --multiple_profiles enable` | | [Dataset](#preparing-a-dataset) | Create a synthetic dataset | `benchmarks/cpp/prepare_dataset.py --output=$dataset_file --tokenizer=$model_name token-norm-dist --num-requests=2000 --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0` | | [Run](#running-the-benchmark) | Run a benchmark with a dataset | `mpirun -n $tp_size --allow-run-as-root --oversubscribe cpp/build/benchmarks/gptManagerBenchmark --engine_dir $engine_dir --type IFB --dataset $dataset_file --scheduler_policy max_utilization --kv_cache_free_gpu_mem_fraction 0.9 --output_csv $results_csv --request_rate -1.0 --enable_chunked_context --streaming --warm_up 0` | @@ -197,13 +197,13 @@ for the model that you would like to build (see [below](#network-configuration-f command is as follows: ```shell -trtllm-build --model_config $model_cfg --strongly_typed --output_dir $engine_dir --max_batch_size 2048 --max_input_len 2048 --max_output_len 4096 --workers $tp_size --max_num_tokens 2048 --use_paged_context_fmha enable --multiple_profiles enable +trtllm-build --model_config $model_cfg --strongly_typed --output_dir $engine_dir --max_batch_size 2048 --max_input_len 2048 --max_seq_len 6144 --workers $tp_size --max_num_tokens 2048 --use_paged_context_fmha enable --multiple_profiles enable ``` Some notes about the command: - `--workers` affects the number of threads that build the engine file and does not necessarily need to match the TP size. Make sure to set the tensor parallelism in the `$model_cfg` JSON file. See [below](#network-configuration-files) -- You can run benchmarks for datasets that fit within the bounds of the `max_input_len` and `max_output_len` parameters. +- You can run benchmarks for datasets that fit within the bounds of the `max_input_len` and `max_seq_len` parameters. ### Engine Configuration Files diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index 03fdaa43b..d3d96a128 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -44,9 +44,9 @@ The following table shows the supported software for TensorRT-LLM. 
* - - Software Compatibility * - Container - - [23.10](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html#framework-matrix-2023) + - [24.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) * - TensorRT - - [9.2](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) + - [10.0](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) * - Precision - - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4 @@ -96,16 +96,17 @@ The following table shows the supported software for TensorRT-LLM. - [Whisper](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper) * - Multi-Modal Models (5) - - - [BLIP2 w/ OPT-2.7B](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - - [BLIP2 w/ T5-XL](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [BLIP2 w/ OPT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [BLIP2 w/ T5](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [CogVLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal)(6) - [Deplot](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [Fuyu](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - - [Kosmos-2](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - - [LLaVA-v1.5-7B](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [Kosmos](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [LLaVA-v1.5](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [NeVA](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [Nougat](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) Nougat-small, Nougat-base + - [Phi-3-vision](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [Video NeVA](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - - [Nougat family](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) Nougat-small, Nougat-base - [VILA](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) ``` diff --git a/examples/arctic/README.md b/examples/arctic/README.md index 826c04ec4..b04f7ccb5 100644 --- a/examples/arctic/README.md +++ b/examples/arctic/README.md @@ -46,7 +46,7 @@ mkdir -p tmp/trt_engines ### Apply FP8 PTQ Notes: -- currently quantize.py does not support for Expert Parallelism (EP) mode yet. User should use `../llama/convert_checkpoint.py` and specify `--moe_tp_mode 1` (1 for EP, 2 for TP) instead, if needed. +- currently quantize.py does not support for Expert Parallelism (EP) mode yet. User should use `../llama/convert_checkpoint.py` and specify `--moe_ep_size 1` instead, if needed. - TensorRT-LLM uses static quantization methods, which is expected to be faster at runtime as compared to dynamic quantization methods. This comes at a cost of an offline calibration step during quantization. `batch_size` and `calib_size` can be adjusted to shorten the calibration time. Please refer to ../quantization/README.md for explanation. - **due to the large model size and the calibration step (which has to load the HuggingFace model and run forward passes), it is likely that you will need more number of GPUs during quantization step than the number of GPUs for engine building and final deployment. 
For example, using 16xH100 or 8xH200 for quantization & 8xH100 for deployment.** diff --git a/examples/baichuan/README.md b/examples/baichuan/README.md index 18abcf0f8..be4e9eb40 100644 --- a/examples/baichuan/README.md +++ b/examples/baichuan/README.md @@ -72,7 +72,7 @@ trtllm-build --checkpoint_dir ./tmp/baichuan_v1_13b/trt_ckpts/fp16/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size=32 \ --max_input_len=1024 \ - --max_output_len=512 + --max_seq_len=1536 ``` diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index abd4fddad..f51033e19 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bindings/executor/README.md b/examples/bindings/executor/README.md index 8c563b104..b33e35a36 100644 --- a/examples/bindings/executor/README.md +++ b/examples/bindings/executor/README.md @@ -54,3 +54,13 @@ This can be done by running: ``` python3 example_advanced.py --model_path=../llama/tmp/7B/trt_engines/fp16/4gpu_tp4_pp1/ --use_orchestrator_mode ``` + +### Logits post processor example + +This example shows how to generate JSON structured output using LogitsPostProcessor API. + +``` +python3 example_logits_processor.py -t -e --batch_size 8 +``` + +LogitsPostProcessorBatched, which fuses logits processing for all samples in a batch into a single callback, is enabled by `--lpp_batched` diff --git a/examples/bindings/executor/example_logits_processor.py b/examples/bindings/executor/example_logits_processor.py new file mode 100644 index 000000000..6e0e77e73 --- /dev/null +++ b/examples/bindings/executor/example_logits_processor.py @@ -0,0 +1,203 @@ +import argparse +import datetime +import typing as _tp + +import torch as _tor +from lmformatenforcer import (JsonSchemaParser, TokenEnforcer, + TokenEnforcerTokenizerData) +from pydantic import BaseModel +from transformers import AutoTokenizer + +import tensorrt_llm.bindings.executor as trtllm + + +def _build_regular_tokens_list( + tokenizer) -> _tp.List[_tp.Tuple[int, str, bool]]: + token_0 = [tokenizer.encode("0")[-1]] + regular_tokens = [] + vocab_size = tokenizer.vocab_size + for token_idx in range(vocab_size): + if token_idx in tokenizer.all_special_ids: + continue + # We prepend token 0 and skip the first letter of the result to get a space if the token is a start word. 
+ tensor_after_0 = _tor.tensor(token_0 + [token_idx], dtype=_tor.long) + decoded_after_0 = tokenizer.decode(tensor_after_0)[1:] + decoded_regular = tokenizer.decode(token_0) + is_word_start_token = len(decoded_after_0) > len(decoded_regular) + regular_tokens.append((token_idx, decoded_after_0, is_word_start_token)) + return regular_tokens + + +def build_token_enforcer(tokenizer, character_level_parser): + """ + Build logits processor for feeding it into generate function (use_py_session should be True) + """ + regular_tokens = _build_regular_tokens_list(tokenizer) + + def _decode(tokens: _tp.List[int]) -> str: + tensor = _tor.tensor(tokens, dtype=_tor.long) + return tokenizer.decode(tensor) + + tokenizer_data = TokenEnforcerTokenizerData(regular_tokens, _decode, + tokenizer.eos_token_id) + return TokenEnforcer(tokenizer_data, character_level_parser) + + +# Prepare and enqueue the requests +def enqueue_requests(args: argparse.Namespace, + executor: trtllm.Executor) -> None: + + sampling_config = trtllm.SamplingConfig(args.beam_width) + + request_ids = [] + for _ in range(args.batch_size): + # Create the request. + request = trtllm.Request(input_token_ids=prompt, + max_new_tokens=25, + end_id=tokenizer.eos_token_id, + sampling_config=sampling_config) + request.logits_post_processor_name = request.BATCHED_POST_PROCESSOR_NAME if args.lpp_batched else "my_logits_pp" + + # Enqueue the request. + req_id = executor.enqueue_request(request) + request_ids.append(req_id) + + return request_ids + + +# Wait for responses and store output tokens +def wait_for_responses(args: argparse.Namespace, request_ids: list[int], + executor: trtllm.Executor) -> dict[dict[list[int]]]: + + output_tokens = { + req_id: {beam: [] + for beam in range(args.beam_width)} + for req_id in request_ids + } + num_finished = 0 + iter = 0 + while (num_finished < len(request_ids) and iter < args.timeout_ms): + responses = executor.await_responses( + datetime.timedelta(milliseconds=args.timeout_ms)) + for response in responses: + req_id = response.request_id + if not response.has_error(): + result = response.result + num_finished += 1 if result.is_final else 0 + for beam, outTokens in enumerate(result.output_token_ids): + output_tokens[req_id][beam].extend(outTokens) + else: + raise RuntimeError( + str(req_id) + " encountered error:" + response.error_msg) + + return output_tokens + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Executor Bindings Example") + parser.add_argument("--tokenizer_path", + "-t", + type=str, + required=True, + help="Directory containing model tokenizer") + parser.add_argument("--engine_path", + "-e", + type=str, + required=True, + help="Directory containing model engine") + parser.add_argument("--beam_width", + type=int, + required=False, + default=1, + help="The beam width") + parser.add_argument("--batch_size", + type=int, + required=False, + default=1, + help="The batch size") + parser.add_argument( + "--timeout_ms", + type=int, + required=False, + default=10000, + help="The maximum time to wait for all responses, in milliseconds") + parser.add_argument("--lpp_batched", + action="store_true", + default=False, + help="Enable batched logits post processor") + + args = parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) + + class AnswerFormat(BaseModel): + last_name: str + year_of_birth: int + + parser = JsonSchemaParser(AnswerFormat.model_json_schema()) + token_enforcer = build_token_enforcer(tokenizer, parser) + + def 
get_allowed_tokens(ids): + + def _trim(ids): + return [x for x in ids if x != tokenizer.eos_token_id] + + allowed = token_enforcer.get_allowed_tokens(_trim(ids[0])) + return allowed + + def logits_post_processor(req_id: int, logits: _tor.Tensor, + ids: _tp.List[_tp.List[int]], stream_ptr: int): + del req_id + mask = _tor.full_like(logits, fill_value=float("-inf"), device="cpu") + allowed = get_allowed_tokens(ids) + mask[:, :, allowed] = 0 + + with _tor.cuda.stream(_tor.cuda.ExternalStream(stream_ptr)): + mask = mask.to(logits.device, non_blocking=True) + logits += mask + + def logits_post_processor_batched(req_ids_batch, logits_batch, ids_batch, + stream_ptr: int): + masks = [] + for req_id, logits, ids in zip(req_ids_batch, logits_batch, ids_batch): + del req_id + mask = _tor.full_like(logits, + fill_value=float("-inf"), + device="cpu") + allowed = get_allowed_tokens(ids) + mask[:, :, allowed] = 0 + masks.append(mask) + + with _tor.cuda.stream(_tor.cuda.ExternalStream(stream_ptr)): + for logits, mask in zip(logits_batch, masks): + logits += mask.to(logits.device, non_blocking=True) + + # Create the executor. + executor_config = trtllm.ExecutorConfig(args.beam_width) + if not args.lpp_batched: + executor_config.logits_post_processor_map = { + "my_logits_pp": logits_post_processor + } + else: + executor_config.logits_post_processor_batched = logits_post_processor_batched + executor = trtllm.Executor(args.engine_path, trtllm.ModelType.DECODER_ONLY, + executor_config) + + input = "Please give me information about Michael Jordan. You MUST answer using the following json schema: " + prompt = tokenizer.encode(input) + print(f"Input text: {input}\n") + + if executor.can_enqueue_requests(): + request_ids = enqueue_requests(args, executor) + output_tokens = wait_for_responses(args, request_ids, executor) + + # Print output + for req_id in request_ids: + for beam_id in range(args.beam_width): + result = tokenizer.decode( + output_tokens[req_id][beam_id][len(prompt):]) + generated_tokens = len( + output_tokens[req_id][beam_id]) - len(prompt) + print( + f"Request {req_id} Beam {beam_id} ({generated_tokens} tokens): {result}" + ) diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index 146586af1..047fb0e86 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/README.md b/examples/chatglm/README.md index 26e8911e6..ab9fa624f 100644 --- a/examples/chatglm/README.md +++ b/examples/chatglm/README.md @@ -150,7 +150,7 @@ The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpo Normally, the `trtllm-build` command only requires a single GPU, but you can enable parallel building by passing the number of GPUs to the `--workers` argument. -Using ChatGLM2-6B-32K / ChatGLM3-6B-32K models, we need to guarantee `max_batch_size * max_beam_width * (max_input_len + max_output_len) <= 78398 = 2^31 / (13696 * 2)` due to constrain of TensorRT. For example, we will fail to build engine while using default max_batch_size (8) and adding arguments `--max_beam_width=4 --max_input_len=20000 --max_output_len=100`. +Using ChatGLM2-6B-32K / ChatGLM3-6B-32K models, we need to guarantee `max_batch_size * max_beam_width * max_seq_len <= 78398 = 2^31 / (13696 * 2)` due to constrain of TensorRT. 
For example, we will fail to build engine while using default max_batch_size (8) and adding arguments `--max_beam_width=4 --max_input_len=20000 --max_seq_len=20100`. ```bash # ChatGLM3-6B: single-gpu engine diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index 7961260c0..58286c4c4 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/cogvlm/convert_checkpoint.py b/examples/cogvlm/convert_checkpoint.py index ba5ce1595..fec672fe3 100644 --- a/examples/cogvlm/convert_checkpoint.py +++ b/examples/cogvlm/convert_checkpoint.py @@ -276,14 +276,8 @@ def create_config_from_args(args: argparse.Namespace): 'vision_start': args.vision_start, 'vision_length': args.vision_length, 'quantization': { - 'quant_algo': - None, - 'kv_cache_quant_algo': - None, - 'exclude_modules': [ - 'lm_head', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ], + 'quant_algo': None, + 'kv_cache_quant_algo': None, }, 'mapping': { 'world_size': args.tp_size * args.pp_size, diff --git a/examples/dbrx/README.md b/examples/dbrx/README.md index 97539f291..c6ca8eb95 100644 --- a/examples/dbrx/README.md +++ b/examples/dbrx/README.md @@ -111,12 +111,11 @@ trtllm-build --checkpoint_dir dbrx/trt_ckpt/bf16/tp4pp2 \ ```bash # Build DBRX with expert parallelism for DbrxExperts layer and tensor parallelism for rest -# `moe_tp_mode` decides sharding for expert weights: -# 1 is for expert parallelism, 2 for tensor parallelism python convert_checkpoint.py --model_dir dbrx-base \ --dtype bfloat16 \ --tp_size 8 \ - --moe_tp_mode 1 \ + --moe_tp_size 1 \ + --moe_ep_size 8 \ --workers 8 \ --output_dir dbrx/trt_ckpt/bf16/ep8 diff --git a/examples/dbrx/convert_checkpoint.py b/examples/dbrx/convert_checkpoint.py index 58de73320..c014a44e4 100644 --- a/examples/dbrx/convert_checkpoint.py +++ b/examples/dbrx/convert_checkpoint.py @@ -124,11 +124,18 @@ def parse_arguments(): 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' ) parser.add_argument( - '--moe_tp_mode', - default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, + '--moe_tp_size', type=int, + default=-1, help= - 'Controls how to distribute experts in TP. 
Check layers/moe.py for accepted values', + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' ) parser.add_argument( '--moe_renorm_mode', @@ -516,31 +523,32 @@ def convert_hf_dbrx(model_params: dict, f'{prefix}.ffn.experts.mlp.w1', dtype) mlp_gate_weight = mlp_gate_weight.reshape(-1, mlp_hidden_size, num_hidden) - if moe_config.tp_mode == MoeConfig.ParallelismMode.TENSOR_PARALLEL: - mlp_gate_w = split_matrix(mlp_gate_weight, - mapping.tp_size, - mapping.tp_rank, - dim=1) - else: - mlp_gate_w = split_matrix(mlp_gate_weight, - mapping.tp_size, - mapping.tp_rank, - dim=0) + # moe expert parallel + mlp_gate_weight = split_matrix(mlp_gate_weight, + mapping.moe_ep_size, + mapping.moe_ep_rank, + dim=0) + # moe tensor parallel + mlp_gate_w = split_matrix(mlp_gate_weight, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=1) + # experts mlp v1 -> mlp fc mlp_fc_weight = get_weight(model_params, f'{prefix}.ffn.experts.mlp.v1', dtype) mlp_fc_weight = mlp_fc_weight.reshape(-1, mlp_hidden_size, num_hidden) - if moe_config.tp_mode == MoeConfig.ParallelismMode.TENSOR_PARALLEL: - mlp_fc_w = split_matrix(mlp_fc_weight, - mapping.tp_size, - mapping.tp_rank, - dim=1) - else: - mlp_fc_w = split_matrix(mlp_fc_weight, - mapping.tp_size, - mapping.tp_rank, - dim=0) + # moe expert parallel + mlp_fc_weight = split_matrix(mlp_fc_weight, + mapping.moe_ep_size, + mapping.moe_ep_rank, + dim=0) + # moe tensor parallel + mlp_fc_w = split_matrix(mlp_fc_weight, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=1) mlp_fc_w = torch.concat([mlp_fc_w, mlp_gate_w], dim=-2) weights.update( get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc.', None, @@ -553,16 +561,16 @@ def convert_hf_dbrx(model_params: dict, mlp_proj_weight = mlp_proj_weight.reshape(-1, mlp_hidden_size, num_hidden).transpose( 1, 2) - if moe_config.tp_mode == MoeConfig.ParallelismMode.TENSOR_PARALLEL: - mlp_proj_w = split_matrix(mlp_proj_weight, - mapping.tp_size, - mapping.tp_rank, - dim=2) - else: - mlp_proj_w = split_matrix(mlp_proj_weight, - mapping.tp_size, - mapping.tp_rank, - dim=0) + # moe expert parallel + mlp_proj_weight = split_matrix(mlp_proj_weight, + mapping.moe_ep_size, + mapping.moe_ep_rank, + dim=0) + # moe tensor parallel + mlp_proj_w = split_matrix(mlp_proj_weight, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=2) weights.update( get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj.', None, use_weight_only, @@ -621,6 +629,16 @@ def execute(workers, func, hf_model): print(tensorrt_llm.__version__) args = parse_arguments() world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" tik = time.time() @@ -661,7 +679,6 @@ def execute(workers, func, hf_model): args.hidden_act = 'swiglu' args.rotary_base = hf_config.attn_config.rope_theta args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, - args.moe_tp_mode, args.moe_renorm_mode).validate() config = { 'architecture': 'DbrxForCausalLM', @@ -680,33 +697,25 @@ 
def execute(workers, func, hf_model): 'rotary_base': args.rotary_base, 'rotary_scaling': args.rotary_scaling, 'quantization': { - 'quant_algo': - quant_algo, - 'kv_cache_quant_algo': - kv_cache_quant_algo, - 'exclude_modules': [ - 'lm_head', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ], + 'quant_algo': quant_algo, + 'kv_cache_quant_algo': kv_cache_quant_algo, }, 'moe': { "num_experts": args.moe_num_experts, "top_k": args.moe_top_k, - "tp_mode": args.moe_tp_mode, "normalization_mode": args.moe_renorm_mode }, 'mapping': { 'world_size': world_size, 'tp_size': args.tp_size, 'pp_size': args.pp_size, + 'moe_tp_size': args.moe_tp_size, + 'moe_ep_size': args.moe_ep_size, }, 'clip_qkv': args.clip_qkv, 'dense_context_fmha': args.dense_context_fmha, } - if args.use_weight_only and args.moe_config.has_moe(): - config['quantization']['exclude_modules'].append('router') - config.update(args_to_build_options(args)) with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: @@ -725,7 +734,9 @@ def convert_and_save(hf_model, rank): mapping = Mapping(world_size=world_size, rank=rank, tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) act_range = {} if args.int8_kv_cache: tokenizer = AutoTokenizer.from_pretrained(args.model_dir, diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index 8528147fd..cd04908b5 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/enc_dec/README.md b/examples/enc_dec/README.md index 69f3c33b0..51ec74ef2 100644 --- a/examples/enc_dec/README.md +++ b/examples/enc_dec/README.md @@ -99,21 +99,23 @@ The default value of `--max_input_len` is 1024. When building DecoderModel, spec DecoderModel takes `--max_encoder_input_len` and `--max_input_len` as model inputs, `--max_encoder_input_len` is set to 1024 as default since `--max_input_len` is 1024 for EncoderModel. -To be noted: for T5, add `--context_fmha disable`, and `--bert_attention_plugin`, `--gpt_attention_plugin`, `--remove_input_padding`, `--gemm_plugin` require explicit disabling and setting. +To be noted: +1. For T5, add `--context_fmha disable`. FMHA with T5's relative attention bias is not implemented. Add `--use_implicit_relative_attention` when `--max_seq_len` is extremely large, causing decoder engine size to be too large to fit in memory. Compute relative attention on-the-fly (implicitly, without pre-computation) instead. +2. `--bert_attention_plugin`, `--gpt_attention_plugin`, `--remove_input_padding`, `--gemm_plugin` require explicit disabling and setting, or else they'll be set to default value in `trtllm-build`. ```bash # --gpt_attention_plugin is necessary in Enc-Dec. # Try --gemm_plugin to prevent accuracy issue. 
# It is recommended to use --remove_input_padding along with --gpt_attention_plugin for better performance -trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/tp${TP_SIZE}/pp${PP_SIZE}/encoder \ - --output_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE}/encoder \ +trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/encoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION}/encoder \ --paged_kv_cache disable \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width ${MAX_BEAM_WIDTH} \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 1224 \ --gemm_plugin ${INFERENCE_PRECISION} \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ @@ -121,15 +123,14 @@ trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION --context_fmha disable # For decoder, refer to the above content and set --max_input_len correctly -trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/tp${TP_SIZE}/pp${PP_SIZE}/decoder \ - --output_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE}/decoder \ - --paged_kv_cache disable \ +trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/decoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION}/decoder \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width ${MAX_BEAM_WIDTH} \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 201 \ --gemm_plugin ${INFERENCE_PRECISION} \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ @@ -158,15 +159,15 @@ python convert_checkpoint.py --model_type ${MODEL_TYPE} \ --dtype ${INFERENCE_PRECISION} # Note: non-T5 models can enable FMHA for the encoder part, for FP16/BF16, the default is enabled -trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/tp${TP_SIZE}/pp${PP_SIZE}/encoder \ - --output_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE}/encoder \ +trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/encoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION}/encoder \ --paged_kv_cache disable \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width ${MAX_BEAM_WIDTH} \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 1224 \ --gemm_plugin ${INFERENCE_PRECISION} \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ @@ -174,15 +175,14 @@ trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION # --context_fmha disable should be removed # Use the same command for decoder engine -trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/tp${TP_SIZE}/pp${PP_SIZE}/decoder \ - --output_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE}/decoder \ - --paged_kv_cache disable \ +trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/${INFERENCE_PRECISION}/decoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION}/decoder \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width ${MAX_BEAM_WIDTH} \ --max_batch_size 8 \ - --max_output_len 200 \ + 
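For orientation, the general request/response pattern of the executor Python bindings is sketched below. This is a minimal sketch based on the decoder-only bindings example added under `examples/bindings/executor/` in this patch; the engine path and token ids are placeholders, and encoder-decoder specific inputs are not covered here.

```python
# Minimal sketch of the executor bindings request/response loop.
# Based on the decoder-only example in this patch; paths and ids are placeholders.
import datetime

import tensorrt_llm.bindings.executor as trtllm

config = trtllm.ExecutorConfig(1)  # beam width 1
executor = trtllm.Executor("/path/to/engine_dir", trtllm.ModelType.DECODER_ONLY, config)

if executor.can_enqueue_requests():
    # Enqueue a single request with placeholder token ids.
    executor.enqueue_request(trtllm.Request(input_token_ids=[1, 2, 3, 4], max_new_tokens=16))
    done = False
    while not done:
        # Wait up to 10 seconds for responses, then collect the generated tokens.
        for response in executor.await_responses(datetime.timedelta(seconds=10)):
            if response.has_error():
                raise RuntimeError(response.error_msg)
            done = response.result.is_final
            print(response.result.output_token_ids)
```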
--max_seq_len 201 \ --gemm_plugin ${INFERENCE_PRECISION} \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ @@ -205,11 +205,13 @@ Different types of runtime are provided for encoder-decoder models. Following an Please refer to the documentation for the details of [paged kv cache](../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../docs/source/advanced/gpt-attention.md#inflight-batching). #### Run C++ runtime +**Note: to use inflight batching and paged kv cache features in C++ runtime, please make sure you have set `--paged_kv_cache enable` (which is by default enabled) in the `trtllm-build` command of the decoder. Meanwhile, if using Python runtime, it is recommended to disable this flag by `--paged_kv_cache disable` to avoid any unnecessary overhead.** + For good usability, Python binding of the C++ runtime is provided. You can use the high-level C++ `ModelRunner` under the `examples/` root folder. ```python # Inferencing via python binding of C++ runtime with inflight batching (IFB) -python3 ../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --input_text "translate English to German: The house is wonderful." +python3 ../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --input_text "translate English to German: The house is wonderful." ``` For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend). 
@@ -220,10 +222,10 @@ For pure Python runtime, you can still use the encoder-decoder specific script u ```bash # Inferencing w/ single GPU greedy search, compare results with HuggingFace FP32 -python3 run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE} --engine_name ${MODEL_NAME} --model_name tmp/hf_models/${MODEL_NAME} --max_new_token=64 --num_beams=1 --compare_hf_fp32 +python3 run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --engine_name ${MODEL_NAME} --model_name tmp/hf_models/${MODEL_NAME} --max_new_token=64 --num_beams=1 --compare_hf_fp32 # Inferencing w/ 4 GPUs (4-way TP, as configured during the engine building step), greedy search, compare results with HuggingFace FP32 -mpirun --allow-run-as-root -np ${WORLD_SIZE} python3 run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE} --engine_name ${MODEL_NAME} --model_name tmp/hf_models/${MODEL_NAME} --max_new_token=64 --num_beams=1 --compare_hf_fp32 +mpirun --allow-run-as-root -np ${WORLD_SIZE} python3 run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --engine_name ${MODEL_NAME} --model_name tmp/hf_models/${MODEL_NAME} --max_new_token=64 --num_beams=1 --compare_hf_fp32 ``` ### Benchmark @@ -278,14 +280,14 @@ python convert_checkpoint.py --model_type bart \ ```bash trtllm-build --checkpoint_dir tmp/trt_models/bart-large-cnn/${INFERENCE_PRECISION}/encoder \ - --output_dir tmp/trt_engines/bart-large-cnn/1-gpu/${INFERENCE_PRECISION}/encoder \ + --output_dir tmp/trt_engines/bart-large-cnn/${INFERENCE_PRECISION}/encoder \ --paged_kv_cache disable \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 1224 \ --gemm_plugin ${INFERENCE_PRECISION} \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ @@ -295,14 +297,13 @@ trtllm-build --checkpoint_dir tmp/trt_models/bart-large-cnn/${INFERENCE_PRECISIO --lora_target_modules attn_q attn_v trtllm-build --checkpoint_dir tmp/trt_models/bart-large-cnn/${INFERENCE_PRECISION}/decoder \ - --output_dir tmp/trt_engines/bart-large-cnn/1-gpu/${INFERENCE_PRECISION}/decoder \ - --paged_kv_cache disable \ + --output_dir tmp/trt_engines/bart-large-cnn/${INFERENCE_PRECISION}/decoder \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 201 \ --gemm_plugin ${INFERENCE_PRECISION} \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ @@ -317,7 +318,7 @@ trtllm-build --checkpoint_dir tmp/trt_models/bart-large-cnn/${INFERENCE_PRECISIO ```bash python run.py \ - --engine_dir tmp/trt_engines/bart-large-cnn/1-gpu/${INFERENCE_PRECISION}/ \ + --engine_dir tmp/trt_engines/bart-large-cnn/${INFERENCE_PRECISION}/ \ --engine_name bart-large-cnn \ --model_name tmp/hf_models/bart-large-cnn \ --max_new_token=64 \ @@ -330,7 +331,7 @@ python run.py \ ```bash python run.py \ - --engine_dir tmp/trt_engines/bart-large-cnn/1-gpu/${INFERENCE_PRECISION}/ \ + --engine_dir tmp/trt_engines/bart-large-cnn/${INFERENCE_PRECISION}/ \ --engine_name bart-large-cnn \ --model_name tmp/hf_models/bart-large-cnn \ --max_new_token=64 \ @@ -385,32 +386,31 @@ python convert_checkpoint.py --model_type nmt \ # Build TensorRT engine(s) # Note: non-T5 models can enable FMHA for the encoder part, although only 
FP16/BF16 precisions are valid -trtllm-build --checkpoint_dir tmp/trt_models/wmt14/${INFERENCE_PRECISION}/tp${TP_SIZE}/pp${PP_SIZE}/encoder \ - --output_dir tmp/trt_engines/wmt14/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE}/encoder \ +trtllm-build --checkpoint_dir tmp/trt_models/wmt14/${INFERENCE_PRECISION}/encoder \ + --output_dir tmp/trt_engines/wmt14/${INFERENCE_PRECISION}/encoder \ --paged_kv_cache disable \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 1224 \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ --remove_input_padding disable -trtllm-build --checkpoint_dir tmp/trt_models/wmt14/${INFERENCE_PRECISION}/tp${TP_SIZE}/pp${PP_SIZE}/decoder \ - --output_dir tmp/trt_engines/wmt14/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE}/decoder \ - --paged_kv_cache disable \ +trtllm-build --checkpoint_dir tmp/trt_models/wmt14/${INFERENCE_PRECISION}/decoder \ + --output_dir tmp/trt_engines/wmt14/${INFERENCE_PRECISION}/decoder \ --moe_plugin disable \ --enable_xqa disable \ --use_custom_all_reduce disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 200 \ + --max_seq_len 201 \ --bert_attention_plugin ${INFERENCE_PRECISION} \ --gpt_attention_plugin ${INFERENCE_PRECISION} \ --remove_input_padding disable \ --max_input_len 1 # Run -mpirun --allow-run-as-root -np ${WORLD_SIZE} python3 run.py --engine_dir tmp/trt_engines/wmt14/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE} --engine_name wmt14 --model_name tmp/fairseq_models/wmt14/${WORLD_SIZE}-gpu/${INFERENCE_PRECISION}/tp${TP_SIZE} --max_new_token=24 --num_beams=1 +mpirun --allow-run-as-root -np ${WORLD_SIZE} python3 run.py --engine_dir tmp/trt_engines/wmt14/${INFERENCE_PRECISION} --engine_name wmt14 --model_name tmp/fairseq_models/wmt14/${INFERENCE_PRECISION} --max_new_token=24 --num_beams=1 ``` diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 25e8417cf..567d1d89d 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/README.md b/examples/gemma/README.md index 1f8f83f1c..193807719 100644 --- a/examples/gemma/README.md +++ b/examples/gemma/README.md @@ -66,11 +66,10 @@ After getting checkpoint, we can use `trtllm-build` command to build TensorRT-LL ```bash ENGINE_PATH=/tmp/gemma/2B/bf16/1-gpu/ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} ``` @@ -162,11 +161,10 @@ python3 ./examples/gemma/convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --output_dir ${ENGINE_PATH} python3 ../summarize.py --test_trt_llm \ @@ -212,11 +210,9 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - 
--gpt_attention_plugin bfloat16 \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --output_dir ${ENGINE_PATH} python3 ../summarize.py --test_trt_llm \ @@ -261,11 +257,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin float16 \ - --gpt_attention_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin float16 \ --output_dir ${ENGINE_PATH} @@ -307,11 +302,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 32 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -349,11 +343,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 32 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -393,11 +386,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 32 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -440,11 +432,10 @@ python3 ./examples/gemma/convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -485,11 +476,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -527,11 +517,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin float16 \ - --gpt_attention_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin float16 \ --output_dir ${ENGINE_PATH} @@ -578,11 +567,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 32 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -619,11 +607,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - 
--gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 32 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -662,11 +649,10 @@ python3 ./convert_checkpoint.py \ --output-model-dir ${UNIFIED_CKPT_PATH} trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin bfloat16 \ - --gpt_attention_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 32 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin bfloat16 \ --output_dir ${ENGINE_PATH} @@ -709,11 +695,9 @@ HF_GEMMA_PATH can either be HF model card name or the downloaded model path. QUA For fp8, build engines with: ``` trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin float16 \ - --gpt_attention_plugin float16 \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --lookup_plugin float16 \ --output_dir ${ENGINE_PATH} ``` @@ -722,11 +706,10 @@ For int4_awq and int8_sq, build engines with: ``` trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ - --gemm_plugin float16 \ - --gpt_attention_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --enable_xqa enable \ --lookup_plugin float16 \ --output_dir ${ENGINE_PATH} diff --git a/examples/gemma/convert_checkpoint.py b/examples/gemma/convert_checkpoint.py index 1d5eb71b2..38895306f 100644 --- a/examples/gemma/convert_checkpoint.py +++ b/examples/gemma/convert_checkpoint.py @@ -832,7 +832,7 @@ def convert_from_checkpoint( dim=trt_llm_config.embedding_sharding_dim, ) if trt_llm_config.quant_mode.is_int8_weight_only() and not trt_llm_config.quant_mode.has_per_group_scaling() and \ - not trt_llm_config.quant_mode.has_int8_kv_cache() and 'vocab_embedding' not in trt_llm_config.quantization.exclude_modules: + not trt_llm_config.quant_mode.has_int8_kv_cache() and trt_llm_config.quantization.exclude_modules is not None: # shape of embedding table: [V, K], V: vocab size, K: embedding dim @@ -1009,12 +1009,7 @@ def main(): if args.use_weight_only_with_precision.endswith( "awq") or args.use_weight_only_with_precision.endswith( "int4") or not args.use_int8_weight_only_embedding: - quant_kwargs.update(has_zero_point=False, - pre_quant_scale=True, - exclude_modules=[ - 'lm_head', 'router', 'vocab_embedding', - 'position_embedding', 'block_embedding' - ]) + quant_kwargs.update(has_zero_point=False, pre_quant_scale=True) else: quant_kwargs.update(exclude_modules=['router']) @@ -1022,7 +1017,7 @@ def main(): quant_config.quant_algo = quant_kwargs['quant_algo'] quant_config.kv_cache_quant_algo = quant_kwargs['kv_cache_quant_algo'] if args.use_weight_only_with_precision: - quant_config.exclude_modules = quant_kwargs['exclude_modules'] + quant_config.exclude_modules = quant_kwargs.get('exclude_modules') if args.use_weight_only_with_precision.endswith("awq"): quant_config.group_size = 128 quant_config.has_zero_point = quant_kwargs['has_zero_point'] diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index 1774609ba..9a2afb89d 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,7 +3,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". 
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/README.md b/examples/gpt/README.md index 6508d3ed3..80c38314e 100644 --- a/examples/gpt/README.md +++ b/examples/gpt/README.md @@ -198,7 +198,7 @@ trtllm-build --model_config gpt_530b/trt_ckpt/fp16/16-gpu/config.json \ --gemm_plugin auto \ --max_batch_size 128 \ --max_input_len 128 \ - --max_output_len 20 \ + --max_seq_len 148 \ --output_dir gpt_530b/trt_engines/fp16/16-gpu \ --workers 8 ``` @@ -698,7 +698,7 @@ trtllm-build --checkpoint_dir gpt-next-2B/trt_ckpt/fp16/1-gpu \ --max_batch_size 4 \ --max_beam_width 2 \ --max_input_len 512 \ - --max_output_len 50 \ + --max_seq_len 562 \ --output_dir gpt-next-2B/trt_engines/fp16/1-gpu # Run inference directly from NeMo LoRA checkpoint diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index b31a07db6..954293a71 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/README.md b/examples/gptj/README.md index 3a826eb24..67f96af05 100644 --- a/examples/gptj/README.md +++ b/examples/gptj/README.md @@ -82,7 +82,7 @@ trtllm-build --checkpoint_dir ./trt_ckpt/gptj_fp16_tp1/ \ --gemm_plugin float16 \ --max_batch_size=32 \ --max_input_len=1919 \ - --max_output_len=128 + --max_seq_len=2047 ``` INT8 weight-only diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index 815e4caee..58af19bd7 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/README.md b/examples/gptneox/README.md index ac1c4329e..d7c48483f 100644 --- a/examples/gptneox/README.md +++ b/examples/gptneox/README.md @@ -88,14 +88,14 @@ trtllm-build --checkpoint_dir ./gptneox/20B/trt_ckpt/fp16/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./gptneox/20B/trt_engines/fp16/1-gpu/ # With 2-way Tensor Parallel trtllm-build --checkpoint_dir ./gptneox/20B/trt_ckpt/fp16/2-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --workers 2 \ --output_dir ./gptneox/20B/trt_engines/fp16/2-gpu/ # Single GPU with int8 weight only @@ -103,14 +103,14 @@ trtllm-build --checkpoint_dir ./gptneox/20B/trt_ckpt/int8_wo/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./gptneox/20B/trt_engines/int8_wo/1-gpu/ # With 2-way Tensor Parallel with int8 weight only trtllm-build --checkpoint_dir ./gptneox/20B/trt_ckpt/int8_wo/2-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --workers 2 \ --output_dir ./gptneox/20B/trt_engines/int8_wo/2-gpu/ ``` @@ -198,14 +198,14 @@ trtllm-build --checkpoint_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 
\ --output_dir ./gptneox/20B/trt_engines/int4_gptq/1-gpu/ # With 2-way Tensor Parallel trtllm-build --checkpoint_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --workers 2 \ --output_dir ./gptneox/20B/trt_engines/int4_gptq/2-gpu/ ``` diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index fdf548ce7..8edbc46e6 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/README.md b/examples/grok/README.md index 943747ed7..ac3a3e478 100644 --- a/examples/grok/README.md +++ b/examples/grok/README.md @@ -63,6 +63,7 @@ python convert_checkpoint.py --model_dir ./tmp/grok-1/ \ --output_dir ./tllm_checkpoint_8gpus_bf16 \ --dtype bfloat16 \ --use_weight_only \ + --tp_size 8 \ --workers 8 trtllm-build --checkpoint_dir ./tllm_checkpoint_8gpus_bf16 \ diff --git a/examples/grok/convert_checkpoint.py b/examples/grok/convert_checkpoint.py index ea8a59946..ff09150e0 100644 --- a/examples/grok/convert_checkpoint.py +++ b/examples/grok/convert_checkpoint.py @@ -136,11 +136,18 @@ def parse_arguments(): 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' ) parser.add_argument( - '--moe_tp_mode', - default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, + '--moe_tp_size', type=int, + default=-1, help= - 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' ) parser.add_argument( '--moe_renorm_mode', @@ -168,10 +175,6 @@ def args_to_quantization(args: argparse.Namespace) -> QuantConfig: '''return config dict with quantization info based on the command line args ''' quant_config = QuantConfig() - quant_config.exclude_modules = [ - 'lm_head', 'router', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ] if args.use_weight_only: if args.weight_only_precision == 'int8': quant_config.quant_algo = QuantAlgo.W8A16 @@ -208,12 +211,13 @@ def from_cli_args(args): 'norm_epsilon': args.rms_norm_eps, 'moe_num_experts': args.moe_num_experts, 'moe_top_k': args.moe_top_k, - 'moe_tp_mode': args.moe_tp_mode, 'moe_normalization_mode': args.moe_renorm_mode, 'mapping': { 'world_size': args.tp_size * args.pp_size, 'tp_size': args.tp_size, - 'pp_size': args.pp_size + 'pp_size': args.pp_size, + 'moe_tp_size': args.moe_tp_size, + 'moe_ep_size': args.moe_ep_size, }, 'quantization': args_to_quantization(args).asdict() } @@ -277,10 +281,20 @@ def convert_and_save_xai(args): model_dir = args.model_dir load_by_shard = args.load_by_shard world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" # Need to convert the cli args to 
the kay-value pairs and override them in the generate config dict. # Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now, # before the refactor is done. - override_fields = {'moe_tp_mode': args.moe_tp_mode} + override_fields = {} quantization = args_to_quantization(args) override_fields.update(args_to_build_options(args)) @@ -293,7 +307,9 @@ def convert_and_save_rank(args, rank): mapping = Mapping(world_size=world_size, rank=rank, tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) grok = GrokForCausalLM.from_hugging_face( model_dir, args.dtype, diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index 6ba4cf606..b6e249ae7 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,6 +1,6 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/high-level-api/llm_examples.py b/examples/high-level-api/llm_examples.py index 15d2e42a4..7e51982ff 100644 --- a/examples/high-level-api/llm_examples.py +++ b/examples/high-level-api/llm_examples.py @@ -169,10 +169,6 @@ def run_llm_with_quantization(prompt: str, model_dir: str, quant_type: str): else: config.quant_config.quant_algo = QuantAlgo.FP8 config.quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - config.quant_config.exclude_modules = [ - 'lm_head', 'router', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ] llm = LLM(config) prompts = parse_prompts(prompt, False) diff --git a/examples/high-level-api/requirements.txt b/examples/high-level-api/requirements.txt index bd1da8e38..182e2ccc1 100644 --- a/examples/high-level-api/requirements.txt +++ b/examples/high-level-api/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 801d14c50..6c49a765f 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets==2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/llama/README.md b/examples/llama/README.md index 94207ef45..96c9cf8e9 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -870,7 +870,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_4gpu_codellama \ --output_dir ./tmp/codellama/trt_engines/fp16/4-gpu/ \ --gemm_plugin auto \ --max_input_len 15360 \ - --max_output_len 1024 \ + --max_seq_len 16384 \ --max_batch_size 4 ``` @@ -909,7 +909,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_2gpu \ --lora_plugin auto \ --max_batch_size 1 \ --max_input_len 512 \ - --max_output_len 50 \ + --max_seq_len 562 \ --lora_dir chinese-llama-2-lora-13b ``` @@ -978,7 +978,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu \ --lora_plugin auto \ --max_batch_size 8 \ --max_input_len 512 \ - --max_output_len 50 \ + --max_seq_len 562 \ --lora_dir "luotuo-lora-7b-0.1/" "Japanese-Alpaca-LoRA-7b-v0/" \ --max_lora_rank 8 \ --lora_target_modules attn_q attn_k attn_v diff --git a/examples/llama/convert_checkpoint.py 
b/examples/llama/convert_checkpoint.py index 951d67ac9..39d03bff7 100644 --- a/examples/llama/convert_checkpoint.py +++ b/examples/llama/convert_checkpoint.py @@ -30,6 +30,20 @@ def parse_arguments(): type=int, default=1, help='N-way pipeline parallelism size') + parser.add_argument( + '--moe_tp_size', + type=int, + default=-1, + help= + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' + ) parser.add_argument('--dtype', type=str, default='float16', @@ -180,13 +194,6 @@ def parse_arguments(): help= 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' ) - parser.add_argument( - '--moe_tp_mode', - default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, - type=int, - help= - 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', - ) parser.add_argument( '--moe_renorm_mode', default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, @@ -213,10 +220,6 @@ def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: '''return config dict with quantization info based on the command line args ''' quant_config = QuantConfig() - quant_config.exclude_modules = [ - 'lm_head', 'router', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ] if args.use_weight_only: if args.weight_only_precision == 'int8': quant_config.quant_algo = QuantAlgo.W8A16 @@ -251,6 +254,8 @@ def convert_and_save_meta(args, rank): mapping = Mapping(world_size=args.tp_size * args.pp_size, tp_size=args.tp_size, pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size, rank=rank) assert not args_to_quant_config(args).quant_mode.has_any_quant(), \ "quantization from meta checkpoint or empty model were never supported" @@ -293,13 +298,14 @@ def from_cli_args(args): 'moe': { 'num_experts': args.moe_num_experts, 'top_k': args.moe_top_k, - 'tp_mode': args.moe_tp_mode, 'normalization_mode': args.moe_renorm_mode, }, 'mapping': { 'world_size': args.tp_size * args.pp_size, 'tp_size': args.tp_size, - 'pp_size': args.pp_size + 'pp_size': args.pp_size, + 'moe_tp_size': args.moe_tp_size, + 'moe_ep_size': args.moe_ep_size, }, 'quantization': args_to_quant_config(args).to_dict() } @@ -315,7 +321,7 @@ def convert_and_save_hf(args): # Need to convert the cli args to the kay-value pairs and override them in the generate config dict. # Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now, # before the refactor is done. 
- override_fields = {'moe_tp_mode': args.moe_tp_mode} + override_fields = {} override_fields.update(args_to_build_options(args)) quant_config = args_to_quant_config(args) @@ -327,7 +333,10 @@ def convert_and_save_hf(args): world_size=world_size, rank=-1, #intentinoally make -1 to avoid mistake tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + # TODO: support moe quantization for tp + ep LLaMAForCausalLM.quantize(args.model_dir, args.output_dir, dtype=args.dtype, @@ -351,7 +360,9 @@ def convert_and_save_rank(args, rank): mapping = Mapping(world_size=world_size, rank=rank, tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) llama = LLaMAForCausalLM.from_hugging_face( model_dir if hf_model is None else hf_model, args.dtype, @@ -408,6 +419,16 @@ def main(): args = parse_arguments() world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" tik = time.time() if not os.path.exists(args.output_dir): diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index fbe76cd65..f8e55c97d 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/mamba/README.md b/examples/mamba/README.md index b261d1b7f..710bb10f0 100644 --- a/examples/mamba/README.md +++ b/examples/mamba/README.md @@ -102,7 +102,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \ --mamba_conv1d_plugin bfloat16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./mamba_model/mamba-2.8b/trt_engines/bf16/1-gpu/ # mamba-1.4b @@ -112,7 +112,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \ --mamba_conv1d_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./mamba_model/mamba-1.4b/trt_engines/fp16/1-gpu/ # mamba-790m @@ -122,7 +122,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \ --mamba_conv1d_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./mamba_model/mamba-790m/trt_engines/fp16/1-gpu/ # mamba-370m @@ -132,7 +132,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \ --mamba_conv1d_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./mamba_model/mamba-370m/trt_engines/fp16/1-gpu/ # mamba-130m @@ -142,7 +142,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ \ --mamba_conv1d_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./mamba_model/mamba-130m/trt_engines/fp16/1-gpu/ ``` diff --git a/examples/mamba/requirements.txt 
b/examples/mamba/requirements.txt index 168ca1dca..5d3b4823e 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate rouge_score diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index 4d1cb9c80..b29584c99 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md index 11f458d1f..6ec33bd74 100644 --- a/examples/mixtral/README.md +++ b/examples/mixtral/README.md @@ -1,7 +1,8 @@ # Mixtral This document shows how to build and run a Mixtral model in TensorRT-LLM on both single GPU, single node multi-GPU and -multi-node multi-GPU. +multi-node multi-GPU. Mixtral 8x22B is also supported and can replace Mixtral 8x7B below as long as GPU memory is +sufficient. ## Overview @@ -11,6 +12,7 @@ See the LLaMA example [`examples/llama`](../llama) for details. ### Build TensorRT engine(s) +#### Download Mixtral 8x7B weights Get the weights by downloading from HF https://huggingface.co/mistralai/Mixtral-8x7B-v0.1. See also https://huggingface.co/docs/transformers/main/en/model_doc/mixtral @@ -19,6 +21,15 @@ git lfs install git clone https://huggingface.co/mistralai/Mixtral-8x7B-v0.1 ``` +#### Download Mixtral 8x22B weights +Get the weights by downloading from HF https://huggingface.co/mistralai/Mixtral-8x22B-v0.1. +See also https://huggingface.co/docs/transformers/main/en/model_doc/mixtral + +```bash +git lfs install +git clone https://huggingface.co/mistralai/Mixtral-8x22B-v0.1 +``` + We use the LLaMA `convert_checkpoint.py` script to convert and build the model. TensorRT-LLM LLaMA builds TensorRT engine(s) from HF checkpoint provided by `--model_dir`. If no checkpoint directory is specified, TensorRT-LLM will build engine(s) with dummy weights. @@ -44,10 +55,23 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \ python ../llama/convert_checkpoint.py --model_dir ./Mixtral-8x7B-v0.1 \ --output_dir ./tllm_checkpoint_mixtral_2gpu \ --dtype float16 \ - --tp_size 2 + --tp_size 2 \ + --moe_tp_size 2 trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \ --output_dir ./trt_engines/mixtral/tp2 \ --gemm_plugin float16 + + +# Build Mixtral8x22B with tensor parallelism and expert parallelism +python ../llama/convert_checkpoint.py --model_dir ./Mixtral-8x22B-v0.1 \ + --output_dir ./tllm_checkpoint_mixtral_8gpu \ + --dtype float16 \ + --tp_size 8 \ + --moe_tp_size 2 \ + --moe_ep_size 4 +trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_8gpu \ + --output_dir ./trt_engines/mixtral/tp2ep4 \ + --gemm_plugin float16 ``` Then, you can test your engine with the [run.py](../run.py) script: @@ -60,26 +84,38 @@ For more examples see [`examples/llama/README.md`](../llama/README.md) ### Parallelism Modes -Mixture of Experts supports two parallelism modes, these are Expert Parallelism (EP) and Tensor Parallelism (TP). +Mixture of Experts supports three parallelism modes: Expert Parallelism (EP), Tensor Parallelism (TP), and a hybrid of the two (TP+EP). + +In TP mode (default) expert weight matrices are sliced evenly between all GPUs, so that all GPUs work together to calculate the result for each expert.
-In TP mode (default) expert weight matrices are sliced evenly between all GPUs, so that all GPUs work together to -calculate the result for each expert. +In EP mode each GPU is assigned a subset of the expert weight matrices, so each GPU works independently to calculate the result for its assigned experts. This may cause load balancing issues where some GPUs have more work than others, thus increasing latency. -In EP mode each GPU is assigned a subset of the expert weights matrices, so each GPU works independently to calculate -the result for its assigned experts. This may cause load balancing issues where some GPUs have more work than others, -thus increasing latency. +In TP+EP mode, both strategies are used simultaneously: each GPU handles a subset of the expert weight matrices (as in EP mode), and those weights are further sliced across multiple GPUs (as in TP mode). This hybrid approach balances the workload more evenly across GPUs and reduces the likelihood of the bottlenecks associated with EP mode alone. -Enable expert parallelism by providing `--moe_tp_mode 1` to `convert_checkpoint.py`, see [tensorrt_llm/layers/moe.py](../../tensorrt_llm/layers/moe.py#L51) for available values +You can enable Expert Parallelism or the hybrid mode by setting `--moe_tp_size` and `--moe_ep_size` when calling `convert_checkpoint.py`. If only `--moe_tp_size` is provided, TRT-LLM uses Tensor Parallelism for the MoE weights; if only `--moe_ep_size` is provided, it uses Expert Parallelism; if both are provided, the hybrid mode is used. + +Make sure that the product of `moe_tp_size` and `moe_ep_size` equals `tp_size`, since the total MoE parallelism across all GPUs must match the parallelism used by the rest of the model.
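For reference, the conversion scripts touched by this change resolve the two MoE flags with the default logic sketched below. This is a minimal standalone Python sketch mirroring the resolution added to `convert_checkpoint.py`; the helper name `resolve_moe_parallelism` is illustrative and not part of the script.

```python
def resolve_moe_parallelism(tp_size: int, moe_tp_size: int = -1, moe_ep_size: int = -1):
    """Mirror of the -1 defaulting logic added to convert_checkpoint.py."""
    if moe_tp_size == -1 and moe_ep_size == -1:
        # MoE defaults to tensor parallelism only.
        moe_tp_size, moe_ep_size = tp_size, 1
    elif moe_tp_size == -1:
        moe_tp_size = tp_size // moe_ep_size
    elif moe_ep_size == -1:
        moe_ep_size = tp_size // moe_tp_size
    assert moe_tp_size * moe_ep_size == tp_size, \
        "moe_tp_size * moe_ep_size must equal tp_size"
    return moe_tp_size, moe_ep_size

# The Mixtral 8x22B build above: tp_size=8, moe_tp_size=2, moe_ep_size=4.
print(resolve_moe_parallelism(8, moe_tp_size=2, moe_ep_size=4))  # (2, 4)
# Leaving both flags unset keeps the MoE layers tensor-parallel only.
print(resolve_moe_parallelism(2))  # (2, 1)
```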
```bash -# Build Mixtral8x7B with Expert Parallelism Mode +# Build Mixtral8x7B with Expert Parallelism python ../llama/convert_checkpoint.py --model_dir ./Mixtral-8x7B-v0.1 \ --output_dir ./tllm_checkpoint_mixtral_2gpu \ --dtype float16 \ --tp_size 2 \ - --moe_tp_mode 1 # 1 is expert parallel, 2 is tensor parallel (default 2) + --moe_ep_size 2 trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \ - --output_dir ./trt_engines/mixtral/tp2 \ + --output_dir ./trt_engines/mixtral/ep2 \ + --gemm_plugin float16 + +# Build Mixtral8x7B with Expert Parallelism and Tensor Parallelism +python ../llama/convert_checkpoint.py --model_dir ./Mixtral-8x7B-v0.1 \ + --output_dir ./tllm_checkpoint_mixtral_4gpu \ + --dtype float16 \ + --tp_size 4 \ + --moe_tp_size 2 \ + --moe_ep_size 2 +trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_4gpu \ + --output_dir ./trt_engines/mixtral/tp2ep2 \ --gemm_plugin float16 ``` @@ -132,7 +168,6 @@ python ../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \ # Enable fp8 context fmha to get further acceleration by setting `--use_fp8_context_fmha enable` trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \ --output_dir ./engine_outputs \ - --gemm_plugin float16 \ --workers 2 ``` diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 6b0b4a750..9eb7892b0 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/model_api/llama.py b/examples/model_api/llama.py index 54870f290..3d182a085 100644 --- a/examples/model_api/llama.py +++ b/examples/model_api/llama.py @@ -47,7 +47,7 @@ def main(): args = parse_args() build_config = BuildConfig(max_input_len=256, - max_output_len=20, + max_seq_len=276, max_batch_size=1) # just for fast build, not best for production build_config.builder_opt = 0 diff --git a/examples/model_api/llama_multi_gpu.py b/examples/model_api/llama_multi_gpu.py index b42900578..eb066770d 100644 --- a/examples/model_api/llama_multi_gpu.py +++ b/examples/model_api/llama_multi_gpu.py @@ -26,7 +26,7 @@ def build_and_run_llama(hf_model_dir, engine_dir, tp_size, rank): ## Build engine build_config = BuildConfig(max_input_len=256, - max_output_len=256, + max_seq_len=512, max_batch_size=8) build_config.builder_opt = 0 # fast build for demo, pls avoid using this in production, since inference might be slower build_config.plugin_config.gemm_plugin = 'float16' # for fast build, tune inference perf based on your needs diff --git a/examples/model_api/llama_quantize.py b/examples/model_api/llama_quantize.py index b3bb81fe9..19fd48f58 100644 --- a/examples/model_api/llama_quantize.py +++ b/examples/model_api/llama_quantize.py @@ -50,7 +50,7 @@ def main(): tokenizer_dir = args.hf_model_dir max_batch_size, max_isl, max_osl = 1, 256, 20 build_config = BuildConfig(max_input_len=max_isl, - max_output_len=max_osl, + max_seq_len=max_osl + max_isl, max_batch_size=max_batch_size) cache_dir = Path(args.cache_dir) checkpoint_dir = cache_dir / "trtllm_checkpoint" diff --git a/examples/mpt/README.md b/examples/mpt/README.md index ea05b002a..093fd123c 100644 --- a/examples/mpt/README.md +++ b/examples/mpt/README.md @@ -136,7 +136,7 @@ All of the checkpoint generated by `convert_checkpoint.py` or `quantize.py` (Mod trtllm-build --checkpoint_dir=./ckpts/mpt-7b/fp16 \ --max_batch_size 32 \ --max_input_len 1024 \ - 
--max_output_len 512 \ + --max_seq_len 1536 \ --gemm_plugin float16 \ --workers 1 \ --output_dir ./trt_engines/mpt-7b/fp16 @@ -163,7 +163,7 @@ Examples of build invocations: trtllm-build --checkpoint_dir ./ckpts/mpt-30b/fp16_tp4 \ --max_batch_size 32 \ --max_input_len 1024 \ - --max_output_len 512 \ + --max_seq_len 1536 \ --gemm_plugin float16 \ --workers 4 \ --output_dir ./trt_engines/mpt-30b/fp16_tp4 @@ -199,7 +199,7 @@ Examples of build invocations: trtllm-build --checkpoint_dir ./ckpts/replit-code-v1_5-3b/bf16_tp2 \ --max_batch_size 32 \ --max_input_len 1024 \ - --max_output_len 512 \ + --max_seq_len 1536 \ --gpt_attention_plugin bfloat16 \ --gemm_plugin bfloat16 \ --workers 2 \ diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 815e4caee..58af19bd7 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index 05f47db88..ad31e50ed 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -15,8 +15,9 @@ We first describe how to run each model on a single GPU. We then provide general - [Kosmos-2](#kosmos-2) - [LLaVA and VILA](#llava-and-vila) - [NeVA](#neva) -- [Video NeVA](#video-neva) - [Nougat](#nougat) +- [Phi-3-vision](#phi-3-vision) +- [Video NeVA](#video-neva) - [Enabling tensor parallelism for multi-GPU](#enabling-tensor-parallelism-for-multi-gpu) ## BLIP2-T5 @@ -53,7 +54,7 @@ We first describe how to run each model on a single GPU. We then provide general --context_fmha disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --max_input_len 924 \ --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_visual_features) @@ -71,7 +72,7 @@ We first describe how to run each model on a single GPU. We then provide general --context_fmha disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --max_encoder_input_len 924 \ --max_input_len 1 ``` @@ -128,7 +129,7 @@ OPT pipeline needs few minor changes from T5 pipeline --max_batch_size 8 \ --max_multimodal_len 256 \ --max_input_len 924 \ - --max_output_len 100 + --max_seq_len 1024 python build_visual_engine.py --model_type ${MODEL_NAME} --model_path tmp/hf_models/${MODEL_NAME} @@ -157,7 +158,7 @@ OPT pipeline needs few minor changes from T5 pipeline --max_batch_size 8 \ --max_multimodal_len 256 \ --max_input_len 924 \ - --max_output_len 100 + --max_seq_len 1024 ``` The built OPT engines lie in `trt_engines/${MODEL_NAME}/int4_weightonly/1-gpu`. 
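Most of the `--max_seq_len` values substituted throughout these build commands are simply the former `--max_input_len` plus the removed `--max_output_len` budget. The snippet below is only a sanity check of that arithmetic, not a build requirement; the helper name is illustrative.

```python
def max_seq_len(max_input_len: int, max_output_len: int) -> int:
    # max_seq_len is generally chosen as input budget + former output budget.
    return max_input_len + max_output_len

assert max_seq_len(924, 100) == 1024    # BLIP2/OPT, GPT-NeoX, Mamba builds
assert max_seq_len(1024, 512) == 1536   # MPT builds
assert max_seq_len(2048, 512) == 2560   # LLaVA and VILA builds
```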
@@ -200,7 +201,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --remove_input_padding disable \ --max_batch_size 48 \ --max_input_len 2048 \ - --max_output_len 1024 \ + --max_seq_len 3076 \ --paged_kv_cache disable \ --use_custom_all_reduce disable \ --enable_xqa disable \ @@ -260,7 +261,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --context_fmha disable \ --max_beam_width 1 \ --max_batch_size 8 \ - --max_output_len 510 \ + --max_seq_len 2558 \ --max_encoder_input_len 2048 \ --max_input_len 1 ``` @@ -313,7 +314,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --use_fused_mlp \ --max_batch_size 1 \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 2560 \ --max_multimodal_len 2048 ``` @@ -352,7 +353,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --gemm_plugin float16 \ --max_batch_size 1 \ --max_input_len 512 \ - --max_output_len 512 \ + --max_seq_len 1024 \ --max_multimodal_len 64 # 1 (max_batch_size) * 64 (num_visual_features) ``` @@ -364,7 +365,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in python run.py \ --hf_model_dir tmp/hf_models/${MODEL_NAME} \ --visual_engine_dir visual_engines/${MODEL_NAME} \ - --llm_engine_dir trt_engines/${MODEL_NAME}/1-gpu/bfloat16 + --llm_engine_dir trt_engines/${MODEL_NAME}/fp16/1-gpu ``` ## LLaVA and VILA @@ -411,7 +412,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --use_fused_mlp \ --max_batch_size 1 \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 2560 \ --max_multimodal_len 576 # 1 (max_batch_size) * 576 (num_visual_features) for LLaVA trtllm-build \ @@ -421,7 +422,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --use_fused_mlp \ --max_batch_size 1 \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 2560 \ --max_multimodal_len 4096 # 1 (max_batch_size) * 4096 (num_visual_features) for VILA ``` @@ -489,7 +490,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --gemm_plugin float16 \ --max_batch_size 1 \ --max_input_len 1024 \ - --max_output_len 100 \ + --max_seq_len 1124 \ --max_multimodal_len 576 # for LLaVA trtllm-build \ @@ -499,7 +500,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --use_fused_mlp \ --max_batch_size 1 \ --max_input_len 1024 \ - --max_output_len 100 \ + --max_seq_len 1124 \ --max_multimodal_len 4096 # for VILA ``` @@ -526,7 +527,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --gemm_plugin float16 \ --max_batch_size 1 \ --max_input_len 1024 \ - --max_output_len 100 \ + --max_seq_len 1124 \ --max_multimodal_len 576 # for LLaVA trtllm-build \ @@ -535,7 +536,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --gemm_plugin float16 \ --max_batch_size 1 \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 2560 \ --max_multimodal_len 4096 # for VILA ``` @@ -569,7 +570,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --gemm_plugin bfloat16 \ --max_batch_size 1 \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 2560 \ --max_multimodal_len 729 # 1 (max_batch_size) * 729 (num_visual_features) ``` @@ -590,52 +591,6 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in Note: use `--run_profiling` for 
performance measurement, use `--check_accuracy` for accuracy check. -## Video NeVA - -[Video NeVA](https://github.com/NVIDIA/NeMo/blob/main/docs/source/multimodal/mllm/video_neva.rst) is a groundbreaking addition to the NeMo Multimodal ecosystem that could work with video modality. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. - -1. Generate TRT-LLM engine for Nemotron model following example in `examples/nemotron/README.md`. To adhere to the NVGPT conventions of the conversion script. This will be used as our base LM for inference. - - ```bash - pip install decord # used for loading video - - python3 ../quantization/quantize.py \ - --nemo_ckpt_path /path/to/nemotron/model.nemo \ - --dtype bfloat16 \ - --batch_size 64 \ - --qformat full_prec \ - --output_dir nemotron-3/trt_ckpt/bf16/1-gpu - - - trtllm-build \ - --checkpoint_dir nemotron-3/trt_ckpt/bf16/1-gpu \ - --output_dir trt_engines/nemotron-3/bf16/1-gpu \ - --gpt_attention_plugin bfloat16 \ - --gemm_plugin bfloat16 \ - --max_batch_size 1 \ - --max_input_len 4096 \ - --max_output_len 256 \ - --max_multimodal_len 3072 # 1 (max_batch_size) * (12 num_frames) * (256 image_token_len) - ``` - -2. Build TensorRT engines for visual components - - ```bash - python build_visual_engine.py --model_path /path/to/video/neva/projector.nemo --model_type video-neva - ``` - - ```bash - python run.py \ - --max_new_tokens 30 \ - --hf_model_dir nemotron-3/trt_ckpt/bf16/1-gpu \ - --visual_engine_dir visual_engines/video_neva_engine \ - --llm_engine_dir trt_engines/nemotron-3/bf16/1-gpu \ - --input_text "Question: what is in the video? Answer:" \ - --video_path /path/to/your/local/video/file - ``` - - Note: use `--run_profiling` for performance measurement, use `--check_accuracy` for accuracy check. - ## Nougat 1. Download Huggingface weights @@ -672,7 +627,7 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in --remove_input_padding enable \ --max_beam_width 1 \ --max_batch_size 1 \ - --max_output_len 100 \ + --max_seq_len 101 \ --max_input_len 1 \ --max_encoder_input_len 588 # 1 (max_batch_size) * 588 (num_visual_features) ``` @@ -691,6 +646,90 @@ Currently, CogVLM only support bfloat16 precision and doesn't support `remove_in Note: Nougat models usually do not need a text prompt. +## Phi-3-vision + +1. Download Huggingface weights + + ```bash + export MODEL_NAME="Phi-3-vision-128k-instruct" + git clone https://huggingface.co/microsoft/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + +2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/phi`. + ```bash + python ../gpt/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gpt_attention_plugin float16 \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 4096 \ + --max_seq_len 4608 \ + --max_multimodal_len 4096 + ``` + +3. Generate TensorRT engines for visual components and combine everything into final pipeline. 
+ + ```bash + python build_visual_engine.py --model_type phi-3-vision --model_path tmp/hf_models/${MODEL_NAME} + + python run.py \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir visual_engines/${MODEL_NAME} \ + --llm_engine_dir trt_engines/${MODEL_NAME}/fp16/1-gpu/ + ``` + +## Video NeVA + +[Video NeVA](https://github.com/NVIDIA/NeMo/blob/main/docs/source/multimodal/mllm/video_neva.rst) is a groundbreaking addition to the NeMo Multimodal ecosystem that could work with video modality. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. + +1. Generate TRT-LLM engine for Nemotron model following example in `examples/nemotron/README.md`. To adhere to the NVGPT conventions of the conversion script. This will be used as our base LM for inference. + + ```bash + pip install decord # used for loading video + + python3 ../quantization/quantize.py \ + --nemo_ckpt_path /path/to/nemotron/model.nemo \ + --dtype bfloat16 \ + --batch_size 64 \ + --qformat full_prec \ + --output_dir nemotron-3/trt_ckpt/bf16/1-gpu + + + trtllm-build \ + --checkpoint_dir nemotron-3/trt_ckpt/bf16/1-gpu \ + --output_dir trt_engines/nemotron-3/bf16/1-gpu \ + --gpt_attention_plugin bfloat16 \ + --gemm_plugin bfloat16 \ + --max_batch_size 1 \ + --max_input_len 4096 \ + --max_seq_len 4352 \ + --max_multimodal_len 3072 # 1 (max_batch_size) * (12 num_frames) * (256 image_token_len) + ``` + +2. Build TensorRT engines for visual components + + ```bash + python build_visual_engine.py --model_path /path/to/video/neva/projector.nemo --model_type video-neva + ``` + + ```bash + python run.py \ + --max_new_tokens 30 \ + --hf_model_dir nemotron-3/trt_ckpt/bf16/1-gpu \ + --visual_engine_dir visual_engines/video_neva_engine \ + --llm_engine_dir trt_engines/nemotron-3/bf16/1-gpu \ + --input_text "Question: what is in the video? Answer:" \ + --video_path /path/to/your/local/video/file + ``` + + Note: use `--run_profiling` for performance measurement, use `--check_accuracy` for accuracy check. + ## Enabling tensor parallelism for multi-GPU The LLM part of the pipeline can be run on multiple GPUs using tensor parallelism. 
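Whatever parallelism is used for the LLM part, `--max_multimodal_len` keeps the arithmetic noted in the comments above: `max_batch_size` multiplied by the number of visual features per sample. A quick illustrative check under that assumption (the helper name is not an API):

```python
def max_multimodal_len(max_batch_size: int, num_visual_features: int) -> int:
    return max_batch_size * num_visual_features

assert max_multimodal_len(8, 32) == 256          # BLIP2
assert max_multimodal_len(1, 576) == 576         # LLaVA
assert max_multimodal_len(1, 12 * 256) == 3072   # Video NeVA: frames * image tokens
```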
@@ -716,7 +755,7 @@ The full set of commands to enable 2-way tensor parallelism for LLaVA is: --gemm_plugin float16 \ --max_batch_size 1 \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 2560 \ --max_multimodal_len 576 python build_visual_engine.py --model_type llava --model_path tmp/hf_models/${MODEL_NAME} diff --git a/examples/multimodal/build_visual_engine.py b/examples/multimodal/build_visual_engine.py index 081a7b639..e4608c0bf 100644 --- a/examples/multimodal/build_visual_engine.py +++ b/examples/multimodal/build_visual_engine.py @@ -12,8 +12,12 @@ import tensorrt as trt from tensorrt_llm.builder import Builder # isort: on +import json +import math + import torch.nn.functional as F from PIL import Image +from safetensors.torch import save_file from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, Blip2ForConditionalGeneration, Blip2Processor, @@ -31,7 +35,8 @@ def parse_arguments(): choices=[ 'opt-2.7b', 'opt-6.7b', 'flan-t5-xl', 'flan-t5-xxl', 'llava', 'vila', 'nougat', 'cogvlm', 'fuyu', - 'pix2struct', 'neva', 'kosmos-2', 'video-neva' + 'pix2struct', 'neva', 'kosmos-2', 'video-neva', + 'phi-3-vision' ], help="Model type") parser.add_argument( @@ -93,6 +98,8 @@ def build(self): build_video_neva_engine(args) elif args.model_type == 'kosmos-2': build_kosmos_engine(args) + elif args.model_type == 'phi-3-vision': + build_phi_engine(args) else: raise RuntimeError(f"Invalid model type {args.model_type}") @@ -663,6 +670,81 @@ def forward(self, images): args.max_batch_size) +def build_phi_engine(args): + processor = AutoProcessor.from_pretrained(args.model_path, + trust_remote_code=True) + raw_image = Image.new('RGB', [10, 10]) # dummy image + image = processor(text="<|image_1|>\ndummy", + images=raw_image, + return_tensors="pt")['pixel_values'].to( + args.device, torch.float16) + try: + with open(f"{args.model_path}/preprocessor_config.json", "r") as file: + config = file.read() + config_dict = json.loads(config) + num_crops = config_dict.get("num_crops") + except: + num_crops = 16 + + class Phi3VisionWrapper(torch.nn.Module): + + def __init__(self, img_processor, img_projection, layer_idx, + image_dim_out): + super().__init__() + self.img_processor = img_processor + self.img_projection = img_projection + self.layer_idx = layer_idx + self.image_dim_out = image_dim_out + + def get_img_features( + self, img_embeds: torch.FloatTensor) -> torch.FloatTensor: + LAYER_IDX = self.layer_idx + + img_processor_output = self.img_processor(img_embeds, + output_hidden_states=True) + img_feature = img_processor_output.hidden_states[LAYER_IDX] + + patch_feature = img_feature[:, 1:] + return patch_feature + + def forward(self, image): + img_features = self.get_img_features(image) + base_feat_height = int(math.sqrt(img_features.shape[1])) + C = self.image_dim_out + H = base_feat_height + img_features = img_features.reshape(-1, H, H, C).reshape( + -1, H // 2, 2, H // 2, 2, + C).contiguous().permute(0, 1, 3, 2, 4, + 5).reshape(-1, H // 2, H // 2, + 4 * C).contiguous() + return self.apply_img_projection(img_features) + + def apply_img_projection(self, input): + return self.img_projection(input) + + model = AutoModelForCausalLM.from_pretrained(args.model_path, + torch_dtype=torch.float16, + trust_remote_code=True).to( + args.device) + + wrapper = Phi3VisionWrapper(model.model.vision_embed_tokens.img_processor, + model.model.vision_embed_tokens.img_projection, + model.model.vision_embed_tokens.layer_idx, + 
model.model.vision_embed_tokens.image_dim_out) + image = image.flatten(0, 1) + glb_GN = wrapper.apply_img_projection( + model.model.vision_embed_tokens.glb_GN) + sub_GN = wrapper.apply_img_projection( + model.model.vision_embed_tokens.sub_GN) + tensors = {"glb_GN": glb_GN, "sub_GN": sub_GN} + save_file(tensors, args.output_dir + "/image_newlines.safetensors") + export_visual_wrapper_onnx(wrapper, image, args.output_dir) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], args.output_dir, + args.max_batch_size * (num_crops + 1)) #TODO: Take input from config + + if __name__ == '__main__': logger = trt.Logger(trt.Logger.INFO) args = parse_arguments() diff --git a/examples/multimodal/run.py b/examples/multimodal/run.py index 7ef1df472..14c953ae6 100644 --- a/examples/multimodal/run.py +++ b/examples/multimodal/run.py @@ -15,6 +15,7 @@ from huggingface_hub import hf_hub_download from PIL import Image +from safetensors import safe_open from torchvision import transforms from transformers import (AutoConfig, AutoProcessor, AutoTokenizer, Blip2Processor, CLIPImageProcessor, NougatProcessor, @@ -179,8 +180,9 @@ def batch_decode(self, x, **kwargs): use_fast=False, use_legacy=False) else: + use_fast = False if self.model_type != "phi-3-vision" else True self.tokenizer = AutoTokenizer.from_pretrained( - self.args.hf_model_dir, use_fast=False, use_legacy=False) + self.args.hf_model_dir, use_fast=use_fast, use_legacy=False) self.tokenizer.padding_side = "right" @@ -193,6 +195,14 @@ def init_image_encoder(self): logger.info(f'Creating session from engine {vision_encoder_path}') self.visual_encoder_session = Session.from_serialized_engine( engine_buffer) + if self.model_type == "phi-3-vision": + self.image_newlines = {} + image_newlines_path = os.path.join(self.args.visual_engine_dir, + 'image_newlines.safetensors') + with safe_open(image_newlines_path, framework="pt", + device="cuda") as f: + for k in f.keys(): + self.image_newlines[k] = f.get_tensor(k) def init_llm(self): if self.decoder_llm: @@ -261,6 +271,11 @@ def preprocess(self, warmup, pre_prompt, post_prompt, image, input_ids = input_ids.expand(self.args.batch_size, *input_ids.shape[1:]) length = input_ids.shape[1] + elif self.model_type == 'phi-3-vision': + input = image + image = input['pixel_values'] + bs = image.shape[0] + image = image.flatten(0, 1) if not warmup: profiler.start("Vision") @@ -310,6 +325,47 @@ def preprocess(self, warmup, pre_prompt, post_prompt, image, args.batch_size, visual_features, first_batch_split_prompts, input_lengths) return input_ids, input_lengths, ptuning_args, visual_features + elif self.model_type == 'phi-3-vision': + input_ids = input["input_ids"].clone() + glb_GN = torch.squeeze(self.image_newlines["glb_GN"].clone(), dim=0) + sub_GN = self.image_newlines["sub_GN"].clone() + + H = visual_features.shape[1] + C = visual_features.shape[-1] + #bs*17*12*12*3072 + visual_features = visual_features.view(bs, -1, H, H, C) + global_img_feature = visual_features[:, 0] #bs*12*12*3072 + temp_glb_GN = sub_GN.repeat(bs, H, 1, 1) #bs*12*1*3072 + global_img_feature = torch.cat([global_img_feature, temp_glb_GN], + dim=2).reshape(bs, -1, C) + + crop_visual_features = visual_features[:, 1:] + patch_sizes = [ + image_size // image.shape[-1] + for image_size in input["image_sizes"] + ] + visual_features = [] + for global_img_feature, crop_visual_feature, patch_size in zip( + global_img_feature, crop_visual_features, patch_sizes): + crop_visual_feature = \ + 
crop_visual_feature[:patch_size[0]*patch_size[1]].view(patch_size[0], patch_size[1], H, H, C).permute(0, 2, 1, 3, 4).reshape(patch_size[0]*H, patch_size[1]*H, C) + temp_sub_GN = torch.squeeze(sub_GN.repeat( + 1, patch_size[0] * H, 1, 1), + dim=0) + crop_visual_feature = torch.cat( + [crop_visual_feature, temp_sub_GN], dim=1).reshape(-1, C) + visual_features.append( + torch.cat([crop_visual_feature, glb_GN, global_img_feature], + dim=0)) + + num_img_tokens = [elem.size(0) for elem in visual_features] + + visual_features = torch.cat(visual_features, dim=0) + input_ids = input_ids.expand(self.args.batch_size, + *input_ids.shape[1:]) + input_ids = self.ptuning_setup_phi3(visual_features, input_ids, + num_img_tokens) + length = input_ids.shape[1] else: pre_input_ids = self.tokenizer(pre_prompt, return_tensors="pt", @@ -331,7 +387,7 @@ def preprocess(self, warmup, pre_prompt, post_prompt, image, input_lengths = torch.IntTensor([length] * args.batch_size).to( torch.int32) - if self.model_type in ['fuyu', 'kosmos-2']: + if self.model_type in ['fuyu', 'kosmos-2', 'phi-3-vision']: return input_ids, input_lengths, [visual_features], visual_features input_ids, ptuning_args = self.setup_fake_prompts( @@ -611,6 +667,20 @@ def ptuning_setup_fuyu(self, input_ids, image_patches_indices): res_input_ids.append(cur_input_ids) return res_input_ids + def ptuning_setup_phi3(self, visual_features, input_ids, num_img_tokens): + fake_prompt_id = torch.arange( + self.model_config.vocab_size, + self.model_config.vocab_size + visual_features.shape[0]) + MAX_INPUT_ID = int(1e9) + positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), + as_tuple=False) + idx = 0 + for i, cnt in enumerate(num_img_tokens): + input_ids[positions[idx, 0], positions[idx, 1]:positions[idx, 1] + + cnt] = fake_prompt_id[idx:idx + cnt] + idx += cnt + return input_ids + def ptuning_setup(self, prompt_table, input_ids, input_lengths): hidden_size = self.model_config.hidden_size * self.runtime_mapping.tp_size if prompt_table is not None: @@ -735,6 +805,17 @@ def setup_inputs(self, input_text, raw_image): input_text = " [INST] which city is this? [/INST] " pre_prompt = input_text post_prompt = None + elif 'phi-3-vision' in self.model_type: + pre_prompt = "<|user|>\n<|image_1|>\n" + if input_text is None: + input_text = "Which city is this?" 
+ post_prompt = input_text + "<|end|>\n<|assistant|>\n" + prompt = pre_prompt + post_prompt + processor = AutoProcessor.from_pretrained(args.hf_model_dir, + trust_remote_code=True) + image = processor(text=prompt, + images=raw_image, + return_tensors="pt") elif self.model_type == "pix2struct": image_processor = AutoProcessor.from_pretrained(args.hf_model_dir) if input_text is None: @@ -836,7 +917,9 @@ def setup_inputs(self, input_text, raw_image): # Repeat inputs to match batch size pre_prompt = [pre_prompt] * self.args.batch_size post_prompt = [post_prompt] * self.args.batch_size - if self.model_type not in ['fuyu', 'pix2struct', 'kosmos-2', 'vila']: + if self.model_type not in [ + 'fuyu', 'pix2struct', 'kosmos-2', 'vila', 'phi-3-vision' + ]: if image.dim() == 5: image = image.expand(args.batch_size, -1, -1, -1, -1).contiguous() @@ -907,7 +990,7 @@ def print_result(self, input_text, output_text): elif self.model_type == "pix2struct": assert "characteristic | cat food, day | cat food, wet | cat treats" in output_text[ 0][0].lower() - elif self.model_type == 'neva': + elif self.model_type in ['neva', 'phi-3-vision']: assert 'singapore' in output_text[0][0].lower() elif self.model_type == 'video-neva': assert 'robot' in output_text[0][0].lower() diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index 815e4caee..88383335b 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 +transformers==4.40.2 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/opt/README.md b/examples/opt/README.md index bd4ca501d..8b3a39af5 100644 --- a/examples/opt/README.md +++ b/examples/opt/README.md @@ -95,7 +95,7 @@ trtllm-build --checkpoint_dir ./opt/125M/trt_ckpt/fp16/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/125M/trt_engines/fp16/1-gpu/ # OPT-350M @@ -103,7 +103,7 @@ trtllm-build --checkpoint_dir ./opt/350M/trt_ckpt/fp16/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/350M/trt_engines/fp16/1-gpu/ # OPT-2.7B @@ -111,7 +111,7 @@ trtllm-build --checkpoint_dir ./opt/2.7B/trt_ckpt/fp16/1-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/2.7B/trt_engines/fp16/1-gpu/ # OPT-66B @@ -119,7 +119,7 @@ trtllm-build --checkpoint_dir ./opt/66B/trt_ckpt/fp16/4-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/66B/trt_engines/fp16/4-gpu/ \ --workers 2 ``` @@ -220,7 +220,7 @@ trtllm-build --checkpoint_dir ./opt/125M/trt_ckpt/fp16/2-gpu/ \ --lookup_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/125M/trt_engines/fp16/2-gpu/ \ --workers 2 @@ -241,7 +241,7 @@ trtllm-build --checkpoint_dir ./opt/125M/trt_ckpt/fp16/2-gpu/ \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 924 \ - --max_output_len 100 \ + --max_seq_len 1024 \ --output_dir ./opt/125M/trt_engines/fp16/2-gpu/ \ --workers 2 ``` diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index 815e4caee..58af19bd7 100644 --- a/examples/opt/requirements.txt +++ 
b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/README.md b/examples/phi/README.md index ad9675a04..5dd4f7491 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -78,7 +78,7 @@ trtllm-build \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 1024 \ - --max_output_len 1024 \ + --max_seq_len 2048 \ --tp_size 1 \ --pp_size 1 ``` diff --git a/examples/phi/convert_checkpoint.py b/examples/phi/convert_checkpoint.py index 67cea6bf1..30fde7092 100644 --- a/examples/phi/convert_checkpoint.py +++ b/examples/phi/convert_checkpoint.py @@ -59,7 +59,6 @@ def parse_arguments(): type=str, default='tllm_checkpoint', help='The path to save the TensorRT-LLM checkpoint') - parser.add_argument( '--workers', type=int, @@ -85,6 +84,7 @@ def parse_arguments(): supported_model = { 'PhiForCausalLM': PhiForCausalLM, 'Phi3ForCausalLM': Phi3ForCausalLM, + 'Phi3VForCausalLM': Phi3ForCausalLM, 'Phi3SmallForCausalLM': Phi3SmallForCausalLM } modelForCausalLM = None diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index 3de8f0b87..0bf746b21 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/install_requirements.sh b/examples/quantization/install_requirements.sh index ebc40c1b8..c38006f0e 100755 --- a/examples/quantization/install_requirements.sh +++ b/examples/quantization/install_requirements.sh @@ -1,4 +1,4 @@ #!/bin/bash pip3 install nemo_toolkit[all]@git+https://github.com/NVIDIA/NeMo.git@5b5a4445f3de30786d7c2ef4108fc89242a643f3 -pip3 install megatron_core@git+https://github.com/NVIDIA/Megatron-LM.git@a5415fcfacef2a37416259bd38b7c4b673583675 +pip3 install megatron_core@git+ https://github.com/NVIDIA/Megatron-LM.git@a5415fcfacef2a37416259bd38b7c4b673583675 diff --git a/examples/quantization/quantize.py b/examples/quantization/quantize.py index ec46b68a4..f1ce69d41 100644 --- a/examples/quantization/quantize.py +++ b/examples/quantization/quantize.py @@ -21,6 +21,12 @@ default='gptnext', choices=['gptnext', 'llama'], help="Decoder type; effective for NeMo checkpoint only.") + parser.add_argument( + '--device', + help= + "The device to run calibration; effective for HuggingFace model only.", + default='cuda', + choices=['cuda', 'cpu']) parser.add_argument( '--calib_dataset', type=str, @@ -89,6 +95,7 @@ if args.model_dir is not None: quantize_and_export( model_dir=args.model_dir, + device=args.device, calib_dataset=args.calib_dataset, dtype=args.dtype, qformat=args.qformat, diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index 608073644..b904ad81d 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git a/examples/qwen/README.md b/examples/qwen/README.md index 6e5c29cd6..4ba54bada 100644 --- a/examples/qwen/README.md +++ b/examples/qwen/README.md @@ -43,6 +43,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f | 
Qwen1.5-32B(-Chat) | Y | Y | Y | Y | Y | Y | Y | Y | Ampere+ | | Qwen1.5-72B(-Chat) | Y | Y | Y | Y | Y | Y | Y | Y | Ampere+ | | Qwen1.5-110B(-Chat)| Y | Y | Y | Y | Y | Y | Y | Y | Ampere+ | +| Qwen1.5-MoE-A2.7B(-Chat)| Y | - | Y | - | - | - | Y | Y | Ampere+ | *Please note that these models supports AWQ only with single GPU. diff --git a/examples/qwen/convert_checkpoint.py b/examples/qwen/convert_checkpoint.py index e9478d2dc..25a430dae 100644 --- a/examples/qwen/convert_checkpoint.py +++ b/examples/qwen/convert_checkpoint.py @@ -1,7 +1,7 @@ import argparse -import json import os import time +import traceback from concurrent.futures import ThreadPoolExecutor, as_completed import tensorrt_llm @@ -15,13 +15,7 @@ def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--model_dir', type=str, default=None) - parser.add_argument( - '--qwen_type', - default='qwen', - choices=['qwen', 'qwen2'], - help="Used only if model_dir is not provided." - "In this case users should explicitly passing the version.") + parser.add_argument('--model_dir', type=str, default=None, required=True) parser.add_argument('--tp_size', type=int, default=1, @@ -34,15 +28,6 @@ def parse_arguments(): type=str, default='float16', choices=['float32', 'bfloat16', 'float16']) - parser.add_argument('--vocab_size', type=int, default=32000) - parser.add_argument('--n_positions', type=int, default=2048) - parser.add_argument('--n_layer', type=int, default=32) - parser.add_argument('--n_head', type=int, default=32) - parser.add_argument('--n_kv_head', type=int, default=None) - parser.add_argument('--n_embd', type=int, default=4096) - parser.add_argument('--inter_size', type=int, default=22016) - parser.add_argument('--rms_norm_eps', type=float, default=1e-06) - parser.add_argument( '--use_weight_only', default=False, @@ -115,11 +100,6 @@ def parse_arguments(): 'per_group chooses at run time, and for each group, a custom scaling factor. 
' 'The flag is built for GPTQ/AWQ quantization.') - parser.add_argument('--hidden_act', type=str, default='silu') - - parser.add_argument('--rotary_base', type=float, default=10000.0) - parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) - parser.add_argument('--group_size', type=int, default=128, @@ -165,13 +145,19 @@ def parse_arguments(): default=1, help='The number of workers for converting checkpoint in parallel') parser.add_argument( - '--save_config_only', - action="store_true", - default=False, + '--moe_tp_size', + type=int, + default=-1, help= - 'Only save the model config w/o read and converting weights, be careful, this is for debug only' + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' ) - args = parser.parse_args() return args @@ -180,10 +166,6 @@ def args_to_quantization(args: argparse.Namespace) -> QuantConfig: '''return config dict with quantization info based on the command line args ''' quant_config = QuantConfig() - quant_config.exclude_modules = [ - 'lm_head', 'router', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ] if args.use_weight_only: if args.weight_only_precision == 'int8': quant_config.quant_algo = QuantAlgo.W8A16 @@ -228,35 +210,6 @@ def args_to_build_options(args): } -def from_cli_args(args): - n_kv_head = args.n_kv_head if args.n_kv_head is not None else args.n_head # default to MHA - config = { - 'architecture': "QWenForCausalLM", - 'dtype': args.dtype, - 'logits_dtype': 'float32', - 'num_hidden_layers': args.n_layer, - 'num_attention_heads': args.n_head, - 'hidden_size': args.n_embd, - 'intermediate_size': args.inter_size, - 'num_key_value_heads': n_kv_head, - 'vocab_size': args.vocab_size, - 'position_embedding_type': 'rope_gpt_neox', - 'max_position_embeddings': args.n_positions, - 'hidden_act': args.hidden_act, - 'rotary_base': args.rotary_base, - 'norm_epsilon': args.rms_norm_eps, - 'qwen_type': args.qwen_type, - 'mapping': { - 'world_size': args.tp_size * args.pp_size, - 'tp_size': args.tp_size, - 'pp_size': args.pp_size - }, - 'quantization': args_to_quantization(args).to_dict() - } - config.update(args_to_build_options(args)) - return config - - def preload_model(args, model_dir, load_model_on_cpu): from transformers import AutoModelForCausalLM if args.use_weight_only and args.weight_only_precision == 'int4_gptq': @@ -292,7 +245,10 @@ def convert_and_save_hf(args): world_size=world_size, rank=-1, #intentinoally make -1 to avoid mistake tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size, + ) #TODO: change to QWenForCausalLM.quantize later quantize(args.dtype, args.model_dir, @@ -311,7 +267,9 @@ def convert_and_save_rank(args, rank): mapping = Mapping(world_size=world_size, rank=rank, tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) #TODO: change to QWenForCausalLM.from_hugging_face later qwen = from_hugging_face( QWenForCausalLM, @@ -325,9 +283,9 @@ def convert_and_save_rank(args, rank): override_fields=override_fields) qwen.save_checkpoint(args.output_dir, save_config=(rank == 0)) del qwen - release_gc() execute(args.workers, [convert_and_save_rank] * world_size, args) + release_gc() def execute(workers, func, args): @@ -353,19 +311,24 @@ 
def main(): print(tensorrt_llm.__version__) args = parse_arguments() - args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" + tik = time.time() if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - if args.model_dir is None: - config = from_cli_args(args) - with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: - json.dump(config, f, indent=4) - else: - assert args.model_dir is not None - convert_and_save_hf(args) + assert args.model_dir is not None + convert_and_save_hf(args) tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index acd668d56..f1d4accf8 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 @@ -9,7 +9,7 @@ tiktoken einops # optional dependencies -gradio==4.19.2 +gradio==4.36.0 mdtex2html sse_starlette aiohttp_sse_client diff --git a/examples/qwenvl/README.md b/examples/qwenvl/README.md index d0e3e7c66..4578ca92e 100644 --- a/examples/qwenvl/README.md +++ b/examples/qwenvl/README.md @@ -35,7 +35,7 @@ ```bash trtllm-build --checkpoint_dir=./tllm_checkpoint_1gpu \ --gemm_plugin=float16 --gpt_attention_plugin=float16 \ - --lookup_plugin=float16 --max_input_len=2048 --max_output_len=1024 \ + --lookup_plugin=float16 --max_input_len=2048 --max_seq_len=3072 \ --max_batch_size=8 --max_prompt_embedding_table_size=2048 \ --remove_input_padding=enable \ --output_dir=./trt_engines/Qwen-VL-7B-Chat diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index f0b55b412..09ee007be 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/README.md b/examples/recurrentgemma/README.md index 1b0b161b9..6dc2db98a 100644 --- a/examples/recurrentgemma/README.md +++ b/examples/recurrentgemma/README.md @@ -25,11 +25,6 @@ Please install required packages first and setup `git-lfs`: ```bash pip install -r requirements.txt -git clone https://github.com/google-deepmind/recurrentgemma.git -pip install ./recurrentgemma/[full] -pip install "transformers>=4.40.0" - -# Setup git-lfs git lfs install ``` @@ -93,45 +88,37 @@ After getting checkpoint, we can use `trtllm-build` command to build TensorRT-LL # recurrentgemma-2b ENGINE_2B_PATH=./recurrentgemma_model/recurrentgemma-2b/trt_engines/fp16/1-gpu/ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_2B_PATH} \ - --gpt_attention_plugin float16 \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --output_dir ${ENGINE_2B_PATH} # recurrentgemma-2b-it ENGINE_2B_IT_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_engines/bf16/1-gpu/ trtllm-build 
--checkpoint_dir ${UNIFIED_CKPT_2B_IT_PATH} \ - --gpt_attention_plugin bfloat16 \ - --gemm_plugin bfloat16 \ - --mamba_conv1d_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --output_dir ${ENGINE_2B_IT_PATH} # recurrentgemma-2b-flax ENGINE_2B_FLAX_PATH=./recurrentgemma_model/recurrentgemma-2b-flax/trt_engines/fp16/1-gpu/ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_2B_FLAX_PATH} \ - --gpt_attention_plugin float16 \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --output_dir ${ENGINE_2B_FLAX_PATH} # recurrentgemma-2b-it-flax ENGINE_2B_IT_FLAX_PATH=./recurrentgemma_model/recurrentgemma-2b-it-flax/trt_engines/bf16/1-gpu/ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_2B_IT_FLAX_PATH} \ - --gpt_attention_plugin bfloat16 \ - --gemm_plugin bfloat16 \ - --mamba_conv1d_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 3000 \ - --max_output_len 100 \ + --max_seq_len 3100 \ --output_dir ${ENGINE_2B_IT_FLAX_PATH} ``` diff --git a/examples/recurrentgemma/convert_checkpoint.py b/examples/recurrentgemma/convert_checkpoint.py index a80eb1ac5..7b12bd756 100644 --- a/examples/recurrentgemma/convert_checkpoint.py +++ b/examples/recurrentgemma/convert_checkpoint.py @@ -58,7 +58,9 @@ def embedding_weights(self, ckpt_params): def get_config(self, checkpoint_path, ckpt_params): config = recurrentgemma_jax.GriffinConfig.from_flax_params_or_variables( - ckpt_params)._asdict() + ckpt_params, + preset=recurrentgemma_jax.Preset.RECURRENT_GEMMA_2B_V1, + )._asdict() if config["lru_width"] is None: config["lru_width"] = config["width"] layer_types = [] diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 168ca1dca..90d3ff974 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,10 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 +git+https://github.com/google-deepmind/recurrentgemma.git +flax>=0.8.2 +jax~=0.4.23 +orbax-checkpoint==0.5.7 +transformers>=4.40.0 datasets~=2.14.5 evaluate rouge_score diff --git a/examples/run.py b/examples/run.py index f47b1ad12..bf802cb9b 100644 --- a/examples/run.py +++ b/examples/run.py @@ -178,7 +178,6 @@ def print_output(tokenizer, f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') output_ids = output_ids.reshape((-1, output_ids.size(2))) - if output_csv is not None: output_file = Path(output_csv) output_file.parent.mkdir(exist_ok=True, parents=True) diff --git a/examples/sample_weight_stripping/README.md b/examples/sample_weight_stripping/README.md index 80ed27da5..94178c7e8 100644 --- a/examples/sample_weight_stripping/README.md +++ b/examples/sample_weight_stripping/README.md @@ -86,7 +86,7 @@ trtllm-build --checkpoint_dir ./trt_ckpt/gptj_fp16_tp1/ \ --gemm_plugin float16 \ --max_batch_size=32 \ --max_input_len=1919 \ - --max_output_len=128 \ + --max_seq_len=2047 \ --strip_plan ``` diff --git a/examples/skywork/README.md b/examples/skywork/README.md index 3655ca0e6..72768173d 100644 --- a/examples/skywork/README.md +++ b/examples/skywork/README.md @@ -62,7 +62,7 @@ trtllm-build --checkpoint_dir ./skywork-13b-base/trt_ckpt/fp16 \ --context_fmha enable \ --max_batch_size 32 \ --max_input_len 512 \ - --max_output_len 512 \ + --max_seq_len 1024 \ --output_dir 
./skywork-13b-base/trt_engine/fp16 # bf16 @@ -72,7 +72,7 @@ trtllm-build --checkpoint_dir ./skywork-13b-base/trt_ckpt/bf16 \ --context_fmha enable \ --max_batch_size 32 \ --max_input_len 512 \ - --max_output_len 512 \ + --max_seq_len 1024 \ --output_dir ./skywork-13b-base/trt_engine/bf16 ``` diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index 4d364c6da..afc5cb1e1 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index fbe76cd65..f8e55c97d 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/utils.py b/examples/utils.py index fb84f3852..7628cb978 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -35,6 +35,7 @@ 'PhiForCausalLM': 'microsoft/phi-2', 'OPTForCausalLM': 'facebook/opt-350m', 'QWenForCausalLM': 'Qwen/Qwen-7B', + 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', } INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). @@ -291,7 +292,7 @@ def add_common_args(parser): ) parser.add_argument( '--kv_cache_free_gpu_memory_fraction', - default=None, + default=0.9, type=float, help='Specify the free gpu memory fraction.', ) diff --git a/examples/whisper/README.md b/examples/whisper/README.md index d8771d4da..380543c36 100755 --- a/examples/whisper/README.md +++ b/examples/whisper/README.md @@ -80,7 +80,7 @@ trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \ --use_custom_all_reduce disable \ --max_beam_width ${MAX_BEAM_WIDTH} \ --max_batch_size ${MAX_BATCH_SIZE} \ - --max_output_len 100 \ + --max_seq_len 114 \ --max_input_len 14 \ --max_encoder_input_len 1500 \ --gemm_plugin ${INFERENCE_PRECISION} \ diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index 08e49a471..a5f91efea 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061100 +tensorrt_llm==0.11.0.dev2024061800 tiktoken datasets kaldialign diff --git a/requirements-dev.txt b/requirements-dev.txt index c7ba3a061..3ec8cf995 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -r requirements.txt -datasets +datasets==2.19.2 einops graphviz mypy diff --git a/requirements.txt b/requirements.txt index 905fa1b44..259a73da8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ tensorrt==10.0.1 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 uses 2.3.0a0. 
torch>=2.3.0a,<=2.3.0 nvidia-modelopt~=0.11,<0.12 -transformers==4.40.2 +transformers>=4.38.2 wheel optimum evaluate diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 6ace8ff7f..90f6c26fc 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -84,9 +84,7 @@ def main(*, build_run('git submodule update --init --recursive') on_windows = platform.system() == "Windows" requirements_filename = "requirements-dev-windows.txt" if on_windows else "requirements-dev.txt" - build_run( - f"\"{sys.executable}\" -m pip install -r {requirements_filename} --extra-index-url https://pypi.ngc.nvidia.com" - ) + build_run(f"\"{sys.executable}\" -m pip install -r {requirements_filename}") # Ensure TRT is installed on windows to prevent surprises. reqs = check_output([sys.executable, "-m", "pip", "freeze"]) installed_packages = [r.decode().split("==")[0] for r in reqs.split()] diff --git a/tensorrt_llm/_common.py b/tensorrt_llm/_common.py index 9c630f96f..e43bc50e5 100644 --- a/tensorrt_llm/_common.py +++ b/tensorrt_llm/_common.py @@ -201,28 +201,29 @@ def decorated(*args, **kwargs): def check_max_num_tokens(max_num_tokens, opt_num_tokens, max_batch_size, - max_input_len, max_beam_width, remove_input_padding, - enable_context_fmha, tokens_per_block, - multiple_profiles): + max_input_len, max_seq_len, max_beam_width, + remove_input_padding, enable_context_fmha, + tokens_per_block, multiple_profiles): if not remove_input_padding: if max_num_tokens is not None or opt_num_tokens is not None: - max_num_tokens = max_batch_size * max_input_len + max_num_tokens = max_batch_size * max_seq_len logger.warning("remove_input_padding is not enabled, the specified " "max_num_tokens/opt_num_tokens will be ignored.") return max_num_tokens, opt_num_tokens else: if max_num_tokens is None: - max_num_tokens = max_input_len * max_batch_size + max_num_tokens = max_seq_len * max_batch_size logger.warning( "remove_input_padding is enabled, while max_num_tokens " - "is not set, setting to max_batch_size*max_input_len. \n" - "It may not be optimal to set max_num_tokens=max_batch_size*max_input_len " + "is not set, setting to max_batch_size*max_seq_len. \n" + "It may not be optimal to set max_num_tokens=max_batch_size*max_seq_len " "when remove_input_padding is enabled, because the number " "of packed input tokens are very likely to be smaller, " "we strongly recommend to set max_num_tokens according " "to your workloads.") if opt_num_tokens is None and not multiple_profiles: - opt_num_tokens = max_batch_size * max_beam_width + opt_num_tokens = min(max_batch_size * max_beam_width, + max_num_tokens) logger.warning( "remove_input_padding is enabled, while opt_num_tokens " "is not set, setting to max_batch_size*max_beam_width. \n") @@ -233,12 +234,12 @@ def check_max_num_tokens(max_num_tokens, opt_num_tokens, max_batch_size, "large `max_num_tokens` could possibly exceed the TensorRT " "tensor volume, causing runtime errors. " f"Got `max_num_tokens` = {max_num_tokens}") - if max_num_tokens > max_input_len * max_batch_size: - max_num_tokens = max_input_len * max_batch_size + if max_num_tokens > max_seq_len * max_batch_size: + max_num_tokens = max_seq_len * max_batch_size logger.warning( f"max_num_tokens ({max_num_tokens}) shouldn't be greater than " - f"max_input_len * max_batch_size ({max_input_len * max_batch_size}), " - f"specifying to max_input_len * max_batch_size ({max_input_len * max_batch_size})." 
+ f"max_seq_len * max_batch_size ({max_seq_len * max_batch_size}), " + f"specifying to max_seq_len * max_batch_size ({max_seq_len * max_batch_size})." ) if max_num_tokens < max_input_len and not enable_context_fmha: logger.warning( diff --git a/tensorrt_llm/auto_parallel/config.py b/tensorrt_llm/auto_parallel/config.py index a3f16a258..1ef203fef 100644 --- a/tensorrt_llm/auto_parallel/config.py +++ b/tensorrt_llm/auto_parallel/config.py @@ -45,7 +45,6 @@ class AutoParallelConfig(DictConversion): same_buffer_io: Dict[str, str] = field(default_factory=dict) same_spec_io: Dict[str, str] = field(default_factory=dict) sharded_io_allowlist: List[str] = field(default_factory=list) - fast_reduce: bool = True fill_weights: bool = False # debug configuration diff --git a/tensorrt_llm/auto_parallel/parallelization.py b/tensorrt_llm/auto_parallel/parallelization.py index e5aa49196..6883616db 100644 --- a/tensorrt_llm/auto_parallel/parallelization.py +++ b/tensorrt_llm/auto_parallel/parallelization.py @@ -12,17 +12,17 @@ import torch from filelock import FileLock -from tensorrt_llm._utils import trt_dtype_to_np, trt_dtype_to_torch, trt_gte_10 +from tensorrt_llm._utils import (str_dtype_to_trt, trt_dtype_to_np, + trt_dtype_to_torch, trt_gte_10) from tensorrt_llm.functional import (AllReduceConfig, AllReduceFusionParams, - AllReduceStrategy) + AllReduceStrategy, create_allreduce_plugin) from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.network import (PluginInfo, delete_plugin_info, get_np_weight, get_plugin_info, set_plugin_info) -from tensorrt_llm.plugin import (TRT_LLM_PLUGIN_NAMESPACE, - current_all_reduce_helper, - init_all_reduce_helper) -from tensorrt_llm.plugin.plugin import CustomAllReduceHelper +from tensorrt_llm.plugin import TRT_LLM_PLUGIN_NAMESPACE, init_all_reduce_helper +from tensorrt_llm.plugin.plugin import (CustomAllReduceHelper, + current_all_reduce_helper) from tensorrt_llm.version import __version__ from .config import AutoParallelConfig @@ -36,8 +36,8 @@ GPTAttentionPlugin, IdxEntry, IdxEntryParser) from .tensor_parallel.sharding_spec import ShardingSpec, get_sharding_sequence from .tensor_parallel.sharding_strategy import ShardingStrategy -from .utils import (get_builder_flags, get_updated_plugin, to_base_class_layer, - to_subclass_layer, to_trt_weights) +from .utils import (get_updated_plugin, to_base_class_layer, to_subclass_layer, + to_trt_weights) default_int_dtype = trt.int64 if trt_gte_10() else trt.int32 @@ -508,6 +508,10 @@ def _add_comm(self, commspec, output_name=None, is_singleton=False): + input_tensors = [ + self.get_tensor(context, input_name, device_id.item()) + for device_id in np.nditer(device_ids) + ] comm_pattern = commspec.comm_pattern if comm_pattern == "split": self.add_split(context, input_name, output_name, device_ids, @@ -528,20 +532,21 @@ def _add_comm(self, commspec.logical_process_axis) else: raise NotImplementedError + output_tensors = [ + self.get_tensor(context, input_name, device_id.item()) + for device_id in np.nditer(device_ids) + ] + for input_tensor, output_tensor in zip(input_tensors, output_tensors): + if input_tensor.dtype != output_tensor.dtype: + raise ValueError( + f"Input tensor and output tensor should have the same dtype for communication layers, " + f"input dtype is {input_tensor.dtype} for {input_tensor.name}, " + f"but output dtype is {output_tensor.dtype} for {output_tensor.name}" + ) def add_all_reduce(self, context: GraphContext, input_name, output_name, device_ids): - builder_flags = 
get_builder_flags() - if builder_flags & (1 << int(trt.BuilderFlag.FP16)) != 0: - dtype = trt.DataType.HALF - elif builder_flags & (1 << int(trt.BuilderFlag.BF16)) != 0: - dtype = trt.DataType.BF16 - else: - dtype = trt.DataType.FLOAT - fast_reduce = self.auto_parallel_config.fast_reduce - if fast_reduce: - logger.debug(f"all_reduce with {dtype} after {input_name}") - + dtype = str_dtype_to_trt(self.full_graph._plugin_config.dtype) to_reduce_tensors = [] for device_id in np.nditer(device_ids): device_id = device_id.item() @@ -550,7 +555,7 @@ def add_all_reduce(self, context: GraphContext, input_name, output_name, input_tensor = self.get_tensor(context, input_name, device_id).as_trt() input_dtype = input_tensor.dtype - if fast_reduce: + if input_dtype != dtype: to_reduce_tensor = self.cast( network, input_tensor, @@ -562,7 +567,7 @@ def add_all_reduce(self, context: GraphContext, input_name, output_name, to_reduce_tensors.append(to_reduce_tensor) self.add_all_reduce_layer(context, input_name, output_name, device_ids, to_reduce_tensors) - if fast_reduce and input_dtype != dtype: + if input_dtype != dtype: for device_id in np.nditer(device_ids): device_id = device_id.item() layer_info = (input_name, output_name, device_id) @@ -1313,6 +1318,11 @@ def add_layer(self, wrapped_layer: Layer, device_ids, input_name = layer.get_input(i).name local_context.update_name_mapping(input_name, device_id, updated_input.name) + if layer.get_input(i).dtype != updated_input.dtype: + raise ValueError( + f"Input dtype mismatch for {layer.name}, " + f"expect {layer.get_input(i).dtype} for {input_name}, " + f"get {updated_input.dtype} for {updated_input.name}") prefix = self.get_prefix(device_id) new_wrapped_layer = self.get_graph(device_id).add_layer( @@ -1431,17 +1441,7 @@ def get_values(self, device_id) -> Dict[str, List[int]]: def add_reduce_scatter(self, context: GraphContext, input_name, output_name, device_ids, shard_dims, device_dims): - builder_flags = get_builder_flags() - if builder_flags & (1 << int(trt.BuilderFlag.FP16)) != 0: - dtype = trt.DataType.HALF - elif builder_flags & (1 << int(trt.BuilderFlag.BF16)) != 0: - dtype = trt.DataType.BF16 - else: - dtype = trt.DataType.FLOAT - fast_reduce = self.auto_parallel_config.fast_reduce - if fast_reduce: - logger.debug(f"reduce_scatter with {dtype} after {input_name}") - + dtype = str_dtype_to_trt(self.full_graph._plugin_config.dtype) it = np.nditer(device_ids, flags=['multi_index']) for device_id in it: device_id = device_id.item() @@ -1464,7 +1464,7 @@ def add_reduce_scatter(self, context: GraphContext, input_name, output_name, input_tensor = transpose_layer.get_output(0) flatten_tensor = self.flatten(network, input_tensor, layer_info) input_dtype = flatten_tensor.dtype - if fast_reduce: + if input_dtype != dtype: to_reduce_tensor = self.cast( network, flatten_tensor, @@ -1509,7 +1509,7 @@ def add_reduce_scatter(self, context: GraphContext, input_name, output_name, self.shapes_by_device[device_id][ reduce_scatter_tensor.name] = output_shape wrapped_tensor.shape = output_shape - if fast_reduce: + if input_dtype != dtype: reduce_scatter_tensor = self.cast( network, reduce_scatter_tensor, @@ -1565,8 +1565,9 @@ def add_reduce_scatter(self, context: GraphContext, input_name, output_name, def add_all_reduce_layer(self, context: GraphContext, input_name, output_name, device_ids, to_reduce_tensors): + counter = 0 if self.use_custom_all_reduce: - all_reduce_instance_id = current_all_reduce_helper().gen_id() + counter = current_all_reduce_helper().gen_id() for 
device_id, to_reduce_tensor in zip(np.nditer(device_ids), to_reduce_tensors): device_id = device_id.item() @@ -1575,61 +1576,23 @@ def add_all_reduce_layer(self, context: GraphContext, input_name, graph = self.get_graph(device_id) if self.use_custom_all_reduce: strategy = AllReduceStrategy.AUTO + workspace = graph.get_input("all_reduce_workspace").as_trt() else: strategy = AllReduceStrategy.NCCL - reduce_fusion_params = AllReduceFusionParams() - allreduce_plg_creator = trt.get_plugin_registry( - ).get_plugin_creator('AllReduce', '1', TRT_LLM_PLUGIN_NAMESPACE) - assert allreduce_plg_creator is not None - - group = trt.PluginField( - "group", - np.ascontiguousarray(device_ids.reshape(-1).astype(np.int32)), - trt.PluginFieldType.INT32) - pf_type = trt.PluginField( - "type_id", np.array([int(to_reduce_tensor.dtype)], np.int32), - trt.PluginFieldType.INT32) - pf_strategy = trt.PluginField("strategy", - np.array([int(strategy)], np.int8), - trt.PluginFieldType.INT8) - config = AllReduceConfig(0) - pf_config = trt.PluginField("config", - np.array([int(config)], np.int8), - trt.PluginFieldType.INT8) - pfc = [group, pf_type, pf_strategy, pf_config] - p_fusion_op = trt.PluginField( - "fusion_op", - np.array([int(reduce_fusion_params.fusion_op)], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_fusion_op) - pf_counter = trt.PluginField( - "counter", - np.array([all_reduce_instance_id], np.int32), - trt.PluginFieldType.INT32, + workspace = None + + all_reduce_layer, allreduce_plg_creator, pfc = create_allreduce_plugin( + network=network, + tensor=to_reduce_tensor, + workspace=workspace, + group=np.ascontiguousarray( + device_ids.reshape(-1).astype(np.int32)), + strategy=strategy, + dtype=to_reduce_tensor.dtype, + config=AllReduceConfig(0), + counter=counter, + reduce_fusion_params=AllReduceFusionParams(), ) - pfc.append(pf_counter) - p_eps = trt.PluginField( - "eps", np.array([float(reduce_fusion_params.eps)], np.float32), - trt.PluginFieldType.FLOAT32) - pfc.append(p_eps) - p_affine = trt.PluginField( - "affine", - np.array([int(reduce_fusion_params.has_affine())], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_affine) - p_bias = trt.PluginField( - "bias", np.array([int(reduce_fusion_params.has_bias())], - np.int8), trt.PluginFieldType.INT8) - pfc.append(p_bias) - - pfc = trt.PluginFieldCollection(pfc) - ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) - - inputs = [to_reduce_tensor] - if self.use_custom_all_reduce: - workspace = graph.get_input("all_reduce_workspace").as_trt() - inputs.append(workspace) - all_reduce_layer = network.add_plugin_v2(inputs, ar_plug) plugin_info = PluginInfo(allreduce_plg_creator, "allreduce", pfc) set_plugin_info(network, all_reduce_layer.name, plugin_info) with self.disable_infer_shape(): @@ -1761,6 +1724,12 @@ def add_output(self, tensor: Tensor, device_ids, graph.add_output_shape(trt_output) else: graph.add_output(trt_output) + trt_output.dtype = tensor.dtype + if tensor.dtype != output_tensor.dtype: + raise ValueError( + f"Output dtype mismatch, " + f"expect {tensor.dtype} for {tensor.name}, " + f"get {output_tensor.dtype} for {output_tensor.name}") shard_dims = strategy.sharding_specs["input0"].dim_partition_dict for dim, device_dim in shard_dims.items(): @@ -2187,6 +2156,7 @@ def add_output(self, tensor: Tensor, device_ids, output = self.prefixed_graph.add_output_shape(trt_output) else: output = self.prefixed_graph.add_output(trt_output) + trt_output.dtype = tensor.dtype output.attrs["strategy"] = strategy.name def assign_shapes(self, 
shape_info: ShapeInfo): @@ -2229,9 +2199,10 @@ def parallelize( graph_strategy, config.graph_config.graph_mapping, ) + graph._plugin_config = simplifier.llm_network.plugin_config graph_group = GraphGroup.from_graph(graph, config, auto_parallel_config) - use_custom_all_reduce = simplifier.llm_network.plugin_config.use_custom_all_reduce + use_custom_all_reduce = graph._plugin_config.use_custom_all_reduce if use_custom_all_reduce and not debug_mode: graph_group.use_custom_all_reduce = True init_all_reduce_helper() diff --git a/tensorrt_llm/auto_parallel/pipeline_graph.py b/tensorrt_llm/auto_parallel/pipeline_graph.py index e4ee763d3..b5af57631 100644 --- a/tensorrt_llm/auto_parallel/pipeline_graph.py +++ b/tensorrt_llm/auto_parallel/pipeline_graph.py @@ -8,6 +8,7 @@ from tensorrt_llm._utils import trt_dtype_to_str, trt_dtype_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.network import Network, get_plugin_info, set_plugin_info +from tensorrt_llm.plugin.plugin import PluginConfig from tensorrt_llm.runtime.session import Session from .utils import (current_flags, get_builder_flags, get_sorted_layer_ids, @@ -263,6 +264,7 @@ def __init__(self): self._io_buffer_mapping = {} self._unfilled_weights = {} self._auto_parallel_config = None + self._plugin_config: PluginConfig = None @staticmethod def create_graph(): diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index 1733707ba..7bc90a4f6 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -374,6 +374,8 @@ def build_engine(self, network: Network, mapping = builder_config.auto_parallel_config["mapping"] builder_config.tensor_parallel = mapping.tp_size builder_config.pipeline_parallel = mapping.pp_size + builder_config.moe_tensor_parallel = mapping.moe_tp_size + builder_config.moe_expert_parallel = mapping.moe_ep_size if builder_config.trt_builder_config.num_optimization_profiles == 0: self._add_optimization_profile(network, builder_config) engine = None @@ -437,7 +439,7 @@ def save_config(builder_config: BuilderConfig, config_path: str): @dataclass class BuildConfig: max_input_len: int = 256 - max_output_len: int = 256 + max_seq_len: int = 512 opt_batch_size: int = 8 max_batch_size: int = 8 max_beam_width: int = 1 @@ -475,6 +477,7 @@ def __post_init__(self): opt_num_tokens=self.opt_num_tokens, max_batch_size=self.max_batch_size, max_input_len=self.max_input_len, + max_seq_len=self.max_seq_len, max_beam_width=self.max_beam_width, remove_input_padding=self.plugin_config.remove_input_padding, enable_context_fmha=self.plugin_config.context_fmha, @@ -486,7 +489,7 @@ def __post_init__(self): @classmethod def from_dict(cls, config, plugin_config=None): max_input_len = config.pop('max_input_len') - max_output_len = config.pop('max_output_len') + max_seq_len = config.pop('max_seq_len') max_batch_size = config.pop('max_batch_size') max_beam_width = config.pop('max_beam_width') max_num_tokens = config.pop('max_num_tokens') @@ -526,7 +529,7 @@ def from_dict(cls, config, plugin_config=None): return cls( max_input_len=max_input_len, - max_output_len=max_output_len, + max_seq_len=max_seq_len, max_batch_size=max_batch_size, max_beam_width=max_beam_width, max_num_tokens=max_num_tokens, @@ -678,6 +681,45 @@ def get_engine_version(engine_dir: str) -> Union[None, str]: return config['version'] +def optimize_model_with_config(model: PretrainedModel, + build_config: BuildConfig): + use_auto_parallel = build_config.auto_parallel_config.enabled + gemm_swiglu_plugin = build_config.plugin_config.gemm_swiglu_plugin + if 
gemm_swiglu_plugin: + if not build_config.use_fused_mlp: + raise RuntimeError( + "GemmSwiGLU plugin requires --use_fused_mlp flag") + if gemm_swiglu_plugin not in ["fp8"]: + raise RuntimeError( + f"GemmSwiGLU plugin currently has limited support: fp8 only, " + f"got: {gemm_swiglu_plugin}") + + if build_config.plugin_config.lora_plugin is not None: + model.use_lora(build_config.lora_config) + + is_enc_dec = model.config.architecture in ["EncoderModel", "DecoderModel"] + model = optimize_model( + model, + use_ootb_moe=build_config.plugin_config.moe_plugin is None, + use_fused_mlp=(build_config.use_fused_mlp and not is_enc_dec + and not use_auto_parallel), + gemm_swiglu_plugin_dtype=gemm_swiglu_plugin, + use_fused_rg_lru=model.config.architecture + in ["RecurrentGemmaForCausalLM"], + use_unfused_qkv_gemm=use_auto_parallel, + use_prompt_tuning=(build_config.max_prompt_embedding_table_size > 0), + use_lora=build_config.plugin_config.lora_plugin is not None, + max_lora_rank=build_config.lora_config.max_lora_rank, + use_fp8_context_fmha=( + model.config.quantization.quant_algo == QuantAlgo.FP8 + and build_config.plugin_config.use_fp8_context_fmha), + ) + + if is_enc_dec: + model.precompute_relative_attention_bias(build_config) + return model + + def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: '''Build engine from given model and optimization options specified in the build_config WARNING: this function may change the given \p model object state in some optimization passes @@ -703,13 +745,13 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: if build_config.speculative_decoding_mode != SpeculativeDecodingMode.NONE: logger.info( - f'Increasing max_output_len ({build_config.max_output_len}) ' + f'Increasing max_seq_len ({build_config.max_seq_len}) ' f'by max_draft_len ({build_config.max_draft_len}) ' 'to account for speculative decoding implementation specifics. ' 'Maximum number of generated tokens remains the same. 
' - f'New max_output_len is set to {build_config.max_output_len + build_config.max_draft_len}' + f'New max_seq_len is set to {build_config.max_seq_len + build_config.max_draft_len}' ) - build_config.max_output_len += build_config.max_draft_len + build_config.max_seq_len += build_config.max_draft_len if build_config.speculative_decoding_mode != SpeculativeDecodingMode.NONE: num_tokens = build_config.max_batch_size * (build_config.max_draft_len + @@ -738,40 +780,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: raise RuntimeError( "Paged Context FMHA doesn't work with int8 kv cache currently.") - use_auto_parallel = build_config.auto_parallel_config.enabled - gemm_swiglu_plugin = build_config.plugin_config.gemm_swiglu_plugin - if gemm_swiglu_plugin: - if not build_config.use_fused_mlp: - raise RuntimeError( - "GemmSwiGLU plugin requires --use_fused_mlp flag") - if gemm_swiglu_plugin not in ["fp8"]: - raise RuntimeError( - f"GemmSwiGLU plugin currently has limited support: fp8 only, " - f"got: {gemm_swiglu_plugin}") - - if build_config.plugin_config.lora_plugin is not None: - model.use_lora(build_config.lora_config) - - is_enc_dec = model.config.architecture in ["EncoderModel", "DecoderModel"] - model = optimize_model( - model, - use_ootb_moe=build_config.plugin_config.moe_plugin is None, - use_fused_mlp=(build_config.use_fused_mlp and not is_enc_dec - and not use_auto_parallel), - gemm_swiglu_plugin_dtype=gemm_swiglu_plugin, - use_fused_rg_lru=model.config.architecture - in ["RecurrentGemmaForCausalLM"], - use_unfused_qkv_gemm=use_auto_parallel, - use_prompt_tuning=(build_config.max_prompt_embedding_table_size > 0), - use_lora=build_config.plugin_config.lora_plugin is not None, - max_lora_rank=build_config.lora_config.max_lora_rank, - use_fp8_context_fmha=( - model.config.quantization.quant_algo == QuantAlgo.FP8 - and build_config.plugin_config.use_fp8_context_fmha), - ) - - if is_enc_dec: - model.precompute_relative_attention_bias(build_config) + model = optimize_model_with_config(model, build_config) builder = Builder() builder_config = builder.create_builder_config( @@ -793,6 +802,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: network = builder.create_network() network.plugin_config = build_config.plugin_config + use_auto_parallel = build_config.auto_parallel_config.enabled use_weight_only = model.config.quant_mode.is_weight_only() per_group = model.config.quant_mode.has_per_group_scaling() use_smooth_quant = model.config.quant_mode.has_act_and_weight_quant() @@ -821,7 +831,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: "max_input_len": build_config.max_input_len, "max_seq_len": - build_config.max_input_len + build_config.max_output_len, + build_config.max_seq_len, "use_cache": True, "max_beam_width": @@ -846,7 +856,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: } if model.config.architecture == "DecoderModel": - prepare_input_args["max_seq_len"] = build_config.max_output_len + prepare_input_args["max_seq_len"] = build_config.max_seq_len prepare_input_args[ "max_decoder_input_len"] = build_config.max_input_len prepare_input_args[ diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index cc4e6a556..1c614c445 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -78,7 +78,14 @@ def parse_arguments(): help='The number of workers for building in parallel') parser.add_argument('--max_batch_size', type=int, default=256) 
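# Editor's sketch (not part of the patch): the flag changes that follow replace
# --max_output_len with --max_seq_len, which bounds prompt plus generated tokens.
# The back-compat shim added below derives it as max_input_len + max_output_len,
# matching the example command updates elsewhere in this patch:
#   --max_input_len 512  --max_output_len 512  ->  --max_seq_len 1024  (skywork)
#   --max_input_len 3000 --max_output_len 100  ->  --max_seq_len 3100  (recurrentgemma)
#   --max_input_len 1919 --max_output_len 128  ->  --max_seq_len 2047  (weight stripping)
def derive_max_seq_len(max_input_len: int, max_output_len: int) -> int:
    """Sketch of the deprecation rule applied when --max_output_len is still passed."""
    return max_input_len + max_output_len

assert derive_max_seq_len(512, 512) == 1024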
parser.add_argument('--max_input_len', type=int, default=1024) - parser.add_argument('--max_output_len', type=int, default=1024) + parser.add_argument( + '--max_seq_len', + '--max_decoder_seq_len', + dest='max_seq_len', + type=int, + default=2048, + help="Max total length of context and generated sequence") + parser.add_argument('--max_output_len', type=int, default=None) parser.add_argument('--max_beam_width', type=int, default=1) parser.add_argument('--max_num_tokens', type=int, default=8192) parser.add_argument( @@ -292,9 +299,10 @@ def build_model(build_config: BuildConfig, rank_config = copy.deepcopy(model_config) rank_config.set_rank(rank) - assert architecture in MODEL_MAP, \ - f"Unsupported model architecture: {architecture}" - model_cls = MODEL_MAP[architecture] + if model_cls is None: + assert architecture in MODEL_MAP, \ + f"Unsupported model architecture: {architecture}" + model_cls = MODEL_MAP[architecture] if ckpt_dir is None: model = model_cls(rank_config) else: @@ -408,6 +416,7 @@ def main(): workers = min(torch.cuda.device_count(), args.workers) plugin_config = PluginConfig.from_arguments(args) + kwargs = { 'logits_dtype': args.logits_dtype, 'use_fused_mlp': args.use_fused_mlp, @@ -432,10 +441,26 @@ def main(): else: cluster_config = infer_cluster_config() + if args.max_output_len: + logger.warning( + '--max_output_len has been deprecated in favor of --max_seq_len' + ) + if args.max_input_len: + if args.max_seq_len: + logger.warning( + '--max_seq_len has been overwritten due to --max_output_len being specified' + ) + args.max_seq_len = args.max_input_len + args.max_output_len + else: + raise Exception( + f"--max_output_len is specified but not --max_input_len") + + del args.max_output_len + build_config = BuildConfig.from_dict( { 'max_input_len': args.max_input_len, - 'max_output_len': args.max_output_len, + 'max_seq_len': args.max_seq_len, 'max_batch_size': args.max_batch_size, 'max_beam_width': args.max_beam_width, 'max_num_tokens': args.max_num_tokens, diff --git a/tensorrt_llm/executor.py b/tensorrt_llm/executor.py index f78caf382..2aefb7714 100644 --- a/tensorrt_llm/executor.py +++ b/tensorrt_llm/executor.py @@ -1,11 +1,13 @@ import asyncio +import atexit import datetime import secrets +import threading +import time from abc import ABC, abstractmethod from multiprocessing.connection import Client, Listener from pathlib import Path from queue import Queue -from threading import Thread from typing import Any, Dict, Generator, List, Optional, Tuple, Union import numpy as np @@ -21,15 +23,14 @@ from tensorrt_llm.hlapi.utils import (ContextManager, GenerationOutput, print_traceback_on_error) -from . import bindings as tllm from ._utils import mpi_rank, mpi_world_size -from .bindings import executor as tllme +from .bindings import executor as tllm from .hlapi.mpi_session import (MpiPoolSession, MpiSession, external_mpi_comm_available, find_free_port, need_spawn_mpi_workers) from .hlapi.tokenizer import TokenizerBase, tokenizer_factory from .hlapi.utils import (ContextManager, GenerationOutput, SamplingParams, - print_traceback_on_error) + exception_handler, print_traceback_on_error) def has_event_loop() -> bool: @@ -78,7 +79,7 @@ def set_id(self, id): self.id = id return self - def as_executor_request(self) -> tllme.Request: + def as_executor_request(self) -> tllm.Request: # Request # TODO: Should we unify the pad_id/end_id logic? 
end_id = self.tokenizer.eos_token_id if self.tokenizer is not None else None @@ -88,24 +89,38 @@ def as_executor_request(self) -> tllme.Request: pad_id = end_id if pad_id is None else pad_id request_kwargs = { - "input_token_ids": self.input_ids.squeeze().tolist(), - "max_new_tokens": self.sampling_params.max_new_tokens or 32, - "streaming": self.streaming, - "sampling_config": self.sampling_params._get_sampling_config(), - "end_id": end_id, - "pad_id": pad_id, - "output_config": self.sampling_params._get_output_config(), + "input_token_ids": + self.input_ids.squeeze().tolist(), + "max_new_tokens": + self.sampling_params.max_new_tokens or 32, + "streaming": + self.streaming, + "sampling_config": + self.sampling_params._get_sampling_config(), + "end_id": + end_id, + "pad_id": + pad_id, + "output_config": + self.sampling_params._get_output_config(), # The following options in the Executor API are not yet exposed by the HLAPI: # https://jirasw.nvidia.com/browse/TRTLLM-489 - "bad_words": self.sampling_params.bad_words or [], - "stop_words": self.sampling_params.stop_words or [], - "embedding_bias": None, #TODO - "external_draft_tokens_config": None, #TODO - "prompt_tuning_config": None, #TODO - "lora_config": None, #TODO - "logits_post_processor_name": None, #TODO + "bad_words": + self.sampling_params.bad_words or [], + "stop_words": + self.sampling_params.stop_words or [], + "embedding_bias": + self.sampling_params.embedding_bias, + "external_draft_tokens_config": + self.sampling_params.external_draft_tokens_config, + "prompt_tuning_config": + self.sampling_params.prompt_tuning_config, + "lora_config": + self.sampling_params.lora_config, + "logits_post_processor_name": + self.sampling_params.logits_post_processor_name, } - request = tllme.Request(**request_kwargs) + request = tllm.Request(**request_kwargs) return request @@ -160,7 +175,7 @@ def result_step(self, timeout: Optional[float] = None): self.handle_generation_msg(tensors, error) async def aresult_step(self): - assert self.aqueue is not None + assert self.aqueue is not None, "The asyncio event loop was not present during initialization, so async operations are not available." _, tensors, self._done, error = await self.aqueue.get() self.handle_generation_msg(tensors, error) @@ -238,6 +253,11 @@ class GenerationExecutor(ABC): def __init__(self): self.id_counter = GenerationExecutor.TERMINATE_REQUEST_ID + 1 self.tokenizer = None + self._stats = None + self.stats_queue = None + + exception_handler.register(self) + atexit.register(self.shutdown) def generate_id(self) -> int: gen_id = self.id_counter @@ -309,24 +329,30 @@ def generate( def shutdown(self): pass - @abstractmethod + def create_stats_queue(self): + # Stats queue is created during first submission to ensure event loop exists if it is needed. + if not self._stats: + if has_event_loop(): + self._stats = AsyncQueue() + self.stats_queue = self._stats.sync_q + self.stats_aqueue = self._stats.async_q + else: + self._stats = Queue() + self.stats_queue = self._stats + self.stats_aqueue = None + def get_stats(self): - pass + return self.stats_queue.get() - @abstractmethod async def aget_stats(self): - pass + assert self.stats_aqueue is not None, "The asyncio event loop was not present during initialization, so async operations are not available." + return await self.stats_aqueue.get() @staticmethod def create( engine_dir: Path, tokenizer: Union[str, Path, TokenizerBase], - executor_type: tllm.TrtGptModelType = tllm.TrtGptModelType. 
- InflightFusedBatching, - scheduler_config: tllme.SchedulerConfig = tllme.SchedulerConfig( - tllme.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT), - executor_config: tllm.TrtGptModelOptionalParams = tllm. - TrtGptModelOptionalParams(), + executor_config: tllm.ExecutorConfig = tllm.ExecutorConfig(1), model_world_size: int = 1, world_size: int = 0, mpi_session: Optional[MpiSession] = None, @@ -345,8 +371,6 @@ def create( worker_kwargs = { "engine_dir": engine_dir, "tokenizer": tokenizer, - "executor_type": executor_type, - "scheduler_config": scheduler_config, "executor_config": executor_config, } @@ -373,60 +397,25 @@ def __init__( self, engine_dir: Path, tokenizer: Union[str, Path, TokenizerBase, None], - executor_type: tllm.TrtGptModelType = tllm.TrtGptModelType. - InflightFusedBatching, - scheduler_config: tllme.SchedulerConfig = tllme.SchedulerConfig( - tllme.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT), - executor_config: tllm.TrtGptModelOptionalParams = tllm. - TrtGptModelOptionalParams(), + executor_config: tllm.ExecutorConfig = tllm.ExecutorConfig(1), ) -> None: super().__init__() self.engine = None self.tokenizer = tokenizer_factory(tokenizer) - self._stats = None self._results: Dict[int, GenerationResult] = {} self._pending: set = set() self.result_queue = None self.rank = mpi_rank() - # Convert config to Executor config. - config = tllme.ExecutorConfig( - max_beam_width=executor_config.max_beam_width - if executor_config.max_beam_width else 1, - batching_type=self.convert_executor_type(executor_type), - scheduler_config=scheduler_config) - # Translate additional options from TrtGptModelOptionalParams - config.kv_cache_config = tllme.KvCacheConfig( - enable_block_reuse=executor_config.kv_cache_config. - enable_block_reuse, - max_tokens=executor_config.kv_cache_config.max_tokens, - max_attention_window=executor_config.kv_cache_config. - max_attention_window, - sink_token_length=executor_config.kv_cache_config.sink_token_length, - free_gpu_memory_fraction=executor_config.kv_cache_config. - free_gpu_memory_fraction) - if executor_config.device_ids: - config.parallel_config = tllme.ParallelConfig( - device_ids=executor_config.device_ids) - config.enable_chunked_context = executor_config.enable_chunked_context - config.normalize_log_probs = executor_config.normalize_log_probs - config.decoding_config = executor_config.decoding_config - assert not executor_config.enable_trt_overlap, "enable_trt_overlap is not supported." - self.engine = tllme.Executor(engine_dir, - tllme.ModelType.DECODER_ONLY, - executor_config=config) - self.awaiter_thread = Thread(target=self.awaiter_loop) - self.running = True - - def convert_executor_type(self, executor_type): - batching_type_map = { - tllm.TrtGptModelType.V1: tllme.BatchingType.STATIC, - tllm.TrtGptModelType.InflightFusedBatching: - tllme.BatchingType.INFLIGHT, - } - assert executor_type in batching_type_map, f"executor_type={executor_type} is not supported." - return batching_type_map[executor_type] + self.engine = tllm.Executor(engine_dir, + tllm.ModelType.DECODER_ONLY, + executor_config=executor_config) + self.awaiter_stop_event = threading.Event() + self.awaiter_thread = threading.Thread(target=self.awaiter_loop, + daemon=True) + self.stats_thread = threading.Thread(target=self.stats_loop, + daemon=True) def create_stats_queue(self): # Stats queue is created during first submission to ensure event loop exists if it is needed. 
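# Editor's sketch (not part of the patch): with the worker refactor above, configuration
# now flows through the executor bindings' ExecutorConfig instead of
# TrtGptModelOptionalParams. A minimal caller might look roughly like this; the engine
# path and tokenizer are placeholders, and the keyword usage follows the code shown in
# this patch.
from pathlib import Path

from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import GenerationExecutor

executor_config = tllm.ExecutorConfig(1)  # the positional 1 mirrors the default used above
executor_config.kv_cache_config = tllm.KvCacheConfig(free_gpu_memory_fraction=0.9)

executor = GenerationExecutor.create(
    Path("/path/to/engine_dir"),   # placeholder engine directory
    "Qwen/Qwen-7B",                # tokenizer name or path
    executor_config=executor_config,
)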
@@ -441,8 +430,15 @@ def create_stats_queue(self): self.stats_aqueue = None def set_result_queue(self, queue): + """In multi-gpu mode, result_queue will be set here to communicate between the proxy and the worker 0 process.""" self.result_queue = queue + def set_stats_queue(self, queue): + """In multi-gpu mode, stats_queue will be set here to communicate between the proxy and the worker 0 process.""" + self._stats = queue + self.stats_queue = self._stats + self.stats_aqueue = None + def return_queue(self, req_id: int): """ If a centralized result queue is registered (used for communication with the proxy) send the message there. @@ -458,9 +454,14 @@ def start_awaiter_thread(self): ) and not self.awaiter_thread.is_alive(): self.awaiter_thread.start() + def start_stats_thread(self): + if self.engine.can_enqueue_requests( + ) and not self.stats_thread.is_alive(): + self.stats_thread.start() + def awaiter_loop(self): """ Gets responses from executor and places in the return queue.""" - while self.running: + while not self.awaiter_stop_event.is_set(): # Get responses and place in queue. for response in self.engine.await_responses( timeout=datetime.timedelta(milliseconds=100)): @@ -480,9 +481,14 @@ def awaiter_loop(self): None)) if response.result.is_final: self._pending.remove(req_id) + + def stats_loop(self): + while not self.awaiter_stop_event.is_set(): + time.sleep(0.1) # Get stats and place in queue. for stats in self.engine.get_latest_iteration_stats(): - while self.stats_queue.full(): + while hasattr(self.stats_queue, + "full") and self.stats_queue.full(): self.stats_queue.get() self.stats_queue.put(stats.to_json_str()) @@ -494,6 +500,7 @@ def submit(self, request: GenerationRequest) -> GenerationResult: raise NotImplementedError("Only rank 0 can submit requests.") self.create_stats_queue() self.start_awaiter_thread() + self.start_stats_thread() req_id = self.engine.enqueue_request(request.as_executor_request()) request.set_id(req_id) @@ -502,19 +509,14 @@ def submit(self, request: GenerationRequest) -> GenerationResult: self._pending.add(req_id) return result - def get_stats(self): - return self.stats_queue.get() - - async def aget_stats(self): - assert self.stats_aqueue is not None - return await self.stats_aqueue.get() - def shutdown(self): if self.engine is not None: - self.running = False + self.awaiter_stop_event.set() if self.engine.can_enqueue_requests(): if self.awaiter_thread.is_alive(): self.awaiter_thread.join() + if self.stats_thread.is_alive(): + self.stats_thread.join() self.engine.shutdown() self.engine = None @@ -610,7 +612,12 @@ def __init__( secrets.token_bytes(512)) self.result_queue = Fifo(result_queue_addr, is_server=True) + stats_queue_addr = ("127.0.0.1", find_free_port(), + secrets.token_bytes(512)) + self.mp_stats_queue = Fifo(stats_queue_addr, is_server=True) + self._results: Dict[int, GenerationResult] = {} + self._request_id_dispatcher_queue = Queue(maxsize=100) if mpi_session is None: self.mpi_session = MpiPoolSession(n_workers=model_world_size) @@ -623,8 +630,13 @@ def __init__( "request_queue_addr": request_queue_addr, "request_id_queue_addr": request_id_queue_addr, "result_queue_addr": result_queue_addr, + "stats_queue_addr": stats_queue_addr, }) - self.dispatcher = Thread(target=self.dispatcher_thread) + self.workers_init_ok = False + self.dispatcher = threading.Thread(target=self.dispatcher_thread, + daemon=True) + self.stats_thread = threading.Thread(target=self.stats_main, + daemon=True) @print_traceback_on_error @staticmethod @@ -634,12 +646,8 @@ def 
workers_main( request_queue_addr: Tuple[str, int, bytes], request_id_queue_addr: Tuple[str, int, bytes], result_queue_addr: Tuple[str, int, bytes], - executor_type: tllm.TrtGptModelType = tllm.TrtGptModelType. - InflightFusedBatching, - scheduler_config: tllme.SchedulerConfig = tllme.SchedulerConfig( - tllme.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT), - executor_config: tllm.TrtGptModelOptionalParams = tllm. - TrtGptModelOptionalParams() + stats_queue_addr: Tuple[str, int, bytes], + executor_config: tllm.ExecutorConfig = tllm.ExecutorConfig(1) ) -> None: result_queue = None @@ -647,6 +655,7 @@ def workers_main( request_queue = Fifo(request_queue_addr, is_server=False) request_id_queue = Fifo(request_id_queue_addr, is_server=False) result_queue = Fifo(result_queue_addr, is_server=False) + mp_stats_queue = Fifo(stats_queue_addr, is_server=False) # Only the failure on rank0 can be captured here. All the non-rank0 process will hang once the executor runtime # is successfully initialized, that is controlled within cpp runtime. @@ -655,12 +664,10 @@ def workers_main( init_ok = True try: executor = ExecutorBindingsWorker(engine_dir, tokenizer, - executor_type, scheduler_config, executor_config) except Exception as e: init_ok = False raise e - finally: if mpi_rank() == 0: result_queue.put(init_ok) @@ -668,11 +675,13 @@ def workers_main( with ContextManager(executor) as executor: if mpi_rank() == 0: executor.set_result_queue(result_queue) + executor.set_stats_queue(mp_stats_queue) while (req := request_queue.get()) is not None: result = executor.submit(req) request_id_queue.put(result.generation_request.id) result_queue.put(None) + mp_stats_queue.put(None) else: executor.block_subordinates() @@ -681,26 +690,44 @@ def dispatcher_thread(self): correct GenerationResult queues. 
""" while (res := self.result_queue.get()) is not None: - req_id = res[0] + req_id, *_ = res + # Wait for this result ready in self._results + while req_id not in self._results or self._request_id_dispatcher_queue.full( + ): + self._request_id_dispatcher_queue.get() self._results[req_id].queue.put(res) + def stats_main(self): + while (stats := self.mp_stats_queue.get()) is not None: + time.sleep(0.1) + while self.stats_queue.full(): + self.stats_queue.get() + self.stats_queue.put(stats) + def start(self): self.mpi_futures = self.mpi_session.submit( ExecutorBindingsProxy.workers_main, **self.workers_kwargs) self.workers_started = True - ack = self.result_queue.get() - if not ack: + self.workers_init_ok = self.result_queue.get() + if not self.workers_init_ok: raise RuntimeError("worker initialization failed") self.dispatcher.start() + self.create_stats_queue() + self.stats_thread.start() def shutdown(self): if not self.workers_started: return - self.request_queue.put(None) + if self.workers_init_ok: + self.request_queue.put(None) for f in self.mpi_futures: f.result() if self.dispatcher.is_alive(): + self.result_queue.put(None) self.dispatcher.join() + if self.stats_thread.is_alive(): + self.mp_stats_queue.put(None) + self.stats_thread.join() self.workers_started = False def submit(self, request: GenerationRequest) -> GenerationResult: @@ -725,17 +752,10 @@ def submit(self, request: GenerationRequest) -> GenerationResult: result = GenerationResult(request, tokenizer) self._results[req_id] = result request.tokenizer = tokenizer + self._request_id_dispatcher_queue.put(req_id) return result - def get_stats(self): - # TODO: https://jirasw.nvidia.com/browse/TRTLLM-514 - pass - - async def aget_stats(self): - # TODO: https://jirasw.nvidia.com/browse/TRTLLM-514 - pass - def __del__(self): self.shutdown() diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 132b5c6a8..037e52f10 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -308,12 +308,10 @@ def mark_output(self, if name is None: name = self.name - if dtype is None: - dtype = self.dtype - elif isinstance(dtype, str): + if isinstance(dtype, str): dtype = str_dtype_to_trt(dtype) - assert isinstance(dtype, trt.DataType) + assert dtype is None or isinstance(dtype, trt.DataType) default_net()._mark_output(self, name, dtype) def __add__(self, b): @@ -1371,27 +1369,43 @@ def arange(start: Union[Tensor, int], end: Union[Tensor, int], The tensor produced by the fill layer. It is a 1D tensor containing `end-start` elements of type `dtype`. 
''' + res_dtype = str_dtype_to_trt(dtype) if isinstance(start, int): assert isinstance(end, int) - start = constant(int32_array(start)) - end = constant(int32_array(end)) + array_func = int32_array if res_dtype == trt.int32 else int64_array + start = constant(array_func(start)) + end = constant(array_func(end)) elif isinstance(start, Tensor): assert isinstance(end, Tensor) + assert start.dtype == trt.int32 or start.dtype == trt.int64 + assert end.dtype == trt.int32 or end.dtype == trt.int64 + if start.dtype != end.dtype: + if start.dtype == trt.int32: # end == trt.int64 + if res_dtype == trt.int32: + end = cast(end, "int32") + else: + start = cast(start, "int64") + else: # start == trt.int64 and end == trt.int32 + if res_dtype == trt.int32: + start = cast(start, "int32") + else: + end = cast(end, "int64") else: raise TypeError("%s is not supported" % type(start)) - step = constant(int32_array([1])) + assert start.dtype == end.dtype, f"start type ({start.dtype}) != end type ({end.dtype})" + step = constant_to_tensor_(1, dtype=start.dtype, to_array=True) num = end - start num = num.view([1]) layer = default_trtnet().add_fill([0], trt.FillOperation.LINSPACE, - trt.int32) + start.dtype) layer.set_input(0, num.trt_tensor) # rank = 1 layer.set_input(1, start.trt_tensor) # rank = 0 layer.set_input(2, step.trt_tensor) # rank = 1 tensor = _create_tensor(layer.get_output(0), layer) - if tensor.dtype != str_dtype_to_trt(dtype): + if tensor.dtype != res_dtype: tensor = tensor.cast(dtype) return tensor @@ -2305,9 +2319,11 @@ def cumsum(input: Tensor, dim: int, prefer_plugin: bool = True) -> Tensor: else: # credit to Apple reduction_length = shape(input, -1) - reduction_range = arange(constant_to_tensor_(0, to_array=False), + reduction_range = arange(constant_to_tensor_(0, + dtype='int64', + to_array=False), reduction_length, - dtype='int32') + dtype='int64') lower_triangle = cast( unsqueeze(reduction_range, 0) <= unsqueeze(reduction_range, 1), dtype=input.dtype) @@ -3666,6 +3682,67 @@ def has_bias(self): return 1 if self.bias is not None else 0 +def create_allreduce_plugin( + network: trt.INetworkDefinition, + tensor: trt.ITensor, + workspace: Optional[trt.ITensor], + group: np.array, + strategy: AllReduceStrategy, + dtype: trt.DataType, + config: AllReduceConfig, + counter: int, + reduce_fusion_params: AllReduceFusionParams, +): + allreduce_plg_creator = trt.get_plugin_registry().get_plugin_creator( + 'AllReduce', '1', TRT_LLM_PLUGIN_NAMESPACE) + assert allreduce_plg_creator is not None + + pf_group = trt.PluginField("group", group, trt.PluginFieldType.INT32) + pf_dtype = trt.PluginField("type_id", np.array([int(dtype)], np.int32), + trt.PluginFieldType.INT32) + pfc = [pf_group, pf_dtype] + p_strategy = trt.PluginField("strategy", np.array([int(strategy)], np.int8), + trt.PluginFieldType.INT8) + pfc.append(p_strategy) + p_config = trt.PluginField("config", np.array([int(config)], np.int8), + trt.PluginFieldType.INT8) + pfc.append(p_config) + p_fusion_op = trt.PluginField( + "fusion_op", np.array([int(reduce_fusion_params.fusion_op)], np.int8), + trt.PluginFieldType.INT8) + pfc.append(p_fusion_op) + p_counter = trt.PluginField("counter", np.array([counter], np.int32), + trt.PluginFieldType.INT32) + pfc.append(p_counter) + p_eps = trt.PluginField( + "eps", np.array([float(reduce_fusion_params.eps)], np.float32), + trt.PluginFieldType.FLOAT32) + pfc.append(p_eps) + p_affine = trt.PluginField( + "affine", np.array([int(reduce_fusion_params.has_affine())], np.int8), + trt.PluginFieldType.INT8) + 
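# Editor's annotation (not part of the patch): the rest of this helper assembles the
# plugin inputs in a fixed order: the tensor to reduce, then the all-reduce workspace
# when the strategy is not NCCL, and, only for the RESIDUAL_RMS_NORM fusion op, the
# optional bias, the residual, and the optional norm weight. The rewritten allreduce()
# further below is the in-tree caller of this helper.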
pfc.append(p_affine) + p_bias = trt.PluginField( + "bias", np.array([int(reduce_fusion_params.has_bias())], np.int8), + trt.PluginFieldType.INT8) + pfc.append(p_bias) + + pfc = trt.PluginFieldCollection(pfc) + ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) + plug_inputs = [tensor] + if strategy != AllReduceStrategy.NCCL: + plug_inputs.append(workspace) + if reduce_fusion_params.fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM: + if reduce_fusion_params.has_bias() == 1: + plug_inputs.append(reduce_fusion_params.bias.trt_tensor) + plug_inputs.append(reduce_fusion_params.residual.trt_tensor) + if reduce_fusion_params.has_affine() == 1: + plug_inputs.append(reduce_fusion_params.norm_weight.trt_tensor) + + layer = network.add_plugin_v2(plug_inputs, ar_plug) + return layer, allreduce_plg_creator, pfc + + def allreduce( tensor: Tensor, group: List[int], @@ -3706,73 +3783,33 @@ def allreduce( The tensor produced by that layer. ''' - allreduce_plg_creator = trt.get_plugin_registry().get_plugin_creator( - 'AllReduce', '1', TRT_LLM_PLUGIN_NAMESPACE) - if strategy is None: if default_net().plugin_config.use_custom_all_reduce: strategy = AllReduceStrategy.AUTO else: strategy = AllReduceStrategy.NCCL - if reduce_fusion_params is None: - reduce_fusion_params = AllReduceFusionParams() - counter = 0 workspace = None - + counter = 0 if strategy != AllReduceStrategy.NCCL: + workspace = current_all_reduce_helper().workspace.trt_tensor counter = current_all_reduce_helper().gen_id() - workspace = current_all_reduce_helper().workspace - - assert allreduce_plg_creator is not None - - group = trt.PluginField("group", np.array(group, dtype=np.int32), - trt.PluginFieldType.INT32) - - p_dtype = default_net().plugin_config.nccl_plugin - pf_dtype = trt.PluginField( - "type_id", np.array([int(str_dtype_to_trt(p_dtype))], np.int32), - trt.PluginFieldType.INT32) - pfc = [group, pf_dtype] - p_strategy = trt.PluginField("strategy", np.array([int(strategy)], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_strategy) - p_config = trt.PluginField("config", np.array([int(config)], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_config) - p_fusion_op = trt.PluginField( - "fusion_op", np.array([int(reduce_fusion_params.fusion_op)], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_fusion_op) - p_counter = trt.PluginField("counter", np.array([counter], np.int32), - trt.PluginFieldType.INT32) - pfc.append(p_counter) - p_eps = trt.PluginField( - "eps", np.array([float(reduce_fusion_params.eps)], np.float32), - trt.PluginFieldType.FLOAT32) - pfc.append(p_eps) - p_affine = trt.PluginField( - "affine", np.array([int(reduce_fusion_params.has_affine())], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_affine) - p_bias = trt.PluginField( - "bias", np.array([int(reduce_fusion_params.has_bias())], np.int8), - trt.PluginFieldType.INT8) - pfc.append(p_bias) - pfc = trt.PluginFieldCollection(pfc) - ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) - plug_inputs = [tensor.cast(p_dtype).trt_tensor] - if strategy != AllReduceStrategy.NCCL: - plug_inputs.append(workspace.trt_tensor) - if reduce_fusion_params.fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM: - if reduce_fusion_params.has_bias() == 1: - plug_inputs.append(reduce_fusion_params.bias.trt_tensor) - plug_inputs.append(reduce_fusion_params.residual.trt_tensor) - if reduce_fusion_params.has_affine() == 1: - plug_inputs.append(reduce_fusion_params.norm_weight.trt_tensor) + if reduce_fusion_params is None: + reduce_fusion_params = 
AllReduceFusionParams() - layer = default_trtnet().add_plugin_v2(plug_inputs, ar_plug) + dtype = default_net().plugin_config.nccl_plugin + layer, allreduce_plg_creator, pfc = create_allreduce_plugin( + network=default_trtnet(), + tensor=tensor.cast(dtype).trt_tensor, + workspace=workspace, + group=np.array(group, dtype=np.int32), + strategy=strategy, + dtype=str_dtype_to_trt(dtype), + config=config, + counter=counter, + reduce_fusion_params=reduce_fusion_params, + ) _add_plugin_info(layer, allreduce_plg_creator, "allreduce", pfc) if reduce_fusion_params.fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM: final_output = _create_tensor(layer.get_output(0), @@ -4920,7 +4957,7 @@ def gpt_attention( if spec_decoding_packed_mask is not None: # add position_ids as well only if speculative decoding mode assert spec_decoding_position_offsets is not None - assert spec_decoding_position_offsets is not None + assert spec_decoding_generation_lengths is not None plug_inputs += [ spec_decoding_generation_lengths, spec_decoding_packed_mask, spec_decoding_position_offsets diff --git a/tensorrt_llm/hlapi/llm.py b/tensorrt_llm/hlapi/llm.py index 444078b43..0b2179d0c 100644 --- a/tensorrt_llm/hlapi/llm.py +++ b/tensorrt_llm/hlapi/llm.py @@ -7,16 +7,16 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Iterable, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Union import tensorrt as trt import torch -from .. import bindings as tllm from .._utils import mpi_barrier, mpi_rank, release_gc from ..auto_parallel import AutoParallelConfig, infer_cluster_config -from ..bindings import KvCacheConfig -from ..bindings.executor import CapacitySchedulerPolicy +from ..bindings import executor as tllm +from ..bindings.executor import (CapacitySchedulerPolicy, DecodingConfig, + KvCacheConfig) from ..builder import BuildConfig, Engine, EngineConfig, build from ..executor import GenerationExecutor, GenerationResult from ..logger import logger @@ -28,14 +28,13 @@ MpiSession, external_mpi_comm_available) from .tokenizer import TokenizerBase, TransformersTokenizer from .utils import (GenerationOutput, GpuArch, SamplingParams, - download_hf_model, file_with_glob_exists, + download_hf_model, exception_handler, file_with_glob_exists, file_with_suffix_exists, get_device_count, init_log_level, print_colored, print_traceback_on_error) init_log_level( ) # This should be called before importing the following cpp-runtime modules -from ..bindings.executor import CapacitySchedulerPolicy from ..builder import BuildConfig, Engine, EngineConfig, build from ..executor import GenerationExecutor, GenerationResult @@ -252,6 +251,9 @@ def __init__(self, tokenizer: Optional[TokenizerBase] = None, dtype: str = 'auto', kv_cache_config: Optional[KvCacheConfig] = None, + logits_post_processor_map: Optional[Dict[str, Callable[ + [int, torch.Tensor, List[List[int]], int], None]]] = None, + decoding_config: Optional[DecodingConfig] = None, streaming_llm: Union[bool, StreamingLLMParam] = False, async_engine_tmp_dir: Optional[str] = None, **_additional_options: Any): @@ -267,6 +269,10 @@ def __init__(self, (2) implicitly specify `auto` (default), then `dtype` will be automatically inferred from the source model. However, if the source `dtype` is `float32`, will use `float16` instead. kv_cache_config (KvCacheConfig): The config for the paged KV cache. 
+ logits_post_processor_map (Dict[str, Callable[[int, torch.Tensor, List[List[int]], int], None]]): + Optional, a map of logits post processor functions. + decoding_config (DecodingConfig): + Optional, the config for speculative decoding. streaming_llm (bool, StreamingLLMParam): Whether to enable the streaming LLM mode. async_engine_tmp_dir (str): @@ -289,6 +295,8 @@ def __init__(self, 'SHARDING_ALONG_HIDDEN' means parallelism enabled with lookup table weight sharded along the hidden dimension. share_embedding_table (bool): Whether to share the weight between token embedding lookup table and lm_head. + peft_cache_config (PeftCacheConfig) + The configuration for the peft cache. ''' self.config = config @@ -297,6 +305,8 @@ def __init__(self, self.dtype = dtype self.async_engine_tmp_dir = async_engine_tmp_dir self.kv_cache_config = kv_cache_config + self.logits_post_processor_map = logits_post_processor_map + self.decoding_config = decoding_config # TODO[chunweiy]: add doc for enable_streaming_llm self.enable_streaming_llm = streaming_llm if self.enable_streaming_llm is True: @@ -317,6 +327,8 @@ def __init__(self, CapacitySchedulerPolicy.GUARANTEED_NO_EVICT) self.context_chunking_policy = _additional_options.pop( 'context_chunking_policy', None) + self.peft_cache_config = _additional_options.pop( + 'peft_cache_config', None) self._convert_checkpoint_options = {} # TODO: Move these options to ParallelConfig @@ -428,6 +440,7 @@ def __init__(self, True) self._build_model() + exception_handler.register(self) def generate( self, @@ -514,11 +527,10 @@ def _generate_check_arguments(self, prompts, # TODO(enweiz): move tokenizer from GenerationExecutor to LLM and validate on token ids here prompt_len = len(prompt.split()) - if prompt_len + sp.max_new_tokens > build_config.max_input_len + build_config.max_output_len: + if prompt_len + sp.max_new_tokens > build_config.max_seq_len: raise ValueError( f"The sum of prompt length ({prompt_len}) and max_new_tokens ({sp.max_new_tokens}) should not exceed " - f"the sum of max_input_len ({build_config.max_input_len}) and max_output_len ({build_config.max_output_len})" - ) + f"max_seq_len ({build_config.max_seq_len})") if sp.beam_width > build_config.max_beam_width: raise ValueError( f"sampling_params's beam_width ({sp.beam_width}) should not exceed max_beam_width ({build_config.max_beam_width})" @@ -641,9 +653,19 @@ def get_engine_dir(): if not isinstance(tokenizer, TokenizerBase): tokenizer = ModelLoader.load_hf_tokenizer(self.config.model_dir) - executor_config = tllm.TrtGptModelOptionalParams() + executor_config = tllm.ExecutorConfig( + max_beam_width=self.config.build_config.max_beam_width, + scheduler_config=tllm.SchedulerConfig( + self.capacity_scheduling_policy, self.context_chunking_policy), + batching_type=tllm.BatchingType.INFLIGHT) if self.kv_cache_config is not None: executor_config.kv_cache_config = self.kv_cache_config + if self.peft_cache_config is not None: + executor_config.peft_cache_config = self.peft_cache_config + if self.decoding_config is not None: + executor_config.decoding_config = self.decoding_config + if self.logits_post_processor_map is not None: + executor_config.logits_post_processor_map = self.logits_post_processor_map executor_config.normalize_log_probs = self.normalize_log_probs executor_config.enable_chunked_context = self.enable_chunked_context executor_config.max_beam_width = self.config.build_config.max_beam_width @@ -652,11 +674,8 @@ def get_engine_dir(): get_engine_dir(), tokenizer, executor_config=executor_config, - 
scheduler_config=tllm.executor.SchedulerConfig( - self.capacity_scheduling_policy, self.context_chunking_policy), model_world_size=self.config.parallel_config.world_size, mpi_session=self.mpi_session, - executor_type=tllm.TrtGptModelType.InflightFusedBatching, reuse_mpi_comm=external_mpi_comm_available( self.config.parallel_config.world_size)) diff --git a/tensorrt_llm/hlapi/utils.py b/tensorrt_llm/hlapi/utils.py index 255ee6125..751aac334 100644 --- a/tensorrt_llm/hlapi/utils.py +++ b/tensorrt_llm/hlapi/utils.py @@ -4,10 +4,11 @@ import sys import tempfile import traceback +import weakref from dataclasses import dataclass, field from functools import wraps from pathlib import Path -from typing import List, Optional, Union +from typing import Any, Callable, List, Optional, Union import filelock import huggingface_hub @@ -15,7 +16,7 @@ from huggingface_hub import snapshot_download from tensorrt_llm.bindings import executor as tllme -from tensorrt_llm.logger import set_level +from tensorrt_llm.logger import Singleton, set_level def print_traceback_on_error(func): @@ -40,8 +41,13 @@ class SamplingParams: end_id (int): The end token id. pad_id (int): The pad token id. max_new_tokens (int): The maximum number of tokens to generate. - bad_words: List[List[int]]: A list of bad words tokens. Each "word" can be composed of multiple tokens. - stop_words: List[List[int]]: A list of stop words tokens. Each "word" can be composed of multiple tokens. + bad_words (List[List[int]]): A list of bad words tokens. Each "word" can be composed of multiple tokens. + stop_words (List[List[int]]): A list of stop words tokens. Each "word" can be composed of multiple tokens. + embedding_bias (torch.Tensor): The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. + external_draft_tokens_config (ExternalDraftTokensConfig): The speculative decoding configuration. + prompt_tuning_config (PromptTuningConfig): The prompt tuning configuration. + lora_config (LoraConfig): The LoRA configuration. + logits_post_processor_name (str): The logits postprocessor name. Must correspond to one of the logits postprocessor name provided to the ExecutorConfig. beam_width (int): The beam width. Default is 1 which disables beam search. top_k (int): Controls number of logits to sample from. Default is 0 (all logits). 
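Editorial aside, not part of the patch: this change threads a named logits post-processor from the new `logits_post_processor_map` argument of `LLM` through to `SamplingParams.logits_post_processor_name`. A minimal sketch of the intended wiring follows; the callback parameter names, the `ModelConfig(model_dir=...)` construction, the import paths, and the shape of the `generate()` call are assumptions for illustration only, not confirmed API details.

from typing import List

import torch

from tensorrt_llm.hlapi.llm import LLM, ModelConfig   # import path / ModelConfig assumed
from tensorrt_llm.hlapi.utils import SamplingParams


def ban_token_42(req_id: int, logits: torch.Tensor,
                 token_ids: List[List[int]], stream_ptr: int) -> None:
    # Matches Dict[str, Callable[[int, torch.Tensor, List[List[int]], int], None]]
    # from this patch; parameter names are illustrative. The callback mutates the
    # logits tensor in place and returns None.
    logits[..., 42] = float("-inf")


llm = LLM(ModelConfig(model_dir="<path-to-model>"),          # hypothetical config
          logits_post_processor_map={"ban_42": ban_token_42})
outputs = llm.generate(
    ["Hello"],
    sampling_params=SamplingParams(max_new_tokens=8,
                                   logits_post_processor_name="ban_42"))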
@@ -80,6 +86,12 @@ class SamplingParams: max_new_tokens: int = 32 bad_words: Optional[List[List[int]]] = None stop_words: Optional[List[List[int]]] = None + embedding_bias: Optional[torch.Tensor] = None + external_draft_tokens_config: Optional[ + tllme.ExternalDraftTokensConfig] = None + prompt_tuning_config: Optional[tllme.PromptTuningConfig] = None + lora_config: Optional[tllme.LoraConfig] = None + logits_post_processor_name: Optional[str] = None # Keep the below fields in sync with tllme.SamplingConfig beam_width: int = 1 @@ -238,6 +250,27 @@ def init_log_level(): os.environ["TLLM_LOG_LEVEL"] = "WARNING" +class ExceptionHandler(metaclass=Singleton): + + def __init__(self): + self._sys_excepthook: Callable = sys.excepthook + self._obj_refs_to_shutdown: List[weakref.ReferenceType] = [] + + def __call__(self, exc_type, exc_value, traceback): + self._sys_excepthook(exc_type, exc_value, traceback) + + for obj_ref in self._obj_refs_to_shutdown: + if (obj := obj_ref()) is not None: + obj.shutdown() + + def register(self, obj: Any): + self._obj_refs_to_shutdown.append(weakref.ref(obj)) + + +exception_handler = ExceptionHandler() +sys.excepthook = exception_handler + + def sigint_handler(signal, frame): sys.stderr.write("\nSIGINT received, quit LLM!\n") sys.exit(1) diff --git a/tensorrt_llm/layers/linear.py b/tensorrt_llm/layers/linear.py index f14e76c4d..016d18124 100644 --- a/tensorrt_llm/layers/linear.py +++ b/tensorrt_llm/layers/linear.py @@ -89,9 +89,8 @@ def _gemm_plugin(input: Tensor, if use_fp8: assert ( isinstance(alpha, np.ndarray) and alpha.dtype == np.float32 - and alpha.size == 1, - "`alpha` must be passed as a float32 ndarray if `use_fp8` is enabled for _gemm_plugin" - ) + and alpha.size == 1 + ), "`alpha` must be passed as a float32 ndarray if `use_fp8` is enabled for _gemm_plugin" assert input.dtype == trt.fp8 assert mat2.dtype == trt.fp8 diff --git a/tensorrt_llm/layers/moe.py b/tensorrt_llm/layers/moe.py index 5a0fdc475..740a680bc 100644 --- a/tensorrt_llm/layers/moe.py +++ b/tensorrt_llm/layers/moe.py @@ -23,10 +23,11 @@ from tensorrt_llm.layers.lora import LoraParams from .._common import default_net, default_trtnet -from ..functional import (_add_plugin_info, _create_tensor, allreduce, cast, - div, is_gated_activation, non_gated_version, softmax, - sum, topk) +from ..functional import (AllReduceStrategy, _add_plugin_info, _create_tensor, + allreduce, cast, div, is_gated_activation, + non_gated_version, softmax, sum, topk) from ..layers import MLP, GatedMLP +from ..mapping import Mapping from ..module import Module, ModuleList from ..parameter import Parameter from ..plugin import TRT_LLM_PLUGIN_NAMESPACE @@ -48,11 +49,6 @@ @dataclass class MoeConfig: - # [WARNING] Keep the below in sync with cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h - class ParallelismMode(IntEnum): - NONE = 0 - EXPERT_PARALLEL = 1 - TENSOR_PARALLEL = 2 class ExpertScaleNormalizationMode(IntEnum): NONE = 0 @@ -60,7 +56,6 @@ class ExpertScaleNormalizationMode(IntEnum): num_experts: int = 0 top_k: int = 0 - tp_mode: ParallelismMode = ParallelismMode.TENSOR_PARALLEL normalization_mode: ExpertScaleNormalizationMode = ExpertScaleNormalizationMode.RENORMALIZE def validate(self) -> "MoeConfig": @@ -101,7 +96,9 @@ def _moe_plugin(moe_config, output_dtype, quant_mode=QuantMode(0), tp_size=1, - tp_rank=0): + ep_size=1, + tp_rank=0, + ep_rank=0): if isinstance(dtype, str): dtype = str_dtype_to_trt(dtype) @@ -127,7 +124,7 @@ def from_parameter(x): # Create the plugin with our required state num_experts = 
moe_config.num_experts - # We pass the full number of experts (not divided by tp_size) even for EP mode + # We pass the full number of experts (not divided by ep_size) even for EP mode p_num_experts = trt.PluginField("number_of_experts", np.array(num_experts, dtype=np.int32), trt.PluginFieldType.INT32) @@ -167,9 +164,10 @@ def from_parameter(x): trt.PluginFieldType.INT32) p_tp_rank = trt.PluginField("tp_rank", np.array(tp_rank, dtype=np.int32), trt.PluginFieldType.INT32) - p_parallelism_mode = trt.PluginField( - "parallelism_mode", np.array(moe_config.tp_mode, dtype=np.int32), - trt.PluginFieldType.INT32) + p_ep_size = trt.PluginField("ep_size", np.array(ep_size, dtype=np.int32), + trt.PluginFieldType.INT32) + p_ep_rank = trt.PluginField("ep_rank", np.array(ep_rank, dtype=np.int32), + trt.PluginFieldType.INT32) p_normalization_mode = trt.PluginField( "normalization_mode", np.array(moe_config.normalization_mode, dtype=np.int32), @@ -179,7 +177,7 @@ def from_parameter(x): p_num_experts, p_top_k, p_expert_hidden_size, p_expert_inter_size, p_activation_type, p_type_id, p_weight_type_id, p_output_type_id, p_quant_mode, p_use_finished, p_use_bias, p_tp_size, p_tp_rank, - p_parallelism_mode, p_normalization_mode + p_ep_size, p_ep_rank, p_normalization_mode ]) # Create the plugin with our constant inputs to the constructor @@ -275,11 +273,11 @@ def __init__(self, hidden_size: int, ffn_hidden_size: int, hidden_act: str, + mapping: Mapping = Mapping(), bias: bool = True, dtype=None, tp_group: List[int] = None, tp_size: int = 1, - tp_rank: int = 0, quant_mode=QuantMode(0)): super().__init__() @@ -295,25 +293,24 @@ def __init__(self, self.weight_dtype = dtype self.tp_group = tp_group self.tp_size = tp_size - self.tp_rank = tp_rank + self.mapping = mapping self.quant_mode = quant_mode self.bias = bias self.experts_per_node = self.num_experts - self.tp_mode = moe_config.tp_mode - if moe_config.tp_mode == MoeConfig.ParallelismMode.EXPERT_PARALLEL: - if self.num_experts % self.tp_size != 0: + if self.mapping.has_moe_ep(): + if self.num_experts % self.mapping.moe_ep_size != 0: raise ValueError( - f"MixtureOfExperts - Number of experts {self.num_experts} is not a multiple of EP size {self.tp_size}" + f"MixtureOfExperts - Number of experts {self.num_experts} is not a multiple of EP size {self.mapping.moe_ep_size}" ) - self.experts_per_node = self.experts_per_node // tp_size + self.experts_per_node = self.experts_per_node // self.mapping.moe_ep_size - elif moe_config.tp_mode == MoeConfig.ParallelismMode.TENSOR_PARALLEL: - if self.ffn_hidden_size % self.tp_size != 0: + if self.mapping.has_moe_tp(): + if self.ffn_hidden_size % self.mapping.moe_tp_size != 0: raise ValueError( - f"MixtureOfExperts - FFN Hidden Size {self.ffn_hidden_size} is not a multiple of TP size {self.tp_size}" + f"MixtureOfExperts - FFN Hidden Size {self.ffn_hidden_size} is not a multiple of TP size {self.mapping.moe_tp_size}" ) - self.expert_inter_size = self.ffn_hidden_size // tp_size + self.expert_inter_size = self.ffn_hidden_size // self.mapping.moe_tp_size if quant_mode.has_fp8_qdq() and self.bias: # TODO (dastokes) We will need to revisit this if we have a use case for it @@ -436,10 +433,12 @@ def forward_experts(self, hidden_states, routing, finished, weight_dtype=weight_dtype_quant, output_dtype=output_dtype_quant, quant_mode=self.quant_mode, - tp_size=self.tp_size, - tp_rank=self.tp_rank) + tp_size=self.mapping.moe_tp_size, + tp_rank=self.mapping.moe_tp_rank, + ep_size=self.mapping.moe_ep_size, + ep_rank=self.mapping.moe_ep_rank) - 
if self.tp_size > 1 and self.tp_group is not None and self.moe_config.tp_mode != MoeConfig.ParallelismMode.NONE: + if self.tp_size > 1 and self.tp_group is not None: output = allreduce(output, self.tp_group) return output @@ -475,11 +474,11 @@ def init_experts(self): ) ClsMLP = GatedMLP if is_gated_activation(self.hidden_act) else MLP - # In OOTB mode, when ParallelismMode mode is TENSOR_PARALLEL, using MLP class to do TP settings + # In OOTB mode, when TP is enabled, using MLP class to do TP settings # pass self.ffn_hidden_size to original size, - if self.moe_config.tp_mode == MoeConfig.ParallelismMode.TENSOR_PARALLEL: - tp_size = self.tp_size - tp_group = self.tp_group + if self.mapping.has_moe_tp(): + tp_size = self.mapping.moe_tp_size + tp_group = self.mapping.moe_tp_group else: tp_size = 1 tp_group = None @@ -543,8 +542,8 @@ def forward_experts(self, hidden_states, routing, finished, output = hidden_states * 0.0 # Create output space # Experts inference for i, expert in enumerate(self.experts): - if self.tp_mode == MoeConfig.ParallelismMode.EXPERT_PARALLEL: - index = i + self.experts_per_node * self.tp_rank + if self.mapping.has_moe_ep(): + index = i + self.experts_per_node * self.mapping.moe_ep_rank else: index = i # inference expert @@ -559,8 +558,10 @@ def forward_experts(self, hidden_states, routing, finished, keepdim=True), self.dtype) output += out * expert_weights - if self.tp_size > 1 and self.tp_group is not None and self.moe_config.tp_mode == MoeConfig.ParallelismMode.EXPERT_PARALLEL: - output = allreduce(output, self.tp_group) + if self.mapping.has_moe_ep() and self.mapping.moe_ep_group is not None: + output = allreduce(output, + self.mapping.moe_ep_group, + strategy=AllReduceStrategy.NCCL) return output diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index 328a9e6bd..5896668df 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -13,7 +13,6 @@ from ._utils import (DictConversion, pad_vocab_size, release_gc, str_dtype_to_torch, torch_to_numpy) from .layers.linear import ColumnLinear -from .layers.moe import MoeConfig from .logger import logger from .mapping import Mapping from .models.convert_utils import (get_model_path, load_state_dict, @@ -546,7 +545,6 @@ def load_from_hf(self, lora_target_modules = model_config.lora_target_modules dtype = model_config.dtype - moe_tp_mode = model_config.moe_tp_mode hf_modules_to_trtllm_modules = invert_module_mapping( model_config.trtllm_modules_to_hf_modules) hf_modules = set(hf_modules_to_trtllm_modules.keys()) @@ -606,7 +604,7 @@ def load_from_model_dir(uid, model_dir, hf_config): t_out = module_weights["out"] if lora_module in ["moe_router"]: pass - elif "moe" in lora_module and moe_tp_mode == MoeConfig.ParallelismMode.EXPERT_PARALLEL: + elif "moe" in lora_module and runtime_mapping.has_moe_ep(): pass elif lora_module in [ "attn_dense", diff --git a/tensorrt_llm/mapping.py b/tensorrt_llm/mapping.py index 200b0d58f..98e79e0fe 100644 --- a/tensorrt_llm/mapping.py +++ b/tensorrt_llm/mapping.py @@ -30,26 +30,90 @@ class Mapping(object): - [1, 5] - [2, 6] - [3, 7] + + A node with 8 GPUs, moe_tp_size = 2, moe_ep_size = 4 + + 4 moe_tp groups: + + - [0, 4] + - [1, 5] + - [2, 6] + - [3, 7] + + 2 moe_ep groups: + + - [0, 1, 2, 3] + - [4, 5, 6, 7] + + 2 nodes with 16 GPUs, moe_tp_size = 2, moe_ep_size = 4, pp_size = 2 + + 8 moe_tp groups: + + - [0 4] + - [1 5] + - [2 6] + - [3 7] + - [8 12] + - [9 13] + - [10 14] + - [11 15] + + 4 moe_ep groups: + + - [0, 1, 2, 3] + - [4, 5, 6, 7] + - [8, 9, 
10, 11] + - [12, 13, 14, 15] + + 8 pp groups: + + - [0 8] + - [1 9] + - [2 10] + - [3 11] + - [4 12] + - [5 13] + - [6 14] + - [7 15] ''' - def __init__(self, - world_size=1, - rank=0, - gpus_per_node=8, - tp_size=1, - pp_size=1): + def __init__( + self, + world_size=1, + rank=0, + gpus_per_node=8, + tp_size=1, + pp_size=1, + moe_tp_size=-1, # -1 means no moe + moe_ep_size=-1): # -1 means no moe + # set default values for non-moe cases + if moe_tp_size == -1: + moe_tp_size = tp_size + moe_ep_size = 1 + + if pp_size * tp_size != world_size: + raise ValueError( + f"world_size must equal to pp_size * tp_size, but got {world_size} != {pp_size} * {tp_size}" + ) + + moe_tp_ep_size = moe_tp_size * moe_ep_size + if moe_tp_ep_size != tp_size: + raise ValueError( + f"tp_size must equal to moe_tp_size * moe_ep_size, but got {tp_size} != {moe_tp_size} * {moe_ep_size}" + ) + self.tp_size = tp_size self.pp_size = pp_size + self.moe_tp_size = moe_tp_size + self.moe_ep_size = moe_ep_size self.world_size = world_size self.rank = rank self.gpus_per_node = gpus_per_node - if pp_size * tp_size != world_size: - raise ValueError( - f"world_size must equal to pp_size * tp_size, but got {world_size} != {pp_size} * {tp_size}" - ) self.pp_groups = [] self.tp_groups = [] + self.moe_tp_groups = [] + self.moe_ep_groups = [] # init pp group for i in range(tp_size): @@ -61,11 +125,31 @@ def __init__(self, ranks = range(i * tp_size, (i + 1) * tp_size) self.tp_groups.append(list(ranks)) + # init moe tp group + for i in range(pp_size): + for j in range(moe_ep_size): + ranks = range(i * moe_tp_ep_size + j, (i + 1) * moe_tp_ep_size, + moe_ep_size) + self.moe_tp_groups.append(list(ranks)) + + # init moe ep group + for i in range(pp_size): + for j in range(moe_tp_size): + ranks = range(i * moe_tp_ep_size + j * moe_ep_size, + i * moe_tp_ep_size + (j + 1) * moe_ep_size) + self.moe_ep_groups.append(list(ranks)) + self.pp_rank = self.rank // self.tp_size self.tp_rank = self.rank % self.tp_size + self.moe_tp_rank = self.tp_rank // self.moe_ep_size + self.moe_ep_rank = self.tp_rank % self.moe_ep_size self.tp_group = self.tp_groups[self.pp_rank] self.pp_group = self.pp_groups[self.tp_rank] + self.moe_tp_group = self.moe_tp_groups[self.pp_rank * moe_ep_size + + self.moe_ep_rank] + self.moe_ep_group = self.moe_ep_groups[self.pp_rank * moe_tp_size + + self.moe_tp_rank] self.node_rank = self.rank // self.gpus_per_node self.local_rank = self.rank % self.gpus_per_node @@ -100,6 +184,12 @@ def next_pp_rank(self): p = p - self.world_size return p + def has_moe_tp(self): + return self.moe_tp_size > 1 + + def has_moe_ep(self): + return self.moe_ep_size > 1 + def pp_layers(self, num_layers: int) -> List[int]: layers_per_pipeline_stage = num_layers // self.pp_size layers_range = range(self.pp_rank * layers_per_pipeline_stage, @@ -107,9 +197,9 @@ def pp_layers(self, num_layers: int) -> List[int]: return list(layers_range) def ep_experts(self, num_experts: int) -> List[int]: - experts_per_rank = num_experts // self.tp_size - experts_range = range(self.tp_rank * experts_per_rank, - (self.tp_rank + 1) * experts_per_rank) + experts_per_rank = num_experts // self.moe_ep_size + experts_range = range(self.moe_ep_rank * experts_per_rank, + (self.moe_ep_rank + 1) * experts_per_rank) return list(experts_range) @classmethod @@ -122,5 +212,7 @@ def to_dict(self): 'rank': self.rank, 'gpus_per_node': self.gpus_per_node, 'tp_size': self.tp_size, - 'pp_size': self.pp_size + 'pp_size': self.pp_size, + 'moe_tp_size': self.moe_tp_size, + 'moe_ep_size': 
self.moe_ep_size } diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py index 45cca2fc3..bd0ee680e 100644 --- a/tensorrt_llm/models/convert_utils.py +++ b/tensorrt_llm/models/convert_utils.py @@ -68,10 +68,13 @@ def weight_only_quantize_dict(weights: Dict[str, torch.Tensor], 'qkv.weight', 'dense.weight', 'fc.weight', 'proj.weight', 'gate.weight' ], + exclude_weights=['shared_expert_gate.weight'], plugin: bool = True): if quant_algo not in [QuantAlgo.W4A16, QuantAlgo.W8A16]: return weights for name in list(weights): + if any([_name in name for _name in exclude_weights]): + continue if any([_name in name for _name in quant_weights ]) and weights[name].dtype != torch.int8: quant_weight, quant_scale = weight_only_quantize( diff --git a/tensorrt_llm/models/dbrx/config.py b/tensorrt_llm/models/dbrx/config.py index 643d6c3ff..d97311ff8 100644 --- a/tensorrt_llm/models/dbrx/config.py +++ b/tensorrt_llm/models/dbrx/config.py @@ -38,8 +38,6 @@ def __init__(self, moe = MoeConfig( num_experts=kwargs.pop('moe_num_experts', 0), top_k=kwargs.pop('moe_top_k', 0), - tp_mode=kwargs.pop('moe_tp_mode', - MoeConfig.ParallelismMode.TENSOR_PARALLEL), normalization_mode=kwargs.pop( 'moe_normalization_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE)) diff --git a/tensorrt_llm/models/dbrx/model.py b/tensorrt_llm/models/dbrx/model.py index cbad72b45..14617e018 100644 --- a/tensorrt_llm/models/dbrx/model.py +++ b/tensorrt_llm/models/dbrx/model.py @@ -59,7 +59,7 @@ def __init__(self, config: DbrxConfig, layer_idx: int): ClsMLP = MOE mlp_kwargs = { "moe_config": config.moe, - "tp_rank": config.mapping.tp_rank, + "mapping": config.mapping, } self.mlp = ClsMLP(hidden_size=config.hidden_size, diff --git a/tensorrt_llm/models/dit/model.py b/tensorrt_llm/models/dit/model.py index 5f7346950..ad4f68040 100644 --- a/tensorrt_llm/models/dit/model.py +++ b/tensorrt_llm/models/dit/model.py @@ -28,7 +28,6 @@ from ...module import Module, ModuleList from ...parameter import Parameter from ...plugin import current_all_reduce_helper -from ..generation_mixin import GenerationMixin from ..modeling_utils import PretrainedConfig, PretrainedModel @@ -333,7 +332,10 @@ def prepare_inputs(self, max_batch_size, **kwargs): if use_custom_all_reduce and mapping.tp_size > 1: current_all_reduce_helper().set_workspace_tensor(mapping, 1) - default_range = GenerationMixin.default_range + def dit_default_range(max_batch_size): + return [2, (max_batch_size + 1) // 2, max_batch_size] + + default_range = dit_default_range if self.cfg_scale is not None: max_batch_size *= 2 diff --git a/tensorrt_llm/models/enc_dec/model.py b/tensorrt_llm/models/enc_dec/model.py index 98a2fc772..c2044ea84 100644 --- a/tensorrt_llm/models/enc_dec/model.py +++ b/tensorrt_llm/models/enc_dec/model.py @@ -1812,8 +1812,7 @@ def precompute_relative_attention_bias(self, build_config): relative_attention_bias_builder = torch.ops.tensorrt_llm.relative_attention_bias rel_attn_precomputed = torch.zeros( (self.config.num_attention_heads // self.mapping.tp_size, - build_config.max_output_len + 1, - build_config.max_output_len + 1), + build_config.max_seq_len + 1, build_config.max_seq_len + 1), dtype=str_dtype_to_torch(self.config.dtype), device='cuda') rel_attn_table = numpy_to_torch( @@ -1822,7 +1821,7 @@ def precompute_relative_attention_bias(self, build_config): rel_attn_precomputed, rel_attn_table, self.config.num_attention_heads // self.mapping.tp_size, - build_config.max_output_len, + build_config.max_seq_len, self.config.num_buckets, 
False, self.config.max_distance, @@ -1830,7 +1829,7 @@ def precompute_relative_attention_bias(self, build_config): for layer_idx in range(self.num_layers): self.decoder_layers[ layer_idx].self_attention.set_rel_attn_table( - build_config.max_output_len, rel_attn_precomputed) + build_config.max_seq_len, rel_attn_precomputed) class WhisperEncoder(PretrainedModel): diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index c97e9446a..ab5a81b62 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -38,8 +38,8 @@ def has_ctx_gen_opt_profiles(use_gpt_attention_plugin: bool, return res @staticmethod - def default_range(max_range, offset=0): - result = [1, (max_range + 1) // 2, max_range] + def default_range(max_range, offset=0, min_range=1): + result = [min_range, (max_range + min_range) // 2, max_range] return [elem + offset for elem in result] @staticmethod @@ -603,15 +603,18 @@ def prepare_basic_inputs( # total number of spec decoding tokens for all sequences (sequence length can be variable). num_gen_tokens_range = [ default_range( - max_batch_size * max_beam_width * tokens_per_engine_step) + max_batch_size * max_beam_width * tokens_per_engine_step, + min_range=0) ] * num_profiles + bb_range_0 = [[0] + bbr[1:] for bbr in bb_range] # support variable sequence lengths for medusa. spec_decoding_generation_lengths = Tensor( name='spec_decoding_generation_lengths', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size_beam_width', bb_range)]), + dim_range=OrderedDict([('batch_size_beam_width_0', bb_range_0) + ]), ) # position offsets that are fixed during the whole session. @@ -621,7 +624,7 @@ def prepare_basic_inputs( dtype=trt.int32, shape=[-1, -1], dim_range=OrderedDict([ - ('batch_size_beam_width', bb_range), + ('batch_size_beam_width_0', bb_range_0), ('spec_decoding_position_ids_dim0', tokens_per_engine_step_range), ]), diff --git a/tensorrt_llm/models/gpt/config.py b/tensorrt_llm/models/gpt/config.py index 5dc25b592..076e107cd 100644 --- a/tensorrt_llm/models/gpt/config.py +++ b/tensorrt_llm/models/gpt/config.py @@ -44,8 +44,6 @@ def __init__(self, moe = MoeConfig( num_experts=kwargs.pop('moe_num_experts', 0), top_k=kwargs.pop('moe_top_k', 0), - tp_mode=kwargs.pop('moe_tp_mode', - MoeConfig.ParallelismMode.TENSOR_PARALLEL), normalization_mode=kwargs.pop( 'moe_normalization_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE)) diff --git a/tensorrt_llm/models/grok/convert.py b/tensorrt_llm/models/grok/convert.py index 1fc3e2455..87f773eab 100644 --- a/tensorrt_llm/models/grok/convert.py +++ b/tensorrt_llm/models/grok/convert.py @@ -52,7 +52,7 @@ def get_jax_weight(config, prefix, dtype, postfix='.weight', key_name='scale'): dtype=dtype).T -def get_jax_weight_scale(params, key, rank): +def get_jax_weight_scale(params, key): jax_obj = params[key]['w'] jax_scales = jax.device_put(jax_obj.scales, device=jax.devices('cpu')[0]) # jax_scales = jax.device_put(jax_obj.scales, device=jax.devices('gpu')[rank]) @@ -111,13 +111,11 @@ def convert_layer(l): tllm_prex = f'transformer.layers.{l - layers_range[0]}.' 
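Editorial aside, not part of the patch: the `GenerationMixin.default_range` change above adds a `min_range` parameter so that optimization-profile ranges can start at 0 (used for `num_gen_tokens_range` and the derived `bb_range_0`). A small standalone sketch of the resulting `[min, opt, max]` values, mirroring the signature shown in this diff:

def default_range(max_range, offset=0, min_range=1):
    # [min, opt, max] profile range; offset is applied to all three entries.
    result = [min_range, (max_range + min_range) // 2, max_range]
    return [elem + offset for elem in result]


assert default_range(256) == [1, 128, 256]                # unchanged default behaviour
assert default_range(256, min_range=0) == [0, 128, 256]   # e.g. num_gen_tokens_range
assert default_range(16, offset=1) == [2, 9, 17]          # offset applied after the midpoint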
q_weight, q_scale = get_jax_weight_scale( - model_params, prefix + 'multi_head_attention/query', - mapping.tp_rank) + model_params, prefix + 'multi_head_attention/query') k_weight, k_scale = get_jax_weight_scale( - model_params, prefix + 'multi_head_attention/key', mapping.tp_rank) + model_params, prefix + 'multi_head_attention/key') v_weight, v_scale = get_jax_weight_scale( - model_params, prefix + 'multi_head_attention/value', - mapping.tp_rank) + model_params, prefix + 'multi_head_attention/value') wq = split(q_weight, mapping.tp_size, mapping.tp_rank, dim=1) wk = split(k_weight, mapping.tp_size, mapping.tp_rank, dim=1) @@ -134,8 +132,7 @@ def convert_layer(l): plugin_weight_only_quant_type)) attn_dense_weight, attn_dense_scales = get_jax_weight_scale( - model_params, prefix + 'multi_head_attention/linear', - mapping.tp_rank) + model_params, prefix + 'multi_head_attention/linear') split_v = split_matrix_tp(attn_dense_weight, tensor_parallel, @@ -151,30 +148,49 @@ def convert_layer(l): tllm_prex + 'attention.dense.', plugin_weight_only_quant_type)) - if moe_config.tp_mode == moe_config.ParallelismMode.EXPERT_PARALLEL: - mapping.ep_experts(moe_config.num_experts) - w3, s3 = get_jax_weight_scale( - model_params, f'transformer/decoder_layer_{l}/moe/linear_v', - mapping.tp_rank) + model_params, f'transformer/decoder_layer_{l}/moe/linear_v') w2, s2 = get_jax_weight_scale( - model_params, f'transformer/decoder_layer_{l}/moe/linear_1', - mapping.tp_rank) + model_params, f'transformer/decoder_layer_{l}/moe/linear_1') w1, s1 = get_jax_weight_scale( - model_params, f'transformer/decoder_layer_{l}/moe/linear', - mapping.tp_rank) - - if moe_config.tp_mode == moe_config.ParallelismMode.TENSOR_PARALLEL: - - w3_split = split(w3, mapping.tp_size, mapping.tp_rank, dim=2) - w2_split = split(w2, mapping.tp_size, mapping.tp_rank, dim=1) - w1_split = split(w1, mapping.tp_size, mapping.tp_rank, dim=2) - - s3_split = split(s3, mapping.tp_size, mapping.tp_rank, dim=2) - s2_split = split(s2, mapping.tp_size, mapping.tp_rank, dim=1) - s1_split = split(s1, mapping.tp_size, mapping.tp_rank, dim=2) + model_params, f'transformer/decoder_layer_{l}/moe/linear') + + # moe expert parallel + w3_split = split(w3, mapping.moe_ep_size, mapping.moe_ep_rank, dim=0) + w2_split = split(w2, mapping.moe_ep_size, mapping.moe_ep_rank, dim=0) + w1_split = split(w1, mapping.moe_ep_size, mapping.moe_ep_rank, dim=0) + + s3_split = split(s3, mapping.moe_ep_size, mapping.moe_ep_rank, dim=0) + s2_split = split(s2, mapping.moe_ep_size, mapping.moe_ep_rank, dim=0) + s1_split = split(s1, mapping.moe_ep_size, mapping.moe_ep_rank, dim=0) + # moe tensor parallel + w3_split = split(w3_split, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=2) + w2_split = split(w2_split, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=1) + w1_split = split(w1_split, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=2) + + s3_split = split(s3_split, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=2) + s2_split = split(s2_split, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=1) + s1_split = split(s1_split, + mapping.moe_tp_size, + mapping.moe_tp_rank, + dim=2) weights.update( get_tllm_linear_weight(w2_split, @@ -319,7 +335,6 @@ def create_config_from_xai(dtype, moe_num_experts = hf_config['num_experts'] moe_top_k = hf_config['num_experts_per_tok'] - moe_tp_mode = MoeConfig.ParallelismMode.TENSOR_PARALLEL attn_output_multiplier = hf_config['attn_output_multiplier'] embedding_multiplier_scale = hf_config['embedding_multiplier_scale'] @@ -364,15 
+379,15 @@ def create_config_from_xai(dtype, moe_num_experts, 'moe_top_k': moe_top_k, - 'moe_tp_mode': - moe_tp_mode, 'moe_normalization_mode': MoeConfig.ExpertScaleNormalizationMode.NONE, #TODO: should have directly map from the Mapping object to the TRT-LLM checkpoint fields 'mapping': { 'world_size': mapping.tp_size * mapping.pp_size, 'tp_size': mapping.tp_size, - 'pp_size': mapping.pp_size + 'pp_size': mapping.pp_size, + 'moe_tp_size': mapping.moe_tp_size, + 'moe_ep_size': mapping.moe_ep_size, }, 'attn_bias': attn_bias, @@ -391,15 +406,6 @@ def create_config_from_xai(dtype, config['quantization'] = quantization.to_dict() config.update(override_fields) - moe_config = MoeConfig(config['moe_num_experts'], config['moe_top_k'], - config['moe_tp_mode'], - config['moe_normalization_mode']).validate() - use_weight_only = config['quantization']['quant_algo'] in [ - QuantAlgo.W8A16, QuantAlgo.W4A16, QuantAlgo.FP8 - ] - if use_weight_only and moe_config.has_moe(): - config['quantization']['exclude_modules'].append('router') - return config @@ -472,7 +478,6 @@ def load_weights_from_xai(*, config, mapping, model): plugin_weight_only_quant_type = torch.int8 moe_config = MoeConfig(config['moe_num_experts'], config['moe_top_k'], - config['moe_tp_mode'], config['moe_normalization_mode']).validate() use_weight_only = quant_algo in [QuantAlgo.W8A16] diff --git a/tensorrt_llm/models/grok/model.py b/tensorrt_llm/models/grok/model.py index d33bbca62..9034400e7 100644 --- a/tensorrt_llm/models/grok/model.py +++ b/tensorrt_llm/models/grok/model.py @@ -70,15 +70,8 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): assert config.moe_num_experts > 1, "Grok model is a MoE model." ClsMLP = MOE mlp_kwargs = { - "moe_config": - MoeConfig( - config.moe_num_experts, - config.moe_top_k, - config.moe_tp_mode, - config.moe_normalization_mode, - ), - "tp_rank": - config.mapping.tp_rank, + "moe_config": config.moe, + "mapping": config.mapping, } self.mlp = ClsMLP(hidden_size=config.hidden_size, ffn_hidden_size=mlp_hidden_size, @@ -228,8 +221,6 @@ def check_config(self, config): config.set_if_not_exist('rotary_scaling', None) config.set_if_not_exist('moe_num_experts', 0) config.set_if_not_exist('moe_top_k', 0) - config.set_if_not_exist('moe_tp_mode', - MoeConfig.ParallelismMode.TENSOR_PARALLEL) config.set_if_not_exist('moe_normalization_mode', MoeConfig.ExpertScaleNormalizationMode.NONE) diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 9233dc19e..1677727ce 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -49,8 +49,6 @@ def __init__(self, moe = MoeConfig( num_experts=kwargs.pop('moe_num_experts', 0), top_k=kwargs.pop('moe_top_k', 0), - tp_mode=kwargs.pop('moe_tp_mode', - MoeConfig.ParallelismMode.TENSOR_PARALLEL), normalization_mode=kwargs.pop( 'moe_normalization_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE)) @@ -128,11 +126,8 @@ def from_hugging_face( moe_normalization_mode = None moe_num_experts = getattr(hf_config, "num_local_experts", 0) moe_top_k = getattr(hf_config, "num_experts_per_tok", 0) - moe_tp_mode = kwargs.pop('moe_tp_mode', - MoeConfig.ParallelismMode.TENSOR_PARALLEL) moe_config = MoeConfig(num_experts=moe_num_experts, top_k=moe_top_k, - tp_mode=moe_tp_mode, normalization_mode=moe_normalization_mode) moe_config.validate() diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index 940f395b9..7d6209040 100644 --- a/tensorrt_llm/models/llama/convert.py +++ 
b/tensorrt_llm/models/llama/convert.py @@ -840,7 +840,7 @@ def convert_layer(l): if moe_config.has_moe(): rank_experts = list(range(moe_config.num_experts)) - if moe_config.tp_mode == moe_config.ParallelismMode.EXPERT_PARALLEL: + if mapping.has_moe_ep(): rank_experts = mapping.ep_experts(moe_config.num_experts) for suffix in ["w1", "w2", "w3"]: model_params[f'model.layers.{l}.block_sparse_moe.experts.{suffix}.weight'] = \ @@ -852,10 +852,10 @@ def convert_layer(l): f'model.layers.{l}.block_sparse_moe.experts.w2.weight'] w1 = model_params[ f'model.layers.{l}.block_sparse_moe.experts.w1.weight'] - if moe_config.tp_mode == moe_config.ParallelismMode.TENSOR_PARALLEL: - w3 = split(w3, mapping.tp_size, mapping.tp_rank, dim=1) - w2 = split(w2, mapping.tp_size, mapping.tp_rank, dim=2) - w1 = split(w1, mapping.tp_size, mapping.tp_rank, dim=1) + if mapping.has_moe_tp(): + w3 = split(w3, mapping.moe_tp_size, mapping.moe_tp_rank, dim=1) + w2 = split(w2, mapping.moe_tp_size, mapping.moe_tp_rank, dim=2) + w1 = split(w1, mapping.moe_tp_size, mapping.moe_tp_rank, dim=1) model_params[ f'model.layers.{l}.block_sparse_moe.experts.w3w1.weight'] = torch.concat( @@ -1503,16 +1503,21 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig): model_dir = model_dir if model_dir.endswith("/") else model_dir + "/" safetensors_map = {} + has_safetensor_index_json = True try: with open(model_dir + "model.safetensors.index.json", 'r') as fr: sharding_map = json.load(fr) for k, v in sharding_map['weight_map'].items(): safetensors_map[k] = int(v[6:11]) - 1 except FileNotFoundError: - pass + has_safetensor_index_json = False + shard_files = [] for name in os.listdir(model_dir): if name.endswith(".safetensors"): + if has_safetensor_index_json and name not in sharding_map[ + 'weight_map'].values(): + continue shard_files.append(name) shard_files.sort() safetensors_ptrs = [ @@ -1547,7 +1552,7 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig): torch_dtype = str_dtype_to_torch(dtype) - def load(key, tp_dim=-1, no_prefix=0): + def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False): if not no_prefix: key = model_prefix + key ptr_idx = safetensors_map[key] if key in safetensors_map else 0 @@ -1558,40 +1563,48 @@ def load(key, tp_dim=-1, no_prefix=0): if tp_dim == -1: res = safetensors_ptrs[ptr_idx].get_tensor(key) else: + if is_expert_weights: + tp_size = mapping.moe_tp_size + tp_rank = mapping.moe_tp_rank + else: + tp_size = mapping.tp_size + tp_rank = mapping.tp_rank tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key) tensor_shape = tensor_slice.get_shape() if len(tensor_shape) == 1: if tp_dim == 0: - slice_width = tensor_shape[0] // mapping.tp_size - res = tensor_slice[slice_width * - mapping.tp_rank:slice_width * - (mapping.tp_rank + 1)] + slice_width = tensor_shape[0] // tp_size + res = tensor_slice[slice_width * tp_rank:slice_width * + (tp_rank + 1)] else: res = tensor_slice[:] else: - if tensor_shape[tp_dim] % mapping.tp_size != 0: + if tensor_shape[tp_dim] % tp_size != 0: logger.error( - "Current weight shape is invalid for mapping.tp_size=" + - str(mapping.tp_size)) - slice_width = tensor_shape[tp_dim] // mapping.tp_size + "Current weight shape is invalid for tp_size=" + + str(tp_size)) + slice_width = tensor_shape[tp_dim] // tp_size if tp_dim == 0: - res = tensor_slice[slice_width * - mapping.tp_rank:slice_width * - (mapping.tp_rank + 1), :] + res = tensor_slice[slice_width * tp_rank:slice_width * + (tp_rank + 1), :] elif tp_dim == 1: - res = tensor_slice[:, 
slice_width * - mapping.tp_rank:slice_width * - (mapping.tp_rank + 1)] + res = tensor_slice[:, slice_width * tp_rank:slice_width * + (tp_rank + 1)] else: assert False, "Invalid TP dim" return res.to(torch_dtype).contiguous( ) if "block_sparse_moe.gate" not in key else res.to(torch.float32) - def load_and_set(target, key, tp_dim=-1, no_prefix=0): - res = load(key, tp_dim, no_prefix) + def load_and_set(target, + key, + tp_dim=-1, + no_prefix=0, + is_expert_weights=False): + res = load(key, tp_dim, no_prefix, is_expert_weights) weights[target] = res if "weight" in key: - bias = load(key.replace("weight", "bias"), tp_dim, no_prefix) + bias = load(key.replace("weight", "bias"), tp_dim, no_prefix, + is_expert_weights) if bias is not None: weights[target.replace("weight", "bias")] = bias @@ -1651,13 +1664,13 @@ def load_and_set(target, key, tp_dim=-1, no_prefix=0): weights[f'{tllm_prex}.mlp.router.weight'] = load( prefix + 'block_sparse_moe.gate.weight') rank_experts = list(range(moe_config.num_experts)) - if moe_config.tp_mode == moe_config.ParallelismMode.EXPERT_PARALLEL: + if mapping.has_moe_ep(): rank_experts = mapping.ep_experts(moe_config.num_experts) expert_weight_list = [] for suffix in range(3): tp_dim = -1 - if moe_config.tp_mode == moe_config.ParallelismMode.TENSOR_PARALLEL: + if mapping.has_moe_tp(): tp_dim = 1 if suffix == 1 else 0 expert_weight_list.append( torch.stack( @@ -1665,7 +1678,9 @@ def load_and_set(target, key, tp_dim=-1, no_prefix=0): load( prefix + f'block_sparse_moe.experts.{expert}.w{suffix + 1}.weight', - tp_dim=tp_dim) for expert in rank_experts))) + tp_dim=tp_dim, + is_expert_weights=True) + for expert in rank_experts))) w1 = expert_weight_list[0] w2 = expert_weight_list[1] diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index b9cf6efdf..d8fc09b0c 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -74,9 +74,8 @@ def __init__(self, config: LLaMAConfig, layer_idx: int): ClsMLP = MOE mlp_kwargs = { "moe_config": config.moe, - "tp_rank": config.mapping.tp_rank, + "mapping": config.mapping, } - self.mlp = ClsMLP(hidden_size=config.hidden_size, ffn_hidden_size=mlp_hidden_size, hidden_act=config.hidden_act, @@ -86,6 +85,7 @@ def __init__(self, config: LLaMAConfig, layer_idx: int): tp_size=config.mapping.tp_size, quant_mode=config.quant_mode, **mlp_kwargs) + self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, eps=config.norm_epsilon, dtype=config.dtype) diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index d67f5664b..c042e7440 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -272,6 +272,8 @@ def set_rank(self, rank): rank=rank, tp_size=self.mapping.tp_size, pp_size=self.mapping.pp_size, + moe_tp_size=self.mapping.moe_tp_size, + moe_ep_size=self.mapping.moe_ep_size, gpus_per_node=self.mapping.gpus_per_node) @@ -635,6 +637,9 @@ def quantize( ): if mapping is None: # single gpu mapping = Mapping() + if mapping.moe_ep_size > 1: + raise NotImplementedError( + "Quantization for expert parallelism is not supported") modelopt_qformat = quant_config.quant_algo_to_modelopt_qformat() kv_cache_dtype = quant_config.kv_cache_quant_algo assert modelopt_qformat is not None @@ -643,6 +648,7 @@ def quantize( hf_model_dir) # quantize_and_export has some code can not take Path quantize_and_export( model_dir=hf_model_dir, + device='cuda', calib_dataset=calib_dataset, dtype=dtype, qformat=modelopt_qformat, @@ 
-665,8 +671,8 @@ def __init__(self, config: PretrainedConfig, transformer, lm_head): super().__init__(config) self.transformer = transformer self.lm_head = lm_head - config.set_if_not_exist('mup_width_multiplier', 1.0) - self.mup_width_multiplier = config.mup_width_multiplier + self.mup_width_multiplier = getattr(config, 'mup_width_multiplier', + None) def forward(self, input_ids: Tensor, diff --git a/tensorrt_llm/models/qwen/convert.py b/tensorrt_llm/models/qwen/convert.py index 1b6a6a445..88d60df9a 100644 --- a/tensorrt_llm/models/qwen/convert.py +++ b/tensorrt_llm/models/qwen/convert.py @@ -16,8 +16,10 @@ from tensorrt_llm._utils import pad_vocab_size, release_gc +from ...layers import MoeConfig from ...logger import logger from ...mapping import Mapping +from ...quantization import QuantAlgo from ..convert_utils import load_calib_dataset from ..modeling_utils import PretrainedConfig from .utils import get_qwen_key_list, make_context @@ -665,7 +667,7 @@ def multi_query_split(data, local_dim, head_size, tp_size, cur_rank): def convert_hf_qwen(hf_model, qwen_type, - mapping, + mapping: Mapping, vocab_size=32000, dtype='float32', use_parallel_embedding=False, @@ -680,7 +682,8 @@ def convert_hf_qwen(hf_model, int8_kv_cache=False, act_range=[], qkv_para=[], - smoother=[]): + smoother=[], + moe_config=None): weights = {} tik = time.time() tensor_parallel = mapping.tp_size @@ -852,98 +855,218 @@ def convert_hf_qwen(hf_model, plugin_weight_only_quant_type, dtype, use_gemm_woq_plugin)) - mlp_gate_weight = get_weight(model_params, prefix + key_list[2], dtype) - split_v = split_matrix_tp(mlp_gate_weight, - tensor_parallel, - mapping.tp_rank, - dim=0) - if use_smooth_quant: - mlp_gate_weight = mlp_gate_weight.t() - int8_weights = generate_int8(mlp_gate_weight, - act_range.get(prefix + key_list[2])) - + if qwen_type == "qwen2_moe" and moe_config and moe_config.has_moe(): + + # shared_expert for qwen2_moe + shared_expert_up_proj = model_params[ + f'model.layers.{l}.mlp.shared_expert.up_proj.weight'] + shared_expert_down_proj = model_params[ + f'model.layers.{l}.mlp.shared_expert.down_proj.weight'] + shared_expert_gate = model_params[ + f'model.layers.{l}.mlp.shared_expert.gate_proj.weight'] + shared_expert_up_proj = split(shared_expert_up_proj, + mapping.tp_size, + mapping.tp_rank, + dim=0) + shared_expert_down_proj = split(shared_expert_down_proj, + mapping.tp_size, + mapping.tp_rank, + dim=1) + shared_expert_gate = split(shared_expert_gate, + mapping.tp_size, + mapping.tp_rank, + dim=0) + model_params[ + f'model.layers.{l}.mlp.shared_expert.gate_up_proj.weight'] = torch.concat( + [shared_expert_up_proj, shared_expert_gate], dim=-2) + + model_params[ + f'model.layers.{l}.mlp.shared_expert.down_proj.weight'] = shared_expert_down_proj + + shared_expert_gate_up_proj = get_weight( + model_params, prefix + 'mlp.shared_expert.gate_up_proj', dtype) + ## mlp.shared_expert.gate_up_proj.weight weights.update( - get_tllm_linear_sq_weight( - int8_weights, - tllm_prex + 'mlp.gate.', - [1, intermediate_size // tensor_parallel], - tensor_parallel, - is_qkv=False, - per_token=per_token, - per_channel=per_channel, - last_prefix=tllm_prex + 'post_layernorm.scale_to_int', - smoother_value=None, - smoother_shape=None, - rank=mapping.tp_rank, - cat_dim=-1)) - else: - weights.update( - get_tllm_linear_weight(split_v, tllm_prex + 'mlp.gate.', None, + get_tllm_linear_weight(shared_expert_gate_up_proj, + tllm_prex + 'shared_expert.fc.', None, use_weight_only, plugin_weight_only_quant_type, dtype, use_gemm_woq_plugin)) - 
mlp_fc_weight = get_weight(model_params, prefix + key_list[3], dtype) - split_v = split_matrix_tp(mlp_fc_weight, - tensor_parallel, - mapping.tp_rank, - dim=0) - - if use_smooth_quant: - mlp_fc_weight = mlp_fc_weight.t() #verified - int8_weights = generate_int8(mlp_fc_weight, - act_range.get(prefix + key_list[3])) - weights.update( - get_tllm_linear_sq_weight( - int8_weights, - tllm_prex + 'mlp.fc.', - [1, intermediate_size // tensor_parallel], - tensor_parallel, - is_qkv=False, - per_token=per_token, - per_channel=per_channel, - last_prefix=tllm_prex + 'post_layernorm.scale_to_int', - smoother_value=None, - smoother_shape=None, - rank=mapping.tp_rank, - cat_dim=-1)) - else: + shared_expert_down_proj = get_weight( + model_params, prefix + 'mlp.shared_expert.down_proj', dtype) + ## mlp.shared_expert.down_proj.weight weights.update( - get_tllm_linear_weight(split_v, tllm_prex + 'mlp.fc.', None, + get_tllm_linear_weight(shared_expert_down_proj, + tllm_prex + 'shared_expert.proj.', None, use_weight_only, plugin_weight_only_quant_type, dtype, use_gemm_woq_plugin)) - mlp_proj_weight = get_weight(model_params, prefix + key_list[4], dtype) - split_v = split_matrix_tp(mlp_proj_weight, - tensor_parallel, - mapping.tp_rank, - dim=1) - - if use_smooth_quant: - mlp_proj_weight = mlp_proj_weight.t() - int8_weights = generate_int8(mlp_proj_weight, - act_range.get(prefix + key_list[4])) + moe_shared_expert_gate_weights = get_weight( + model_params, prefix + 'mlp.shared_expert_gate', dtype) weights.update( - get_tllm_linear_sq_weight( - int8_weights, - tllm_prex + 'mlp.proj.', [1, hidden_size], - tensor_parallel, - is_qkv=False, - per_token=per_token, - per_channel=per_channel, - last_prefix=tllm_prex + 'mlp.quantization_scaling_factor', - smoother_value=smoother[prefix + key_list[4]], - smoother_shape=[1, intermediate_size // tensor_parallel], - rank=mapping.tp_rank, - cat_dim=0)) - else: + get_tllm_linear_weight( + moe_shared_expert_gate_weights, + tllm_prex + 'shared_expert_gate.', + None, + False, # Router should never be quantized + plugin_weight_only_quant_type, + dtype, + use_gemm_woq_plugin)) + + ## fine-grained experts + rank_experts = list(range(moe_config.num_experts)) + if mapping.has_moe_ep(): + rank_experts = mapping.ep_experts(moe_config.num_experts) + for suffix in ["gate_proj", "down_proj", "up_proj"]: + model_params[f'model.layers.{l}.mlp.experts.{suffix}.weight'] = \ + torch.stack([model_params[f'model.layers.{l}.mlp.experts.{expert}.{suffix}.weight'].detach() + for expert in rank_experts]) + w3 = model_params[f'model.layers.{l}.mlp.experts.up_proj.weight'] + w2 = model_params[f'model.layers.{l}.mlp.experts.down_proj.weight'] + w1 = model_params[f'model.layers.{l}.mlp.experts.gate_proj.weight'] + if mapping.has_moe_tp(): + w3 = split(w3, mapping.moe_tp_size, mapping.moe_tp_rank, dim=1) + w2 = split(w2, mapping.moe_tp_size, mapping.moe_tp_rank, dim=2) + w1 = split(w1, mapping.moe_tp_size, mapping.moe_tp_rank, dim=1) + + model_params[ + f'model.layers.{l}.mlp.experts.gate_up_proj.weight'] = torch.concat( + [w3, w1], dim=-2) + + model_params[f'model.layers.{l}.mlp.experts.down_proj.weight'] = w2 + + ## mlp.experts.w2.weight + moe_experts_w2_weights = get_weight( + model_params, prefix + 'mlp.experts.down_proj', dtype) + weights.update( + get_tllm_linear_weight(moe_experts_w2_weights, + tllm_prex + 'mlp.proj.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + ## mlp.experts.w3w1.weight + moe_experts_w3w1_weights = get_weight( + model_params, prefix + 
'mlp.experts.gate_up_proj', dtype) weights.update( - get_tllm_linear_weight(split_v, tllm_prex + 'mlp.proj.', None, + get_tllm_linear_weight(moe_experts_w3w1_weights, + tllm_prex + 'mlp.fc.', None, use_weight_only, plugin_weight_only_quant_type, dtype, use_gemm_woq_plugin)) + moe_experts_gate_weights = get_weight(model_params, + prefix + 'mlp.gate', + torch.float32) + weights.update( + get_tllm_linear_weight( + moe_experts_gate_weights, + tllm_prex + 'mlp.router.', + None, + False, # Router should never be quantized + plugin_weight_only_quant_type, + dtype, + use_gemm_woq_plugin)) + else: + mlp_gate_weight = get_weight(model_params, prefix + key_list[2], + dtype) + split_v = split_matrix_tp(mlp_gate_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + if use_smooth_quant: + mlp_gate_weight = mlp_gate_weight.t() + int8_weights = generate_int8( + mlp_gate_weight, act_range.get(prefix + key_list[2])) + + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'mlp.gate.', + [1, intermediate_size // tensor_parallel], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + 'post_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.gate.', + None, use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + mlp_fc_weight = get_weight(model_params, prefix + key_list[3], + dtype) + split_v = split_matrix_tp(mlp_fc_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + + if use_smooth_quant: + mlp_fc_weight = mlp_fc_weight.t() #verified + int8_weights = generate_int8( + mlp_fc_weight, act_range.get(prefix + key_list[3])) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'mlp.fc.', + [1, intermediate_size // tensor_parallel], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + 'post_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.fc.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + mlp_proj_weight = get_weight(model_params, prefix + key_list[4], + dtype) + split_v = split_matrix_tp(mlp_proj_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + + if use_smooth_quant: + mlp_proj_weight = mlp_proj_weight.t() + int8_weights = generate_int8( + mlp_proj_weight, act_range.get(prefix + key_list[4])) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'mlp.proj.', [1, hidden_size], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + + 'mlp.quantization_scaling_factor', + smoother_value=smoother[prefix + key_list[4]], + smoother_shape=[ + 1, intermediate_size // tensor_parallel + ], + rank=mapping.tp_rank, + cat_dim=0)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.proj.', + None, use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + # Layer norms do not use tensor parallelism input_ln_weight = get_weight(model_params, prefix + key_list[5], dtype) weights[tllm_prex + 'input_layernorm.weight'] = input_ln_weight @@ -1047,18 +1170,30 @@ def create_config_from_hugging_face(hf_model, n_kv_head = getattr(hf_config, "num_key_value_heads", n_head) vocab_size = 
hf_config.vocab_size n_positions = hf_config.max_position_embeddings + hidden_act = getattr(hf_config, "hidden_act", "silu") config['rotary_scaling'] = getattr(hf_config, "rope_scaling", None) qwen_type = hf_config.model_type if qwen_type == "qwen": rms_norm_eps = hf_config.layer_norm_epsilon rotary_base = getattr(hf_config, "rotary_emb_base", 10000.0) - elif qwen_type == "qwen2": + elif qwen_type == "qwen2" or qwen_type == "qwen2_moe": rms_norm_eps = hf_config.rms_norm_eps rotary_base = getattr(hf_config, "rope_theta", 100000.0) else: logger.error("Unknown Qwen Architecture: " + qwen_type) assert False + moe_num_experts = getattr(hf_config, "num_experts", 0) + moe_top_k = getattr(hf_config, "num_experts_per_tok", 0) + moe_intermediate_size = getattr(hf_config, "moe_intermediate_size", 0) + moe_shared_expert_intermediate_size = getattr( + hf_config, "shared_expert_intermediate_size", 0) + config[ + 'moe_normalization_mode'] = MoeConfig.ExpertScaleNormalizationMode.NONE + + if qwen_type == "qwen2_moe": + hidden_act = "swiglu" + config.update({ 'architecture': "QWenForCausalLM", 'dtype': dtype, @@ -1071,19 +1206,36 @@ def create_config_from_hugging_face(hf_model, 'vocab_size': vocab_size, 'position_embedding_type': 'rope_gpt_neox', 'max_position_embeddings': n_positions, - 'hidden_act': 'silu', + 'hidden_act': hidden_act, 'rotary_base': rotary_base, 'norm_epsilon': rms_norm_eps, 'qwen_type': qwen_type, + 'moe_num_experts': moe_num_experts, + 'moe_top_k': moe_top_k, + 'moe_intermediate_size': moe_intermediate_size, + 'moe_shared_expert_intermediate_size': + moe_shared_expert_intermediate_size, #TODO: should have directly map from the Mapping object to the TRT-LLM checkpoint fields 'mapping': { 'world_size': mapping.tp_size * mapping.pp_size, 'tp_size': mapping.tp_size, - 'pp_size': mapping.pp_size + 'pp_size': mapping.pp_size, + 'moe_tp_size': mapping.moe_tp_size, + 'moe_ep_size': mapping.moe_ep_size, } }) config['quantization'] = quantization.to_dict() config.update(override_fields) + + moe_config = MoeConfig(config['moe_num_experts'], config['moe_top_k'], + config['moe_normalization_mode']).validate() + use_weight_only = config['quantization']['quant_algo'] in [ + QuantAlgo.W8A16, QuantAlgo.W4A16, QuantAlgo.FP8 + ] + if use_weight_only and moe_config.has_moe(): + config['quantization']['exclude_modules'].append('router') + config['quantization']['exclude_modules'].append('shared_expert_gate') + return config @@ -1109,9 +1261,7 @@ def from_hugging_face(cls, pretrained_config = PretrainedConfig.from_dict(config) pretrained_config.set_rank(mapping.rank) #TODO: remove this hack qwen_type = pretrained_config.qwen_type - assert qwen_type in [ - 'qwen', 'qwen2' - ], "Unsupported Qwen type. Must be either 'qwen' or 'qwen2'" + assert qwen_type in ['qwen', 'qwen2', 'qwen2_moe'], "Unsupported Qwen type." qwen = cls.from_config(pretrained_config) if from_hf_gptq: @@ -1151,9 +1301,7 @@ def quantize(dtype, override_fields=override_fields) qwen_type = config['qwen_type'] - assert qwen_type in [ - 'qwen', 'qwen2' - ], "Unsupported Qwen type. Must be either 'qwen' or 'qwen2'" + assert qwen_type in ['qwen', 'qwen2', 'qwen2_moe'], "Unsupported Qwen type." 
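Editorial aside, not part of the patch: the `convert_hf_qwen` changes above route Qwen2-MoE Hugging Face tensors onto TRT-LLM weight names roughly as summarized below. The dict is purely illustrative (no such structure exists in the code), per-layer prefixes (`model.layers.{l}.` / `transformer.layers.{l}.`) are omitted, and per the config changes in this patch `router` and `shared_expert_gate` are appended to the quantization `exclude_modules` list so they are never weight-only quantized.

QWEN2_MOE_WEIGHT_MAP = {
    # HF checkpoint tensor (per layer)        -> TRT-LLM weight prefix (per layer)
    "mlp.shared_expert.gate_up_proj.weight":    "shared_expert.fc.",     # concat(up_proj, gate_proj)
    "mlp.shared_expert.down_proj.weight":       "shared_expert.proj.",
    "mlp.shared_expert_gate.weight":            "shared_expert_gate.",   # never quantized
    "mlp.experts.gate_up_proj.weight":          "mlp.fc.",               # stacked experts, concat(w3, w1)
    "mlp.experts.down_proj.weight":             "mlp.proj.",
    "mlp.gate.weight":                          "mlp.router.",           # kept in float32, never quantized
}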
with open(os.path.join(output_dir, 'config.json'), 'w') as f: json.dump(config, f, indent=4) @@ -1190,7 +1338,9 @@ def quantize(dtype, ranked_mapping = Mapping(world_size=mapping.world_size, rank=rank, tp_size=mapping.tp_size, - pp_size=mapping.pp_size) + pp_size=mapping.pp_size, + moe_tp_size=mapping.moe_tp_size, + moe_ep_size=mapping.moe_ep_size) weights = load_weights_from_hf( config=config, mapping=ranked_mapping, @@ -1222,6 +1372,9 @@ def load_weights_from_hf(*, elif quant_algo == 'W4A16': plugin_weight_only_quant_type = torch.quint4x2 + moe_config = MoeConfig(config['moe_num_experts'], config['moe_top_k'], + config['moe_normalization_mode']).validate() + use_weight_only = quant_algo in ['W8A16', 'W4A16'] use_smooth_quant = quant_algo is not None and quant_algo.startswith( 'W8A8_SQ') @@ -1247,5 +1400,6 @@ def load_weights_from_hf(*, int8_kv_cache=use_int8_kv_cache, act_range=act_range, qkv_para=qwen_qkv_para, - smoother=qwen_smoother) + smoother=qwen_smoother, + moe_config=moe_config) return weights diff --git a/tensorrt_llm/models/qwen/model.py b/tensorrt_llm/models/qwen/model.py index 5e4e1bceb..c61f46426 100644 --- a/tensorrt_llm/models/qwen/model.py +++ b/tensorrt_llm/models/qwen/model.py @@ -18,9 +18,9 @@ from tensorrt_llm.lora_manager import LoraConfig, use_lora from ..._utils import pad_vocab_size -from ...functional import Tensor, recv, send -from ...layers import (Attention, AttentionMaskType, ColumnLinear, Embedding, - GatedMLP, RmsNorm) +from ...functional import Tensor, recv, send, sigmoid +from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear, + Embedding, GatedMLP, RmsNorm, RowLinear) from ...module import Module from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, PretrainedConfig) @@ -60,17 +60,49 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): quant_mode=config.quant_mode, dense_bias=False) - # Qwen's real inter_size is one half of what's in the config while Qwen2 is aligned with the config - intermediate_size = config.intermediate_size // 2 if self.config.qwen_type == 'qwen' else config.intermediate_size - - self.mlp = GatedMLP(hidden_size=config.hidden_size, - ffn_hidden_size=intermediate_size, - hidden_act=config.hidden_act, - dtype=dtype, - bias=False, - tp_group=tp_group, - tp_size=tp_size, - quant_mode=config.quant_mode) + ClsMLP = GatedMLP + mlp_kwargs = {} + if config.qwen_type == 'qwen2_moe': + ClsMLP = MOE + mlp_kwargs = { + "moe_config": config.moe, + "mapping": config.mapping, + } + + if config.qwen_type == 'qwen2_moe': + self.shared_expert = MLP( + hidden_size=config.hidden_size, + ffn_hidden_size=config.moe_shared_expert_intermediate_size, + hidden_act=config.hidden_act, + dtype=dtype, + bias=False, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=config.quant_mode) + self.shared_expert_gate = RowLinear(config.hidden_size, + 1, + bias=False, + dtype=dtype, + tp_group=None, + tp_size=1) + + # Qwen's real inter_size depends on qwen_type + if self.config.qwen_type == 'qwen': + intermediate_size = config.intermediate_size // 2 + elif self.config.qwen_type == 'qwen2_moe': + intermediate_size = config.moe_intermediate_size + else: + intermediate_size = config.intermediate_size + + self.mlp = ClsMLP(hidden_size=config.hidden_size, + ffn_hidden_size=intermediate_size, + hidden_act=config.hidden_act, + dtype=dtype, + bias=False, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=config.quant_mode, + **mlp_kwargs) self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, 
eps=config.norm_epsilon, dtype=dtype) @@ -103,9 +135,19 @@ def forward( hidden_states = self.post_layernorm(hidden_states) + shared_output = None + if self.config.qwen_type == 'qwen2_moe': + shared_output = self.shared_expert(hidden_states) + if self.shared_expert_gate is not None: + shared_output = sigmoid( + self.shared_expert_gate(hidden_states)) * shared_output + hidden_states = self.mlp(hidden_states, lora_layer_params=lora_layer_params) + if shared_output is not None: + hidden_states = hidden_states + shared_output + hidden_states = residual + hidden_states if use_cache: return (hidden_states, presents) diff --git a/tensorrt_llm/models/qwen/weight.py b/tensorrt_llm/models/qwen/weight.py index 3df6e5afa..c5f1e9f77 100644 --- a/tensorrt_llm/models/qwen/weight.py +++ b/tensorrt_llm/models/qwen/weight.py @@ -47,7 +47,9 @@ def load_from_gptq_qwen( model_params = {k: v for k, v in model.state_dict().items()} torch.cuda.empty_cache() - + assert qwen_type in [ + 'qwen', 'qwen2' + ], "Currently, only qwen and qwen2 support gptq. qwen2_moe is not supported yet." layer_prefix = "transformer.h." if qwen_type == 'qwen' else "model.layers." key_list = get_qwen_key_list(qwen_type) diff --git a/tensorrt_llm/network.py b/tensorrt_llm/network.py index c6be2f557..e7318622c 100644 --- a/tensorrt_llm/network.py +++ b/tensorrt_llm/network.py @@ -223,12 +223,12 @@ def _mark_output(self, tensor, name, dtype): from .functional import cast # In strongly_typed, if tensor output is not the same, add a cast - if self.strongly_typed: + if dtype is not None and self.strongly_typed: tensor = cast(tensor, dtype) self.trt_network.mark_output(tensor.trt_tensor) tensor.trt_tensor.name = name if not self.strongly_typed: - tensor.trt_tensor.dtype = dtype + tensor.trt_tensor.dtype = dtype or tensor.trt_tensor.dtype logger.debug(f'Mark output: {name}, dtype: {dtype}') def set_named_parameters(self, named_parameters): diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py index 1ce0e46ee..591dba9d5 100644 --- a/tensorrt_llm/quantization/layers.py +++ b/tensorrt_llm/quantization/layers.py @@ -20,12 +20,12 @@ from .._common import default_net, precision from .._utils import fp32_array, is_same_dtype -from ..functional import (ACT2FN, AllReduceFusionOp, AttentionMaskType, - PositionEmbeddingType, RopeEmbeddingUtils, - RotaryScalingType, Tensor, allgather, allreduce, cast, - concat, constant, embedding, generate_alibi_slopes, - gpt_attention, matmul, mul, shape, slice, softmax, - split, where) +from ..functional import (ACT2FN, AllReduceFusionOp, AllReduceFusionParams, + AttentionMaskType, PositionEmbeddingType, + RopeEmbeddingUtils, RotaryScalingType, Tensor, + allgather, allreduce, cast, concat, constant, + embedding, generate_alibi_slopes, gpt_attention, + matmul, mul, shape, slice, softmax, split, where) from ..layers import SpecDecodingParams from ..layers.embedding import Embedding from ..layers.linear import Linear, RowLinear @@ -1211,7 +1211,7 @@ def forward( position_embedding=None, norm_before_bmm1=False, lora_layer_params=None, - reduce_fusion_params=None, + reduce_fusion_params: Optional[AllReduceFusionParams] = None, ): assert lora_layer_params is None, "lora is not supported on SmoothQuantAttention now" if default_net().plugin_config.smooth_quant_gemm_plugin: @@ -1400,7 +1400,10 @@ def merge_caches(): # quantized tensor and scaling factors per token context = quantize_per_token(context) - context = self.dense(context, reduce_fusion_params=reduce_fusion_params) + context = 
self.dense( + context, + reduce_fusion_params=reduce_fusion_params, + ) if use_cache: return (context, past_key_value) diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py index 3999cbea8..5a33799dd 100644 --- a/tensorrt_llm/quantization/quantize.py +++ b/tensorrt_llm/quantization/quantize.py @@ -1,3 +1,5 @@ +import fnmatch + from .._utils import get_init_params from ..layers import (MLP, Attention, ColumnLinear, Embedding, GatedMLP, LayerNorm, RmsNorm, RowLinear) @@ -21,15 +23,20 @@ def quantize_layers( ): exclude_modules = quant_config.exclude_modules or [ 'lm_head', - 'router', - 'vocab_embedding', - 'position_embedding', - 'block_embedding', + '*router', + '*vocab_embedding', + '*position_embedding', + '*block_embedding', ] for name, module, parent in model.named_modules_with_parent(): module_name = name.rsplit('.', 1)[-1] - if module_name not in exclude_modules: + is_excluded = False + for exclude_module in exclude_modules: + if fnmatch.fnmatchcase(name, exclude_module): + is_excluded = True + break + if not is_excluded: quant_cls = None for cls in quant_map: if isinstance(module, cls): diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index cfd19482b..86c2527f4 100644 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -176,7 +176,6 @@ def get_model(ckpt_path, dtype="fp16", device="cuda"): if "vila" in ckpt_path: model = _get_vila_model(ckpt_path) else: - model_kwargs = {"torch_dtype": "auto"} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto" if device != "cpu" else "cpu", @@ -215,8 +214,17 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", elif "cnn_dailymail" in dataset_name_or_dir: dataset = load_dataset(dataset_name_or_dir, name="3.0.0", split="train") dataset = dataset["article"][:calib_size] + elif os.path.isdir(dataset_name_or_dir): + print( + f"Recognized local dataset repo {dataset_name_or_dir} for calibration; " + "assuming the calibration data are in the train split and text column." + ) + dataset = load_dataset(dataset_name_or_dir, split="train") + dataset = dataset["text"][:calib_size] else: - raise NotImplementedError + raise NotImplementedError( + f"Unsupported dataset name or local repo directory: {dataset_name_or_dir}." 
+ ) batch_encoded = tokenizer.batch_encode_plus(dataset, return_tensors="pt", @@ -255,7 +263,7 @@ def calibrate_loop(): return model -def quantize_and_export(*, model_dir, calib_dataset, dtype, qformat, +def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat, kv_cache_dtype, calib_size, batch_size, calib_max_seq_length, awq_block_size, output_dir, tp_size, pp_size, seed, tokenizer_max_seq_length): @@ -279,7 +287,7 @@ def quantize_and_export(*, model_dir, calib_dataset, dtype, qformat, random.seed(seed) np.random.seed(seed) - model = get_model(model_dir, dtype) + model = get_model(model_dir, dtype, device=device) model_type = get_model_type(model) if "vila" in model_dir: tokenizer = get_tokenizer(model_dir + "/llm", @@ -366,18 +374,12 @@ def quantize_and_export(*, model_dir, calib_dataset, dtype, qformat, else: tensorrt_llm_config["quantization"]["quant_algo"] = None - # Workaround for MOE router quantization - if "moe_num_experts" in tensorrt_llm_config and qformat != "full_prec": - if "exclude_modules" not in tensorrt_llm_config["quantization"]: - # Append router and lm_head because we need both excluded - tensorrt_llm_config["quantization"]["exclude_modules"] = [ - 'lm_head', 'router', 'vocab_embedding', - 'position_embedding', 'block_embedding' - ] - else: - tensorrt_llm_config["quantization"]["exclude_modules"].append( - "router") - + # HF uses rope_scaling while tensorrt_llm uses rotary_scaling + if hasattr( + model.config, + "rope_scaling") and "rotary_scaling" not in tensorrt_llm_config: + tensorrt_llm_config["rotary_scaling"] = getattr( + model.config, "rope_scaling") with open(f"{export_path}/config.json", "w") as f: json.dump(tensorrt_llm_config, f, indent=4) @@ -554,6 +556,17 @@ def get_nemo_calib_dataloader(dataset_name_or_dir="cnn_dailymail", elif "cnn_dailymail" in dataset_name_or_dir: dataset = load_dataset(dataset_name_or_dir, name="3.0.0", split="train") text_column = "article" + elif os.path.isdir(dataset_name_or_dir): + print( + f"Recognized local dataset repo {dataset_name_or_dir} for calibration; " + "assuming the calibration data are in the train split and text column." + ) + dataset = load_dataset(dataset_name_or_dir, split="train") + text_column = "text" + else: + raise NotImplementedError( + f"Unsupported dataset name or local repo directory: {dataset_name_or_dir}." + ) calib_size = max(min(len(dataset), calib_size), batch_size) for i in range(calib_size // batch_size): batch = dataset[i * batch_size:(i + 1) * batch_size][text_column] diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py index 0e63ccfdb..9e6605601 100755 --- a/tensorrt_llm/runtime/generation.py +++ b/tensorrt_llm/runtime/generation.py @@ -33,7 +33,6 @@ from .._ipc_utils import set_peer_access from .._utils import (pad_vocab_size, str_dtype_to_torch, torch_to_numpy, trt_dtype_to_torch, trt_gte_10) -from ..layers.moe import MoeConfig from ..logger import logger from ..lora_manager import LoraManager from ..mapping import Mapping @@ -448,7 +447,6 @@ class ModelConfig: max_medusa_tokens: int = 0 paged_state: bool = True mamba_conv1d_plugin: bool = True - moe_tp_mode: MoeConfig.ParallelismMode = MoeConfig.ParallelismMode.TENSOR_PARALLEL conv_kernel: int = 0 layer_types: List[str] = field(default_factory=list) rnn_hidden_size: int = 0 @@ -1593,9 +1591,8 @@ def setup(self, # Because we don't support inplace update, so we need separate buffer for inputs and outputs. # We can do reuse between different layers' inputs and outputs, i.e. 
current layer's output can # reuse previous layer's input memory. But this need one extra buffer as the guard. - i = self.first_layer - if self.layer_types[ - i] == 'attention': # Not applicable to cross KV buffers as it's constant + if self.has_attn_layers: # Not applicable to cross KV buffers as it's constant + i = self.attn_to_general_idx[0] trt_dtype = self.runtime.engine.get_tensor_dtype( f'present_key_value_{i}') diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py index 23ebd8790..2b3eb1b8d 100644 --- a/tensorrt_llm/runtime/model_runner.py +++ b/tensorrt_llm/runtime/model_runner.py @@ -480,14 +480,12 @@ def from_engine(cls, pretrained_config, 'num_medusa_heads') else 0, use_custom_all_reduce=build_config.plugin_config. use_custom_all_reduce, - moe_tp_mode=pretrained_config.moe_tp_mode if hasattr( - pretrained_config, 'moe_tp_mode') else 0, **rnn_configs_kwargs, gpu_weights_percent=gpu_weights_percent, ) max_batch_size = build_config.max_batch_size max_input_len = build_config.max_input_len - max_output_len = build_config.max_output_len + max_seq_len = build_config.max_seq_len max_beam_width = build_config.max_beam_width if pretrained_config.architecture == 'ChatGLMForCausalLM' and pretrained_config.chatglm_version in [ 'glm', 'chatglm' @@ -528,7 +526,7 @@ def from_engine(cls, return cls(session=session, max_batch_size=max_batch_size, max_input_len=max_input_len, - max_seq_len=max_input_len + max_output_len, + max_seq_len=max_seq_len, max_beam_width=max_beam_width, lora_manager=lora_manager) diff --git a/tensorrt_llm/runtime/model_runner_cpp.py b/tensorrt_llm/runtime/model_runner_cpp.py index 4e7c1ab64..c3b1b0acb 100644 --- a/tensorrt_llm/runtime/model_runner_cpp.py +++ b/tensorrt_llm/runtime/model_runner_cpp.py @@ -154,7 +154,8 @@ def from_dir( profiler.start('load tensorrt_llm engine') kv_cache_config = trtllm.KvCacheConfig( - free_gpu_memory_fraction=0.45, # hardcode for now + free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction / + 2, # hardcoded as half self kv & half cross kv for now max_attention_window=max_attention_window_size, sink_token_length=sink_token_length) @@ -162,7 +163,8 @@ def from_dir( Path(engine_dir) / "encoder", Path(engine_dir) / "decoder", trtllm.ModelType.ENCODER_DECODER, trtllm.ExecutorConfig(max_beam_width=max_beam_width, - kv_cache_config=kv_cache_config)) + kv_cache_config=kv_cache_config, + gpu_weights_percent=gpu_weights_percent)) profiler.stop('load tensorrt_llm engine') diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 2bca98ef1..12206c450 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
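The encoder-decoder runner hunk a few lines above replaces the hard-coded `free_gpu_memory_fraction=0.45` with half of the caller-supplied fraction, since the self-attention and cross-attention KV caches are each configured with the same fraction for now. A small sketch of that sizing follows, assuming the `executor` bindings alias used elsewhere in this patch; the helper name is hypothetical.

```python
# Sketch only: mirrors the halved free_gpu_memory_fraction used for the
# encoder-decoder path above. enc_dec_kv_cache_config is a hypothetical helper.
from tensorrt_llm.bindings import executor as trtllm


def enc_dec_kv_cache_config(kv_cache_free_gpu_memory_fraction: float,
                            max_attention_window_size=None,
                            sink_token_length=None):
    # Half of the requested fraction goes to the self-attention KV cache and
    # half to the cross-attention KV cache ("half self kv & half cross kv").
    return trtllm.KvCacheConfig(
        free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction / 2,
        max_attention_window=max_attention_window_size,
        sink_token_length=sink_token_length)
```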
-__version__ = "0.11.0.dev2024061100" +__version__ = "0.11.0.dev2024061800" diff --git a/tests/bindings/test_bindings.py b/tests/bindings/test_bindings.py index 1b662e276..32b306dc7 100644 --- a/tests/bindings/test_bindings.py +++ b/tests/bindings/test_bindings.py @@ -1,4 +1,3 @@ -import inspect import json import pickle import tempfile @@ -11,123 +10,6 @@ import tensorrt_llm.bindings as _tb -def test_generation_output(): - ids = torch.ones(1) - lengths = torch.ones(2) - gen_output = _tb.GenerationOutput(ids, lengths) - assert torch.equal(gen_output.ids, ids) - assert torch.equal(gen_output.lengths, lengths) - - assert gen_output.log_probs is None - log_probs = torch.ones(1) - gen_output.log_probs = log_probs - assert gen_output.log_probs == log_probs - - assert gen_output.context_logits is None - torch.ones(1) - gen_output.context_logits = log_probs - assert gen_output.context_logits == log_probs - - -def test_generation_input(): - end_id = 42 - pad_id = 13 - ids = torch.ones(1) - lengths = torch.ones(2) - packed = True - gen_input = _tb.GenerationInput(end_id, pad_id, ids, lengths, packed) - assert gen_input.end_id == end_id - assert gen_input.pad_id == pad_id - assert torch.equal(gen_input.ids, ids) - assert torch.equal(gen_input.lengths, lengths) - assert gen_input.packed == packed - - assert gen_input.max_new_tokens is None - max_new_tokens = 100 - gen_input.max_new_tokens = max_new_tokens - assert gen_input.max_new_tokens == max_new_tokens - - assert gen_input.embedding_bias is None - embedding_bias = torch.ones(3) - gen_input.embedding_bias = embedding_bias - assert torch.equal(gen_input.embedding_bias, embedding_bias) - - assert gen_input.prompt_tuning_params.embedding_table is None - assert gen_input.prompt_tuning_params.tasks is None - assert gen_input.prompt_tuning_params.vocab_size is None - - embedding_table = torch.ones(3) - tasks = torch.ones(2) - vocab_size = torch.ones(1) - prompt_tuning_params = _tb.PromptTuningParams( - embedding_table=embedding_table, tasks=tasks, vocab_size=vocab_size) - assert len(prompt_tuning_params.prompt_tuning_enabled) == 0 - prompt_tuning_enabled = [True, False] - prompt_tuning_params.prompt_tuning_enabled = prompt_tuning_enabled - assert len(prompt_tuning_params.prompt_tuning_enabled) == 2 - assert prompt_tuning_params.prompt_tuning_enabled == prompt_tuning_enabled - gen_input.prompt_tuning_params = prompt_tuning_params - assert gen_input.prompt_tuning_params is not None - assert torch.equal(gen_input.prompt_tuning_params.embedding_table, - embedding_table) - assert torch.equal(gen_input.prompt_tuning_params.tasks, tasks) - assert torch.equal(gen_input.prompt_tuning_params.vocab_size, vocab_size) - assert gen_input.prompt_tuning_params.prompt_tuning_enabled == prompt_tuning_enabled - - -def test_gpt_session_config(): - kv_cache_config = _tb.KvCacheConfig() - assert kv_cache_config.max_tokens is None - max_tokens = 13 - kv_cache_config.max_tokens = max_tokens - assert kv_cache_config.max_tokens == max_tokens - assert kv_cache_config.free_gpu_memory_fraction is None - free_gpu_memory_fraction = 0.5 - kv_cache_config.free_gpu_memory_fraction = free_gpu_memory_fraction - assert kv_cache_config.free_gpu_memory_fraction == free_gpu_memory_fraction - - max_batch_size = 1000 - max_beam_width = 64 - max_sequence_length = 1 << 20 - gpu_weights_percent = 0.5 - gpt_session_config = _tb.GptSessionConfig(max_batch_size, max_beam_width, - max_sequence_length, - gpu_weights_percent) - assert gpt_session_config.max_batch_size == max_batch_size - assert 
gpt_session_config.max_beam_width == max_beam_width - assert gpt_session_config.max_sequence_length == max_sequence_length - assert gpt_session_config.gpu_weights_percent == gpu_weights_percent - - assert gpt_session_config.kv_cache_config is not None - assert gpt_session_config.kv_cache_config.max_tokens is None - assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction is None - gpt_session_config.kv_cache_config = kv_cache_config - assert gpt_session_config.kv_cache_config.max_tokens == max_tokens - assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction == free_gpu_memory_fraction - gpt_session_config.kv_cache_config.max_tokens = None - assert gpt_session_config.kv_cache_config.max_tokens is None - gpt_session_config.kv_cache_config.free_gpu_memory_fraction = None - assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction is None - - assert not gpt_session_config.decoder_per_request - gpt_session_config.decoder_per_request = True - assert gpt_session_config.decoder_per_request - - assert not gpt_session_config.cuda_graph_mode - gpt_session_config.cuda_graph_mode = True - assert gpt_session_config.cuda_graph_mode - - assert gpt_session_config.ctx_micro_batch_size is None - ctx_micro_batch_size = 10 - gpt_session_config.ctx_micro_batch_size = ctx_micro_batch_size - assert gpt_session_config.ctx_micro_batch_size == ctx_micro_batch_size - - assert gpt_session_config.gen_micro_batch_size is None - gen_micro_batch_size = 20 - gpt_session_config.gen_micro_batch_size = gen_micro_batch_size - assert gpt_session_config.gen_micro_batch_size == gen_micro_batch_size - - def test_quant_mode(): assert _tb.QuantMode.none().value == 0 assert _tb.QuantMode.int4_weights().has_int4_weights @@ -377,14 +259,6 @@ def check_properties(the_object, properties, model_config): world_config, "llama") == "llama_float32_tp1_rank3.engine" -def test_gpt_session(): - members = {name: tpe for (name, tpe) in inspect.getmembers(_tb.GptSession)} - assert isinstance(members["model_config"], property) - assert isinstance(members["world_config"], property) - assert isinstance(members["device"], property) - assert "generate" in members - - def test_llm_request(): beam_width = 2 sampling_config = _tb.SamplingConfig(beam_width) diff --git a/tests/bindings/test_executor_bindings.py b/tests/bindings/test_executor_bindings.py index 73e49a455..0e49f1f18 100644 --- a/tests/bindings/test_executor_bindings.py +++ b/tests/bindings/test_executor_bindings.py @@ -805,6 +805,21 @@ def test_kv_cache_config(): assert config.host_cache_size is None assert config.onboard_blocks == True + config.enable_block_reuse = True + config.max_tokens = 1 + config.max_attention_window = 2 + config.sink_token_length = 3 + config.free_gpu_memory_fraction = 0.5 + config.host_cache_size = 4 + config.onboard_blocks = False + assert config.enable_block_reuse == True + assert config.max_tokens == 1 + assert config.max_attention_window == 2 + assert config.sink_token_length == 3 + assert config.free_gpu_memory_fraction == 0.5 + assert config.host_cache_size == 4 + assert config.onboard_blocks == False + kwargs = { "enable_block_reuse": True, "max_tokens": 3, @@ -821,21 +836,18 @@ def test_kv_cache_config(): def test_lookahead_decoding_config(): config = trtllm.LookaheadDecodingConfig(3, 5, 7) - assert config.max_ngram_size == 3 - assert config.max_window_size == 5 + assert config.max_window_size == 3 + assert config.max_ngram_size == 5 assert config.max_verification_set_size == 7 - config.max_ngram_size = 5 - config.max_window_size = 
10 - config.max_verification_set_size = 3 - - assert config.max_ngram_size == 5 - assert config.max_window_size == 10 + config = trtllm.LookaheadDecodingConfig(5, 10, 3) + assert config.max_window_size == 5 + assert config.max_ngram_size == 10 assert config.max_verification_set_size == 3 kwargs = { - "max_ngram_size": 3, "max_window_size": 5, + "max_ngram_size": 3, "max_verification_set_size": 7, } @@ -909,6 +921,7 @@ def test_executor_config(): assert config.parallel_config is None assert isinstance(config.peft_cache_config, trtllm.PeftCacheConfig) assert config.logits_post_processor_map is None + assert config.logits_post_processor_batched is None assert config.decoding_config is None kwargs = { @@ -1056,10 +1069,67 @@ def logits_post_processor(req_id: int, logits: torch.Tensor, assert tokens[-max_new_tokens:] == [42] * max_new_tokens +@skip_pre_ampere # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture +def test_logits_post_processor_batched(model_files, model_path): + + # Define the logits post-processor callback + def logits_post_processor_batched(req_id_batch: tp.List[int], + logits_batch: tp.List[torch.Tensor], + ids_batch: tp.List[tp.List[tp.List[int]]], + stream_ptr: int): + with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)): + for logits in logits_batch: + logits[:] = float("-inf") + logits[..., 42] = 0 + + # Create executor + beam_width = 1 + executor_config = trtllm.ExecutorConfig(beam_width) + executor_config.logits_post_processor_batched = logits_post_processor_batched + executor = trtllm.Executor(model_path, trtllm.ModelType.DECODER_ONLY, + executor_config) + + # Create the request + max_new_tokens = 5 + input_tokens = [1, 2, 3, 4] + request = trtllm.Request(input_tokens, max_new_tokens, False) + request.logits_post_processor_name = request.BATCHED_POST_PROCESSOR_NAME + + batch_size = 4 + # Enqueue the requests + request_ids = [] + for _ in range(batch_size): + request_id = executor.enqueue_request(request) + request_ids.append(request_id) + + # Get the new tokens + tokens = {req_id: [] for req_id in request_ids} + num_finished = 0 + i = 0 + max_wait_ms = 10000 + while num_finished < len(request_ids) and i < max_wait_ms: + responses = executor.await_responses(datetime.timedelta(milliseconds=1)) + for response in responses: + req_id = response.request_id + assert not response.has_error( + ), f"Request id {req_id} failed with err {response.error_msg}" + result = response.result + num_finished += 1 if result.is_final else 0 + new_tokens = result.output_token_ids[beam_width - 1] + tokens[req_id].extend(new_tokens) + assert i < max_wait_ms + + expected_num_tokens = get_expected_num_tokens(len(input_tokens), + max_new_tokens, False, False) + for req_id in request_ids: + assert len(tokens[req_id]) == expected_num_tokens, f"{req_id}" + + def test_iteration_stats(): stats = trtllm.IterationStats() stats.timestamp = "01:23:56" stats.iter = 1 + stats.iter_latency_ms = 100 stats.num_active_requests = 2 stats.max_num_active_requests = 3 stats.gpu_mem_usage = 1024 @@ -1068,6 +1138,7 @@ def test_iteration_stats(): stats_json = json.loads(stats.to_json_str()) assert stats_json["timestamp"] == stats.timestamp assert stats_json["iter"] == stats.iter + assert stats_json["iterLatencyMS"] == stats.iter_latency_ms assert stats_json["numActiveRequests"] == stats.num_active_requests assert stats_json["maxNumActiveRequests"] == stats.max_num_active_requests assert stats_json["gpuMemUsage"] == stats.gpu_mem_usage @@ -1116,3 +1187,47 @@ def 
test_scheduler_config_pickle(): config_str = pickle.dumps(config) config_copy = pickle.loads(config_str) assert config.capacity_scheduler_policy == config_copy.capacity_scheduler_policy + + +def test_kv_cache_config_pickle(): + config = trtllm.KvCacheConfig() + config.enable_block_reuse = True + config.free_gpu_memory_fraction = 0.3 + config_copy = pickle.loads(pickle.dumps(config)) + assert config.enable_block_reuse == config_copy.enable_block_reuse + assert config.max_tokens == config_copy.max_tokens + assert config.max_attention_window == config_copy.max_attention_window + assert config.sink_token_length == config_copy.sink_token_length + assert config.free_gpu_memory_fraction == config_copy.free_gpu_memory_fraction + assert config.host_cache_size == config_copy.host_cache_size + assert config.onboard_blocks == config_copy.onboard_blocks + + +def test_peft_cache_config_pickle(): + config = trtllm.PeftCacheConfig(1, 2, 3, 4, 5, 6, 7, 8, 9, 0.9, 1024) + config_copy = pickle.loads(pickle.dumps(config)) + assert config.num_host_module_layer == config_copy.num_host_module_layer + assert config.num_device_module_layer == config_copy.num_device_module_layer + assert config.optimal_adapter_size == config_copy.optimal_adapter_size + assert config.max_adapter_size == config_copy.max_adapter_size + assert config.num_put_workers == config_copy.num_put_workers + assert config.num_ensure_workers == config_copy.num_ensure_workers + assert config.num_copy_streams == config_copy.num_copy_streams + assert config.max_pages_per_block_host == config_copy.max_pages_per_block_host + assert config.max_pages_per_block_device == config_copy.max_pages_per_block_device + assert config.device_cache_percent == config_copy.device_cache_percent + assert config.host_cache_size == config_copy.host_cache_size + + +def test_executor_config_pickle(): + beam_width = 2 + config = trtllm.ExecutorConfig(beam_width) + config.scheduler_config = trtllm.SchedulerConfig() + config.kv_cache_config = trtllm.KvCacheConfig() + config.parallel_config = trtllm.ParallelConfig() + config.peft_cache_config = trtllm.PeftCacheConfig(1) + pickle.dumps(config) + config_copy = pickle.loads(pickle.dumps(config)) + assert config.max_beam_width == config_copy.max_beam_width + assert config.scheduler_config.capacity_scheduler_policy == config_copy.scheduler_config.capacity_scheduler_policy + assert config.kv_cache_config.enable_block_reuse == config_copy.kv_cache_config.enable_block_reuse diff --git a/tests/bindings/test_gpt_manager.py b/tests/bindings/test_gpt_manager.py index 9f53d031b..e50b8eefc 100644 --- a/tests/bindings/test_gpt_manager.py +++ b/tests/bindings/test_gpt_manager.py @@ -320,12 +320,9 @@ def stats_cb(stats_json: str): remaining_requests = len(inference_request_list) opt_params = _tb.TrtGptModelOptionalParams() opt_params.max_beam_width = 1 - with _tb.GptManager( - model_path, _tb.TrtGptModelType.InflightBatching, - _tb.executor.SchedulerConfig( - _tb.executor.CapacitySchedulerPolicy.MAX_UTILIZATION), - fetch_requests, response_cb, should_stop, stats_cb, opt_params, - 10000): + with _tb.GptManager(model_path, _tb.TrtGptModelType.InflightBatching, + fetch_requests, response_cb, should_stop, stats_cb, + opt_params, 10000): while remaining_requests > 0: _time.sleep(0.1) diff --git a/tests/bindings/test_gpt_session.py b/tests/bindings/test_gpt_session.py deleted file mode 100644 index eeb099d09..000000000 --- a/tests/bindings/test_gpt_session.py +++ /dev/null @@ -1,161 +0,0 @@ -import os as _os -import pathlib as _pl -import sys as 
_sys - -import numpy as _np -import pytest -import torch as _tor -from binding_test_utils import * - -import tensorrt_llm.bindings as _tb - -_sys.path.append(_os.path.join(_os.path.dirname(__file__), '..')) -from utils.cpp_paths import * -from utils.llm_data import llm_models_root -from utils.util import skip_pre_ampere - - -@pytest.mark.parametrize( - "variant, results_file, load_bytearray", - [ - ("fp32-default", "output_tokens_fp32_tp1_pp1.npy", True), - ("fp32-plugin", "output_tokens_fp32_plugin_tp1_pp1.npy", False), - ("fp16-default", "output_tokens_fp16_tp1_pp1.npy", True), - ("fp16-plugin", "output_tokens_fp16_plugin_tp1_pp1.npy", False), - # ("fp16-plugin-packed", "output_tokens_fp16_plugin_packed_tp1_pp1.npy"), - # ("fp16-plugin-packed-paged", "output_tokens_fp16_plugin_packed_paged_tp1_pp1.npy"), - ]) -@skip_pre_ampere # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture -def test_gpt_session(variant, results_file, load_bytearray, llm_root: _pl.Path, - resource_path: _pl.Path, engine_path: _pl.Path, - data_path: _pl.Path): - model_dir = "gpt2" - tp_size = 1 - pp_size = 1 - beam_width = 1 - max_batch_size = 8 - end_id = 50256 - pad_id = 50256 - repetitions = 2 - - # load input data - input_path = data_path / "input_tokens.npy" - assert input_path.is_file() - given_input = _np.load(input_path).astype("int32") - input_shape = given_input.shape - assert len(input_shape) == 2 - num_given_inputs = input_shape[0] - assert max_batch_size <= num_given_inputs - max_input_length = input_shape[1] - given_input_lengths = sequence_lengths(given_input, pad_id) - assert _np.all(given_input_lengths <= max_input_length) - - # load expected output data - results_path = data_path / model_dir / ( - "sampling" - if beam_width == 1 else f"beam_search_{beam_width}") / results_file - - if not results_path.exists(): - model_cache = llm_models_root() - model_cache_arg = ["--model_cache", str(model_cache) - ] if model_cache is not None else [] - prepare_model_tests(llm_root, resource_path, "gpt", model_cache_arg) - - assert results_path.is_file() - expected_output = _np.load(results_path) - output_shape = expected_output.shape - assert len(output_shape) == 2 - assert num_given_inputs * beam_width == output_shape[0] - max_seq_length = output_shape[1] - assert max_input_length <= max_seq_length - expected_output_lengths = sequence_lengths(expected_output, end_id) - assert _np.all(expected_output_lengths <= max_seq_length) - - gpu_size_path = f"tp{tp_size}-pp{pp_size}-gpu" - model_path = engine_path / model_dir / variant / gpu_size_path - assert model_path.is_dir() - config_path = model_path / "config.json" - config_json = _tb.GptJsonConfig.parse_file(config_path) - assert config_json.tensor_parallelism == tp_size - assert config_json.pipeline_parallelism == pp_size - world_config = _tb.WorldConfig.mpi(tensor_parallelism=tp_size, - pipeline_parallelism=pp_size) - engine_filename = config_json.engine_filename(world_config) - assert (model_path / engine_filename).is_file() - session_config = _tb.GptSessionConfig(max_batch_size, beam_width, - max_seq_length) - - model_config = config_json.model_config - full_engine_path = str(model_path / engine_filename) - if load_bytearray: - with open(full_engine_path, "rb") as f: - engine_data = bytearray(f.read()) - session = _tb.GptSession(session_config, model_config, world_config, - engine_data) - else: - session = _tb.GptSession(session_config, model_config, world_config, - full_engine_path) - - assert isinstance(session, _tb.GptSession) - assert 
isinstance(session.model_config, _tb.ModelConfig) - assert isinstance(session.world_config, _tb.WorldConfig) - assert session.device == world_config.device - cuda_device = _tor.device("cuda", world_config.device) - - max_new_tokens = max_seq_length - max_input_length - sampling_config = _tb.SamplingConfig(beam_width) - sampling_config.temperature = [1.0] - sampling_config.min_length = [1] - sampling_config.random_seed = [42] - sampling_config.top_k = [0] - sampling_config.top_p = [0.0] - - packed_input = model_config.use_packed_input - assert not packed_input - input_ids = _tor.from_numpy( - given_input[:max_batch_size, :max_input_length]).to(cuda_device) - assert input_ids.dtype == _tor.int32 - input_lengths = _tor.from_numpy( - given_input_lengths[:max_batch_size]).to(cuda_device) - assert input_lengths.dtype == _tor.int32 - generation_input = _tb.GenerationInput(end_id, pad_id, input_ids, - input_lengths, packed_input) - generation_input.max_new_tokens = max_new_tokens - - for r in range(repetitions): - output_ids = _tor.empty(0, dtype=_tor.int32, device=cuda_device) - output_lengths = _tor.empty(0, dtype=_tor.int32, device=cuda_device) - generation_output = _tb.GenerationOutput(output_ids, output_lengths) - num_steps = 0 - - def on_token_generated(ids, step, finished): - assert ids.shape == (max_batch_size, 1, max_seq_length) - nonlocal num_steps - assert step == num_steps - num_steps += 1 - # check that we only finish after producing `maxNewTokens` tokens - assert not finished or num_steps == max_new_tokens - # check that `finished` is set to true after producing `maxNewTokens` tokens - assert num_steps != max_new_tokens or finished - - generation_output.on_token_generated = on_token_generated - - session.generate(generation_output, generation_input, sampling_config) - observed_output = output_ids.squeeze().cpu().numpy() - assert observed_output.shape == (max_batch_size, max_seq_length) - observed_output_lengths = output_lengths.squeeze().cpu().numpy() - assert _np.all(observed_output_lengths <= max_seq_length) - - for batch_idx in range(max_batch_size): - expected_length = expected_output_lengths[batch_idx] - observed_length = observed_output_lengths[batch_idx] - assert expected_length == observed_length, (batch_idx, - expected_length, - observed_length) - expected = expected_output[batch_idx, :expected_length] - observed = observed_output[batch_idx, :expected_length] - unmatched = expected != observed - if _np.any(unmatched): - assert False, (batch_idx, _np.where(unmatched), - _np.column_stack( - (expected, observed))[unmatched]) diff --git a/tests/functional/test_arange.py b/tests/functional/test_arange.py index 019c9132a..cac5766f2 100644 --- a/tests/functional/test_arange.py +++ b/tests/functional/test_arange.py @@ -15,9 +15,11 @@ import os import sys import unittest +from itertools import product import numpy as np import torch +from parameterized import parameterized import tensorrt_llm @@ -53,36 +55,47 @@ def test_arange_int(self): ref = torch.arange(start, end).int().cuda() torch.testing.assert_close(outputs['output'], ref) - def test_arange_tensor(self): + @parameterized.expand( + list( + product(['int32', 'int64'], ['int32', 'int64'], + ['int32', 'int64', 'float32', 'float16']))) + def test_arange_tensor(self, + s_dtype='int32', + e_dtype='int32', + r_dtype='int32'): # test data s = 0 e = 128 - dtype = 'int32' + s_np_dtype = tensorrt_llm._utils.str_dtype_to_np(s_dtype) + e_np_dtype = tensorrt_llm._utils.str_dtype_to_np(e_dtype) # construct trt network builder = 
tensorrt_llm.Builder() network = builder.create_network() with tensorrt_llm.net_guard(network): - start = tensorrt_llm.functional.constant(np.array(s, - dtype=np.int32)) - end_tensor = tensorrt_llm.functional.constant( - np.array([0] * e, dtype=np.int32)) + start = tensorrt_llm.functional.constant( + np.array(s, dtype=s_np_dtype)) + end = tensorrt_llm.functional.constant( + np.array([e], dtype=e_np_dtype)) - output = tensorrt_llm.functional.arange( - start=start, - end=tensorrt_llm.functional.shape(end_tensor, 0), - dtype=dtype) + output = tensorrt_llm.functional.arange(start=start, + end=end, + dtype=r_dtype) - output.mark_output('output', dtype) + output.mark_output('output', r_dtype) # trt run inputs = {} - session = create_session(builder, network, precision="float32") + session = create_session( + builder, + network, + precision="float32" if r_dtype != 'float16' else 'float16') outputs = run_session(session, inputs) # pytorch run - ref = torch.arange(s, e).int().cuda() + ref = torch.arange( + s, e, dtype=tensorrt_llm.str_dtype_to_torch(r_dtype)).cuda() # compare diff torch.testing.assert_close(outputs['output'], ref) diff --git a/tests/functional/test_gather_nd.py b/tests/functional/test_gather_nd.py index 1cf479650..0b20030fd 100644 --- a/tests/functional/test_gather_nd.py +++ b/tests/functional/test_gather_nd.py @@ -168,7 +168,7 @@ def test_gatherND_b0(self, data, indices, ref): def test_gatherND_selectH(self): dtype = "float32" - # This usecase is used to gather in ReDrafter for validated end-tokens (diff stopping point for diff seqs) + # This usecase is used to gather for validated end-tokens (diff stopping point for diff seqs) data = torch.rand((2, 9, 4), dtype=torch.float32, device="cuda") indices = torch.randint(9, size=(2, ), dtype=torch.int32, device="cuda") indices = torch.stack( diff --git a/tests/hlapi/hlapi_evaluator.py b/tests/hlapi/hlapi_evaluator.py index f93f54d47..ca3dda8d9 100644 --- a/tests/hlapi/hlapi_evaluator.py +++ b/tests/hlapi/hlapi_evaluator.py @@ -49,7 +49,7 @@ def benchmark_main(model_path: str, warmup: int = 100, max_num_tokens=2048, max_input_length: int = 200, - max_output_length: int = 200, + max_seq_length: int = 400, max_batch_size: int = 128, engine_output_dir: str = "", cpp_executable: str = None, @@ -83,7 +83,7 @@ def run_hlapi(): build_config = config.build_config build_config.max_num_tokens = max_num_tokens build_config.max_input_len = max_input_length - build_config.max_output_len = max_output_length + build_config.max_seq_len = max_seq_length build_config.max_batch_size = max_batch_size config.parallel_config.tp_size = tp_size @@ -149,7 +149,7 @@ def run_gpt_manager_benchmark(): default=1e8, help="Specify the first N cases to test") @click.option("--max-input-len", type=int, default=1024) -@click.option("--max-output-len", type=int, default=1024) +@click.option("--max-seq-len", type=int, default=2048) @click.option("--max-num-tokens", type=int, default=4096) @click.option("--tp-size", type=int, default=1) @click.option("--num-samples", type=int, default=200) @@ -158,7 +158,7 @@ def grid_searcher_main(model_path, reports_root, prune_space_for_debug: int, max_input_len: int, - max_output_len: int, + max_seq_len: int, max_num_tokens: int, tp_size: int = 1, num_samples: int = 200): @@ -169,8 +169,8 @@ def grid_searcher_main(model_path, model_config = ModelConfig(model_path) model_config.parallel_config.tp_size = tp_size - model_config._set_additional_options(max_output_len=max_input_len, - max_input_len=max_output_len, + 
model_config._set_additional_options(max_seq_len=max_seq_len, + max_input_len=max_input_len, max_num_tokens=max_num_tokens) grid_searcher.evaluate( diff --git a/tests/hlapi/run_llm.py b/tests/hlapi/run_llm.py index 9abadd6a3..1cb20ecbc 100644 --- a/tests/hlapi/run_llm.py +++ b/tests/hlapi/run_llm.py @@ -10,7 +10,8 @@ @click.option("--model_dir", type=str, required=True) @click.option("--tp_size", type=int, required=True) @click.option("--engine_dir", type=str, default=None) -def main(model_dir: str, tp_size: int, engine_dir: str): +@click.option("--prompt", type=str, default=None) +def main(model_dir: str, tp_size: int, engine_dir: str, prompt: str): config = ModelConfig(model_dir) config.parallel_config.tp_size = tp_size @@ -20,11 +21,18 @@ def main(model_dir: str, tp_size: int, engine_dir: str): engine_dir) != os.path.abspath(model_dir): llm.save(engine_dir) - prompt = [45, 12, 13] sampling_params = SamplingParams(max_new_tokens=10, end_id=-1) - for output in llm.generate([prompt], sampling_params=sampling_params): + + # For intentional failure test, need a simple prompt here to start LLM + prompt_token_ids = [45, 12, 13] + for output in llm.generate([prompt_token_ids], + sampling_params=sampling_params): print(output) + if prompt is not None: + for output in llm.generate([prompt], sampling_params=sampling_params): + print(output) + if __name__ == '__main__': main() diff --git a/tests/hlapi/run_llm_exit.py b/tests/hlapi/run_llm_exit.py new file mode 100644 index 000000000..a097ecb6a --- /dev/null +++ b/tests/hlapi/run_llm_exit.py @@ -0,0 +1,29 @@ +import os +import subprocess +import sys + +import click + + +@click.command() +@click.option("--model_dir", type=str, required=True) +@click.option("--tp_size", type=int, required=True) +def main(model_dir: str, tp_size: int): + run_cmd = [ + sys.executable, + os.path.join(os.path.dirname(__file__), "run_llm.py"), + "--model_dir", + model_dir, + "--tp_size", + str(tp_size), + "--prompt", + "This is an over-long prompt that intentionlly trigger failure. 
" * + 1000, + ] + # Will raise TimeoutExpired exception if timeout + res = subprocess.run(run_cmd, check=False, timeout=600) + assert res.returncode != 0 + + +if __name__ == '__main__': + main() diff --git a/tests/hlapi/test_executor.py b/tests/hlapi/test_executor.py index 9a201c79b..b5cb75f95 100644 --- a/tests/hlapi/test_executor.py +++ b/tests/hlapi/test_executor.py @@ -1,3 +1,5 @@ +import asyncio +import json import os as _os import sys as _sys import unittest @@ -8,7 +10,7 @@ from transformers import AutoTokenizer from tensorrt_llm._utils import mpi_world_size -from tensorrt_llm.bindings import TrtGptModelOptionalParams +from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (GenerationExecutor, GenerationRequest, SamplingParams) from tensorrt_llm.hlapi.llm import LLM, ModelConfig @@ -68,12 +70,11 @@ def test_generation_bs2(llama_7b_bs2_path: Path): tokenizer = llama_7b_bs2_path prompt = "A B C D" max_new_tokens = 8 - executor_config = TrtGptModelOptionalParams() - executor_config.max_beam_width = 2 - with GenerationExecutor.create(llama_7b_bs2_path, - tokenizer, - executor_config=executor_config) as executor: + with GenerationExecutor.create( + llama_7b_bs2_path, + tokenizer, + executor_config=tllm.ExecutorConfig(max_beam_width=2)) as executor: result = executor.generate(prompt, sampling_params=SamplingParams( max_new_tokens=max_new_tokens, @@ -166,8 +167,26 @@ def test_sync_generation_tp_inner(llama_7b_tp2_path: Path): executor = GenerationExecutor.create(llama_7b_tp2_path, llama_7b_tp2_path, model_world_size=tp_size) - result = executor.generate(prompt, sampling_params=sampling_params) - assert result.text == ", neural network," + + async def async_stats_task(): + # asyncio event loop must be created before first generation in order to + # use async APIs. 
+ result = executor.generate(prompt, sampling_params=sampling_params) + assert result.text == ", neural network," + + stats = await executor.aget_stats() + stats = json.loads(stats) + assert stats["iter"] == 0 + assert stats["cpuMemUsage"] > 0 + assert stats["gpuMemUsage"] > 0 + assert stats["inflightBatchingStats"]["numCtxTokens"] == 3 + assert stats["inflightBatchingStats"]["numGenRequests"] == 0 + assert stats["kvCacheStats"]["usedNumBlocks"] == 1 + + asyncio.run(async_stats_task()) + + stats = executor.get_stats() + assert json.loads(stats)["iter"] == 1 executor.shutdown() diff --git a/tests/hlapi/test_llm.py b/tests/hlapi/test_llm.py index 88cd254e1..02001f7ae 100644 --- a/tests/hlapi/test_llm.py +++ b/tests/hlapi/test_llm.py @@ -52,7 +52,7 @@ def test_ModelConfig_build_config(): config.build_config.builder_opt = 3 config.build_config.max_num_tokens = 888 config.build_config.strongly_typed = True - config.build_config.max_output_len = 333 + config.build_config.max_seq_len = 1024 llm = LLM(config, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4)) @@ -74,7 +74,7 @@ def test_ModelConfig_build_config(): assert build_config.builder_opt == config.build_config.builder_opt assert build_config.max_num_tokens == config.build_config.max_num_tokens assert build_config.strongly_typed == config.build_config.strongly_typed - assert build_config.max_output_len == config.build_config.max_output_len + assert build_config.max_seq_len == config.build_config.max_seq_len def test_llm_loading_from_hf(): @@ -476,6 +476,55 @@ def test_generate_with_bad_words(): assert output.text == "D E F G H J" +@force_ampere +def test_generate_with_embedding_bias(): + config = ModelConfig(llama_model_path) + llm = LLM( + config, + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), + ) + + sampling_params = SamplingParams(max_new_tokens=6) + + tokenizer = AutoTokenizer.from_pretrained(llama_model_path, + add_prefix_space=False) + biased_word_id = tokenizer(["Z"]).input_ids[0][1] + + vocab_size_padded = 32000 + embedding_bias = torch.zeros(vocab_size_padded) + embedding_bias[biased_word_id] = torch.finfo(torch.float32).max + sampling_params.embedding_bias = embedding_bias + + for output in llm.generate(prompts, sampling_params=sampling_params): + print(output) + assert output.text == "Z Z Z Z Z Z" + + +@force_ampere +def test_generate_with_logits_post_processor(): + tokenizer = AutoTokenizer.from_pretrained(llama_model_path, + add_prefix_space=False) + biased_word_id = tokenizer(["Z"]).input_ids[0][1] + + def logits_post_processor(req_id: int, logits: torch.Tensor, + ids: List[List[int]], stream_ptr: int): + with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)): + logits[:] = float("-inf") + logits[..., biased_word_id] = 0 + + config = ModelConfig(llama_model_path) + llm = LLM(config, + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), + logits_post_processor_map={"my_logits_pp": logits_post_processor}) + + sampling_params = SamplingParams(max_new_tokens=6) + sampling_params.logits_post_processor_name = "my_logits_pp" + + for output in llm.generate(prompts, sampling_params=sampling_params): + print(output) + assert output.text == "Z Z Z Z Z Z" + + @force_ampere def test_generate_block_reuse(): config = ModelConfig(llama_model_path) diff --git a/tests/hlapi/test_llm_multi_gpu.py b/tests/hlapi/test_llm_multi_gpu.py index 13ae6c2f9..ad289a453 100644 --- a/tests/hlapi/test_llm_multi_gpu.py +++ b/tests/hlapi/test_llm_multi_gpu.py @@ -92,13 +92,17 @@ def 
test_llm_generate_tp2(engine_from_checkpoint): print(output) -# TODO[yuxianq]: Enable auto_parallel after fixing the issue -#@pytest.mark.parametrize("use_auto_parallel", [True, False], ids=[ "enable_auto_parallel", "disable_auto_parallel"]) +@pytest.mark.parametrize("use_auto_parallel", [True, False], + ids=["enable_auto_parallel", "disable_auto_parallel"]) +@pytest.mark.parametrize("from_ckpt", [True, False], + ids=["from_ckpt", "from_hf"]) @skip_single_gpu def test_llm_generate_async_tp2( engine_from_checkpoint: tempfile.TemporaryDirectory, - use_auto_parallel=False): - model_dir = engine_from_checkpoint.name if not use_auto_parallel else get_model_path( + use_auto_parallel: bool, from_ckpt: bool): + if use_auto_parallel and from_ckpt: + pytest.skip("Skip auto parallel for TP2 checkpoint") + model_dir = engine_from_checkpoint.name if from_ckpt else get_model_path( llama_model_path) tokenizer_dir = get_model_path(llama_model_path) tokenizer = TransformersTokenizer.from_pretrained(tokenizer_dir) diff --git a/tests/hlapi/test_llm_quant.py b/tests/hlapi/test_llm_quant.py index b4deddda3..6e7014dcb 100644 --- a/tests/hlapi/test_llm_quant.py +++ b/tests/hlapi/test_llm_quant.py @@ -32,10 +32,6 @@ def test_llm_fp8_quantization(): config = ModelConfig(llama_model_path) config.quant_config.quant_algo = QuantAlgo.FP8 config.quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - config.quant_config.exclude_modules = [ - 'lm_head', 'router', 'vocab_embedding', 'position_embedding', - 'block_embedding' - ] assert config.quant_config.quant_mode.has_any_quant() diff --git a/tests/model/test_arctic.py b/tests/model/test_arctic.py index c78f665b5..5dfd0f026 100644 --- a/tests/model/test_arctic.py +++ b/tests/model/test_arctic.py @@ -71,6 +71,8 @@ def _gen_tensorrt_llm_network(self, network, hf_mistral, 'mapping': { 'world_size': tensor_parallel, 'tp_size': tensor_parallel, + 'moe_tp_size': 1, + 'moe_ep_size': tensor_parallel, 'rank': rank, }, 'use_parallel_embedding': False, @@ -78,7 +80,6 @@ def _gen_tensorrt_llm_network(self, network, hf_mistral, 'moe': { 'num_experts': 0, 'top_k': 0, - 'tp_mode': 1, 'normalization_mode': 1, }, 'use_fused_mlp': False, diff --git a/tests/model/test_falcon.py b/tests/model/test_falcon.py index efb585cf7..edcc1778f 100644 --- a/tests/model/test_falcon.py +++ b/tests/model/test_falcon.py @@ -211,10 +211,10 @@ def load_test_cases(): ContextFMHAType.disabled, 'float16'), ('MQA', False, True, False, False, True, True, False, ContextFMHAType.disabled, 'float32'), - # TC for Falcon-40B arch: GQA + RoPE + new_decoder_architecture - ('GQA', False, False, True, False, True, True, False, + # TC for Falcon-40B arch: GQA + RoPE + parallel_attention + new_decoder_architecture + ('GQA', False, True, True, False, True, True, False, ContextFMHAType.disabled, 'float16'), - ('GQA', False, False, True, False, True, True, False, + ('GQA', False, True, True, False, True, True, False, ContextFMHAType.disabled, 'float32'), ] return test_cases diff --git a/tests/model/test_llama.py b/tests/model/test_llama.py index f22304c9f..ec0e79ce2 100644 --- a/tests/model/test_llama.py +++ b/tests/model/test_llama.py @@ -78,8 +78,6 @@ def _gen_tensorrt_llm_network(self, network, hf_llama, }, "moe": { "num_experts": 0, - "top_k": 0, - "tp_mode": 2, "normalization_mode": 1 }, 'use_parallel_embedding': False, @@ -537,7 +535,6 @@ def print_layers(m: tensorrt_llm.models.LLaMAForCausalLM): "moe": { "num_experts": 0, "top_k": 0, - "tp_mode": 1, "normalization_mode": 1, }, 'use_parallel_embedding': 
use_parallel_embedding, diff --git a/tests/model/test_mistral.py b/tests/model/test_mistral.py index cc20c1251..134827ed9 100644 --- a/tests/model/test_mistral.py +++ b/tests/model/test_mistral.py @@ -82,7 +82,6 @@ def _gen_tensorrt_llm_network(self, network, hf_mistral, "moe": { "num_experts": 0, "top_k": 0, - "tp_mode": 1, "normalization_mode": 1, }, 'use_fused_mlp': False, @@ -493,7 +492,6 @@ def print_layers(m: tensorrt_llm.models.LLaMAForCausalLM): "moe": { "num_experts": 0, "top_k": 0, - "tp_mode": 1, "normalization_mode": 1 }, 'use_parallel_embedding': use_parallel_embedding, diff --git a/tests/model_api/test_model_api_multi_gpu.py b/tests/model_api/test_model_api_multi_gpu.py index d3e7f5cd8..b14e7c780 100644 --- a/tests/model_api/test_model_api_multi_gpu.py +++ b/tests/model_api/test_model_api_multi_gpu.py @@ -93,7 +93,7 @@ def build_and_run_tp2(rank, model_name, engine_dir, use_auto_parallel): llama, BuildConfig(max_batch_size=max_batch_size, max_input_len=max_isl, - max_output_len=max_osl, + max_seq_len=max_osl + max_isl, strongly_typed=True, auto_parallel_config=auto_parallel_config)) engine.save(engine_dir) diff --git a/tests/model_api/test_model_level_api.py b/tests/model_api/test_model_level_api.py index 5149de0e8..9fbf06772 100644 --- a/tests/model_api/test_model_level_api.py +++ b/tests/model_api/test_model_level_api.py @@ -58,7 +58,7 @@ def test_save_load(): llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir, 'float16') build_config = BuildConfig(max_batch_size=max_batch_size, max_input_len=max_isl, - max_output_len=max_osl, + max_seq_len=max_osl + max_isl, plugin_config=llama.default_plugin_config()) build_config.plugin_config.gemm_plugin = 'float16' # faster build engine = build(llama, build_config) @@ -95,7 +95,7 @@ def test_high_level_fake_weights(): llama = LLaMAForCausalLM(config) build_config = BuildConfig(max_batch_size=max_batch_size, max_input_len=max_isl, - max_output_len=max_osl, + max_seq_len=max_osl + max_isl, plugin_config=llama.default_plugin_config()) build_config.plugin_config.gemm_plugin = 'float16' # faster build build(llama, build_config) @@ -110,7 +110,7 @@ def test_inflight_batching(): llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir, 'float16') build_config = BuildConfig(max_batch_size=max_batch_size, max_input_len=max_isl, - max_output_len=max_osl) + max_seq_len=max_osl + max_isl) build_config.plugin_config.gemm_plugin = 'float16' # faster build engine = build(llama, build_config) diff --git a/tests/model_api/test_model_quantization.py b/tests/model_api/test_model_quantization.py index 93bf3e225..bc823707e 100644 --- a/tests/model_api/test_model_quantization.py +++ b/tests/model_api/test_model_quantization.py @@ -40,7 +40,7 @@ def test_int4_awq_quantization(): BuildConfig( max_batch_size=max_batch_size, max_input_len=max_isl, - max_output_len=max_osl, + max_seq_len=max_osl + max_isl, max_num_tokens=max_batch_size * max_isl, )) @@ -70,11 +70,7 @@ def test_fp8_quantization(): tokenizer_dir = hf_model_dir checkpoint_dir = tempfile.TemporaryDirectory("llama-checkpoint").name - quant_config = QuantConfig(QuantAlgo.FP8, - exclude_modules=[ - 'lm_head', 'vocab_embedding', - 'position_embedding', 'block_embedding' - ]) + quant_config = QuantConfig(QuantAlgo.FP8) LLaMAForCausalLM.quantize(hf_model_dir, checkpoint_dir, quant_config=quant_config, @@ -85,7 +81,7 @@ def test_fp8_quantization(): llama, BuildConfig(max_batch_size=max_batch_size, max_input_len=max_isl, - max_output_len=max_osl, + max_seq_len=max_osl + max_isl, 
max_num_tokens=max_batch_size * max_isl, strongly_typed=True)) engine_dir = "llama-fp8-quantized"
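Taken together, the test updates above settle on the same two conventions: builds are sized with an absolute `max_seq_len` (input plus output budget) instead of a separate `max_output_len`, and FP8 quantization relies on the quantizer's default exclusion list rather than enumerating modules per test. A hedged sketch of that usage follows; the import paths and the concrete sizes are assumptions, not copied from the patch.

```python
# Sketch only: build sizing and FP8 quantization config following the updated
# tests. Import paths and sizes are assumptions.
from tensorrt_llm import BuildConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

max_batch_size, max_isl, max_osl = 8, 256, 256

build_config = BuildConfig(
    max_batch_size=max_batch_size,
    max_input_len=max_isl,
    max_seq_len=max_isl + max_osl,            # replaces max_output_len=max_osl
    max_num_tokens=max_batch_size * max_isl,
    strongly_typed=True,
)

# No explicit exclude_modules: the quantizer's defaults ('lm_head', '*router',
# '*vocab_embedding', ...) already keep the sensitive modules unquantized.
quant_config = QuantConfig(QuantAlgo.FP8)
```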