From 0a39f0a8cd6d11c3dee7ad55b39095f09d3ddaa7 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 31 Oct 2024 07:43:19 -0700 Subject: [PATCH] th::optional -> std::optional --- cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp | 144 +++++++++--------- cpp/tensorrt_llm/thop/dynamicDecodeOp.h | 137 +++++++++-------- cpp/tensorrt_llm/thop/gatherTreeOp.cpp | 46 +++--- .../thop/parallelDecodeKVCacheUpdateOp.cpp | 11 +- cpp/tensorrt_llm/thop/redrafterCurandOp.cpp | 14 +- 5 files changed, 178 insertions(+), 174 deletions(-) diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp index ca52d27ae..f58181a31 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp @@ -63,7 +63,7 @@ namespace { template -void safeInsert(th::optional& tensor, std::optional>& arg) +void safeInsert(std::optional& tensor, std::optional>& arg) { using valueType = T; if (tensor.has_value()) @@ -76,7 +76,7 @@ void safeInsert(th::optional& tensor, std::optional>& } template -void safeUpdate(th::optional& tensor, std::optional& arg) +void safeUpdate(std::optional& tensor, std::optional& arg) { if (tensor.has_value()) { @@ -85,7 +85,7 @@ void safeUpdate(th::optional& tensor, std::optional -void safeUpdate(th::optional& tensor, std::optional& arg) +void safeUpdate(std::optional& tensor, std::optional& arg) { if (tensor.has_value()) { @@ -94,7 +94,7 @@ void safeUpdate(th::optional& tensor, std::optional -void safeUpdateScalar(th::optional& tensor, std::optional& arg, std::string const& name) +void safeUpdateScalar(std::optional& tensor, std::optional& arg, std::string const& name) { if (tensor.has_value()) { @@ -105,7 +105,7 @@ void safeUpdateScalar(th::optional& tensor, std::optional& arg, s } template -void safeUpdatePtr(th::optional& tensor, T*& ptr) +void safeUpdatePtr(std::optional& tensor, T*& ptr) { if (tensor.has_value()) { @@ -117,14 +117,14 @@ void safeUpdatePtr(th::optional& tensor, T*& ptr) template void FtDynamicDecode::setup(size_t const batch_size, size_t const beam_width, - th::optional runtime_top_k_opt, th::optional runtime_top_p_opt, - th::optional temperature_opt, th::optional repetition_penalty_opt, - th::optional presence_penalty_opt, th::optional frequency_penalty_opt, - th::optional min_length_opt, th::optional length_penalty_opt, - th::optional early_stopping_opt, th::optional beam_search_diversity_rate_opt, - th::optional random_seed_opt, th::optional top_p_decay_opt, - th::optional top_p_min_opt, th::optional top_p_reset_ids_opt, - th::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs) + std::optional runtime_top_k_opt, std::optional runtime_top_p_opt, + std::optional temperature_opt, std::optional repetition_penalty_opt, + std::optional presence_penalty_opt, std::optional frequency_penalty_opt, + std::optional min_length_opt, std::optional length_penalty_opt, + std::optional early_stopping_opt, std::optional beam_search_diversity_rate_opt, + std::optional random_seed_opt, std::optional top_p_decay_opt, + std::optional top_p_min_opt, std::optional top_p_reset_ids_opt, + std::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); mBeamWidth = beam_width; @@ -179,20 +179,20 @@ void FtDynamicDecode::setup(size_t const batch_size, size_t const beam_width, template void FtDynamicDecode::forward(th::Tensor const& logits, int const step, int const maxInputLength, int const maxAttentionWindow, int const sinkTokenLength, uint64_t const ite, int const localBatchSize, - th::Tensor endId, th::optional embeddingBiasOpt, th::optional inputLengthsOpt, - th::optional sequenceLimitLengthOpt, th::optional stopWordsListPtrsOpt, - th::optional stopWordsLensOpt, int32_t const maxStopWordsLen, - th::optional badWordsListPtrsOpt, th::optional badWordsLensOpt, - int32_t const maxBadWordsLen, th::optional srcCacheIndirectionOpt, th::Tensor& outputTokenIds, - th::Tensor& newTokens, th::Tensor& shouldStop, th::optional finishedInput, - th::optional finishedOutput, th::optional sequenceLengthsOpt, - th::optional cumLogProbsOpt, th::optional outputLogProbsOpt, - th::optional outputLogProbsTiledOpt, th::optional parentIdsOpt, - th::optional tgtCacheIndirectionOpt, th::optional beamHypsOutputIdsCbaOpt, - th::optional beamHypsSeqLenCbaOpt, th::optional beamHypsCumLogProbsCbaOpt, - th::optional beamHypsNormedScoresCbaOpt, th::optional beamHypsLogProbsCbaOpt, - th::optional beamHypsMinNormedScoresOpt, th::optional beamHypsNumBeamsOpt, - th::optional beamHypsIsDoneOpt, bool const useBeamHyps) + th::Tensor endId, std::optional embeddingBiasOpt, std::optional inputLengthsOpt, + std::optional sequenceLimitLengthOpt, std::optional stopWordsListPtrsOpt, + std::optional stopWordsLensOpt, int32_t const maxStopWordsLen, + std::optional badWordsListPtrsOpt, std::optional badWordsLensOpt, + int32_t const maxBadWordsLen, std::optional srcCacheIndirectionOpt, th::Tensor& outputTokenIds, + th::Tensor& newTokens, th::Tensor& shouldStop, std::optional finishedInput, + std::optional finishedOutput, std::optional sequenceLengthsOpt, + std::optional cumLogProbsOpt, std::optional outputLogProbsOpt, + std::optional outputLogProbsTiledOpt, std::optional parentIdsOpt, + std::optional tgtCacheIndirectionOpt, std::optional beamHypsOutputIdsCbaOpt, + std::optional beamHypsSeqLenCbaOpt, std::optional beamHypsCumLogProbsCbaOpt, + std::optional beamHypsNormedScoresCbaOpt, std::optional beamHypsLogProbsCbaOpt, + std::optional beamHypsMinNormedScoresOpt, std::optional beamHypsNumBeamsOpt, + std::optional beamHypsIsDoneOpt, bool const useBeamHyps) { TLLM_CHECK_WITH_INFO(mBeamWidth.has_value(), "Beam width is not set. setup() must be called before forward()"); auto const isBeamSearch = mBeamWidth.value() > 1; @@ -325,14 +325,14 @@ void DynamicDecodeOp::createInstance() } } -void DynamicDecodeOp::setup(int64_t const batchSize, int64_t const beamWidth, th::optional runtimeTopKOpt, - th::optional runtimeTopPOpt, th::optional temperatureOpt, - th::optional repetitionPenaltyOpt, th::optional presencePenaltyOpt, - th::optional frequencyPenaltyOpt, th::optional minLengthOpt, - th::optional lengthPenaltyOpt, th::optional earlyStoppingOpt, - th::optional beamSearchDiversityRateOpt, th::optional randomSeedOpt, - th::optional topPDecayOpt, th::optional topPMinOpt, - th::optional topPResetIdsOpt, th::optional noRepeatNgramSizeOpt, bool outputLogProbs, +void DynamicDecodeOp::setup(int64_t const batchSize, int64_t const beamWidth, std::optional runtimeTopKOpt, + std::optional runtimeTopPOpt, std::optional temperatureOpt, + std::optional repetitionPenaltyOpt, std::optional presencePenaltyOpt, + std::optional frequencyPenaltyOpt, std::optional minLengthOpt, + std::optional lengthPenaltyOpt, std::optional earlyStoppingOpt, + std::optional beamSearchDiversityRateOpt, std::optional randomSeedOpt, + std::optional topPDecayOpt, std::optional topPMinOpt, + std::optional topPResetIdsOpt, std::optional noRepeatNgramSizeOpt, bool outputLogProbs, bool cumLogProbs) { // TODO: Revise DynamicDecodeLayer and make the decode arguments consistent. @@ -361,44 +361,44 @@ void DynamicDecodeOp::setup(int64_t const batchSize, int64_t const beamWidth, th th::Tensor DynamicDecodeOp::forward( // Inputs BS: batchSize, BM: beamWidth, MSL: maxSeqLength, V: vocabSize, VP: vocabSizePadded - th::Tensor const& logits, // [BS, BM, VP], T, variables for input - int64_t const step, // - int64_t const maxInputLength, // - int64_t const maxAttentionWindow, // - int64_t const sinkTokenLength, // - int64_t const ite, // - int64_t const localBatchSize, // - th::Tensor const endId, // [BS*BM], int - th::optional embeddingBiasOpt, // [VP], T - th::optional inputLengthsOpt, // [BS*BM], int, length of input contexts - th::optional sequenceLimitLengthOpt, // [BS, 1], int - th::optional stopWordsListPtrsOpt, // [BS][2, stopWordsLength], int64 - th::optional stopWordsLensOpt, // [BS], int - int64_t const maxStopWordsLen, // - th::optional badWordsListPtrsOpt, // [BS][2, badWordsLength], int64 - th::optional badWordsLensOpt, // [BS], int - int64_t const maxBadWordsLen, // - th::optional srcCacheIndirectionOpt, // [localBS, BM, MSL], int + th::Tensor const& logits, // [BS, BM, VP], T, variables for input + int64_t const step, // + int64_t const maxInputLength, // + int64_t const maxAttentionWindow, // + int64_t const sinkTokenLength, // + int64_t const ite, // + int64_t const localBatchSize, // + th::Tensor const endId, // [BS*BM], int + std::optional embeddingBiasOpt, // [VP], T + std::optional inputLengthsOpt, // [BS*BM], int, length of input contexts + std::optional sequenceLimitLengthOpt, // [BS, 1], int + std::optional stopWordsListPtrsOpt, // [BS][2, stopWordsLength], int64 + std::optional stopWordsLensOpt, // [BS], int + int64_t const maxStopWordsLen, // + std::optional badWordsListPtrsOpt, // [BS][2, badWordsLength], int64 + std::optional badWordsLensOpt, // [BS], int + int64_t const maxBadWordsLen, // + std::optional srcCacheIndirectionOpt, // [localBS, BM, MSL], int // Outputs - th::Tensor outputTokenIds, // [BS, BM, MSL], variables for output - th::Tensor newTokens, // [BS, BM, 1], int - th::optional finishedInput, // [BS, BM], uint8 - th::optional finishedOutput, // [BS, BM], uint8 - th::optional sequenceLengthsOpt, // [BS*BM], int, length of the current sequences - th::optional cumLogProbsOpt, // [BS, BM], float - th::optional outputLogProbsOpt, // [BS, BM, MSL], float - th::optional outputLogProbsTiledOpt, // [MSL, BS, BM], float, transpose of outputLogProbsOpt - th::optional parentIdsOpt, // [BS, BM, MSL], int - th::optional tgtCacheIndirectionOpt, // [localBS, BM, MSL], int - th::optional beamHypsOutputIdsCbaOpt, // [BS, BM*2, MSL], int - th::optional beamHypsSeqLenCbaOpt, // [BS, BM*2], int - th::optional beamHypsCumLogProbsCbaOpt, // [BS, BM*2], float - th::optional beamHypsNormedScoresCbaOpt, // [BS, BM*2], float - th::optional beamHypsLogProbsCbaOpt, // [BS, BM*2, MSL], float - th::optional beamHypsMinNormedScoresOpt, // [BS], float - th::optional beamHypsNumBeamsOpt, // [BS], int - th::optional beamHypsIsDoneOpt, // [BS], bool - bool const useBeamHyps // + th::Tensor outputTokenIds, // [BS, BM, MSL], variables for output + th::Tensor newTokens, // [BS, BM, 1], int + std::optional finishedInput, // [BS, BM], uint8 + std::optional finishedOutput, // [BS, BM], uint8 + std::optional sequenceLengthsOpt, // [BS*BM], int, length of the current sequences + std::optional cumLogProbsOpt, // [BS, BM], float + std::optional outputLogProbsOpt, // [BS, BM, MSL], float + std::optional outputLogProbsTiledOpt, // [MSL, BS, BM], float, transpose of outputLogProbsOpt + std::optional parentIdsOpt, // [BS, BM, MSL], int + std::optional tgtCacheIndirectionOpt, // [localBS, BM, MSL], int + std::optional beamHypsOutputIdsCbaOpt, // [BS, BM*2, MSL], int + std::optional beamHypsSeqLenCbaOpt, // [BS, BM*2], int + std::optional beamHypsCumLogProbsCbaOpt, // [BS, BM*2], float + std::optional beamHypsNormedScoresCbaOpt, // [BS, BM*2], float + std::optional beamHypsLogProbsCbaOpt, // [BS, BM*2, MSL], float + std::optional beamHypsMinNormedScoresOpt, // [BS], float + std::optional beamHypsNumBeamsOpt, // [BS], int + std::optional beamHypsIsDoneOpt, // [BS], bool + bool const useBeamHyps // ) { CHECK_INPUT(logits, scalarType_); diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h index b6fa511d0..350dbfb98 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h @@ -27,34 +27,35 @@ namespace torch_ext class IFtDynamicDecode { public: - virtual void setup(size_t const batch_size, size_t const beam_width, th::optional runtime_top_k_opt, - th::optional runtime_top_p_opt, th::optional temperature_opt, - th::optional repetition_penalty_opt, th::optional presence_penalty_opt, - th::optional frequency_penalty_opt, th::optional min_length_opt, - th::optional length_penalty_opt, th::optional early_stopping_opt, - th::optional beam_search_diversity_rate_opt, th::optional random_seed_opt, - th::optional top_p_decay_opt, th::optional top_p_min_opt, - th::optional top_p_reset_ids_opt, th::optional no_repeat_ngram_size_opt, + virtual void setup(size_t const batch_size, size_t const beam_width, std::optional runtime_top_k_opt, + std::optional runtime_top_p_opt, std::optional temperature_opt, + std::optional repetition_penalty_opt, std::optional presence_penalty_opt, + std::optional frequency_penalty_opt, std::optional min_length_opt, + std::optional length_penalty_opt, std::optional early_stopping_opt, + std::optional beam_search_diversity_rate_opt, std::optional random_seed_opt, + std::optional top_p_decay_opt, std::optional top_p_min_opt, + std::optional top_p_reset_ids_opt, std::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs) = 0; virtual void forward(th::Tensor const& logits, int const step, int const max_input_length, int const max_attention_window, int const sink_token_length, uint64_t const ite, int const local_batch_size, - th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, - th::optional sequence_limit_length_opt, th::optional stop_words_list_ptrs_opt, - th::optional stop_words_lens_opt, int32_t const max_stop_words_len, - th::optional bad_words_list_ptrs_opt, th::optional bad_words_lens_opt, - int32_t const max_bad_words_len, th::optional src_cache_indirection_opt, + th::Tensor end_id, std::optional embedding_bias_opt, std::optional input_lengths_opt, + std::optional sequence_limit_length_opt, std::optional stop_words_list_ptrs_opt, + std::optional stop_words_lens_opt, int32_t const max_stop_words_len, + std::optional bad_words_list_ptrs_opt, std::optional bad_words_lens_opt, + int32_t const max_bad_words_len, std::optional src_cache_indirection_opt, th::Tensor& output_token_ids, th::Tensor& newTokens, th::Tensor& should_stop, - th::optional finished_input, th::optional finished_output, - th::optional sequence_lengths_opt, th::optional cum_log_probs_opt, - th::optional output_log_probs_opt, th::optional output_log_probs_tiled_opt, - th::optional parent_ids_opt, th::optional tgt_cache_indirection_opt, - th::optional beam_hyps_output_ids_cba_opt, th::optional beam_hyps_seq_len_cba_opt, - th::optional beam_hyps_cum_log_probs_cba_opt, - th::optional beam_hyps_normed_scores_cba_opt, th::optional beam_hyps_log_probs_cba_opt, - th::optional beam_hyps_min_normed_scores_opt, th::optional beam_hyps_num_beams_opt, - th::optional beam_hyps_is_done_opt, bool const use_beam_hyps) + std::optional finished_input, std::optional finished_output, + std::optional sequence_lengths_opt, std::optional cum_log_probs_opt, + std::optional output_log_probs_opt, std::optional output_log_probs_tiled_opt, + std::optional parent_ids_opt, std::optional tgt_cache_indirection_opt, + std::optional beam_hyps_output_ids_cba_opt, std::optional beam_hyps_seq_len_cba_opt, + std::optional beam_hyps_cum_log_probs_cba_opt, + std::optional beam_hyps_normed_scores_cba_opt, + std::optional beam_hyps_log_probs_cba_opt, + std::optional beam_hyps_min_normed_scores_opt, std::optional beam_hyps_num_beams_opt, + std::optional beam_hyps_is_done_opt, bool const use_beam_hyps) = 0; }; @@ -65,33 +66,34 @@ class FtDynamicDecode : public IFtDynamicDecode FtDynamicDecode(size_t const max_batch_size, size_t const max_beam_width, size_t const vocab_size, size_t const vocab_size_padded, int const tensor_para_size, int const pipeline_para_size); - void setup(size_t const batch_size, size_t const beam_width, th::optional runtime_top_k_opt, - th::optional runtime_top_p_opt, th::optional temperature_opt, - th::optional repetition_penalty_opt, th::optional presence_penalty_opt, - th::optional frequency_penalty_opt, th::optional min_length_opt, - th::optional length_penalty_opt, th::optional early_stopping_opt, - th::optional beam_search_diversity_rate_opt, th::optional random_seed_opt, - th::optional top_p_decay_opt, th::optional top_p_min_opt, - th::optional top_p_reset_ids_opt, th::optional no_repeat_ngram_size_opt, + void setup(size_t const batch_size, size_t const beam_width, std::optional runtime_top_k_opt, + std::optional runtime_top_p_opt, std::optional temperature_opt, + std::optional repetition_penalty_opt, std::optional presence_penalty_opt, + std::optional frequency_penalty_opt, std::optional min_length_opt, + std::optional length_penalty_opt, std::optional early_stopping_opt, + std::optional beam_search_diversity_rate_opt, std::optional random_seed_opt, + std::optional top_p_decay_opt, std::optional top_p_min_opt, + std::optional top_p_reset_ids_opt, std::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs) override; void forward(th::Tensor const& logits, int const step, int const max_input_length, int const max_attention_window, int const sink_token_length, uint64_t const ite, int const local_batch_size, th::Tensor end_id, - th::optional embedding_bias_opt, th::optional input_lengths_opt, - th::optional sequence_limit_length_opt, th::optional stop_words_list_ptrs_opt, - th::optional stop_words_lens_opt, int32_t const max_stop_words_len, - th::optional bad_words_list_ptrs_opt, th::optional bad_words_lens_opt, - int32_t const max_bad_words_len, th::optional src_cache_indirection_opt, + std::optional embedding_bias_opt, std::optional input_lengths_opt, + std::optional sequence_limit_length_opt, std::optional stop_words_list_ptrs_opt, + std::optional stop_words_lens_opt, int32_t const max_stop_words_len, + std::optional bad_words_list_ptrs_opt, std::optional bad_words_lens_opt, + int32_t const max_bad_words_len, std::optional src_cache_indirection_opt, th::Tensor& output_token_ids, th::Tensor& newTokens, th::Tensor& should_stop, - th::optional finished_input, th::optional finished_output, - th::optional sequence_lengths_opt, th::optional cum_log_probs_opt, - th::optional output_log_probs_opt, th::optional output_log_probs_tiled_opt, - th::optional parent_ids_opt, th::optional tgt_cache_indirection_opt, - th::optional beam_hyps_output_ids_cba_opt, th::optional beam_hyps_seq_len_cba_opt, - th::optional beam_hyps_cum_log_probs_cba_opt, - th::optional beam_hyps_normed_scores_cba_opt, th::optional beam_hyps_log_probs_cba_opt, - th::optional beam_hyps_min_normed_scores_opt, th::optional beam_hyps_num_beams_opt, - th::optional beam_hyps_is_done_opt, bool const use_beam_hyps) override; + std::optional finished_input, std::optional finished_output, + std::optional sequence_lengths_opt, std::optional cum_log_probs_opt, + std::optional output_log_probs_opt, std::optional output_log_probs_tiled_opt, + std::optional parent_ids_opt, std::optional tgt_cache_indirection_opt, + std::optional beam_hyps_output_ids_cba_opt, std::optional beam_hyps_seq_len_cba_opt, + std::optional beam_hyps_cum_log_probs_cba_opt, + std::optional beam_hyps_normed_scores_cba_opt, + std::optional beam_hyps_log_probs_cba_opt, + std::optional beam_hyps_min_normed_scores_opt, std::optional beam_hyps_num_beams_opt, + std::optional beam_hyps_is_done_opt, bool const use_beam_hyps) override; private: tensorrt_llm::runtime::ITensor::SharedPtr mFinishedSum; // [batch_size] pinned @@ -108,33 +110,34 @@ class DynamicDecodeOp : public th::jit::CustomClassHolder int64_t const vocab_size_padded, int64_t const tensor_para_size, int64_t const pipeline_para_size, at::ScalarType const scalar_type); - void setup(int64_t const batch_size, int64_t const beam_width, th::optional runtime_top_k_opt, - th::optional runtime_top_p_opt, th::optional temperature_opt, - th::optional repetition_penalty_opt, th::optional presence_penalty_opt, - th::optional frequency_penalty_opt, th::optional min_length_opt, - th::optional length_penalty_opt, th::optional early_stopping_opt, - th::optional beam_search_diversity_rate_opt, th::optional random_seed_opt, - th::optional top_p_decay_opt, th::optional top_p_min_opt, - th::optional top_p_reset_ids_opt, th::optional no_repeat_ngram_size_opt, + void setup(int64_t const batch_size, int64_t const beam_width, std::optional runtime_top_k_opt, + std::optional runtime_top_p_opt, std::optional temperature_opt, + std::optional repetition_penalty_opt, std::optional presence_penalty_opt, + std::optional frequency_penalty_opt, std::optional min_length_opt, + std::optional length_penalty_opt, std::optional early_stopping_opt, + std::optional beam_search_diversity_rate_opt, std::optional random_seed_opt, + std::optional top_p_decay_opt, std::optional top_p_min_opt, + std::optional top_p_reset_ids_opt, std::optional no_repeat_ngram_size_opt, bool output_log_probs, bool cum_log_probs); th::Tensor forward(th::Tensor const& logits, int64_t const step, int64_t const max_input_length, int64_t const max_attention_window, int64_t const sink_token_length, int64_t const ite, - int64_t const local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, - th::optional input_lengths_opt, th::optional sequence_limit_length_opt, - th::optional stop_words_list_ptrs_opt, th::optional stop_words_lens_opt, - int64_t const max_stop_words_len, th::optional bad_words_list_ptrs_opt, - th::optional bad_words_lens_opt, int64_t const max_bad_words_len, - th::optional src_cache_indirection_opt, th::Tensor output_token_ids, th::Tensor newTokens, - th::optional finished_input, th::optional finished_output, - th::optional sequence_lengths_opt, th::optional cum_log_probs_opt, - th::optional output_log_probs_opt, th::optional output_log_probs_tiled_opt, - th::optional parent_ids_opt, th::optional tgt_cache_indirection_opt, - th::optional beam_hyps_output_ids_cba_opt, th::optional beam_hyps_seq_len_cba_opt, - th::optional beam_hyps_cum_log_probs_cba_opt, - th::optional beam_hyps_normed_scores_cba_opt, th::optional beam_hyps_log_probs_cba_opt, - th::optional beam_hyps_min_normed_scores_opt, th::optional beam_hyps_num_beams_opt, - th::optional beam_hyps_is_done_opt, bool const use_beam_hyps); + int64_t const local_batch_size, th::Tensor end_id, std::optional embedding_bias_opt, + std::optional input_lengths_opt, std::optional sequence_limit_length_opt, + std::optional stop_words_list_ptrs_opt, std::optional stop_words_lens_opt, + int64_t const max_stop_words_len, std::optional bad_words_list_ptrs_opt, + std::optional bad_words_lens_opt, int64_t const max_bad_words_len, + std::optional src_cache_indirection_opt, th::Tensor output_token_ids, th::Tensor newTokens, + std::optional finished_input, std::optional finished_output, + std::optional sequence_lengths_opt, std::optional cum_log_probs_opt, + std::optional output_log_probs_opt, std::optional output_log_probs_tiled_opt, + std::optional parent_ids_opt, std::optional tgt_cache_indirection_opt, + std::optional beam_hyps_output_ids_cba_opt, std::optional beam_hyps_seq_len_cba_opt, + std::optional beam_hyps_cum_log_probs_cba_opt, + std::optional beam_hyps_normed_scores_cba_opt, + std::optional beam_hyps_log_probs_cba_opt, + std::optional beam_hyps_min_normed_scores_opt, std::optional beam_hyps_num_beams_opt, + std::optional beam_hyps_is_done_opt, bool const use_beam_hyps); private: // Members initialized in constructor and used in call of createInstance() diff --git a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp index 19115f5b8..46cd6c18a 100644 --- a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp +++ b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp @@ -28,29 +28,29 @@ namespace torch_ext { // Must be similar to [cpp/tensorrt_llm/runtime/gptSession.cpp] GptDecoder::gatherTree -th::Tensor gatherTree( // BS: batch_size, BM: beam_width, MSL: max_seq_length - th::Tensor& sequence_lengths, // [BS*BM], int - th::Tensor& output_ids, // [BS, BM, MSL],int - th::Tensor& parent_ids, // [BS, BM, MSL], int - th::Tensor& end_ids, // [BS*BM], int - th::Tensor& tiled_input_lengths, // [BS*BM], int - th::optional cum_log_probs_opt, // [BS, BM], float - th::optional log_probs_opt, // [BS, BM, MSL], float - th::optional log_probs_tiled_opt, // [MSL, BS, BM], float, transpose of output_log_probs_opt - th::optional beam_hyps_output_ids_cba, // [BS, BM*2, MSL], int - th::optional beam_hyps_seq_len_cba, // [BS, BM*2], int - th::optional beam_hyps_cum_log_probs_cba, // [BS, BM*2], float - th::optional beam_hyps_normed_scores_cba, // [BS, BM*2], float - th::optional beam_hyps_log_probs_cba, // [BS, BM*2, MSL], float - th::optional beam_hyps_min_normed_scores, // [BS], float - th::optional beam_hyps_num_beams, // [BS], int - th::optional beam_hyps_is_done, // [BS], bool - th::optional finished, // [BS, BM], uint8 - th::Tensor& length_penalty, // [BS], float - int64_t const batch_size, // - int64_t const beam_width, // - int64_t const max_seq_len, // - bool const use_beam_hyps // +th::Tensor gatherTree( // BS: batch_size, BM: beam_width, MSL: max_seq_length + th::Tensor& sequence_lengths, // [BS*BM], int + th::Tensor& output_ids, // [BS, BM, MSL],int + th::Tensor& parent_ids, // [BS, BM, MSL], int + th::Tensor& end_ids, // [BS*BM], int + th::Tensor& tiled_input_lengths, // [BS*BM], int + std::optional cum_log_probs_opt, // [BS, BM], float + std::optional log_probs_opt, // [BS, BM, MSL], float + std::optional log_probs_tiled_opt, // [MSL, BS, BM], float, transpose of output_log_probs_opt + std::optional beam_hyps_output_ids_cba, // [BS, BM*2, MSL], int + std::optional beam_hyps_seq_len_cba, // [BS, BM*2], int + std::optional beam_hyps_cum_log_probs_cba, // [BS, BM*2], float + std::optional beam_hyps_normed_scores_cba, // [BS, BM*2], float + std::optional beam_hyps_log_probs_cba, // [BS, BM*2, MSL], float + std::optional beam_hyps_min_normed_scores, // [BS], float + std::optional beam_hyps_num_beams, // [BS], int + std::optional beam_hyps_is_done, // [BS], bool + std::optional finished, // [BS, BM], uint8 + th::Tensor& length_penalty, // [BS], float + int64_t const batch_size, // + int64_t const beam_width, // + int64_t const max_seq_len, // + bool const use_beam_hyps // ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); diff --git a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp index aaa87e8cf..18545e1cf 100644 --- a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp +++ b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp @@ -29,11 +29,12 @@ namespace torch_ext void updateKVCacheDraftTokenLocation(torch::Tensor seqAcceptedDraftTokenOffsetsTensor, torch::Tensor packedAcceptedDraftTokensIndicesTensor, torch::Tensor pastKeyValueLengthsTensor, bool usePagedKVCache, int64_t layerCount, int64_t numKVHeads, int64_t headSizeInBytes, int64_t rewindDraftTokenCount, - int64_t maxKVCacheLen, th::optional rewindDraftTokenTensor, - th::optional> pastKeyValueListOpt = th::nullopt, - th::optional pointerArrayOpt = th::nullopt, th::optional offsetArrayOpt = th::nullopt, - th::optional maxBlocksPerSeqOpt = th::nullopt, th::optional tokensPerBlockOpt = th::nullopt, - th::optional stream_ptr = th::nullopt) + int64_t maxKVCacheLen, std::optional rewindDraftTokenTensor, + std::optional> pastKeyValueListOpt = std::nullopt, + std::optional pointerArrayOpt = std::nullopt, + std::optional offsetArrayOpt = std::nullopt, + std::optional maxBlocksPerSeqOpt = std::nullopt, std::optional tokensPerBlockOpt = std::nullopt, + std::optional stream_ptr = std::nullopt) { TLLM_CHECK_WITH_INFO( at::cuda::is_available(), "update_kv_cache_draft_token_location should be called with cuda enabled."); diff --git a/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp b/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp index 74ed91aea..153f6bc97 100644 --- a/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp +++ b/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp @@ -42,7 +42,7 @@ namespace { // Must be similar to [cpp/tensorrt_llm/runtime/gptSession.cpp] ExplicitDraftTokensLayer::setup void initializeDeviceCurandStates( - uint64_t batchSize, th::Tensor& curandState, th::optional& randomSeeds, cudaStream_t stream) + uint64_t batchSize, th::Tensor& curandState, std::optional& randomSeeds, cudaStream_t stream) { auto* curandStatePtr = get_ptr(curandState); tr::SizeType32* batchSlotsPtr = nullptr; @@ -77,12 +77,12 @@ void initializeDeviceCurandStates( void prepareRandomTensors(th::Tensor& curandState, // [maxBatchSize, 48], uint8_t th::Tensor& randDataSample, // [maxBatchSize], dtype (float or half) - th::Tensor& randDataValidation, // [maxBatchSize, maxNumPaths, maxPathDraftLength], dtype (float or half) - th::optional randomSeeds, // [1] or [maxBatchSize], uint64_t - int64_t const batchSize, // - int64_t const numPaths, // - int64_t const draftLength, // - bool const initialize // + th::Tensor& randDataValidation, // [maxBatchSize, maxNumPaths, maxPathDraftLength], dtype (float or half) + std::optional randomSeeds, // [1] or [maxBatchSize], uint64_t + int64_t const batchSize, // + int64_t const numPaths, // + int64_t const draftLength, // + bool const initialize // ) { auto stream = at::cuda::getCurrentCUDAStream().stream();