Update TensorRT-LLM (#1954)
* Update TensorRT-LLM

---------

Co-authored-by: Altair-Alpha <[email protected]>
kaiyux and Altair-Alpha authored Jul 16, 2024
1 parent a96ccca commit 2d23435
Showing 207 changed files with 7,409 additions and 1,333 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -40,3 +40,11 @@ tensorrt_llm/bindings/*.pyi
# Testing
.coverage.*
results_trt/

# build/debug
*.safetensors
*/tllm_debug/**
*.patch

# Generated files
cpp/include/tensorrt_llm/executor/version.h
22 changes: 0 additions & 22 deletions benchmarks/python/benchmark.py
@@ -177,13 +177,6 @@ def parse_arguments():
'If this option is specified, it will override the max decoder input len of TRT engines to the specified value instead of using pre-defined one'
'By default when this option is not used, it will use pre-defined max decoder input len'
))
parser.add_argument(
'--max_output_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max output len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
@@ -360,21 +353,6 @@ def main(args):
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()

if args.max_output_len:
logger.warning(
'--max_output_len has been deprecated in favor of --max_seq_len')
if args.max_input_len:
if args.max_seq_len:
logger.warning(
'--max_seq_len has been overwritten due to --max_output_len being specified'
)
args.max_seq_len = args.max_input_len + args.max_output_len
else:
raise Exception(
f"--max_output_len is specified but not --max_input_len")

del args.max_output_len

# TODO: Re-enable memory monitor for multi-gpu benchmarks.
# Current Mem Monitor will cause benchmark script hang
# because MPI does not work well with multiprocessing.
7 changes: 0 additions & 7 deletions benchmarks/python/build.py
@@ -129,13 +129,6 @@ def parse_arguments():
help=
('If this option is specified, it will override the max input len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_output_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max output len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
2 changes: 1 addition & 1 deletion benchmarks/suite/README.md
@@ -185,7 +185,7 @@ When the benchmark runs successfully, you will see a report out of the run simil
[RANK 0] Completed request submission.
[RANK 0] Calculating results.
[RANK 0] Reporting...
[RANK 0] JSON: {'benchmark_cmd': '', 'binary': '', 'build_cmd': 'trtllm-build --output_dir /tmp/meta-llama/llama-2-7b-hf --model_config /tmp/generated_config.json --workers 1 --max_batch_size 1024 --max_input_len 128 --max_output_len 128 --max_num_tokens 8000 --context_fmha enable --gpt_attention_plugin float16 --paged_kv_cache enable --multiple_profiles enable --gemm_plugin float16', 'first_token_latency': 0.0, 'inflight_batching': True, 'kv_mem_fraction': 0.98, 'latency_units': 'ms', 'max_batch_size': 1024, 'max_tokens': 8000, 'model': 'meta-llama/Llama-2-7b-hf', 'peak_gpu_mem_units': 'GB', 'peak_gpu_mem': 0.0, 'scheduler': 'Max Utilization', 'throughput_units': 'tokens/second', 'throughput': 17634.422523488243, 'time_per_output_token': 0.0, 'total_input_tokens': 128000, 'total_latency': 7.258530855178833, 'total_output_tokens': 128000}
[RANK 0] JSON: {'benchmark_cmd': '', 'binary': '', 'build_cmd': 'trtllm-build --output_dir /tmp/meta-llama/llama-2-7b-hf --model_config /tmp/generated_config.json --workers 1 --max_batch_size 1024 --max_input_len 128 --max_seq_len 256 --max_num_tokens 8000 --context_fmha enable --gpt_attention_plugin float16 --paged_kv_cache enable --multiple_profiles enable --gemm_plugin float16', 'first_token_latency': 0.0, 'inflight_batching': True, 'kv_mem_fraction': 0.98, 'latency_units': 'ms', 'max_batch_size': 1024, 'max_tokens': 8000, 'model': 'meta-llama/Llama-2-7b-hf', 'peak_gpu_mem_units': 'GB', 'peak_gpu_mem': 0.0, 'scheduler': 'Max Utilization', 'throughput_units': 'tokens/second', 'throughput': 17634.422523488243, 'time_per_output_token': 0.0, 'total_input_tokens': 128000, 'total_latency': 7.258530855178833, 'total_output_tokens': 128000}
===========================================================
= METADATA
===========================================================
23 changes: 22 additions & 1 deletion cpp/CMakeLists.txt
@@ -128,6 +128,27 @@ if(INDEX_RANGE_CHECK)
message(WARNING "Check index range to detect OOB accesses")
endif()

# Read the project version
set(TRTLLM_VERSION_DIR ${PROJECT_SOURCE_DIR}/../tensorrt_llm)
set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS
${TRTLLM_VERSION_DIR}/version.py)
execute_process(
COMMAND python3 -c "import version; print(version.__version__)"
WORKING_DIRECTORY ${TRTLLM_VERSION_DIR}
OUTPUT_VARIABLE TRTLLM_VERSION
RESULT_VARIABLE TRTLLM_VERSION_RESULT
OUTPUT_STRIP_TRAILING_WHITESPACE)

if(TRTLLM_VERSION_RESULT EQUAL 0)
message(STATUS "TensorRT-LLM version: ${TRTLLM_VERSION}")
else()
message(FATAL_ERROR "Failed to determine Tensorrt-LLM version")
endif()

configure_file(
cmake/templates/version.h
${CMAKE_CURRENT_SOURCE_DIR}/include/tensorrt_llm/executor/version.h)

# Determine CUDA version before enabling the language extension
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
@@ -139,7 +160,7 @@ if(CMAKE_CUDA_COMPILER)
"${CMAKE_CUDA_COMPILER} --version | egrep -o 'V[0-9]+.[0-9]+.[0-9]+' | cut -c2-"
RESULT_VARIABLE _BASH_SUCCESS
OUTPUT_VARIABLE CMAKE_CUDA_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
OUTPUT_STRIP_TRAILING_WHITESPACE)

if(NOT _BASH_SUCCESS EQUAL 0)
message(FATAL_ERROR "Failed to determine CUDA version")
24 changes: 24 additions & 0 deletions cpp/cmake/templates/version.h
@@ -0,0 +1,24 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

// THIS FILE IS AUTO GENERATED FROM cmake/templates/version.h. DO NOT EDIT.

namespace tensorrt_llm::executor
{
static auto constexpr kTensorRtLlmVersion = "@TRTLLM_VERSION@";
}
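
Not part of the commit itself, but as a quick illustration: once the configure_file() call added to cpp/CMakeLists.txt has substituted @TRTLLM_VERSION@, downstream C++ code can read the constant directly. A minimal sketch, assuming the generated header is on the include path:

#include <iostream>

#include "tensorrt_llm/executor/version.h"

int main()
{
    // kTensorRtLlmVersion holds the string substituted for @TRTLLM_VERSION@
    // during the CMake configure step.
    std::cout << "TensorRT-LLM version: " << tensorrt_llm::executor::kTensorRtLlmVersion << '\n';
    return 0;
}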
3 changes: 3 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -122,6 +122,9 @@ class GptManager
void decoupled_execution_loop();
std::shared_ptr<std::thread> worker_thread_;
std::shared_ptr<nvinfer1::ILogger> mLogger{};

inline static std::string const kPROFILE_START_STOP_ENV_VAR_NAME = "TLLM_PROFILE_START_STOP";
inline static std::string const kLEGACY_PROFILE_START_STOP_ENV_VAR_NAME = "TLLM_GPTM_PROFILE_START_STOP";
};

} // namespace tensorrt_llm::batch_manager
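
For context (not part of the diff): the two constants above name environment variables that appear to control when profiling starts and stops. A hedged sketch of how a caller might resolve them, preferring the new name over the legacy one; the fallback order is an assumption for illustration, and parsing of the value is omitted:

#include <cstdlib>
#include <string>

std::string resolveProfileStartStopEnv()
{
    // Prefer the new variable name, then fall back to the legacy GptManager one.
    if (char const* value = std::getenv("TLLM_PROFILE_START_STOP"))
    {
        return value;
    }
    if (char const* legacy = std::getenv("TLLM_GPTM_PROFILE_START_STOP"))
    {
        return legacy;
    }
    return {};
}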
8 changes: 8 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -375,6 +375,11 @@ class BlockManager
return mSecondaryPool;
}

[[nodiscard]] SizeType32 getNumLayers() const
{
return mNumLayers;
}

//! \brief Get index in pool to K or V block.
//! \param blockId the blockId as returned by getBlockId()
//! \param fieldIdx either 0 (K) or 1 (V),
@@ -592,6 +597,8 @@ class KVCacheManager
void removeToken(SizeType32 seqSlotIdx);
void rewindKVCache(SizeType32 seqSlotIdx, SizeType32 rewindLengths);

[[nodiscard]] GenerationRequest const& getSequence(SizeType32 seqSlotIdx) const;

[[nodiscard]] bool isCrossKv() const
{
return mCacheType == CacheType::kCROSS;
@@ -634,4 +641,5 @@ class KVCacheManager
// KV cache type (self or cross)
CacheType mCacheType;
};

} // namespace tensorrt_llm::batch_manager::kv_cache_manager
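
Not part of the commit: a short sketch of the new accessors, assuming a blockManager, a kvCacheManager, and a valid seqSlotIdx already exist in the caller:

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

using namespace tensorrt_llm::batch_manager::kv_cache_manager;

void inspectCache(BlockManager const& blockManager, KVCacheManager const& kvCacheManager,
    tensorrt_llm::runtime::SizeType32 seqSlotIdx)
{
    // Number of layers the block manager allocates cache blocks for.
    auto const numLayers = blockManager.getNumLayers();
    // Per-sequence cache bookkeeping for the given slot index.
    GenerationRequest const& sequence = kvCacheManager.getSequence(seqSlotIdx);
    (void) numLayers;
    (void) sequence;
}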
99 changes: 99 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -0,0 +1,99 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

namespace tensorrt_llm::batch_manager::kv_cache_manager
{

class BlockIterator
{
public:
using iterator_category = std::forward_iterator_tag;
using value_type = runtime::ITensor;
using pointer = runtime::ITensor::SharedPtr;
using reference = value_type&;
using SizeType32 = tensorrt_llm::runtime::SizeType32;

BlockIterator(runtime::ITensor::SharedPtr blockPoolPtr, std::vector<SizeType32> blockIds, size_t idx)
: mPool{std::move(blockPoolPtr)}
, mBlockIds{std::move(blockIds)}
, mIdx{idx}
{
TLLM_CHECK(mPool);
TLLM_CHECK(mIdx <= mBlockIds.size());
update();
}

[[nodiscard]] pointer operator->()
{
return mCurrent;
}

[[nodiscard]] reference operator*()
{
return *mCurrent;
}

BlockIterator& operator++()
{
mIdx++;
update();
return *this;
}

BlockIterator operator++(int)
{
auto ret = *this;
ret.update();
mIdx++;
return ret;
}

[[nodiscard]] bool operator==(BlockIterator const& other) const
{
return mIdx == other.mIdx && mPool.get() == other.mPool.get();
}

[[nodiscard]] bool operator!=(BlockIterator const& other) const
{
return !(*this == other);
}

private:
void update()
{
if (mIdx < mBlockIds.size())
{
mCurrent = runtime::ITensor::slice(mPool, mBlockIds.at(mIdx), 1);
}
}

runtime::ITensor::SharedPtr mPool;
runtime::ITensor::SharedPtr mCurrent;
const std::vector<SizeType32> mBlockIds;
size_t mIdx;
};

[[nodiscard]] BlockIterator getBlockBeginIt(
KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam);

[[nodiscard]] BlockIterator getBlockEndIt(
KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam);

} // namespace tensorrt_llm::batch_manager::kv_cache_manager
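
Again not part of the diff: a minimal usage sketch for the new iterator, walking the KV-cache blocks of one request and beam via the free functions declared above. cacheManager, request, and beam are assumed to be supplied by the caller:

#include "tensorrt_llm/batch_manager/kvCacheUtils.h"

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::batch_manager::kv_cache_manager;

void visitBlocks(KVCacheManager const& cacheManager, LlmRequest const& request,
    tensorrt_llm::runtime::SizeType32 beam)
{
    auto const end = getBlockEndIt(cacheManager, request, beam);
    for (auto it = getBlockBeginIt(cacheManager, request, beam); it != end; ++it)
    {
        // *it is an ITensor view of a single cache block sliced out of the pool.
        auto const& block = *it;
        (void) block; // e.g. inspect its shape or copy it out
    }
}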