Update TensorRT-LLM (#1954)
* Update TensorRT-LLM

---------

Co-authored-by: Altair-Alpha <[email protected]>
kaiyux and Altair-Alpha authored Jul 16, 2024
1 parent a96ccca commit 2d23435
Showing 207 changed files with 7,409 additions and 1,333 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -40,3 +40,11 @@ tensorrt_llm/bindings/*.pyi
# Testing
.coverage.*
results_trt/

# build/debug
*.safetensors
*/tllm_debug/**
*.patch

# Generated files
cpp/include/tensorrt_llm/executor/version.h
22 changes: 0 additions & 22 deletions benchmarks/python/benchmark.py
@@ -177,13 +177,6 @@ def parse_arguments():
'If this option is specified, it will override the max decoder input len of TRT engines to the specified value instead of using pre-defined one'
'By default when this option is not used, it will use pre-defined max decoder input len'
))
parser.add_argument(
'--max_output_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max output len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
@@ -360,21 +353,6 @@ def main(args):
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()

if args.max_output_len:
logger.warning(
'--max_output_len has been deprecated in favor of --max_seq_len')
if args.max_input_len:
if args.max_seq_len:
logger.warning(
'--max_seq_len has been overwritten due to --max_output_len being specified'
)
args.max_seq_len = args.max_input_len + args.max_output_len
else:
raise Exception(
f"--max_output_len is specified but not --max_input_len")

del args.max_output_len

# TODO: Re-enable memory monitor for multi-gpu benchmarks.
# Current Mem Monitor will cause benchmark script hang
# because MPI does not work well with multiprocessing.
7 changes: 0 additions & 7 deletions benchmarks/python/build.py
@@ -129,13 +129,6 @@ def parse_arguments():
help=
('If this option is specified, it will override the max input len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_output_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max output len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
2 changes: 1 addition & 1 deletion benchmarks/suite/README.md
@@ -185,7 +185,7 @@ When the benchmark runs successfully, you will see a report out of the run simil
[RANK 0] Completed request submission.
[RANK 0] Calculating results.
[RANK 0] Reporting...
[RANK 0] JSON: {'benchmark_cmd': '', 'binary': '', 'build_cmd': 'trtllm-build --output_dir /tmp/meta-llama/llama-2-7b-hf --model_config /tmp/generated_config.json --workers 1 --max_batch_size 1024 --max_input_len 128 --max_output_len 128 --max_num_tokens 8000 --context_fmha enable --gpt_attention_plugin float16 --paged_kv_cache enable --multiple_profiles enable --gemm_plugin float16', 'first_token_latency': 0.0, 'inflight_batching': True, 'kv_mem_fraction': 0.98, 'latency_units': 'ms', 'max_batch_size': 1024, 'max_tokens': 8000, 'model': 'meta-llama/Llama-2-7b-hf', 'peak_gpu_mem_units': 'GB', 'peak_gpu_mem': 0.0, 'scheduler': 'Max Utilization', 'throughput_units': 'tokens/second', 'throughput': 17634.422523488243, 'time_per_output_token': 0.0, 'total_input_tokens': 128000, 'total_latency': 7.258530855178833, 'total_output_tokens': 128000}
[RANK 0] JSON: {'benchmark_cmd': '', 'binary': '', 'build_cmd': 'trtllm-build --output_dir /tmp/meta-llama/llama-2-7b-hf --model_config /tmp/generated_config.json --workers 1 --max_batch_size 1024 --max_input_len 128 --max_seq_len 256 --max_num_tokens 8000 --context_fmha enable --gpt_attention_plugin float16 --paged_kv_cache enable --multiple_profiles enable --gemm_plugin float16', 'first_token_latency': 0.0, 'inflight_batching': True, 'kv_mem_fraction': 0.98, 'latency_units': 'ms', 'max_batch_size': 1024, 'max_tokens': 8000, 'model': 'meta-llama/Llama-2-7b-hf', 'peak_gpu_mem_units': 'GB', 'peak_gpu_mem': 0.0, 'scheduler': 'Max Utilization', 'throughput_units': 'tokens/second', 'throughput': 17634.422523488243, 'time_per_output_token': 0.0, 'total_input_tokens': 128000, 'total_latency': 7.258530855178833, 'total_output_tokens': 128000}
===========================================================
= METADATA
===========================================================
23 changes: 22 additions & 1 deletion cpp/CMakeLists.txt
@@ -128,6 +128,27 @@ if(INDEX_RANGE_CHECK)
message(WARNING "Check index range to detect OOB accesses")
endif()

# Read the project version
set(TRTLLM_VERSION_DIR ${PROJECT_SOURCE_DIR}/../tensorrt_llm)
set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS
${TRTLLM_VERSION_DIR}/version.py)
execute_process(
COMMAND python3 -c "import version; print(version.__version__)"
WORKING_DIRECTORY ${TRTLLM_VERSION_DIR}
OUTPUT_VARIABLE TRTLLM_VERSION
RESULT_VARIABLE TRTLLM_VERSION_RESULT
OUTPUT_STRIP_TRAILING_WHITESPACE)

if(TRTLLM_VERSION_RESULT EQUAL 0)
message(STATUS "TensorRT-LLM version: ${TRTLLM_VERSION}")
else()
message(FATAL_ERROR "Failed to determine Tensorrt-LLM version")
endif()

configure_file(
cmake/templates/version.h
${CMAKE_CURRENT_SOURCE_DIR}/include/tensorrt_llm/executor/version.h)

# Determine CUDA version before enabling the language extension
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
@@ -139,7 +160,7 @@ if(CMAKE_CUDA_COMPILER)
"${CMAKE_CUDA_COMPILER} --version | egrep -o 'V[0-9]+.[0-9]+.[0-9]+' | cut -c2-"
RESULT_VARIABLE _BASH_SUCCESS
OUTPUT_VARIABLE CMAKE_CUDA_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
OUTPUT_STRIP_TRAILING_WHITESPACE)

if(NOT _BASH_SUCCESS EQUAL 0)
message(FATAL_ERROR "Failed to determine CUDA version")
24 changes: 24 additions & 0 deletions cpp/cmake/templates/version.h
@@ -0,0 +1,24 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

// THIS FILE IS AUTO GENERATED FROM cmake/templates/version.h. DO NOT EDIT.

namespace tensorrt_llm::executor
{
static auto constexpr kTensorRtLlmVersion = "@TRTLLM_VERSION@";
}
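
Not part of the commit itself, but as a quick illustration: once the configure_file() call added to cpp/CMakeLists.txt has substituted @TRTLLM_VERSION@, downstream C++ code can read the constant directly. A minimal sketch, assuming the generated header is on the include path:

#include <iostream>

#include "tensorrt_llm/executor/version.h"

int main()
{
    // kTensorRtLlmVersion holds the string substituted for @TRTLLM_VERSION@
    // during the CMake configure step.
    std::cout << "TensorRT-LLM version: " << tensorrt_llm::executor::kTensorRtLlmVersion << '\n';
    return 0;
}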
3 changes: 3 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -122,6 +122,9 @@ class GptManager
void decoupled_execution_loop();
std::shared_ptr<std::thread> worker_thread_;
std::shared_ptr<nvinfer1::ILogger> mLogger{};

inline static std::string const kPROFILE_START_STOP_ENV_VAR_NAME = "TLLM_PROFILE_START_STOP";
inline static std::string const kLEGACY_PROFILE_START_STOP_ENV_VAR_NAME = "TLLM_GPTM_PROFILE_START_STOP";
};

} // namespace tensorrt_llm::batch_manager
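
For context (not part of the diff): the two constants above name environment variables that appear to control when profiling starts and stops. A hedged sketch of how a caller might resolve them, preferring the new name over the legacy one; the fallback order is an assumption for illustration, and parsing of the value is omitted:

#include <cstdlib>
#include <string>

std::string resolveProfileStartStopEnv()
{
    // Prefer the new variable name, then fall back to the legacy GptManager one.
    if (char const* value = std::getenv("TLLM_PROFILE_START_STOP"))
    {
        return value;
    }
    if (char const* legacy = std::getenv("TLLM_GPTM_PROFILE_START_STOP"))
    {
        return legacy;
    }
    return {};
}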
8 changes: 8 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -375,6 +375,11 @@ class BlockManager
return mSecondaryPool;
}

[[nodiscard]] SizeType32 getNumLayers() const
{
return mNumLayers;
}

//! \brief Get index in pool to K or V block.
//! \param blockId the blockId as returned by getBlockId()
//! \param fieldIdx either 0 (K) or 1 (V),
@@ -592,6 +597,8 @@ class KVCacheManager
void removeToken(SizeType32 seqSlotIdx);
void rewindKVCache(SizeType32 seqSlotIdx, SizeType32 rewindLengths);

[[nodiscard]] GenerationRequest const& getSequence(SizeType32 seqSlotIdx) const;

[[nodiscard]] bool isCrossKv() const
{
return mCacheType == CacheType::kCROSS;
@@ -634,4 +641,5 @@ class KVCacheManager
// KV cache type (self or cross)
CacheType mCacheType;
};

} // namespace tensorrt_llm::batch_manager::kv_cache_manager
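
Not part of the commit: a short sketch of the new accessors, assuming a blockManager, a kvCacheManager, and a valid seqSlotIdx already exist in the caller:

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

using namespace tensorrt_llm::batch_manager::kv_cache_manager;

void inspectCache(BlockManager const& blockManager, KVCacheManager const& kvCacheManager,
    tensorrt_llm::runtime::SizeType32 seqSlotIdx)
{
    // Number of layers the block manager allocates cache blocks for.
    auto const numLayers = blockManager.getNumLayers();
    // Per-sequence cache bookkeeping for the given slot index.
    GenerationRequest const& sequence = kvCacheManager.getSequence(seqSlotIdx);
    (void) numLayers;
    (void) sequence;
}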
99 changes: 99 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -0,0 +1,99 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

namespace tensorrt_llm::batch_manager::kv_cache_manager
{

class BlockIterator
{
public:
using iterator_category = std::forward_iterator_tag;
using value_type = runtime::ITensor;
using pointer = runtime::ITensor::SharedPtr;
using reference = value_type&;
using SizeType32 = tensorrt_llm::runtime::SizeType32;

BlockIterator(runtime::ITensor::SharedPtr blockPoolPtr, std::vector<SizeType32> blockIds, size_t idx)
: mPool{std::move(blockPoolPtr)}
, mBlockIds{std::move(blockIds)}
, mIdx{idx}
{
TLLM_CHECK(mPool);
TLLM_CHECK(mIdx <= mBlockIds.size());
update();
}

[[nodiscard]] pointer operator->()
{
return mCurrent;
}

[[nodiscard]] reference operator*()
{
return *mCurrent;
}

BlockIterator& operator++()
{
mIdx++;
update();
return *this;
}

BlockIterator operator++(int)
{
auto ret = *this;
ret.update();
mIdx++;
return ret;
}

[[nodiscard]] bool operator==(BlockIterator const& other) const
{
return mIdx == other.mIdx && mPool.get() == other.mPool.get();
}

[[nodiscard]] bool operator!=(BlockIterator const& other) const
{
return !(*this == other);
}

private:
void update()
{
if (mIdx < mBlockIds.size())
{
mCurrent = runtime::ITensor::slice(mPool, mBlockIds.at(mIdx), 1);
}
}

runtime::ITensor::SharedPtr mPool;
runtime::ITensor::SharedPtr mCurrent;
const std::vector<SizeType32> mBlockIds;
size_t mIdx;
};

[[nodiscard]] BlockIterator getBlockBeginIt(
KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam);

[[nodiscard]] BlockIterator getBlockEndIt(
KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam);

} // namespace tensorrt_llm::batch_manager::kv_cache_manager
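
Again not part of the diff: a minimal usage sketch for the new iterator, walking the KV-cache blocks of one request and beam via the free functions declared above. cacheManager, request, and beam are assumed to be supplied by the caller:

#include "tensorrt_llm/batch_manager/kvCacheUtils.h"

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::batch_manager::kv_cache_manager;

void visitBlocks(KVCacheManager const& cacheManager, LlmRequest const& request,
    tensorrt_llm::runtime::SizeType32 beam)
{
    auto const end = getBlockEndIt(cacheManager, request, beam);
    for (auto it = getBlockBeginIt(cacheManager, request, beam); it != end; ++it)
    {
        // *it is an ITensor view of a single cache block sliced out of the pool.
        auto const& block = *it;
        (void) block; // e.g. inspect its shape or copy it out
    }
}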