Update TensorRT-LLM (#1918)
kaiyux authored Jul 9, 2024
1 parent 9dbc5b3 commit a96ccca
Showing 133 changed files with 6,948 additions and 861 deletions.
45 changes: 35 additions & 10 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -172,16 +172,16 @@ struct BenchmarkParams
std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
};

class InferenceRequestsSyncSend
class InferenceRequestsAsyncSend
{
public:
InferenceRequestsSyncSend(std::shared_ptr<tensorrt_llm::mpi::MpiComm> comm,
InferenceRequestsAsyncSend(std::shared_ptr<tensorrt_llm::mpi::MpiComm> comm,
std::list<std::shared_ptr<InferenceRequest>> const& inferenceRequests, int const peer)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_LOG_DEBUG("start send requests to rank %d", peer);
mNumNewWorkItems = static_cast<int64_t>(inferenceRequests.size());
comm->send(&mNumNewWorkItems, 1, mpi::MpiType::kINT64, peer, 0);
mRequest1 = comm->sendAsync(&mNumNewWorkItems, 1, mpi::MpiType::kINT64, peer, 0);
if (mNumNewWorkItems > 0)
{
for (auto const& infReq : inferenceRequests)
@@ -191,16 +191,31 @@ class InferenceRequestsSyncSend
mPacked.insert(mPacked.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end()));
}
mVecSize = static_cast<int64_t>(mPacked.size());
comm->send(&mVecSize, 1, mpi::MpiType::kINT64, peer, 1);
comm->send(mPacked.data(), mPacked.size(), mpi::MpiType::kINT64, peer, 2);
mRequest2 = comm->sendAsync(&mVecSize, 1, mpi::MpiType::kINT64, peer, 1);
mRequest3 = comm->sendAsync(mPacked.data(), mPacked.size(), mpi::MpiType::kINT64, peer, 2);
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

~InferenceRequestsAsyncSend()
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
mRequest1->wait();
if (mRequest2)
mRequest2->wait();
if (mRequest3)
mRequest3->wait();
TLLM_LOG_DEBUG("end send requests");
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

private:
int64_t mNumNewWorkItems;
int64_t mVecSize;
std::vector<int64_t> mPacked;
std::shared_ptr<tensorrt_llm::mpi::MpiRequest> mRequest1;
std::shared_ptr<tensorrt_llm::mpi::MpiRequest> mRequest2;
std::shared_ptr<tensorrt_llm::mpi::MpiRequest> mRequest3;
};
} // namespace

@@ -930,7 +945,6 @@ class GptServer
, mStaticEmulatedBatchSize(staticEmulatedBatchSize)
, mBatchTimeout(batchTimeout.value_or(std::chrono::milliseconds{0}))
, mActiveCount(0)
, mInferReqSyncSndHdl(nullptr)
{
auto const jsonConfig = GptJsonConfig::parse(trtEnginePath / "config.json");
mWorldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
@@ -966,6 +980,12 @@ class GptServer

~GptServer()
{
if (mInferReqWaitThread)
{
mInferReqWaitThread->join();
mInferReqWaitThread.reset(nullptr);
}

mWorkItemsQueue.clear();
}

@@ -1031,7 +1051,11 @@ class GptServer
// Return up to max_num_requests inference requests.
std::list<std::shared_ptr<InferenceRequest>> getInferenceRequests(int const max_num_requests)
{
mInferReqSyncSndHdl = nullptr;
if (mInferReqWaitThread)
{
mInferReqWaitThread->join();
mInferReqWaitThread.reset(nullptr);
}
std::list<std::shared_ptr<InferenceRequest>> inferenceRequests;
auto& comm = COMM_SESSION;
if (max_num_requests > 0)
@@ -1134,8 +1158,9 @@ class GptServer
if (!mWorldConfig.isLastPipelineParallelRank())
{
auto const peer = mWorldConfig.getPipelineParallelRank() + 1;
mInferReqSyncSndHdl
= std::make_shared<InferenceRequestsSyncSend>(mCommPipelineParallel, inferenceRequests, peer);
auto inferReqAsyncSndHdl
= std::make_unique<InferenceRequestsAsyncSend>(mCommPipelineParallel, inferenceRequests, peer);
mInferReqWaitThread = std::make_unique<std::thread>([handle = std::move(inferReqAsyncSndHdl)]() {});
}
}
return inferenceRequests;
@@ -1184,7 +1209,7 @@ class GptServer
WorldConfig mWorldConfig;
std::shared_ptr<tensorrt_llm::mpi::MpiComm> mCommTensorParallel;
std::shared_ptr<tensorrt_llm::mpi::MpiComm> mCommPipelineParallel;
std::shared_ptr<InferenceRequestsSyncSend> mInferReqSyncSndHdl;
std::unique_ptr<std::thread> mInferReqWaitThread;

}; // class GptServer

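The net effect of the changes in this file: the blocking pipeline-parallel hand-off (InferenceRequestsSyncSend) is replaced by three non-blocking MPI sends whose completion is enforced in the handle's destructor, and getInferenceRequests parks that handle in a std::thread so the waits run off the caller's path and are joined before the next batch is fetched. Below is a minimal sketch of that deferred-wait pattern, using std::async as a stand-in for the TensorRT-LLM MPI wrappers; the class and variable names are illustrative assumptions, not the real API.

#include <chrono>
#include <cstdio>
#include <future>
#include <memory>
#include <thread>
#include <utility>

class AsyncSendHandle
{
public:
    explicit AsyncSendHandle(int payload)
    {
        // Start the "send" without blocking the caller (stands in for comm->sendAsync).
        mPending = std::async(std::launch::async,
            [payload]
            {
                std::this_thread::sleep_for(std::chrono::milliseconds(20)); // simulated transfer
                std::printf("sent payload %d\n", payload);
            });
    }

    ~AsyncSendHandle()
    {
        // Completion is enforced here, mirroring mRequest1/2/3->wait() in the destructor above.
        if (mPending.valid())
        {
            mPending.wait();
        }
    }

private:
    std::future<void> mPending;
};

int main()
{
    std::unique_ptr<std::thread> waitThread;
    for (int step = 0; step < 3; ++step)
    {
        if (waitThread)
        {
            waitThread->join(); // previous hand-off is complete before issuing the next one
            waitThread.reset();
        }
        auto handle = std::make_unique<AsyncSendHandle>(step);
        // Destroying the moved-in handle (and thus the blocking waits) is handed off to the
        // helper thread, not this thread -- the same trick as mInferReqWaitThread above.
        waitThread = std::make_unique<std::thread>([h = std::move(handle)] {});
    }
    if (waitThread)
    {
        waitThread->join();
    }
    return 0;
}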
66 changes: 65 additions & 1 deletion benchmarks/python/allowed_configs.py
@@ -60,14 +60,20 @@ class BuildConfig:
parallel_attention: bool = None
new_decoder_architecture: bool = None
state_size: int = 0
state_dtype: Optional[str] = None
state_dtype: Optional[str] = ""
conv_kernel: int = 0
layer_types: List[str] = field(default_factory=list)
rnn_hidden_size: int = 0
rnn_head_size: int = 0
rnn_conv_dim_size: int = 0
logits_soft_cap: float = 0.0
opt_batch_size: Optional[int] = None
opt_num_tokens: Optional[int] = None
use_bias: bool = None
mamba_version: str = 'Mamba1'
ssm_rmsnorm: bool = True
ngroups: int = 1
chunk_size: int = 256


@dataclass
@@ -1218,6 +1224,7 @@ class ModelConfig:
state_size=16,
conv_kernel=4,
rnn_hidden_size=5120,
rnn_conv_dim_size=5120,
layer_types=["recurrent"],
use_bias=False,
)),
@@ -1238,6 +1245,7 @@ class ModelConfig:
state_size=16,
conv_kernel=4,
rnn_hidden_size=4096,
rnn_conv_dim_size=4096,
layer_types=["recurrent"],
use_bias=False,
)),
@@ -1258,6 +1266,7 @@ class ModelConfig:
state_size=16,
conv_kernel=4,
rnn_hidden_size=3072,
rnn_conv_dim_size=3072,
layer_types=["recurrent"],
use_bias=False,
)),
@@ -1278,6 +1287,7 @@ class ModelConfig:
state_size=16,
conv_kernel=4,
rnn_hidden_size=2048,
rnn_conv_dim_size=2048,
layer_types=["recurrent"],
use_bias=False,
)),
@@ -1298,9 +1308,62 @@ class ModelConfig:
state_size=16,
conv_kernel=4,
rnn_hidden_size=1536,
rnn_conv_dim_size=1536,
layer_types=["recurrent"],
use_bias=False,
)),
"mamba2_2.7b":
ModelConfig(name="mamba2_2.7b",
family="mamba",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=64,
num_heads=1,
hidden_size=2560,
vocab_size=50288,
hidden_act="silu",
n_positions=8192,
max_batch_size=64,
max_input_len=1024,
max_seq_len=2048,
state_size=128,
conv_kernel=4,
rnn_hidden_size=5120,
rnn_conv_dim_size=5376,
rnn_head_size=64,
layer_types=["recurrent"],
use_bias=False,
mamba_version='Mamba2',
ssm_rmsnorm=True,
ngroups=1,
chunk_size=256,
)),
"mamba2_130m":
ModelConfig(name="mamba2_130m",
family="mamba",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=1,
hidden_size=768,
vocab_size=50288,
hidden_act="silu",
n_positions=8192,
max_batch_size=64,
max_input_len=1024,
max_seq_len=2048,
state_size=128,
conv_kernel=4,
rnn_hidden_size=1536,
rnn_conv_dim_size=1792,
rnn_head_size=64,
layer_types=["recurrent"],
use_bias=False,
mamba_version='Mamba2',
ssm_rmsnorm=True,
ngroups=1,
chunk_size=256,
)),
"whisper_large_v3":
ModelConfig(name="whisper_large_v3",
family="whisper",
@@ -1344,6 +1407,7 @@ class ModelConfig:
state_size=1,
layer_types=["recurrent", "recurrent", "attention"],
rnn_hidden_size=2560,
rnn_conv_dim_size=2560,
logits_soft_cap=30.0,
state_dtype="float32",
)),
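For the Mamba entries above, the new rnn_conv_dim_size values follow a simple pattern: the Mamba1 configs keep rnn_conv_dim_size equal to rnn_hidden_size, while the two new Mamba2 configs use rnn_hidden_size + 2 * ngroups * state_size (5120 + 2 * 1 * 128 = 5376 and 1536 + 2 * 1 * 128 = 1792), which matches the Mamba2 convolution acting on the concatenated x, B and C projections. This is an observation from the values in this diff rather than a documented formula; a small check of the arithmetic:

#include <cstdio>

// Apparent Mamba2 convolution width: hidden size plus the B and C state projections.
constexpr int mamba2ConvDim(int rnnHiddenSize, int ngroups, int stateSize)
{
    return rnnHiddenSize + 2 * ngroups * stateSize;
}

static_assert(mamba2ConvDim(5120, 1, 128) == 5376, "matches mamba2_2.7b");
static_assert(mamba2ConvDim(1536, 1, 128) == 1792, "matches mamba2_130m");

int main()
{
    std::printf("mamba2_2.7b conv dim: %d\n", mamba2ConvDim(5120, 1, 128));
    std::printf("mamba2_130m conv dim: %d\n", mamba2ConvDim(1536, 1, 128));
    return 0;
}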
11 changes: 10 additions & 1 deletion benchmarks/python/build.py
@@ -295,7 +295,8 @@ def build_gpt(args):
builder_config_extra_kwargs = {}
extra_items = [
'layer_types', 'conv_kernel', 'rnn_hidden_size', 'logits_soft_cap',
'state_size', 'use_bias'
'state_size', 'use_bias', 'rnn_head_size', 'rnn_conv_dim_size',
'mamba_version', 'ssm_rmsnorm', 'ngroups', 'chunk_size'
]
for item in extra_items:
if item in build_config:
@@ -876,10 +877,16 @@ def build_gpt(args):
'state_size': build_config['state_size'],
'conv_kernel': build_config['conv_kernel'],
'rnn_hidden_size': build_config['rnn_hidden_size'],
'rnn_head_size': build_config['rnn_head_size'],
'rnn_conv_dim_size': build_config['rnn_conv_dim_size'],
'rms_norm': True,
'residual_in_fp32': True,
'pad_vocab_size_multiple': 8,
'use_bias': build_config['use_bias'],
'mamba_version': build_config['mamba_version'],
'ssm_rmsnorm': build_config['ssm_rmsnorm'],
'ngroups': build_config['ngroups'],
'chunk_size': build_config['chunk_size'],
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.MambaForCausalLM(config)
@@ -912,6 +919,8 @@ def build_gpt(args):
'state_size': build_config['state_size'],
'layer_types': build_config['layer_types'],
'rnn_hidden_size': build_config['rnn_hidden_size'],
'rnn_head_size': build_config['rnn_head_size'],
'rnn_conv_dim_size': build_config['rnn_conv_dim_size'],
'logits_soft_cap': build_config['logits_soft_cap'],
'rotary_pct': build_config['rotary_pct'],
}
2 changes: 1 addition & 1 deletion benchmarks/python/gpt_benchmark.py
@@ -126,7 +126,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,

rnn_config_items = [
'conv_kernel', 'layer_types', 'rnn_hidden_size', 'state_size',
'state_dtype'
'state_dtype', 'rnn_head_size', 'rnn_conv_dim_size'
]
rnn_configs_kwargs = {}
for item in rnn_config_items:
10 changes: 5 additions & 5 deletions cpp/include/tensorrt_llm/batch_manager/inferenceRequest.h
@@ -116,7 +116,7 @@ class GenericInferenceRequest
uint64_t requestId, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt)
: mRequestId{requestId}
, mIsStreaming{false}
, mlogitsPostProcessor(logitsPostProcessor)
, mLogitsPostProcessor(logitsPostProcessor)
{
}

@@ -125,7 +125,7 @@ class GenericInferenceRequest
: mRequestId{requestId}
, mIsStreaming{false}
, mInputTensors{std::move(tensorMap)}
, mlogitsPostProcessor(logitsPostProcessor)
, mLogitsPostProcessor(logitsPostProcessor)
{
for (auto const& [name, tensor] : mInputTensors)
{
@@ -161,12 +161,12 @@ class GenericInferenceRequest

void setLogitsPostProcessor(std::optional<LogitsPostProcessor> cb)
{
mlogitsPostProcessor = cb;
mLogitsPostProcessor = cb;
}

std::optional<LogitsPostProcessor> getLogitsPostProcessor()
{
return mlogitsPostProcessor;
return mLogitsPostProcessor;
}

static std::array constexpr kTensorNames = {
@@ -280,7 +280,7 @@ class GenericInferenceRequest
uint64_t mRequestId;
bool mIsStreaming;
TensorMap mInputTensors;
std::optional<LogitsPostProcessor> mlogitsPostProcessor;
std::optional<LogitsPostProcessor> mLogitsPostProcessor;
};

class InferenceRequest : public GenericInferenceRequest<tensorrt_llm::runtime::ITensor::SharedPtr, NamedTensor>
30 changes: 5 additions & 25 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -248,16 +248,6 @@ class GenerationRequest
}
}

void setNumPrepopulatedTokens(std::vector<int> numPrepopulatedTokens)
{
mNumPrepopulatedTokens = std::move(numPrepopulatedTokens);
}

[[nodiscard]] std::vector<int> const& getNumPrepopulatedTokens() const
{
return mNumPrepopulatedTokens;
}

private:
// Slot id of the sequence
SizeType32 mSeqSlotIdx;
@@ -267,10 +257,6 @@ class GenerationRequest
SizeType32 mBeamWidth;
// List of blocks allocated for each beam of the sequence
std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds;
// Number of tokens already in kv cache before context phase.
// A value > 0 indicates cached kv cache blocks were reused.
// One value per beam.
std::vector<int> mNumPrepopulatedTokens;
};

// BlockManager manages overall metadata of KVCacheBlocks in a layer of the
Expand Down Expand Up @@ -400,7 +386,10 @@ class BlockManager

private:
//! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx, SizeType32 seqSlotIdx);
void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);

//! \brief Add single block to all beams of sequence.
void addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence);

//! \brief Store blocks in cached blocks.
//! \param blockedTokens Tokens of each block.
@@ -410,11 +399,8 @@ class BlockManager
//! \brief Try to load blocks from cache. Allocate new blocks if necessary.
//! \param blockedTokens Tokens of each block.
//! \param sequence Sequence to which blocks are assigned.
//! \param beamIdx Beam of sequence to which blocks are assigned.
//! \param seqSlotIdx Batch slot of sequence to which blocks are assigned.
//! \return Number of matched tokens from loaded blocks.
SizeType32 loadOrAllocateBlocks(std::list<VecTokens> const& blockedTokens, GenerationRequest& sequence,
SizeType32 beamIdx, SizeType32 seqSlotIdx);
SizeType32 loadOrAllocateBlocks(std::list<VecTokens> const& blockedTokens, GenerationRequest& sequence);

//! \brief Find best primary block to free.
//! \details The best primary block to free is the primary block that appears first in the queue and have no primary
@@ -598,12 +584,6 @@ class KVCacheManager
nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);

[[nodiscard]] SizeType32 getNumPrepopulatedTokens(SizeType32 batchSlotIdx, SizeType32 beamIdx) const
{
auto const& prepopulatedTokens = mSequences.at(batchSlotIdx)->getNumPrepopulatedTokens();
return prepopulatedTokens.size() > 0 ? prepopulatedTokens.at(beamIdx) : 0;
}

[[nodiscard]] bool isEnableBlockReuse() const
{
return mEnableBlockReuse;
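The kvCacheManager.h changes drop the per-beam prepopulated-token bookkeeping and narrow loadOrAllocateBlocks down to the blocked tokens and the sequence, with the new addBlockToAllBeams helper handling the multi-beam case. As the surviving comments describe, reuse works by matching whole token blocks against previously cached blocks and allocating fresh ones on a miss, returning how many prompt tokens were matched. A toy sketch of that idea follows; the data layout and names are illustrative assumptions, not the TensorRT-LLM implementation.

#include <cstdio>
#include <list>
#include <map>
#include <vector>

using VecTokens = std::vector<int>;

struct ToyBlockManager
{
    std::map<VecTokens, int> cachedBlocks; // token block -> block id
    int nextBlockId = 0;

    // Returns the number of prompt tokens served from reused blocks.
    int loadOrAllocateBlocks(std::list<VecTokens> const& blockedTokens, std::vector<int>& sequenceBlockIds)
    {
        int matchedTokens = 0;
        bool stillMatching = true;
        for (auto const& block : blockedTokens)
        {
            auto it = cachedBlocks.find(block);
            if (stillMatching && it != cachedBlocks.end())
            {
                sequenceBlockIds.push_back(it->second); // reuse an existing block
                matchedTokens += static_cast<int>(block.size());
            }
            else
            {
                stillMatching = false; // once a block misses, the rest of the prompt is fresh
                int const id = nextBlockId++;
                cachedBlocks.emplace(block, id);
                sequenceBlockIds.push_back(id);
            }
        }
        return matchedTokens;
    }
};

int main()
{
    ToyBlockManager manager;
    std::vector<int> seqA, seqB;
    std::list<VecTokens> promptA{{1, 2, 3, 4}, {5, 6, 7, 8}};
    std::list<VecTokens> promptB{{1, 2, 3, 4}, {9, 9, 9, 9}}; // shares its first block with A
    std::printf("A matched %d tokens\n", manager.loadOrAllocateBlocks(promptA, seqA)); // prints 0
    std::printf("B matched %d tokens\n", manager.loadOrAllocateBlocks(promptB, seqB)); // prints 4
    return 0;
}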
(The remaining 127 changed files are not shown here.)
