Update TensorRT-LLM (#2562)
* Update TensorRT-LLM

---------

Co-authored-by: Starrick Liu <[email protected]>
kaiyux and StarrickLiu authored Dec 11, 2024
1 parent 340a1b6 commit aaacc9b
Showing 459 changed files with 858,777 additions and 364,419 deletions.
43 changes: 0 additions & 43 deletions .github/workflows/auto-assign.yml

This file was deleted.

24 changes: 10 additions & 14 deletions .github/workflows/auto_close_inactive_issues.yml
@@ -1,29 +1,25 @@
# Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues
name: Close inactive issues
on:
workflow_dispatch:
schedule:
- cron: "0 * * * *"
- cron: "30 1 * * *"

jobs:
stale:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write

steps:
- uses: actions/stale@v9
with:
days-before-issue-stale: 30
days-before-issue-close: 15
stale-issue-label: "stale"
exempt-issue-labels: ""
stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days."
close-issue-message: "This issue was closed because it has been stalled for 15 days with no activity."
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-issue-message: 'Issue has not received an update in over 14 days. Adding stale label.'
stale-pr-message: 'PR has not received an update in over 14 days. Adding stale label.'
close-issue-message: 'This issue was closed because it has been 14 days without activity since it has been marked as stale.'
close-pr-message: 'This PR was closed because it has been 14 days without activity since it has been marked as stale.'
days-before-issue-stale: 14
days-before-close: 14
only-labels: 'waiting for feedback'
labels-to-add-when-unstale: 'investigating'
labels-to-remove-when-unstale: 'stale,waiting for feedback'
stale-issue-label: 'stale'
stale-pr-label: 'stale'
debug-only: false
13 changes: 0 additions & 13 deletions .github/workflows/module-owners.json

This file was deleted.

9 changes: 5 additions & 4 deletions README.md
@@ -5,13 +5,13 @@ TensorRT-LLM
<h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>

[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.6.2-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.6.0-green)](https://developer.nvidia.com/tensorrt)
[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.16.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

---
<div align="left">
@@ -151,6 +151,7 @@ To get started with TensorRT-LLM, visit our documentation:
- [Release Notes](https://nvidia.github.io/TensorRT-LLM/release-notes.html)
- [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
- [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
- [Installation Guide for Grace Hopper](https://nvidia.github.io/TensorRT-LLM/installation/grace-hopper.html)
- [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)

## Community
51 changes: 37 additions & 14 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -259,6 +259,9 @@ class Recorder
const std::lock_guard<std::mutex> lock(mRequestBenchInfosMutex);
mRequestBenchInfos[requestId].outputLength = outSeqLen;
mRequestBenchInfos[requestId].decodingIter = response.getResult().decodingIter;

// We record the first beam for the response file
mResponseTensors[requestId] = outputTokenIds[0];
}
else
{
@@ -492,14 +495,19 @@ class Recorder
nlohmann::json jsonResponses = nlohmann::json::array();
for (auto const& [respId, respTokensTensor] : mResponseTensors)
{
int inputLength = mRequestBenchInfos[respId].inputLength;
int outputLength = mRequestBenchInfos[respId].outputLength;
std::vector<int32_t> outputTokens(outputLength);
auto respTokens = mResponseTensors[respId];
int respLength = respTokens.size();
int* respBufferPtr = respTokens.data();

int32_t* outputToksBufferPtr = bufferCast<int32_t>(*respTokensTensor);
if (mOutputHasInput)
outputToksBufferPtr += inputLength;
std::copy(outputToksBufferPtr, outputToksBufferPtr + outputLength, outputTokens.begin());
{
int inputSeqLen = mRequestBenchInfos[respId].inputLength;
respBufferPtr += inputSeqLen;
respLength -= inputSeqLen;
}

std::vector<int32_t> outputTokens(respLength);
std::copy(respBufferPtr, respBufferPtr + respLength, outputTokens.begin());

nlohmann::json currResp;
currResp["response_id"] = respId;
@@ -552,7 +560,7 @@ class Recorder
bool mStreaming;
int mBeamWidth;
std::string mRespJsonFile;
std::unordered_map<uint64_t, TensorPtr> mResponseTensors;
std::unordered_map<uint64_t, texec::VecTokens> mResponseTensors;
bool mOutputHasInput;
std::mutex mRequestBenchInfosMutex;

@@ -792,7 +800,8 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
std::optional<int32_t> const& eosId, std::optional<int32_t> const& padId, BenchmarkParams const& benchmarkParams,
texec::CapacitySchedulerPolicy capacitySchedulerPolicy, std::chrono::milliseconds waitSleep,
bool returnContextLogits, bool returnGenerationLogits, std::optional<int> const staticEmulatedBatchSize,
bool logIterationData, std::optional<SizeType32> const maxPromptLen, texec::ModelType executorModelType)
bool logIterationData, std::optional<SizeType32> const maxPromptLen, texec::ModelType executorModelType,
std::string const& responsesJsonFile)
{
auto const& world = tensorrt_llm::mpi::MpiComm::world();
auto worldRank = world.getRank();
@@ -801,7 +810,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
auto const samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
auto const numSamples = samples.size();

auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth);
auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth, responsesJsonFile);
int32_t decoderStartTokenId = 0;
std::shared_ptr<ExecutorServer> executorServer;

@@ -989,6 +998,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
recorder->calculateMetrics();
recorder->report();
recorder->writeOpMetricsToCsv();
recorder->dumpResponseSeqs();
// Send terminateReqId to terminate servers on all ranks
// Sever on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases
// gptServer->enqueue(std::make_shared<InferenceRequest>(terminateReqId));
@@ -1047,11 +1057,13 @@ int main(int argc, char* argv[])
cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("streaming", "Operate in streaming mode", cxxopts::value<bool>()->default_value("false"));
options.add_options()("streaming",
"Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency",
cxxopts::value<bool>()->default_value("false"));
options.add_options()(
"enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_chunked_context", "Whether to enable context chunking.",
cxxopts::value<bool>()->default_value("false"));
options.add_options()(
"enable_chunked_context", "Whether to enable context chunking.", cxxopts::value<bool>()->default_value("true"));
options.add_options()(
"return_context_logits", "Whether to return context logits.", cxxopts::value<bool>()->default_value("false"));
options.add_options()("return_generation_logits", "Whether to return generation logits.",
@@ -1064,7 +1076,7 @@ int main(int argc, char* argv[])
options.add_options()("static_emulated_batch_size",
"Emulate static batching performance with the provided batch size.", cxxopts::value<SizeType32>());
options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.",
cxxopts::value<std::string>()->default_value("error"));
cxxopts::value<std::string>()->default_value("warning"));
options.add_options()("log_iteration_data", "On each decoder iteration, print batch state metadata.",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("wait_sleep", "Specify how many milliseconds to sleep each iteration of waitForEmpty loop.",
@@ -1111,6 +1123,8 @@ int main(int argc, char* argv[])
"lookahead config in the format of [max_window_size, max_ngram_size, max_verification_set_size], and each <= "
"executor lookahead config",
cxxopts::value<std::string>());
options.add_options()("responses_json", "Write output response sequences to a json file",
cxxopts::value<std::string>()->default_value(""));

auto result = options.parse(argc, argv);

@@ -1137,6 +1151,12 @@
{
TLLM_LOG_WARNING("type option \"V1\" is going to be renamed to \"static\".");
}
bool streaming = result["streaming"].as<bool>();
if (streaming)
{
TLLM_LOG_ERROR("Streaming is not supported in static batching.\n");
return 1;
}
batchingType = texec::BatchingType::kSTATIC;
}
else if (type == "IFB" || type == "inflight")
@@ -1419,6 +1439,9 @@

initTrtLlmPlugins(logger.get());

// Argument: output sequences JSON
auto const responsesJsonFile = result["responses_json"].as<std::string>();

// Argument: API
auto const api = result["api"].as<std::string>();
if (api == "executor")
@@ -1449,7 +1472,7 @@
benchmarkExecutor(decoderEngineDir, encoderEngineDir, batchingType, datasetPath, opCsvFile, maxNumSamples,
beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, capacitySchedulerPolicy,
waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, logIterationData,
maxPromptLen, executorModelType);
maxPromptLen, executorModelType, responsesJsonFile);
}
catch (std::exception const& e)
{
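Taken together, the new --responses_json option threads a file path through benchmarkExecutor into the Recorder, which now dumps the recorded per-request token sequences via dumpResponseSeqs(). A rough standalone sketch of the resulting file structure follows (not part of the commit); only the "response_id" field is visible in this diff, so the "output_tokens" field name and the output path are assumptions.

    #include <cstdint>
    #include <fstream>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        nlohmann::json jsonResponses = nlohmann::json::array();

        nlohmann::json currResp;
        currResp["response_id"] = 0;                                // request id, as in the Recorder
        currResp["output_tokens"] = std::vector<int32_t>{5, 7, 11}; // field name assumed
        jsonResponses.push_back(currResp);

        // Hypothetical path; in the benchmark it would come from --responses_json.
        std::ofstream out("responses.json");
        out << jsonResponses.dump(2);
        return 0;
    }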
7 changes: 7 additions & 0 deletions benchmarks/cpp/utils/utils.cpp
@@ -94,6 +94,13 @@ Samples parseWorkloadJson(
}
samples.emplace_back(Sample{std::move(input_ids), sample["output_len"], taskId});
}

if (samples.size() < maxNumSamples)
{
TLLM_LOG_WARNING(
"Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
samples.size(), maxNumSamples);
}
return samples;
}

2 changes: 1 addition & 1 deletion benchmarks/cpp/utils/utils.h
@@ -52,7 +52,7 @@ struct BenchmarkParams
bool enableBatchSizeTuning{false};
bool enableMaxNumTokensTuning{false};
bool enableBlockReuse{false};
bool enableChunkedContext{false};
bool enableChunkedContext{true};
bool streaming{false};
bool enableExpDelays{false};
std::optional<float> requestRate{std::nullopt};
11 changes: 4 additions & 7 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -35,6 +35,7 @@
#include <list>
#include <memory>
#include <optional>
#include <set>
#include <unordered_map>
#include <vector>

@@ -52,6 +53,7 @@ static constexpr SizeType32 kSecondaryLevel = 1;

class KVCacheBlock;
class KVCacheManager;
class KVCacheTransferManager;

using SizeType32 = tensorrt_llm::runtime::SizeType32;
using TokenIdType = tensorrt_llm::runtime::TokenIdType;
@@ -622,13 +624,6 @@ class BlockManager
void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt,
std::optional<std::chrono::milliseconds> durationMs = std::nullopt);

//! \brief Compute pointer to raw KV block (K & V, all layers).
[[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
std::shared_ptr<KVCacheBlock> block, SizeType32 poolIdx) const;

//! \brief Copy content of src block to dst.
void copyBlock(BlockPtr src, BlockPtr dst);

private:
// Number of blocks in pools
SizeType32 mNumPrimaryBlocks;
@@ -667,6 +662,8 @@ class BlockManager
std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
// Event manager
std::shared_ptr<KVCacheEventManager> mEventManager;
// Transfer manager
std::shared_ptr<KVCacheTransferManager> mTransferManager;

// Statistics for block allocations/reuse
// Total number of blocks allocated by all requests
62 changes: 62 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h
@@ -0,0 +1,62 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/cudaEvent.h"

namespace tr = tensorrt_llm::runtime;

#pragma once

namespace tensorrt_llm::batch_manager::kv_cache_manager
{

// The TransferManager accelerates transfers to/from the GPU by overlapping HtoD and DtoH transfers, and tracks ongoing
// transfers in order to avoid race conditions. It is functionally equivalent to the prior approach of putting all
// transfers into the forward pass stream. This is only ever used as a component of a KVCacheManager.
class KVCacheTransferManager
{
public:
explicit KVCacheTransferManager(tr::BufferManager const& bufferManager);

//! \brief Onboard a block to gpu memory.
void onboard(BlockPtr const& offloadBlock, BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools);

//! \brief Offload a block to cpu memory.
void offload(BlockPtr const& block, BlockPtr const& offloadBlock, std::vector<KVCacheBlockPool> const& pools);

//! \brief Synchronize the offload/onboard streams with the bufferManager stream.
void syncTransfers();

private:
//! \brief Get pointer to pool specified by cache block.
static tr::ITensor::SharedPtr computeBlockPointer(
BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools, size_t poolIdx);

//! \brief Copy content of src block to dst.
void copyBlock(
BlockPtr const& src, BlockPtr const& dst, std::vector<KVCacheBlockPool> const& pools, bool isOffload);

runtime::BufferManager mBufferManager;
runtime::BufferManager mOnboardManager;
runtime::BufferManager mOffloadManager;

// Track the block ids offloaded in this iteration.
std::unordered_map<int32_t, tr::CudaEvent> mPendingOffloads;
};

} // namespace tensorrt_llm::batch_manager::kv_cache_manager
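The class comment above describes the core idea: keep onboarding (HtoD) and offloading (DtoH) on their own streams so they overlap, and remember a per-block event so a later onboard of the same block waits for its pending offload. The following is a minimal CUDA sketch of that pattern only (not the class's implementation, which is not shown in this commit); all names, the int block ids, and raw pointers are illustrative, and events are leaked for brevity.

    #include <cuda_runtime.h>
    #include <unordered_map>

    struct TransferStreams
    {
        cudaStream_t onboard;                                  // host-to-device copies
        cudaStream_t offload;                                  // device-to-host copies
        std::unordered_map<int, cudaEvent_t> pendingOffloads;  // blockId -> completion event
    };

    // Offload a block to host memory and record an event marking when the copy finishes.
    void offloadBlock(TransferStreams& ts, int blockId, void* hostDst, void const* devSrc, size_t bytes)
    {
        cudaMemcpyAsync(hostDst, devSrc, bytes, cudaMemcpyDeviceToHost, ts.offload);
        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
        cudaEventRecord(ev, ts.offload);
        ts.pendingOffloads[blockId] = ev;  // keep only the latest offload of this block
    }

    // Onboard a block back to device memory; if it was offloaded earlier this iteration,
    // make the onboard stream wait for that offload before copying.
    void onboardBlock(TransferStreams& ts, int blockId, void* devDst, void const* hostSrc, size_t bytes)
    {
        auto it = ts.pendingOffloads.find(blockId);
        if (it != ts.pendingOffloads.end())
        {
            cudaStreamWaitEvent(ts.onboard, it->second, 0);
        }
        cudaMemcpyAsync(devDst, hostSrc, bytes, cudaMemcpyHostToDevice, ts.onboard);
    }

    // Analogous in spirit to syncTransfers(): make the main stream wait on both transfer streams.
    void syncWithMainStream(TransferStreams& ts, cudaStream_t mainStream)
    {
        cudaEvent_t evOn, evOff;
        cudaEventCreateWithFlags(&evOn, cudaEventDisableTiming);
        cudaEventCreateWithFlags(&evOff, cudaEventDisableTiming);
        cudaEventRecord(evOn, ts.onboard);
        cudaEventRecord(evOff, ts.offload);
        cudaStreamWaitEvent(mainStream, evOn, 0);
        cudaStreamWaitEvent(mainStream, evOff, 0);
    }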
