diff --git a/_cpp_gen/executor.html b/_cpp_gen/executor.html index a71651375..e31e38417 100644 --- a/_cpp_gen/executor.html +++ b/_cpp_gen/executor.html @@ -1,3 +1,5 @@ + +
@@ -6,21 +8,17 @@tensorrt_llm
@@ -144,6 +144,9 @@
tensorrt_llm::executor::Serialization::deserializeKvCacheConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeDynamicBatchConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeSchedulerConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeLookaheadDecodingConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeEagleConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeKvCacheRetentionConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeTokenRangeRetentionConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::Serialization::deserializeDecodingConfig()
tensorrt_llm::executor::Serialization::serialize()
tensorrt_llm::executor::Serialization::serializedSize()
tensorrt_llm::executor::LogitsPostProcessorMap
tensorrt_llm::executor::LogitsPostProcessorBatched
tensorrt_llm::executor::MedusaChoices
tensorrt_llm::executor::EagleChoices
tensorrt_llm::executor::PriorityType
tensorrt_llm::executor::BufferView
tensorrt_llm::executor::DataType
tensorrt_llm::executor::operator<<()
tensorrt_llm::executor::operator<<()
tensorrt_llm::executor::TypeTraits
tensorrt_llm::executor::TypeTraits
-tensorrt_llm::executor::TypeTraits
-tensorrt_llm::executor::PhonyNameDueToError::value
tensorrt_llm::executor::TypeTraits
-tensorrt_llm::executor::TypeTraits
-tensorrt_llm::executor::KvCacheStats
tensorrt_llm::executor::KvCacheStats::maxNumBlocks
tensorrt_llm::executor::KvCacheStats::freeNumBlocks
tensorrt_llm::executor::KvCacheStats::usedNumBlocks
tensorrt_llm::executor::KvCacheStats::tokensPerBlock
tensorrt_llm::executor::KvCacheStats::allocTotalBlocks
tensorrt_llm::executor::KvCacheStats::allocNewBlocks
tensorrt_llm::executor::KvCacheStats::reusedBlocks
tensorrt_llm::executor::StaticBatchingStats
tensorrt_llm::executor::StaticBatchingStats::numScheduledRequests
tensorrt_llm::executor::StaticBatchingStats::numContextRequests
tensorrt_llm::executor::StaticBatchingStats::numCtxTokens
tensorrt_llm::executor::StaticBatchingStats::numGenTokens
tensorrt_llm::executor::StaticBatchingStats::emptyGenSlots
tensorrt_llm::executor::InflightBatchingStats
tensorrt_llm::executor::InflightBatchingStats::numScheduledRequests
tensorrt_llm::executor::InflightBatchingStats::numContextRequests
tensorrt_llm::executor::InflightBatchingStats::numGenRequests
tensorrt_llm::executor::InflightBatchingStats::numPausedRequests
tensorrt_llm::executor::InflightBatchingStats::numCtxTokens
tensorrt_llm::executor::InflightBatchingStats::microBatchId
tensorrt_llm::executor::InflightBatchingStats::avgNumDecodedTokensPerIter
tensorrt_llm::executor::IterationStats
tensorrt_llm::executor::IterationStats::timestamp
tensorrt_llm::executor::IterationStats::iter
tensorrt_llm::executor::IterationStats::iterLatencyMS
tensorrt_llm::executor::IterationStats::newActiveRequestsQueueLatencyMS
tensorrt_llm::executor::IterationStats::numActiveRequests
tensorrt_llm::executor::IterationStats::numQueuedRequests
tensorrt_llm::executor::IterationStats::numCompletedRequests
tensorrt_llm::executor::IterationStats::maxNumActiveRequests
tensorrt_llm::executor::IterationStats::gpuMemUsage
tensorrt_llm::executor::IterationStats::cpuMemUsage
tensorrt_llm::executor::IterationStats::pinnedMemUsage
tensorrt_llm::executor::IterationStats::kvCacheStats
tensorrt_llm::executor::IterationStats::crossKvCacheStats
tensorrt_llm::executor::IterationStats::staticBatchingStats
tensorrt_llm::executor::IterationStats::inflightBatchingStats
tensorrt_llm::executor::DisServingRequestStats
-tensorrt_llm::executor::RequestStats
tensorrt_llm::executor::RequestStats::id
tensorrt_llm::executor::RequestStats::stage
tensorrt_llm::executor::RequestStats::contextPrefillPosition
tensorrt_llm::executor::RequestStats::numGeneratedTokens
tensorrt_llm::executor::RequestStats::avgNumDecodedTokensPerIter
tensorrt_llm::executor::RequestStats::scheduled
tensorrt_llm::executor::RequestStats::paused
tensorrt_llm::executor::RequestStats::disServingStats
tensorrt_llm::executor::RequestStatsPerIteration
-tensorrt_llm::executor::DebugTensorsPerIteration
tensorrt_llm::executor::DebugTensorsPerIteration::iter
tensorrt_llm::executor::DebugTensorsPerIteration::debugTensors
tensorrt_llm::executor::DecodingMode::isLookahead()
tensorrt_llm::executor::DecodingMode::isExplicitDraftTokens()
tensorrt_llm::executor::DecodingMode::isExternalDraftTokens()
tensorrt_llm::executor::DecodingMode::isEagle()
tensorrt_llm::executor::DecodingMode::isUseTemperature()
tensorrt_llm::executor::DecodingMode::isUsePresencePenalty()
tensorrt_llm::executor::DecodingMode::isUseFrequencyPenalty()
tensorrt_llm::executor::DecodingMode::Lookahead()
tensorrt_llm::executor::DecodingMode::ExplicitDraftTokens()
tensorrt_llm::executor::DecodingMode::ExternalDraftTokens()
tensorrt_llm::executor::DecodingMode::Eagle()
tensorrt_llm::executor::DecodingMode::anyBitSet()
tensorrt_llm::executor::DecodingMode::allBitSet()
tensorrt_llm::executor::DecodingMode::setBitTo()
tensorrt_llm::executor::DecodingMode::kLookahead
tensorrt_llm::executor::DecodingMode::kExplicitDraftTokens
tensorrt_llm::executor::DecodingMode::kExternalDraftTokens
tensorrt_llm::executor::DecodingMode::kEagle
tensorrt_llm::executor::DecodingMode::kTopKTopP
tensorrt_llm::executor::DisServingRequestStats
+tensorrt_llm::executor::InflightBatchingStats
tensorrt_llm::executor::InflightBatchingStats::numScheduledRequests
tensorrt_llm::executor::InflightBatchingStats::numContextRequests
tensorrt_llm::executor::InflightBatchingStats::numGenRequests
tensorrt_llm::executor::InflightBatchingStats::numPausedRequests
tensorrt_llm::executor::InflightBatchingStats::numCtxTokens
tensorrt_llm::executor::InflightBatchingStats::microBatchId
tensorrt_llm::executor::InflightBatchingStats::avgNumDecodedTokensPerIter
tensorrt_llm::executor::IterationStats
tensorrt_llm::executor::IterationStats::timestamp
tensorrt_llm::executor::IterationStats::iter
tensorrt_llm::executor::IterationStats::iterLatencyMS
tensorrt_llm::executor::IterationStats::newActiveRequestsQueueLatencyMS
tensorrt_llm::executor::IterationStats::numNewActiveRequests
tensorrt_llm::executor::IterationStats::numActiveRequests
tensorrt_llm::executor::IterationStats::numQueuedRequests
tensorrt_llm::executor::IterationStats::numCompletedRequests
tensorrt_llm::executor::IterationStats::maxNumActiveRequests
tensorrt_llm::executor::IterationStats::maxBatchSizeStatic
tensorrt_llm::executor::IterationStats::maxBatchSizeTunerRecommended
tensorrt_llm::executor::IterationStats::maxBatchSizeRuntime
tensorrt_llm::executor::IterationStats::gpuMemUsage
tensorrt_llm::executor::IterationStats::cpuMemUsage
tensorrt_llm::executor::IterationStats::pinnedMemUsage
tensorrt_llm::executor::IterationStats::kvCacheStats
tensorrt_llm::executor::IterationStats::crossKvCacheStats
tensorrt_llm::executor::IterationStats::staticBatchingStats
tensorrt_llm::executor::IterationStats::inflightBatchingStats
tensorrt_llm::executor::KvCacheStats
tensorrt_llm::executor::KvCacheStats::maxNumBlocks
tensorrt_llm::executor::KvCacheStats::freeNumBlocks
tensorrt_llm::executor::KvCacheStats::usedNumBlocks
tensorrt_llm::executor::KvCacheStats::tokensPerBlock
tensorrt_llm::executor::KvCacheStats::allocTotalBlocks
tensorrt_llm::executor::KvCacheStats::allocNewBlocks
tensorrt_llm::executor::KvCacheStats::reusedBlocks
tensorrt_llm::executor::KvCacheStats::missedBlocks
tensorrt_llm::executor::KvCacheStats::cacheHitRate
tensorrt_llm::executor::RequestStats
tensorrt_llm::executor::RequestStats::id
tensorrt_llm::executor::RequestStats::stage
tensorrt_llm::executor::RequestStats::contextPrefillPosition
tensorrt_llm::executor::RequestStats::numGeneratedTokens
tensorrt_llm::executor::RequestStats::avgNumDecodedTokensPerIter
tensorrt_llm::executor::RequestStats::scheduled
tensorrt_llm::executor::RequestStats::paused
tensorrt_llm::executor::RequestStats::disServingStats
tensorrt_llm::executor::RequestStats::allocTotalBlocksPerRequest
tensorrt_llm::executor::RequestStats::allocNewBlocksPerRequest
tensorrt_llm::executor::RequestStats::reusedBlocksPerRequest
tensorrt_llm::executor::RequestStats::missedBlocksPerRequest
tensorrt_llm::executor::RequestStats::kvCacheHitRatePerRequest
tensorrt_llm::executor::RequestStatsPerIteration
+tensorrt_llm::executor::StaticBatchingStats
tensorrt_llm::executor::StaticBatchingStats::numScheduledRequests
tensorrt_llm::executor::StaticBatchingStats::numContextRequests
tensorrt_llm::executor::StaticBatchingStats::numCtxTokens
tensorrt_llm::executor::StaticBatchingStats::numGenTokens
tensorrt_llm::executor::StaticBatchingStats::emptyGenSlots
tensorrt_llm::executor::TypeTraits
tensorrt_llm::executor::TypeTraits
+tensorrt_llm::executor::TypeTraits
+tensorrt_llm::executor::TypeTraits
+tensorrt_llm::executor::PhonyNameDueToError::value
tensorrt_llm::executor::TypeTraits
+Typedefs
+Functions
@@ -657,361 +706,395 @@Sampling configuration.
-Public Functions
-Constructor for SamplingConfig See description of parameters below.
-Public Types
+Public Functions
Private Members
+This request corresponds to the request ID in the context phase.
+Private Static Functions
+Friends
+Configuration class for debugging output.
+Public Functions
Private Types
+Private Members
+If true, debug all input tensors.
+If true, debug all output tensors.
+If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations, else dump them to files.
+Friends
+Configuration class for the decoding.
+Public Functions
Sets decoding mode. Some modes require the use of their own setters.
+Sets lookahead decoding mode and config.
+Sets medusa mode and config.
+Sets eagle mode and config.
+Private Members
The beam width. Default is 1 which disables beam search.
-Controls number of logits to sample from. Default is 0 (all logits).
-Controls the top-P probability to sample from. Default is 0.f.
-Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.
-Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.
+Friends
+Configuration class for dynamic tuning of batch size and max num tokens. During runtime the statistics of input and output lengths are recorded. Based on these statistics, the batch size and max num tokens are tuned dynamically to better serve the requests.
+Public Functions
+Public Static Attributes
Controls decay in the top-P algorithm. The decay value. Default is 1.f.
+The default window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.
Controls the random seed used by the random number generator in sampling.
+The default value of batch size table.
Private Members
Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.
+Controls if the batch size should be tuned dynamically.
Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
+The window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.
Controls the diversity in beam search.
+A vector of (batchSizeLimit, batchSize). When max capacity batch size is less than.
Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. Default is 1.f.
-Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
-Controls how to penalize longer sequences in beam search. Default is 0.f.
-Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)
-Controls how many repeat ngram size are acceptable. Default is 1 << 30.
-Private Static Functions
-Friends
@@ -1023,113 +1106,33 @@Configuration that controls the outputs of a Result.
-Public Functions
-Public Members
- - -Controls if Result should contain the context logits. Default is false.
-Controls if Result should contain the generation logits. Default is false.
-Configuration for speculative decoding with external draft tokens. Allows to include draft tokens, draft logits and specify acceptance threshold.
-Public Functions
Private Members
- -The draft logits. Expected shape: [num_draft_tokens, vocab_size].
-The acceptance threshold. Must be > 0.f and <= 1.f.
-Use direct transfer for draft logits.
+choices forming tree for EAGLE-1.
Configuration for prompt tuning.
+The executor is responsible for receiving new requests and sending responses, and running the inference.
Public Functions
modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor
comm – An optional inter-process communicator configuration
Private Members
-The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.
-The input token extra ids for KV Cache reuse when p-tuning is enabled.
-Friends
-Configuration for LoRA.
-Public Functions
Private Members
- - -The Lora weights. See TRT-LLM documentation for expected shapes and types.
+Enqueue a new request.
+request – The LLM request which contains input tokens and request parameters
+A unique id that identifies the request
+The Lora configuration. See TRT-LLM documentation for detailed description of the config tensor.
+Enqueue a batch of requests.
Await for ready responses.
+ This overload awaits for any ready responses. In particular, if several requests
+ have been enqueued, this method will provide any ready responses without order guarantees.
+
Friends
-timeout – The maximum time to wait for new responses
+A vector of responses
+Public Functions
Await for ready responses.
+id – A request id
timeout – The maximum time to wait for new responses
A vector of responses
+Await for multiple ready responses.
+ A multiple ID request behaves as if awaitResponses(IdType, timeout)
+ were invoked on all IDs. The returned vector contains
+ a vector of responses per ID in the same order specified by the requestIds.
+ The same behaviour as awaitResponses(IdType, timeout) applies:
+ * Responses may be empty.
+ * If all responses have already been given for one of the requestIds,
+ then this method will hang unless a timeout is specified.
+
requestIds – Ids requested
timeout – The maximum time to wait for new responses
A vector of vector of responses
+Get the number of ready responses.
+requestId – An optional request id
+The number of ready responses
+Cancel the request with provided request id.
+id – The request id for which to cancel the response
+Signals the server to shutdown.
+This call is blocking. Only returns when all requests have terminated or timeout has been reached
+Returns the per-iterations statistics computed since last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.
+Iteration stats
+Returns the request stats of each iteration computed since last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.
+Request stats grouped by iterations
+return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>
+Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors. Contains at most debugTensorsMaxIterations iterations.
+Request debug tensors grouped by iterations
+return true when this
can be executed on resources defined by that
Indicates if the current process is allowed to enqueueRequests.
Public Static Functions
return true when the parameter combination is valid.
+Indicates if the current process participates in this executor instance.
Private Members
Friends
-Public Types
-Configuration class for the model executor.
Public Functions
Private Types
-Private Members
-This request corresponds to the request ID in the context phase.
-Private Static Functions
Friends
-Configuration for speculative decoding (both draft and target models)
-Public Functions
Public Members
-Send logits tensor directly from draft to target model.
-A class that holds information about the request.
-Public Functions
-The Request constructor.
-inputTokenIds – The input token ids
maxTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
positionIds – The input position ids
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]
externalDraftTokensConfig – The speculative decoding configuration
pTuningConfig – The prompt tuning configuration
loraConfig – The LoRA configuration
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor name provided to the ExecutorConfig.
encoderInputTokenIds – The encoder input token ids for encoder-decoder models, or encoder-only models
returnAllGeneratedTokens – Indicates whether to return the full beams or just the newly generated tokens after every streaming step.
priority – Sets the execution priority of this request.
encoderInputFeatures – Encoder input features for multimodal models.
encoderOutputLength – Encoder output length if encoder input and output have different lengths (due to convolution down-sampling, etc.)
type – Indicate the request type for disaggregated serving mode.
contextPhaseParams – Generated token ID from context only executor.
numReturnSequences – The number of returning sequences.
Private Members
+The beam width value of requests that will be sent to the executor.
+The scheduler configuration.
+The KV cache configuration.
+The KV cache configuration.
+Controls if log probabilities should be normalized or not.
+Controls the maximum number of iterations for which to keep statistics.
+Controls the maximum number of iterations for which to keep per-request statistics.
+The type of batching strategy to use. See BatchingType.
+The max batch size of requests.
+The max number of tokens per batch.
+The parallel execution configuration.
+Logits post processor configuration.
+Decoding configuration.
+GPU weights percent for weight streaming.
+The maximum number of requests allowed in queue before rejecting new requests.
+Config for perf knobs that can be set in runtime.
+Debugging configuration.
+The time in ms between polls for new communication in orchestrator mode. Use 0 for busy loop.
+The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default is 3 minutes.
+The speculative decoding configuration.
+Friends
+Configuration class for the runtime perf knobs.
+Public Functions
Public Static Attributes
+Private Members
Control if multi block mode should be enabled or not.
+Private Members
Control if enable cuda graph.
+Number of cuda graphs to be cached in the runtime. The larger the cache, the better the perf, but more GPU memory is consumed.
+Struct that holds the logits information when using direct transfer.
+Configuration for speculative decoding with external draft tokens. Allows to include draft tokens, draft logits and specify acceptance threshold.
Public Members
+Public Functions
+Struct that holds the generation result.
-Public Members
-Indicates if this is the final result for the request.
-The output tokens for each beam.
-The cumulative log probabilities. Size beamSize.
-The log probabilities for each generated token. Size [beamSize, outputLen].
-The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)
-Logits information for direct transfer when using fast logits.
-The reason why the model stopped generating tokens for each beam in this request. Size [beamSize]. Currently only supported when beamSize is 1 and when using BatchingType::kINFLIGHT.
-The params of the context phase.
-The decoding iterations it takes.
+The draft logits. Expected shape: [num_draft_tokens, vocab_size].
The index of the output sequence where 0 <= sequenceIndex < numReturnSequences.
+The acceptance threshold. Must be > 0.f and <= 1.f.
Indicates if this is the final result for a given sequence in the request.
-Class that holds either an error or a result.
-Public Functions
-Get the client id of the request for which this response was generated.
-Indicates if this response has an error or not.
-Get the error msg for this response Will throw an exception if hasError is false.
-Private Members
-Friends
@@ -2049,56 +1905,31 @@Configuration class for the scheduler.
+Class with utility functions to serialize statistics to json string.
Public Functions
-Public Static Functions
Utility function to convert an iterationStats struct to a json serialized string.
+Private Members
-The capacity scheduler policy. See CapacitySchedulerPolicy.
+Utility function to convert a requestStatsPerIteration struct to a json serialized string.
The context chunking policy. See ContextChunkingPolicy.
+Utility function to convert a requestStats struct to a json serialized string.
Friends
-Public Functions
Private Members
@@ -2246,6 +2102,18 @@Only blocks with priority > mSecondaryOfflineMinPriority can be offloaded to secondary memory.
+Max size of the KV cache event buffer.
+Friends
@@ -2257,388 +2125,499 @@Configuration class for the runtime perf knobs.
-Public Functions
-Public Members
+The amount of blocks at each cache level.
+Public Functions
Public Members
+ -The data corresponding to this event.
+Exposes a limited set of KV cache manager functionalities.
+Public Functions
Get the latest KV Cache events.
+timeout – The maximum time to wait for new events. If nullopt, will only return when new events are available, or when the executor instance has shutdown.
+Private Members
Control if multi block mode should be enabled or not.
-If enable FMHA runner FP32 accumulation.
-Control if enable cuda graph.
+Public Members
Number of cuda graphs to be cached in the runtime. The larger the cache, the better the perf, but more GPU memory is consumed.
+The hashes of blocks being removed.
Friends
-Configuration class for debugging output.
+Configuration for the request’s retention in the KV Cache.
Public Functions
Convert the token range data into an entry per kv block. Returns a tuple of vectors corresponding to the priorities and durations for each block.
+Public Static Attributes
+Private Members
+The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64), (100, 128), (70, 80)] is valid, whereas [(0, 64), (60, 128)] is not.
+The priority level to assign to blocks allocated in the decode phase.
+The duration in ms that decode blocks should remain at their assigned priority level.
+A single entry to set block priorities over a token range. Earlier ranges always take priority over later ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.
+Public Functions
Private Types
-Public Members
+The first token of this range.
+The final token of this range. The end is not included in the range. This can be set to std::nullopt to extend the range to the end of the sequence.
+The priority of this token range. Higher priorities are less likely to be evicted or offloaded.
+The duration in ms that the block should remain at the given priority level. Set to std::nullopt to have no expiration time, and keep the block at the given priority level until it gets reclaimed. After the duration has passed, the block will be moved back to the kDefaultRetentionPriority
level.
An entry for a single block stored into the tree.
+Public Functions
+Private Members
+Public Members
If true, debug all input tensors.
+The hash of the block.
If true, debug all output tensors.
+The unique tokens of the block.
If not empty, only debug tensors in this list.
+The Lora task id of the block.
If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations, else dump them to files.
+The cache level of the block.
+The priority of the block.
Friends
-Public Members
+ + +A sequence of blocks. The parent of block i
is block i-1
Public Functions
+Public Members
+ + +The updated value of the cacheLevel field.
+The updated value of the priority field.
+Public Functions
Private Members
mapping from post processor names to non-batched post processors
+single batched post processor
+If set to true, logits post processor will run on all TP ranks in last PP rank.
+A configuration class for the parallel execution parameters Currently only supports commType = CommunicationType::kMPI.
-Public Functions
Constructor.
-commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.
return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>
+return true when this
can be executed on resources defined by that
Public Static Functions
return true when the parameter combination is valid.
+Private Members
The type of communication protocol used. Default is MPI.
-The mode of communication. See CommunicationMode.
-The GPU device ids to use for executing this model.
-The participant ids (MPI ranks for example) used for executing this model.
-Optional orchestrator configuration.
-config for PeftCacheManager
+Configuration for LoRA.
Public Functions
Private Members
+ + + + + + +Friends
+Public Functions
Private Members
Configuration that controls the outputs of a Result.
+Public Functions
+Public Members
Controls if Result should contain log probabilities. Default is false.
+Controls if Result should contain the context logits. Default is false.
+Controls if Result should contain the generation logits. Default is false.
+Controls if output tokens in Result should include the input tokens. Default is false.
+Configuration class for the decoding.
+A configuration class for the parallel execution parameters Currently only supports commType = CommunicationType::kMPI.
Public Functions
Sets decoding mode. Some modes require the use of their own setters.
+Constructor.
+commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.
Sets lookahead decoding mode and config.
-Sets medusa mode and config.
-Private Members
-Friends
-Public Functions
Private Members
mapping from post processor names to non-batched post processors
+The type of communication protocol used. Default is MPI.
single batched post processor
+The mode of communication. See CommunicationMode.
If set to true, logits post processor will run on all TP ranks in last PP rank.
+The GPU device ids to use for executing this model.
+The participant ids (MPI ranks for example) used for executing this model.
+Optional orchestrator configuration.
Friends
+Configuration class for the model executor.
+config for PeftCacheManager
Public Functions
Private Members
+Friends
+Configuration for prompt tuning.
+Public Functions
Private Members
+The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.
+The input token extra ids for KV Cache reuse when p-tuning is enabled.
+Friends
+A class that holds information about the request.
+Public Functions
The Request constructor.
+inputTokenIds – The input token ids
maxTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
positionIds – The input position ids
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]
externalDraftTokensConfig – The speculative decoding with external draft tokens configuration
pTuningConfig – The prompt tuning configuration
loraConfig – The LoRA configuration
lookaheadConfig – The lookahead speculative decoding configuration
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor
kvCacheRetentionConfig – The configuration used for KV cache block eviction. name provided to the ExecutorConfig.
encoderInputTokenIds – The encoder input token ids for encoder-decoder models, or encoder-only models
returnAllGeneratedTokens – Indicates whether to return the full beams or just the newly generated tokens after every streaming step.
priority – Sets the execution priority of this request.
encoderInputFeatures – Encoder input features for multimodal models.
encoderOutputLength – Encoder output length if encoder input and output have different lengths (due to convolution down-sampling, etc.)
crossAttentionMask – Cross attention mask.
type – Indicate the request type for disaggregated serving mode.
contextPhaseParams – Generated token ID from context only executor.
numReturnSequences – The number of returning sequences.
eagleConfig – The EAGLE speculative decoding configuration
skipCrossAttnBlocks – Skip the cross attention transformer blocks or not.
Private Members
-The beam width value of requests that will be sent to the executor.
-The scheduler configuration.
-The KV cache configuration.
-The KV cache configuration.
-Controls if log probabilities should be normalized or not.
-Controls the maximum number of iterations for which to keep statistics.
-Controls the maximum number of iterations for which to keep per-request statistics.
-The type of batching strategy to use. See BatchingType.
-The max batch size of requests.
-The max number of tokens per batch.
-The parallel execution configuration.
-Logits post processor configuration.
-Decoding configuration.
-GPU weights percent for weight streaming.
-The maximum number of requests allowed in queue before rejecting new requests.
-Config for perf knobs that can be set in runtime.
-Debugging configuration.
-The time in ms between polls for new communication in orchestrator mode. Use 0 for busy loop.
-The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default is 3 minutes.
-The speculative decoding configuration.
-Friends
-The executor is responsible for receiving new requests and sending responses, and running the inference.
-Public Functions
modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor
comm – An optional inter-process communicator configuration
Enqueue a new request.
-request – The LLM request which contains input tokens and request parameters
-A unique id that identifies the request
-Enqueue a batch of requests.
-Await for ready responses.
- This overload awaits for any ready responses. In particular, if several requests
- have been enqueued, this method will provide any ready responses without order guarantees.
-
timeout – The maximum time to wait for new responses
-A vector of responses
-Await for ready responses.
-id – A request id
timeout – The maximum time to wait for new responses
A vector of responses
-Await for multiple ready responses.
- A multiple ID request behaves as if awaitResponses(IdType, timeout)
- were invoked on all IDs. The returned vector contains
- a vector of responses per ID in the same order specified by the requestIds.
- The same behaviour as awaitResponses(IdType, timeout) applies:
- * Responses may be empty.
- * If all responses have already been given for one of the requestIds,
- then this method will hang unless a timeout is specified.
-
requestIds – Ids requested
timeout – The maximum time to wait for new responses
A vector of vector of responses
-Get the number of ready responses.
-requestId – An optional request id
-The number of ready responses
-Cancel the request with provided request id.
-id – The request id for which to cancel the response
-Signals the server to shutdown.
-This call is blocking. Only returns when all requests have terminated or timeout has been reached
-Returns the per-iterations statistics computed since last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.
-Iteration stats
-Returns the request stats of each iteration computed since last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.
-Request stats grouped by iterations
-Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors. Contains at most debugTensorsMaxIterations iterations.
-Request debug tensors grouped by iterations
-Indicates if the current process is allowed to enqueueRequests.
-Public Static Attributes
+Friends
+Class with utility functions to serialize statistics to json string.
+Class that holds either an error or a result.
Public Static Functions
-Utility function to convert an iterationStats struct to a json serialized string.
-Utility function to convert a requestStatsPerIteration struct to a json serialized string.
-Utility function to convert a requestStats struct to a json serialized string.
-Public Static Functions
-Public Functions
Get the id of the request for which this response was generated.
+Get the client id of the request for which this response was generated.
+Indicates if this response has an error or not.
+Get the error msg for this response. Will throw an exception if hasError is false.
+Get the result for this response. Will throw an exception if hasError is true.
+Private Members
+Friends
+Struct that holds the generation result.
+Public Members
+Indicates if this is the final result for the request.
+The output tokens for each beam.
+The cumulative log probabilities. Size beamSize.
+The log probabilities for each generated token. Size [beamSize, outputLen].
+The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)
+Logits information for direct transfer when using fast logits.
+The reason why the model stopped generating tokens for each beam in this request. Size [beamSize]. Currently only supported when beamSize is 1 and when using BatchingType::kINFLIGHT.
+The params of the context phase.
+The decoding iterations it takes.
+The index of the output sequence of this result where 0 <= sequenceIndex < numReturnSequences. In beam search (beamWidth > 1), this index will be always zero because all beams to be returned are included in this result.
+Indicates if this is the final result for a given sequence in the request. In beam search (beamWidth > 1), the value will always be equal to the value of isFinal.
+Public Functions
Public Members
+Sampling configuration.
+Public Functions
+Constructor for SamplingConfig See description of parameters below.
+Private Functions
Private Members
+The beam width. Default is 1 which disables beam search.
+Controls number of logits to sample from. Default is 0 (all logits).
+Controls the top-P probability to sample from. Default is 0.f.
+Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.
+Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.
+Controls decay in the top-P algorithm. The decay value. Default is 1.f.
+Controls the random seed used by the random number generator in sampling.
+Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.
+Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
+Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. Default is 1.f.
+Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
+Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.
+Controls how to penalize longer sequences in beam search. Default is 0.f.
+Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)
+Controls how many repeat ngram size are acceptable. Default is 1 << 30.
+The number of return sequences or beams. In beam search, the value should be less than or equal to mBeamWidth. In sampling, it specifies the total number of independently generated sequences.
+The number of beams to return. It is equal to beamWidth unless numReturnSequences is set. If beamWidth > 1 and numReturnSequences is set, then numReturnBeams is equal to numReturnSequences.
+Private Static Functions
Friends
+Configuration class for the scheduler.
+Public Functions
Private Members
+The capacity scheduler policy. See CapacitySchedulerPolicy.
The context chunking policy. See ContextChunkingPolicy.
The config for tuning batch size dynamically. See DynamicBatchSizeConfig.
Friends
+Configuration for speculative decoding (both draft and target models)
Public Functions
Public Members
+Send logits tensor directly from draft to target model.
+Struct that holds the logits information when using direct transfer.
+ + +Public Types
Public Functions
+ + + +Public Static Functions
Returns a pointer to underlying array.
-Returns a pointer to underlying array.
-Returns the memory type of the buffer.
-Returns the number of elements in the tensor.
-Returns the size of the tensor in bytes.
-Set the entire memory to zero.
-stream – Must be a valid CUDA stream if the memory type is GPU.
-Copy the data and shape from another tensor.
-other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Functions
Allocate a cpu tensor with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pinned memory with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a tensor in managed memory (UVM) with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
-shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
Wrap a data pointer into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap a data pointer into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap any container into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Private Types
-Private Functions
Private Members
-Private Static Functions
-Typedefs
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Typedefs
-Enums
-Values:
-Values:
-Values:
-Values:
-The batching type.
-Values:
-STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.
-INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.
-The policy used to select the subset of available requests in each iteration of the executor generation loop.
-Values:
-MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.
-GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.
-kSTATIC_BATCH does not schedule new requests until all requests in current batch are completed. Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction.
-Values:
-Sequential chunking, complete the unfinished context phase first.
-Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.
Public Types
+Public Functions
+Active request in encoder phase.
-Public Types
+Active request in context phase.
-Public Functions
+Active request in generation phase.
-Active request for which generation has completed.
-The reason why the model stopped generating tokens for a request.
-Values:
-The request is not finished.
-The request finished because the end id was generated.
-The request finished because a stop word was generated.
-The request finished because the maximum number of tokens was reached.
-Functions
For converting a C++ data type to a TrtLlmDataType.
Returns a pointer to underlying array.
Public Static Attributes
- +Returns a pointer to underlying array.
+Public Static Attributes
- +Returns the memory type of the buffer.
+Public Static Attributes
- +Returns the number of elements in the tensor.
+Returns the size of the tensor in bytes.
Public Static Attributes
- +Set the entire memory to zero.
+stream – Must be a valid CUDA stream if the memory type is GPU.
+Copy the data and shape from another tensor.
+other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Attributes
-Public Static Functions
+Allocate a cpu tensor with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Public Static Attributes
-Allocate a cpu tensor in pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Public Static Attributes
-Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Struct that holds the stats of a KV cache manager.
-Public Members
-Max number of blocks.
-Number of free blocks.
+Allocate a tensor in managed memory (UVM) with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Number of used blocks.
-Number of tokens per block.
+Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
+shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
Number of total allocated block.
+Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Number of newly allocated block.
+Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Number of reused block.
+Private Types
+ -Struct that holds the stats of static batching models for a single iteration.
+Public Members
-Number of scheduled requests.
-Private Functions
+Number of requests in context stage.
-Private Members
Total number of context tokens in the iteration.
-Total number of tokens to generate in the iteration.
-Private Static Functions
+ -Total number of unused generation token slots.
-Struct that holds the stats of inflight batching models for a single iteration.
+Typedefs
+Public Members
-Number of scheduled requests.
-Functions
+ -Number of requests in context stage.
-Number of requests in generation stage.
+Number of paused requests.
Total number of context tokens in the iteration.
-Index of micro batch.
Struct that holds the stats of a single iteration.
-Public Members
-Ending time of this iteration.
+Public Static Attributes
Iteration id.
+The total time spent in queue by the requests that became active in this iteration (ms)
+Typedefs
+ + +Enums
+Values:
+Number of active requests.
+Values:
+Values:
+Values:
+The batching type.
+Values:
+STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.
+INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.
+The policy used to select the subset of available requests in each iteration of the executor generation loop.
+Values:
+MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.
+GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.
+kSTATIC_BATCH does not schedule new requests until all requests in current batch are completed. Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction.
+Values:
+Sequential chunking, complete the unfinished context phase first.
+Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.
+Values:
+Enum class that represents the state of a request.
+Values:
+Request that have been received but not yet included in the active requests (due to constraints such as maximum batch size for example).
+Active request in encoder phase.
+Active request in context phase.
+Active request in generation phase.
+Active request for which generation has completed.
+The reason why the model stopped generating tokens for a request.
+Values:
+The request is not finished.
+The request finished because the end id was generated.
+The request finished because a stop word was generated.
+The request finished because the maximum number of tokens was reached.
+Functions
+Struct that holds the debug tensors in an iteration.
+Public Members
+The iteration id for these tensors.
+mode of the decoder
+Public Types
+Public Functions
+Public Static Functions
+No mode specified. Config will be determined from the beam width of the first request at runtime TopKTopP if beamWidth == 1, BeamSearch otherwise.
+Private Functions
+Private Members
+Private Static Attributes
+Number of queued requests.
-Number of requests that were completed in this iteration.
-Number of max active requests.
-GPU memory usage in bytes.
-CPU memory usage in bytes.
-Pinned memory usage in bytes.
-Stats specific to KV caches.
-Stats specific to cross KV caches.
-Stats specific to static batching.
-Stats specific to inflight batching.
-Struct that holds the request stats in the case of disaggregated serving.
-Public Members
The total time spent on transferring KV cache from context phase to generation phase (ms)
-Struct that holds the stats of a single request.
-Public Members
+The current stage the request is in.
-If using chunked context, the current context prefill position.
-The number of generated tokens so far.
-The average number of decoded tokens per iteration. It is >= 1 for speculative decoding.
-Whether the request is scheduled for the current iteration.
-Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks exhaustion for example)
-Stats specific to disaggregated serving.
-Struct that holds the stats of all requests in an iteration.
+Struct that holds the request stats in the case of disaggregated serving.
Public Members
The iteration id for these stats.
-The stats of all active requests for this iteration.
+The total time spent on transferring KV cache from context phase to generation phase (ms)
Struct that holds the debug tensors in an iteration.
+Struct that holds the stats of inflight batching models for a single iteration.
Public Members
The iteration id for these tensors.
+Number of scheduled requests.
Number of requests in context stage.
mode of the decoder
-Public Types
-Public Functions
-Number of requests in generation stage.
+Number of paused requests.
+Total number of context tokens in the iteration.
+Index of micro batch.
+Average number of tokens decoded per request per iteration.
+Struct that holds the stats of a single iteration.
+Public Members
+Ending time of this iteration.
+Iteration id.
+Iteration latency (ms)
+The total time spent in queue by the requests that became active in this iteration (ms)
+Number of new fetched active requests.
+Number of active requests.
+Number of queued requests.
+Number of requests that were completed in this iteration.
+Number of max active requests.
+Static max batch size passed to the executor.
+Batch size produced by dynamic tuner based on input stats.
+@brief The min of maxBatchSizeStatic and maxBatchSizeRuntimeUpperbound
+GPU memory usage in bytes.
+CPU memory usage in bytes.
+Pinned memory usage in bytes.
+Stats specific to KV caches.
+Stats specific to cross KV caches.
+Stats specific to static batching.
+Stats specific to inflight batching.
+Struct that holds the stats of a KV cache manager.
Public Static Functions
-No mode specified. Config will be determined from the beam width of the first request at runtime TopKTopP if beamWidth == 1, BeamSearch otherwise.
+Public Members
+Max number of blocks.
Number of free blocks.
+Number of used blocks.
+Number of tokens per block.
+Number of total allocated block.
+Number of newly allocated block.
+Number of reused block.
+Number of not reused block.
+Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
+Struct that holds the stats of a single request.
Private Functions
-Public Members
+ -The current stage the request is in.
+If using chunked context, the current context prefill position.
+Private Members
The number of generated tokens so far.
+Private Static Attributes
The average number of decoded tokens per iteration. It is >= 1 for speculative decoding.
+Whether the request is scheduled for the current iteration.
+Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks exhaustion for example)
+Stats specific to disaggregated serving.
+Number of total allocated blocks per request.
+Number of newly allocated blocks per request.
+Number of reused blocks per request.
+Number of missed blocks per request.
+KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)
+Struct that holds the stats of all requests in an iteration.
+Public Members
The iteration id for these stats.
+The stats of all active requests for this iteration.
+Struct that holds the stats of static batching models for a single iteration.
+Public Members
Number of scheduled requests.
+Number of requests in context stage.
+Total number of context tokens in the iteration.
+Total number of tokens to generate in the iteration.
+Total number of unused generation token slots.
+For converting a C++ data type to a TrtLlmDataType.
Public Static Attributes
Public Static Attributes
Public Static Attributes
Public Static Attributes
Public Functions
+Public Members
+[batchSize, maxDecodingDraftTokens]
+[batchSize]
+[batchSize, maxDecodingTokens, maxPathLen]
+[batchSize, maxNumPaths, maxPathLen]
+[batchSize]
+[batchSize, maxDecodingTokens, maxPathLen]
+[batchSize, maxPathLen]
+[batchSize]
+[batchSize]
+[batchSize]
+Public Static Attributes
Public Types
+Public Functions
+Public Types
Public Functions
Public Types
-Subclassed by tensorrt_llm::runtime::GptDecoder< T >
-Public Types
Public Functions
Public Static Functions
-Private Members
+Subclassed by tensorrt_llm::runtime::GptDecoder< T >
+Public Types
Public Functions
Private Members
-Public Static Functions
+Setup buffers for Eagle decoding.
+Initialize the decoder with new batch of inputs.
Initialize batched decoder at seqSlots with a new requests
.
[batchSize], indicators of finished requests
@@ -3895,7 +4040,7 @@[batchSize], the number of generation steps executed on each request
@@ -3996,8 +4141,8 @@Initialize the decoder at batchSlot
with a new request
.
Setups decoder internal tensors for new speculative decoding request.
Setups decoder internal tensors for new Eagle request.
+Sets inputs for eagle decoding.
+Public Functions
Gets a typed pointer to the constant underlying data of the buffer.
Gets a typed pointer to the underlying data of the buffer.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
Utility function to print a buffer.
A wrapper around nvinfer1::DataType
that provides a support for pointer types.
Public Functions
+Private Members
+ -Public Static Attributes
-Public Types
+Public Functions
+ -Public Static Attributes
-Public Types
Public Static Attributes
Public Types
Public Static Attributes
Public Types
Public Types
Public Types
Public Types
Public Types
Public Types
Public Types
A wrapper around nvinfer1::DataType
that provides a support for pointer types.
Subclassed by tensorrt_llm::runtime::ITensor
Public Functions
-Public Static Attributes
- +Public Functions
+Returns a pointer to underlying array.
+Private Members
- +Returns a pointer to underlying array.
+Returns a pointer to the underlying array at a given element index.
+Returns a pointer to the underlying array at a given element index.
+Returns the size (in number of elements) of the buffer.
For converting a C++ data type to a TensorRT data type.
+Returns the size (in bytes) of the buffer.
Public Static Attributes
- +Returns the capacity of the buffer.
+Public Static Attributes
-Returns the memory type of the buffer.
Public Static Attributes
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
Public Static Attributes
-Releases the buffer. It will be reset to nullptr.
Public Static Attributes
-Public Static Attributes
-Public Static Functions
+Creates a sliced view on the underlying buffer
. The view will have the same data type as buffer
.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer
.
Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Private Static Attributes
-Subclassed by tensorrt_llm::runtime::ITensor
-Public Types
- - -Public Functions
-Returns a pointer to underlying array.
-Returns a pointer to underlying array.
-Returns a pointer to the underlying array at a given element index.
-Returns a pointer to the underlying array at a given element index.
-Returns the size (in number of elements) of the buffer.
-Returns the size (in bytes) of the buffer.
-Returns the capacity of the buffer.
-Returns the memory type of the buffer.
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-Releases the buffer. It will be reset to nullptr.
-Public Static Functions
-Creates a sliced view on the underlying buffer
. The view will have the same data type as buffer
.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer
.
Public Functions
- - -Public Members
- - -Initialize the decoder with new batch of inputs.
Utility function to print a shape.
Utility function to print a tensor with its shape.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
Returns the tensor dimensions.
Not allowed to copy.
Functions
+Public Functions
+Public Functions
-Public Types
-Public Functions
+Public Functions
Functions
+Caches LoRA weights with LRU eviction policy.
+Tasks put in the cache are marked in progress and can not be evicted, until they are marked done.
+A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.
+Conceptually a slot corresponds to a r=1, 1-layer, 1-module set of in/out weights. Page width is set to the number of weights in smallest module.
+The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module)
+Cache pages are allocated on one or more blocks
+Public Types
Public Functions
param[in] pageManagerConfig: a LoraCachePageManagerConfig param[in] modelConfig: a ModelConfig param[in] worldConfig: a WorldConfig param[in] bufferManager: a BufferManager only used to allocate page blocks
+put a task in the cache, and claim pages for it, and optionally load task weights.
+taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
load – [in] if true load weights before returning, otherwise do not
Public Members
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -Public Functions
-Private Members
-Functions
-Subclassed by tensorrt_llm::runtime::LoraCacheFullException
- -Holds memory of lora cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or device
-Note that this class is not thread safe
- -Public Functions
-config – [in] a LoraCachePageManagerConfig
bufferManager – [in] a BufferManager used to allocate page blocks
claim pages
-numPages – [in] number of pages to claim
-a tuple, where the first values is a boolean indicating whether pages were claimed. If the first value is true the second value will have a list of pageIds
-get number of available (free) pages in manager
-number of free pages in manager
-release given pages
-pages – [in] list of pages to release (free)
-return pointer to given page block
-blockIdx – [in]
-— pointer to page block
-return pointer to given page
-pageIdx – [in]
-— const pointer to page
-Private Functions
-Caches LoRA weights with LRU eviction policy.
-Tasks put in the cache are marked in progress and can not be evicted, until they are marked done.
-A cache page holds an optimally sized LoRA. A page is of size [numSlots x pageWidth]. An optimally sized LoRA is one that has the configured optimalAdapterSize.
-Conceptually a slot corresponds to a r=1, 1-layer, 1-module set of in/out weights. Page width is set to the number of weights in smallest module.
-The number of slots per page is then ceilDiv(num weights in optimally sized LoRA, num weights in smallest module)
-Cache pages are allocated on one or more blocks
-Public Types
- - -Public Functions
-param[in] pageManagerConfig: a LoraCachePageManagerConfig param[in] modelConfig: a ModelConfig param[in] worldConfig: a WorldConfig param[in] bufferManager: a BufferManager only used to allocate page blocks
-put a task in the cache, and claim pages for it, and optionally load task weights.
-taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
load – [in] if true load weights before returning, otherwise do not
load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false
-taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
load task weights. This method must be called after put. It is designed to be called asynchronously after put returns with load = false
+taskId – [in] the task id
weights – [in] lora weights tensor
config – [in] lora config tensor
taskId – [in] the task id
@@ -8404,7 +8225,7 @@Copy task weights to cache pages.
claim numPages, evicting tasks if needed
Internal helper method used inside copyTask. Not thread safe on its own
Holds memory of lora cache pages, and manages allocation and freeing of whole pages. Memory is pre-allocated either on the host or device
+Note that this class is not thread safe
+ +Public Functions
+config – [in] a LoraCachePageManagerConfig
bufferManager – [in] a BufferManager used to allocate page blocks
claim pages
+numPages – [in] number of pages to claim
+a tuple, where the first value is a boolean indicating whether pages were claimed. If the first value is true the second value will have a list of pageIds
+get number of available (free) pages in manager
+number of free pages in manager
+release given pages
+pages – [in] list of pages to release (free)
+return pointer to given page block
+blockIdx – [in]
+— pointer to page block
+return pointer to given page
+pageIdx – [in]
+— const pointer to page
+Private Functions
+Subclassed by tensorrt_llm::runtime::LoraCacheFullException
+ +Private Types
-Private Functions
-Private Members
-Private Static Attributes
-Public Members
-Private Members
Public Functions
+Values:
-Values:
-Public Functions
Public Static Functions
Public Static Attributes
+Private Members
+Public Members
+Public Types
+ -Public Static Functions
-Public Functions
Public Static Attributes
+Public Members
-Private Members
Public Types
+ -Public Functions
+Public Types
+ + +Public Functions
+Public Members
+Private Members
Public Types
+Public Functions
+Public Members
Public Members
Defines
+Public Types
-Public Functions
+Public Functions
Public Members
-Public Members
Public Types
-Public Functions
-Public Types
- - -Public Functions
-Public Members
Private Members
-Private Functions
+Private Static Functions
+Public Types
Public Functions
Public Members
-Public Static Functions
+Defines
-Public Functions
Private Functions
Private Members
+Public Members
+Private Static Attributes
Subclassed by tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
+Public Functions
+max number of draft tokens that can be accepted by one step of the decoder
++one more than draft path len for prediction from primary head
+max number of tokens that a request can grow in one step of the decoder
+max number of draft tokens processed by one step of the decoder
++one more than decoding draft tokens for prediction from primary head
+max number of tokens processed by one step of the decoder
+Private Functions
+Private Members
Private Types
-Public Functions
+ -Private Functions
-Private Static Functions
Public Types
-Public Functions
+Public Functions
Is my rank the last rank in its pipeline?
+Public Static Functions
Private Functions
-Public Static Functions
Private Members
+Public Static Attributes
Private Static Attributes
+Private Members
Subclassed by tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
+Public Types
+Public Functions
Public Members
+ -max number of draft tokens that can be accepted by one step of the decoder
--one more than draft path len for prediction from primary head
-max number of tokens that a request can grow in one step of the decoder
-Private Functions
max number of draft tokens processed by one step of the decoder
--one more than decoding draft tokens for prediction from primary head
-max number of tokens processed by one step of the decoder
-Private Members
+Public Members
+ + + + + + + + + + + + +Public Functions
Private Functions
-Public Members
+ + + + +[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numSequences, maxNumPaths, maxPathDraftLen]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+Private Members
[maxBatchSize] or [numSequences]
+[maxBatchSize] or [numSequences]
+[maxBatchSize] or [numSequences]
+[maxBatchSize] or [numSequences]
+Public Functions
Public Members
+ -Public Types
+Public Functions
Public Members
+ -Is my rank the last rank in its pipeline?
-Public Static Functions
-Public Static Attributes
Private Members
Public Functions
+Public Members