From 89ba1b1a67d570e41b03da87e5518eaff0d31fbf Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 7 May 2024 23:34:28 +0800 Subject: [PATCH] Update TensorRT-LLM (#1554) --- .gitattributes | 2 + .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .gitignore | 2 +- README.md | 6 +- benchmarks/cpp/README.md | 7 +- benchmarks/cpp/gptManagerBenchmark.cpp | 52 +- benchmarks/cpp/gptSessionBenchmark.cpp | 46 +- benchmarks/python/README.md | 8 + benchmarks/python/allowed_configs.py | 2 + benchmarks/python/benchmark.py | 55 +- benchmarks/python/build.py | 39 +- benchmarks/python/check_accuracy_mlperf.py | 163 +++ benchmarks/python/gpt_benchmark.py | 6 +- benchmarks/python/mem_monitor.py | 14 +- cpp/CMakeLists.txt | 63 +- .../tensorrt_llm/batch_manager/GptManager.h | 5 + .../tensorrt_llm/batch_manager/llmRequest.h | 61 +- .../batch_manager/peftCacheManager.h | 4 +- cpp/include/tensorrt_llm/runtime/gptSession.h | 12 +- cpp/include/tensorrt_llm/runtime/ipcUtils.h | 35 +- cpp/include/tensorrt_llm/runtime/loraCache.h | 4 +- .../runtime/loraCachePageManagerConfig.h | 4 +- .../tensorrt_llm/runtime/modelConfig.h | 65 +- .../runtime/speculativeDecodingMode.h | 141 ++ cpp/tensorrt_llm/CMakeLists.txt | 67 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../tensorrt_llm_batch_manager_static.lib | 4 +- cpp/tensorrt_llm/common/cudaDriverWrapper.cpp | 28 + cpp/tensorrt_llm/common/cudaDriverWrapper.h | 16 +- .../common/customAllReduceUtils.h | 6 +- cpp/tensorrt_llm/common/envUtils.cpp | 19 + cpp/tensorrt_llm/common/envUtils.h | 3 + .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../tensorrt_llm_executor_static.lib | 4 +- .../executor_worker/executorWorker.cpp | 2 +- .../fused_multihead_attention_v2.h | 24 +- cpp/tensorrt_llm/kernels/cumsumLastDim.cu | 74 ++ cpp/tensorrt_llm/kernels/cumsumLastDim.h | 35 + .../moe_gemm/moe_gemm_kernels_template.h | 3 +- .../kernels/decoderMaskedMultiheadAttention.h | 4 + .../CMakeLists.txt | 3 + .../cubin/xqa_kernel_cubin.h | 1165 ++++++++--------- .../decoderMaskedMultiheadAttentionLaunch.h | 6 +- .../decoderMaskedMultiheadAttentionTemplate.h | 15 +- .../decoderXQAImpl.cpp | 7 +- .../decoderXQAImpl.h | 8 +- .../decoderXQAImplCommon.h | 311 +++++ .../decoderXQAImplJIT/compileEngine.cpp | 85 ++ .../decoderXQAImplJIT/compileEngine.h | 47 + .../decoderXQAImplJIT/cubinObj.cpp | 107 ++ .../decoderXQAImplJIT/cubinObj.h | 55 + .../decoderXQAImplJIT/cubinObjRegistry.h | 144 ++ .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 305 +++++ .../decoderXQAImplJIT/decoderXQAImplJIT.h | 69 + .../libtensorrt_llm_nvrtc_wrapper.so | 3 + .../aarch64-linux-gnu/version.txt | 2 + .../nvrtcWrapper/include/nvrtcWrapper.h | 82 ++ .../libtensorrt_llm_nvrtc_wrapper.so | 3 + .../tensorrt_llm_nvrtc_wrapper.dll | 3 + .../tensorrt_llm_nvrtc_wrapper.lib | 3 + .../decoderXQAImplJIT/serializationUtils.h | 52 + .../decoderXQAImplPrecompiled.cpp | 264 +--- .../decoderXQAImplPrecompiled.h | 2 +- .../decoderXQARunner.cpp | 79 +- .../decoderXQARunner.h | 60 +- .../xqaParams.h | 10 +- .../decoderMaskedMultiheadAttentionUtils.h | 280 ++-- cpp/tensorrt_llm/kernels/decodingKernels.cu | 6 +- 
cpp/tensorrt_llm/kernels/gptKernels.h | 7 +- .../kernels/mixtureOfExperts/moe_kernels.cu | 153 ++- .../kernels/mixtureOfExperts/moe_kernels.h | 58 +- cpp/tensorrt_llm/kernels/penaltyTypes.h | 3 +- cpp/tensorrt_llm/kernels/selectiveScan.cu | 47 +- .../kernels/unfusedAttentionKernels.cu | 25 +- .../kernels/unfusedAttentionKernels.h | 4 +- .../unfusedAttentionKernels_2_template.h | 56 +- .../layers/lookaheadDecodingUtils.cpp | 75 ++ .../layers/lookaheadDecodingUtils.h | 17 + .../layers/lookaheadPoolManager.cpp | 83 ++ .../layers/lookaheadPoolManager.h | 57 + cpp/tensorrt_llm/plugins/CMakeLists.txt | 3 +- cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp | 3 + .../plugins/common/gemmPluginProfiler.h | 26 +- cpp/tensorrt_llm/plugins/common/plugin.h | 11 +- cpp/tensorrt_llm/plugins/common/pluginUtils.h | 66 + .../cumsumLastDimPlugin/CMakeLists.txt | 22 + .../cumsumLastDimPlugin.cpp | 276 ++++ .../cumsumLastDimPlugin/cumsumLastDimPlugin.h | 98 ++ .../plugins/gemmPlugin/gemmPlugin.cpp | 77 +- .../plugins/gemmPlugin/gemmPlugin.h | 4 +- .../gptAttentionCommon/gptAttentionCommon.cpp | 112 +- .../gptAttentionCommon/gptAttentionCommon.h | 44 +- .../gptAttentionPlugin/gptAttentionPlugin.cpp | 105 +- .../gptAttentionPlugin/gptAttentionPlugin.h | 19 +- .../plugins/loraPlugin/loraPlugin.cpp | 61 +- .../mixtureOfExpertsPlugin.cpp | 15 +- .../mixtureOfExperts/mixtureOfExpertsPlugin.h | 12 +- cpp/tensorrt_llm/runtime/CMakeLists.txt | 1 + cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp | 14 +- cpp/tensorrt_llm/runtime/gptJsonConfig.cpp | 67 +- cpp/tensorrt_llm/runtime/gptSession.cpp | 71 +- cpp/tensorrt_llm/runtime/ipcUtils.cpp | 98 +- cpp/tensorrt_llm/runtime/layerProfiler.cpp | 97 ++ cpp/tensorrt_llm/runtime/layerProfiler.h | 47 + cpp/tensorrt_llm/runtime/loraManager.cpp | 4 +- cpp/tensorrt_llm/runtime/runtimeBuffers.cpp | 7 +- cpp/tensorrt_llm/runtime/ssmStateBuffers.cpp | 101 +- cpp/tensorrt_llm/runtime/ssmStateBuffers.h | 9 +- cpp/tensorrt_llm/runtime/tllmRuntime.cpp | 26 + cpp/tensorrt_llm/runtime/tllmRuntime.h | 7 + .../runtime/transformerBuffers.cpp | 29 +- cpp/tensorrt_llm/runtime/transformerBuffers.h | 3 +- .../runtime/utils/sessionUtils.cpp | 20 +- cpp/tensorrt_llm/runtime/utils/sessionUtils.h | 3 +- cpp/tensorrt_llm/runtime/workerPool.h | 16 +- cpp/tests/CMakeLists.txt | 11 +- cpp/tests/README.md | 2 +- cpp/tests/kernels/mixtureOfExpertsTest.cu | 91 +- cpp/tests/kernels/shiftKCacheKernelTest.cu | 4 +- cpp/tests/layers/lookaheadPoolManagerTest.cpp | 171 +++ .../scripts/build_chatglm_engines.py | 4 +- .../resources/scripts/build_gpt_engines.py | 20 +- .../resources/scripts/build_gptj_engines.py | 6 +- .../resources/scripts/build_medusa_engines.py | 13 +- .../scripts/build_recurrentgemma_engines.py | 131 ++ .../generate_expected_chatglm_output.py | 2 +- .../scripts/generate_expected_gpt_output.py | 4 +- ...generate_expected_recurrentgemma_output.py | 73 ++ cpp/tests/resources/scripts/test_cpp.py | 35 + cpp/tests/runtime/loraCacheTest.cpp | 7 +- cpp/tests/runtime/loraUtilsTest.cpp | 3 +- cpp/tests/runtime/medusaModuleTest.cpp | 11 +- docker/Dockerfile.multi | 6 +- docker/Makefile | 13 +- docker/common/install_polygraphy.sh | 2 +- docker/common/install_pytorch.sh | 6 +- docker/common/install_tensorrt.sh | 24 +- docker/common/pytorch_pr_116072.patch | 31 + docs/source/architecture/checkpoint.md | 4 +- docs/source/architecture/workflow.md | 16 +- .../installation/build-from-source-windows.md | 14 +- docs/source/installation/linux.md | 11 +- docs/source/installation/windows.md | 47 +- 
docs/source/reference/precision.md | 3 +- docs/source/reference/support-matrix.md | 10 +- docs/source/release-notes.md | 33 + docs/source/speculative_decoding.md | 2 +- examples/arctic/README.md | 89 ++ examples/baichuan/README.md | 8 +- examples/baichuan/requirements.txt | 2 +- examples/bloom/README.md | 31 +- examples/bloom/requirements.txt | 2 +- examples/chatglm/requirements.txt | 2 +- examples/cogvlm/convert_checkpoint.py | 641 +++++++++ examples/cpp/executor/CMakeLists.txt | 8 +- examples/dbrx/requirements.txt | 2 +- examples/enc_dec/README.md | 4 +- examples/enc_dec/convert_checkpoint.py | 715 ++++++---- examples/enc_dec/helper.py | 4 + examples/enc_dec/run.py | 11 +- examples/falcon/README.md | 10 +- examples/falcon/requirements.txt | 2 +- examples/gemma/README.md | 12 +- examples/gemma/convert_checkpoint.py | 4 +- examples/gemma/requirements.txt | 2 +- examples/gpt/convert_checkpoint.py | 104 +- examples/gpt/requirements.txt | 2 +- examples/gptj/README.md | 6 +- examples/gptneox/README.md | 6 +- examples/gptneox/convert_checkpoint.py | 7 +- examples/gptneox/requirements.txt | 2 +- examples/high-level-api/README.md | 9 +- examples/high-level-api/llm_examples.py | 288 ++-- examples/high-level-api/requirements.txt | 2 +- .../run_auto_parallel_examples.sh | 18 - examples/high-level-api/run_examples.py | 158 ++- examples/high-level-api/run_quant_examples.py | 23 - examples/internlm/requirements.txt | 2 +- examples/llama/README.md | 76 +- examples/llama/convert_checkpoint.py | 8 +- examples/llama/requirements.txt | 2 +- examples/mamba/requirements.txt | 2 +- examples/medusa/README.md | 2 + examples/medusa/convert_checkpoint.py | 2 +- examples/medusa/requirements.txt | 2 +- examples/mixtral/README.md | 2 +- examples/mixtral/requirements.txt | 2 +- examples/mpt/README.md | 30 +- examples/mpt/requirements.txt | 2 +- examples/multimodal/README.md | 177 ++- examples/multimodal/build_visual_engine.py | 234 +++- examples/multimodal/run.py | 253 +++- examples/opt/requirements.txt | 2 +- examples/phi/README.md | 32 +- examples/phi/convert_checkpoint.py | 15 +- examples/phi/requirements.txt | 2 +- examples/quantization/README.md | 10 +- examples/quantization/requirements.txt | 2 +- examples/qwen/README.md | 2 +- examples/qwen/requirements.txt | 2 +- examples/qwenvl/requirements.txt | 2 +- examples/recurrentgemma/requirements.txt | 2 +- examples/skywork/requirements.txt | 2 +- examples/smaug/requirements.txt | 2 +- examples/whisper/requirements.txt | 2 +- requirements-windows.txt | 7 +- requirements.txt | 20 +- scripts/build_wheel.py | 8 + setup.py | 4 +- .../plugin_nodes/gpt_attention_node.py | 16 +- tensorrt_llm/builder.py | 57 +- tensorrt_llm/commands/build.py | 18 +- tensorrt_llm/functional.py | 227 +++- tensorrt_llm/hlapi/llm.py | 124 +- tensorrt_llm/layers/__init__.py | 3 +- tensorrt_llm/layers/attention.py | 370 +++++- tensorrt_llm/layers/moe.py | 3 +- tensorrt_llm/models/__init__.py | 12 +- tensorrt_llm/models/cogvlm/__init__.py | 14 + tensorrt_llm/models/cogvlm/convert.py | 250 ++++ tensorrt_llm/models/cogvlm/model.py | 304 +++++ tensorrt_llm/models/enc_dec/model.py | 3 +- tensorrt_llm/models/gemma/model.py | 43 +- tensorrt_llm/models/generation_mixin.py | 149 ++- tensorrt_llm/models/gpt/model.py | 41 +- tensorrt_llm/models/llama/convert.py | 150 ++- tensorrt_llm/models/llama/model.py | 90 +- tensorrt_llm/models/llama/weight.py | 18 +- tensorrt_llm/models/mamba/model.py | 29 +- tensorrt_llm/models/medusa/model.py | 55 +- tensorrt_llm/models/modeling_utils.py | 79 +- 
tensorrt_llm/models/phi3/__init__.py | 14 + tensorrt_llm/models/phi3/convert.py | 88 ++ tensorrt_llm/models/phi3/model.py | 190 +++ tensorrt_llm/models/qwen/convert.py | 2 +- tensorrt_llm/models/recurrentgemma/model.py | 30 +- tensorrt_llm/plugin/plugin.py | 4 + tensorrt_llm/quantization/__init__.py | 2 +- tensorrt_llm/quantization/layers.py | 8 +- ...ize_by_ammo.py => quantize_by_modelopt.py} | 27 +- tensorrt_llm/runtime/generation.py | 21 +- tensorrt_llm/runtime/model_runner.py | 16 +- tensorrt_llm/version.py | 2 +- tests/hlapi/grid_searcher.py | 2 +- tests/hlapi/hlapi_evaluator.py | 9 +- tests/hlapi/run_llm.py | 35 + tests/hlapi/test_llm.py | 60 +- tests/hlapi/test_llm_multi_gpu.py | 13 + tests/hlapi/test_llm_perf_evaluator.py | 2 +- tests/model/test_arctic.py | 416 ++++++ tests/model/test_gpt_e2e.py | 6 +- tests/model/test_phi.py | 3 +- tests/model_api/test_model_quantization.py | 6 +- tests/test_layer.py | 15 +- tests/test_llama_conversion.sh | 2 +- tests/utils/util.py | 10 +- windows/README.md | 5 +- windows/destruct_env.ps1 | 7 +- windows/docker/Dockerfile | 200 +-- windows/docker/README.md | 28 +- windows/setup_build_env.ps1 | 14 +- windows/setup_env.ps1 | 62 +- 270 files changed, 10605 insertions(+), 3111 deletions(-) create mode 100644 benchmarks/python/check_accuracy_mlperf.py create mode 100644 cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h create mode 100644 cpp/tensorrt_llm/kernels/cumsumLastDim.cu create mode 100644 cpp/tensorrt_llm/kernels/cumsumLastDim.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h create mode 100755 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h create mode 100644 cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp create mode 100644 cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h create mode 100644 
cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp create mode 100644 cpp/tensorrt_llm/layers/lookaheadPoolManager.h create mode 100644 cpp/tensorrt_llm/plugins/common/pluginUtils.h create mode 100644 cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/CMakeLists.txt create mode 100644 cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.cpp create mode 100644 cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.h create mode 100644 cpp/tensorrt_llm/runtime/layerProfiler.cpp create mode 100644 cpp/tensorrt_llm/runtime/layerProfiler.h create mode 100644 cpp/tests/layers/lookaheadPoolManagerTest.cpp create mode 100644 cpp/tests/resources/scripts/build_recurrentgemma_engines.py create mode 100644 cpp/tests/resources/scripts/generate_expected_recurrentgemma_output.py create mode 100644 docker/common/pytorch_pr_116072.patch create mode 100644 examples/arctic/README.md create mode 100644 examples/cogvlm/convert_checkpoint.py delete mode 100644 examples/high-level-api/run_auto_parallel_examples.sh delete mode 100644 examples/high-level-api/run_quant_examples.py create mode 100644 tensorrt_llm/models/cogvlm/__init__.py create mode 100644 tensorrt_llm/models/cogvlm/convert.py create mode 100644 tensorrt_llm/models/cogvlm/model.py create mode 100644 tensorrt_llm/models/phi3/__init__.py create mode 100644 tensorrt_llm/models/phi3/convert.py create mode 100644 tensorrt_llm/models/phi3/model.py rename tensorrt_llm/quantization/{quantize_by_ammo.py => quantize_by_modelopt.py} (94%) create mode 100644 tests/hlapi/run_llm.py create mode 100644 tests/model/test_arctic.py diff --git a/.gitattributes b/.gitattributes index c919a0391..b3f53c5ae 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ *.a filter=lfs diff=lfs merge=lfs -text *.lib filter=lfs diff=lfs merge=lfs -text +*.so filter=lfs diff=lfs merge=lfs -text +*.dll filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 3b4d5a535..f907ce84f 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -17,7 +17,7 @@ body: - Libraries - TensorRT-LLM branch or tag (e.g., main, v0.7.1) - TensorRT-LLM commit (if known) - - Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used + - Versions of TensorRT, Modelopt, CUDA, cuBLAS, etc. 
used - Container used (if running TensorRT-LLM in a container) - NVIDIA driver version - OS (Ubuntu 22.04, CentOS 7, Windows 10) diff --git a/.gitignore b/.gitignore index 15e677c07..f7e541759 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,6 @@ __pycache__/ *.nsys-rep .VSCodeCounter build*/ -*.so *.egg-info/ .coverage *.csv @@ -34,6 +33,7 @@ tensorrt_llm/bindings.pyi tensorrt_llm/bindings/*.pyi *docs/cpp_docs* *docs/source/_cpp_gen* +*.swp # Testing .coverage.* diff --git a/README.md b/README.md index 774dc59d8..769631749 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ TensorRT-LLM [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/) [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/) -[![cuda](https://img.shields.io/badge/cuda-12.3-green)](https://developer.nvidia.com/cuda-downloads) -[![trt](https://img.shields.io/badge/TRT-9.3-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.9.0-green)](./setup.py) +[![cuda](https://img.shields.io/badge/cuda-12.4.0-green)](https://developer.nvidia.com/cuda-downloads) +[![trt](https://img.shields.io/badge/TRT-10.0.1-green)](https://developer.nvidia.com/tensorrt) +[![version](https://img.shields.io/badge/release-0.10.0.dev-green)](./setup.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/) diff --git a/benchmarks/cpp/README.md b/benchmarks/cpp/README.md index f6964ffd5..a7cc4520d 100644 --- a/benchmarks/cpp/README.md +++ b/benchmarks/cpp/README.md @@ -170,8 +170,6 @@ Given a `static_emulated_batch_size` of `n` the server will wait for `n` request ``` python prepare_dataset.py \ --output tokens-fixed-lengths.json \ - --request-rate -1 \ - --time-delay-dist constant \ --tokenizer \ token-norm-dist \ --num-requests 128 \ @@ -184,6 +182,7 @@ Take GPT-350M as an example for single GPU with static batching ./benchmarks/gptManagerBenchmark \ --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \ --type IFB \ + --request-rate -1 \ --static_emulated_batch_size 32 \ --static_emulated_timeout 100 \ --dataset ../../benchmarks/cpp/tokens-fixed-lengths.json @@ -212,6 +211,7 @@ PP=1 MAX_LEN=1024 MAX_BATCH=32 MAX_LORA_RANK=32 +NUM_LORA_MODS=7 SOURCE_LORA=chinese-llama-2-lora-13b CPP_LORA=chinese-llama-2-lora-13b-cpp @@ -241,10 +241,9 @@ NUM_LORAS=(8 16 24 32 64 128 256) NUM_REQUESTS=1024 # Convert LoRA to cpp format -python examples/gpt/nemo_lora_convert.py \ +python examples/hf_lora_convert.py \ -i $SOURCE_LORA \ --storage-type $DTYPE \ - --write-cpp-runtime-tensors \ -o $CPP_LORA # Prepare datasets diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index 4df632e12..c3ccd6efc 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -151,6 +151,7 @@ struct BenchmarkParams bool enableExpDelays{false}; std::optional requestRate{std::nullopt}; int randomSeed = 430; + std::optional maxAttentionWindow{std::nullopt}; // lora / peft params std::optional loraDir{std::nullopt}; @@ -746,8 +747,8 @@ class ExecutorServer texec::SchedulerConfig schedulerConfig(batch_scheduler::batchManagerToExecSchedPolicy(schedulerPolicy)); texec::KvCacheConfig 
kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache, - std::nullopt, std::nullopt, benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, - benchmarkParams.kvOnboardBlocks); + benchmarkParams.maxAttentionWindow, std::nullopt, benchmarkParams.freeGpuMemoryFraction, + benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks); texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8, std::nullopt, benchmarkParams.loraHostCacheSize); texec::ExecutorConfig executorConfig( @@ -909,6 +910,16 @@ class GptServer mWorkItemsQueue.clear(); } + std::string getLayerProfileInfo() + { + return mBatchManager->getLayerProfileInfo(); + } + + void setLayerProfiler() + { + return mBatchManager->setLayerProfiler(); + } + void enqueue(std::shared_ptr const& request) { TLLM_CHECK(request != nullptr); @@ -1267,7 +1278,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType std::chrono::milliseconds waitSleep, bool returnContextLogits, bool returnGenerationLogits, std::optional const staticEmulatedBatchSize, std::optional const batchTimeout, bool logIterationData, bool excludeInputInOutput, std::string const& responsesJsonFile, - std::optional const maxPromptLen) + std::optional const maxPromptLen, bool dumpProfile) { TrtGptModelOptionalParams optionalParams; @@ -1279,6 +1290,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType { optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction; } + if (benchmarkParams.maxAttentionWindow) + { + optionalParams.kvCacheConfig.maxAttentionWindow = benchmarkParams.maxAttentionWindow; + } optionalParams.kvCacheConfig.enableBlockReuse = benchmarkParams.enableBlockReuse; optionalParams.enableChunkedContext = benchmarkParams.enableChunkedContext; optionalParams.enableTrtOverlap = benchmarkParams.enableTrtOverlap; @@ -1391,6 +1406,23 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType recorder->report(); recorder->writeOpMetricsToCsv(); recorder->dumpResponseSeqs(); + if (dumpProfile) + { + // Do per-layer profiling after normal benchmarking to avoid introducing perf overhead. 
+ gptServer->resetBatchDeadline(); + gptServer->setLayerProfiler(); + for (std::size_t i = 0; i < numSamples; ++i) + { + auto request = makeRequest(i + 1, samples[i], benchmarkParams.streaming, beamWidthTensor, eosIdTensor, + padIdTensor, bufferManager, returnContextLogitsFlagTensor, returnGenerationLogitsFlagTensor); + gptServer->enqueue(request); + } + gptServer->waitForEmpty(); + if (worldConfig.getRank() == 0) + { + printf("[BENCHMARK] Per layer performance profile\n%s\n", gptServer->getLayerProfileInfo().c_str()); + } + } // Send terminateReqId to terminate servers on all ranks // Server on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases gptServer->enqueue(std::make_shared(terminateReqId)); @@ -1554,6 +1586,7 @@ int main(int argc, char* argv[]) "eos_id", "Specify the end-of-sequence token id.", cxxopts::value()->default_value("-1")); options.add_options()("pad_id", "Specify the padding token id.", cxxopts::value()); options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value()); + options.add_options()("max_attention_window", "Max KV cache length per sequence", cxxopts::value()); options.add_options()( "random_seed", "integer random seed for exponential time delays.", cxxopts::value()->default_value("420")); options.add_options()( @@ -1614,6 +1647,8 @@ int main(int argc, char* argv[]) options.add_options()( "max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value()); + options.add_options()("dump_profile", "Print profile information per layer.", cxxopts::value()); + auto result = options.parse(argc, argv); if (result.count("help")) @@ -1674,6 +1709,12 @@ int main(int argc, char* argv[]) benchmarkParams.maxTokensInPagedKvCache = result["max_tokens_in_paged_kvcache"].as(); } + // Argument: Max KV cache length + if (result.count("max_attention_window")) + { + benchmarkParams.maxAttentionWindow = result["max_attention_window"].as(); + } + if (result.count("random_seed")) { benchmarkParams.randomSeed = result["random_seed"].as(); @@ -1811,6 +1852,9 @@ int main(int argc, char* argv[]) return 1; } + // Argument: dump profile + bool dumpProfile = result["dump_profile"].as(); + initTrtLlmPlugins(logger.get()); if (api == "gptManager") @@ -1821,7 +1865,7 @@ int main(int argc, char* argv[]) maxNumSamples, beamWidth, result["warm_up"].as(), eosId, padId, benchmarkParams, schedulerPolicy, waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, batchTimeout, logIterationData, result["exclude_input_in_output_seq"].as(), - result["responses_json_file"].as(), maxPromptLen); + result["responses_json_file"].as(), maxPromptLen, dumpProfile); } catch (std::exception const& e) { diff --git a/benchmarks/cpp/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp index bae5d2bcd..600fcc0bd 100644 --- a/benchmarks/cpp/gptSessionBenchmark.cpp +++ b/benchmarks/cpp/gptSessionBenchmark.cpp @@ -68,7 +68,7 @@ size_t monitorMemory(std::atomic_bool& done) void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector const& batchSizes, int beamWidth, std::vector> const& inOutLen, std::shared_ptr const& logger, int warmUp, int numRuns, int duration, GptSession::Config& sessionConfig, bool cudaGraphMode, bool printAllLogits, - bool disableForceMaxTokens, bool dumpLayerInfo) + bool disableForceMaxTokens, bool dumpLayerInfo, bool dumpProfile) { std::filesystem::path jsonFileName = dataPath / "config.json"; auto const json = GptJsonConfig::parse(jsonFileName); @@ 
-298,6 +298,46 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector << std::endl; } } + // Do per-layer profiling after normal benchmarking to avoid introducing perf overhead. + if (dumpProfile) + { + session.setLayerProfiler(); + iterIdx = 0; + + while (iterIdx < numRuns) + { + auto const start = std::chrono::steady_clock::now(); + SizeType numSteps = 0; + generationOutput.onTokenGenerated + = [&numSteps, maxNewTokens](GenerationOutput::TensorPtr const& outputIds, SizeType step, + bool finished) { ++numSteps; }; + session.generate(generationOutput, generationInput, samplingConfig, generationProfiler); + bufferManager.getStream().synchronize(); + auto const end = std::chrono::steady_clock::now(); + + iterIdx += 1; + float latency = std::chrono::duration(end - start).count(); + curDuration += latency; + latencies.emplace_back(latency); + generationTimes.emplace_back(generationProfiler->getElapsedTimeMs()); + + bool durationLimitReached{curDuration / 1000 >= duration}; + if (worldConfig.getSize() > 1) + { + bool result{false}; + comm.allreduce(&durationLimitReached, &result, 1, tmpi::MpiType::kBOOL, tmpi::MpiOp::LOR); + durationLimitReached = result; + } + if (durationLimitReached) + { + break; + } + } + if (worldConfig.getRank() == 0) + { + printf("%s\n", session.getLayerProfileInfo().c_str()); + } + } } catch (std::runtime_error& e) { @@ -377,6 +417,7 @@ int main(int argc, char* argv[]) options.add_options()("print_all_logits", "Print all context and generation logits."); options.add_options()("disable_force_max_tokens", "Disable force the engine generating new max_tokens."); options.add_options()("dump_layer_info", "Print layer information of the engine to console."); + options.add_options()("dump_profile", "Print profile information per layer."); auto result = options.parse(argc, argv); @@ -487,6 +528,7 @@ int main(int argc, char* argv[]) auto printAllLogits = result.count("print_all_logits") > 0; auto disableForceMaxTokens = result.count("disable_force_max_tokens") > 0; auto dumpLayerInfo = result.count("dump_layer_info") > 0; + auto dumpProfile = result.count("dump_profile") > 0; initTrtLlmPlugins(logger.get()); @@ -494,7 +536,7 @@ int main(int argc, char* argv[]) { benchmarkGptSession(result["engine_dir"].as(), batchSizes, beamWidth, inOutLen, logger, result["warm_up"].as(), result["num_runs"].as(), result["duration"].as(), sessionConfig, - enableCudaGraph, printAllLogits, disableForceMaxTokens, dumpLayerInfo); + enableCudaGraph, printAllLogits, disableForceMaxTokens, dumpLayerInfo, dumpProfile); } catch (std::exception const& e) { diff --git a/benchmarks/python/README.md b/benchmarks/python/README.md index 1f94f522f..3d346b37c 100644 --- a/benchmarks/python/README.md +++ b/benchmarks/python/README.md @@ -48,3 +48,11 @@ mpirun -n 8 python benchmark.py \ --batch_size "1;8;64" \ --input_output_len "60,20;128,20" ``` + +Note: Building multi-GPU engines in parallel could be a heavy workload for the CPU system. 
Tuning `mpirun --map-by ` option on your system may achieve significant boost in build time, for example: +``` +mpirun --map-by socket -n 8 python build.py \ + --model gpt_175b \ + --mode ootb \ + --quantization fp8 +``` diff --git a/benchmarks/python/allowed_configs.py b/benchmarks/python/allowed_configs.py index 97a80df3d..2523b2200 100644 --- a/benchmarks/python/allowed_configs.py +++ b/benchmarks/python/allowed_configs.py @@ -67,6 +67,8 @@ class BuildConfig: layer_types: List[str] = field(default_factory=list) rnn_hidden_size: int = 0 logits_soft_cap: float = 0.0 + opt_batch_size: Optional[int] = None + opt_num_tokens: Optional[int] = None @dataclass diff --git a/benchmarks/python/benchmark.py b/benchmarks/python/benchmark.py index 00b4ba11d..d9abe7c4f 100644 --- a/benchmarks/python/benchmark.py +++ b/benchmarks/python/benchmark.py @@ -268,6 +268,25 @@ def parse_arguments(): help= "Print layer information of the engine to console (default = disabled)") + parser.add_argument( + '--opt_batch_size', + type=int, + default=None, + help= + "If opt_batch_size option is specified, it will override the opt batch size." + "This flag only takes effect when `--mode=ootb` is added. For other modes, please use --opt_num_tokens to replace it." + ) + + parser.add_argument( + '--opt_num_tokens', + type=int, + default=None, + help="It equals to max_batch_size*max_beam_width by default, set this " + "value as close as possible to the actual number of tokens on your workload. " + "Note that this argument might be removed in the future." + "This flag only takes effect when `--mode` is not `ootb`. For ootb mode, please use --opt_batch_size to replace it." + ) + return parser.parse_args() @@ -334,9 +353,6 @@ def main(args): if args.build_only: return - if args.dump_profile and benchmark_profiler is not None: - benchmark_profiler.set_recording_perf_profile(True) - start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) benchmarker.print_report_header(args.csv, @@ -432,6 +448,39 @@ def main(args): csv=args.csv, benchmark_profiler=benchmark_profiler) + # Rerun for dumping profile per layer. 
+ if args.dump_profile and benchmark_profiler is not None: + benchmark_profiler.set_recording_perf_profile(True) + logger.info(f'Dump profile information per layer') + iter_idx = 0 + try: + # Warm up + for _ in range(args.warm_up): + benchmarker.run(inputs, config) + if benchmark_profiler is not None: + benchmark_profiler.clean() + benchmark_profiler.start() + cur_duration = 0 + start_time = time() + while iter_idx < args.num_runs or cur_duration < args.duration: + start.record() + benchmarker.run(inputs, + config, + benchmark_profiler=benchmark_profiler) + end.record() + torch.cuda.synchronize() + latencies.append(start.elapsed_time(end)) + iter_idx += 1 + cur_duration = round(time() - start_time, 3) + benchmarker.report_profiler( + benchmark_profiler=benchmark_profiler) + except Exception as e: + logger.error("Found exception during benchmarking", + e.with_traceback()) + if not disable_mem_monitor: + memory_monitor.kill() + raise e + if __name__ == '__main__': mp.set_start_method('spawn') diff --git a/benchmarks/python/build.py b/benchmarks/python/build.py index d080b04b4..e81137d0c 100644 --- a/benchmarks/python/build.py +++ b/benchmarks/python/build.py @@ -168,6 +168,24 @@ def parse_arguments(): help= "The number of gpus to be used for inference, only used when --serial_build is specified" ) + parser.add_argument( + '--opt_batch_size', + type=int, + default=None, + help= + "If opt_batch_size option is specified, it will override the opt batch size." + "This flag only takes effect when `--mode=ootb` is added. For other modes, please use --opt_num_tokens to replace it." + ) + + parser.add_argument( + '--opt_num_tokens', + type=int, + default=None, + help="It equals to max_batch_size*max_beam_width by default, set this " + "value as close as possible to the actual number of tokens on your workload. " + "Note that this argument might be removed in the future." + "This flag only takes effect when `--mode` is not `ootb`. For ootb mode, please use --opt_batch_size to replace it." + ) return parser.parse_args() @@ -229,6 +247,21 @@ def build_gpt(args): max_beam_width = build_config['max_beam_width'] \ if args.max_beam_width is None else args.max_beam_width + opt_batch_size = build_config[ + 'opt_batch_size'] if args.opt_batch_size is None else args.opt_batch_size + + opt_num_tokens = build_config[ + 'opt_num_tokens'] if args.opt_num_tokens is None else args.opt_num_tokens + + if args.mode != "ootb" and opt_batch_size is not None: + raise Exception( + f'--opt_batch_size only used when mode is ootb. Please using --opt_num_tokens instead it.' + ) + if args.mode == "ootb" and opt_num_tokens is not None: + raise Exception( + f'--opt_num_tokens does not support ootb mode. Please using --opt_batch_size instead it.' 
+ ) + quant_config = get_quant_config(args.quantization) quant_algo = quant_config.quant_algo kv_cache_quant_algo = quant_config.kv_cache_quant_algo @@ -873,9 +906,11 @@ def build_gpt(args): # Inflight batching if args.mode == 'plugin-ifb': network.plugin_config.enable_paged_kv_cache() + network.plugin_config.enable_paged_state() elif args.mode == 'ootb-except-mha': network.plugin_config.set_gpt_attention_plugin(dtype=args.dtype) network.plugin_config.set_context_fmha(ContextFMHAType.enabled) + network.plugin_config.enable_remove_input_padding() if world_size > 1: network.plugin_config.set_nccl_plugin( @@ -895,7 +930,9 @@ def build_gpt(args): max_input_len=max_input_len, max_seq_len=max_input_len + max_output_len, use_cache=True, - max_beam_width=max_beam_width) + max_beam_width=max_beam_width, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens) tensorrt_llm_model(**inputs) diff --git a/benchmarks/python/check_accuracy_mlperf.py b/benchmarks/python/check_accuracy_mlperf.py new file mode 100644 index 000000000..025b08201 --- /dev/null +++ b/benchmarks/python/check_accuracy_mlperf.py @@ -0,0 +1,163 @@ +import json +from enum import Enum + +import evaluate +import nltk +import numpy as np +import pandas as pd +from transformers import AutoTokenizer, LlamaTokenizerFast + +nltk.download("punkt", quiet=False) +import argparse + + +class Model(Enum): + Llama_v2_70B = 1 + GPT_J = 2 + + +ACCURACY_TARGETS = { + Model.Llama_v2_70B: { + "rouge1": 44.4312 * 0.999, + "rouge2": 22.0352 * 0.999, + "rougeL": 28.6162 * 0.999, + "tokens_per_sample": 294.45 * 0.9 + }, + Model.GPT_J: { + "rouge1": 42.9435135, + "rouge2": 20.1033765, + "rougeL": 29.9581119, + # "tokens_per_sample": ?? + } +} + + +def get_reference_df(processed_dataset_file): + data = pd.read_pickle(processed_dataset_file) + return data["output"].tolist() + + +def get_reference_json(cnn_dailymail_valset): + # Load from CNN dailymail + with open(cnn_dailymail_valset, 'r') as fh: + list_data_dict = json.load(fh) + + targets = [f"{example['output']}" for example in list_data_dict] + + print(f"Loaded {len(targets)} samples from {cnn_dailymail_valset}") + return targets + + +def get_responses_json(response_file): + f = open(response_file) + responses = json.load(f) + ordered_responses = sorted(responses, key=lambda x: int(x['response_id'])) + return ordered_responses + + +def postprocess_text(preds, targets): + # Post-process output texts for ROUGE evaluation + preds = [pred.strip() for pred in preds] + targets = [target.strip() for target in targets] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets] + + return preds, targets + + +def strip_eos(pred_toks, eos_id): + while len(pred_toks) > 0 and pred_toks[-1] == eos_id: + pred_toks.pop() + if len(pred_toks) == 0: + raise RuntimeError("Empty output sequence detected with EOS") + return pred_toks + + +def calculate_toks_per_sample(preds, eos_id): + preds = [strip_eos(pred, eos_id) for pred in preds] + avg_len = sum(len(pred) for pred in preds) + num_samples = len(preds) + return avg_len / num_samples + + +def calculate_rouge_score(preds, targets): + print("Calculating ROUGE scores...") + metric = evaluate.load("rouge") + preds, targets = postprocess_text(preds, targets[0:len(preds)]) + result = metric.compute(predictions=preds, + references=targets, + use_stemmer=True, + use_aggregator=False) + result = {k: round(np.mean(v) * 100, 4) for k, v in 
result.items()} + prediction_lens = [len(pred) for pred in preds] + result["gen_len"] = np.sum(prediction_lens) + result["gen_num"] = len(preds) + + return result + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset", + type=str, + help= + "Path to the reference dataset against which the responses are evaluated for accuracy. MLPerf uses open-orca (pkl) and cnn-dailymail (np) for Llama2-70B and GPT-J respectively." + ) + parser.add_argument( + "--responses", + type=str, + help="Path to the json file holding the responses from our benchmark run" + ) + parser.add_argument("--base_model", + type=str, + help="Location of the model used (to create tokenizer)") + args = parser.parse_args() + + return args + + +def main(): + args = parse_arguments() + + if args.dataset.lower().endswith(".pkl"): + target_texts = get_reference_df(args.dataset) + model = Model.Llama_v2_70B + tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model) + relaxing_factor = 1.0 + elif args.dataset.lower().endswith(".json"): + target_texts = get_reference_json(args.dataset) + model = Model.GPT_J + tokenizer = AutoTokenizer.from_pretrained(args.base_model, + model_max_length=2047, + padding_side="left", + use_fast=False) + tokenizer.pad_token = tokenizer.eos_token + relaxing_factor = 0.93 + else: + raise RuntimeError( + "Dataset expected to be pkl (open-orca) or json (cnn-dailymail)") + + pred_out = get_responses_json(args.responses) + pred_toks = [x['response_tokens'] for x in pred_out] + + tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token) + + pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True) + achieved_scores = calculate_rouge_score(pred_texts, target_texts) + + achieved_scores['tokens_per_sample'] = tps_score + targets = ACCURACY_TARGETS[model] + + print("Achieved rouge scores: ", achieved_scores) + print("Tokens per sample: ", tps_score) + print("Targets: ", targets) + + for k, _ in targets.items(): + assert targets[k] * relaxing_factor <= achieved_scores[k] + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py index 5ff7c03bc..5fba0cc01 100644 --- a/benchmarks/python/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -381,6 +381,7 @@ def report(self, layer_idx, trt.LayerInformationFormat.ONELINE) print(layer_info) + def report_profiler(self, benchmark_profiler=None): if benchmark_profiler is not None and benchmark_profiler.is_recording_perf_profile: perf_profile_data = self.decoder.profiler.results if not perf_profile_data: @@ -418,8 +419,9 @@ def reduce_layer_data(layers): def dump_kernel_profile_table(name: str, profile_data: list, iter_cnt: int): table = pd.DataFrame( - [[k, '{:0.3f}'.format(v)] for k, v in profile_data.items()], - columns=['{} Phase LayerName'.format(name), 'times (ms)']) + [['{:0.3f}'.format(v), k] + for k, v in profile_data.items() if v != 0.0], + columns=['times (ms)', '{} Phase LayerName'.format(name)]) def ljust(s): s = s.astype(str).str.strip() diff --git a/benchmarks/python/mem_monitor.py b/benchmarks/python/mem_monitor.py index f60ce8f0a..2606de04a 100644 --- a/benchmarks/python/mem_monitor.py +++ b/benchmarks/python/mem_monitor.py @@ -14,6 +14,7 @@ # limitations under the License. 
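The `check_accuracy_mlperf.py` script added above gates a benchmark run by comparing the achieved ROUGE and tokens-per-sample scores against per-model targets, scaled by a relaxing factor (1.0 for the open-orca/Llama2-70B path, 0.93 for the cnn-dailymail/GPT-J path). A minimal sketch of that final comparison, using the Llama2-70B targets from the script and hypothetical achieved scores:

```python
# Sketch of the accuracy gate at the end of check_accuracy_mlperf.py.
# Targets come from ACCURACY_TARGETS in the script; the achieved values are hypothetical.
targets = {
    "rouge1": 44.4312 * 0.999,
    "rouge2": 22.0352 * 0.999,
    "rougeL": 28.6162 * 0.999,
    "tokens_per_sample": 294.45 * 0.9,
}
achieved = {"rouge1": 44.5, "rouge2": 22.1, "rougeL": 28.7, "tokens_per_sample": 290.0}
relaxing_factor = 1.0  # open-orca / Llama2-70B path

for metric, target in targets.items():
    assert target * relaxing_factor <= achieved[metric], f"{metric} below target"
print("Accuracy targets met.")
```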
import os from multiprocessing import Event, Process, Queue +from queue import Empty from tensorrt_llm.logger import logger from tensorrt_llm.profiler import (MemUnitType, bytes_to_target_unit, @@ -52,11 +53,16 @@ def kill(self): def stop(self): self.signal_event.set() logger.debug("Sent signal to stop memory monitor subprocess.") - peak_mem_use = self.peak_mem_queue.get(timeout=20) - self._peak_host_memory = max(self._peak_host_memory, peak_mem_use[0]) - self._peak_device_memory = max(self._peak_device_memory, - peak_mem_use[1]) + try: + peak_mem_use = self.peak_mem_queue.get(timeout=20) + except Empty: + logger.warning("peak_mem_queue was empty.") + else: + self._peak_host_memory = max(self._peak_host_memory, + peak_mem_use[0]) + self._peak_device_memory = max(self._peak_device_memory, + peak_mem_use[1]) self.mem_monitor_process.join(timeout=20) self.mem_monitor_process = None diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0c0173079..c4ec07400 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,6 +37,7 @@ option(WARNING_IS_ERROR "Treat all warnings as errors" OFF) option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF) option(FAST_MATH "Compiling in fast math mode" OFF) option(INDEX_RANGE_CHECK "Compiling with index range checks" OFF) +option(USE_SHARED_NVRTC "Use shared NVRTC library instead of static" OFF) if(NVTX_DISABLE) add_compile_definitions("NVTX_DISABLE") @@ -75,6 +76,23 @@ else() message(STATUS "Importing executor") endif() +if(EXISTS + "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt" +) + set(BUILD_NVRTC_WRAPPER_DEFAULT ON) +else() + set(BUILD_NVRTC_WRAPPER_DEFAULT OFF) +endif() + +option(BUILD_NVRTC_WRAPPER "Build nvrtc wrapper from source" + ${BUILD_NVRTC_WRAPPER_DEFAULT}) + +if(BUILD_NVRTC_WRAPPER) + message(STATUS "Building nvrtc wrapper") +else() + message(STATUS "Importing nvrtc wrapper") +endif() + if(BUILD_PYT) message(STATUS "Building PyTorch") else() @@ -172,6 +190,41 @@ message(STATUS " include path: ${CUDAToolkit_INCLUDE_DIRS}") # pick up on the includes set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0) +if(USE_SHARED_NVRTC) + if(WIN32) + message(FATAL_ERROR "Cannot use NVRTC shared library on Windows.") + else() + find_library( + NVRTC_LIB nvrtc + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) + find_library( + NVRTC_BUILTINS_LIB nvrtc-builtins + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) + endif() +else() + if(WIN32) + find_library( + NVRTC_LIB nvrtc + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) + else() + find_library( + NVRTC_LIB nvrtc_static + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) + find_library( + NVRTC_BUILTINS_LIB nvrtc-builtins_static + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) + find_library( + NVPTXCOMPILER_LIB nvptxcompiler_static + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) + endif() +endif() + set(CUBLAS_LIB CUDA::cublas) set(CUBLASLT_LIB CUDA::cublasLt) set(CUDA_DRV_LIB CUDA::cuda_driver) @@ -204,7 +257,15 @@ include_directories( set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) set(TRT_LIB nvinfer) -find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) + +# On Windows major version is appended to nvinfer libs. 
+if(WIN32) + set(TRT_LIB_NAME nvinfer_10) +else() + set(TRT_LIB_NAME nvinfer) +endif() + +find_library_create_target(${TRT_LIB} ${TRT_LIB_NAME} SHARED ${TRT_LIB_DIR}) if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11") add_definitions("-DENABLE_BF16") diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index bf5160e65..51058c5f9 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -78,6 +78,10 @@ class GptManager virtual ~GptManager(); + void setLayerProfiler(); + + [[nodiscard]] std::string getLayerProfileInfo() const; + protected: /* Synchronizes the decoder */ virtual BatchManagerErrorCode_t forwardSync(); @@ -91,6 +95,7 @@ class GptManager [[nodiscard]] SizeType getMaxInputLen() const; [[nodiscard]] SizeType getMaxSequenceLen() const; [[nodiscard]] SizeType getMaxNumSequences() const; + [[nodiscard]] SizeType getMaxDraftLen() const; void validateLlmRequest( LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const; diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 64b7690ed..f7a5f0cac 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -115,12 +115,22 @@ class GenericLlmRequest , mPadId(req.getPadId()) , mOrigPromptLen(mPromptLen) , mMaxSentTokenPos(mPromptLen - 1) + , mEmbeddingBias(std::nullopt) + , mBadWordsList(std::nullopt) + , mStopWordsList(std::nullopt) + , mPromptEmbeddingTable(std::nullopt) + , mPromptVocabSize(std::nullopt) + , mLoraTaskId(std::nullopt) + , mLoraWeights(std::nullopt) + , mLoraConfig(std::nullopt) , mReturnLogProbs(req.getOutputConfig().returnLogProbs) , mContextChunkSize(std::nullopt) , mContextCurrentPosition(0) , mLogProbs(mSamplingConfig.beamWidth) , mCumLogProbs(mSamplingConfig.beamWidth) , mDraftTokens(std::make_shared()) + , mDraftLogits(std::nullopt) + , mNumTokensPerIteration(1) , mReturnContextLogits(req.getOutputConfig().returnContextLogits) , mReturnGenerationLogits(req.getOutputConfig().returnGenerationLogits) , mExcludeInputFromOutput(req.getOutputConfig().excludeInputFromOutput) @@ -183,20 +193,42 @@ class GenericLlmRequest initialize(req.getInputTokenIds()); } - void validate(SizeType maxInputLen, SizeType maxSequenceLen) + void validate(SizeType maxInputLen, SizeType maxSequenceLen, SizeType maxDraftLen) { if (mPromptLen > maxInputLen) { TLLM_THROW("Prompt length (%d) exceeds maximum input length (%d).", mPromptLen, maxInputLen); } - if (mPromptLen + mMaxNewTokens > maxSequenceLen) + // Maximum number of draft tokens per request we pass to the engine for single runtime iteration. + // It depends on the speculative decoding mode. 
+ auto draftLenPerEngineStep = maxDraftLen; + auto const& draftTokens = getDraftTokens(); + if (draftTokens && !draftTokens->empty()) + { + auto const inputDraftTokensLen = static_cast(draftTokens->size()); + if (inputDraftTokensLen > maxDraftLen) + { + TLLM_THROW("Draft tokens length (%d) exceeds maximum draft tokens length (%d).", inputDraftTokensLen, + maxDraftLen); + } + draftLenPerEngineStep = inputDraftTokensLen; + + if (mPromptLen + draftLenPerEngineStep > maxInputLen) + { + TLLM_THROW("Prompt length + number of draft tokens (%d + %d) exceeds maximum input length (%d).", + mPromptLen, draftLenPerEngineStep, maxInputLen); + } + } + + if (mPromptLen + mMaxNewTokens + draftLenPerEngineStep > maxSequenceLen) { - auto const maxNewTokens = maxSequenceLen - mPromptLen; + auto const maxNewTokens = maxSequenceLen - mPromptLen - draftLenPerEngineStep; TLLM_LOG_WARNING( - "Prompt length + number of requested output tokens (%d + %d) exceeds maximum sequence length (%d). " + "Prompt length + number of requested output tokens + draft tokens per step (%d + %d + %d) exceeds " + "maximum sequence length (%d). " "Number of requested output tokens is changed to (%d).", - mPromptLen, mMaxNewTokens, maxSequenceLen, maxNewTokens); + mPromptLen, mMaxNewTokens, draftLenPerEngineStep, maxSequenceLen, maxNewTokens); mMaxNewTokens = maxNewTokens; } @@ -537,9 +569,16 @@ class GenericLlmRequest mReturnGenerationLogits = returnGenerationLogits; } + // Return all generation logits for model w/o draft token [[nodiscard]] bool getReturnGenerationLogits() const { - return mReturnGenerationLogits; + return mReturnGenerationLogits && (getNumDraftTokens() == 0); + } + + // Return accepted tokens logits for target model + [[nodiscard]] bool getReturnTargetModelAcceptedLogits() const + { + return mReturnGenerationLogits && (getNumDraftTokens() > 0); } [[nodiscard]] TensorPtr const& getContextLogitsHost() const @@ -701,7 +740,8 @@ class GenericLlmRequest auto maxNbTokens = getMaxBeamNumTokens(); // FIXME(nkorobov): For streaming we do not allow beam search and // streaming index calculation here applies only for sampling - int nbTokensOut = mIsStreaming ? 1 : maxNbTokens; + // getNumTokensPerIteration takes accepted draft tokens into account + int nbTokensOut = mIsStreaming ? std::max(getNumTokensPerIteration(), 1) : maxNbTokens; if (mExcludeInputFromOutput && !mIsStreaming) { nbTokensOut -= getOrigPromptLen(); @@ -722,6 +762,11 @@ class GenericLlmRequest { auto tokens = getTokens(beam); auto nbTokens = mIsStreaming ? (tokenPos - getMaxSentTokenPos()) : tokens.size(); + + // Take accepted draft tokens into account when streaming + auto const numAcceptedTokens = std::max(0, getNumTokensPerIteration() - 1); + nbTokens += mIsStreaming ? 
numAcceptedTokens : 0; + if (mExcludeInputFromOutput && !mIsStreaming) { nbTokens -= getOrigPromptLen(); @@ -731,6 +776,8 @@ class GenericLlmRequest result.outputTokenIds.at(beam).assign( tokens.data() + tokenPos, tokens.data() + tokenPos + nbTokens); } + // Correct next token position by accepted draft tokens + tokenPos += numAcceptedTokens; } if (returnLogProbs()) diff --git a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h index bcd47dd9a..d3fbd0381 100644 --- a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h @@ -19,7 +19,9 @@ #include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/workerPool.h" #include "tensorrt_llm/runtime/worldConfig.h" -#include + +#include + #include #include #include diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index 1bc7f0b06..fce217247 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -61,7 +61,7 @@ namespace utils std::vector loadEngine(std::string const& enginePath); } -class IpcMemory; +class AllReduceBuffers; class IStatefulGptDecoder; class NcclCommunicator; class RuntimeBuffers; @@ -229,6 +229,12 @@ class GptSession void generate(GenerationOutput& outputs, GenerationInput const& inputs, SamplingConfig const& samplingConfig, std::shared_ptr const generationProfiler = nullptr); + //! @brief Set LayerProfiler to collect performance per layer. + void setLayerProfiler(); + + //! @brief Print profile information per layer. + [[nodiscard]] std::string getLayerProfileInfo() const; + private: [[nodiscard]] bool useCudaGraphs() { @@ -349,9 +355,7 @@ class GptSession std::shared_ptr mCommStream; CudaEvent mCommEvent{}; - // tensor parallelism with custom allreduce plugin - ITensor::SharedPtr mCommPtrs; - std::vector> mIpcMemoryHandles; + std::shared_ptr mAllReduceBuffers; SizeType mDecoderMaxSequenceLength{}; SizeType mDecoderMaxAttentionWindow{}; diff --git a/cpp/include/tensorrt_llm/runtime/ipcUtils.h b/cpp/include/tensorrt_llm/runtime/ipcUtils.h index b626b6977..82ce64a9e 100644 --- a/cpp/include/tensorrt_llm/runtime/ipcUtils.h +++ b/cpp/include/tensorrt_llm/runtime/ipcUtils.h @@ -17,39 +17,56 @@ #pragma once +#include "common.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h" +#include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/worldConfig.h" namespace tensorrt_llm::runtime { -void setPeerAccess(WorldConfig const& worldConfig, bool enable = true); - class IpcMemory { public: - using TensorPtr = ITensor::SharedPtr; + using BufferPtr = IBuffer::SharedPtr; // MAX_ALL_REDUCE_BLOCKS for block_barrier, 1 for multi_gpu_barrier size_t static constexpr FLAGS_SIZE = (kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t); - IpcMemory(WorldConfig const& worldConfig, std::size_t bufferSize); + IpcMemory(std::size_t bufferSize, BufferManager const& manager, WorldConfig const& worldConfig); ~IpcMemory(); - [[nodiscard]] std::vector const& getCommPtrsTensor() const + IpcMemory(IpcMemory const&) = delete; + IpcMemory& operator=(IpcMemory const&) = delete; + + IpcMemory(IpcMemory&&) = default; + IpcMemory& operator=(IpcMemory&&) = default; + + [[nodiscard]] std::vector const& getCommPtrs() const { return mCommPtrs; } private: - void allocateIpcMemory(); + void allocateIpcMemory(std::size_t bufferSize, BufferManager 
const& manager, WorldConfig const& worldConfig); void destroyIpcMemory(); - WorldConfig mWorldConfig; + SizeType mTpRank; std::vector mCommPtrs; - std::size_t mBufferSize; - void* mBufferPtr{nullptr}; + BufferPtr mBuffer; +}; + +class AllReduceBuffers +{ +public: + using TensorPtr = ITensor::SharedPtr; + + AllReduceBuffers(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, SizeType hiddenSize, + BufferManager const& manager, WorldConfig const& worldConfig); + + TensorPtr mAllReduceCommPtrs; + std::vector mIpcMemoryHandles; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/loraCache.h b/cpp/include/tensorrt_llm/runtime/loraCache.h index bfb3c701e..7d6972344 100644 --- a/cpp/include/tensorrt_llm/runtime/loraCache.h +++ b/cpp/include/tensorrt_llm/runtime/loraCache.h @@ -23,7 +23,9 @@ #include "tensorrt_llm/runtime/loraModule.h" #include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" -#include + +#include + #include #include #include diff --git a/cpp/include/tensorrt_llm/runtime/loraCachePageManagerConfig.h b/cpp/include/tensorrt_llm/runtime/loraCachePageManagerConfig.h index 83b19505a..51556c4b3 100644 --- a/cpp/include/tensorrt_llm/runtime/loraCachePageManagerConfig.h +++ b/cpp/include/tensorrt_llm/runtime/loraCachePageManagerConfig.h @@ -19,7 +19,9 @@ #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iBuffer.h" -#include + +#include + #include #include #include diff --git a/cpp/include/tensorrt_llm/runtime/modelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h index dce83c835..763d6985d 100644 --- a/cpp/include/tensorrt_llm/runtime/modelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h @@ -25,21 +25,34 @@ namespace tensorrt_llm::runtime { -struct MambaConfig -{ - SizeType dState = 0; - SizeType dConv = 0; - SizeType expand = 0; -}; - class ModelConfig { public: enum class ModelVariant : std::int32_t { kGpt = 0, - kGlm = 1, // https://github.com/THUDM/GLM and https://github.com/THUDM/ChatGLM-6B - kMamba = 2, // https://github.com/state-spaces/mamba + kGlm = 1, // https://github.com/THUDM/GLM and https://github.com/THUDM/ChatGLM-6B + kMamba = 2, // https://github.com/state-spaces/mamba + kRecurrentGemma = 3, // https://github.com/google-deepmind/recurrentgemma + }; + + struct MambaConfig + { + SizeType dState = 0; + SizeType dConv = 0; + SizeType expand = 0; + }; + + struct RnnConfig + { + SizeType dConv = 0; + SizeType hiddenSize = 0; + }; + + enum class LayerType : std::int32_t + { + kATTENTION, + kRECURRENT, }; explicit ModelConfig(SizeType vocabSize, SizeType nbAttentionLayers, SizeType nbSsmLayers, SizeType nbHeads, @@ -478,7 +491,8 @@ class ModelConfig [[nodiscard]] bool constexpr isTransformerBased() const noexcept { - return mModelVariant == ModelVariant::kGpt || mModelVariant == ModelVariant::kGlm; + return mModelVariant == ModelVariant::kGpt || mModelVariant == ModelVariant::kGlm + || mModelVariant == ModelVariant::kRecurrentGemma; } [[nodiscard]] bool hasMambaConfig() const noexcept @@ -498,7 +512,32 @@ class ModelConfig [[nodiscard]] bool constexpr isSsmBased() const noexcept { - return mModelVariant == ModelVariant::kMamba; + return mModelVariant == ModelVariant::kMamba || mModelVariant == ModelVariant::kRecurrentGemma; + } + + [[nodiscard]] bool hasRnnConfig() const noexcept + { + return mRnnConfig.has_value(); + } + + [[nodiscard]] std::optional getRnnConfig() const noexcept + { + return mRnnConfig; + } + + void setRnnConfig(RnnConfig const& 
rnnConfig) noexcept + { + mRnnConfig = rnnConfig; + } + + [[nodiscard]] std::vector const& getLayerTypes() const noexcept + { + return mLayerTypes; + } + + void setLayerTypes(std::vector const& layerTypes) noexcept + { + mLayerTypes = layerTypes; } private: @@ -548,6 +587,10 @@ class ModelConfig bool mUsePositionEmbedding; bool mUseTokenTypeEmbedding; SizeType mFfnHiddenSize; // indicates encoder output hidden size + + std::optional mRnnConfig; + + std::vector mLayerTypes; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h new file mode 100644 index 000000000..8b3647ce3 --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace tensorrt_llm +{ +namespace runtime +{ + +class SpeculativeDecodingMode +{ + // [WARNING] KEEP BELOW DEFINITION IN SYNC WITH tensorrt_llm/models/modeling_utils.py +public: + static auto constexpr None() + { + return SpeculativeDecodingMode{kNone}; + } + + static auto constexpr DraftModel() + { + return SpeculativeDecodingMode{kDraftModel}; + } + + static auto constexpr Medusa() + { + return SpeculativeDecodingMode{kMedusa}; + } + + static auto constexpr LookaheadDecoding() + { + return SpeculativeDecodingMode{kLookaheadDecoding}; + } + + bool constexpr isNone() const + { + return anyBitSet(kNone); + } + + bool constexpr isDraftModel() const + { + return anyBitSet(kDraftModel); + } + + bool constexpr isMedusa() const + { + return anyBitSet(kMedusa); + } + + bool constexpr isLookaheadDecoding() const + { + return anyBitSet(kLookaheadDecoding); + } + + bool constexpr requiresAttentionMask() const + { + return anyBitSet(kLookaheadDecoding | kMedusa); + } + + bool constexpr predictsDraftTokens() const + { + return anyBitSet(kLookaheadDecoding | kMedusa); + } + + bool constexpr needsKVCacheRewind() const + { + return anyBitSet(kLookaheadDecoding | kMedusa); + } + + bool constexpr hasDraftLogits() const + { + return anyBitSet(kMedusa); + } + + using UnderlyingType = uint8_t; + + bool operator==(SpeculativeDecodingMode const& other) const + { + return mState == other.mState; + } + + constexpr SpeculativeDecodingMode(UnderlyingType state) + : mState(state) + { + } + +private: + // No speculative decoding is used. 
+ static UnderlyingType constexpr kNone{1u << 0}; + static UnderlyingType constexpr kDraftModel{1u << 1}; + static UnderlyingType constexpr kMedusa{1u << 2}; + static UnderlyingType constexpr kLookaheadDecoding{1u << 3}; + + bool constexpr anyBitSet(UnderlyingType bits) const + { + return (mState & bits) != 0; + } + + bool constexpr allBitSet(UnderlyingType bits) const + { + return (mState & bits) == bits; + } + + UnderlyingType mState{kNone}; +}; + +static_assert(SpeculativeDecodingMode::None().isNone()); +static_assert(!SpeculativeDecodingMode::None().isDraftModel()); +static_assert(!SpeculativeDecodingMode::None().isMedusa()); +static_assert(!SpeculativeDecodingMode::None().isLookaheadDecoding()); + +static_assert(SpeculativeDecodingMode::DraftModel().isDraftModel()); +static_assert(!SpeculativeDecodingMode::DraftModel().isNone()); +static_assert(!SpeculativeDecodingMode::DraftModel().isMedusa()); +static_assert(!SpeculativeDecodingMode::DraftModel().isLookaheadDecoding()); + +static_assert(SpeculativeDecodingMode::Medusa().isMedusa()); +static_assert(!SpeculativeDecodingMode::Medusa().isNone()); +static_assert(!SpeculativeDecodingMode::Medusa().isDraftModel()); +static_assert(!SpeculativeDecodingMode::Medusa().isLookaheadDecoding()); + +static_assert(SpeculativeDecodingMode::LookaheadDecoding().isLookaheadDecoding()); +static_assert(!SpeculativeDecodingMode::LookaheadDecoding().isNone()); +static_assert(!SpeculativeDecodingMode::LookaheadDecoding().isDraftModel()); +static_assert(!SpeculativeDecodingMode::LookaheadDecoding().isMedusa()); + +} // namespace runtime +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index 3f3a0fc39..e4e096b32 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -35,11 +35,7 @@ add_subdirectory(layers) add_subdirectory(runtime) add_subdirectory(executor_worker) -set(BATCH_MANAGER_TARGET tensorrt_llm_batch_manager_static) -set(BATCH_MANAGER_TARGET_ARCH "unknown") - -set(EXECUTOR_TARGET tensorrt_llm_executor_static) -set(EXECUTOR_TARGET_ARCH "unknown") +set(TARGET_ARCH "unknown") message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") if(NOT WIN32) # Linux @@ -58,11 +54,9 @@ if(NOT WIN32) # Linux message(STATUS "Operating System: ${OS_ID}, ${OS_VERSION_ID}") if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") - set(BATCH_MANAGER_TARGET_ARCH "x86_64-linux-gnu") - set(EXECUTOR_TARGET_ARCH "x86_64-linux-gnu") + set(TARGET_ARCH "x86_64-linux-gnu") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(BATCH_MANAGER_TARGET_ARCH "aarch64-linux-gnu") - set(EXECUTOR_TARGET_ARCH "aarch64-linux-gnu") + set(TARGET_ARCH "aarch64-linux-gnu") if(NOT ${OS_ID} MATCHES "ubuntu" OR ${OS_VERSION_ID} VERSION_LESS 22.04) message( FATAL_ERROR @@ -76,8 +70,7 @@ if(NOT WIN32) # Linux else() # Windows # AMD64, IA64, ARM64, EM64T, X86 if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") - set(BATCH_MANAGER_TARGET_ARCH "x86_64-windows-msvc") - set(EXECUTOR_TARGET_ARCH "x86_64-windows-msvc") + set(TARGET_ARCH "x86_64-windows-msvc") else() message( FATAL_ERROR @@ -85,6 +78,9 @@ else() # Windows endif() endif() +set(BATCH_MANAGER_TARGET tensorrt_llm_batch_manager_static) +set(BATCH_MANAGER_TARGET_ARCH ${TARGET_ARCH}) + if(BUILD_BATCH_MANAGER) add_subdirectory(batch_manager) else() @@ -115,6 +111,9 @@ else() endif() endif() +set(EXECUTOR_TARGET tensorrt_llm_executor_static) +set(EXECUTOR_TARGET_ARCH ${TARGET_ARCH}) + if(BUILD_EXECUTOR) add_subdirectory(executor) else() @@ -189,6 +188,45 @@ else() 
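For context on the SpeculativeDecodingMode class added above: it is a plain bit-flag wrapper, so every property query (requiresAttentionMask, predictsDraftTokens, needsKVCacheRewind, ...) reduces to a mask test over the mode bits. A minimal sketch of how a caller might use it, illustrative only and not part of this patch; the shouldRewindKVCache helper name is hypothetical:

#include "tensorrt_llm/runtime/speculativeDecodingMode.h"

using tensorrt_llm::runtime::SpeculativeDecodingMode;

// Hypothetical helper: both Medusa and lookahead decoding advertise
// needsKVCacheRewind(), while the draft-model and "none" modes do not.
bool shouldRewindKVCache(SpeculativeDecodingMode mode)
{
    return mode.needsKVCacheRewind();
}

// The factory functions and the property queries are constexpr, so the same
// checks can also be made at compile time.
static_assert(SpeculativeDecodingMode::Medusa().needsKVCacheRewind());
static_assert(!SpeculativeDecodingMode::DraftModel().needsKVCacheRewind());
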
add_custom_target(check_symbol_executor) endif() +set(NVRTC_WRAPPER_TARGET tensorrt_llm_nvrtc_wrapper) +set(NVRTC_WRAPPER_TARGET_ARCH ${TARGET_ARCH}) + +if(BUILD_NVRTC_WRAPPER) + add_subdirectory( + kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper) +else() + add_library(${NVRTC_WRAPPER_TARGET} SHARED IMPORTED) + if(NOT WIN32) # Linux + set(NVRTC_WRAPPER_LIB_SOURCE_REL_LOC + "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/libtensorrt_llm_nvrtc_wrapper.so" + ) + set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC + "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so" + ) + else() + set(NVRTC_WRAPPER_LIB_SOURCE_REL_LOC + "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/libtensorrt_llm_nvrtc_wrapper.dll" + ) + set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC + "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.dll" + ) + endif() + set(NVRTC_WRAPPER_LIB_LOC + "${CMAKE_CURRENT_SOURCE_DIR}/${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC}") + # Copy the .so to build directory, which is needed in build_wheel.py. + configure_file(${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC} + ${NVRTC_WRAPPER_LIB_BINARY_REL_LOC} COPYONLY) + set_property(TARGET ${NVRTC_WRAPPER_TARGET} PROPERTY IMPORTED_LOCATION + ${NVRTC_WRAPPER_LIB_LOC}) + file(SIZE ${NVRTC_WRAPPER_LIB_LOC} NVRTC_WRAPPER_LIB_SIZE) + if(NVRTC_WRAPPER_LIB_SIZE LESS 1024) + message( + FATAL_ERROR + "The nvrtc wrapper library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`." + ) + endif() +endif() + set(TRTLLM_LINK_LIBS ${CUBLAS_LIB} ${CUBLASLT_LIB} @@ -247,6 +285,13 @@ target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE ${SHARED_TARGET}) # Cyclic dependency of executor on TRT-LLM target_link_libraries(${EXECUTOR_TARGET} INTERFACE ${SHARED_TARGET}) +if(NOT WIN32) + set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS + "-Wl,-rpath='$ORIGIN'") +endif() + +target_link_libraries(${SHARED_TARGET} PUBLIC ${NVRTC_WRAPPER_TARGET}) + add_dependencies(${SHARED_TARGET} check_symbol) add_dependencies(${SHARED_TARGET} check_symbol_executor) diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 5777dc43f..c61558bce 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a3c08bd9777149ddf546c2bd02fa78ec0d8a10e7e51fb05f29e63f089caffa9 -size 3215202 +oid sha256:97866290105b98bc63d2d38c7176b8e2d79969c99f9c456b04428fef81bd8780 +size 3309008 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index bffb766b4..96f5fd802 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57b677069d5673dfba53aa2ff89240320f72f21707865f73fe29ce74a36f9a57 
-size 3257948 +oid sha256:891a0a6f2053b011ba2c58101b279ab583442ff3585f01c919e25a26e75e51d1 +size 3353702 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 3cb8ce336..5da3e62c6 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -e33ec506a35e58225744944654645de5 libtensorrt_llm_batch_manager_static.a -e0e0525dc521f70ba9b2f19638d82187 libtensorrt_llm_batch_manager_static.pre_cxx11.a -0ff5eb9f3ac62b2672bef68a7117bdef779926e7 commit \ No newline at end of file +ba4b89ea4ddf64403656d3626559ceae libtensorrt_llm_batch_manager_static.a +decbd28e89ac740f9755b2b2537fa71b libtensorrt_llm_batch_manager_static.pre_cxx11.a +942b83732d029cc3eaef9f5a849218d75161ec12 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 595495170..9758c7556 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:860ce68e8062b45dd15160834a5f223da1f3ae205caca5e8a99ce0037a55c900 -size 3117888 +oid sha256:04326319261c7b196048535872990497461eed46ed4b989a31527c2ef9ef8c92 +size 3205910 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 3c325a6fe..cb49bb620 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eacf70f4b6b0f959c7b5b29a2f17d2d0f40283334e2decc6ea8ac67eb3523b7 -size 3097564 +oid sha256:8dffe215e14b2f67af2e8a77ecb8281db3fe54cc5184e635904f752a7ef84a0c +size 3185774 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index f76e3f7ba..468e77e44 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc4351557104103d44a1bc38b967e34337777e3c45b441c0057d4a16d68dc458 -size 19620324 +oid sha256:5889c4e0dd2109a30c49a554780f43415528a710bf438bf57e0b34ec5c49a695 +size 19782918 diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp index 072b3c443..abe8d0a4a 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp @@ -38,6 +38,28 @@ namespace tensorrt_llm namespace common { +std::shared_ptr CUDADriverWrapper::getInstance() +{ + static std::mutex mutex; + static std::weak_ptr instance; + std::shared_ptr result = instance.lock(); + if (result) + { + return result; + } + else + { + std::lock_guard lock(mutex); + result = instance.lock(); + if (!result) + { + result = std::shared_ptr(new CUDADriverWrapper()); + instance = 
result; + } + return result; + } +} + CUDADriverWrapper::CUDADriverWrapper() { handle = dllOpen(CUDA_LIB_NAME); @@ -63,6 +85,7 @@ CUDADriverWrapper::CUDADriverWrapper() *(void**) (&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel"); *(void**) (&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel"); *(void**) (&_cuTensorMapEncodeTiled) = load_sym(handle, "cuTensorMapEncodeTiled"); + *(void**) (&_cuMemcpyDtoH) = load_sym(handle, "cuMemcpyDtoH_v2"); } CUDADriverWrapper::~CUDADriverWrapper() @@ -153,5 +176,10 @@ CUresult CUDADriverWrapper::cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUten boxDim, elementStrides, interleave, swizzle, l2Promotion, oobFill); } +CUresult CUDADriverWrapper::cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const +{ + return (*_cuMemcpyDtoH)(dstHost, srcDevice, ByteCount); +} + } // namespace common } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h index 7be5023a1..a29c34527 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h @@ -19,10 +19,12 @@ #include #include +#include +#include #define cuErrCheck(stat, wrap) \ { \ - cuErrCheck_((stat), wrap, __FILE__, __LINE__); \ + cuErrCheck_((stat), wrap.get(), __FILE__, __LINE__); \ } namespace tensorrt_llm @@ -32,9 +34,12 @@ namespace common class CUDADriverWrapper { -public: + // Use getInstance() instead. CUDADriverWrapper(); +public: + static std::shared_ptr getInstance(); + ~CUDADriverWrapper(); CUresult cuGetErrorName(CUresult error, char const** pStr) const; @@ -75,6 +80,8 @@ class CUDADriverWrapper cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const; + CUresult cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const; + private: void* handle; CUresult (*_cuGetErrorName)(CUresult, char const**); @@ -98,14 +105,15 @@ class CUDADriverWrapper cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); + CUresult (*_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); }; -inline void cuErrCheck_(CUresult stat, CUDADriverWrapper const& wrap, char const* file, int line) +inline void cuErrCheck_(CUresult stat, CUDADriverWrapper const* wrap, char const* file, int line) { if (stat != CUDA_SUCCESS) { char const* msg = nullptr; - wrap.cuGetErrorName(stat, &msg); + wrap->cuGetErrorName(stat, &msg); fprintf(stderr, "CUDA Error: %s %s %d\n", msg, file, line); } } diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 6d30ac576..9f2d93316 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -23,11 +23,8 @@ namespace tensorrt_llm::utils::customAllReduceUtils constexpr size_t NUM_POINTERS_PER_RANK = 4; -namespace -{ - // WARNING: MUST BE KEPT IN SYNC with tensorrt_llm/plugin/plugin.py -size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept +inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept { if (worldSize <= 2) { @@ -35,6 +32,5 @@ size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept } return 8 * 1000 * 1000; } -} // 
namespace } // namespace tensorrt_llm::utils::customAllReduceUtils diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index e6764f04a..738e2cc7d 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -55,6 +55,25 @@ std::optional envXqaNbCtaPerKVHead() return ret; } +bool getEnvDisableXQAJIT() +{ + static bool init = false; + static bool disableXQAJIT = false; + if (!init) + { + init = true; + char const* disable_xqa_jit_var = std::getenv("TRTLLM_DISABLE_XQA_JIT"); + if (disable_xqa_jit_var) + { + if (disable_xqa_jit_var[0] == '1' && disable_xqa_jit_var[1] == '\0') + { + disableXQAJIT = true; + } + } + } + return disableXQAJIT; +} + // Tune the number of blocks per sequence for accuracy/performance purpose. bool getEnvMmhaMultiblockDebug() { diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 16429c74c..521d2d5d8 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -33,6 +33,9 @@ int32_t xqaMaxNbCtaPerKVHeadFactor(); std::optional envXqaNbCtaPerKVHead(); +// Whether XQA JIT is disabled. +bool getEnvDisableXQAJIT(); + // Tune the number of blocks per sequence for accuracy/performance purpose. bool getEnvMmhaMultiblockDebug(); diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index 35dbdee42..78938311e 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eeda6a94352bd7bff125b1645ccd7d1e049acf4d316057f7a3adc71f38de54b0 -size 1228412 +oid sha256:418820fec34c660cf94828f74159b0856517faf21b877d0a29b6a5e7dc71ece2 +size 1235256 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 1626b586f..3e2c509a1 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d2c63df67c83b0970032e549d477bccc6e07883bb82562df6fbaa3a7f22dbd5 -size 1247068 +oid sha256:65e3acc4d6e33b30775f3fce8c6b171c22b1842eb5a4e04fb2a109b5f56082c7 +size 1253184 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index 8eb382ac9..8708c0b8e 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -91b15059b1b7ea4662db71c7af0abe2b libtensorrt_llm_executor_static.a -fe5af71bf010a17fdf34c253ab187c28 libtensorrt_llm_executor_static.pre_cxx11.a -0ff5eb9f3ac62b2672bef68a7117bdef779926e7 commit \ No newline at end of file +0d429aff4a27797c9a4b3078d59bb3d3 libtensorrt_llm_executor_static.a +e5012c4a7e70b6d2e9d80563c26d2c83 libtensorrt_llm_executor_static.pre_cxx11.a +942b83732d029cc3eaef9f5a849218d75161ec12 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index 3efff892d..13a912cb2 100644 --- 
a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:812ce5bd5effd252b642d31ec261e8de1e93bc71017dff91fdb84f833a66029a -size 1249594 +oid sha256:a46eec8c1209e4499478d656fce44bce280e74fb846b669c6601c3d6ea87a21a +size 1255814 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 4fb0fca0d..365ddfea5 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01dc32257ebafd712a527d62cca4c4880a636e65ede63857b7bea62bf21b975e -size 1204654 +oid sha256:7aa3c2841123c7db28fd0e81197e3fec59709d15d6dee8436c138c597bcec4bd +size 1210336 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index 549a53e51..81f59aca7 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7da73ddfa6393c8f040e92c32206b96c5dab936742fdcdca8e91992c26f80146 -size 11870092 +oid sha256:740bd924898b2cdd1181d9dfe60bdba710166ab0e4e11eef424ebed3c6de8ab6 +size 11912588 diff --git a/cpp/tensorrt_llm/executor_worker/executorWorker.cpp b/cpp/tensorrt_llm/executor_worker/executorWorker.cpp index 5fe85311d..f2dba2746 100644 --- a/cpp/tensorrt_llm/executor_worker/executorWorker.cpp +++ b/cpp/tensorrt_llm/executor_worker/executorWorker.cpp @@ -74,7 +74,7 @@ int main(int argc, char* argv[]) // In orchestrator mode, the spawned threads will wait for termination signal from orchestrator auto executor = tle::Executor(modelPath, modelType, executorConfig); - TLLM_LOG_INFO("Executor worker exiting"); + TLLM_LOG_INFO("Executor instance created by worker"); #endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h index 8500cc9dd..7d78fec36 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h @@ -72,7 +72,8 @@ class TFusedMultiHeadAttentionXMMAKernel TFusedMultiHeadAttentionXMMAKernel( TKernelMeta const* pMetaStart, unsigned int nMetaCount, Data_type type, unsigned int sm) - : mDataType(type) + : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) + , mDataType(type) , mKernelMeta(pMetaStart) , mKernelMetaCount(nMetaCount) , mSM(sm) @@ -99,16 +100,17 @@ class TFusedMultiHeadAttentionXMMAKernel } else { - cuErrCheck(mDriver.cuModuleLoadData(&hmod, kernelMeta.mCubin), mDriver); + cuErrCheck(mDriver->cuModuleLoadData(&hmod, kernelMeta.mCubin), mDriver); mModules.insert(std::make_pair(kernelMeta.mCubin, hmod)); } FusedMultiHeadAttentionKernelInfo funcInfo; funcInfo.mMetaInfoIndex = i; - cuErrCheck(mDriver.cuModuleGetFunction(&funcInfo.mDeviceFunction, hmod, kernelMeta.mFuncName), mDriver); + cuErrCheck( + 
mDriver->cuModuleGetFunction(&funcInfo.mDeviceFunction, hmod, kernelMeta.mFuncName), mDriver); if (kernelMeta.mSharedMemBytes >= 48 * 1024) { - cuErrCheck(mDriver.cuFuncSetAttribute(funcInfo.mDeviceFunction, + cuErrCheck(mDriver->cuFuncSetAttribute(funcInfo.mDeviceFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, kernelMeta.mSharedMemBytes), mDriver); } @@ -133,7 +135,7 @@ class TFusedMultiHeadAttentionXMMAKernel const CUfunction func = findIter->second.mDeviceFunction; void* kernelParams[] = {¶ms, nullptr}; - cuErrCheck(mDriver.cuLaunchKernel(func, params.h, params.b, 1, kernelMeta.mThreadsPerCTA, 1, 1, + cuErrCheck(mDriver->cuLaunchKernel(func, params.h, params.b, 1, kernelMeta.mThreadsPerCTA, 1, 1, kernelMeta.mSharedMemBytes, stream, kernelParams, nullptr), mDriver); } @@ -143,7 +145,7 @@ class TFusedMultiHeadAttentionXMMAKernel virtual ~TFusedMultiHeadAttentionXMMAKernel() = default; protected: - tensorrt_llm::common::CUDADriverWrapper mDriver; + std::shared_ptr mDriver; Data_type mDataType; TKernelMeta const* mKernelMeta; @@ -306,7 +308,7 @@ class FusedMultiHeadAttentionXMMAKernelV2 if (!forceUnroll) { - cuErrCheck(mDriver.cuLaunchKernel(func, params.h, params.b, 1, kernelMeta.mThreadsPerCTA, 1, 1, + cuErrCheck(mDriver->cuLaunchKernel(func, params.h, params.b, 1, kernelMeta.mThreadsPerCTA, 1, 1, kernelMeta.mSharedMemBytes, stream, kernelParams, nullptr), mDriver); } // forceunroll = true for flash attention kernels @@ -357,8 +359,8 @@ class FusedMultiHeadAttentionXMMAKernelV2 } } - cuErrCheck(mDriver.cuLaunchKernel(func, block_size.x, block_size.y, block_size.z, kernelMeta.mThreadsPerCTA, - 1, 1, kernelMeta.mSharedMemBytes, stream, kernelParams, nullptr), + cuErrCheck(mDriver->cuLaunchKernel(func, block_size.x, block_size.y, block_size.z, + kernelMeta.mThreadsPerCTA, 1, 1, kernelMeta.mSharedMemBytes, stream, kernelParams, nullptr), mDriver); } else @@ -374,13 +376,13 @@ class FusedMultiHeadAttentionXMMAKernelV2 // on Hopper non-flash-attention, we still launch blocks (h, b, steps) if (mSM == kSM_90 && !launch_params.flash_attention) { - cuErrCheck(mDriver.cuLaunchKernel(func, params.h, params.b, unroll, kernelMeta.mThreadsPerCTA, 1, 1, + cuErrCheck(mDriver->cuLaunchKernel(func, params.h, params.b, unroll, kernelMeta.mThreadsPerCTA, 1, 1, kernelMeta.mSharedMemBytes, stream, kernelParams, nullptr), mDriver); } // on Ampere/Ada/Volta flash attention, we launch blocks (steps, h, b) else { - cuErrCheck(mDriver.cuLaunchKernel(func, unroll, params.h, params.b, kernelMeta.mThreadsPerCTA, 1, 1, + cuErrCheck(mDriver->cuLaunchKernel(func, unroll, params.h, params.b, kernelMeta.mThreadsPerCTA, 1, 1, kernelMeta.mSharedMemBytes, stream, kernelParams, nullptr), mDriver); } diff --git a/cpp/tensorrt_llm/kernels/cumsumLastDim.cu b/cpp/tensorrt_llm/kernels/cumsumLastDim.cu new file mode 100644 index 000000000..daed22a64 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cumsumLastDim.cu @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include "cumsumLastDim.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +template +size_t invokeComputeCumsumLastDimWorkspaceSize(int input_length) +{ + input_t* iodata = nullptr; + size_t temp_storage_bytes; + cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, iodata, iodata, input_length); + return temp_storage_bytes; +} + +#define INSTANTIATE_COMPUTE_CUMSUM_LastDim_WORKSPACE_SIZE_DATA_TYPE(input_t) \ + template size_t invokeComputeCumsumLastDimWorkspaceSize(int input_length) + +INSTANTIATE_COMPUTE_CUMSUM_LastDim_WORKSPACE_SIZE_DATA_TYPE(int); +INSTANTIATE_COMPUTE_CUMSUM_LastDim_WORKSPACE_SIZE_DATA_TYPE(float); +INSTANTIATE_COMPUTE_CUMSUM_LastDim_WORKSPACE_SIZE_DATA_TYPE(half); +#ifdef ENABLE_BF16 +INSTANTIATE_COMPUTE_CUMSUM_LastDim_WORKSPACE_SIZE_DATA_TYPE(__nv_bfloat16); +#endif +#undef INSTANTIATE_COMPUTE_CUMSUM_LastDim_WORKSPACE_SIZE_DATA_TYPE + +/////////////// + +template +void invokeCumsumLastDim(int batch_size, int input_length, void const* __restrict__ input, void* __restrict__ output, + void* d_temp_storage, size_t temp_storage_bytes, cudaStream_t stream) +{ + for (int i = 0; i < batch_size; i++) + { + input_t const* input_ptr = reinterpret_cast(input) + i * input_length; + input_t* output_ptr = reinterpret_cast(output) + i * input_length; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, input_ptr, output_ptr, input_length, stream); + } +} + +#define INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(input_t) \ + template void invokeCumsumLastDim(int batch_size, int input_length, const void* __restrict__ input, \ + void* __restrict__ output, void* workspace, size_t temp_storage_bytes, cudaStream_t stream) + +INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(int); +INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(float); +INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(half); +#ifdef ENABLE_BF16 +INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(__nv_bfloat16); +#endif +#undef INSTANTIATE_CUMSUM_LastDim_DATA_TYPE + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/cumsumLastDim.h b/cpp/tensorrt_llm/kernels/cumsumLastDim.h new file mode 100644 index 000000000..6955acc2e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cumsumLastDim.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template +size_t invokeComputeCumsumLastDimWorkspaceSize(int input_length); + +template +void invokeCumsumLastDim(int batch_size, int input_length, void const* __restrict__ input, void* __restrict__ output, + void* workspace, size_t temp_storage_bytes, cudaStream_t stream); + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h index baa74c5c3..3917149f3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h @@ -525,7 +525,8 @@ size_t MoeGemmRunner::calcMaxWorkspaceSize(int num_experts) const return max_size; } - assert(false); // Unreachable + TLLM_CHECK_WITH_INFO(false, "Unsupported MoE GEMM configuration"); // Unreachable + return 0; } template diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h index cc4e821a4..d1f4db3db 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h @@ -123,7 +123,11 @@ struct Multihead_attention_params_base float rotary_embedding_base = 0.0f; RotaryScalingType rotary_embedding_scale_type = RotaryScalingType::kNONE; float rotary_embedding_scale = 0.0f; + float rotary_embedding_m_scale = 0.0f; + float const* rotary_embedding_scaling_factors = nullptr; int rotary_embedding_max_positions = 0; + int rotary_cogvlm_vision_start = -1; + int rotary_cogvlm_vision_length = -1; // Position shift for streamingllm bool position_shift_enabled = false; // The current timestep. TODO Check that do we only this param in cross attention? diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt index b2afe8668..a6c7cbe8b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt @@ -18,6 +18,9 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) +# Exclude files in nvrtcWrapper folder. +list(FILTER SRC_CPP EXCLUDE REGEX ".*nvrtcWrapper/src.*") + # skip mmha 48, 80, 96, 112, 144, 160, 192 and 224 for fast build if(FAST_BUILD) list(FILTER SRC_CU EXCLUDE REGEX diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h index ae6eb6b72..93f304b91 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h @@ -14,132 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once + namespace tensorrt_llm { namespace kernels { -// clang-format off -// SingleQueryToken kernels. 
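Referring back to the cumsumLastDim kernels introduced above: they follow the standard two-phase CUB pattern, where cub::DeviceScan::InclusiveSum is first called with a null workspace pointer to query the temporary-storage size and then called again with the allocated workspace to perform the scan. A minimal self-contained sketch of that pattern for a single row, illustrative only (the function name and the use of cudaMallocAsync are assumptions, not part of this patch):

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Hypothetical example of the workspace-query + scan pattern used by
// invokeComputeCumsumLastDimWorkspaceSize / invokeCumsumLastDim above.
void inclusiveSumOneRow(float const* dIn, float* dOut, int length, cudaStream_t stream)
{
    size_t tempStorageBytes = 0;
    // Phase 1: a null workspace pointer only computes the required size.
    cub::DeviceScan::InclusiveSum(nullptr, tempStorageBytes, dIn, dOut, length, stream);

    void* dTempStorage = nullptr;
    cudaMallocAsync(&dTempStorage, tempStorageBytes, stream);

    // Phase 2: run the actual inclusive prefix sum on the row.
    cub::DeviceScan::InclusiveSum(dTempStorage, tempStorageBytes, dIn, dOut, length, stream);

    cudaFreeAsync(dTempStorage, stream);
}
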
-extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; // MultiQueryToken kernels. 
extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin[]; @@ -263,170 +143,6 @@ extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nq extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -// MHA with beamWidth=4 -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; - -// SingleQueryToken kernels. -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; - // MultiQueryToken kernels. 
extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len; extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len; @@ -549,48 +265,6 @@ extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32 extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -// MHA with beamWidth=4 -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; 
-extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; - static const struct XQAKernelMetaInfo { Data_type mDataType; @@ -603,294 +277,557 @@ static const struct XQAKernelMetaInfo bool mPagedKVCache; bool mMultiQueryTokens; unsigned int mSM; - const unsigned long long* mCubin; + unsigned long long const* mCubin; unsigned int mCubinSize; - const char* mFuncName; + char const* mFuncName; } sXqaKernelMetaInfo[] = { -// SingleQueryToken kernels. -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_90, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -// MultiQueryToken kernels. 
-{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 
0, 32, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, 
true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -// MHA with beamWidth=4 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"} -}; + // SingleQueryToken kernels. + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, 
false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, 
false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, 
kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_90, 
nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + // MultiQueryToken kernels. + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, + 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + 
{DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_86, + 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, + 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 1, 0, 16, 64, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, + 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, + 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, 
DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, + xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, + // MHA with beamWidth=4 + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_86, nullptr, 
0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, nullptr, 0, "kernel_mha"}, + {DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, nullptr, 0, "kernel_mha"}}; // clang-format on } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h index b7d390c6b..db9f6d0ea 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h @@ -72,7 +72,8 @@ inline size_t smem_size_in_bytes(Multihead_attention_params 0); transpose_rotary_size = 2 * params.rotary_embedding_dim * sizeof(Tk); @@ -365,7 +366,8 @@ void mmha_launch_kernel(KernelParamsType const& params, KVCacheBuffer const& kv_ { assert((params.rotary_embedding_dim != 0) == 
(params.position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX - || params.position_embedding_type == PositionEmbeddingType::kROPE_GPTJ)); + || params.position_embedding_type == PositionEmbeddingType::kROPE_GPTJ + || params.position_embedding_type == PositionEmbeddingType::kLONG_ROPE)); if (params.beam_width == 1) { mmha_launch_kernel_dispatch( diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index 1f0ac2f42..5b16ea3c0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -1639,15 +1639,16 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske if (HANDLE_KV) { apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, rotary_embedding_base, - rotary_embedding_scale, current_pos_idx); + rotary_embedding_scale, 0, nullptr, current_pos_idx); } else { - apply_rotary_embedding( - q, tidx, params.rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, current_pos_idx); + apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, + 0, nullptr, current_pos_idx); } break; } + case PositionEmbeddingType::kLONG_ROPE: case PositionEmbeddingType::kROPE_GPT_NEOX: { bool const do_rotary = is_valid_qk_vec && QK_VEC_SIZE * tidx < params.rotary_embedding_dim; @@ -1683,14 +1684,18 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske mmha::vec_from_smem_transpose(k, k_smem_, transpose_idx, smem_pitch); mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, - rotary_embedding_base, rotary_embedding_scale, current_pos_idx); + rotary_embedding_base, rotary_embedding_scale, params.rotary_embedding_m_scale, + params.rotary_embedding_scaling_factors, current_pos_idx, params.rotary_cogvlm_vision_start, + params.rotary_cogvlm_vision_length); mmha::write_smem_transpose(k, k_smem_, transpose_idx, smem_pitch); } else { mmha::apply_rotary_embedding(q, transpose_idx / tidx_factor, params.rotary_embedding_dim, - rotary_embedding_base, rotary_embedding_scale, current_pos_idx); + rotary_embedding_base, rotary_embedding_scale, params.rotary_embedding_m_scale, + params.rotary_embedding_scaling_factors, current_pos_idx, params.rotary_cogvlm_vision_start, + params.rotary_cogvlm_vision_length); } mmha::write_smem_transpose(q, q_smem_, transpose_idx, smem_pitch); } diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp index d5756f729..59318463e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h" #include @@ -44,12 +45,10 @@ std::unique_ptr DecoderXQAImpl::create(DecoderXQARunner* runner, switch (implType) { case ImplType::kPrecompiled: return std::unique_ptr(new DecoderXQAImplPrecompiled(runner)); - // TODO(minwei): 
JIT impl. - case ImplType::kJIT: return nullptr; + case ImplType::kJIT: return std::unique_ptr(new DecoderXQAImplJIT(runner)); } // Shouldn't reach here. - assert(false); - return nullptr; + TLLM_THROW("Unknown DecoderXQAImpl::ImplType"); } } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h index 4f7cf268f..783a868a0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h @@ -33,15 +33,15 @@ class DecoderXQARunner; * We need this layer of abstraction for abstracting out implementation details. Two possible implementations: * 1. Precompiled, i.e. kernels are compiled and saved as cubins in advance. * 2. JIT, i.e. kernels are compiled on the fly via NVRTC. - * - * This class is written as Composition over Inheritance, primarily because C++ does not support virtual template - * functions. */ class DecoderXQAImpl { public: + // TODO(minwei): shouldUse()/prepare() should be templated with KVCacheBuffer. // Whether it is beneficial to use this XQA codepath. - virtual bool shouldUse(XQAParams const& xqaParams) = 0; + // + // forConfigurePlugin: whether this method is called in configure plugin phase. + virtual bool shouldUse(XQAParams const& xqaParams, bool forConfigurePlugin) = 0; // Prepares for the kernel running. Must be called before calling run. virtual void prepare(XQAParams const& xqa_params) = 0; // Run XQA kernel with KVCacheBuffer. diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h new file mode 100644 index 000000000..7d08303f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Common utils to be shared between Precompiled and JIT implementation. 
+ */ +#pragma once +#include "decoderXQAConstants.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/common/workspace.h" +#include "tensorrt_llm/kernels/kvCacheUtils.h" +#include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" +#include "xqaParams.h" +#include +#include + +namespace tensorrt_llm +{ +namespace kernels +{ + +struct XQAKernelLoadHashKey +{ + Data_type data_type; + unsigned int sm; + + bool operator==(XQAKernelLoadHashKey const& other) const + { + return data_type == other.data_type && sm == other.sm; + } +}; + +struct XQAKernelLoadHasher +{ + size_t operator()(XQAKernelLoadHashKey const& s) const + { + size_t key = s.data_type; + key <<= 16; + key ^= s.sm; + return key; + } +}; + +struct XQAKernelRuntimeHashKey +{ + Data_type kv_data_type; + unsigned int head_size; + unsigned int beam_size; + unsigned int num_q_heads_per_kv; + unsigned int m_tilesize; + unsigned int tokens_per_page; + bool paged_kv_cache; + bool multi_query_tokens; + + bool operator==(XQAKernelRuntimeHashKey const& other) const + { + return kv_data_type == other.kv_data_type && head_size == other.head_size + && num_q_heads_per_kv == other.num_q_heads_per_kv && beam_size == other.beam_size + && multi_query_tokens == other.multi_query_tokens && m_tilesize == other.m_tilesize + && tokens_per_page == other.tokens_per_page && paged_kv_cache == other.paged_kv_cache; + } +}; + +struct XQAKernelRuntimeHasher +{ + size_t operator()(XQAKernelRuntimeHashKey const& s) const + { + size_t key = s.kv_data_type; + key <<= 16; + key ^= s.head_size; + key <<= 8; + key ^= s.num_q_heads_per_kv; + key <<= 8; + key ^= s.beam_size; + key <<= 6; + key ^= s.m_tilesize; + key <<= 10; + key ^= s.tokens_per_page; + key <<= 1; + key ^= s.paged_kv_cache; + key <<= 1; + key ^= s.multi_query_tokens; + return key; + } +}; + +// XQA kernel can be uniquely identified by (LoadHashKey, RuntimeHashKey). +struct XQAKernelFullHashKey +{ + XQAKernelLoadHashKey load_key; + XQAKernelRuntimeHashKey runtime_key; + + XQAKernelFullHashKey() = default; + + XQAKernelFullHashKey(XQAKernelLoadHashKey const& load_key, XQAKernelRuntimeHashKey const& runtime_key) + : load_key(load_key) + , runtime_key(runtime_key) + { + } + + XQAKernelFullHashKey(void const* buffer, size_t buffer_size) + { + TLLM_CHECK(sizeof(*this) <= buffer_size); + memcpy(this, buffer, sizeof(*this)); + } + + bool operator==(XQAKernelFullHashKey const& other) const + { + return load_key == other.load_key && runtime_key == other.runtime_key; + } + + size_t getSerializationSize() const + { + return sizeof(*this); + } + + void serialize(void* buffer, size_t buffer_size) const + { + TLLM_CHECK(sizeof(*this) <= buffer_size); + memcpy(buffer, this, sizeof(*this)); + } +}; + +struct XQAKernelFullHasher +{ + size_t operator()(XQAKernelFullHashKey const& s) const + { + return XQAKernelLoadHasher()(s.load_key) ^ XQAKernelRuntimeHasher()(s.runtime_key); + } +}; + +// NOTE: we use int32_t sequence lengths as gpt attention plugins use int32_t for that. +// XQA kernels assume all length should use uint32_t. +// NOTE: Linear KV cache and paged KV cache uses the same structure. + +template +struct KVCache +{ +}; + +template <> +struct KVCache +{ + // Start address of the paged kv block pool. + void* poolPtr = nullptr; + // Block indices in the memory pool. + int32_t const* blockIndices = nullptr; + int32_t const* sequence_lengths = nullptr; + // NOTE: max_num_blocks_per_sequence for paged kv cache. 
+ uint32_t capacity = 0; + + KVCache(KVBlockArray& kv_cache_buffer) + { + poolPtr = kv_cache_buffer.mPrimaryPoolPtr; + blockIndices = reinterpret_cast(kv_cache_buffer.data); + } + + KVCache() = default; +}; + +template <> +struct KVCache +{ + // Buffer address. + void* data = nullptr; + int32_t const* sequence_lengths = nullptr; + // NOTE: max_sequence_length for linear kv cache. + uint32_t capacity = 0; + + KVCache(KVLinearBuffer& kv_cache_buffer) + { + data = kv_cache_buffer.data; + } + + KVCache() = default; +}; + +struct BeamSearchParams +{ + int32_t const* indices; // cacheIndir with shape: [batchSize][beamWidth][capacity] + int32_t capacity; + int32_t const* ctxLenList; // shape: [batchSize][beamWidth]. Should be [batchSize] but we have to match trt-llm API. +}; + +// XQA kernels assume all integer values should use uint32_t. +template +struct XQALaunchParam +{ + uint32_t num_k_heads; + void* output; + void const* qkv; + KVCache kvCacheParams; + std::optional beamSearchParams; + uint32_t batch_size; + float const* kv_scale_quant_orig = nullptr; + int* cu_seq_lens = nullptr; + float* rotary_inv_freq_buf = nullptr; + void* scratch = nullptr; +}; + +// Setup launch params. +template +void buildXQALaunchParams( + XQALaunchParam& launchParams, XQAParams const& params, KVCacheBuffer kv_cache_buffer) +{ + TLLM_CHECK_WITH_INFO( + params.data_type == DATA_TYPE_FP16 || params.data_type == DATA_TYPE_BF16, "Only fp16 or bf16 supported now."); + memset(&launchParams, 0, sizeof(XQALaunchParam)); + launchParams.num_k_heads = params.num_kv_heads; + launchParams.output = static_cast(params.output); + launchParams.qkv = static_cast(params.qkv); + launchParams.batch_size = params.batch_size; + launchParams.kv_scale_quant_orig = params.kv_scale_quant_orig; + + // Workspace. + size_t offset = 0; + int8_t* workspace = reinterpret_cast(params.workspaces); + unsigned int batch_beam_size = params.batch_size * params.beam_width; + const size_t cu_seqlens_size = sizeof(int) * (batch_beam_size + 1); + const size_t rotary_inv_freq_size = sizeof(float) * batch_beam_size * params.rotary_embedding_dim / 2; + launchParams.cu_seq_lens = reinterpret_cast(workspace); + workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment(workspace, cu_seqlens_size); + launchParams.rotary_inv_freq_buf = reinterpret_cast(workspace); + auto const multi_block_workspace_alignment = tensorrt_llm::common::roundUp( + sizeof(half) * params.head_size * (params.num_q_heads / params.num_kv_heads) * params.beam_width, 128); + workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment( + workspace, rotary_inv_freq_size, multi_block_workspace_alignment); + launchParams.scratch = reinterpret_cast(workspace); + + launchParams.kvCacheParams = KVCache(kv_cache_buffer); + launchParams.kvCacheParams.sequence_lengths = params.sequence_lengths; + launchParams.kvCacheParams.capacity + = params.paged_kv_cache ? params.max_blocks_per_sequence : params.max_attention_window_size; + // TODO: beam searching has not been implemented yet. 
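+    // When beam_width > 1, BeamSearchParams is populated from the cache indirection buffer, the
+    // attention-window capacity, and the per-beam context lengths; otherwise the optional stays empty.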
+ if (params.beam_width > 1) + { + launchParams.beamSearchParams + = BeamSearchParams{params.cache_indir, params.max_attention_window_size, params.context_lengths}; + } + else + { + launchParams.beamSearchParams = std::nullopt; + } +} + +template +std::optional getGlobalVar(std::shared_ptr const& driver, CUmodule hmod, + char const* const name, bool required = false) +{ + T* pVar = nullptr; + size_t size = 0; + auto const error = driver->cuModuleGetGlobal(reinterpret_cast(&pVar), &size, hmod, name); + T ret; + switch (error) + { + case CUDA_SUCCESS: + TLLM_CHECK(size == sizeof(T)); + tensorrt_llm::common::check_cuda_error(cudaMemcpy(&ret, pVar, size, cudaMemcpyDeviceToHost)); + break; + case CUDA_ERROR_NOT_FOUND: + if (!required) + { + return std::nullopt; + } + [[fallthrough]]; + default: cuErrCheck(("Failed to retrieve global variable from cubin.", error), driver); + } + return std::optional{std::move(ret)}; +} + +inline int computeMultiBlockCount(XQAParams const& xqaParams, int batch_size, int multiprocessor_count) +{ + if (tensorrt_llm::common::envXqaNbCtaPerKVHead().has_value()) + { + return tensorrt_llm::common::envXqaNbCtaPerKVHead().value(); + } + int multi_block_count = 1; + int num_kv_heads = xqaParams.num_kv_heads; + int history_length = xqaParams.timestep; + + multi_block_count = history_length / kMinHistoryTokensPerBlock; + multi_block_count = std::max(multi_block_count, 1); + // adjust to kTargetWaveFactor, as already initialized using kMinHistoryTokensPerBlock, only need to decrease. + double wave_count = (double) batch_size * num_kv_heads * multi_block_count / (double) multiprocessor_count; + double adj_factor = wave_count / (double) kTargetWaveFactor; + if (adj_factor > 1.0) + { + multi_block_count = floor(multi_block_count / adj_factor); + } + multi_block_count = std::max(multi_block_count, 1); + + // add limitation on upper bound. + multi_block_count = std::min(tensorrt_llm::common::xqaMaxNbCtaPerKVHeadFactor(), multi_block_count); + + TLLM_CHECK_WITH_INFO(multi_block_count >= 1, "MultiBlock count should be larger than 1"); + return multi_block_count; +} + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp new file mode 100644 index 000000000..c38db683d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "compileEngine.h" + +#include "cubinObj.h" +#include "nvrtcWrapper/include/nvrtcWrapper.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/stringUtils.h" +#include "tensorrt_llm/common/tllmException.h" +#include +#include + +namespace +{ + +void CHECK_TLLM_XQA_JIT_ERROR_(tllmXqaJitStatus result, char const* const func, char const* const file, int const line) +{ + if (result != TLLM_XQA_JIT_SUCCESS) + { + std::vector log(tllmXqaJitGetLastErrorStringSize()); + tllmXqaJitGetLastErrorString(log.data()); + throw tensorrt_llm::common::TllmException(file, line, + tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] TllmXqaJit runtime error in %s: %s", func, log.data())); + } +} + +#define CHECK_TLLM_XQA_JIT_ERROR(val) CHECK_TLLM_XQA_JIT_ERROR_((val), #val, __FILE__, __LINE__) + +} // anonymous namespace + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace jit +{ + +CubinObj CompileEngine::compile() const +{ + tllmXqaJitProgram program; + tllmXqaJitContext context{/*sm=*/mSM, + /*head_size=*/static_cast(mXqaParams.head_size), + /*num_q_heads=*/static_cast(mXqaParams.num_q_heads), + /*num_kv_heads=*/static_cast(mXqaParams.num_kv_heads), + /*beam_width=*/static_cast(mXqaParams.beam_width), + /*tokens_per_block=*/static_cast(mXqaParams.tokens_per_block), + /*multi_query_tokens=*/mXqaParams.multi_query_tokens, + /*paged_kv_cache=*/mXqaParams.paged_kv_cache, + /*data_type=*/static_cast(mXqaParams.data_type), + /*kv_cache_data_type=*/static_cast(mXqaParams.kv_cache_data_type)}; + + CHECK_TLLM_XQA_JIT_ERROR(tllmXqaJitCreateAndCompileProgram(&program, &context)); + + size_t cubinSize; + CHECK_TLLM_XQA_JIT_ERROR(tllmXqaJitGetCUBINSize(program, &cubinSize)); + std::string cubinContent(cubinSize, ' '); + CHECK_TLLM_XQA_JIT_ERROR(tllmXqaJitGetCUBIN(program, const_cast(cubinContent.c_str()))); + + CHECK_TLLM_XQA_JIT_ERROR(tllmXqaJitDestroyProgram(&program)); + + return CubinObj(cubinContent); +} + +CompileEngine::CompileEngine(int SM, XQAParams const& xqaParams) + : mSM(SM) + , mXqaParams(xqaParams) +{ +} + +} // namespace jit +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h new file mode 100644 index 000000000..8a503e939 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "cubinObj.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" +#include +#include +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace jit +{ + +// A thin wrapper around NVRTC for compiling CUDA programs. 
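+// Illustrative usage (a minimal sketch based on the call sites in this patch):
+//
+//   jit::CompileEngine engine(/*SM=*/sm, xqaParams);
+//   jit::CubinObj cubin = engine.compile();  // NVRTC-compiles kernel_mha and wraps the resulting cubin.
+//
+// In practice the object is short-lived: DecoderXQAImplJIT::prepare() constructs one and hands it to
+// CubinObjRegistry::getCubin(), which only invokes compile() on a cache miss.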
+class CompileEngine +{ +public: + CompileEngine(int SM, XQAParams const& xqaParams); + + CubinObj compile() const; + + ~CompileEngine() = default; + +private: + int mSM; + XQAParams const& mXqaParams; +}; + +} // namespace jit +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp new file mode 100644 index 000000000..4f748dc39 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cubinObj.h" + +#include "serializationUtils.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaDriverWrapper.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace jit +{ + +CubinObj::CubinObj(void const* buffer_, size_t buffer_size) + : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) +{ + uint8_t const* buffer = static_cast(buffer_); + size_t remaining_buffer_size = buffer_size; + uint32_t len = readFromBuffer(buffer, remaining_buffer_size); + mContent.resize(len); + TLLM_CHECK(len <= remaining_buffer_size); + memcpy(mContent.data(), buffer, len); + + initialize(mContent.c_str(), "kernel_mha"); +} + +size_t CubinObj::getSerializationSize() const noexcept +{ + size_t result = sizeof(uint32_t) + mContent.size(); + // Make result multiples of 4. + result = (result + 3) & ~3; + return result; +} + +void CubinObj::serialize(void* buffer_, size_t buffer_size) const noexcept +{ + size_t remaining_buffer_size = buffer_size; + uint8_t* buffer = static_cast(buffer_); + uint32_t len = mContent.size(); + writeToBuffer(len, buffer, remaining_buffer_size); + TLLM_CHECK(len <= remaining_buffer_size); + memcpy(buffer, mContent.c_str(), len); +} + +CubinObj::CubinObj(std::string const& content) + : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) + , mContent(content) + , mModule(nullptr) + , mFunction(nullptr) + , mSharedMemBytes(0) +{ + initialize(mContent.c_str(), "kernel_mha"); +} + +void CubinObj::launch(dim3 gridDim, dim3 blockDim, CUstream hStream, void** kernelParams) +{ + cuErrCheck(mDriver->cuLaunchKernel(mFunction, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + mSharedMemBytes, hStream, kernelParams, /*extra=*/nullptr), + mDriver); +} + +void CubinObj::initialize(char const* content, char const* funcName) +{ + cuErrCheck(mDriver->cuModuleLoadData(&mModule, content), mDriver); + TLLM_CHECK(mModule != nullptr); + cuErrCheck(mDriver->cuModuleGetFunction(&mFunction, mModule, funcName), mDriver); + TLLM_CHECK(mFunction != nullptr); + + // Populate mSharedMemBytes. 
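+    // The generated cubin exports its dynamic shared memory requirement as a device global named "smemSize";
+    // read it back to the host so the launch configuration (and the CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+    // opt-in below) can use it.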
+ CUdeviceptr shmem_dev_ptr = 0; + cuErrCheck(mDriver->cuModuleGetGlobal(&shmem_dev_ptr, nullptr, mModule, "smemSize"), mDriver); + TLLM_CHECK(shmem_dev_ptr != 0); + cuErrCheck(mDriver->cuMemcpyDtoH(&mSharedMemBytes, shmem_dev_ptr, sizeof(unsigned int)), mDriver); + + TLLM_CHECK(mSharedMemBytes > 0); + + /* Set 46KB threshold here because we have to take static/driver shared memory into consideration. */ + if (mSharedMemBytes >= 46 * 1024) + { + cuErrCheck( + mDriver->cuFuncSetAttribute(mFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, mSharedMemBytes), + mDriver); + } + + sync_check_cuda_error(); +} + +} // namespace jit +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h new file mode 100644 index 000000000..2706bf1cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include + +#include "tensorrt_llm/common/cudaDriverWrapper.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace jit +{ + +class CubinObj +{ +public: + // Default constructor constructs an empty unusable CubinObj instance. + CubinObj() = default; + CubinObj(std::string const& content); + CubinObj(void const* buffer, size_t buffer_size); + void launch(dim3 gridDim, dim3 blockDim, CUstream hStream, void** kernelParams); + + size_t getSerializationSize() const noexcept; + void serialize(void* buffer, size_t buffer_size) const noexcept; + +private: + void initialize(char const* content, char const* funcName); + + std::shared_ptr mDriver; + + std::string mContent; + + CUmodule mModule; + CUfunction mFunction; + unsigned int mSharedMemBytes; +}; + +} // namespace jit +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h new file mode 100644 index 000000000..28bdc969a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "cubinObj.h" + +#include "compileEngine.h" +#include "serializationUtils.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" +#include +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace jit +{ + +// A collection of CubinObjs, with caching functionality. +template > +class CubinObjRegistryTemplate +{ +public: + CubinObjRegistryTemplate() = default; + + CubinObjRegistryTemplate(void const* buffer_, size_t buffer_size) + { + size_t remaining_buffer_size = buffer_size; + uint8_t const* buffer = static_cast(buffer_); + // First 4 bytes: num of elements. + uint32_t n = readFromBuffer(buffer, remaining_buffer_size); + + for (uint32_t i = 0; i < n; ++i) + { + uint32_t key_size = readFromBuffer(buffer, remaining_buffer_size); + TLLM_CHECK(key_size <= remaining_buffer_size); + Key key(buffer, key_size); + buffer += key_size; + remaining_buffer_size -= key_size; + + uint32_t obj_size = readFromBuffer(buffer, remaining_buffer_size); + TLLM_CHECK(obj_size <= remaining_buffer_size); + CubinObj obj(buffer, obj_size); + buffer += obj_size; + remaining_buffer_size -= obj_size; + + mMap.insert({key, std::move(obj)}); + } + TLLM_CHECK(remaining_buffer_size == 0); + } + + std::unique_ptr> clone() const noexcept + { + auto result = std::make_unique>(); + for (auto const& p : mMap) + { + result->mMap.insert(p); + } + return result; + } + + size_t getSerializationSize() const noexcept + { + size_t result = sizeof(uint32_t); + for (auto&& p : mMap) + { + result += 2 * sizeof(uint32_t); + result += p.first.getSerializationSize() + p.second.getSerializationSize(); + } + return result; + } + + void serialize(void* buffer_, size_t buffer_size) const noexcept + { + size_t remaining_buffer_size = buffer_size; + uint8_t* buffer = static_cast(buffer_); + uint32_t n = mMap.size(); + writeToBuffer(n, buffer, remaining_buffer_size); + for (auto&& p : mMap) + { + uint32_t key_size = p.first.getSerializationSize(); + TLLM_CHECK(key_size <= remaining_buffer_size); + writeToBuffer(key_size, buffer, remaining_buffer_size); + p.first.serialize(buffer, key_size); + buffer += key_size; + remaining_buffer_size -= key_size; + + uint32_t obj_size = p.second.getSerializationSize(); + TLLM_CHECK(obj_size <= remaining_buffer_size); + writeToBuffer(obj_size, buffer, remaining_buffer_size); + p.second.serialize(buffer, obj_size); + buffer += obj_size; + remaining_buffer_size -= obj_size; + } + TLLM_CHECK(remaining_buffer_size == 0); + } + + // Returns directly if the Cubin already exists in the registry, otherwise call compileEngine to compile it. + // + // compileEngine may be nullptr. 
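+    // Illustrative call pattern (sketch; see DecoderXQAImplJIT for the real call sites):
+    //
+    //   jit::CompileEngine engine(sm, xqaParams);
+    //   registry->getCubin(key, &engine);   // prepare phase: compile on miss and cache the result.
+    //   registry->getCubin(key, nullptr);   // run phase: must hit the cache, otherwise the TLLM_CHECK fires.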
+ CubinObj* getCubin(Key const& key, CompileEngine* compileEngine) + { + auto iter = mMap.find(key); + if (iter != mMap.end()) + { + return &(iter->second); + } + + TLLM_CHECK_WITH_INFO(compileEngine != nullptr, "Key not found; compileEngine shouldn't be nullptr."); + + CubinObj obj = compileEngine->compile(); + auto insertResultIter = mMap.insert({key, std::move(obj)}).first; + return &(insertResultIter->second); + } + + void clear() + { + mMap.clear(); + } + +private: + std::unordered_map mMap; +}; + +using CubinObjKey = XQAKernelFullHashKey; +using CubinObjHasher = XQAKernelFullHasher; +using CubinObjRegistry = CubinObjRegistryTemplate; + +} // namespace jit +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp new file mode 100644 index 000000000..c69ceae43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" + +#include "compileEngine.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h" +#include "tensorrt_llm/kernels/unfusedAttentionKernels.h" + +namespace +{ + +using ::tensorrt_llm::kernels::XQAKernelRuntimeHashKey; +using ::tensorrt_llm::kernels::XQAParams; +using ::tensorrt_llm::kernels::XQAKernelMetaInfo; + +XQAKernelRuntimeHashKey getRuntimeHashKeyFromKernelMeta(XQAKernelMetaInfo const& kernelMeta) +{ + return {kernelMeta.mKVDataType, kernelMeta.mHeadDim, kernelMeta.mBeamWidth, kernelMeta.mNumQHeadsOverKV, + kernelMeta.mMTileSize, kernelMeta.mTokensPerPage, kernelMeta.mPagedKVCache, kernelMeta.mMultiQueryTokens}; +} + +XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParams) +{ + unsigned int head_size = xqaParams.head_size; + int num_q_heads = xqaParams.num_q_heads; + int num_kv_heads = xqaParams.num_kv_heads; + TLLM_CHECK_WITH_INFO(num_q_heads % num_kv_heads == 0, "numQHeads should be multiple of numKVHeads."); + unsigned int num_q_heads_over_kv = num_q_heads / num_kv_heads; + unsigned int beam_width = xqaParams.beam_width; + // MultiQueryToken kernels can support any num_q_heads_over_kv that is power of 2. + unsigned int kernel_num_q_heads_over_kv = xqaParams.multi_query_tokens ? 0 : num_q_heads_over_kv; + // MultiQueryToken kernels can handle either 16/32 for M direction per CTA. + unsigned int m_tilesize = xqaParams.multi_query_tokens ? 
16 : num_q_heads_over_kv; + + return {xqaParams.kv_cache_data_type, head_size, beam_width, kernel_num_q_heads_over_kv, m_tilesize, + xqaParams.paged_kv_cache ? static_cast(xqaParams.tokens_per_block) : 0, xqaParams.paged_kv_cache, + xqaParams.multi_query_tokens}; +} + +} // anonymous namespace + +namespace tensorrt_llm +{ +namespace kernels +{ + +DecoderXQAImplJIT::DecoderXQAImplJIT(DecoderXQARunner* runner) + : DecoderXQAImpl(runner) + , mForceXQA(tensorrt_llm::common::forceXQAKernels()) + , mSM(tensorrt_llm::common::getSMVersion()) + , mCubinObjRegistry(runner->mResource->getCubinObjRegistry()) +{ + initSupportedConfigs(); +} + +void DecoderXQAImplJIT::initSupportedConfigs() +{ + mSupportedConfigs.clear(); + + size_t nbConfigs = sizeof(sXqaKernelMetaInfo) / sizeof(sXqaKernelMetaInfo[0]); + for (size_t i = 0; i < nbConfigs; ++i) + { + XQAKernelMetaInfo const& kernelMeta = sXqaKernelMetaInfo[i]; + if (!kernelMeta.mMultiQueryTokens) + { + // Exclude medusa kernels from JIT because they are compiled from a different CUDA source file. + mSupportedConfigs.insert(getRuntimeHashKeyFromKernelMeta(kernelMeta)); + } + } +} + +bool DecoderXQAImplJIT::supportConfig(XQAParams const& xqaParams) const +{ + return mSupportedConfigs.find(getRuntimeHashKeyFromXQAParams(xqaParams)) != mSupportedConfigs.end(); +} + +bool DecoderXQAImplJIT::mayHavePerfGain(XQAParams const& xqaParams) const +{ + // NOTE: only XQA supports multi_query_tokens (Medusa mode). + if (mForceXQA || xqaParams.multi_query_tokens) + { + return true; + } + int num_kv_heads = xqaParams.num_kv_heads; + int batch_size = static_cast(xqaParams.batch_size); + int multi_block_count = 1; + if (xqaParams.multi_block_mode) + { + int history_length = xqaParams.timestep; + multi_block_count = history_length / kMinHistoryTokensPerBlock; + } + int block_count = num_kv_heads * batch_size * multi_block_count; + return static_cast(block_count) * kEnableMinBlockFactor >= static_cast(mRunner->mMultiProcessorCount); +} + +bool DecoderXQAImplJIT::shouldUse(XQAParams const& xqaParams, bool forConfigurePlugin) +{ + bool is_config_supported = supportConfig(xqaParams); + if (forConfigurePlugin) + { + return is_config_supported; + } + else + { + return is_config_supported && mayHavePerfGain(xqaParams); + } +} + +jit::CubinObjKey DecoderXQAImplJIT::getCubinObjKeyFromXQAParams(XQAParams const& xqaParams) const +{ + XQAKernelLoadHashKey loadKey; + loadKey.data_type = xqaParams.data_type; + loadKey.sm = mSM; + + XQAKernelRuntimeHashKey runtimeKey = getRuntimeHashKeyFromXQAParams(xqaParams); + return {loadKey, runtimeKey}; +} + +void DecoderXQAImplJIT::prepare(XQAParams const& xqaParams) +{ + jit::CubinObjKey key = getCubinObjKeyFromXQAParams(xqaParams); + + jit::CompileEngine compileEngine(mSM, xqaParams); + + // Discard getCubin() result. 
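+    // prepare() only warms the registry: the cubin is compiled (or found cached) here so that the run path
+    // can later fetch it with getCubin(key, /*compileEngine=*/nullptr).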
+ mCubinObjRegistry->getCubin(key, &compileEngine); +} + +void DecoderXQAImplJIT::runWithKVLinearBuffer( + XQAParams const& xqaParams, KVLinearBuffer const& kv_linear_buffer, cudaStream_t const& stream) +{ + runDispatchKVCacheBuffer(xqaParams, kv_linear_buffer, stream); +} + +void DecoderXQAImplJIT::runWithKVBlockArray( + XQAParams const& xqaParams, KVBlockArray const& kv_block_array, cudaStream_t const& stream) +{ + runDispatchKVCacheBuffer(xqaParams, kv_block_array, stream); +} + +#define XQA_KERNEL_RUN(DATA_TYPE) \ + runImpl(xqa_params, kv_cache_buffer, mRunner->mMultiProcessorCount, stream) + +template +void DecoderXQAImplJIT::runDispatchKVCacheBuffer( + XQAParams const& xqa_params, KVCacheBuffer const& kv_cache_buffer, cudaStream_t const& stream) +{ + if (mRunner->mDataType == DATA_TYPE_FP16) + { + XQA_KERNEL_RUN(__half); + } + else + { + XQA_KERNEL_RUN(__nv_bfloat16); + } +} + +#undef XQA_KERNEL_RUN + +template +void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& kv_cache_buffer, + int multiprocessor_count, cudaStream_t const& stream) +{ + unsigned int head_size = xqaParams.head_size; + int num_q_heads = xqaParams.num_q_heads; + int num_kv_heads = xqaParams.num_kv_heads; + TLLM_CHECK_WITH_INFO(num_q_heads % num_kv_heads == 0, "numQHeads should be multiple of numKVHeads."); + unsigned int num_q_heads_over_kv = num_q_heads / num_kv_heads; + unsigned int beam_width = xqaParams.beam_width; + unsigned int batch_beam_size = xqaParams.batch_size * beam_width; + + const KvCacheDataType cache_type = xqaParams.kv_cache_quant_mode.hasInt8KvCache() + ? KvCacheDataType::INT8 + : (xqaParams.kv_cache_quant_mode.hasFp8KvCache() ? KvCacheDataType::FP8 : KvCacheDataType::BASE); + + XQALaunchParam launchParams; + buildXQALaunchParams(launchParams, xqaParams, kv_cache_buffer); + + // Build cu_seqlens, padding_offset, and rotary inv freq tensors + BuildDecoderInfoParams decoder_params; + memset(&decoder_params, 0, sizeof(decoder_params)); + decoder_params.seqQOffsets = launchParams.cu_seq_lens; + decoder_params.seqKVLengths = xqaParams.sequence_lengths; + decoder_params.batchSize = int(batch_beam_size); + decoder_params.maxQSeqLength = xqaParams.generation_input_length; + // Rotary embedding inv_freq buffer. + decoder_params.rotaryEmbeddingScale = xqaParams.rotary_embedding_scale; + decoder_params.rotaryEmbeddingBase = xqaParams.rotary_embedding_base; + decoder_params.rotaryEmbeddingDim = xqaParams.rotary_embedding_dim; + decoder_params.rotaryScalingType = xqaParams.rotary_embedding_scale_type; + decoder_params.rotaryEmbeddingInvFreq = launchParams.rotary_inv_freq_buf; + decoder_params.rotaryEmbeddingMaxPositions = xqaParams.rotary_embedding_max_positions; + + invokeBuildDecoderInfo(decoder_params, stream); + sync_check_cuda_error(); + + // IDEA: Store rotary_processed Q buffer to output buffer. + // NOTE: MHA kernels should read kv cache that has already been appended with new tokens' kv cache. 
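+    // Per the note above, the output buffer doubles as the Q input: the preprocessing kernel below writes the
+    // rotary-processed Q there before the XQA kernel consumes it.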
+ void const* xqa_q_input_ptr = xqaParams.output; + QKVPreprocessingParams preprocessingParms{static_cast(const_cast(xqaParams.qkv)), + nullptr, static_cast(const_cast(xqaParams.output)), kv_cache_buffer, + static_cast(xqaParams.qkv_bias), nullptr, xqaParams.sequence_lengths, nullptr, + launchParams.rotary_inv_freq_buf, (float2 const*) nullptr, xqaParams.kv_scale_orig_quant, + xqaParams.spec_decoding_position_offsets, int(batch_beam_size), xqaParams.generation_input_length, + xqaParams.timestep, xqaParams.cyclic_attention_window_size, xqaParams.sink_token_length, + int(xqaParams.batch_size * beam_width * xqaParams.generation_input_length), xqaParams.num_q_heads, + xqaParams.num_kv_heads, xqaParams.num_q_heads / xqaParams.num_kv_heads, xqaParams.head_size, + xqaParams.rotary_embedding_dim, xqaParams.rotary_embedding_base, xqaParams.rotary_embedding_scale_type, + xqaParams.rotary_embedding_scale, xqaParams.rotary_embedding_max_positions, xqaParams.position_embedding_type, + xqaParams.position_shift_enabled, cache_type, true, false, multiprocessor_count}; + + invokeQKVPreprocessing(preprocessingParms, stream); + sync_check_cuda_error(); + + // Use mTileSize = 16 kernels when qSeqLen <= 16. + unsigned int qSeqLen = static_cast(xqaParams.generation_input_length); + unsigned int mTileSize = qSeqLen <= 16 ? 16 : 32; + // MultiQueryToken kernels can support any num_q_heads_over_kv that is power of 2. + unsigned int kernel_num_q_heads_over_kv = xqaParams.multi_query_tokens ? 0 : num_q_heads_over_kv; + // MultiQueryToken kernels can handle either 16/32 for M direction per CTA. + unsigned int kernel_m_tilesize = xqaParams.multi_query_tokens ? mTileSize : num_q_heads_over_kv; + + jit::CubinObjKey key = getCubinObjKeyFromXQAParams(xqaParams); + jit::CubinObj* cubinObj = mCubinObjRegistry->getCubin(key, /*compileEngine=*/nullptr); + + if (xqaParams.multi_query_tokens) + { + // MultiQueryTokens (generation_input_length > 1) need extra parameters (like qSeqLen, log2HeadGrpSize, and + // mask). Input parameters for MultiQueryTokens kernels. + unsigned int log2HeadGrpSize = log2(num_q_heads_over_kv); + unsigned int nbTokenBlocksPerGrp = divUp(qSeqLen << log2HeadGrpSize, mTileSize); + int const* maskPtr = xqaParams.spec_decoding_packed_mask; + // TODO: add fp8/int8 kv cache kernels. + float kvCacheQuantOrig = 1.0f; + // TODO: merge SingleQueryToken params and MultiQueryTokens params into one kernelParams. 
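+        // This array is passed positionally to cuLaunchKernel via CubinObj::launch, so its order must stay in
+        // sync with the parameter list of the generated kernel_mha entry point.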
+ void* kernelParams[] + = {&qSeqLen, &launchParams.num_k_heads, &log2HeadGrpSize, &launchParams.output, &xqa_q_input_ptr, &maskPtr, + &launchParams.kvCacheParams, &launchParams.batch_size, &kvCacheQuantOrig, &launchParams.scratch}; + int multi_block = 1; + if (xqaParams.multi_block_mode) + { + multi_block = computeMultiBlockCount(xqaParams, xqaParams.batch_size, multiprocessor_count); + cudaMemsetAsync( + xqaParams.workspaces, 0, sizeof(int) * xqaParams.batch_size * xqaParams.num_kv_heads, stream); + } + dim3 gridDim(multi_block, xqaParams.num_kv_heads * nbTokenBlocksPerGrp, xqaParams.batch_size); + dim3 blockDim(128, 1, 2); + cubinObj->launch(gridDim, blockDim, stream, kernelParams); + } + else + { + constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 9; + uint32_t idxNextParam = 0; + void* kernelParams[kMAX_NB_KERNEL_PARAMS]; + auto appendParam = [&](auto* p) mutable + { + TLLM_CHECK(idxNextParam < kMAX_NB_KERNEL_PARAMS); + kernelParams[idxNextParam++] = p; + }; + appendParam(&launchParams.num_k_heads); + appendParam(&launchParams.output); + appendParam(&xqa_q_input_ptr); + appendParam(&launchParams.kvCacheParams); + if (xqaParams.beam_width > 1) + { + appendParam(&launchParams.beamSearchParams.value()); + } + appendParam(&launchParams.batch_size); + appendParam(&launchParams.kv_scale_quant_orig); + appendParam(&launchParams.scratch); + kernelParams[idxNextParam] = nullptr; // one extra nullptr at end as guard. + int multi_block = 1; + if (xqaParams.multi_block_mode) + { + multi_block = computeMultiBlockCount(xqaParams, xqaParams.batch_size, multiprocessor_count); + cudaMemsetAsync( + xqaParams.workspaces, 0, sizeof(int) * xqaParams.batch_size * xqaParams.num_kv_heads, stream); + } + + dim3 gridDim(multi_block, xqaParams.num_kv_heads, xqaParams.batch_size); + dim3 blockDim(128, 1, 2); + cubinObj->launch(gridDim, blockDim, stream, kernelParams); + } + + sync_check_cuda_error(); +} + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h new file mode 100644 index 000000000..6785b0875 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" + +#include "compileEngine.h" +#include "cubinObjRegistry.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" +#include + +namespace tensorrt_llm +{ +namespace kernels +{ + +class DecoderXQAImplJIT : public DecoderXQAImpl +{ +public: + DecoderXQAImplJIT(DecoderXQARunner* runner); + + bool shouldUse(XQAParams const& xqaParams, bool forConfigurePlugin) override; + void prepare(XQAParams const& xqaParams) override; + +protected: + void runWithKVLinearBuffer( + XQAParams const& xqaParams, KVLinearBuffer const& kv_linear_buffer, cudaStream_t const& stream) override; + void runWithKVBlockArray( + XQAParams const& xqaParams, KVBlockArray const& kv_block_array, cudaStream_t const& stream) override; + +private: + void initSupportedConfigs(); + //! Whether DecoderXQAImplJIT supports xqaParams. + bool supportConfig(XQAParams const& xqaParams) const; + //! Whether DecoderXQAImplJIT has perf gain over the default (non-XQA-optimized) implementation. + bool mayHavePerfGain(XQAParams const& xqaParams) const; + + template + void runImpl(XQAParams const& xqaParams, KVCacheBuffer const& kv_cache_buffer, int multiprocessor_count, + cudaStream_t const& stream); + + template + void runDispatchKVCacheBuffer( + XQAParams const& xqa_params, KVCacheBuffer const& kv_cache_buffer, cudaStream_t const& stream); + + bool mForceXQA; + int mSM; + + jit::CubinObjRegistry* mCubinObjRegistry; + jit::CubinObjKey getCubinObjKeyFromXQAParams(XQAParams const& xqaParams) const; + + //! The first prototype just takes whatever available from the Precompiled cubins. + std::unordered_set mSupportedConfigs; +}; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so new file mode 100644 index 000000000..0a80e28fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ab77d5678faa3cf90712f9919d71cd9b4d68f5e334f87c6047593963e861bf +size 80328568 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt new file mode 100644 index 000000000..8161030f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -0,0 +1,2 @@ +24849f03d35877abb0e0f393d32e5000 libtensorrt_llm_nvrtc_wrapper.so +942b83732d029cc3eaef9f5a849218d75161ec12 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h new file mode 100644 index 000000000..10cb9992c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This file is NOT thread safe. + */ +#pragma once +#include + +#ifdef _WIN32 + +#if COMPILING_DLL +#define DLLEXPORT __declspec(dllexport) +#else +#define DLLEXPORT __declspec(dllimport) +#endif + +#else // _WIN32 +#define DLLEXPORT // Nothing. +#endif + +#if __cplusplus +extern "C" +{ +#endif + + typedef struct + { + // Compute capability, e.g. 89. + int sm; + + unsigned int head_size; + unsigned int num_q_heads; + unsigned int num_kv_heads; + unsigned int beam_width; + unsigned int tokens_per_block; + bool multi_query_tokens; + bool paged_kv_cache; + + // Actual type: tensorrt_llm::kernels::Data_type + int data_type; + int kv_cache_data_type; + } tllmXqaJitContext; + + // tllmXqaJitProgram is an opaque handle for a program. + typedef struct _tllmXqaJitProgram* tllmXqaJitProgram; + + typedef enum + { + TLLM_XQA_JIT_SUCCESS = 0, + TLLM_XQA_JIT_INVALID_INPUT = 1, + TLLM_XQA_JIT_INTERNAL_ERROR = 2, + } tllmXqaJitStatus; + + // context must outlive prog. + DLLEXPORT tllmXqaJitStatus tllmXqaJitCreateAndCompileProgram( + tllmXqaJitProgram* prog, tllmXqaJitContext const* context); + DLLEXPORT tllmXqaJitStatus tllmXqaJitGetCUBINSize(tllmXqaJitProgram prog, size_t* cubinSizeRet); + DLLEXPORT tllmXqaJitStatus tllmXqaJitGetCUBIN(tllmXqaJitProgram prog, char* cubin); + DLLEXPORT tllmXqaJitStatus tllmXqaJitDestroyProgram(tllmXqaJitProgram* prog); + + // Returns the size of the error string associated with the last non-success tllmXqaJit function call (including the + // trailing \0). Returns 0 if there is no such non-success function call. + DLLEXPORT size_t tllmXqaJitGetLastErrorStringSize(); + // Returns the error string. + // Output can be nullptr if the returned value of tllmGetLastErrorStringSize() is 0. 
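+    // Typical retrieval pattern (mirrors the error-check helper in compileEngine.cpp):
+    //
+    //   std::vector<char> log(tllmXqaJitGetLastErrorStringSize());
+    //   if (!log.empty())
+    //   {
+    //       tllmXqaJitGetLastErrorString(log.data());
+    //   }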
+ DLLEXPORT void tllmXqaJitGetLastErrorString(char* output); + +#if __cplusplus +} // extern "C" +#endif diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so new file mode 100755 index 000000000..8a62a3880 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf7c5d33ad7d0533569e1be71e6e13f04c7a001cab15ed55eba81c9f8bb6ad3 +size 83431088 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll new file mode 100644 index 000000000..41d519e49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45dcdb034ff53e4f862cc035545973f1b9efae4a8aa3e83555fd77f8b55311db +size 966144 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib new file mode 100644 index 000000000..9af1b79d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca7c531980130dfd37c59132bbce8e90b821ecc31fa20d86726eec153bb016e +size 3488 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h new file mode 100644 index 000000000..f48af0f7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include + +#include "tensorrt_llm/common/assert.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace jit +{ + +template +T readFromBuffer(uint8_t const*& buffer, size_t& remaining_buffer_size) +{ + TLLM_CHECK(sizeof(T) <= remaining_buffer_size); + + T result = *reinterpret_cast(buffer); + buffer += sizeof(T); + remaining_buffer_size -= sizeof(T); + return result; +} + +template +void writeToBuffer(T output, uint8_t*& buffer, size_t& remaining_buffer_size) +{ + TLLM_CHECK(sizeof(T) <= remaining_buffer_size); + + *reinterpret_cast(buffer) = output; + buffer += sizeof(T); + remaining_buffer_size -= sizeof(T); +} + +} // namespace jit +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 9dfb7857e..dbd2410ab 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -20,6 +20,7 @@ #include "tensorrt_llm/common/workspace.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" @@ -36,219 +37,14 @@ namespace tensorrt_llm namespace kernels { -struct XQAKernelLoadHashKey -{ - Data_type data_type; - unsigned int sm; - - bool operator==(const XQAKernelLoadHashKey other) const - { - return data_type == other.data_type && sm == other.sm; - } -}; - -struct XQAKernelLoadHasher -{ - size_t operator()(XQAKernelLoadHashKey const& s) const - { - size_t key = s.data_type; - key <<= 16; - key ^= s.sm; - return key; - } -}; - -struct XQAKernelRuntimeHashKey -{ - Data_type kv_data_type; - unsigned int head_size; - unsigned int beam_size; - unsigned int num_q_heads_per_kv; - unsigned int m_tilesize; - unsigned int tokens_per_page; - bool paged_kv_cache; - bool multi_query_tokens; - - bool operator==(const XQAKernelRuntimeHashKey other) const - { - return kv_data_type == other.kv_data_type && head_size == other.head_size - && num_q_heads_per_kv == other.num_q_heads_per_kv && beam_size == other.beam_size - && multi_query_tokens == other.multi_query_tokens && m_tilesize == other.m_tilesize - && tokens_per_page == other.tokens_per_page && paged_kv_cache == other.paged_kv_cache; - } -}; - -struct XQAKernelRuntimeHasher -{ - size_t operator()(XQAKernelRuntimeHashKey const& s) const - { - size_t key = s.kv_data_type; - key <<= 16; - key ^= s.head_size; - key <<= 8; - key ^= s.num_q_heads_per_kv; - key <<= 8; - key ^= s.beam_size; - key <<= 6; - key ^= s.m_tilesize; - key <<= 10; - key ^= s.tokens_per_page; - key <<= 1; - key ^= s.paged_kv_cache; - key <<= 1; - key ^= s.multi_query_tokens; - return key; - } -}; - -// NOTE: we use int32_t sequence lengths as gpt attention plugins use int32_t for that. -// XQA kernels assume all length should use uint32_t. -// NOTE: Linear KV cache and paged KV cache uses the same structure. - -template -struct KVCache -{ -}; - -template <> -struct KVCache -{ - // Start address of the paged kv block pool. 
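The serializationUtils.h helpers above implement a small cursor-based POD (de)serializer used by the cubin registry: each call advances the buffer pointer and decrements the remaining size, with TLLM_CHECK guarding against overruns. A minimal round-trip sketch, assuming the template parameter lists read `template <typename T>` (they appear truncated in this rendering):

```cpp
#include <cstddef>
#include <cstdint>

#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h"

using namespace tensorrt_llm::kernels::jit;

void roundTripExample()
{
    uint8_t storage[16] = {};

    // Write a uint32_t followed by a float; each call moves the cursor forward
    // and shrinks the remaining size, so an overrun would trip TLLM_CHECK.
    uint8_t* writeCursor = storage;
    size_t writeRemaining = sizeof(storage);
    writeToBuffer<uint32_t>(42u, writeCursor, writeRemaining);
    writeToBuffer<float>(1.5f, writeCursor, writeRemaining);

    // Read back in the same order.
    uint8_t const* readCursor = storage;
    size_t readRemaining = sizeof(storage);
    uint32_t const a = readFromBuffer<uint32_t>(readCursor, readRemaining);
    float const b = readFromBuffer<float>(readCursor, readRemaining);
    (void) a;
    (void) b;
}
```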
- void* poolPtr = nullptr; - // Block indices in the memory pool. - int32_t const* blockIndices = nullptr; - int32_t const* sequence_lengths = nullptr; - // NOTE: max_num_blocks_per_sequence for paged kv cache. - uint32_t capacity = 0; - - KVCache(KVBlockArray& kv_cache_buffer) - { - poolPtr = kv_cache_buffer.mPrimaryPoolPtr; - blockIndices = reinterpret_cast(kv_cache_buffer.data); - } - - KVCache() = default; -}; - -template <> -struct KVCache -{ - // Buffer address. - void* data = nullptr; - int32_t const* sequence_lengths = nullptr; - // NOTE: max_sequence_length for linear kv cache. - uint32_t capacity = 0; - - KVCache(KVLinearBuffer& kv_cache_buffer) - { - data = kv_cache_buffer.data; - } - - KVCache() = default; -}; - -struct BeamSearchParams -{ - int32_t const* indices; // cacheIndir with shape: [batchSize][beamWidth][capacity] - int32_t capacity; - int32_t const* ctxLenList; // shape: [batchSize][beamWidth]. Should be [batchSize] but we have to match trt-llm API. -}; - -// XQA kernels assume all integer values should use uint32_t. -template -struct XQALaunchParam -{ - uint32_t num_k_heads; - void* output; - void const* qkv; - KVCache kvCacheParams; - std::optional beamSearchParams; - uint32_t batch_size; - float const* kv_scale_quant_orig = nullptr; - int* cu_seq_lens = nullptr; - float* rotary_inv_freq_buf = nullptr; - void* scratch = nullptr; -}; - -// Setup launch params. -template -void buildXQALaunchParams( - XQALaunchParam& launchParams, XQAParams const& params, KVCacheBuffer kv_cache_buffer) -{ - TLLM_CHECK_WITH_INFO( - params.data_type == DATA_TYPE_FP16 || params.data_type == DATA_TYPE_BF16, "Only fp16 or bf16 supported now."); - memset(&launchParams, 0, sizeof(XQALaunchParam)); - launchParams.num_k_heads = params.num_kv_heads; - launchParams.output = static_cast(params.output); - launchParams.qkv = static_cast(params.qkv); - launchParams.batch_size = params.batch_size; - launchParams.kv_scale_quant_orig = params.kv_scale_quant_orig; - - // Workspace. - size_t offset = 0; - int8_t* workspace = reinterpret_cast(params.workspaces); - unsigned int batch_beam_size = params.batch_size * params.beam_width; - const size_t cu_seqlens_size = sizeof(int) * (batch_beam_size + 1); - const size_t rotary_inv_freq_size = sizeof(float) * batch_beam_size * params.rotary_embedding_dim / 2; - launchParams.cu_seq_lens = reinterpret_cast(workspace); - workspace = nextWorkspacePtrWithAlignment(workspace, cu_seqlens_size); - launchParams.rotary_inv_freq_buf = reinterpret_cast(workspace); - auto const multi_block_workspace_alignment = roundUp( - sizeof(half) * params.head_size * (params.num_q_heads / params.num_kv_heads) * params.beam_width, 128); - workspace = nextWorkspacePtrWithAlignment(workspace, rotary_inv_freq_size, multi_block_workspace_alignment); - launchParams.scratch = reinterpret_cast(workspace); - - launchParams.kvCacheParams = KVCache(kv_cache_buffer); - launchParams.kvCacheParams.sequence_lengths = params.sequence_lengths; - launchParams.kvCacheParams.capacity - = params.paged_kv_cache ? params.max_blocks_per_sequence : params.max_attention_window_size; - // TODO: beam searching has not been implemented yet. 
- if (params.beam_width > 1) - { - launchParams.beamSearchParams - = BeamSearchParams{params.cache_indir, params.max_attention_window_size, params.context_lengths}; - } - else - { - launchParams.beamSearchParams = std::nullopt; - } -} - -namespace -{ -template -std::optional getGlobalVar( - tensorrt_llm::common::CUDADriverWrapper const& driver, CUmodule hmod, char const* const name, bool required = false) -{ - T* pVar = nullptr; - size_t size = 0; - auto const error = driver.cuModuleGetGlobal(reinterpret_cast(&pVar), &size, hmod, name); - T ret; - switch (error) - { - case CUDA_SUCCESS: - TLLM_CHECK(size == sizeof(T)); - check_cuda_error(cudaMemcpy(&ret, pVar, size, cudaMemcpyDeviceToHost)); - break; - case CUDA_ERROR_NOT_FOUND: - if (!required) - { - return std::nullopt; - } - [[fallthrough]]; - default: cuErrCheck(("Failed to retrieve global variable from cubin.", error), driver); - } - return std::optional{std::move(ret)}; -} -} // namespace - class XQAKernelList { public: using TKernelMeta = XQAKernelMetaInfo; XQAKernelList(Data_type type, unsigned int sm) - : mDataType(type) + : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) + , mDataType(type) , mKernelMetaCount(sizeof(sXqaKernelMetaInfo) / sizeof(sXqaKernelMetaInfo[0])) , mKernelMeta(&sXqaKernelMetaInfo[0]) , mSM(sm) @@ -268,6 +64,10 @@ class XQAKernelList if (kernelMeta.mSM != mSM || kernelMeta.mDataType != mDataType) continue; + // Cubins for kernels that would take the JIT path are removed from kernelMeta. + if (kernelMeta.mCubin == nullptr) + continue; + CUmodule hmod{0}; auto findModuleIter = mModules.find(kernelMeta.mCubin); if (findModuleIter != mModules.end()) @@ -276,13 +76,13 @@ class XQAKernelList } else { - cuErrCheck(mDriver.cuModuleLoadData(&hmod, kernelMeta.mCubin), mDriver); + cuErrCheck(mDriver->cuModuleLoadData(&hmod, kernelMeta.mCubin), mDriver); mModules.insert(std::make_pair(kernelMeta.mCubin, hmod)); } XQAKernelFuncInfo funcInfo{}; funcInfo.mMetaInfoIndex = i; - cuErrCheck(mDriver.cuModuleGetFunction(&funcInfo.mDeviceFunction, hmod, kernelMeta.mFuncName), mDriver); + cuErrCheck(mDriver->cuModuleGetFunction(&funcInfo.mDeviceFunction, hmod, kernelMeta.mFuncName), mDriver); funcInfo.mSharedMemBytes = getGlobalVar(mDriver, hmod, "smemSize", true).value(); funcInfo.mKernelType = getGlobalVar(mDriver, hmod, "kernelType", false) .value_or(XQAKernelType::kAMPERE_WARP_SPECIALIZED); @@ -290,7 +90,7 @@ class XQAKernelList /* Set 46KB threshold here because we have to take static/driver shared memory into consideration. 
*/ if (funcInfo.mSharedMemBytes >= 46 * 1024) { - cuErrCheck(mDriver.cuFuncSetAttribute(funcInfo.mDeviceFunction, + cuErrCheck(mDriver->cuFuncSetAttribute(funcInfo.mDeviceFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, funcInfo.mSharedMemBytes), mDriver); } @@ -386,7 +186,7 @@ class XQAKernelList nullptr, static_cast(const_cast(xqaParams.output)), kv_cache_buffer, static_cast(xqaParams.qkv_bias), nullptr, xqaParams.sequence_lengths, nullptr, launchParams.rotary_inv_freq_buf, (float2 const*) nullptr, xqaParams.kv_scale_orig_quant, - xqaParams.medusa_position_offsets, int(batch_beam_size), xqaParams.generation_input_length, + xqaParams.spec_decoding_position_offsets, int(batch_beam_size), xqaParams.generation_input_length, xqaParams.timestep, xqaParams.cyclic_attention_window_size, xqaParams.sink_token_length, int(xqaParams.batch_size * beam_width * xqaParams.generation_input_length), xqaParams.num_q_heads, xqaParams.num_kv_heads, xqaParams.num_q_heads / xqaParams.num_kv_heads, xqaParams.head_size, @@ -424,7 +224,7 @@ class XQAKernelList // mask). Input parameters for MultiQueryTokens kernels. unsigned int log2HeadGrpSize = log2(num_q_heads_over_kv); unsigned int nbTokenBlocksPerGrp = divUp(qSeqLen << log2HeadGrpSize, mTileSize); - int const* maskPtr = xqaParams.medusa_packed_mask; + int const* maskPtr = xqaParams.spec_decoding_packed_mask; // TODO: add fp8/int8 kv cache kernels. float kvCacheQuantOrig = 1.0f; // TODO: merge SingleQueryToken params and MultiQueryTokens params into one kernelParams. @@ -439,7 +239,7 @@ class XQAKernelList launchParams.scratch, 0, sizeof(int) * xqaParams.batch_size * xqaParams.num_kv_heads, stream)); sync_check_cuda_error(); } - cuErrCheck(mDriver.cuLaunchKernel(func, multi_block, xqaParams.num_kv_heads * nbTokenBlocksPerGrp, + cuErrCheck(mDriver->cuLaunchKernel(func, multi_block, xqaParams.num_kv_heads * nbTokenBlocksPerGrp, xqaParams.batch_size, 128, 1, 2, shared_mem_bytes, stream, kernelParams, nullptr), mDriver); } @@ -484,7 +284,7 @@ class XQAKernelList launchParams.scratch, 0, sizeof(int) * xqaParams.batch_size * xqaParams.num_kv_heads, stream)); sync_check_cuda_error(); } - cuErrCheck(mDriver.cuLaunchKernel(func, multi_block, xqaParams.num_kv_heads, xqaParams.batch_size, 128, 1, + cuErrCheck(mDriver->cuLaunchKernel(func, multi_block, xqaParams.num_kv_heads, xqaParams.batch_size, 128, 1, isGmmaKernel ? 3 : 2, shared_mem_bytes, stream, kernelParams, nullptr), mDriver); } @@ -492,34 +292,6 @@ class XQAKernelList sync_check_cuda_error(); } - static int computeMultiBlockCount(XQAParams const& xqaParams, int batch_size, int multiprocessor_count) - { - if (envXqaNbCtaPerKVHead().has_value()) - { - return envXqaNbCtaPerKVHead().value(); - } - int multi_block_count = 1; - int num_kv_heads = xqaParams.num_kv_heads; - int history_length = xqaParams.timestep; - - multi_block_count = history_length / kMinHistoryTokensPerBlock; - multi_block_count = std::max(multi_block_count, 1); - // adjust to kTargetWaveFactor, as already initialized using kMinHistoryTokensPerBlock, only need to decrease. - double wave_count = (double) batch_size * num_kv_heads * multi_block_count / (double) multiprocessor_count; - double adj_factor = wave_count / (double) kTargetWaveFactor; - if (adj_factor > 1.0) - { - multi_block_count = floor(multi_block_count / adj_factor); - } - multi_block_count = std::max(multi_block_count, 1); - - // add limitation on upper bound. 
- multi_block_count = std::min(xqaMaxNbCtaPerKVHeadFactor(), multi_block_count); - - TLLM_CHECK_WITH_INFO(multi_block_count >= 1, "MultiBlock count should be larger than 1"); - return multi_block_count; - } - private: static uint32_t getElemBytes(CUtensorMapDataType_enum dataType) { @@ -565,7 +337,7 @@ class XQAKernelList } }(); - cuErrCheck(mDriver.cuTensorMapEncodeTiled(&tensorMap, dataType, 4, const_cast(addr), globalDims, + cuErrCheck(mDriver->cuTensorMapEncodeTiled(&tensorMap, dataType, 4, const_cast(addr), globalDims, globalStrides, boxDims, elemStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, swizzle, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE), mDriver); @@ -594,7 +366,7 @@ class XQAKernelList } }(); - cuErrCheck(mDriver.cuTensorMapEncodeTiled(&tensorMap, dataType, 4, const_cast(addr), globalDims, + cuErrCheck(mDriver->cuTensorMapEncodeTiled(&tensorMap, dataType, 4, const_cast(addr), globalDims, globalStrides, boxDims, elemStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, swizzle, CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE), mDriver); @@ -619,7 +391,7 @@ class XQAKernelList } protected: - tensorrt_llm::common::CUDADriverWrapper mDriver; + std::shared_ptr mDriver; Data_type mDataType; TKernelMeta const* mKernelMeta; @@ -706,7 +478,7 @@ void DecoderXQAImplPrecompiled::runDispatchBuffer( #undef XQA_KERNEL_RUN -bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams) +bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool /*forConfigurePlugin*/) { XQAKernelList const* xqa_kernel = getXQAKernels(mRunner->mDataType, tensorrt_llm::common::getSMVersion()); return xqa_kernel->supportConfig(xqaParams) diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h index 6731de292..55c923b3b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h @@ -29,7 +29,7 @@ class DecoderXQAImplPrecompiled : public DecoderXQAImpl { } - bool shouldUse(XQAParams const& xqaParams) override; + bool shouldUse(XQAParams const& xqaParams, bool forConfigurePlugin) override; void prepare(XQAParams const& xqa_params) override; protected: diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp index 0f2c6e046..16c6fda5c 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp @@ -22,7 +22,6 @@ #include #include -#include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h" @@ -36,9 +35,9 @@ namespace tensorrt_llm namespace kernels { -DecoderXQARunner::DecoderXQARunner( - const XQADataType data_type, int num_heads, int num_kv_heads, int head_size, bool multi_block_mode) - : mPrepareCalled(false) +DecoderXQARunner::DecoderXQARunner(Resource* resource, const XQADataType data_type, int num_heads, int num_kv_heads, + int head_size, bool multi_block_mode) + : mResource(resource) , mDataType(data_type) , mNumHeads(num_heads) , mNumKVHeads(num_kv_heads) @@ -46,9 +45,12 @@ DecoderXQARunner::DecoderXQARunner( , 
mMultiBlockMode(multi_block_mode) { mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount(); - // The initialization of mImpl must be the last line because *this needs to be fully initialized before calling - // DecoderXQAImpl::create(). - mImpl = DecoderXQAImpl::create(this, DecoderXQAImpl::ImplType::kPrecompiled); + + // TODO(minwei): needs both impls because medusa kernels haven't been migrated to JIT yet (which should be). + // mJITImpl/mPrecompiledImpl assignments must be the last lines of this constructor. DecoderXQAImpl::create() relies + // on *this being fully initialized. + mJITImpl = DecoderXQAImpl::create(this, DecoderXQAImpl::ImplType::kJIT); + mPrecompiledImpl = DecoderXQAImpl::create(this, DecoderXQAImpl::ImplType::kPrecompiled); } DecoderXQARunner::~DecoderXQARunner() = default; @@ -96,21 +98,40 @@ size_t DecoderXQARunner::getWorkspaceSize(int max_batch_beam_size) return workspace_size; } -bool DecoderXQARunner::shouldUseImpl(XQAParams const& xqaParams) +DecoderXQAImpl* DecoderXQARunner::getImplFromXQAParams(XQAParams const& xqaParams) { - return mImpl->shouldUse(xqaParams); + if (tensorrt_llm::common::getEnvDisableXQAJIT()) + { + // Always use Precompiled impl if TRTLLM_DISABLE_XQA_JIT is ON. + return mPrecompiledImpl.get(); + } + if (xqaParams.multi_query_tokens) + { + // Use precompiled cubin for medusa, because medusa cubins are generated from a different CUDA source file than + // non-medusa. + return mPrecompiledImpl.get(); + } + else + { + return mJITImpl.get(); + } +} + +bool DecoderXQARunner::shouldUseImpl(XQAParams const& xqa_params, bool for_configure_plugin) +{ + return getImplFromXQAParams(xqa_params)->shouldUse(xqa_params, for_configure_plugin); } void DecoderXQARunner::prepareForRun(XQAParams const& xqa_params) { - return mImpl->prepare(xqa_params); + return getImplFromXQAParams(xqa_params)->prepare(xqa_params); } template void DecoderXQARunner::run( XQAParams const& xqa_params, KVCacheBuffer const& kv_cache_buffer, cudaStream_t const& stream) { - return mImpl->run(xqa_params, kv_cache_buffer, stream); + return getImplFromXQAParams(xqa_params)->run(xqa_params, kv_cache_buffer, stream); } template void DecoderXQARunner::run( @@ -118,6 +139,42 @@ template void DecoderXQARunner::run( template void DecoderXQARunner::run( XQAParams const& xqa_params, KVBlockArray const& kv_block_array, cudaStream_t const& stream); +//// DecoderXQARunner::Resource +DecoderXQARunner::Resource::Resource() + : mCubinObjRegistry(std::make_unique()) +{ +} + +DecoderXQARunner::Resource::Resource(DecoderXQARunner::Resource const& other) + : mCubinObjRegistry(other.mCubinObjRegistry->clone()) +{ +} + +DecoderXQARunner::Resource& DecoderXQARunner::Resource::operator=(DecoderXQARunner::Resource const& other) +{ + if (this == &other) + { + return *this; + } + mCubinObjRegistry = other.mCubinObjRegistry->clone(); + return *this; +} + +DecoderXQARunner::Resource::Resource(void const* buffer, size_t buffer_size) + : mCubinObjRegistry(std::make_unique(buffer, buffer_size)) +{ +} + +size_t DecoderXQARunner::Resource::getSerializationSize() const noexcept +{ + return mCubinObjRegistry->getSerializationSize(); +} + +void DecoderXQARunner::Resource::serialize(void* buffer, size_t buffer_size) const noexcept +{ + mCubinObjRegistry->serialize(buffer, buffer_size); +} + } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h index 
5b0ceb5fe..8168f10ea 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h @@ -22,6 +22,8 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include "tensorrt_llm/kernels/gptKernels.h" @@ -75,8 +77,10 @@ struct XQADispatchHelper<__nv_bfloat16, KVBlockArray> class DecoderXQARunner { public: - DecoderXQARunner( - const XQADataType data_type, int num_heads, int num_kv_heads, int head_size, bool multi_block_mode); + // Resources for constructing a DecoderXQARunner object. + class Resource; + DecoderXQARunner(Resource* resource, const XQADataType data_type, int num_heads, int num_kv_heads, int head_size, + bool multi_block_mode); ~DecoderXQARunner(); /** @@ -155,41 +159,25 @@ class DecoderXQARunner SUPPORT_RETURN_FALSE("nbHeads"); } } - return shouldUseImpl(xqaParams); + return shouldUseImpl(xqaParams, forConfigurePlugin); } size_t getWorkspaceSize(int max_batch_beam_size); void prepare(XQAParams const& xqa_params) { - if (!mPrepareCalled) - { - this->prepareForRun(xqa_params); - mPrepareCalled = true; - } + this->prepareForRun(xqa_params); } template void dispatch(XQAParams const& xqa_params, KVCacheBuffer const& kv_cache_buffer, cudaStream_t const& stream) { - /* - TODO(minwei): re-enabling mPreparCalled checked once we figure out the root cause. - - See https://github.com/NVIDIA/TensorRT-LLM/issues/1256. - It is safe to remove the check for now, because this->prepareForRun() is effectively a no-op. It calls into - DecoderXQAImplPrecompiled::prepare(), which does nothing in its body. - - if (!mPrepareCalled) - { - TLLM_THROW("DecoderXQARunner::prepare() hasn't been called before DecoderXQARunner::dispatch()."); - } - */ sync_check_cuda_error(); this->run(xqa_params, kv_cache_buffer, stream); } private: - bool shouldUseImpl(XQAParams const& xqa_params); + bool shouldUseImpl(XQAParams const& xqa_params, bool for_configure_plugin); void prepareForRun(XQAParams const& xqa_params); template @@ -197,7 +185,7 @@ class DecoderXQARunner static constexpr int kMaxBeamWidth = 4; - bool mPrepareCalled; + Resource* mResource; XQADataType mDataType; int mNumHeads; @@ -206,9 +194,35 @@ class DecoderXQARunner bool mMultiBlockMode; int mMultiProcessorCount; - std::unique_ptr mImpl; + std::unique_ptr mJITImpl, mPrecompiledImpl; + DecoderXQAImpl* getImplFromXQAParams(XQAParams const& params); friend DecoderXQAImplPrecompiled; + friend DecoderXQAImplJIT; +}; + +class DecoderXQARunner::Resource +{ +public: + Resource(); + Resource(Resource const& other); + Resource& operator=(Resource const& other); + Resource(Resource&& other) = default; + Resource& operator=(Resource&& other) = default; + // Construct from a serialized buffer. 
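DecoderXQARunner::Resource wraps the jit::CubinObjRegistry and exposes getSerializationSize()/serialize() plus a constructor, declared just below, that rebuilds the registry from serialized bytes. A rough sketch of the round trip a caller might perform (the buffer handling here is an assumption, not code from this patch):

```cpp
#include <cstddef>
#include <vector>

#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h"

using tensorrt_llm::kernels::DecoderXQARunner;

// Snapshot the compiled-cubin registry into a byte blob.
std::vector<char> saveResource(DecoderXQARunner::Resource const& resource)
{
    std::vector<char> blob(resource.getSerializationSize());
    resource.serialize(blob.data(), blob.size());
    return blob;
}

// Rebuild the registry from previously serialized bytes.
DecoderXQARunner::Resource loadResource(std::vector<char> const& blob)
{
    return DecoderXQARunner::Resource(blob.data(), blob.size());
}
```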
+ Resource(void const* buffer, size_t buffer_size); + ~Resource() = default; + + jit::CubinObjRegistry* getCubinObjRegistry() + { + return mCubinObjRegistry.get(); + } + + size_t getSerializationSize() const noexcept; + void serialize(void* buffer, size_t buffer_size) const noexcept; + +private: + std::unique_ptr mCubinObjRegistry; }; } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index ada4f02ba..3651066b8 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -44,11 +44,11 @@ struct XQAParams int32_t sink_token_length = 0; int timestep = 0; void const* qkv_bias; - int32_t const* sequence_lengths; // - int32_t const* context_lengths; // maybe not used now - void const* alibi_slopes; // maybe not used now - int32_t const* medusa_packed_mask; - int const* medusa_position_offsets; // rotary embedding. + int32_t const* sequence_lengths; // + int32_t const* context_lengths; // maybe not used now + void const* alibi_slopes; // maybe not used now + int32_t const* spec_decoding_packed_mask; + int const* spec_decoding_position_offsets; // rotary embedding. // almost copy from GPTAttentionPluginCommon. // maybe use one struct for parameters in GPTAttentionPluginCommon and share the same here. diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h index 3ac5b560f..1dc6a0036 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h @@ -2609,10 +2609,28 @@ inline __device__ void update_rotary_base_n_scale(float& base, float& scale, Rot } } -inline __device__ float2 rotary_embedding_coefficient( - int const zid, int const rot_embed_dim, float const base, float const scale, float const t_step) +inline __device__ float2 rotary_embedding_coefficient(int const zid, int const rot_embed_dim, float const base, + float const scale, float const t_step, int const vision_start = -1, int const vision_length = -1) { - float const inv_freq = float(t_step * scale) / powf(base, zid / (float) rot_embed_dim); + float real_step = t_step; + if (vision_start != -1 && vision_length != -1) + { + int t_step_int = (int) t_step; + if (t_step_int <= vision_start) + { + real_step = t_step_int; + } + else if (t_step_int > vision_start && t_step_int <= (vision_length + vision_start)) + { + real_step = vision_start + 1; + } + else + { + real_step = t_step_int - (vision_length - 1); + } + } + + float const inv_freq = (real_step * scale) / powf(base, zid / (float) rot_embed_dim); return {cosf(inv_freq), sinf(inv_freq)}; } @@ -2640,42 +2658,50 @@ inline __device__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloat162 } #endif -inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { return; } -inline __device__ void apply_rotary_embedding( - float& q, float& k, int zid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, float base, 
float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { return; } -inline __device__ void apply_rotary_embedding( - float2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (2 * tid >= rot_embed_dim) { return; } - auto const coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); + auto const coef + = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding( - float2& q, float2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (2 * tid >= rot_embed_dim) { return; } - auto const coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); + auto const coef + = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding( - float4& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (4 * tid >= rot_embed_dim) { @@ -2683,14 +2709,17 @@ inline __device__ void apply_rotary_embedding( } Float4_& q_ = *reinterpret_cast(&q); - auto const coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.x = rotary_embedding_transform(q_.x, coef0); - auto const coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.y = rotary_embedding_transform(q_.y, coef1); } -inline __device__ void apply_rotary_embedding( - float4& q, float4& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (4 * tid >= rot_embed_dim) { @@ -2699,16 +2728,19 @@ inline __device__ void apply_rotary_embedding( Float4_& q_ = *reinterpret_cast(&q); Float4_& k_ = *reinterpret_cast(&k); - auto const coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.x = rotary_embedding_transform(q_.x, coef0); k_.x = rotary_embedding_transform(k_.x, coef0); - auto const coef1 = rotary_embedding_coefficient(4 * tid + 2, 
rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.y = rotary_embedding_transform(q_.y, coef1); k_.y = rotary_embedding_transform(k_.y, coef1); } -inline __device__ void apply_rotary_embedding( - Float8_& q, Float8_& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(Float8_& q, Float8_& k, int tid, int rot_embed_dim, float base, + float scale, float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (8 * tid >= rot_embed_dim) { @@ -2717,205 +2749,289 @@ inline __device__ void apply_rotary_embedding( Float8_& q_ = *reinterpret_cast(&q); Float8_& k_ = *reinterpret_cast(&k); - auto const coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.x = rotary_embedding_transform(q_.x, coef0); k_.x = rotary_embedding_transform(k_.x, coef0); - auto const coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.y = rotary_embedding_transform(q_.y, coef1); k_.y = rotary_embedding_transform(k_.y, coef1); - auto const coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); + auto const coef2 + = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.z = rotary_embedding_transform(q_.z, coef2); k_.z = rotary_embedding_transform(k_.z, coef2); - auto const coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); + auto const coef3 + = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q_.w = rotary_embedding_transform(q_.w, coef3); k_.w = rotary_embedding_transform(k_.w, coef3); } -inline __device__ void apply_rotary_embedding( - uint32_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (2 * tid >= rot_embed_dim) { return; } - auto const coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); + auto const coef + = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding( - uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, float base, + float scale, float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (2 * tid >= rot_embed_dim) { return; } - auto const coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); + auto const coef + = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } 
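The vision_start/vision_length arguments threaded through rotary_embedding_coefficient above implement a simple position remap for multimodal prompts: text tokens before the vision span keep their step, every vision token collapses onto a single step, and text after the span shifts back to close the gap. A host-side sketch of just that branch logic, with illustrative numbers (the helper name and the values are mine):

```cpp
#include <cassert>

// Mirrors the real_step computation in rotary_embedding_coefficient when
// vision_start/vision_length are set; both default to -1, which disables it.
float remapRotaryStep(int tStep, int visionStart, int visionLength)
{
    if (visionStart == -1 || visionLength == -1)
    {
        return static_cast<float>(tStep); // text-only path, no remapping
    }
    if (tStep <= visionStart)
    {
        return static_cast<float>(tStep); // tokens before the vision span keep their position
    }
    if (tStep <= visionStart + visionLength)
    {
        return static_cast<float>(visionStart + 1); // all vision tokens share one rotary position
    }
    return static_cast<float>(tStep - (visionLength - 1)); // text after the span closes the gap
}

int main()
{
    // A vision span of 100 tokens starting after position 4:
    assert(remapRotaryStep(4, 4, 100) == 4.0f);   // before the span
    assert(remapRotaryStep(50, 4, 100) == 5.0f);  // inside the span
    assert(remapRotaryStep(104, 4, 100) == 5.0f); // last vision token
    assert(remapRotaryStep(105, 4, 100) == 6.0f); // first text token after the span
    return 0;
}
```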
-inline __device__ void apply_rotary_embedding(half2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(half2& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { - return apply_rotary_embedding(*reinterpret_cast(&q), tid, rot_embed_dim, base, scale, t_step); + return apply_rotary_embedding(*reinterpret_cast(&q), tid, rot_embed_dim, base, scale, mscale, + rotary_embedding_scaling_factors, t_step, vision_start, vision_length); } -inline __device__ void apply_rotary_embedding( - half2& q, half2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(half2& q, half2& k, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { - return apply_rotary_embedding( - *reinterpret_cast(&q), *reinterpret_cast(&k), tid, rot_embed_dim, base, scale, t_step); + return apply_rotary_embedding(*reinterpret_cast(&q), *reinterpret_cast(&k), tid, + rot_embed_dim, base, scale, mscale, rotary_embedding_scaling_factors, t_step, vision_start, vision_length); } -inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (4 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.x = rotary_embedding_transform(q.x, coef0); - auto const coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding( - uint2& q, uint2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ float2 rotary_embedding_coefficient_long_rope( + int const zid, int const rot_embed_dim, float const base, float const scale, float const mscale, float const t_step) +{ + float const inv_freq = float(t_step * scale) / powf(base, zid / (float) rot_embed_dim); + return {cosf(inv_freq) * mscale, sinf(inv_freq) * mscale}; +} + +inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (4 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); + float2 coef0, coef1; + if (rotary_embedding_scaling_factors != nullptr) + { + float fscale = *(rotary_embedding_scaling_factors + (2 * tid)); + fscale = 1.0 / fscale; + coef0 = rotary_embedding_coefficient_long_rope(4 * tid, rot_embed_dim, base, fscale, mscale, t_step); + + fscale = *(rotary_embedding_scaling_factors + (2 * tid) + 1); + fscale = 1.0 / fscale; + coef1 = rotary_embedding_coefficient_long_rope(4 * 
tid + 2, rot_embed_dim, base, fscale, mscale, t_step); + } + else + { + coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); + coef1 = rotary_embedding_coefficient( + 4 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); + } q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - auto const coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); } -inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (8 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.x = rotary_embedding_transform(q.x, coef0); - auto const coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.y = rotary_embedding_transform(q.y, coef1); - auto const coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); + auto const coef2 + = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.z = rotary_embedding_transform(q.z, coef2); - auto const coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); + auto const coef3 + = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding( - uint4& q, uint4& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (8 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - auto const coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); - auto const coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); + auto const coef2 + = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.z = rotary_embedding_transform(q.z, coef2); k.z = rotary_embedding_transform(k.z, coef2); - auto const coef3 = rotary_embedding_coefficient(8 * tid + 6, 
rot_embed_dim, base, scale, t_step); + auto const coef3 + = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.w = rotary_embedding_transform(q.w, coef3); k.w = rotary_embedding_transform(k.w, coef3); } #ifdef ENABLE_BF16 -inline __device__ void apply_rotary_embedding( - __nv_bfloat162& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (2 * tid >= rot_embed_dim) { return; } - auto const coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); + auto const coef + = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding( - __nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, + float base, float scale, float mscale, float const* rotary_embedding_scaling_factors, int t_step, + int vision_start = -1, int vision_length = -1) { if (2 * tid >= rot_embed_dim) { return; } - auto const coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); + auto const coef + = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding( - bf16_4_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (4 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.x = rotary_embedding_transform(q.x, coef0); - auto const coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding( - bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, + float scale, float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (4 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); + + float2 coef0, coef1; + if (rotary_embedding_scaling_factors != nullptr) + { + float fscale = *(rotary_embedding_scaling_factors + (2 * tid)); + fscale = 1.0 / fscale; + coef0 = rotary_embedding_coefficient_long_rope(4 * tid, rot_embed_dim, base, fscale, mscale, t_step); + + fscale = *(rotary_embedding_scaling_factors + (2 * tid) + 
1); + fscale = 1.0 / fscale; + coef1 = rotary_embedding_coefficient_long_rope(4 * tid + 2, rot_embed_dim, base, fscale, mscale, t_step); + } + else + { + coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); + coef1 = rotary_embedding_coefficient( + 4 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); + } q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - auto const coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); } -inline __device__ void apply_rotary_embedding( - bf16_8_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, float base, float scale, + float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (8 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.x = rotary_embedding_transform(q.x, coef0); - auto const coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.y = rotary_embedding_transform(q.y, coef1); - auto const coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); + auto const coef2 + = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.z = rotary_embedding_transform(q.z, coef2); - auto const coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); + auto const coef3 + = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding( - bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) +inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, + float scale, float mscale, float const* rotary_embedding_scaling_factors, int t_step, int vision_start = -1, + int vision_length = -1) { if (8 * tid >= rot_embed_dim) { return; } - auto const coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); + auto const coef0 + = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - auto const coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); + auto const coef1 + = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); - auto const coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); + auto const coef2 + = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.z = rotary_embedding_transform(q.z, coef2); k.z = 
rotary_embedding_transform(k.z, coef2); - auto const coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); + auto const coef3 + = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step, vision_start, vision_length); q.w = rotary_embedding_transform(q.w, coef3); k.w = rotary_embedding_transform(k.w, coef3); } diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index 798c9fd10..c8c152a66 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -508,12 +508,12 @@ __global__ void copyNextStepIds(TokenIdType* nextStepIds, TokenIdType const* con auto const newTokens = numNewTokens == nullptr ? 1 : numNewTokens[batchSlot]; auto const batchBeamIdx = batchSlot * beamWidth + beamIdx; auto const tokenBatchBeamIdx = tokenIdx * maxBatchSize * beamWidth + batchSlot * beamWidth + beamIdx; - if (tokenIdx >= newTokens) + auto const index_src = beamIdx * maxSeqLen + sequenceLengths[batchBeamIdx] - newTokens + tokenIdx; + if (tokenIdx >= newTokens || index_src < 0) { continue; } - nextStepIds[tokenBatchBeamIdx] - = outputIdsPtr[batchSlot][beamIdx * maxSeqLen + sequenceLengths[batchBeamIdx] - newTokens + tokenIdx]; + nextStepIds[tokenBatchBeamIdx] = outputIdsPtr[batchSlot][index_src]; } } diff --git a/cpp/tensorrt_llm/kernels/gptKernels.h b/cpp/tensorrt_llm/kernels/gptKernels.h index bb32255c6..fc1c55e7c 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.h +++ b/cpp/tensorrt_llm/kernels/gptKernels.h @@ -42,11 +42,12 @@ enum class PositionEmbeddingType : int8_t kLEARNED_ABSOLUTE = 0, kROPE_GPTJ = 1, kROPE_GPT_NEOX = 2, + kLONG_ROPE = 3, // Workflow: (bmm1_output * scale_bmm1 + alibi). - kALIBI = 3, + kALIBI = 4, // Workflow: (bmm1_output + alibi) * scale_bmm1. - kALIBI_WITH_SCALE = 4, - kRELATIVE = 5 + kALIBI_WITH_SCALE = 5, + kRELATIVE = 6, }; enum class RotaryScalingType : int8_t diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu index 75e9ec1fd..dace2728d 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu @@ -200,8 +200,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(float const* inputs_after_softmax template __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ - void topkGatingSoftmax(float const* input, bool const* finished, float* output, int const num_rows, int* indices, - int* source_rows, int const k, int const start_expert, int const end_expert) + void topkGatingSoftmax(float const* input, bool const* finished, float* output, int64_t const num_rows, + int* indices, int* source_rows, int const k, int const start_expert, int const end_expert) { // We begin by enforcing compile time assertions and setting up compile time constants. 
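Back in decoderMaskedMultiheadAttentionUtils.h, the overloads that receive a non-null rotary_embedding_scaling_factors pointer switch to rotary_embedding_coefficient_long_rope: the step is scaled by the reciprocal of a per-rotary-pair factor and both cos and sin are multiplied by mscale, matching the new kLONG_ROPE position-embedding type added in gptKernels.h. A host-side sketch of that coefficient (function name and packaging are mine):

```cpp
#include <cmath>
#include <utility>

// Returns {cos, sin} for rotary channel pair zid, using one scaling factor per
// pair (index zid / 2), as in the uint2 / bf16_4_t overloads above.
std::pair<float, float> longRopeCoefficient(
    int zid, int rotEmbedDim, float base, float const* scalingFactors, float mscale, float tStep)
{
    float const fscale = 1.0f / scalingFactors[zid / 2];
    float const invFreq = (tStep * fscale) / std::pow(base, zid / static_cast<float>(rotEmbedDim));
    return {std::cos(invFreq) * mscale, std::sin(invFreq) * mscale};
}
```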
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); @@ -403,7 +403,7 @@ template struct TopkConstants { static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); - static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, ""); + static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0); static constexpr int VECs_PER_THREAD = std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE)); static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG; static constexpr int THREADS_PER_ROW = EXPERTS / VPT; @@ -413,7 +413,8 @@ struct TopkConstants template void topkGatingSoftmaxLauncherHelper(float const* input, bool const* finished, float* output, int* indices, - int* source_row, int const num_rows, int const k, int const start_expert, int const end_expert, cudaStream_t stream) + int* source_row, int64_t const num_rows, int const k, int const start_expert, int const end_expert, + cudaStream_t stream) { static constexpr std::size_t MAX_BYTES_PER_LDG = 16; @@ -430,8 +431,8 @@ void topkGatingSoftmaxLauncherHelper(float const* input, bool const* finished, f } void topkGatingSoftmaxKernelLauncher(float const* input, bool const* finished, float* output, - float* softmax_temp_output, int* indices, int* source_row, int const num_rows, int const num_experts, int const k, - int const start_expert, int const end_expert, cudaStream_t stream) + float* softmax_temp_output, int* indices, int* source_row, int64_t const num_rows, int const num_experts, + int const k, int const start_expert, int const end_expert, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; @@ -523,11 +524,11 @@ void CubKeyValueSorter::updateNumExperts(int const num_experts) size_t CubKeyValueSorter::getWorkspaceSize(const size_t num_key_value_pairs, int const num_experts) { - size_t num_bits = (int) log2(num_experts) + 1; + int num_bits = static_cast(log2(num_experts)) + 1; size_t required_storage = 0; int* null_int = nullptr; cub::DeviceRadixSort::SortPairs( - NULL, required_storage, null_int, null_int, null_int, null_int, num_key_value_pairs, 0, num_bits); + nullptr, required_storage, null_int, null_int, null_int, null_int, num_key_value_pairs, 0, num_bits); return required_storage; } @@ -546,7 +547,7 @@ void CubKeyValueSorter::run(void* workspace, const size_t workspace_size, int co // ============================== Infer GEMM sizes ================================= // TODO Could linear search be better for small # experts template -__device__ inline int findTotalEltsLeqTarget(T const* sorted_indices, int const arr_length, const T target) +__device__ inline int64_t findTotalEltsLeqTarget(T const* sorted_indices, int const arr_length, const T target) { int64_t low = 0, high = arr_length - 1, target_location = -1; while (low <= high) @@ -613,7 +614,7 @@ CUTLASS_HOST_DEVICE cute::Stride, StrideIntT, cute::Int<0>> make_cu } // namespace detail __device__ void computeHopperInputStrides( - HopperGroupedGemmInput layout_info, int gemm_m, int gemm_n, int gemm_k, int out_idx) + HopperGroupedGemmInput layout_info, int gemm_m, int gemm_n, int gemm_k, int64_t out_idx) { layout_info.stride_a[out_idx] = detail::make_cute_packed_stride( HopperGroupedGemmInput::StrideA{}, cute::make_shape(gemm_m, gemm_k, cute::Int<1>{})); @@ -677,6 +678,9 @@ __global__ void computeStridesHopperKernel(int64_t const* total_rows_before_expe layout_info.alpha_scale_ptr_array[expert] = fp8_dequant + expert; } + assert(gemm_m <= INT32_MAX); + assert(gemm_n <= INT32_MAX); + 
assert(gemm_k <= INT32_MAX); computeHopperInputStrides(layout_info, gemm_m, gemm_n, gemm_k, expert); computeHopperInputPointers( @@ -699,24 +703,25 @@ __global__ void computeStridesHopperKernel(int64_t const* total_rows_before_expe template __global__ void expandInputRowsKernel(T const* unpermuted_input, T* permuted_output, int const* expanded_dest_row_to_expanded_source_row, int* expanded_source_row_to_expanded_dest_row, - int const num_rows, int64_t const* num_dest_rows, int const cols) + int64_t const num_rows, int64_t const* num_dest_rows, int64_t const cols) { // Reverse permutation map. // I do this so that later, we can use the source -> dest map to do the k-way reduction and unpermuting. I need the // reverse map for that reduction to allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1 // thread block will be responsible for all k summations. - int const expanded_dest_row = blockIdx.x; - int const expanded_source_row = expanded_dest_row_to_expanded_source_row[expanded_dest_row]; + int64_t const expanded_dest_row = blockIdx.x; + int64_t const expanded_source_row = expanded_dest_row_to_expanded_source_row[expanded_dest_row]; if (threadIdx.x == 0) { - expanded_source_row_to_expanded_dest_row[expanded_source_row] = expanded_dest_row; + assert(expanded_dest_row <= INT32_MAX); + expanded_source_row_to_expanded_dest_row[expanded_source_row] = static_cast(expanded_dest_row); } if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { // Duplicate and permute rows - int const source_row = expanded_source_row % num_rows; + int64_t const source_row = expanded_source_row % num_rows; T const* source_row_ptr = unpermuted_input + source_row * cols; T* dest_row_ptr = permuted_output + expanded_dest_row * cols; @@ -731,10 +736,10 @@ __global__ void expandInputRowsKernel(T const* unpermuted_input, T* permuted_out template void expandInputRowsKernelLauncher(T const* unpermuted_input, T* permuted_output, int const* expanded_dest_row_to_expanded_source_row, int* expanded_source_row_to_expanded_dest_row, - int const num_rows, int64_t const* num_valid_tokens_ptr, int const cols, int const k, cudaStream_t stream) + int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, cudaStream_t stream) { - int const blocks = num_rows * k; - int const threads = std::min(cols, 1024); + int64_t const blocks = num_rows * k; + int64_t const threads = std::min(cols, int64_t{1024}); auto func = (num_valid_tokens_ptr != nullptr) ? 
expandInputRowsKernel : expandInputRowsKernel; func<<>>(unpermuted_input, permuted_output, expanded_dest_row_to_expanded_source_row, expanded_source_row_to_expanded_dest_row, num_rows, num_valid_tokens_ptr, cols); @@ -752,8 +757,8 @@ enum class ScaleMode : int template __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, T const* bias, float const* scales, - int const* expanded_source_row_to_expanded_dest_row, int const* expert_for_source_row, int const cols, int const k, - int64_t const* num_valid_ptr) + int const* expanded_source_row_to_expanded_dest_row, int const* expert_for_source_row, int64_t const cols, + int64_t const k, int64_t const* num_valid_ptr) { int const original_row = blockIdx.x; int const num_rows = gridDim.x; @@ -766,10 +771,10 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted float row_rescale{0.f}; for (int k_idx = 0; k_idx < k; ++k_idx) { - int const expanded_original_row = original_row + k_idx * num_rows; - int const expanded_permuted_row = expanded_source_row_to_expanded_dest_row[expanded_original_row]; + int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_permuted_row = expanded_source_row_to_expanded_dest_row[expanded_original_row]; - const int64_t k_offset = original_row * k + k_idx; + int64_t const k_offset = original_row * k + k_idx; float const row_scale = (SCALE_MODE == ScaleMode::NO_SCALE) ? 1.f : scales[k_offset]; if constexpr (SCALE_MODE == ScaleMode::RENORM_SCALE) { @@ -784,7 +789,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted auto const* expanded_permuted_rows_row_ptr = expanded_permuted_rows + expanded_permuted_row * cols; - int const expert_idx = expert_for_source_row[k_offset]; + int64_t const expert_idx = expert_for_source_row[k_offset]; T const* bias_ptr = bias + expert_idx * cols; float const bias_value = bias ? 
static_cast(bias_ptr[tid]) : 0.f; @@ -807,12 +812,12 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted template void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, T const* bias, float const* scales, - int const* expanded_source_row_to_expanded_dest_row, int const* expert_for_source_row, int const num_rows, - int const cols, int const k, int64_t const* num_valid_ptr, MOEParallelismConfig parallelism_config, + int const* expanded_source_row_to_expanded_dest_row, int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, MOEParallelismConfig parallelism_config, MOEExpertScaleNormalizationMode normalization_mode, cudaStream_t stream) { - int const blocks = num_rows; - int const threads = std::min(cols, 1024); + int64_t const blocks = num_rows; + int64_t const threads = std::min(cols, int64_t{1024}); // Only add bias on rank 0 for tensor parallelism bool const is_rank_0 = parallelism_config.tp_rank == 0; @@ -848,10 +853,10 @@ void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_ro template __global__ void doGatedActivationKernel( - T* output, T const* gemm_result, int64_t const* num_valid_tokens_ptr, size_t inter_size) + T* output, T const* gemm_result, int64_t const* num_valid_tokens_ptr, int64_t inter_size) { - int const tid = threadIdx.x; - int const token = blockIdx.x; + int64_t const tid = threadIdx.x; + int64_t const token = blockIdx.x; if (num_valid_tokens_ptr && token >= *num_valid_tokens_ptr) { return; @@ -860,7 +865,7 @@ __global__ void doGatedActivationKernel( ActFn fn{}; output = output + token * inter_size; gemm_result = gemm_result + token * inter_size * 2; - for (int i = tid; i < inter_size; i += blockDim.x) + for (int64_t i = tid; i < inter_size; i += blockDim.x) { auto fc1_value = static_cast(gemm_result[i]); // BF16 isn't supported, use FP32 for activation function @@ -871,11 +876,11 @@ __global__ void doGatedActivationKernel( } template -void doGatedActivation(T* output, T const* gemm_result, int64_t const* num_valid_tokens_ptr, int inter_size, - int num_tokens, ActivationType activation_type, cudaStream_t stream) +void doGatedActivation(T* output, T const* gemm_result, int64_t const* num_valid_tokens_ptr, int64_t inter_size, + int64_t num_tokens, ActivationType activation_type, cudaStream_t stream) { - int const blocks = num_tokens; - int const threads = std::min(inter_size, 1024); + int64_t const blocks = num_tokens; + int64_t const threads = std::min(inter_size, int64_t{1024}); // TODO Instead of T use a vectored type if performance would benefit // TODO For some reason Volta fails on GELU_taylor here with Warp Illegal Instruction. 
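The hunks above consistently widen row counts, column counts and the derived loop indices from int to int64_t. The point is that element offsets such as token * inter_size or expanded_dest_row * cols are products of two runtime dimensions and can exceed INT32_MAX for large batches, at which point 32-bit arithmetic wraps before the pointer addition happens. The following is a minimal CUDA sketch of that pattern, not the shipped MoE kernel; the names scaleRowsKernel, rows and cols are illustrative only.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cuda_runtime.h>

// With 32-bit dimensions the element offset row * cols silently wraps once it
// exceeds INT32_MAX (e.g. rows = 300000, cols = 8192 is ~2.46e9 elements).
// Promoting the dimensions to int64_t keeps the pointer arithmetic well defined.
__global__ void scaleRowsKernel(float* data, int64_t rows, int64_t cols, float factor)
{
    int64_t const row = blockIdx.x; // one block per row
    if (row >= rows)
    {
        return;
    }
    float* rowPtr = data + row * cols; // 64-bit multiply, no wrap-around
    for (int64_t i = threadIdx.x; i < cols; i += blockDim.x)
    {
        rowPtr[i] *= factor;
    }
}

void launchScaleRows(float* data, int64_t rows, int64_t cols, float factor, cudaStream_t stream)
{
    // Grid dimensions are still 32-bit, mirroring the patch's explicit
    // "num_rows is too large" checks before building the source maps.
    assert(rows <= INT32_MAX);
    dim3 const grid(static_cast<unsigned>(rows));
    dim3 const block(static_cast<unsigned>(std::min<int64_t>(cols, 1024)));
    scaleRowsKernel<<<grid, block, 0, stream>>>(data, rows, cols, factor);
}

The launcher mirrors the changed MoE launchers, which clamp threads with std::min(cols, int64_t{1024}) while keeping the blocks-per-row mapping unchanged.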
@@ -890,10 +895,10 @@ void doGatedActivation(T* output, T const* gemm_result, int64_t const* num_valid template __global__ void doActivationKernel(T* output, HopperGroupedGemmInput::OutputTypeAdaptor_t const* gemm_result, float const* fp8_quant, T const* bias_ptr, int64_t const* total_rows_before_expert_, int num_experts, - size_t inter_size, bool gated) + int64_t inter_size, bool gated) { - int const tid = threadIdx.x; - int const token = blockIdx.x; + int64_t const tid = threadIdx.x; + int64_t const token = blockIdx.x; if (token >= total_rows_before_expert_[num_experts - 1]) { return; @@ -906,7 +911,7 @@ __global__ void doActivationKernel(T* output, HopperGroupedGemmInput::OutputType gemm_result = gemm_result + token * inter_size * gated_mul; output = output + token * inter_size; // Aliases gemm_result for non-gated, non-fp8 cases - int expert = 0; + int64_t expert = 0; if (bias_ptr) { // TODO this is almost certainly faster as a linear scan @@ -919,7 +924,7 @@ __global__ void doActivationKernel(T* output, HopperGroupedGemmInput::OutputType { bias_ptr = bias_ptr + expert * inter_size * gated_mul; } - for (int i = tid; i < inter_size; i += blockDim.x) + for (int64_t i = tid; i < inter_size; i += blockDim.x) { auto fc1_value = static_cast(gemm_result[i + gated_off]); if (bias_ptr) @@ -940,11 +945,11 @@ __global__ void doActivationKernel(T* output, HopperGroupedGemmInput::OutputType template void doActivation(T* output, HopperGroupedGemmInput::OutputTypeAdaptor_t const* gemm_result, float const* fp8_quant, - T const* bias, int64_t const* total_rows_before_expert_, int num_experts, int inter_size, int num_tokens, + T const* bias, int64_t const* total_rows_before_expert_, int num_experts, int64_t inter_size, int64_t num_tokens, ActivationType activation_type, cudaStream_t stream) { - int const blocks = num_tokens; - int const threads = std::min(inter_size, 1024); + int64_t const blocks = num_tokens; + int64_t const threads = std::min(inter_size, int64_t{1024}); // TODO Instead of T use a vectored type if performance would benefit auto fn_list = std::array{ @@ -961,9 +966,9 @@ void doActivation(T* output, HopperGroupedGemmInput::OutputTypeAdaptor_t cons } template -std::vector CutlassMoeFCRunner::getWorkspaceBufferSizes(int const num_rows, - int const hidden_size, int const inter_size, int const num_experts, int const num_experts_per_node, int const k, - ActivationType activation_type) const +std::vector CutlassMoeFCRunner::getWorkspaceBufferSizes( + int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts, + int const num_experts_per_node, int const k, ActivationType activation_type) const { const size_t num_moe_inputs = k * num_rows; const size_t permuted_elems = num_moe_inputs * hidden_size; @@ -979,7 +984,7 @@ std::vector CutlassMoeFCRunner::getWo // We need to have separate memory for these as we can no longer alias the output buffer for reuse glu_inter_elems = interbuf_elems; } - int num_softmax_outs = 0; + size_t num_softmax_outs = 0; bool using_hopper = moe_gemm_runner_.supportsHopperSpecialisation(); const size_t gemm_output_dtype = using_hopper ? 
sizeof(HopperGemmOutputType) : sizeof(T); @@ -1011,9 +1016,9 @@ std::vector CutlassMoeFCRunner::getWo } template -size_t CutlassMoeFCRunner::getWorkspaceSize(int const num_rows, - int const hidden_size, int const inter_size, int const num_experts, int const k, ActivationType activation_type, - MOEParallelismConfig parallelism_config) const +size_t CutlassMoeFCRunner::getWorkspaceSize(int64_t const num_rows, + int64_t const hidden_size, int64_t const inter_size, int const num_experts, int const k, + ActivationType activation_type, MOEParallelismConfig parallelism_config) const { int const ep_size = parallelism_config.ep_size; TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of tp size"); @@ -1023,9 +1028,9 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(i } template -void CutlassMoeFCRunner::configureWsPtrs(char* ws_ptr, int const num_rows, - int const hidden_size, int const inter_size, int const num_experts, int const num_experts_per_node, int const k, - ActivationType activation_type) +void CutlassMoeFCRunner::configureWsPtrs(char* ws_ptr, int64_t const num_rows, + int64_t const hidden_size, int64_t const inter_size, int const num_experts, int const num_experts_per_node, + int const k, ActivationType activation_type) { auto ws_sizes = getWorkspaceBufferSizes( num_rows, hidden_size, inter_size, num_experts, num_experts_per_node, k, activation_type); @@ -1070,26 +1075,27 @@ template void CutlassMoeFCRunner::runMoe(void const* input_activations_void, float const* gating_output, void const* fc1_expert_weights_void, void const* fc1_expert_biases_void, ActivationType fc1_activation_type, void const* fc2_expert_weights_void, void const* fc2_expert_biases_void, - QuantParams quant_params, int const num_rows, int const hidden_size, int const inter_size, int const num_experts, - int const k, char* workspace_ptr, void* final_output_void, bool const* finished, int const active_rows, - void* expert_scales_void, int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, - MOEParallelismConfig parallelism_config, MOEExpertScaleNormalizationMode normalization_mode, cudaStream_t stream) + QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts, int const k, char* workspace_ptr, void* final_output_void, bool const* finished, + int64_t const active_rows, void* expert_scales_void, int* expanded_source_row_to_expanded_dest_row, + int* expert_for_source_row, MOEParallelismConfig parallelism_config, + MOEExpertScaleNormalizationMode normalization_mode, cudaStream_t stream) { static constexpr bool int_scales_required = std::is_same::value || std::is_same::value; static constexpr bool fp8_scales_required = std::is_same::value || std::is_same::value; - auto* input_activations = static_cast(input_activations_void); - auto* fc1_expert_weights = static_cast(fc1_expert_weights_void); - auto* fc1_expert_biases = static_cast(fc1_expert_biases_void); - auto* fc2_expert_weights = static_cast(fc2_expert_weights_void); - auto* fc1_int_scales = static_cast(quant_params.fc1_weight_scales); - auto* fc2_int_scales = static_cast(quant_params.fc2_weight_scales); - auto* fc1_fp8_dequant = static_cast(quant_params.dequant_fc1); - auto* fc2_fp8_quant = static_cast(quant_params.quant_fc2); - auto* fc2_fp8_dequant = static_cast(quant_params.dequant_fc2); - auto* fc2_expert_biases = static_cast(fc2_expert_biases_void); + auto const* input_activations = static_cast(input_activations_void); + auto const* 
fc1_expert_weights = static_cast(fc1_expert_weights_void); + auto const* fc1_expert_biases = static_cast(fc1_expert_biases_void); + auto const* fc2_expert_weights = static_cast(fc2_expert_weights_void); + auto const* fc1_int_scales = static_cast(quant_params.fc1_weight_scales); + auto const* fc2_int_scales = static_cast(quant_params.fc2_weight_scales); + auto const* fc1_fp8_dequant = quant_params.dequant_fc1; + auto const* fc2_fp8_quant = quant_params.quant_fc2; + auto const* fc2_fp8_dequant = quant_params.dequant_fc2; + auto const* fc2_expert_biases = static_cast(fc2_expert_biases_void); auto* final_output = static_cast(final_output_void); auto* expert_scales = static_cast(expert_scales_void); @@ -1105,6 +1111,11 @@ void CutlassMoeFCRunner::runMoe(void const* i TLLM_CHECK_WITH_INFO(hidden_size >= 128 / cutlass::sizeof_bits::value, "Hidden size is too small to meet alignment requirements for MOE GEMM"); + // These values must fit into an int for building the source maps + TLLM_CHECK_WITH_INFO(num_rows <= std::numeric_limits::max(), "Number of rows is too large"); + TLLM_CHECK_WITH_INFO( + num_rows * num_experts <= std::numeric_limits::max(), "Number of rows * num_experts is too large"); + if (int_scales_required) { TLLM_CHECK_WITH_INFO( @@ -1166,7 +1177,7 @@ void CutlassMoeFCRunner::runMoe(void const* i const size_t fc1_out_size = is_gated_activation ? inter_size * 2 : inter_size; // Upper bound on number of expanded rows - int const expanded_active_expert_rows = k * active_rows; + int64_t const expanded_active_expert_rows = k * active_rows; computeTotalRowsBeforeExpert( permuted_experts_, expanded_active_expert_rows, num_experts_per_node, total_rows_before_expert_, stream); @@ -1272,7 +1283,7 @@ void CutlassMoeFCRunner::computeTotalRowsBefo template HopperGroupedGemmInput CutlassMoeFCRunner::computeStridesHopper( - int64_t const* total_rows_before_expert, HopperGroupedGemmInput layout_info, int gemm_n, int gemm_k, + int64_t const* total_rows_before_expert, HopperGroupedGemmInput layout_info, int64_t gemm_n, int64_t gemm_k, int const num_experts, T const* in, WeightType const* weights, float const* fp8_dequant, T const* bias, HopperGemmOutputType* output, cudaStream_t stream) { @@ -1322,7 +1333,7 @@ void makeLoadBalancedRoutingConfiguration( void* data_void, int num_experts, int num_tokens, int k, nvinfer1::DataType type, cudaStream_t stream) { TLLM_CHECK_WITH_INFO(type == nvinfer1::DataType::kFLOAT, "Routing configuration must be float"); - check_cuda_error(cudaMemsetAsync(data_void, 0x0, num_experts * num_tokens * sizeof(float), stream)); + check_cuda_error(cudaMemsetAsync(data_void, 0x0, int64_t{num_experts} * num_tokens * sizeof(float), stream)); int stride = tensorrt_llm::common::ceilDiv(num_experts, k); diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h index 6f52c60cb..c1ac94e09 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.h @@ -33,26 +33,6 @@ static inline size_t pad_to_multiple_of_16(size_t const& input) return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); } -/* - Launches the topk gating softmax required for the MoE layers. - - Params: - input - a [num_rows x num_experts] - finished - [num_rows] vector with 1 if the sentence at this row is done translating and 0 otherwise. - output - a buffer of shape [num_rows x k] containing the top-k values of the softmax for each row. 
- indices - a matrix of shape [num_rows x k] containing the top-k experts each row should get routed to. - source_rows - a matrix of shape [num_rows x k] used internally for permuting. source_rows[row][k] = k * num_rows + - row. It is constructed like this so we can track where each of the original rows end up in order to perform the - "k-way" reduction later in the routing. - - num_rows - The number of rows in the matrix - num_experts - The number of expert layers present - k - k value in topk -*/ -template -void topk_gating_softmax_kernelLauncher(T const* input, bool const* finished, T* output, T* softmax_temp_out, - int* indices, int* source_row, int const num_rows, int const num_experts, int const k, cudaStream_t stream); - class CubKeyValueSorter { public: @@ -155,7 +135,7 @@ class CutlassMoeFCRunnerInterface { public: virtual ~CutlassMoeFCRunnerInterface() = default; - virtual size_t getWorkspaceSize(int const num_rows, int const hidden_size, int const inter_size, + virtual size_t getWorkspaceSize(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts, int const k, ActivationType activation_type, MOEParallelismConfig parallelism_config) const = 0; @@ -164,11 +144,12 @@ class CutlassMoeFCRunnerInterface virtual void runMoe(void const* input_activations, float const* gating_output, void const* fc1_expert_weights, void const* fc1_expert_biases, ActivationType fc1_activation_type, void const* fc2_expert_weights, - void const* fc2_expert_biases, QuantParams quant_params, int const num_rows, int const hidden_size, - int const inter_size, int const num_experts, int const k, char* workspace_ptr, void* final_output, - bool const* finished, int const active_rows, void* expert_scales, int* expanded_source_row_to_expanded_dest_row, - int* expert_for_source_row, MOEParallelismConfig parallelism_config, - MOEExpertScaleNormalizationMode normalization_mode, cudaStream_t stream) + void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, + int64_t const inter_size, int const num_experts, int const k, char* workspace_ptr, void* final_output, + bool const* finished, int64_t const active_rows, void* expert_scales, + int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, + MOEParallelismConfig parallelism_config, MOEExpertScaleNormalizationMode normalization_mode, + cudaStream_t stream) = 0; bool is_profiler = false; @@ -191,8 +172,9 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface static_assert( std::is_same_v || !std::is_same_v, "Does not support float with quantized weights"); - size_t getWorkspaceSize(int const num_rows, int const hidden_size, int const fc1_output_size, int const num_experts, - int const k, ActivationType activation_type, MOEParallelismConfig parallelism_config) const override; + size_t getWorkspaceSize(int64_t const num_rows, int64_t const hidden_size, int64_t const fc1_output_size, + int const num_experts, int const k, ActivationType activation_type, + MOEParallelismConfig parallelism_config) const override; void setTactic(std::optional gemm_config) override { @@ -206,11 +188,12 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface void runMoe(void const* input_activations, float const* gating_output, void const* fc1_expert_weights, void const* fc1_expert_biases, ActivationType fc1_activation_type, void const* fc2_expert_weights, - void const* fc2_expert_biases, QuantParams quant_params, int const num_rows, int const hidden_size, - int const 
inter_size, int const num_experts, int const k, char* workspace_ptr, void* final_output, - bool const* finished, int const active_rows, void* expert_scales, int* expanded_source_row_to_expanded_dest_row, - int* expert_for_source_row, MOEParallelismConfig parallelism_config, - MOEExpertScaleNormalizationMode normalization_mode, cudaStream_t stream) override; + void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, + int64_t const inter_size, int const num_experts, int const k, char* workspace_ptr, void* final_output, + bool const* finished, int64_t const active_rows, void* expert_scales, + int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, + MOEParallelismConfig parallelism_config, MOEExpertScaleNormalizationMode normalization_mode, + cudaStream_t stream) override; private: using HopperGemmOutputType = typename HopperGroupedGemmInput::OutputTypeAdaptor_t; @@ -218,12 +201,13 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface void computeTotalRowsBeforeExpert(int const* sorted_indices, int const total_indices, int const num_experts, int64_t* total_rows_before_expert, cudaStream_t stream); HopperGroupedGemmInput computeStridesHopper(int64_t const* total_rows_before_expert, - HopperGroupedGemmInput layout_info, int gemm_n, int gemm_k, int const num_experts, T const* in, + HopperGroupedGemmInput layout_info, int64_t gemm_n, int64_t gemm_k, int const num_experts, T const* in, WeightType const* weights, float const* fp8_dequant, T const* bias, HopperGemmOutputType* output, cudaStream_t stream); - std::vector getWorkspaceBufferSizes(int const num_rows, int const hidden_size, int const inter_size, - int const num_experts, int const num_experts_per_node, int const k, ActivationType activation_type) const; - void configureWsPtrs(char* ws_ptr, int const num_rows, int const hidden_size, int const inter_size, + std::vector getWorkspaceBufferSizes(int64_t const num_rows, int64_t const hidden_size, + int64_t const inter_size, int const num_experts, int const num_experts_per_node, int const k, + ActivationType activation_type) const; + void configureWsPtrs(char* ws_ptr, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts, int const num_experts_per_node, int const k, ActivationType activation_type); private: diff --git a/cpp/tensorrt_llm/kernels/penaltyTypes.h b/cpp/tensorrt_llm/kernels/penaltyTypes.h index d5a21c528..45532c29b 100644 --- a/cpp/tensorrt_llm/kernels/penaltyTypes.h +++ b/cpp/tensorrt_llm/kernels/penaltyTypes.h @@ -49,8 +49,9 @@ inline std::pair getLimitsPenalty(DecodingPenaltyType penaltyType) case DecodingPenaltyType::Presence: return std::make_pair(fltMin, fltMax); case DecodingPenaltyType::Frequency: return std::make_pair(fltMin, fltMax); case DecodingPenaltyType::MinLength: return std::make_pair(-fltEpsilon, fltMax); - default: TLLM_CHECK_WITH_INFO(false, "Unknown penalty type %d", static_cast(penaltyType)); } + TLLM_CHECK_WITH_INFO(false, "Unknown penalty type %d", static_cast(penaltyType)); + return std::make_pair(fltMin, fltMax); } } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/selectiveScan.cu b/cpp/tensorrt_llm/kernels/selectiveScan.cu index ec74aa035..b8f94f160 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan.cu @@ -84,9 +84,6 @@ __launch_bounds__(256, 1) __global__ void selective_scan_loop_kernel(SSMParamsBa bool dt_softplus = params.delta_softplus; int 
num_channels = params.dim; - // static const int STAGES = 12; - // static const int SEQ_UNROLL = 6; - __shared__ cuda::pipeline_shared_state pipeline_state; auto block = cooperative_groups::this_thread_block(); @@ -97,9 +94,6 @@ __launch_bounds__(256, 1) __global__ void selective_scan_loop_kernel(SSMParamsBa __shared__ input_t sh_x[STAGES][CHANNELS_PER_BLOCK]; __shared__ input_t sh_z[STAGES][CHANNELS_PER_BLOCK]; - __shared__ weight_t sh_D[CHANNELS_PER_BLOCK]; - __shared__ weight_t sh_dt_bias[CHANNELS_PER_BLOCK]; - int const channel = blockIdx.x * blockDim.x + threadIdx.x; int const sample = blockIdx.y; // batch id @@ -127,14 +121,6 @@ __launch_bounds__(256, 1) __global__ void selective_scan_loop_kernel(SSMParamsBa if (threadIdx.y == 1) { - // Data loading warps - - // Bias is independent of token - sh_dt_bias[threadIdx.x] = dt_bias[channel]; - // D is independent of token - if (D) - sh_D[threadIdx.x] = D[channel]; - cuda::pipeline pipeline = cuda::make_pipeline(block, &pipeline_state, cuda::pipeline_role::producer); int stage = 0; @@ -220,10 +206,11 @@ __launch_bounds__(256, 1) __global__ void selective_scan_loop_kernel(SSMParamsBa float A_reg[DSTATE]; for (int i = 0; i < DSTATE; i++) { - // state_reg[i] = toFloat(state[sample*num_channels*DSTATE + i*num_channels + channel]); state_reg[i] = 0.f; A_reg[i] = toFloat(A[i * num_channels + channel]); } + float dt_bias_reg = dt_bias[channel]; + float D_reg = D ? D[channel] : 0.f; cuda::pipeline pipeline = cuda::make_pipeline(block, &pipeline_state, cuda::pipeline_role::consumer); int stage = 0; @@ -236,14 +223,14 @@ __launch_bounds__(256, 1) __global__ void selective_scan_loop_kernel(SSMParamsBa for (int token_id = si * SEQ_UNROLL; token_id < num_tokens && token_id < (si + 1) * SEQ_UNROLL; token_id++) { - float dt_b = toFloat(sh_dt[stage][threadIdx.x]) + toFloat(sh_dt_bias[threadIdx.x]); + float dt_b = toFloat(sh_dt[stage][threadIdx.x]) + dt_bias_reg; float dt_b_sp; if (dt_softplus) { - dt_b_sp = dt_b <= 20.f ? log1pf(__expf(dt_b)) : dt_b; // softplus + dt_b_sp = dt_b <= 20.f ? __logf(1.f + __expf(dt_b)) : dt_b; // softplus } float my_x = toFloat(sh_x[stage][threadIdx.x]); - float Dx = my_x * (D ? toFloat(sh_D[threadIdx.x]) : 0.f); + float Dx = my_x * D_reg; float dtx = dt_b_sp * my_x; float my_z = z ? toFloat(sh_z[stage][threadIdx.x]) : 0.f; @@ -303,7 +290,7 @@ __launch_bounds__(256, 1) __global__ void selective_scan_loop_kernel(SSMParamsBa { float enz = __expf(0.f - my_z); enz += 1.0; - float sig_z = 1.0 / enz; + float sig_z = __fdividef(1.f, enz); float silu_z = my_z * sig_z; out *= silu_z; } @@ -332,16 +319,15 @@ void invokeSelectiveScan(SSMParamsBase& params, cudaStream_t stream) int samples = params.batch; int channels = params.dim; + TLLM_CHECK(params.is_variable_B); + TLLM_CHECK(params.is_variable_C); + TLLM_CHECK(params.dstate == 16); + int const threads = 128; int const blocks = (channels + threads - 1) / threads; dim3 block(threads, 2); dim3 grid(blocks, samples); TLLM_CHECK((channels % block.x) == 0); - - TLLM_CHECK(params.is_variable_B); - TLLM_CHECK(params.is_variable_C); - TLLM_CHECK(params.dstate == 16); - selective_scan_loop_kernel<<>>(params); } @@ -412,15 +398,17 @@ __launch_bounds__(128, 2) __global__ void selective_scan_update_kernel(SSMParams float dt_b_sp; if (dt_softplus) { - dt_b_sp = dt_b <= 20.f ? logf(1.f + expf(dt_b)) : dt_b; // softplus + // dt_b_sp = dt_b <= 20.f ? logf(1.f + expf(dt_b)) : dt_b; // softplus + dt_b_sp = dt_b <= 20.f ? 
__logf(1.f + __expf(dt_b)) : dt_b; // softplus } - float out = 0.f; + float out = D ? my_D * my_x : 0.f; #pragma unroll for (int i = 0; i < DSTATE; i++) { - float dA = expf(rA[i] * dt_b_sp); + // float dA = expf(rA[i] * dt_b_sp); + float dA = __expf(rA[i] * dt_b_sp); float dB = rB[i] * dt_b_sp; float sdA = rState[i] * dA; float dBx = dB * my_x; @@ -429,11 +417,10 @@ __launch_bounds__(128, 2) __global__ void selective_scan_update_kernel(SSMParams out += newState * rC[i]; } - if (D) - out += my_D * my_x; if (z) { - float sig_z = 1.0 / (1.0 + exp(0.f - my_z)); + // float sig_z = 1.0 / (1.0 + exp(0.f - my_z)); + float sig_z = __fdividef(1.f, (1.f + __expf(0.f - my_z))); float silu_z = my_z * sig_z; out *= silu_z; } diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu index 2d0ccee29..353958094 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu @@ -1347,10 +1347,11 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, { case PositionEmbeddingType::kROPE_GPTJ: { - mmha::apply_rotary_embedding( - q, k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, dst_kv_seq_idx); + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, 0, + nullptr, dst_kv_seq_idx); break; } + case PositionEmbeddingType::kLONG_ROPE: case PositionEmbeddingType::kROPE_GPT_NEOX: { bool const do_rotary = !is_masked && vec_size * tidx < rotary_embedding_dim; @@ -1379,7 +1380,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, rotary_embedding_dim, rotary_embedding_base, - rotary_embedding_scale, dst_kv_seq_idx); + rotary_embedding_scale, 0, nullptr, dst_kv_seq_idx); mmha::write_smem_transpose(q, q_smem, transpose_idx, smem_pitch); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); @@ -1469,9 +1470,10 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, T cons // To implement rotary embeddings, each thread processes two QKV elems: dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); dim3 grid(token_num, head_num); - size_t smem_size - = (position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX ? 2 * rotary_embedding_dim * sizeof(T) - : 0); + size_t smem_size = (position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX + || position_embedding_type == PositionEmbeddingType::kLONG_ROPE + ? 
2 * rotary_embedding_dim * sizeof(T) + : 0); // NOTE: add offset for rotary embedding if (qkv_bias != nullptr) { @@ -1858,9 +1860,10 @@ __global__ void shiftKCache(KVCacheBuffer kvCacheBuffer, KVLinearBuffer shiftKCa case PositionEmbeddingType::kROPE_GPTJ: { mmha::apply_rotary_embedding( - k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, token_pos_idx); + k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, 0, nullptr, token_pos_idx); break; } + case PositionEmbeddingType::kLONG_ROPE: case PositionEmbeddingType::kROPE_GPT_NEOX: { bool const do_rotary = vec_size * tidx < rotary_embedding_dim; @@ -1885,7 +1888,7 @@ __global__ void shiftKCache(KVCacheBuffer kvCacheBuffer, KVLinearBuffer shiftKCa { mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); mmha::apply_rotary_embedding(k, transpose_idx / tidx_factor, rotary_embedding_dim, rotary_embedding_base, - rotary_embedding_scale, token_pos_idx); + rotary_embedding_scale, 0, nullptr, token_pos_idx); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); } @@ -1919,8 +1922,10 @@ void invokeShiftKCache(KVCacheBuffer const& kvCacheBuffer, KVLinearBuffer const& int const vec_size = 16u / sizeof(T); dim3 block((sizePerHead / vec_size + 31) / 32 * 32); dim3 grid(token_num_in_k, kv_head_num, batch_beam); - size_t smem_size - = (position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX ? 2 * rotary_embedding_dim * sizeof(T) : 0); + size_t smem_size = (position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX + || position_embedding_type == PositionEmbeddingType::kLONG_ROPE + ? 2 * rotary_embedding_dim * sizeof(T) + : 0); if (cache_type == KvCacheDataType::INT8) { diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h index 8ef4e5a9a..acb8bbfc3 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h @@ -77,7 +77,7 @@ struct QKVPreprocessingParams float const* rotary_embedding_inv_freq; float2 const* rotary_coef_cache_buffer; float const* kvScaleOrigQuant; - int const* medusa_position_offsets; + int const* spec_decoding_position_offsets; // Scalars. int const batch_size; @@ -101,6 +101,8 @@ struct QKVPreprocessingParams bool const enable_paged_kv_fmha; bool const quantized_fp8_output; int const multi_processor_count; + int const rotary_vision_start; + int const rotary_vision_length; // Pre-compute on host. int half_rotary_dim; diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index 36a587185..5ddfafa7d 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -217,7 +217,8 @@ struct Rotary_base_t template inline __device__ void apply_rotary_embedding_gptneox(VecType& q, VecType& q_pair, VecType& k, VecType& k_pair, bool first_half, float2 (&rotary_coef_cache)[VEC_SIZE], float const* rotary_inv_freq_buffer, - int const rotary_dim_idx, int const half_rotary_dim, int const rotary_position) + int const rotary_dim_idx, int const half_rotary_dim, int const rotary_position, int const vision_start = -1, + int const vision_length = -1) { // Each thread holds NUM_ELTS elements. // Currently we apply the rotary embedding in float data type for accuracy reasons. 
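In both add_fusedQKV_bias_transpose_kernel and shiftKCache, the new kLONG_ROPE case falls through to the GPT-NeoX rotation path, and the launchers now reserve 2 * rotary_embedding_dim * sizeof(T) of shared memory for either type. The sketch below is not code from the patch; the helper name and the abridged enum are illustrative. It only restates the condition the launchers repeat, which is also why every later check that used to test kROPE_GPT_NEOX alone now tests both types.

#include <cstddef>

// Abridged, illustrative enum; the real one lives in the attention kernels.
enum class PositionEmbeddingType { kROPE_GPTJ, kROPE_GPT_NEOX, kLONG_ROPE };

// kLONG_ROPE reuses the NeoX-style (paired, shared-memory-transposed) rotation.
constexpr bool usesNeoxStyleRotation(PositionEmbeddingType t)
{
    return t == PositionEmbeddingType::kROPE_GPT_NEOX || t == PositionEmbeddingType::kLONG_ROPE;
}

// NeoX-style rotation stages Q and K through shared memory, hence the
// 2 * rotary_embedding_dim * sizeof(T) requirement; the GPT-J path rotates in
// registers and needs no extra shared memory.
template <typename T>
constexpr std::size_t rotarySmemBytes(PositionEmbeddingType t, int rotaryEmbeddingDim)
{
    return usesNeoxStyleRotation(t) ? 2 * static_cast<std::size_t>(rotaryEmbeddingDim) * sizeof(T) : 0;
}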
@@ -234,8 +235,25 @@ inline __device__ void apply_rotary_embedding_gptneox(VecType& q, VecType& q_pai if (RECOMPUTE) { - float const rotary_inv_freq - = float(rotary_position) * rotary_inv_freq_buffer[min(rotary_dim_idx + elt_id, half_rotary_dim - 1)]; + int real_rotary_position = rotary_position; + if (vision_start != -1 && vision_length != -1) + { + int t_step_int = rotary_position; + if (t_step_int <= vision_start) + { + real_rotary_position = t_step_int; + } + else if (t_step_int > vision_start && t_step_int <= (vision_length + vision_start)) + { + real_rotary_position = vision_start + 1; + } + else + { + real_rotary_position = t_step_int - (vision_length - 1); + } + } + float const rotary_inv_freq = float(real_rotary_position) + * rotary_inv_freq_buffer[min(rotary_dim_idx + elt_id, half_rotary_dim - 1)]; rotary_coef_cache[elt_id] = make_float2(cosf(rotary_inv_freq), sinf(rotary_inv_freq)); } @@ -356,10 +374,10 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(q, q_pair, k, k_pair, first_half, rotary_coef_cache, params.rotary_embedding_inv_freq + batch_idx * params.half_rotary_dim, gptneox_rotary_dim_idx, - params.half_rotary_dim, rotary_position); + params.half_rotary_dim, rotary_position, params.rotary_vision_start, + params.rotary_vision_length); cached_rotary_position = rotary_position; } else @@ -453,7 +472,8 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(q, q_pair, k, k_pair, first_half, rotary_coef_cache, params.rotary_embedding_inv_freq + batch_idx * params.half_rotary_dim, gptneox_rotary_dim_idx, - params.half_rotary_dim, rotary_position); + params.half_rotary_dim, rotary_position, params.rotary_vision_start, + params.rotary_vision_length); } break; } @@ -670,11 +690,11 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams<<>>(params); \ @@ -863,7 +884,8 @@ void kernelDispatchHeadSize(QKVPreprocessingParams params, cud constexpr int VEC_SIZE = Rotary_vec_t::size; // Make sure we have multiple of paired vectors so that the access is aligned. - TLLM_CHECK_WITH_INFO(params.position_embedding_type != PositionEmbeddingType::kROPE_GPT_NEOX + TLLM_CHECK_WITH_INFO((params.position_embedding_type != PositionEmbeddingType::kROPE_GPT_NEOX + && params.position_embedding_type != PositionEmbeddingType::kLONG_ROPE) || params.half_rotary_dim % VEC_SIZE == 0, "Rotary dim size is not supported."); @@ -946,7 +968,8 @@ void kernelV1Dispatch(QKVPreprocessingParams params, cudaStrea #define APPLY_BIAS_ROPE_UPDATE_KV_CACHE_V2(ADD_BIAS, STORE_QKV) \ dim3 block(BLOCK_SIZE); \ dim3 grid(int(divUp(params.max_input_seq_len, tokens_per_cuda_block)), params.batch_size, params.head_num); \ - if (params.position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX) \ + if (params.position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX \ + || params.position_embedding_type == PositionEmbeddingType::kLONG_ROPE) \ { \ applyBiasRopeUpdateKVCacheV2<<>>(params); \ @@ -1010,7 +1033,8 @@ void invokeApplyBiasRopeUpdateKVCacheDispatch(QKVPreprocessingParams 0; // V2 implementation requires multiple of paired 16 bytes for gpt-neox rotation. 
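The new vision_start/vision_length arguments implement the CogVLM-style position scheme that gptAttentionCommon now threads through as rotary_cogvlm_vision_start and rotary_cogvlm_vision_length: every token of the vision segment shares a single rotary position, and the text tokens after it are shifted back so rotary positions stay contiguous. A host-side restatement of the branch in the RECOMPUTE path, with the illustrative name remapRotaryPosition (not the kernel's code), makes the mapping easier to read.

// -1 for either parameter means "no vision segment"; positions are then unchanged.
int remapRotaryPosition(int position, int visionStart, int visionLength)
{
    if (visionStart == -1 || visionLength == -1)
    {
        return position; // plain text sequence
    }
    if (position <= visionStart)
    {
        return position; // text prefix, unchanged
    }
    if (position <= visionStart + visionLength)
    {
        return visionStart + 1; // every vision token collapses to one rotary slot
    }
    return position - (visionLength - 1); // text suffix, shifted back
}

// Example with visionStart = 2 and visionLength = 4 (positions 3..6 are vision):
//   input position : 0 1 2 3 4 5 6 7 8
//   rotary position: 0 1 2 3 3 3 3 4 5

The remapped position then multiplies the inverse-frequency table exactly as before, so only the cos/sin cache recomputation changes.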
- bool const support_rotary_for_v2 = params.position_embedding_type != PositionEmbeddingType::kROPE_GPT_NEOX + bool const support_rotary_for_v2 = (params.position_embedding_type != PositionEmbeddingType::kROPE_GPT_NEOX + && params.position_embedding_type != PositionEmbeddingType::kLONG_ROPE) || params.rotary_embedding_dim % 16 == 0; if (long_seq_rotary_support || !has_rotary_cos_sin_cache || has_sink_tokens || !support_rotary_for_v2) diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp new file mode 100644 index 000000000..51dea6f78 --- /dev/null +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.cpp @@ -0,0 +1,75 @@ + +#include + +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" + +namespace tensorrt_llm::layers +{ + +using namespace tensorrt_llm::runtime; +using TensorPtr = ITensor::SharedPtr; + +void printTokens2d(char const* name, TensorPtr const& tensor) +{ + auto M = tensor->getShape().d[0]; + auto N = tensor->getShape().d[1]; + auto tr = BufferRange(*tensor); + std::ostringstream buf; + buf << name << ": " << tensor->getShape() << "(\n"; + for (SizeType mi = 0; mi < M; mi++) + { + for (SizeType ni = 0; ni < N; ni++) + { + auto token = tr[mi * N + ni]; + if (token >= 0 && token <= 255) + { + buf << "'" << static_cast(token) << "'"; + } + else + { + buf << token << "'"; + } + buf << (ni == (N - 1) ? ';' : ','); + } + if (mi != M - 1) + { + buf << std::endl; + } + } + buf << ")" << std::endl; + TLLM_LOG_DEBUG(buf.str()); +} + +void printTokens(char const* name, TensorPtr const& tensor) +{ + std::ostringstream buf; + buf << name << ": " << tensor->getShape() << "("; + for (auto const& token : BufferRange(*tensor)) + { + if (token >= 0 && token <= 255) + { + buf << "'" << static_cast(token) << "',"; + } + else + { + buf << token << ","; + } + } + buf << ")" << std::endl << std::flush; + TLLM_LOG_DEBUG(buf.str()); +} + +void printTensor(char const* name, TensorPtr const& tensor) +{ + std::ostringstream buf; + buf << name << ": " << tensor->getShape() << "("; + for (auto const& token : BufferRange(*tensor)) + { + buf << token << ","; + } + buf << ")" << std::endl << std::flush; + TLLM_LOG_DEBUG(buf.str()); +} + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h new file mode 100644 index 000000000..00a609194 --- /dev/null +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h @@ -0,0 +1,17 @@ +#pragma once + +#include "tensorrt_llm/runtime/iTensor.h" + +namespace tensorrt_llm::layers +{ + +void printTokens(char const* name, runtime::ITensor::SharedPtr const& tensor); +#define PRINT_TOKENS(x) tensorrt_llm::layers::printTokens(#x, x) + +void printTokens2d(char const* name, runtime::ITensor::SharedPtr const& tensor); +#define PRINT_TOKENS2D(x) tensorrt_llm::layers::printTokens2d(#x, x) + +void printTensor(char const* name, runtime::ITensor::SharedPtr const& tensor); +#define PRINT_TENSOR(x) tensorrt_llm::layers::printTensor(#x, x) + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp b/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp new file mode 100644 index 000000000..ce63245d9 --- /dev/null +++ b/cpp/tensorrt_llm/layers/lookaheadPoolManager.cpp @@ -0,0 +1,83 @@ + + +#include "tensorrt_llm/layers/lookaheadPoolManager.h" + +namespace tensorrt_llm::layers +{ + +using namespace tensorrt_llm::runtime; + +void 
LookaheadPoolManager::insertOne(Key key, TensorPtr ngram) +{ + auto search = mTokenMap.find(key); + if (search != mTokenMap.end()) + { + search->second.remove_if( + [&ngram](TensorPtr const& item) + { + auto ar = BufferRange(*ngram); + auto br = BufferRange(*item); + return std::equal(ar.begin(), ar.end(), br.begin()); + }); + if (mGuessSetSize >= 0 && search->second.size() >= mGuessSetSize) + { + search->second.pop_front(); + } + search->second.push_back(ngram); + } + else + { + mTokenMap.insert({key, std::list({ngram})}); + } +} + +void LookaheadPoolManager::fillWithPrompt(TensorPtr prompt, SizeType level) +{ + SizeType length = prompt->getShape().d[0]; + auto pr = BufferRange(*prompt); + for (SizeType ti = 0; ti + level - 1 < length; ti++) + { + auto key = pr[ti]; + TensorPtr ngram + = mBufferManager->copyFrom(*ITensor::slice(prompt, ti + 1, level - 1), runtime::MemoryType::kCPU); + insertOne(key, ngram); + } +} + +std::list LookaheadPoolManager::guess(Key lastToken, SizeType guessSize) const +{ + auto search = mTokenMap.find(lastToken); + if (search != mTokenMap.end()) + { + auto ngrams = search->second; + if (ngrams.size() > guessSize) + { + auto it = std::prev(ngrams.end(), guessSize); + return std::list(it, ngrams.end()); + } + else + { + return ngrams; + } + } + else + { + return std::list(); + } +} + +void LookaheadPoolManager::update(TensorPtr keyTokens, TensorPtr ngramTokens) +{ + TLLM_CHECK(keyTokens->getShape().d[0] == ngramTokens->getShape().d[0]); + auto kr = BufferRange(*keyTokens); + auto window = ngramTokens->getShape().d[0]; + + for (SizeType wi = 0; wi < window; wi++) + { + TensorPtr ngram = mBufferManager->copyFrom(*ITensor::slice(ngramTokens, wi, 1), runtime::MemoryType::kCPU); + ngram->squeeze(0); + insertOne(kr[wi], ngram); + } +} + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadPoolManager.h b/cpp/tensorrt_llm/layers/lookaheadPoolManager.h new file mode 100644 index 000000000..10e21f2ef --- /dev/null +++ b/cpp/tensorrt_llm/layers/lookaheadPoolManager.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include + +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/iTensor.h" + +namespace tensorrt_llm::layers +{ + +//! @brief A helper class for managing key-ngram pool. +class LookaheadPoolManager +{ +public: + using TensorPtr = runtime::ITensor::SharedPtr; + using Key = runtime::TokenIdType; + + LookaheadPoolManager(runtime::SizeType g, std::shared_ptr bufferManager) + : mGuessSetSize(g) + , mBufferManager(bufferManager) + { + } + + //! @brief fill token map from prompt + //! @param prompt the user input prompt, [length] on cpu + //! @param level the n-gram length + void fillWithPrompt(TensorPtr prompt, runtime::SizeType level); + + //! @brief get a list of guess tokens + //! @param lastToken the newest golden token + //! @param guessSize at most guessSize candidates returned + //! @return the list guess tokens, with list size <= guessSize + std::list guess(Key lastToken, runtime::SizeType guessSize) const; + + //! @brief update token map with new generated tokens + //! @param keyTokens the new shifted out tokens from each window, as the key, [window] on cpu + //! @param ngramTokens the new shifted lookahead window, as the ngrams, [window, ngramLen] on cpu + void update(TensorPtr keyTokens, TensorPtr ngramTokens); + + std::unordered_map> const& getMap() const + { + return mTokenMap; + } + +private: + void insertOne(Key key, TensorPtr ngram); + +private: + std::shared_ptr mBufferManager; + //! 
@brief the token map with token as key and list of n-gram as value + std::unordered_map> mTokenMap; + //! @brief guess set size, -1 for infinite size + runtime::SizeType mGuessSetSize; +}; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/plugins/CMakeLists.txt b/cpp/tensorrt_llm/plugins/CMakeLists.txt index 29c504cfd..c2797c1da 100755 --- a/cpp/tensorrt_llm/plugins/CMakeLists.txt +++ b/cpp/tensorrt_llm/plugins/CMakeLists.txt @@ -47,7 +47,8 @@ set(PLUGIN_LISTS mixtureOfExperts selectiveScanPlugin mambaConv1dPlugin - lruPlugin) + lruPlugin + cumsumLastDimPlugin) foreach(PLUGIN_ITER ${PLUGIN_LISTS}) include_directories(${PLUGIN_ITER}) diff --git a/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp index 71f0463aa..aa7371b58 100644 --- a/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp @@ -36,6 +36,7 @@ #include "tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.h" #include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" #endif // ENABLE_MULTI_DEVICE +#include "tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.h" #include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" #include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" #include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" @@ -193,6 +194,7 @@ extern "C" static tensorrt_llm::plugins::SelectiveScanPluginCreator selectiveScanPluginCreator; static tensorrt_llm::plugins::MambaConv1dPluginCreator mambaConv1DPluginCreator; static tensorrt_llm::plugins::lruPluginCreator lruPluginCreator; + static tensorrt_llm::plugins::CumsumLastDimPluginCreator cumsumLastDimPluginCreator; static std::array pluginCreators = { creatorPtr(identityPluginCreator), @@ -219,6 +221,7 @@ extern "C" creatorPtr(selectiveScanPluginCreator), creatorPtr(mambaConv1DPluginCreator), creatorPtr(lruPluginCreator), + creatorPtr(cumsumLastDimPluginCreator), }; nbCreators = pluginCreators.size(); return pluginCreators.data(); diff --git a/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h index df53dac3b..a959647fa 100644 --- a/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h +++ b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h @@ -16,30 +16,32 @@ */ #pragma once +#include "pluginUtils.h" +#include "tensorrt_llm/common/logger.h" + +#include + #include #include #include #include +#include #include #include #include #include -#include - -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/plugins/common/plugin.h" - namespace tensorrt_llm::plugins { struct GemmDims { - int32_t minM; - int32_t maxM; - int32_t n; - int32_t k; + using DimType = utils::DimType; + + DimType minM; + DimType maxM; + DimType n; + DimType k; GemmDims() : minM(-1) @@ -49,7 +51,7 @@ struct GemmDims { } - GemmDims(int32_t minM_, int32_t maxM_, int32_t n_, int32_t k_) + GemmDims(DimType minM_, DimType maxM_, DimType n_, DimType k_) : minM(minM_) , maxM(maxM_) , n(n_) @@ -57,7 +59,7 @@ struct GemmDims { } - bool isInitialized() const + [[nodiscard]] bool isInitialized() const { return minM >= 0 && maxM >= 0 && n >= 0 && k >= 0; } diff --git a/cpp/tensorrt_llm/plugins/common/plugin.h b/cpp/tensorrt_llm/plugins/common/plugin.h index 33a24d5f1..d45a084ec 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.h +++ b/cpp/tensorrt_llm/plugins/common/plugin.h @@ -22,21 +22,18 @@ #include 
"tensorrt_llm/plugins/common/checkMacrosPlugin.h" #include - -#include -#include #include #include #include -#include -#include -#include #if ENABLE_MULTI_DEVICE #include #endif // ENABLE_MULTI_DEVICE + +#include +#include +#include #include #include -#include #include #include diff --git a/cpp/tensorrt_llm/plugins/common/pluginUtils.h b/cpp/tensorrt_llm/plugins/common/pluginUtils.h new file mode 100644 index 000000000..c4b680f2e --- /dev/null +++ b/cpp/tensorrt_llm/plugins/common/pluginUtils.h @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace tensorrt_llm::plugins::utils +{ +using DimType = int32_t; + +inline DimType computeMDimension(bool transA, nvinfer1::Dims const& dims) +{ + DimType M{1}; + if (transA) + { + for (int i = dims.nbDims - 1; i > 0; --i) + { + M *= dims.d[i]; + } + } + else + { + for (int i = 0; i < dims.nbDims - 1; ++i) + { + M *= dims.d[i]; + } + } + return M; +} + +inline DimType computeNDimension(bool transB, nvinfer1::Dims const& dims) +{ + DimType N{1}; + if (transB) + { + for (int32_t i = 0; i < dims.nbDims - 1; ++i) + { + N *= dims.d[i]; + } + } + else + { + for (int32_t i = dims.nbDims - 1; i > 0; --i) + { + N *= dims.d[i]; + } + } + return N; +} + +} // namespace tensorrt_llm::plugins::utils diff --git a/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/CMakeLists.txt b/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/CMakeLists.txt new file mode 100644 index 000000000..ea25de075 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/CMakeLists.txt @@ -0,0 +1,22 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# + +file(GLOB SRCS *.cpp) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) +set(PLUGIN_SOURCES + ${PLUGIN_SOURCES} + PARENT_SCOPE) diff --git a/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.cpp b/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.cpp new file mode 100644 index 000000000..0e4fc20ec --- /dev/null +++ b/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.cpp @@ -0,0 +1,276 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cumsumLastDimPlugin.h" +#include "tensorrt_llm/common/assert.h" + +using namespace nvinfer1; +using namespace tensorrt_llm::kernels; +using namespace tensorrt_llm::common; +using tensorrt_llm::plugins::CumsumLastDimPluginCreator; +using tensorrt_llm::plugins::CumsumLastDimPlugin; + +static char const* CUMSUM_LAST_DIM_PLUGIN_VERSION{"1"}; +static char const* CUMSUM_LAST_DIM_PLUGIN_NAME{"CumsumLastDim"}; +PluginFieldCollection CumsumLastDimPluginCreator::mFC{}; +std::vector CumsumLastDimPluginCreator::mPluginAttributes; + +CumsumLastDimPlugin::CumsumLastDimPlugin(int input_length, nvinfer1::DataType type) + : mInputLength(input_length) + , mType(type) +{ + TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), + "Unsupported data type, pre SM 80 GPUs do not support bfloat16"); + TLLM_CHECK_WITH_INFO((mType == DataType::kBF16) || (mType == DataType::kFLOAT) || (mType == DataType::kHALF) + || (mType == DataType::kINT32), + "Only support int, float, half, and bfloat16."); +} + +// Parameterized constructor +CumsumLastDimPlugin::CumsumLastDimPlugin(void const* data, size_t length) +{ + char const *d = reinterpret_cast(data), *a = d; + read(d, mInputLength); + read(d, mType); + TLLM_CHECK(d == a + length); + TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type"); + TLLM_CHECK_WITH_INFO((mType == DataType::kBF16) || (mType == DataType::kFLOAT) || (mType == DataType::kHALF) + || (mType == DataType::kINT32), + "Only support int, float, half, and bfloat16."); +} + +// IPluginV2DynamicExt Methods +nvinfer1::IPluginV2DynamicExt* CumsumLastDimPlugin::clone() const noexcept +{ + auto* plugin = new CumsumLastDimPlugin(mInputLength, mType); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; +} + +// Outputs +// output_tensor: [batch_size, input_length] +nvinfer1::DimsExprs CumsumLastDimPlugin::getOutputDimensions( + int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept +{ + TLLM_CHECK_WITH_INFO(outputIndex == 0, "Only one output."); + return inputs[getInputTensorIdx()]; +} + +bool CumsumLastDimPlugin::supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept +{ + return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); +} + +void CumsumLastDimPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept +{ +} + +size_t CumsumLastDimPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept +{ + if (mType == DataType::kINT32) + { + return invokeComputeCumsumLastDimWorkspaceSize(mInputLength); + } + else if (mType == DataType::kHALF) + { + return 
invokeComputeCumsumLastDimWorkspaceSize(mInputLength); + } + else if (mType == DataType::kFLOAT) + { + return invokeComputeCumsumLastDimWorkspaceSize(mInputLength); + } +#ifdef ENABLE_BF16 + else if (mType == DataType::kBF16) + { + return invokeComputeCumsumLastDimWorkspaceSize<__nv_bfloat16>(mInputLength); + } +#endif + return 0; +} + +template +int CumsumLastDimPlugin::enqueueImpl(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) +{ + // inputs + // 0. input_tensor [batch_size, input_length] + // outputs + // 0. output_tensor [batch_size, input_length] + auto const batch_size = inputDesc[getInputTensorIdx()].dims.d[0]; + size_t temp_storage_bytes = invokeComputeCumsumLastDimWorkspaceSize(mInputLength); + invokeCumsumLastDim( + batch_size, mInputLength, inputs[getInputTensorIdx()], outputs[0], workspace, temp_storage_bytes, stream); + + return 0; +} + +int CumsumLastDimPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + if (mType == DataType::kINT32) + { + return enqueueImpl(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } + else if (mType == DataType::kHALF) + { + return enqueueImpl(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } + else if (mType == DataType::kFLOAT) + { + return enqueueImpl(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } +#ifdef ENABLE_BF16 + else if (mType == DataType::kBF16) + { + return enqueueImpl<__nv_bfloat16>(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } +#endif + return 0; +} + +// IPluginV2Ext Methods +nvinfer1::DataType CumsumLastDimPlugin::getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept +{ + TLLM_CHECK_WITH_INFO(index == 0, "Only one output."); + return inputTypes[getInputTensorIdx()]; +} + +// IPluginV2 Methods + +char const* CumsumLastDimPlugin::getPluginType() const noexcept +{ + return CUMSUM_LAST_DIM_PLUGIN_NAME; +} + +char const* CumsumLastDimPlugin::getPluginVersion() const noexcept +{ + return CUMSUM_LAST_DIM_PLUGIN_VERSION; +} + +int CumsumLastDimPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +int CumsumLastDimPlugin::initialize() noexcept +{ + return 0; +} + +void CumsumLastDimPlugin::terminate() noexcept {} + +size_t CumsumLastDimPlugin::getSerializationSize() const noexcept +{ + return sizeof(mInputLength) + sizeof(mType); +} + +void CumsumLastDimPlugin::serialize(void* buffer) const noexcept +{ + char *d = static_cast(buffer), *a = d; + write(d, mInputLength); + write(d, mType); + assert(d == a + getSerializationSize()); +} + +void CumsumLastDimPlugin::destroy() noexcept +{ + delete this; +} + +/////////////// + +CumsumLastDimPluginCreator::CumsumLastDimPluginCreator() +{ + // Fill PluginFieldCollection with PluginField arguments metadata + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(PluginField("mInputLength", nullptr, PluginFieldType::kINT32, 49152)); + mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +char const* CumsumLastDimPluginCreator::getPluginName() const noexcept +{ + return CUMSUM_LAST_DIM_PLUGIN_NAME; +} + +char const* CumsumLastDimPluginCreator::getPluginVersion() const 
noexcept +{ + return CUMSUM_LAST_DIM_PLUGIN_VERSION; +} + +PluginFieldCollection const* CumsumLastDimPluginCreator::getFieldNames() noexcept +{ + return &mFC; +} + +IPluginV2* CumsumLastDimPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept +{ + PluginField const* fields = fc->fields; + int input_length; + nvinfer1::DataType type; + // Read configurations from each fields + for (int i = 0; i < fc->nbFields; ++i) + { + char const* attrName = fields[i].name; + if (!strcmp(attrName, "input_length")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + input_length = static_cast(*(static_cast(fields[i].data))); + } + else if (!strcmp(attrName, "type_id")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + type = static_cast(*(static_cast(fields[i].data))); + } + } + try + { + auto* obj = new CumsumLastDimPlugin(input_length, type); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} + +IPluginV2* CumsumLastDimPluginCreator::deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept +{ + // This object will be deleted when the network is destroyed, which will + // call CumsumLastDimPlugin::destroy() + try + { + auto* obj = new CumsumLastDimPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} diff --git a/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.h b/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.h new file mode 100644 index 000000000..813168e29 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.h @@ -0,0 +1,98 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TRT_CUMSUM_LAST_DIM_PLUGIN_H +#define TRT_CUMSUM_LAST_DIM_PLUGIN_H + +#include "tensorrt_llm/kernels/cumsumLastDim.h" +#include "tensorrt_llm/plugins/common/plugin.h" +#include + +namespace tensorrt_llm::plugins +{ +class CumsumLastDimPlugin : public BasePlugin +{ +public: + CumsumLastDimPlugin(int mInputLength, nvinfer1::DataType type); + CumsumLastDimPlugin(void const* data, size_t length); + ~CumsumLastDimPlugin() override = default; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + bool supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept override; + void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept override; + size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept override; + int enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + template + int enqueueImpl(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream); + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept override; + + // IPluginV2 Methods + char const* getPluginType() const noexcept override; + char const* getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void* buffer) const noexcept override; + void destroy() noexcept override; + +private: + using IndexType = std::int32_t; + + IndexType getInputTensorIdx() const + { + return 0; + }; + +private: + int mInputLength; + nvinfer1::DataType mType; +}; + +class CumsumLastDimPluginCreator : public BaseCreator +{ +public: + CumsumLastDimPluginCreator(); + char const* getPluginName() const noexcept override; + + char const* getPluginVersion() const noexcept override; + + nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; + + nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; + + nvinfer1::IPluginV2* deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept override; + +private: + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; + +} // namespace tensorrt_llm::plugins + +#endif diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp index f737e9333..0fee4b92d 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp @@ -14,13 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include "gemmPlugin.h" + +#include "gemmPluginProfiler.h" #include "plugin.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include +#include "pluginUtils.h" + +#include + +#include using namespace nvinfer1; using namespace tensorrt_llm::common; +using tensorrt_llm::plugins::GemmDims; using tensorrt_llm::plugins::GemmPluginCreator; using tensorrt_llm::plugins::GemmPlugin; using tensorrt_llm::plugins::CublasLtGemmPluginProfiler; @@ -47,10 +54,13 @@ void getProblemParams(cublasOperation_t& transa, cublasOperation_t& transb, int& } void runGemm(int const M, int const N, int const K, bool const transA, bool const transB, int const padLda, - int const padLdb, const nvinfer1::DataType type, CublasGemmWrapperPtr const& cublasWrapperPtr, void const* act, + int const padLdb, nvinfer1::DataType const type, CublasGemmWrapperPtr const& cublasWrapperPtr, void const* act, void const* weight, void* output, std::optional const& heuristic, void* workspace, cudaStream_t stream) { + if (M == 0 || N == 0 || K == 0) + return; + cublasWrapperPtr->setStream(stream); cublasWrapperPtr->setWorkspace(workspace); @@ -291,60 +301,19 @@ bool GemmPlugin::supportsFormatCombination( return desc.type == mType || desc.type == nvinfer1::DataType::kFLOAT; } -int32_t computeMDimension(bool transA, const int32_t nbDims, tensorrt_llm::runtime::ITensor::DimType const* dims) -{ - int32_t M = 1; - if (transA) - { - for (int i = nbDims - 1; i > 0; --i) - { - M *= dims[i]; - } - } - else - { - for (int i = 0; i < nbDims - 1; ++i) - { - M *= dims[i]; - } - } - return M; -} - -int32_t computeNDimension(bool transB, const int32_t nbDims, tensorrt_llm::runtime::ITensor::DimType const* dims) -{ - int32_t N = 1; - if (transB) - { - for (int i = 0; i < nbDims - 1; ++i) - { - N *= dims[i]; - } - } - else - { - for (int i = nbDims - 1; i > 0; --i) - { - N *= dims[i]; - } - } - return N; -} - void GemmPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept { - int const nbDimsA = in[0].max.nbDims; - int const nbDimsB = in[1].max.nbDims; + auto const nbDimsA = in[0].max.nbDims; - auto const minM = computeMDimension(mTransA, nbDimsA, in[0].min.d); - auto const maxM = computeMDimension(mTransA, nbDimsA, in[0].max.d); - auto const N = computeNDimension(mTransB, nbDimsB, in[1].max.d); - auto const K = mTransA ? in[0].max.d[0] : in[0].max.d[nbDimsA - 1]; + auto const minM = utils::computeMDimension(mTransA, in[0].min); + auto const maxM = utils::computeMDimension(mTransA, in[0].max); + auto const N = utils::computeNDimension(mTransB, in[1].max); + auto const K = static_cast(mTransA ? in[0].max.d[0] : in[0].max.d[nbDimsA - 1]); if (!mDims.isInitialized()) { - mDims = {minM, maxM, N, static_cast(K)}; + mDims = {minM, maxM, N, K}; } mGemmId.n = N; mGemmId.k = K; @@ -370,13 +339,13 @@ int GemmPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P setGemmConfig(); int const nbDimsA = inputDesc[0].dims.nbDims; - int const nbDimsB = inputDesc[1].dims.nbDims; int const padM = mTransA ? mPadLda : 0; int const padN = mTransB ? 0 : mPadLdb; int const padK = mTransA ? 0 : mPadLda; - auto const M = computeMDimension(mTransA, nbDimsA, inputDesc[0].dims.d) - padM; - auto const N = computeNDimension(mTransB, nbDimsB, inputDesc[1].dims.d) - padN; - int const K = mTransA ? 
inputDesc[0].dims.d[0] - padK : inputDesc[0].dims.d[nbDimsA - 1] - padK; + auto const M = utils::computeMDimension(mTransA, inputDesc[0].dims) - padM; + auto const N = utils::computeNDimension(mTransB, inputDesc[1].dims) - padN; + int const K = static_cast( + mTransA ? inputDesc[0].dims.d[0] - padK : inputDesc[0].dims.d[nbDimsA - 1] - padK); auto bestTactic = mPluginProfiler->getBestConfig(M, mGemmId); runGemm(M, N, K, mTransA, mTransB, mPadLda, mPadLdb, mType, mCublasWrapper, inputs[0], inputs[1], outputs[0], diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h index c288d5044..a1af84180 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h @@ -16,11 +16,11 @@ */ #ifndef TRT_GEMM_PLUGIN_H #define TRT_GEMM_PLUGIN_H + #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" -#include -#include + #include #include diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp index 75e558769..774379991 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp @@ -68,7 +68,11 @@ struct FusedQKVMaskedAttentionDispatchParams float rotary_embedding_base; RotaryScalingType rotary_embedding_scale_type; float rotary_embedding_scale; + float rotary_embedding_m_scale; + float const* rotary_embedding_scaling_factors; int rotary_embedding_max_positions; + int rotary_cogvlm_vision_start; + int rotary_cogvlm_vision_length; PositionEmbeddingType position_embedding_type; bool position_shift_enabled; int max_attention_window; @@ -179,7 +183,7 @@ bool GPTAttentionPluginCommon::convertMMHAParamsToXQAParams(tensorrt_llm::kernel xqaParams.max_distance = mMaxDistance; xqaParams.multi_block_mode = mMultiBlockMode; // Medusa mode will have multiple query tokens. - xqaParams.multi_query_tokens = mIsMedusaEnabled; + xqaParams.multi_query_tokens = mIsSpecDecodingEnabled; if (mKVCacheQuantMode.hasInt8KvCache()) { @@ -209,7 +213,7 @@ bool GPTAttentionPluginCommon::convertMMHAParamsToXQAParams(tensorrt_llm::kernel xqaParams.workspaces = generationsParams.workspace; xqaParams.batch_size = generationsParams.num_requests; xqaParams.beam_width = generationsParams.beam_width; - // Medusa mode has generation input_length > 1. + // Speculative decoding mode has generation input_length > 1. xqaParams.generation_input_length = generationsParams.input_seq_length; xqaParams.max_attention_window_size = generationsParams.max_attention_window; xqaParams.cyclic_attention_window_size = generationsParams.cyclic_attention_window_size; @@ -222,12 +226,12 @@ bool GPTAttentionPluginCommon::convertMMHAParamsToXQAParams(tensorrt_llm::kernel xqaParams.alibi_slopes = generationsParams.alibi_slopes; if (!forConfigurePlugin) { - // Medusa (need to take new generated ids into consideration). - TLLM_CHECK_WITH_INFO(!mIsMedusaEnabled || generationsParams.medusa_packed_mask != nullptr, - "Medusa mode needs a valid packed_mask input tensor."); + // Speculative decoding (need to take new generated ids into consideration). 
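Note: the local computeMDimension/computeNDimension helpers deleted from gemmPlugin.cpp (and, further below, from loraPlugin.cpp) are replaced by shared utilities in pluginUtils that take an nvinfer1::Dims directly. A sketch consistent with the removed code is shown here; the exact signature in pluginUtils.h is assumed.

// Sketch of the shared GEMM dimension helper (signature assumed; see pluginUtils.h for the real one).
#include <NvInferRuntime.h>
#include <cstdint>

inline std::int32_t computeMDimensionSketch(bool transA, nvinfer1::Dims const& dims)
{
    // M is the product of every dimension except the K axis:
    // with transA the first dimension is K, otherwise the last one is.
    std::int32_t m = 1;
    if (transA)
    {
        for (std::int32_t i = dims.nbDims - 1; i > 0; --i)
        {
            m *= static_cast<std::int32_t>(dims.d[i]);
        }
    }
    else
    {
        for (std::int32_t i = 0; i < dims.nbDims - 1; ++i)
        {
            m *= static_cast<std::int32_t>(dims.d[i]);
        }
    }
    return m;
}

computeNDimension is the mirror image over the B operand, with the roles of the two loops selected by transB instead of transA.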
+ TLLM_CHECK_WITH_INFO(!mIsSpecDecodingEnabled || generationsParams.spec_decoding_packed_mask != nullptr, + "Speculative decoding mode needs a valid packed_mask input tensor."); } - xqaParams.medusa_packed_mask = generationsParams.medusa_packed_mask; - xqaParams.medusa_position_offsets = generationsParams.medusa_position_offsets; + xqaParams.spec_decoding_packed_mask = generationsParams.spec_decoding_packed_mask; + xqaParams.spec_decoding_position_offsets = generationsParams.spec_decoding_position_offsets; return true; } @@ -290,7 +294,11 @@ void fusedQKV_masked_attention_dispatch(Multihead_attention_params(data), *a = d; unsigned int kvCacheQuantMode; read(d, mLayerIdx); read(d, mNumHeads); + read(d, mVisionStart); + read(d, mVisionLength); read(d, mNumKVHeads); read(d, mHeadSize); read(d, mUnidirectional); @@ -488,6 +505,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng read(d, mRotaryEmbeddingBase); read(d, mRotaryEmbeddingScaleType); read(d, mRotaryEmbeddingScale); + read(d, mRotaryEmbeddingMscale); read(d, mRotaryEmbeddingMaxPositions); read(d, mTpSize); read(d, mTpRank); @@ -511,11 +529,16 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng read(d, mPagedContextFMHA); read(d, mFP8ContextFMHA); read(d, mUseKVCache); - read(d, mIsMedusaEnabled); + read(d, mIsSpecDecodingEnabled); read(d, mNbMultiBlockSemaphores); mKVCacheQuantMode = tc::QuantMode(kvCacheQuantMode); + uint32_t decoderXQARunnerResourceSerializedSize; + read(d, decoderXQARunnerResourceSerializedSize); + mDecoderXQARunnerResource = DecoderXQARunner::Resource(d, decoderXQARunnerResourceSerializedSize); + d += decoderXQARunnerResourceSerializedSize; + TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " "caused by using different TensorRT-LLM version to build " @@ -860,7 +883,7 @@ int GPTAttentionPluginCommon::enqueueContext(EnqueueContextParamstemplate dispatch(xqaParams, kv_cache_buffer, stream); return 0; } - else if (mIsMedusaEnabled) + else if (mIsSpecDecodingEnabled) { - TLLM_CHECK_WITH_INFO(false, "No available XQA kernels are found for medusa mode."); + TLLM_CHECK_WITH_INFO(false, "No available XQA kernels are found for speculative decoding mode."); } } @@ -1367,8 +1390,12 @@ int GPTAttentionPluginCommon::enqueueGeneration( dispatch_params.rotary_embedding_base = mRotaryEmbeddingBase; dispatch_params.rotary_embedding_scale_type = mRotaryEmbeddingScaleType; dispatch_params.rotary_embedding_scale = mRotaryEmbeddingScale; + dispatch_params.rotary_embedding_m_scale = mRotaryEmbeddingMscale; + dispatch_params.rotary_embedding_scaling_factors = params.rotary_embedding_scaling_factors; dispatch_params.rotary_embedding_max_positions = mRotaryEmbeddingMaxPositions; dispatch_params.position_shift_enabled = mPosShiftEnabled; + dispatch_params.rotary_cogvlm_vision_start = mVisionStart; + dispatch_params.rotary_cogvlm_vision_length = mVisionLength; dispatch_params.cross_attention = mCrossAttention; dispatch_params.memory_length_per_sample = params.encoder_input_lengths; @@ -1487,7 +1514,7 @@ int GPTAttentionPluginCommon::initialize() noexcept mFMHARunner->setup_flags(mFMHAForceFP32Acc, !mRemovePadding, true, mNumKVHeads); } - bool useXQAKernels = (mEnableXQA || mIsMedusaEnabled) && !mCrossAttention + bool useXQAKernels = (mEnableXQA || mIsSpecDecodingEnabled) && !mCrossAttention && (mType == nvinfer1::DataType::kHALF || mType == nvinfer1::DataType::kBF16); if (useXQAKernels) @@ -1502,22 +1529,22 @@ int GPTAttentionPluginCommon::initialize() noexcept xqa_runner_data_type = DATA_TYPE_BF16; } TLLM_LOG_DEBUG("Enabling XQA kernels for GPTAttention."); - if (mIsMedusaEnabled) + if (mIsSpecDecodingEnabled) { TLLM_CHECK_WITH_INFO(mNumHeads % mNumKVHeads == 0, "mNumHeads should be multiples of mNumKVHeads."); int numQHeadsPerKV = mNumHeads / mNumKVHeads; bool isPowerOfTwo = ((numQHeadsPerKV & (numQHeadsPerKV - 1)) == 0); TLLM_CHECK_WITH_INFO(isPowerOfTwo, - "numQHeadsPerKV should be power of 2 for Medusa, mNumHeads=%d, mNumKVHeads=%d.", mNumHeads, - mNumKVHeads); + "numQHeadsPerKV should be power of 2 for Speculative decoding, mNumHeads=%d, mNumKVHeads=%d.", + mNumHeads, mNumKVHeads); } - mDecoderXQARunner.reset( - new DecoderXQARunner(xqa_runner_data_type, mNumHeads, mNumKVHeads, mHeadSize, mMultiBlockMode)); + mDecoderXQARunner.reset(new DecoderXQARunner( + &mDecoderXQARunnerResource, xqa_runner_data_type, mNumHeads, mNumKVHeads, mHeadSize, mMultiBlockMode)); } - else if (mIsMedusaEnabled) + else if (mIsSpecDecodingEnabled) { - TLLM_CHECK_WITH_INFO(false, "Medusa mode doesn't support the data type or cross attention."); + TLLM_CHECK_WITH_INFO(false, "Speculative decoding mode doesn't support the data type or cross attention."); } if (mNbMultiBlockSemaphores != 0) @@ -1533,18 +1560,20 @@ void GPTAttentionPluginCommon::destroy() noexcept delete this; } -size_t GPTAttentionPluginCommon::getCommonSerializationSize() noexcept +size_t GPTAttentionPluginCommon::getCommonSerializationSize() const noexcept { - return sizeof(mLayerIdx) + sizeof(mNumHeads) + sizeof(mNumKVHeads) + sizeof(mHeadSize) + sizeof(mUnidirectional) - + sizeof(mQScaling) + sizeof(mPositionEmbeddingType) + sizeof(mRotaryEmbeddingDim) - + sizeof(mRotaryEmbeddingBase) + sizeof(mRotaryEmbeddingScaleType) + 
sizeof(mRotaryEmbeddingScale) - + sizeof(mRotaryEmbeddingMaxPositions) + sizeof(mTpSize) + sizeof(mTpRank) + sizeof(mEnableContextFMHA) - + sizeof(mFMHAForceFP32Acc) + sizeof(mMultiBlockMode) + sizeof(mEnableXQA) - + sizeof(unsigned int) // mKVCacheQuantMode + return sizeof(mLayerIdx) + sizeof(mNumHeads) + +sizeof(mVisionStart) + sizeof(mVisionLength) + sizeof(mNumKVHeads) + + sizeof(mHeadSize) + sizeof(mUnidirectional) + sizeof(mQScaling) + sizeof(mPositionEmbeddingType) + + sizeof(mRotaryEmbeddingDim) + sizeof(mRotaryEmbeddingBase) + sizeof(mRotaryEmbeddingScaleType) + + sizeof(mRotaryEmbeddingScale) + sizeof(mRotaryEmbeddingMscale) + sizeof(mRotaryEmbeddingMaxPositions) + + sizeof(mTpSize) + sizeof(mTpRank) + sizeof(mEnableContextFMHA) + sizeof(mFMHAForceFP32Acc) + + sizeof(mMultiBlockMode) + sizeof(mEnableXQA) + sizeof(unsigned int) // mKVCacheQuantMode + sizeof(mRemovePadding) + sizeof(mMaskType) + sizeof(mPagedKVCache) + sizeof(mTokensPerBlock) + sizeof(mType) + sizeof(mMaxContextLength) + sizeof(mQKVBiasEnabled) + sizeof(mCrossAttention) + sizeof(mMaxDistance) + sizeof(mPosShiftEnabled) + sizeof(mDenseContextFMHA) + sizeof(mPagedContextFMHA) + sizeof(mFP8ContextFMHA) - + sizeof(mUseKVCache) + sizeof(mUnfuseQkvGemm) + sizeof(mIsMedusaEnabled) + sizeof(mNbMultiBlockSemaphores); + + sizeof(mUseKVCache) + sizeof(mUnfuseQkvGemm) + sizeof(mIsSpecDecodingEnabled) + + sizeof(mNbMultiBlockSemaphores) + sizeof(uint32_t) // size of mDecoderXQARunnerResource buffer. + + mDecoderXQARunnerResource.getSerializationSize(); } void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept @@ -1552,6 +1581,8 @@ void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept char *d = static_cast(buffer), *a = d; write(d, mLayerIdx); write(d, mNumHeads); + write(d, mVisionStart); + write(d, mVisionLength); write(d, mNumKVHeads); write(d, mHeadSize); write(d, mUnidirectional); @@ -1561,6 +1592,7 @@ void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept write(d, mRotaryEmbeddingBase); write(d, mRotaryEmbeddingScaleType); write(d, mRotaryEmbeddingScale); + write(d, mRotaryEmbeddingMscale); write(d, mRotaryEmbeddingMaxPositions); write(d, mTpSize); write(d, mTpRank); @@ -1584,8 +1616,15 @@ void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept write(d, mPagedContextFMHA); write(d, mFP8ContextFMHA); write(d, mUseKVCache); - write(d, mIsMedusaEnabled); + write(d, mIsSpecDecodingEnabled); write(d, mNbMultiBlockSemaphores); + + // An uint32_t that specifies the size of the serialized buffer, followed by the actual content. 
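In other words, the XQA runner resource is stored as an opaque, length-prefixed blob: a uint32_t size is written first so the deserializing constructor (seen earlier) can rebuild the resource and then advance the cursor past the payload. A minimal sketch of the write side of this pattern, with illustrative helper names, follows.

// Sketch of the length-prefixed blob layout described above (helper names are illustrative).
#include <cstdint>
#include <cstring>

template <typename T>
void writePod(char*& dst, T const& value)
{
    std::memcpy(dst, &value, sizeof(T));
    dst += sizeof(T);
}

inline void writeSizedBlob(char*& dst, void const* blob, std::uint32_t blobSize)
{
    writePod(dst, blobSize);          // size prefix first...
    std::memcpy(dst, blob, blobSize); // ...then the payload itself
    dst += blobSize;
}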
+ uint32_t decoderXQARunnerResourceSerializedSize = mDecoderXQARunnerResource.getSerializationSize(); + write(d, decoderXQARunnerResourceSerializedSize); + mDecoderXQARunnerResource.serialize(d, decoderXQARunnerResourceSerializedSize); + d += decoderXQARunnerResourceSerializedSize; + assert(d == a + getCommonSerializationSize()); } @@ -1630,6 +1669,8 @@ GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() // Fill PluginFieldCollection with PluginField arguments metadata mPluginAttributes.clear(); mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, -1)); + mPluginAttributes.emplace_back(PluginField("vision_start", nullptr, PluginFieldType::kINT32, -1)); + mPluginAttributes.emplace_back(PluginField("vision_length", nullptr, PluginFieldType::kINT32, -1)); mPluginAttributes.emplace_back(PluginField("num_kv_heads", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("head_size", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("unidirectional", nullptr, PluginFieldType::kINT32, 1)); @@ -1639,6 +1680,7 @@ GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() mPluginAttributes.emplace_back(PluginField("rotary_embedding_base", nullptr, PluginFieldType::kFLOAT32, 0)); mPluginAttributes.emplace_back(PluginField("rotary_embedding_scale_type", nullptr, PluginFieldType::kINT8, 0)); mPluginAttributes.emplace_back(PluginField("rotary_embedding_scale", nullptr, PluginFieldType::kFLOAT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_m_scale", nullptr, PluginFieldType::kFLOAT32, 0)); mPluginAttributes.emplace_back(PluginField("rotary_embedding_max_positions", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("tp_size", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("tp_rank", nullptr, PluginFieldType::kINT32, 0)); @@ -1661,7 +1703,7 @@ GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() mPluginAttributes.emplace_back(PluginField("use_paged_context_fmha", nullptr, PluginFieldType::kINT8, 0)); mPluginAttributes.emplace_back(PluginField("use_fp8_context_fmha", nullptr, PluginFieldType::kINT8, 0)); mPluginAttributes.emplace_back(PluginField("use_cache", nullptr, PluginFieldType::kINT32, 0)); - mPluginAttributes.emplace_back(PluginField("is_medusa_enabled", nullptr, PluginFieldType::kINT8, 0)); + mPluginAttributes.emplace_back(PluginField("is_spec_decoding_enabled", nullptr, PluginFieldType::kINT8, 0)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h index 3ad790243..0aa2ee311 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h @@ -37,18 +37,20 @@ class GPTAttentionPluginCommon : public BasePlugin public: GPTAttentionPluginCommon() = delete; - GPTAttentionPluginCommon(int layer_idx, int num_heads, int num_kv_heads, int head_size, int unidirectional, - float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, + GPTAttentionPluginCommon(int layer_idx, int num_heads, int vision_start, int vision_length, int num_kv_heads, + int head_size, int unidirectional, float q_scaling, + tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 
Use 0 for non-RoPE float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, - float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi - bool unfuse_qkv_gemm, // for AutoPP + float rotary_embedding_scale, float rotary_embedding_m_scale, int rotary_embedding_max_positions, int tp_size, + int tp_rank, // for ALiBi + bool unfuse_qkv_gemm, // for AutoPP tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, bool enable_xqa, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, int tokens_per_block, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled, bool cross_attention = false, int max_distance = 0, bool pos_shift_enabled = false, bool dense_context_fmha = false, bool use_paged_context_fmha = false, bool use_fp8_context_fmha = false, - bool use_cache = true, bool is_medusa_enabled = false); + bool use_cache = true, bool is_spec_decoding_enabled = false); GPTAttentionPluginCommon(void const* data, size_t length); @@ -73,7 +75,7 @@ class GPTAttentionPluginCommon : public BasePlugin //! So plugin should put the resource release inside destroy. void destroy() noexcept override; - static size_t getCommonSerializationSize() noexcept; + size_t getCommonSerializationSize() const noexcept; void serializeCommon(void* buffer) const noexcept; int const getHeadSize(bool checkInit = true) const; @@ -144,6 +146,7 @@ class GPTAttentionPluginCommon : public BasePlugin float const* kv_scale_orig_quant; float const* kv_scale_quant_orig; float const* attention_output_orig_quant; + float const* rotary_embedding_scaling_factors; T const* alibi_slopes; T* context_buf; void* key_value_cache; @@ -168,10 +171,10 @@ class GPTAttentionPluginCommon : public BasePlugin // optional when cross attention int32_t const* encoder_input_lengths = nullptr; int32_t const* host_context_lengths = nullptr; - // optional when medusa is used. - bool const* medusa_mask = nullptr; - int32_t const* medusa_packed_mask = nullptr; - int32_t const* medusa_position_offsets = nullptr; + // optional when speculative decoding is used. 
+ bool const* spec_decoding_mask = nullptr; + int32_t const* spec_decoding_packed_mask = nullptr; + int32_t const* spec_decoding_position_offsets = nullptr; }; template @@ -204,7 +207,13 @@ class GPTAttentionPluginCommon : public BasePlugin bool isRoPE() const { return mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPTJ - || mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPT_NEOX; + || mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPT_NEOX + || mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kLONG_ROPE; + } + + bool isLongRoPE() const + { + return mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kLONG_ROPE; } bool isCrossAttention() const @@ -228,6 +237,8 @@ class GPTAttentionPluginCommon : public BasePlugin int mLayerIdx; int mNumHeads; + int mVisionStart; + int mVisionLength; int mNumKVHeads; int mHeadSize; int mUnidirectional; @@ -236,6 +247,7 @@ class GPTAttentionPluginCommon : public BasePlugin float mRotaryEmbeddingBase; tensorrt_llm::kernels::RotaryScalingType mRotaryEmbeddingScaleType; float mRotaryEmbeddingScale; + float mRotaryEmbeddingMscale; int mRotaryEmbeddingMaxPositions; tensorrt_llm::kernels::PositionEmbeddingType mPositionEmbeddingType; bool mRemovePadding = false; @@ -256,11 +268,11 @@ class GPTAttentionPluginCommon : public BasePlugin bool mPagedContextFMHA = false; bool mFP8ContextFMHA = false; bool mDenseContextFMHA = false; - bool mIsMedusaEnabled = false; + bool mIsSpecDecodingEnabled = false; - // Medusa packed mask. - uint4* mMedusaPackedMask; - uint4* mMedusaPackedHostMask; + // Speculative decoding packed mask. + uint4* mSpecDecodingPackedMask; + uint4* mSpecDecodingPackedHostMask; // fmha runner (disable by default) // flag: disabled = 0, enabled = 1, enabled with fp32 accumulation = 2 @@ -270,7 +282,9 @@ class GPTAttentionPluginCommon : public BasePlugin int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount(); int mMaxSharedMemoryPerBlockOptin = tensorrt_llm::common::getMaxSharedMemoryPerBlockOptin(); // The default copy constructor will leave it as nullptr. clone() shall initialize it. + std::shared_ptr mDriver; UniqPtrWNullCopy mFMHARunner; + tensorrt_llm::kernels::DecoderXQARunner::Resource mDecoderXQARunnerResource; UniqPtrWNullCopy mDecoderXQARunner; bool mMultiBlockMode; diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp index 183f821e7..e767918e4 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp @@ -38,23 +38,26 @@ using tensorrt_llm::plugins::GPTAttentionPlugin; static char const* GPT_ATTENTION_PLUGIN_VERSION{"1"}; static char const* GPT_ATTENTION_PLUGIN_NAME{"GPTAttention"}; -GPTAttentionPlugin::GPTAttentionPlugin(int layer_idx, int num_heads, int num_kv_heads, int head_size, - int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, +GPTAttentionPlugin::GPTAttentionPlugin(int layer_idx, int num_heads, int vision_start, int vision_length, + int num_kv_heads, int head_size, int unidirectional, float q_scaling, + tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 
0 for non-RoPE float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, - float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi - bool unfuse_qkv_gemm, // for AutoPP + float rotary_embedding_scale, float rotary_embedding_m_scale, int rotary_embedding_max_positions, int tp_size, + int tp_rank, // for ALiBi + bool unfuse_qkv_gemm, // for AutoPP tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, bool enable_xqa, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, int tokens_per_block, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled, bool cross_attention, int max_distance, bool pos_shift_enabled, bool dense_context_fmha, - bool use_paged_context_fmha, bool use_fp8_context_fmha, bool use_cache, bool is_medusa_enabled) - : GPTAttentionPluginCommon(layer_idx, num_heads, num_kv_heads, head_size, unidirectional, q_scaling, - position_embedding_type, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale_type, - rotary_embedding_scale, rotary_embedding_max_positions, tp_size, tp_rank, unfuse_qkv_gemm, context_fmha_type, - multi_block_mode, enable_xqa, kv_cache_quant_mode, remove_input_padding, mask_type, paged_kv_cache, - tokens_per_block, type, max_context_length, qkv_bias_enabled, cross_attention, max_distance, pos_shift_enabled, - dense_context_fmha, use_paged_context_fmha, use_fp8_context_fmha, use_cache, is_medusa_enabled) + bool use_paged_context_fmha, bool use_fp8_context_fmha, bool use_cache, bool is_spec_decoding_enabled) + : GPTAttentionPluginCommon(layer_idx, num_heads, vision_start, vision_length, num_kv_heads, head_size, + unidirectional, q_scaling, position_embedding_type, rotary_embedding_dim, rotary_embedding_base, + rotary_embedding_scale_type, rotary_embedding_scale, rotary_embedding_m_scale, rotary_embedding_max_positions, + tp_size, tp_rank, unfuse_qkv_gemm, context_fmha_type, multi_block_mode, enable_xqa, kv_cache_quant_mode, + remove_input_padding, mask_type, paged_kv_cache, tokens_per_block, type, max_context_length, qkv_bias_enabled, + cross_attention, max_distance, pos_shift_enabled, dense_context_fmha, use_paged_context_fmha, + use_fp8_context_fmha, use_cache, is_spec_decoding_enabled) { initEntryIdx(); } @@ -87,6 +90,7 @@ bool GPTAttentionPlugin::isEntryUsed(IdxEntry const& entry) const case IdxEntry::KV_CACHE_DEQUANTIZATION_SCALE: return useKVCache() && mKVCacheQuantMode.hasKvCacheQuant(); case IdxEntry::ATTENTION_OUTPUT_QUANTIZATION_SCALE: return mFP8ContextFMHA && mKVCacheQuantMode.hasFp8Qdq(); case IdxEntry::ROTARY_COS_SIN: return isRoPE(); + case IdxEntry::ROTARY_EMBEDDING_SCALING_FACTORS: return isLongRoPE(); case IdxEntry::ALIBI_SLOPES: return isALiBi(); case IdxEntry::RELATIVE_ATTENTION_BIAS: return isRelativePosition(); case IdxEntry::CROSS_QKV: return isCrossAttention(); @@ -94,8 +98,8 @@ bool GPTAttentionPlugin::isEntryUsed(IdxEntry const& entry) const case IdxEntry::ENCODER_INPUT_LENGTH: return isCrossAttention(); case IdxEntry::HOST_CONTEXT_LENGTH: return mRemovePadding; case IdxEntry::QKV_BIAS_TENSOR: return mQKVBiasEnabled; - case IdxEntry::MEDUSA_PACKED_MASK: return mIsMedusaEnabled; - case IdxEntry::MEDUSA_POSITION_OFFSETS: return mIsMedusaEnabled; + case IdxEntry::SPEC_DECODING_PACKED_MASK: return mIsSpecDecodingEnabled; + case IdxEntry::SPEC_DECODING_POSITION_OFFSETS: return mIsSpecDecodingEnabled; default: return false; } 
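Note: several of the inputs gated above are optional (rotary scaling factors only for long RoPE, packed mask and position offsets only for speculative decoding), so the plugin maps IdxEntry values onto dense tensor indices and skips unused entries. The sketch below illustrates one way such a mapping can be built; it is an assumed mechanism for illustration, not the exact initEntryIdx implementation.

// Assumed mechanism: assign consecutive input slots only to entries that are actually used.
#include <cstddef>
#include <functional>
#include <vector>

std::vector<std::size_t> buildEntryIndexMap(std::size_t enumSize, std::function<bool(std::size_t)> const& isUsed)
{
    std::vector<std::size_t> idx(enumSize, 0);
    std::size_t next = 0;
    for (std::size_t e = 0; e < enumSize; ++e)
    {
        if (isUsed(e))
        {
            idx[e] = next++; // used entries occupy consecutive tensor positions
        }
    }
    return idx;
}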
} @@ -129,7 +133,7 @@ static int getPackedTensorHiddenDimIndex(bool removePadding) return removePadding ? 1 : 2; } -// NOTE: generation input length might be larger than one in the Medusa mode. +// NOTE: generation input length might be larger than one in the spec decoding mode. int GPTAttentionPlugin::getGenerationInputSequenceLength( nvinfer1::PluginTensorDesc const* inputDesc, int32_t localNbSeq, int32_t localNbTokens) const { @@ -172,8 +176,9 @@ bool GPTAttentionPlugin::supportsFormatCombination( { if (pos == getIdx(IdxEntry::CONTEXT_LENGTHS) || pos == getIdx(IdxEntry::REQUEST_TYPES) || pos == getIdx(IdxEntry::HOST_MAX_ATTENTION_WINDOW) || pos == getIdx(IdxEntry::HOST_SINK_TOKEN_LENGTH) - || (isEntryUsed(IdxEntry::MEDUSA_PACKED_MASK) && pos == getIdx(IdxEntry::MEDUSA_PACKED_MASK)) - || (isEntryUsed(IdxEntry::MEDUSA_POSITION_OFFSETS) && pos == getIdx(IdxEntry::MEDUSA_POSITION_OFFSETS))) + || (isEntryUsed(IdxEntry::SPEC_DECODING_PACKED_MASK) && pos == getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)) + || (isEntryUsed(IdxEntry::SPEC_DECODING_POSITION_OFFSETS) + && pos == getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS))) { return inOut[pos].type == nvinfer1::DataType::kINT32; } @@ -187,6 +192,10 @@ bool GPTAttentionPlugin::supportsFormatCombination( { return inOut[pos].type == nvinfer1::DataType::kFLOAT; } + else if (isLongRoPE() && (pos == getIdx(IdxEntry::ROTARY_EMBEDDING_SCALING_FACTORS))) + { + return inOut[pos].type == nvinfer1::DataType::kFLOAT; + } else if (useKVCache() && mKVCacheQuantMode.hasKvCacheQuant() && (pos == getIdx(IdxEntry::KV_CACHE_DEQUANTIZATION_SCALE) || pos == getIdx(IdxEntry::KV_CACHE_QUANTIZATION_SCALE))) @@ -271,6 +280,7 @@ void GPTAttentionPlugin::configurePluginImpl(nvinfer1::DynamicPluginTensorDesc c /*kv_scale_orig_quant=*/nullptr, /*kv_scale_quant_orig=*/nullptr, /*attention_out_orig_quant=*/nullptr, + /*rotary_embedding_scaling_factors*/ nullptr, /*alibi_slopes=*/nullptr, /*context_buf_=*/nullptr, /*key_value_cache=*/nullptr, @@ -453,6 +463,13 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 rotary_cos_sin = reinterpret_cast(inputs[getIdx(IdxEntry::ROTARY_COS_SIN)]); } + float const* rotary_embedding_scaling_factors = nullptr; + if (isLongRoPE()) + { + rotary_embedding_scaling_factors + = reinterpret_cast(inputs[getIdx(IdxEntry::ROTARY_EMBEDDING_SCALING_FACTORS)]); + } + auto const reqTypeInBatchPtr = static_cast(inputs[getIdx(IdxEntry::REQUEST_TYPES)]) + seqIdxBeg; bool const is_context = (reqTypeInBatchPtr[0] == RequestType::kCONTEXT); @@ -599,20 +616,21 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 T const* alibi_slopes = isALiBi() ? static_cast(inputs[getIdx(IdxEntry::ALIBI_SLOPES)]) : nullptr; - int const* medusa_packed_mask = nullptr; - int const* medusa_position_offsets = nullptr; - int num_medusa_tokens = 0; - if (mIsMedusaEnabled) + int const* spec_decoding_packed_mask = nullptr; + int const* spec_decoding_position_offsets = nullptr; + int num_spec_decoding_tokens = 0; + if (mIsSpecDecodingEnabled) { - // Second dimension of medusa_packed_mask is num_medusa_tokens + 1. - // [batch_size, num_medusa_tokens + 1, divUp(num_medusa_tokens + 1, 32)] - num_medusa_tokens = inputDesc[getIdx(IdxEntry::MEDUSA_PACKED_MASK)].dims.d[1] - 1; - if (num_medusa_tokens > 0) + // Second dimension of spec_decoding_packed_mask is num_spec_decoding_tokens + 1. 
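Each row of that packed mask stores the boolean attention mask bit-packed into 32-bit words, which is where the divUp(num_spec_decoding_tokens + 1, 32) last dimension in the shape below comes from. A small illustrative sketch of such packing (not the upstream kernel code) is given here.

// Illustrative sketch: pack one boolean mask row into 32-bit words,
// matching a last dimension of divUp(rowLength, 32).
#include <cstddef>
#include <cstdint>
#include <vector>

inline int divUp(int a, int b)
{
    return (a + b - 1) / b;
}

std::vector<std::int32_t> packMaskRow(std::vector<bool> const& row)
{
    std::vector<std::int32_t> packed(divUp(static_cast<int>(row.size()), 32), 0);
    for (std::size_t i = 0; i < row.size(); ++i)
    {
        if (row[i])
        {
            auto word = static_cast<std::uint32_t>(packed[i / 32]);
            word |= (1u << (i % 32));
            packed[i / 32] = static_cast<std::int32_t>(word);
        }
    }
    return packed;
}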
+ // [batch_size, num_spec_decoding_tokens + 1, divUp(num_spec_decoding_tokens + 1, 32)] + num_spec_decoding_tokens = inputDesc[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)].dims.d[1] - 1; + if (num_spec_decoding_tokens > 0) { - medusa_packed_mask = static_cast(inputs[getIdx(IdxEntry::MEDUSA_PACKED_MASK)]) - + seqIdxBeg * getStride(inputDesc[getIdx(IdxEntry::MEDUSA_PACKED_MASK)].dims, 0); - medusa_position_offsets = static_cast(inputs[getIdx(IdxEntry::MEDUSA_POSITION_OFFSETS)]) - + seqIdxBeg * getStride(inputDesc[getIdx(IdxEntry::MEDUSA_POSITION_OFFSETS)].dims, 0); + spec_decoding_packed_mask = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)]) + + seqIdxBeg * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_PACKED_MASK)].dims, 0); + spec_decoding_position_offsets + = static_cast(inputs[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)]) + + seqIdxBeg * getStride(inputDesc[getIdx(IdxEntry::SPEC_DECODING_POSITION_OFFSETS)].dims, 0); } } @@ -681,18 +699,19 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 int const input_seq_length = getGenerationInputSequenceLength(inputDesc, localNbSeq, localNbTokens); auto qkvDims = inputDesc[getIdx(IdxEntry::QKV_TENSOR)].dims; - TLLM_CHECK_WITH_INFO(input_seq_length == 1 || mIsMedusaEnabled, - "Only Medusa mode supports input length > 1 in the generation phase, input_seq_length=%d, " - "mIsMedusaEnabled=%s, nDims=%d, (" FMT_DIM ", " FMT_DIM ", " FMT_DIM ")", - input_seq_length, mIsMedusaEnabled ? "true" : "false", qkvDims.nbDims, qkvDims.d[0], qkvDims.d[1], + TLLM_CHECK_WITH_INFO(input_seq_length == 1 || mIsSpecDecodingEnabled, + "Only speculative decoding mode supports input length > 1 in the generation phase, input_seq_length=%d, " + "mIsSpecDecodingEnabled=%s, nDims=%d, (" FMT_DIM ", " FMT_DIM ", " FMT_DIM ")", + input_seq_length, mIsSpecDecodingEnabled ? 
"true" : "false", qkvDims.nbDims, qkvDims.d[0], qkvDims.d[1], qkvDims.d[2]); - TLLM_CHECK_WITH_INFO(input_seq_length == num_medusa_tokens + 1, "The generation input length is not expected."); + TLLM_CHECK_WITH_INFO( + input_seq_length == num_spec_decoding_tokens + 1, "The generation input length is not expected."); EnqueueGenerationParams enqueue_params{attention_input, qkv_bias, input_seq_length, sequence_kv_length, max_context_kv_len, beamWidth, context_q_lengths, kv_scale_orig_quant, - kv_scale_quant_orig, attention_output_orig_quant, alibi_slopes, context_buf_, key_value_cache, - block_offsets, host_primary_pool_pointer, host_secondary_pool_pointer, max_attention_window_size, - cyclic_attention_window_size, sink_token_length, num_requests, max_blocks_per_sequence, cache_indir, - workspace, max_context_kv_len_list}; + kv_scale_quant_orig, attention_output_orig_quant, rotary_embedding_scaling_factors, alibi_slopes, + context_buf_, key_value_cache, block_offsets, host_primary_pool_pointer, host_secondary_pool_pointer, + max_attention_window_size, cyclic_attention_window_size, sink_token_length, num_requests, + max_blocks_per_sequence, cache_indir, workspace, max_context_kv_len_list}; enqueue_params.host_context_lengths = host_context_lengths; if (isRelativePosition()) { @@ -706,10 +725,10 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 enqueue_params.encoder_input_lengths = reinterpret_cast(inputs[getIdx(IdxEntry::ENCODER_INPUT_LENGTH)]) + seqIdxBeg; } - if (mIsMedusaEnabled) + if (mIsSpecDecodingEnabled) { - enqueue_params.medusa_packed_mask = medusa_packed_mask; - enqueue_params.medusa_position_offsets = medusa_position_offsets; + enqueue_params.spec_decoding_packed_mask = spec_decoding_packed_mask; + enqueue_params.spec_decoding_position_offsets = spec_decoding_position_offsets; } enqueueGeneration(enqueue_params, stream); @@ -831,13 +850,15 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(char const* name, PluginField try { auto* obj = new GPTAttentionPlugin(p.getScalar("layer_idx").value(), - p.getScalar("num_heads").value(), p.getScalar("num_kv_heads").value(), + p.getScalar("num_heads").value(), p.getScalar("vision_start").value(), + p.getScalar("vision_length").value(), p.getScalar("num_kv_heads").value(), p.getScalar("head_size").value(), p.getScalar("unidirectional").value(), p.getScalar("q_scaling").value(), static_cast(p.getScalar("position_embedding_type").value()), p.getScalar("rotary_embedding_dim").value(), p.getScalar("rotary_embedding_base").value(), static_cast(p.getScalar("rotary_embedding_scale_type").value()), p.getScalar("rotary_embedding_scale").value(), + p.getScalar("rotary_embedding_m_scale").value(), p.getScalar("rotary_embedding_max_positions").value(), static_cast(p.getScalar("tp_size").value()), static_cast(p.getScalar("tp_rank").value()), @@ -860,7 +881,7 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(char const* name, PluginField static_cast(p.getScalar("use_paged_context_fmha").value()), static_cast(p.getScalar("use_fp8_context_fmha").value()), static_cast(p.getScalar("use_cache").value()), - static_cast(p.getScalar("is_medusa_enabled").value())); + static_cast(p.getScalar("is_spec_decoding_enabled").value())); obj->setPluginNamespace(mNamespace.c_str()); return obj; } diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h index 9348c6771..ad8adc1a3 100644 --- 
a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h @@ -71,18 +71,20 @@ namespace tensorrt_llm::plugins class GPTAttentionPlugin : public GPTAttentionPluginCommon { public: - GPTAttentionPlugin(int layer_idx, int num_heads, int num_kv_heads, int head_size, int unidirectional, - float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, + GPTAttentionPlugin(int layer_idx, int num_heads, int vision_start, int vision_length, int num_kv_heads, + int head_size, int unidirectional, float q_scaling, + tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 0 for non-RoPE float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, - float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi - bool unfuse_qkv_gemm, // for AutoPP + float rotary_embedding_scale, float rotary_embedding_m_scale, int rotary_embedding_max_positions, int tp_size, + int tp_rank, // for ALiBi + bool unfuse_qkv_gemm, // for AutoPP tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, bool enable_xqa, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, int tokens_per_block, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled, bool cross_attention = false, int max_distance = 0, bool pos_shift_enabled = false, bool dense_context_fmha = false, bool use_paged_context_fmha = false, bool use_fp8_context_fmha = false, - bool use_cache = true, bool is_medusa_enabled = false); + bool use_cache = true, bool is_spec_decoding_enabled = false); GPTAttentionPlugin(void const* data, size_t length); @@ -168,6 +170,7 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon KV_CACHE_DEQUANTIZATION_SCALE, ATTENTION_OUTPUT_QUANTIZATION_SCALE, ROTARY_COS_SIN, + ROTARY_EMBEDDING_SCALING_FACTORS, ALIBI_SLOPES, RELATIVE_ATTENTION_BIAS, CROSS_QKV, @@ -175,8 +178,8 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon ENCODER_INPUT_LENGTH, HOST_CONTEXT_LENGTH, QKV_BIAS_TENSOR, - MEDUSA_PACKED_MASK, - MEDUSA_POSITION_OFFSETS, + SPEC_DECODING_PACKED_MASK, + SPEC_DECODING_POSITION_OFFSETS, ENUM_SIZE, }; @@ -184,7 +187,7 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon void initEntryIdx(); IndexType getIdx(IdxEntry const& entry) const; - // Get generation input sequence length (might be larger than 1 in the Medusa mode). + // Get generation input sequence length (might be larger than 1 in the speculative decoding mode). int getGenerationInputSequenceLength( nvinfer1::PluginTensorDesc const* inputDesc, int32_t localNbSeq, int32_t localNbTokens) const; }; diff --git a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp index eb3fa73fa..a48432881 100644 --- a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp @@ -14,17 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include "loraPlugin.h" + +#include "pluginUtils.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/groupGemm.h" #include "tensorrt_llm/kernels/splitkGroupGemm.h" #include "tensorrt_llm/runtime/iBuffer.h" -#include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/cublasMMWrapper.h" -#include "tensorrt_llm/common/cublasVersionCheck.h" #include namespace tk = tensorrt_llm::kernels; @@ -244,61 +244,20 @@ bool LoraPlugin::supportsFormatCombination( } } -int32_t _computeMDimension(bool transA, const int32_t nbDims, tensorrt_llm::runtime::ITensor::DimType const* dims) -{ - int32_t M = 1; - if (transA) - { - for (int i = nbDims - 1; i > 0; --i) - { - M *= dims[i]; - } - } - else - { - for (int i = 0; i < nbDims - 1; ++i) - { - M *= dims[i]; - } - } - return M; -} - -int32_t _computeNDimension(bool transB, const int32_t nbDims, tensorrt_llm::runtime::ITensor::DimType const* dims) -{ - int32_t N = 1; - if (transB) - { - for (int i = 0; i < nbDims - 1; ++i) - { - N *= dims[i]; - } - } - else - { - for (int i = nbDims - 1; i > 0; --i) - { - N *= dims[i]; - } - } - return N; -} - void LoraPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept { TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__); int const nbDimsA = in[0].max.nbDims; - int const nbDimsB = in[1].max.nbDims; - auto const minM = _computeMDimension(mTransA, nbDimsA, in[0].min.d); - auto const maxM = _computeMDimension(mTransA, nbDimsA, in[0].max.d); - auto const N = _computeNDimension(mTransB, nbDimsB, in[1].max.d); - auto const K = mTransA ? in[0].max.d[0] : in[0].max.d[nbDimsA - 1]; + auto const minM = utils::computeMDimension(mTransA, in[0].min); + auto const maxM = utils::computeMDimension(mTransA, in[0].max); + auto const N = utils::computeNDimension(mTransB, in[1].max); + auto const K = static_cast(mTransA ? 
in[0].max.d[0] : in[0].max.d[nbDimsA - 1]); if (!mDims.isInitialized()) { - mDims = {minM, maxM, N, static_cast(K)}; + mDims = {minM, maxM, N, K}; } mGemmId.n = N; mGemmId.k = K; diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp index 5495fe6f1..d10f24297 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp @@ -220,7 +220,10 @@ void MixtureOfExpertsPlugin::init() #endif else { - TLLM_THROW("Could not construct the mixture of experts plugin with the requested input combination"); + TLLM_THROW( + "Could not construct the mixture of experts plugin with the requested input combination Activation: %d " + "Weight: %d", + static_cast(mType), static_cast(mWeightType)); } mGemmId = GemmIDMoe{mNumExperts, mK, mExpertHiddenSize, mExpertInterSize, mActivationType, mType, mWeightType, @@ -317,7 +320,7 @@ void MixtureOfExpertsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc c mQuantMode, mParallelismMode}; } -auto MixtureOfExpertsPlugin::setupWorkspace(void* base_ptr, int num_tokens) const -> WorkspaceInfo +auto MixtureOfExpertsPlugin::setupWorkspace(void* base_ptr, int64_t num_tokens) const -> WorkspaceInfo { size_t dtype_size = tensorrt_llm::common::getDTypeSize(mType); @@ -354,12 +357,12 @@ auto MixtureOfExpertsPlugin::setupWorkspace(void* base_ptr, int num_tokens) cons return info; } -int MixtureOfExpertsPlugin::getNumTokens(nvinfer1::PluginTensorDesc const* input_tensors) const +int64_t MixtureOfExpertsPlugin::getNumTokens(nvinfer1::PluginTensorDesc const* input_tensors) const { int ndim = input_tensors[getInputTensorIndex()].dims.nbDims; TLLM_CHECK_WITH_INFO( 3 == ndim || 2 == ndim, "hidden_state dimension should be either 2 [b*s, hidden], or 3 [b, s, hidden]"); - int num_tokens = input_tensors[getInputTensorIndex()].dims.d[0]; + int64_t num_tokens = input_tensors[getInputTensorIndex()].dims.d[0]; if (ndim == 3) { num_tokens *= input_tensors[getInputTensorIndex()].dims.d[1]; @@ -413,8 +416,8 @@ int MixtureOfExpertsPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace_ptr, cudaStream_t stream) noexcept { - int const num_tokens = getNumTokens(inputDesc); - int const num_not_finished = num_tokens; // TODO Take this as an input + int64_t const num_tokens = getNumTokens(inputDesc); + int64_t const num_not_finished = num_tokens; // TODO Take this as an input auto parallelism_config = getParallelismConfig(); auto workspace = setupWorkspace(workspace_ptr, num_tokens); diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h index e87da19bc..3f5fc70d6 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h @@ -35,8 +35,8 @@ struct GemmIDMoe { int num_experts{}; int moe_k{}; - int hidden{}; - int inter{}; + int64_t hidden{}; + int64_t inter{}; tensorrt_llm::ActivationType actfn{}; nvinfer1::DataType dtype{}; nvinfer1::DataType wdtype{}; @@ -136,8 +136,8 @@ class MixtureOfExpertsPlugin : public nvinfer1::IPluginV2DynamicExt std::unique_ptr mMOERunner{}; int mNumExperts{}; int mK{}; - int mExpertHiddenSize{}; - int mExpertInterSize{}; + int64_t mExpertHiddenSize{}; + int64_t mExpertInterSize{}; 
tensorrt_llm::ActivationType mActivationType; nvinfer1::DataType mType{}; nvinfer1::DataType mWeightType{}; @@ -170,8 +170,8 @@ class MixtureOfExpertsPlugin : public nvinfer1::IPluginV2DynamicExt size_t size{}; }; - int getNumTokens(nvinfer1::PluginTensorDesc const* input_tensor) const; - WorkspaceInfo setupWorkspace(void* base_ptr, int num_tokens) const; + int64_t getNumTokens(nvinfer1::PluginTensorDesc const* input_tensor) const; + WorkspaceInfo setupWorkspace(void* base_ptr, int64_t num_tokens) const; kernels::MOEParallelismConfig getParallelismConfig() const; kernels::QuantParams getQuantParams( diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index 4c19c87ec..78d491c1d 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -19,6 +19,7 @@ set(SRCS utils/sessionUtils.cpp utils/debugUtils.cu bufferManager.cpp + layerProfiler.cpp loraManager.cpp loraUtils.cpp loraModule.cpp diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp index bc5773d1f..dc9c61ff8 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp @@ -355,10 +355,15 @@ void GptDecoderBatch::newRequest( beamWidth, maxBeamWidth)); auto const& requestIds = request.ids; auto const inputLength = request.inputLen; - auto const maxNewTokens = request.maxNewTokens.value_or(mMaxSequenceLength - inputLength); - TLLM_CHECK_WITH_INFO(inputLength + maxNewTokens <= mMaxSequenceLength, - tc::fmtstr("Input length (%d) + max new tokens (%d) must be less than max sequence length (%d).", inputLength, - maxNewTokens, mMaxSequenceLength)); + auto const generatedTokensPerEngineStep = request.generatedTokensPerEngineStep; + auto const draftTokensPerEngineStep = generatedTokensPerEngineStep - 1; + auto const maxNewTokens + = request.maxNewTokens.value_or(mMaxSequenceLength - inputLength - draftTokensPerEngineStep); + + TLLM_CHECK_WITH_INFO(inputLength + maxNewTokens + draftTokensPerEngineStep <= mMaxSequenceLength, + tc::fmtstr( + "Input length (%d) + max new tokens (%d) + draft tokens (%d) must be less than max sequence length (%d).", + inputLength, maxNewTokens, draftTokensPerEngineStep, mMaxSequenceLength)); TLLM_CHECK(requestIds->getDataType() == TRTDataType::value); auto const endId = request.endId.value_or(-1); @@ -498,7 +503,6 @@ void GptDecoderBatch::newRequest( dOutput->beamHypotheses.init(manager, endId); } - auto generatedTokensPerEngineStep = request.generatedTokensPerEngineStep; // Speculative execution if (generatedTokensPerEngineStep > 1) { diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index 11f8cc2d7..29d00ba2d 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -71,41 +71,41 @@ std::optional parseJsonFieldOptional(Json const& json, std::string_vi return value; } -// Return { number of attention layers, number of recurrent (SSM) layers } -std::tuple getNumLayersByType(SizeType const numLayers, std::vector const& layerTypes) +std::vector buildLayerTypes( + std::size_t const numLayers, std::vector const& layerStringTypes) { - if (layerTypes.empty()) + std::vector result{numLayers, ModelConfig::LayerType::kATTENTION}; + if (layerStringTypes.empty()) { - return {numLayers, 0}; + return result; } - auto constexpr attentionLayerName = "attention"; - auto constexpr ssmLayerName = "recurrent"; - - SizeType numAttentionLayers{0}; - SizeType 
numSsmLayers{0}; + auto constexpr layerNameAttention = "attention"; + auto constexpr layerNameRecurrent = "recurrent"; // The json field specifies a "group" of layers, which gets repeated multiple times // Note that the total number of layers does not need to be a multiple of a layer // group size (i.e. the last group will be incomplete). // For instance, Griffin has groups of 3 layers (2 recurrent + 1 attention) and 26 // layers total (the last group has no attention layer) - auto const groupSize = layerTypes.size(); - TLLM_CHECK(groupSize <= static_cast(numLayers)); - auto const numLayersInLastGroup = numLayers % groupSize; - auto const numFullLayersGroups = numLayers / groupSize; - std::map layerCount = {{attentionLayerName, 0}, {ssmLayerName, 0}}; - for (std::size_t i = 0; i < groupSize; ++i) + auto const groupSize = layerStringTypes.size(); + for (std::size_t i = 0; i < numLayers; ++i) { - layerCount[layerTypes[i]] += (i < numLayersInLastGroup) ? numFullLayersGroups + 1 : numFullLayersGroups; + if (layerStringTypes[i % groupSize] == layerNameAttention) + { + result[i] = ModelConfig::LayerType::kATTENTION; + } + else if (layerStringTypes[i % groupSize] == layerNameRecurrent) + { + result[i] = ModelConfig::LayerType::kRECURRENT; + } + else + { + TLLM_LOG_ERROR("Unknown layer type: %s", layerStringTypes[i % groupSize].c_str()); + } } - numAttentionLayers = layerCount[attentionLayerName]; - numSsmLayers = layerCount[ssmLayerName]; - - TLLM_CHECK(numAttentionLayers + numSsmLayers == numLayers); - - return {numAttentionLayers, numSsmLayers}; + return result; } ModelConfig createModelConfig( @@ -121,9 +121,13 @@ ModelConfig createModelConfig( auto const numLayers = config.at(numLayersField).template get(); auto const numHeads = config.at(numHeadsField).template get() / tensorParallelism; - auto const layerTypes + auto const layerStringTypes = parseJsonFieldOr>(config, "layer_types", std::vector()); - auto const [numAttentionLayers, numSsmLayers] = getNumLayersByType(numLayers, layerTypes); + auto const layerTypes = buildLayerTypes(numLayers, layerStringTypes); + auto const numAttentionLayers + = static_cast(std::count(layerTypes.begin(), layerTypes.end(), ModelConfig::LayerType::kATTENTION)); + auto const numSsmLayers + = static_cast(std::count(layerTypes.begin(), layerTypes.end(), ModelConfig::LayerType::kRECURRENT)); auto const vocabSize = config.at("vocab_size").template get(); auto const hiddenSize = config.at("hidden_size").template get() / tensorParallelism; @@ -139,6 +143,7 @@ ModelConfig createModelConfig( auto modelConfig = ModelConfig{vocabSize, numAttentionLayers, numSsmLayers, numHeads, hiddenSize, dataType}; modelConfig.setSizePerHead(sizePerHead); modelConfig.setNbKvHeads(numKvHeads); + modelConfig.setLayerTypes(layerTypes); if (useCrossAttention.has_value()) { @@ -357,12 +362,22 @@ GptJsonConfig parseJson(InputType&& input) auto const& mambaDState = ssmCfg.at("d_state").template get(); auto const& mambaDConv = ssmCfg.at("d_conv").template get(); auto const& mambaExpand = ssmCfg.at("expand").template get(); - MambaConfig mambaConfig{}; + ModelConfig::MambaConfig mambaConfig{}; mambaConfig.dState = mambaDState; mambaConfig.dConv = mambaDConv; mambaConfig.expand = mambaExpand; modelConfig.setMambaConfig(mambaConfig); } + else if (architecture == std::string("RecurrentGemmaForCausalLM")) + { + modelConfig.setModelVariant(ModelConfig::ModelVariant::kRecurrentGemma); + auto const& dConv = pretrainedConfig.at("conv_kernel").template get(); + auto const& rnnHiddenSize = 
pretrainedConfig.at("rnn_hidden_size").template get(); + ModelConfig::RnnConfig rnnConfig{}; + rnnConfig.dConv = dConv; + rnnConfig.hiddenSize = rnnHiddenSize; + modelConfig.setRnnConfig(rnnConfig); + } } else { @@ -372,7 +387,7 @@ GptJsonConfig parseJson(InputType&& input) auto const& mambaDState = builderConfig.at("mamba_d_state").template get(); auto const& mambaDConv = builderConfig.at("mamba_d_conv").template get(); auto const& mambaExpand = builderConfig.at("mamba_expand").template get(); - MambaConfig mambaConfig{}; + ModelConfig::MambaConfig mambaConfig{}; mambaConfig.dState = mambaDState; mambaConfig.dConv = mambaDConv; mambaConfig.expand = mambaExpand; diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index acd5d91b8..4be45fbe0 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -22,7 +22,6 @@ #include "common.h" #include "iBuffer.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" -#include "tensorrt_llm/common/customAllReduceUtils.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/runtime/gptDecoderBatch.h" #include "tensorrt_llm/runtime/ipcUtils.h" @@ -221,34 +220,13 @@ void GptSession::createCustomAllReduceWorkspace( SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - setPeerAccess(mWorldConfig, true); - - mIpcMemoryHandles.clear(); - std::size_t const bufferSize = mWorldConfig.getTensorParallelism() - * std::min(static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength - * mModelConfig.getHiddenSize() * sizeof(float), - ::tensorrt_llm::utils::customAllReduceUtils::getMaxRequiredWorkspaceSize( - mWorldConfig.getTensorParallelism())); - mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, bufferSize)); - mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, bufferSize)); - mIpcMemoryHandles.emplace_back( - std::make_shared(mWorldConfig, IpcMemory::FLAGS_SIZE * mWorldConfig.getTensorParallelism())); - mIpcMemoryHandles.emplace_back( - std::make_shared(mWorldConfig, IpcMemory::FLAGS_SIZE * mWorldConfig.getTensorParallelism())); - - mCommPtrs = BufferManager::cpu( - ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * mWorldConfig.getTensorParallelism()}), - nvinfer1::DataType::kINT64); - auto* const commPtrsData = bufferCast(*mCommPtrs); - - for (size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) - { - auto const& memCommPtrs = mIpcMemoryHandles[memIdx]->getCommPtrsTensor(); - for (SizeType tpIdx = 0; tpIdx < mWorldConfig.getTensorParallelism(); tpIdx++) - { - commPtrsData[memIdx * mWorldConfig.getTensorParallelism() + tpIdx] = memCommPtrs[tpIdx]; - } - } + + auto& manager = mRuntime->getBufferManager(); + auto const hiddenSize = mModelConfig.getHiddenSize(); + + mAllReduceBuffers = std::make_shared( + maxBatchSize, maxBeamWidth, maxSequenceLength, hiddenSize, manager, mWorldConfig); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -872,6 +850,8 @@ void GptSession::executeContextStep(std::vector const& generati TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto& manager = mRuntime->getBufferManager(); + auto allReduceCommPtrs = mAllReduceBuffers ? 
mAllReduceBuffers->mAllReduceCommPtrs : TensorPtr{}; + auto const numGenerationBatches = static_cast(generationBatchesInputs.size()); auto constexpr step = 0; auto constexpr contextId = 0; @@ -896,13 +876,15 @@ void GptSession::executeContextStep(std::vector const& generati buffers.prepareContextStep(inputIds.at(contextBatchId), generationBatchInputs.padId, manager, kvCacheManager, batchOffset, mModelConfig, mWorldConfig); - buffers.getRuntimeBuffers( - inputBuffer, outputBuffer, step, inputIds.at(contextBatchId), mCommPtrs, mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers(inputBuffer, outputBuffer, step, inputIds.at(contextBatchId), allReduceCommPtrs, + mModelConfig, mWorldConfig); mRuntime->setInputTensors(contextId, inputBuffer); mRuntime->setOutputTensors(contextId, outputBuffer); TLLM_CHECK_WITH_INFO(mRuntime->executeContext(contextId), "Executing TRT engine in context step failed!"); sync_check_cuda_error(); + buffers.clearTensorMaps(); // inputBuffer and outputBuffer are not needed anymore, we explicitly clear them + // to release memory } generationBuffers.postContextStep(contextBuffers, manager, mModelConfig, mWorldConfig); @@ -928,6 +910,10 @@ void GptSession::executeContextStep(std::vector const& generati generationBuffers.logits = newLogitBuffer; } } + if (mRuntime->hasLayerProfiler(contextId)) + { + mRuntime->reportToProfiler(contextId); + } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -939,6 +925,8 @@ SizeType GptSession::executeGenerationStep(SizeType step, std::vectorgetBufferManager(); + auto allReduceCommPtrs = mAllReduceBuffers ? mAllReduceBuffers->mAllReduceCommPtrs : TensorPtr{}; + auto const numMicroBatches = static_cast(microBatchesInputs.size()); SizeType numBatchesFinished{0}; @@ -958,7 +946,8 @@ SizeType GptSession::executeGenerationStep(SizeType step, std::vectorsetInputTensors(contextId, inputBuffer); mRuntime->setOutputTensors(contextId, outputBuffer); @@ -990,6 +979,12 @@ SizeType GptSession::executeGenerationStep(SizeType step, std::vectorhasLayerProfiler(contextId)) + { + mRuntime->reportToProfiler(contextId); + } + if (useCudaGraphs() && mCudaGraphInstances.size() > (size_t) graphId && mCudaGraphInstances.at(graphId).hasInstance()) { @@ -1208,6 +1203,18 @@ void GptSession::finalize(SizeType microBatchId) TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } +void GptSession::setLayerProfiler() +{ + TLLM_CHECK(mRuntime); + mRuntime->setLayerProfiler(); +} + +std::string GptSession::getLayerProfileInfo() const +{ + TLLM_CHECK(mRuntime); + return mRuntime->getLayerProfileInfo(); +} + void GptSession::CudaGraphExecutor::create(cudaGraph_t const& graph) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/runtime/ipcUtils.cpp b/cpp/tensorrt_llm/runtime/ipcUtils.cpp index fee181cc7..8d470662e 100644 --- a/cpp/tensorrt_llm/runtime/ipcUtils.cpp +++ b/cpp/tensorrt_llm/runtime/ipcUtils.cpp @@ -13,15 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. 
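Note: GptSession::setLayerProfiler and getLayerProfileInfo above forward to the runtime, which gains a per-layer profiler in this change (layerProfiler.cpp is added to the runtime sources earlier in this patch). That profiler is presumably built on TensorRT's nvinfer1::IProfiler interface; a minimal sketch of such a profiler, not the actual layerProfiler implementation, looks like this.

// Minimal sketch of a per-layer profiler built on nvinfer1::IProfiler
// (illustrative only; the new layerProfiler is more elaborate).
#include <NvInferRuntime.h>
#include <map>
#include <sstream>
#include <string>

class SimpleLayerProfiler : public nvinfer1::IProfiler
{
public:
    // TensorRT calls this once per layer per enqueue when profiling is enabled.
    void reportLayerTime(char const* layerName, float ms) noexcept override
    {
        mTotalMs[layerName] += ms;
    }

    std::string getLayerProfileInfo() const
    {
        std::ostringstream oss;
        for (auto const& [name, ms] : mTotalMs)
        {
            oss << name << ": " << ms << " ms\n";
        }
        return oss.str();
    }

private:
    std::map<std::string, float> mTotalMs;
};

// Assumed wiring: executionContext->setProfiler(&profiler); then run the engine as usual.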
*/ + #include "tensorrt_llm/runtime/ipcUtils.h" + #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/customAllReduceUtils.h" #include "tensorrt_llm/common/mpiUtils.h" +#include +#include + namespace tensorrt_llm::runtime { +namespace +{ void setPeerAccess(WorldConfig const& worldConfig, bool enable) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto const srcNode = worldConfig.getTensorParallelRank(); for (SizeType destNode = 0; destNode < worldConfig.getTensorParallelism(); destNode++) @@ -31,7 +40,7 @@ void setPeerAccess(WorldConfig const& worldConfig, bool enable) continue; } - int canAccessPeer; + int canAccessPeer{0}; TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, srcNode, destNode)); if (enable) @@ -48,50 +57,55 @@ void setPeerAccess(WorldConfig const& worldConfig, bool enable) TLLM_CUDA_CHECK(error); } } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } +} // namespace -IpcMemory::IpcMemory(WorldConfig const& worldConfig, std::size_t bufferSize) - : mWorldConfig(worldConfig) +IpcMemory::IpcMemory(std::size_t bufferSize, BufferManager const& manager, WorldConfig const& worldConfig) + : mTpRank(worldConfig.getTensorParallelRank()) , mCommPtrs(worldConfig.getTensorParallelism()) - , mBufferSize(bufferSize) { - allocateIpcMemory(); + allocateIpcMemory(bufferSize, manager, worldConfig); } -void IpcMemory::allocateIpcMemory() +void IpcMemory::allocateIpcMemory(std::size_t bufferSize, BufferManager const& manager, WorldConfig const& worldConfig) { - TLLM_CUDA_CHECK(cudaMalloc(&mBufferPtr, mBufferSize)); - TLLM_CUDA_CHECK(cudaMemset(mBufferPtr, 0, mBufferSize)); + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + // cudaIpcGetMemHandle only works with allocation created with cudaMalloc + mBuffer = BufferManager::gpuSync(bufferSize, nvinfer1::DataType::kUINT8); + manager.setZero(*mBuffer); + auto* bufferPtr = mBuffer->data(); cudaIpcMemHandle_t localHandle; - TLLM_CUDA_CHECK(cudaIpcGetMemHandle(&localHandle, mBufferPtr)); + TLLM_CUDA_CHECK(cudaIpcGetMemHandle(&localHandle, bufferPtr)); - auto const tpRank = mWorldConfig.getTensorParallelRank(); - auto const ppRank = mWorldConfig.getPipelineParallelRank(); - auto const comm = COMM_SESSION.split(ppRank, tpRank); - std::vector serialHandles(CUDA_IPC_HANDLE_SIZE * mWorldConfig.getTensorParallelism(), 0); + auto const ppRank = worldConfig.getPipelineParallelRank(); + auto const comm = COMM_SESSION.split(ppRank, mTpRank); + std::vector serialHandles(CUDA_IPC_HANDLE_SIZE * worldConfig.getTensorParallelism(), 0); comm.allgather(&localHandle.reserved, serialHandles.data(), CUDA_IPC_HANDLE_SIZE, mpi::MpiType::kBYTE); - std::vector handles(mWorldConfig.getTensorParallelism()); + std::vector handles(worldConfig.getTensorParallelism()); for (size_t i = 0; i < handles.size(); ++i) { memcpy(handles[i].reserved, &serialHandles[i * CUDA_IPC_HANDLE_SIZE], CUDA_IPC_HANDLE_SIZE); } - for (size_t nodeId = 0; nodeId < handles.size(); nodeId++) + for (std::size_t nodeId = 0; nodeId < handles.size(); nodeId++) { - if ((int) nodeId == mWorldConfig.getTensorParallelRank()) + if (nodeId == static_cast(mTpRank)) { - mCommPtrs[nodeId] = mBufferPtr; + mCommPtrs.at(nodeId) = bufferPtr; } else { - uint8_t* foreignBuffer; + uint8_t* foreignBuffer{nullptr}; TLLM_CUDA_CHECK(cudaIpcOpenMemHandle( reinterpret_cast(&foreignBuffer), handles[nodeId], cudaIpcMemLazyEnablePeerAccess)); - mCommPtrs[nodeId] = foreignBuffer; + mCommPtrs.at(nodeId) = foreignBuffer; } } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } IpcMemory::~IpcMemory() 
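[Editor's note] The reworked `IpcMemory::allocateIpcMemory` in the hunk above follows the standard CUDA IPC exchange pattern: allocate device memory that `cudaIpcGetMemHandle` can export, all-gather the serialized handles across the tensor-parallel group, then open every foreign handle while keeping the local pointer for the owning rank. The sketch below shows that pattern in isolation. It is a minimal illustration only: it uses a raw MPI communicator instead of TensorRT-LLM's `COMM_SESSION` wrapper, the function name `exchangeIpcHandles` is invented for the example, and error checking is elided.

```cpp
#include <cuda_runtime.h>
#include <mpi.h>

#include <cstring>
#include <vector>

// Gather one device pointer per tensor-parallel rank so every rank can address
// every peer's buffer (same idea as IpcMemory::allocateIpcMemory above).
std::vector<void*> exchangeIpcHandles(void* localBuffer, int tpRank, int tpSize, MPI_Comm comm)
{
    // Export the local allocation as an IPC handle.
    cudaIpcMemHandle_t localHandle;
    cudaIpcGetMemHandle(&localHandle, localBuffer);

    // Collect every rank's serialized handle.
    std::vector<char> serialHandles(CUDA_IPC_HANDLE_SIZE * tpSize);
    MPI_Allgather(localHandle.reserved, CUDA_IPC_HANDLE_SIZE, MPI_BYTE, serialHandles.data(),
        CUDA_IPC_HANDLE_SIZE, MPI_BYTE, comm);

    // Open every foreign handle; keep the raw local pointer for our own rank.
    std::vector<void*> commPtrs(tpSize);
    for (int node = 0; node < tpSize; ++node)
    {
        if (node == tpRank)
        {
            commPtrs[node] = localBuffer;
        }
        else
        {
            cudaIpcMemHandle_t handle;
            std::memcpy(handle.reserved, serialHandles.data() + node * CUDA_IPC_HANDLE_SIZE, CUDA_IPC_HANDLE_SIZE);
            cudaIpcOpenMemHandle(&commPtrs[node], handle, cudaIpcMemLazyEnablePeerAccess);
        }
    }
    return commPtrs;
}
```

Teardown mirrors the destructor path in the next hunk: pointers obtained via `cudaIpcOpenMemHandle` are released with `cudaIpcCloseMemHandle`, while the local allocation is owned and freed by its buffer object.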
@@ -101,18 +115,48 @@ IpcMemory::~IpcMemory() void IpcMemory::destroyIpcMemory() { - for (SizeType nodeId = 0; nodeId < mWorldConfig.getTensorParallelism(); ++nodeId) + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + for (std::size_t nodeId = 0; nodeId < mCommPtrs.size(); ++nodeId) { - if ((int) nodeId == mWorldConfig.getTensorParallelRank()) + if (nodeId != static_cast(mTpRank)) { - TLLM_CUDA_CHECK(cudaFree(mCommPtrs[nodeId])); - } - else - { - TLLM_CUDA_CHECK(cudaIpcCloseMemHandle(mCommPtrs[nodeId])); + TLLM_CUDA_CHECK(cudaIpcCloseMemHandle(mCommPtrs.at(nodeId))); } } - cudaFree(mBufferPtr); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +AllReduceBuffers::AllReduceBuffers(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, + SizeType hiddenSize, BufferManager const& manager, WorldConfig const& worldConfig) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + setPeerAccess(worldConfig, true); + + auto const tpSize = worldConfig.getTensorParallelism(); + + auto const bufferSize = tpSize + * std::min( + static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength * hiddenSize * sizeof(float), + utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(tpSize)); + auto const flagsSize = IpcMemory::FLAGS_SIZE * tpSize; + + for (auto size : {bufferSize, bufferSize, flagsSize, flagsSize}) + { + mIpcMemoryHandles.emplace_back(size, manager, worldConfig); + } + + mAllReduceCommPtrs = BufferManager::cpu( + ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * tpSize}), nvinfer1::DataType::kINT64); + auto commPtrs = BufferRange(*mAllReduceCommPtrs); + + for (std::size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) + { + auto const& memCommPtrs = mIpcMemoryHandles[memIdx].getCommPtrs(); + TLLM_CHECK(memCommPtrs.size() == static_cast(tpSize)); + std::copy(memCommPtrs.begin(), memCommPtrs.end(), commPtrs.begin() + memIdx * tpSize); + } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/layerProfiler.cpp b/cpp/tensorrt_llm/runtime/layerProfiler.cpp new file mode 100644 index 000000000..4c3c9779c --- /dev/null +++ b/cpp/tensorrt_llm/runtime/layerProfiler.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/runtime/layerProfiler.h" +#include +#include +#include +#include + +using namespace tensorrt_llm::runtime; + +void LayerProfiler::reportLayerTime(char const* layerName, float timeMs) noexcept +{ + if (mIterator == mLayers.end()) + { + bool const first = !mLayers.empty() && mLayers.begin()->name == layerName; + mUpdatesCount += mLayers.empty() || first; + if (first) + { + mIterator = mLayers.begin(); + } + else + { + mLayers.emplace_back(); + mLayers.back().name = layerName; + mIterator = mLayers.end() - 1; + } + } + + mIterator->timeMs.push_back(timeMs); + ++mIterator; +} + +float LayerProfiler::getTotalTime() const noexcept +{ + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) + { return accumulator + std::accumulate(lp.timeMs.begin(), lp.timeMs.end(), 0.F, std::plus()); }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0F, plusLayerTime); +} + +std::string LayerProfiler::getLayerProfile() noexcept +{ + std::string const nameHdr(" Layer"); + std::string const timeHdr(" Time(ms)"); + + float const totalTimeMs = getTotalTime(); + + auto const timeLength = timeHdr.size(); + + std::unordered_map layer2times; + std::vector layer_order; + for (auto const& p : mLayers) + { + if (!layer2times.count(p.name)) + { + layer2times[p.name] = 0; + layer_order.push_back(p.name); + } + for (auto const& t : p.timeMs) + { + layer2times[p.name] += t; + } + } + + std::stringstream ss; + ss << "\n=== Per-layer Profile ===\n" << timeHdr << nameHdr << "\n"; + + for (auto const& name : layer_order) + { + if (layer2times[name] == 0.0f) + { + continue; + } + ss << std::setw(timeLength) << std::fixed << std::setprecision(2) << layer2times[name] << " " << name << "\n"; + } + + ss << std::setw(timeLength) << std::fixed << std::setprecision(2) << totalTimeMs << " Total\n"; + ss << "\n"; + + // clear data + mLayers.clear(); + + return ss.str(); +} diff --git a/cpp/tensorrt_llm/runtime/layerProfiler.h b/cpp/tensorrt_llm/runtime/layerProfiler.h new file mode 100644 index 000000000..bcae1546d --- /dev/null +++ b/cpp/tensorrt_llm/runtime/layerProfiler.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/runtime/common.h" +#include + +#include + +namespace tensorrt_llm::runtime +{ +struct LayerProfile +{ + std::string name; + std::vector timeMs; +}; + +class LayerProfiler : public nvinfer1::IProfiler +{ + +public: + void reportLayerTime(char const* layerName, float timeMs) noexcept override; + + std::string getLayerProfile() noexcept; + +private: + [[nodiscard]] float getTotalTime() const noexcept; + + std::vector mLayers; + std::vector::iterator mIterator{mLayers.begin()}; + int32_t mUpdatesCount{0}; +}; +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/loraManager.cpp b/cpp/tensorrt_llm/runtime/loraManager.cpp index d37abc5c9..db06ae43b 100644 --- a/cpp/tensorrt_llm/runtime/loraManager.cpp +++ b/cpp/tensorrt_llm/runtime/loraManager.cpp @@ -18,7 +18,6 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" -#include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -26,7 +25,8 @@ #include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/utils/sessionUtils.h" #include "tensorrt_llm/runtime/worldConfig.h" -#include + +#include namespace tensorrt_llm::runtime { diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp index 4666645e0..6df5448c8 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp @@ -87,8 +87,6 @@ void RuntimeBuffers::create(TllmRuntime const& runtime, ModelConfig const& model bool transformerBased = modelConfig.isTransformerBased(); bool ssmBased = modelConfig.isSsmBased(); - TLLM_CHECK_WITH_INFO(transformerBased ^ ssmBased, "Model should be either Transformer based or SSM based now."); - contextLengthsHost = manager.emptyTensor(MemoryType::kPINNED, nvinfer1::DataType::kINT32); if (transformerBased) { @@ -445,13 +443,12 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu if (transformerBuffers) { transformerBuffers->getRuntimeBuffers( - this, inputBuffers, outputBuffers, step, inputIds, commPtrs, modelConfig, worldConfig); + this, inputBuffers, outputBuffers, step, inputIds, modelConfig, worldConfig); } if (ssmStateBuffers) { - ssmStateBuffers->getRuntimeBuffers( - this, inputBuffers, outputBuffers, step, inputIds, commPtrs, modelConfig, worldConfig); + ssmStateBuffers->getRuntimeBuffers(this, inputBuffers, outputBuffers, step, inputIds, modelConfig, worldConfig); } if (modelConfig.useCustomAllReduce() && worldConfig.isTensorParallel()) diff --git a/cpp/tensorrt_llm/runtime/ssmStateBuffers.cpp b/cpp/tensorrt_llm/runtime/ssmStateBuffers.cpp index 33c19189e..c47c1d434 100644 --- a/cpp/tensorrt_llm/runtime/ssmStateBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/ssmStateBuffers.cpp @@ -37,32 +37,54 @@ SsmStateBuffers::SsmStateBuffers( { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(modelConfig.isSsmBased()); - TLLM_CHECK_WITH_INFO(modelConfig.hasMambaConfig(), "SSM only support Mamba now."); + // TODO: support RecurrentGemma: the code mostly works but returns incorrect tokens in the generation phase + TLLM_CHECK_WITH_INFO(modelConfig.hasMambaConfig(), "SSM only support Mamba for now."); auto maxBatchSize = modelConfig.getMaxBatchSize(); auto maxBeamWidth = modelConfig.getMaxBeamWidth(); auto maxBatchBeam = maxBatchSize * maxBeamWidth; - auto mambaConfig = 
modelConfig.getMambaConfig(); - TLLM_CHECK_WITH_INFO(mambaConfig.has_value(), "SsmStateBuffers should be used with mambaConfig."); - mDConv = mambaConfig->dConv; - mDState = mambaConfig->dState; - auto expand = mambaConfig->expand; - auto hiddenSize = modelConfig.getHiddenSize(); - mDInner = expand * hiddenSize; + auto rnnConfig = modelConfig.getRnnConfig(); + mIsRecurrentGemma = rnnConfig.has_value(); + + if (mIsRecurrentGemma) + { + mDConv = rnnConfig->dConv; + mDInner = rnnConfig->hiddenSize; + } + else + { + auto mambaConfig = modelConfig.getMambaConfig(); + mDConv = mambaConfig->dConv; + mDState = mambaConfig->dState; + auto expand = mambaConfig->expand; + auto hiddenSize = modelConfig.getHiddenSize(); + mDInner = expand * hiddenSize; + } + auto dType = modelConfig.getDataType(); auto const localNbLayers = modelConfig.getNbSsmLayers(worldConfig.getPipelineParallelism()); mLocalNbLayers = localNbLayers; mMaxBeamWidth = maxBeamWidth; mUseMambaConv1dPlugin = modelConfig.useMambaConv1dPlugin(); - auto ssmStatesShape = ITensor::makeShape({localNbLayers * maxBatchBeam, mDState, mDInner}); + auto const ssmStatesShape = [&]() + { + if (mIsRecurrentGemma) + { + return ITensor::makeShape({localNbLayers * maxBatchBeam, mDInner}); + } + else + { + return ITensor::makeShape({localNbLayers * maxBatchBeam, mDState, mDInner}); + } + }(); auto const convStatesShape = [&]() { if (mUseMambaConv1dPlugin) { - return tensorrt_llm::runtime::ITensor::makeShape({localNbLayers * maxBatchBeam, mDConv - 1, mDInner}); + return ITensor::makeShape({localNbLayers * maxBatchBeam, mDConv - 1, mDInner}); } else { - return tensorrt_llm::runtime::ITensor::makeShape({localNbLayers * maxBatchBeam, mDInner, mDConv - 1}); + return ITensor::makeShape({localNbLayers * maxBatchBeam, mDInner, mDConv - 1}); } }(); auto& bufferManager = runtime.getBufferManager(); @@ -94,18 +116,26 @@ SsmStateBuffers::SsmStateBuffers( void SsmStateBuffers::reshape(SizeType batchSize) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - auto ssmStatesShape = ITensor::makeShape({mLocalNbLayers * batchSize * mMaxBeamWidth, mDState, mDInner}); + auto const ssmStatesShape = [&]() + { + if (mIsRecurrentGemma) + { + return ITensor::makeShape({mLocalNbLayers * batchSize * mMaxBeamWidth, mDInner}); + } + else + { + return ITensor::makeShape({mLocalNbLayers * batchSize * mMaxBeamWidth, mDState, mDInner}); + } + }(); auto const convStatesShape = [&]() { if (mUseMambaConv1dPlugin) { - return tensorrt_llm::runtime::ITensor::makeShape( - {mLocalNbLayers * batchSize * mMaxBeamWidth, mDConv - 1, mDInner}); + return ITensor::makeShape({mLocalNbLayers * batchSize * mMaxBeamWidth, mDConv - 1, mDInner}); } else { - return tensorrt_llm::runtime::ITensor::makeShape( - {mLocalNbLayers * batchSize * mMaxBeamWidth, mDInner, mDConv - 1}); + return ITensor::makeShape({mLocalNbLayers * batchSize * mMaxBeamWidth, mDInner, mDConv - 1}); } }(); mambaSsmStates->reshape(ssmStatesShape); @@ -118,10 +148,9 @@ void SsmStateBuffers::reshape(SizeType batchSize) for (int i = 0; i < mLocalNbLayers; i++) { size_t offset = batchSize * mMaxBeamWidth * i; - mambaSsmState[i] = tensorrt_llm::runtime::ITensor::slice(mambaSsmStates, offset, batchSize * mMaxBeamWidth); - mambaConvState[i] = tensorrt_llm::runtime::ITensor::slice(mambaConvStates, offset, batchSize * mMaxBeamWidth); - mambaConvStateAlt[i] - = tensorrt_llm::runtime::ITensor::slice(mambaConvStatesAlt, offset, batchSize * mMaxBeamWidth); + mambaSsmState[i] = ITensor::slice(mambaSsmStates, offset, batchSize * mMaxBeamWidth); + 
mambaConvState[i] = ITensor::slice(mambaConvStates, offset, batchSize * mMaxBeamWidth); + mambaConvStateAlt[i] = ITensor::slice(mambaConvStatesAlt, offset, batchSize * mMaxBeamWidth); } if (slotMappingDevice != nullptr) { @@ -159,8 +188,8 @@ void SsmStateBuffers::fillStatePtrs() { mambaSsmStatePtrArray[i] = mambaSsmState[i]->data(); mambaConvStatePtrArray[i] = mambaConvState[i]->data(); - mambaSsmStatePtr[i] = tensorrt_llm::runtime::ITensor::slice(mambaSsmStatePtrs, i, 1); - mambaConvStatePtr[i] = tensorrt_llm::runtime::ITensor::slice(mambaConvStatePtrs, i, 1); + mambaSsmStatePtr[i] = ITensor::slice(mambaSsmStatePtrs, i, 1); + mambaConvStatePtr[i] = ITensor::slice(mambaConvStatePtrs, i, 1); } } @@ -284,8 +313,8 @@ void SsmStateBuffers::postContextStep(RuntimeBuffers* runtimeBuffers, std::vecto } void SsmStateBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, TensorMap& inputBuffers, - TensorMap& outputBuffers, SizeType const step, TensorPtr const& inputIds, TensorPtr const& commPtrs, - ModelConfig const& modelConfig, WorldConfig const& worldConfig) const + TensorMap& outputBuffers, SizeType const step, TensorPtr const& inputIds, ModelConfig const& modelConfig, + WorldConfig const& worldConfig) const { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& logits = runtimeBuffers->logits; @@ -316,21 +345,29 @@ void SsmStateBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, Te auto const localNbLayers = modelConfig.getNbSsmLayers(worldConfig.getPipelineParallelism()); auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; + auto const& layerTypes = modelConfig.getLayerTypes(); if (modelConfig.usePagedState()) { + auto const* const ssmStatePtrName = mIsRecurrentGemma ? "rnn_state_ptr_" : "ssm_state_ptr_"; inputBuffers.insert_or_assign("slot_mapping", slotMappingDevice); - utils::insertTensorVector(inputBuffers, "conv_state_ptr_", mambaConvStatePtr, firstLayerId); - utils::insertTensorVector(inputBuffers, "ssm_state_ptr_", mambaSsmStatePtr, firstLayerId); + utils::insertTensorVector(inputBuffers, "conv_state_ptr_", mambaConvStatePtr, firstLayerId, layerTypes, + ModelConfig::LayerType::kRECURRENT); + utils::insertTensorVector(inputBuffers, ssmStatePtrName, mambaSsmStatePtr, firstLayerId, layerTypes, + ModelConfig::LayerType::kRECURRENT); } else { - utils::insertTensorVector( - inputBuffers, "past_conv_state_", (step % 2) ? mambaConvState : mambaConvStateAlt, firstLayerId); - utils::insertTensorVector( - outputBuffers, "present_conv_state_", (step % 2) ? mambaConvStateAlt : mambaConvState, firstLayerId); - utils::insertTensorVector(inputBuffers, "past_ssm_state_", mambaSsmState, firstLayerId); - utils::insertTensorVector(outputBuffers, "present_ssm_state_", mambaSsmState, firstLayerId); + auto const* const ssmPastStatePtrName = mIsRecurrentGemma ? "past_rnn_state_" : "past_ssm_state_"; + auto const* const ssmPresentStatePtrName = mIsRecurrentGemma ? "present_rnn_state_" : "present_ssm_state_"; + utils::insertTensorVector(inputBuffers, "past_conv_state_", (step % 2) ? mambaConvState : mambaConvStateAlt, + firstLayerId, layerTypes, ModelConfig::LayerType::kRECURRENT); + utils::insertTensorVector(outputBuffers, "present_conv_state_", (step % 2) ? 
mambaConvStateAlt : mambaConvState, + firstLayerId, layerTypes, ModelConfig::LayerType::kRECURRENT); + utils::insertTensorVector(inputBuffers, ssmPastStatePtrName, mambaSsmState, firstLayerId, layerTypes, + ModelConfig::LayerType::kRECURRENT); + utils::insertTensorVector(outputBuffers, ssmPresentStatePtrName, mambaSsmState, firstLayerId, layerTypes, + ModelConfig::LayerType::kRECURRENT); } inputBuffers.insert_or_assign("host_request_types", requestTypes); diff --git a/cpp/tensorrt_llm/runtime/ssmStateBuffers.h b/cpp/tensorrt_llm/runtime/ssmStateBuffers.h index af25c128d..e5f8549c2 100644 --- a/cpp/tensorrt_llm/runtime/ssmStateBuffers.h +++ b/cpp/tensorrt_llm/runtime/ssmStateBuffers.h @@ -36,7 +36,8 @@ class SsmStateBuffers using TensorMap = StringPtrMap; // Mamba states: mamba_d_inner = mamba_expand * hidden_size - TensorPtr mambaSsmStates; // [layer_count * batch_beam, mamba_d_state, mamba_d_inner] + TensorPtr mambaSsmStates; // [layer_count * batch_beam, mamba_d_state, mamba_d_inner] for Mamba + // [layer_count * batch_beam, rnn_hidden_size] for recurrentgemma TensorPtr mambaConvStates; // [layer_count * batch_beam, mamba_d_conv - 1, mamba_d_inner] TensorPtr mambaConvStatesAlt; // [layer_count * batch_beam, mamba_d_conv - 1, mamba_d_inner] @@ -71,7 +72,7 @@ class SsmStateBuffers BufferManager& manager, ModelConfig const& modelConfig, WorldConfig const& worldConfig); void getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, TensorMap& inputBuffers, TensorMap& outputBuffers, - SizeType const step, TensorPtr const& inputIds, TensorPtr const& commPtrs, ModelConfig const& modelConfig, + SizeType const step, TensorPtr const& inputIds, ModelConfig const& modelConfig, WorldConfig const& worldConfig) const; protected: @@ -82,13 +83,15 @@ class SsmStateBuffers private: SizeType mDConv = 0; - SizeType mDState = 0; + SizeType mDState = 0; // only valid for Mamba SizeType mDInner = 0; int mLocalNbLayers = 0; int mMaxBeamWidth = 0; bool mUseMambaConv1dPlugin = true; + + bool mIsRecurrentGemma = false; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index ab93dd811..e0cea8593 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -228,3 +228,29 @@ CudaStream const& TllmRuntime::getStream() const { return *mStream; } + +bool TllmRuntime::hasLayerProfiler(SizeType contextId) const +{ + return mContexts[contextId]->getProfiler() != nullptr; +} + +void TllmRuntime::setLayerProfiler() +{ + mLayerProfiler.reset(new LayerProfiler); + for (auto& context : mContexts) + { + context->setProfiler(mLayerProfiler.get()); + context->setEnqueueEmitsProfile(false); + } +} + +std::string TllmRuntime::getLayerProfileInfo() const +{ + TLLM_CHECK(mLayerProfiler); + return mLayerProfiler->getLayerProfile(); +} + +void TllmRuntime::reportToProfiler(SizeType contextId) +{ + mContexts[contextId]->reportToProfiler(); +} diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index 83e1a4c66..91bb9af45 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -18,6 +18,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/layerProfiler.h" #include #include @@ -107,6 +108,11 @@ class TllmRuntime return mBufferManager; } + void setLayerProfiler(); + bool hasLayerProfiler(SizeType contextId) const; + 
std::string getLayerProfileInfo() const; + void reportToProfiler(SizeType contextId); + private: BufferManager::CudaStreamPtr mStream; BufferManager mBufferManager; @@ -116,5 +122,6 @@ class TllmRuntime std::vector> mContexts; std::unique_ptr mDummyTensor; std::unique_ptr mEngineInspector; + std::unique_ptr mLayerProfiler; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/transformerBuffers.cpp b/cpp/tensorrt_llm/runtime/transformerBuffers.cpp index e02e6b8ca..e8210a5ff 100644 --- a/cpp/tensorrt_llm/runtime/transformerBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/transformerBuffers.cpp @@ -47,7 +47,14 @@ TransformerBuffers::TransformerBuffers( auto& engine = runtime.getEngine(); auto const localNbLayers = modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism()); - auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; + auto firstAttentionLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; + + auto const& layerTypes = modelConfig.getLayerTypes(); + if (!layerTypes.empty()) + { + firstAttentionLayerId + = std::find(layerTypes.begin(), layerTypes.end(), ModelConfig::LayerType::kATTENTION) - layerTypes.begin(); + } nvinfer1::DataType kvDtype; if (modelConfig.usePagedKvCache()) @@ -58,7 +65,7 @@ TransformerBuffers::TransformerBuffers( { kvDtype = modelConfig.getQuantMode().hasFp8KvCache() ? nvinfer1::DataType::kFP8 - : engine.getTensorDataType(("present_key_value_" + std::to_string(firstLayerId)).c_str()); + : engine.getTensorDataType(("present_key_value_" + std::to_string(firstAttentionLayerId)).c_str()); } if (modelConfig.usePagedKvCache()) @@ -290,7 +297,8 @@ void TransformerBuffers::prepareContextStep(RuntimeBuffers* runtimeBuffers, Tens auto const contextLengthsHostPtr = bufferCast(*contextLengthsHost); auto const modelVariant = modelConfig.getModelVariant(); - if (modelVariant == ModelConfig::ModelVariant::kGpt) + if (modelVariant == ModelConfig::ModelVariant::kGpt + || modelVariant == ModelConfig::ModelVariant::kRecurrentGemma) { auto const inputSize = inputIds->getSize(); std::vector positionIdsVec(inputSize); @@ -582,7 +590,8 @@ void TransformerBuffers::prepareNextStep(RuntimeBuffers* runtimeBuffers, SizeTyp auto const modelVariant = modelConfig.getModelVariant(); - if (modelVariant == ModelConfig::ModelVariant::kGpt) + if (modelVariant == ModelConfig::ModelVariant::kGpt + || modelVariant == ModelConfig::ModelVariant::kRecurrentGemma) { positionIds->reshape(inputShape); manager.copy(*contextLengthsDevice, *positionIds); @@ -662,8 +671,8 @@ void TransformerBuffers::prepareNextStep(RuntimeBuffers* runtimeBuffers, SizeTyp } void TransformerBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, TensorMap& inputBuffers, - TensorMap& outputBuffers, SizeType const step, TensorPtr const& inputIds, TensorPtr const& commPtrs, - ModelConfig const& modelConfig, WorldConfig const& worldConfig) const + TensorMap& outputBuffers, SizeType const step, TensorPtr const& inputIds, ModelConfig const& modelConfig, + WorldConfig const& worldConfig) const { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); inputBuffers.clear(); @@ -704,6 +713,7 @@ void TransformerBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, auto const localNbLayers = modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism()); auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; + auto const& layerTypes = modelConfig.getLayerTypes(); if (modelConfig.useGptAttentionPlugin()) { @@ -726,8 +736,10 @@ 
void TransformerBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, } else { - utils::insertTensorVector(inputBuffers, "past_key_value_", presentKeysVals, firstLayerId); - utils::insertTensorVector(outputBuffers, "present_key_value_", presentKeysVals, firstLayerId); + utils::insertTensorVector(inputBuffers, "past_key_value_", presentKeysVals, firstLayerId, layerTypes, + ModelConfig::LayerType::kATTENTION); + utils::insertTensorVector(outputBuffers, "present_key_value_", presentKeysVals, firstLayerId, layerTypes, + ModelConfig::LayerType::kATTENTION); } } else @@ -744,6 +756,7 @@ void TransformerBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, char* disableReuseChar = std::getenv("TRTLLM_DISABLE_OOTB_KVCACHE_REUSE"); bool reuse = (disableReuseChar == nullptr || std::string(disableReuseChar) != "ON"); + // TODO: fix for recurrentgemma for (int32_t idx = 0; idx < localNbLayers; ++idx) { TensorPtr input; diff --git a/cpp/tensorrt_llm/runtime/transformerBuffers.h b/cpp/tensorrt_llm/runtime/transformerBuffers.h index 36fad0e63..bd229643e 100644 --- a/cpp/tensorrt_llm/runtime/transformerBuffers.h +++ b/cpp/tensorrt_llm/runtime/transformerBuffers.h @@ -71,8 +71,7 @@ class TransformerBuffers WorldConfig const& worldConfig); void getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, TensorMap& inputBuffers, TensorMap& outputBuffers, - SizeType step, TensorPtr const& inputIds, TensorPtr const& commPtrs, ModelConfig const& modelConfig, - WorldConfig const& worldConfig) const; + SizeType step, TensorPtr const& inputIds, ModelConfig const& modelConfig, WorldConfig const& worldConfig) const; protected: void copyAttentionMasks( diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp index 941d3b9f5..9c79596e3 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp @@ -97,10 +97,24 @@ std::vector sliceBufferVector( } void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, - SizeType const indexOffset) + SizeType indexOffset, std::vector const& layerTypes, ModelConfig::LayerType type) { - for (std::size_t i = 0; i < vec.size(); ++i) - map.insert_or_assign(key + std::to_string(indexOffset + i), vec[i]); + if (layerTypes.empty()) + { + for (std::size_t i = 0; i < vec.size(); ++i) + map.insert_or_assign(key + std::to_string(indexOffset + i), vec[i]); + } + else + { + std::size_t vecIndex = 0; + for (std::size_t i = 0; i < layerTypes.size(); ++i) + { + if (layerTypes[i] == type) + { + map.insert_or_assign(key + std::to_string(indexOffset + i), vec.at(vecIndex++)); + } + } + } } void insertTensorSlices( diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h index 33cb8a2bf..52e095d12 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h @@ -17,6 +17,7 @@ #pragma once #include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" #include @@ -59,7 +60,7 @@ std::vector sliceBufferVector( std::vector const& vector, SizeType offset, SizeType size); void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, - SizeType indexOffset); + SizeType indexOffset, std::vector const& layerTypes, ModelConfig::LayerType type); void insertTensorSlices( StringPtrMap& map, std::string const& key, ITensor::SharedPtr const& tensor, SizeType 
indexOffset); diff --git a/cpp/tensorrt_llm/runtime/workerPool.h b/cpp/tensorrt_llm/runtime/workerPool.h index aa70fe6b4..24960eac5 100644 --- a/cpp/tensorrt_llm/runtime/workerPool.h +++ b/cpp/tensorrt_llm/runtime/workerPool.h @@ -82,15 +82,16 @@ class WorkerPool } private: + static constexpr size_t kMaxNumWorkers = 128; std::size_t mNumWorkers; - std::queue> mTasks; + std::queue> mTasks{}; mutable std::mutex mTasksMutex; std::condition_variable mTasksCv; std::atomic mShutdown = false; - std::vector> mThreads; + std::thread mThreads[kMaxNumWorkers]; int mDevice{-1}; @@ -102,17 +103,22 @@ class WorkerPool } mShutdown = true; mTasksCv.notify_all(); - for (std::size_t i = 0; i < mThreads.size(); ++i) + for (std::size_t i = 0; i < mNumWorkers; ++i) { - mThreads.at(i)->join(); + mThreads[i].join(); } } void initThreads() { + if (mNumWorkers > kMaxNumWorkers) + { + throw std::runtime_error( + "numWorker > maxNumWorkers " + std::to_string(mNumWorkers) + " > " + std::to_string(kMaxNumWorkers)); + } for (std::size_t i = 0; i < mNumWorkers; ++i) { - mThreads.push_back(std::make_shared(std::thread(&WorkerPool::doWork, this))); + mThreads[i] = std::thread(&WorkerPool::doWork, this); } } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c27de6149..e306ecc00 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -21,8 +21,14 @@ FetchContent_Declare( FetchContent_MakeAvailable(googletest) include(GoogleTest) -find_library_create_target(nvonnxparser nvonnxparser SHARED ${TRT_OUT_DIR} - ${TRT_LIB_DIR}) +# On Windows major version is appended to nvinfer libs. +if(WIN32) + set(ONNX_PARSER_LIB_NAME nvonnxparser_10) +else() + set(ONNX_PARSER_LIB_NAME nvonnxparser) +endif() +find_library_create_target(nvonnxparser ${ONNX_PARSER_LIB_NAME} SHARED + ${TRT_OUT_DIR} ${TRT_LIB_DIR}) include_directories( ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/include @@ -103,6 +109,7 @@ set(SAMPLING_LAYER_TEST_SRC add_gtest(samplingLayerTest "${SAMPLING_LAYER_TEST_SRC}") add_gtest(dynamicDecodeLayerTest layers/dynamicDecodeLayerTest.cpp) add_gtest(medusaDecodeLayerTest layers/medusaDecodeLayerTest.cpp) +add_gtest(lookaheadPoolManagerTest layers/lookaheadPoolManagerTest.cpp) if(BUILD_BATCH_MANAGER) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/batch_manager) diff --git a/cpp/tests/README.md b/cpp/tests/README.md index 33368fa5e..36239de12 100644 --- a/cpp/tests/README.md +++ b/cpp/tests/README.md @@ -19,7 +19,7 @@ An example call may look like this: ```bash CPP_BUILD_DIR=cpp/build MODEL_CACHE=/path/to/model_cache -python3 cpp/tests/resources/scripts/test_cpp.py -a "80-real;86-real" --build_dir ${CPP_BUILD_DIR} --trt_root /usr/local/tensorrt --model_cache ${MODEL_CACHE} --only_gptj +python3 cpp/tests/resources/scripts/test_cpp.py -a "80-real;86-real" --build_dir ${CPP_BUILD_DIR} --trt_root /usr/local/tensorrt --model_cache ${MODEL_CACHE} --run_gptj --skip_unit_tests ``` ## Manual steps diff --git a/cpp/tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/kernels/mixtureOfExpertsTest.cu index b891b80bb..41401f80b 100644 --- a/cpp/tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/kernels/mixtureOfExpertsTest.cu @@ -17,7 +17,7 @@ using namespace tensorrt_llm::runtime; constexpr static float FP8_MAX = 440; // FP8_E4M3_MAX; template -__global__ void initWeightsKernel(T* data, int w, int h, float scalar) +__global__ void initWeightsKernel(T* data, int64_t w, int64_t h, float scalar) { size_t expert_id = blockIdx.z; T* start_offset = data + expert_id * w * h; @@ -44,7 +44,7 @@ __global__ void 
initWeightsGatedKernel(T* data, int w, int h, float scalar_1, fl } template -__global__ void initBiasToExpertIdKernel(T* data, int w) +__global__ void initBiasToExpertIdKernel(T* data, int64_t w) { size_t expert_id = blockIdx.y; T* start_offset = data + expert_id * w; @@ -86,8 +86,8 @@ protected: constexpr static bool INT_QUANT = !std::is_same_v; using WeightStorage = std::conditional_t; constexpr static int WEIGHT_ELEM_PER_BYTE = INT4 ? 2 : 1; - int const HIDDEN_SIZE_MULTIPLIER = 1; - int const DEFAULT_HIDDEN_SIZE = HIDDEN_SIZE_MULTIPLIER * 64 / sizeof(WeightType) * WEIGHT_ELEM_PER_BYTE; + int64_t const HIDDEN_SIZE_MULTIPLIER = 8; + int64_t const DEFAULT_HIDDEN_SIZE = HIDDEN_SIZE_MULTIPLIER * 64 / sizeof(WeightType) * WEIGHT_ELEM_PER_BYTE; static BufferManager::CudaStreamPtr mStream; static std::unique_ptr mBufferManager; @@ -97,9 +97,9 @@ protected: float* mInputProbabilities{}; DataType* mInputTensor{}; - int mHiddenSize{}; - int mNumExperts{}; - int mK{}; + int64_t mHiddenSize{}; + int64_t mNumExperts{}; + int64_t mK{}; float getTolerance(float scale = 1.f) { @@ -128,7 +128,7 @@ protected: mDeviceCount = getDeviceCount(); if (shouldSkip()) { - GTEST_SKIP(); + GTEST_SKIP() << "Skipping due to no/unsupported GPU"; } mStream = std::make_shared(); @@ -146,7 +146,7 @@ protected: assert(mBufferManager); if (shouldSkip()) { - GTEST_SKIP(); + GTEST_SKIP() << "Skipping due to no/unsupported GPU"; } } @@ -155,7 +155,7 @@ protected: managed_buffers.clear(); } - void initWeights(DataType* buffer, int w, int h, float scalar) + void initWeights(DataType* buffer, int64_t w, int64_t h, float scalar) { if constexpr (FP8) scalar = FP8_MAX; // Automatically set it to max @@ -225,9 +225,9 @@ protected: int* mSourceToExpandedMap; int* mSelectedExpert; bool* mFinished{}; - int mInterSize{}; - int mTotalTokens{}; - int mActiveRows{}; + int64_t mInterSize{}; + int64_t mTotalTokens{}; + int64_t mActiveRows{}; bool mUseBias = true; @@ -256,7 +256,7 @@ protected: } void initBuffersPermute(std::vector> h_hidden_states, - std::vector> h_router_results, int hidden_size, int num_experts, int k, + std::vector> h_router_results, int64_t hidden_size, int64_t num_experts, int64_t k, std::vector finished, MOEParallelismConfig parallelism_config) { managed_buffers.clear(); @@ -270,12 +270,13 @@ protected: auto const gated_inter = mInterSize * mGatedMultiplier; mTotalTokens = 0; + std::vector h_seq_lens; h_seq_lens.push_back(0); for (auto& sequence : h_hidden_states) { assert(sequence.size() % hidden_size == 0); - int num_tokens = sequence.size() / hidden_size; + int64_t num_tokens = sequence.size() / hidden_size; h_seq_lens.emplace_back(h_seq_lens.back() + num_tokens); mTotalTokens += num_tokens; } @@ -330,7 +331,7 @@ protected: mExpertFP8Scale2 = allocBuffer(1); mExpertFP8Scale3 = allocBuffer(mNumExperts); - ASSERT_NE(mMaxInput, 0.0f); + EXPECT_NE(mMaxInput, 0.0f); initFP8Scales(mMaxInput); } @@ -494,7 +495,7 @@ protected: } void runMoEPermute(std::vector> h_hidden_states, - std::vector> h_router_results, int hidden_size, int num_experts, int k, + std::vector> h_router_results, int64_t hidden_size, int64_t num_experts, int64_t k, std::vector finished = {}, MOEParallelismConfig parallelism_config = {}) { initBuffersPermute(std::move(h_hidden_states), std::move(h_router_results), hidden_size, num_experts, k, @@ -642,7 +643,7 @@ protected: { if (entry >= num_experts_per_node * tp_rank && entry < num_experts_per_node * (tp_rank + 1)) return entry; - return mNumExperts; + return (int) mNumExperts; }); return result; } @@ 
-1280,3 +1281,57 @@ TYPED_TEST(MixtureOfExpertsTest, ConfigSweep) }) << tactic.str(); } } + +TYPED_TEST(MixtureOfExpertsTest, PermuteVeryLongSequence) +{ + this->mUseBias = !this->FP8; + + using DataType = typename TypeParam::DataType; + // Sequence * hidden size > INT32_MAX + int64_t hidden_size = 2048ll; + int64_t num_experts = 4; + int64_t k = 1; + int64_t num_tokens = 1024ll * 1024ll + 1ll; + ASSERT_GT(hidden_size * num_tokens, (uint64_t) std::numeric_limits::max() + 1ull); + + // Skip the test if the GPU does not have enough memory + size_t workspace_size = this->mMoERunner.getWorkspaceSize( + num_tokens, hidden_size, hidden_size * 4, num_experts, k, this->mActType, {}); + auto const [freeMem, totalMem] = tensorrt_llm::common::getDeviceMemoryInfo(false); + if (freeMem < workspace_size) + { + GTEST_SKIP() << "Insufficient free memory for workspace size"; + } + + std::vector hidden_states(hidden_size * num_tokens); + this->mMaxInput = 1.f; // Any arbitrary non-zero value + + // All tokens to expert 0 + float const token_probs[] = {1.f, 0.5f, 0.f, 0.f}; + std::vector probs; + probs.reserve(num_tokens * num_experts); + for (size_t i = 0; i < num_tokens; i++) + { + probs.insert(probs.cend(), std::begin(token_probs), std::end(token_probs)); + } + + this->runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k); + + // Just look at the first few tokens + this->mTotalTokens = 10; + + probs.resize(num_experts * this->mTotalTokens); + hidden_states.resize(hidden_size * this->mTotalTokens); + + auto selected_expert = this->getDataFromDevice(this->mSelectedExpert, k * this->mTotalTokens); + // All tokens should go to expert 0 + for (auto& item : selected_expert) + { + ASSERT_EQ(item, 0); + } + + this->compareSoftmax(selected_expert, probs); + // Create a default vector for the reference outputs of the correct type for FP8 + std::vector unquant_states(this->mTotalTokens * hidden_size); + this->compareFinal(selected_expert, probs, unquant_states); +} diff --git a/cpp/tests/kernels/shiftKCacheKernelTest.cu b/cpp/tests/kernels/shiftKCacheKernelTest.cu index 62723306c..7e0a99212 100644 --- a/cpp/tests/kernels/shiftKCacheKernelTest.cu +++ b/cpp/tests/kernels/shiftKCacheKernelTest.cu @@ -100,7 +100,7 @@ __global__ void applyRoPE(KVCacheBuffer kCacheRead, KVLinearBuffer kCacheWrite, case PositionEmbeddingType::kROPE_GPTJ: { mmha::apply_rotary_embedding( - k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, token_pos_idx); + k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, 0, nullptr, token_pos_idx); break; } case PositionEmbeddingType::kROPE_GPT_NEOX: @@ -127,7 +127,7 @@ __global__ void applyRoPE(KVCacheBuffer kCacheRead, KVLinearBuffer kCacheWrite, { mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); mmha::apply_rotary_embedding(k, transpose_idx / tidx_factor, rotary_embedding_dim, rotary_embedding_base, - rotary_embedding_scale, token_pos_idx); + rotary_embedding_scale, 0, nullptr, token_pos_idx); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); } diff --git a/cpp/tests/layers/lookaheadPoolManagerTest.cpp b/cpp/tests/layers/lookaheadPoolManagerTest.cpp new file mode 100644 index 000000000..96a11161b --- /dev/null +++ b/cpp/tests/layers/lookaheadPoolManagerTest.cpp @@ -0,0 +1,171 @@ + +#include + +#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/layers/lookaheadPoolManager.h" + +namespace tensorrt_llm::tests::layers +{ +using namespace tensorrt_llm::runtime; +using 
namespace tensorrt_llm::layers; +using TensorPtr = runtime::ITensor::SharedPtr; + +void printMap(char const* name, std::unordered_map> const& tokenMap) +{ + std::ostringstream buf; + buf << name << std::endl; + for (auto const& [key, value] : tokenMap) + { + buf << static_cast(key) << ": "; + for (auto const& tup : value) + { + buf << "("; + for (auto const& token : BufferRange(*tup)) + { + buf << static_cast(token) << ","; + } + buf << "),"; + } + buf << std::endl; + } + TLLM_LOG_DEBUG(buf.str()); +} + +TensorPtr initTensor( + std::shared_ptr& mBufferManager, std::string str, std::optional shape = std::nullopt) +{ + std::vector data(str.begin(), str.end()); + auto shape1d = ITensor::makeShape({static_cast(data.size())}); + if (shape) + { + TLLM_CHECK(ITensor::volume(shape1d) == ITensor::volume(shape.value())); + } + return ITensor::view(mBufferManager->copyFrom(data, MemoryType::kCPU), shape.value_or(shape1d)); +} + +bool isTensorEqString(TensorPtr const& a, std::string b) +{ + TLLM_CHECK(ITensor::volume(a->getShape()) == static_cast(b.size())); + auto ar = BufferRange(*a); + return std::equal(ar.begin(), ar.end(), b.begin()); +} + +TEST(LookaheadPoolManagerTest, fillAndUpdate) +{ + std::shared_ptr mStream = std::make_shared(); + std::shared_ptr mBufferManager = std::make_shared(mStream); + + SizeType constexpr W{5}; + SizeType constexpr N{4}; + SizeType constexpr G{5}; + auto prompt = initTensor(mBufferManager, "hello world; hello world. live is life."); + LookaheadPoolManager pm(G, mBufferManager); + pm.fillWithPrompt(prompt, N); + printMap("Token map after fill with prompt", pm.getMap()); + /*** + s: ( ,l,i,), + v: (e, ,i,), + i: (v,e, ,),(s, ,l,),(f,e,.,), + d: (;, ,h,),(., ,l,), + w: (o,r,l,), + : (h,e,l,),(w,o,r,),(l,i,v,),(i,s, ,),(l,i,f,), + .: ( ,l,i,), + ;: ( ,h,e,), + o: ( ,w,o,),(r,l,d,), + l: (l,o, ,),(o, ,w,),(d,., ,),(i,v,e,),(i,f,e,), + r: (l,d,;,),(l,d,.,), + e: (l,l,o,),( ,i,s,), + h: (e,l,l,), + **/ + + LookaheadPoolManager::Key lastToken = 'l'; + auto list = pm.guess(lastToken, G); + for (auto const& ngram : list) + { + PRINT_TOKENS(ngram); + } + /*** + l: (l,o, ,),(o, ,w,),(d,., ,),(i,v,e,),(i,f,e,), + **/ + + auto pastTokens = initTensor(mBufferManager, std::string("abcde12345hijkm"), ITensor::makeShape({5, 3})); + auto keyTokens = initTensor(mBufferManager, std::string("lvwxy")); + pm.update(keyTokens, pastTokens); + printMap("Token map after update", pm.getMap()); + /** Noted, we update the map with N=4, so the map has different sizes of ngrams. 
+ y: (j,k,m,), + x: (5,h,i,), + e: (l,l,o,),( ,i,s,), + r: (l,d,;,),(l,d,.,), + l: (o, ,w,),(d,., ,),(i,v,e,),(i,f,e,),(a,b,c,), + o: ( ,w,o,),(r,l,d,), + ;: ( ,h,e,), + h: (e,l,l,), + .: ( ,l,i,), + : (h,e,l,),(w,o,r,),(l,i,v,),(i,s, ,),(l,i,f,), + w: (o,r,l,),(2,3,4,), + d: (;, ,h,),(., ,l,), + i: (v,e, ,),(s, ,l,),(f,e,.,), + v: (e, ,i,),(d,e,1,), + s: ( ,l,i,), + */ + + lastToken = 'w'; + list = pm.guess(lastToken, G); + for (auto const& ngram : list) + { + PRINT_TOKENS(ngram); + } + /** + w: (o,r,l,),(2,3,4,), + */ + + ASSERT_EQ(list.size(), 2); + auto it = list.begin(); + EXPECT_TRUE(isTensorEqString(*it, "orl")); + it++; + EXPECT_TRUE(isTensorEqString(*it, "234")); + + pastTokens = initTensor(mBufferManager, std::string("dogde12345hijkm"), ITensor::makeShape({5, 3})); + pm.update(keyTokens, pastTokens); + + pastTokens = initTensor(mBufferManager, std::string("catde12345hijkm"), ITensor::makeShape({5, 3})); + pm.update(keyTokens, pastTokens); + + pastTokens = initTensor(mBufferManager, std::string("abcde12345hijkm"), ITensor::makeShape({5, 3})); + pm.update(keyTokens, pastTokens); + + printMap("Token map after update more for key 'l'", pm.getMap()); + /** + y: (j,k,m,), + x: (5,h,i,), + e: (l,l,o,),( ,i,s,), + r: (l,d,;,),(l,d,.,), + l: (i,v,e,),(i,f,e,),(d,o,g,),(c,a,t,),(a,b,c,), + o: ( ,w,o,),(r,l,d,), + ;: ( ,h,e,), + h: (e,l,l,), + .: ( ,l,i,), + : (h,e,l,),(w,o,r,),(l,i,v,),(i,s, ,),(l,i,f,), + w: (o,r,l,),(2,3,4,), + d: (;, ,h,),(., ,l,), + i: (v,e, ,),(s, ,l,),(f,e,.,), + v: (e, ,i,),(d,e,1,), + s: ( ,l,i,), + */ + lastToken = 'l'; + list = pm.guess(lastToken, G); + ASSERT_EQ(list.size(), G); + it = list.begin(); + EXPECT_TRUE(isTensorEqString(*it, "ive")); + it++; + EXPECT_TRUE(isTensorEqString(*it, "ife")); + it++; + EXPECT_TRUE(isTensorEqString(*it, "dog")); + it++; + EXPECT_TRUE(isTensorEqString(*it, "cat")); + it++; + EXPECT_TRUE(isTensorEqString(*it, "abc")); +} + +} // namespace tensorrt_llm::tests::layers diff --git a/cpp/tests/resources/scripts/build_chatglm_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py index 382ddbb7f..4f9c0d194 100644 --- a/cpp/tests/resources/scripts/build_chatglm_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -59,8 +59,8 @@ def build_engines(model_cache: typing.Optional[str] = None, world_size: int = 1): for model_name in ["chatglm-6b", "chatglm2-6b", "chatglm3-6b"]: - model_cache_dir = Path(model_cache) / model_name - if model_cache_dir.is_dir(): + if model_cache and (Path(model_cache) / model_name).is_dir(): + model_cache_dir = Path(model_cache) / model_name if bCopyModel or model_name == "chatglm-6b": print("Copy model from model_cache") hf_dir = model_dir / model_name diff --git a/cpp/tests/resources/scripts/build_gpt_engines.py b/cpp/tests/resources/scripts/build_gpt_engines.py index 3cb22186a..76b1d66c4 100755 --- a/cpp/tests/resources/scripts/build_gpt_engines.py +++ b/cpp/tests/resources/scripts/build_gpt_engines.py @@ -173,12 +173,27 @@ def build_engines(model_cache: Optional[str] = None, world_size: int = 1): ] build_engine(str(fp16_ckpt_dir), str(engine_dir / 'fp16-plugin-packed-paged' / tp_pp_dir), - '--max_draft_len=5', *ifb_args) + *ifb_args) + build_engine( + str(fp16_ckpt_dir), + str(engine_dir / 'fp16-plugin-packed-paged-draft-tokens' / tp_pp_dir), + '--max_draft_len=5', + '--speculative_decoding_mode=draft_tokens_external', *ifb_args) build_engine(str(fp16_ckpt_dir), str(engine_dir / 'fp16-plugin-packed-paged-in128' / tp_pp_dir), *ifb_args, max_input_len=128) + # Build the target 
model with return accepted token logits + # Build with '--max_draft_len', '--speculative_decoding_mode' and '--gather_generation_logits' + build_engine( + str(fp16_ckpt_dir), + str(engine_dir / + 'fp16-plugin-packed-paged-return-accepted-tokens-logits' / + tp_pp_dir), '--max_draft_len=5', + '--speculative_decoding_mode=draft_tokens_external', + '--gather_generation_logits', *ifb_args) + # We build almost the same engine twice. But this engine has gather_all_token_logits # to extract logits from python runtime and uses context FMHA for generation to match draft model executions, # which uses context FMHA for draft tokens prediction. @@ -192,7 +207,8 @@ def build_engines(model_cache: Optional[str] = None, world_size: int = 1): str(fp16_ckpt_dir), str(engine_dir / 'fp16-plugin-packed-paged-context-fmha-for-gen' / tp_pp_dir), '--use_context_fmha_for_generation=enable', - '--max_draft_len=5', *ifb_args) + '--max_draft_len=5', + '--speculative_decoding_mode=draft_tokens_external', *ifb_args) # build engine with lora enabled build_engine(str(fp16_ckpt_dir), diff --git a/cpp/tests/resources/scripts/build_gptj_engines.py b/cpp/tests/resources/scripts/build_gptj_engines.py index 35b89c28c..c5ecad98d 100755 --- a/cpp/tests/resources/scripts/build_gptj_engines.py +++ b/cpp/tests/resources/scripts/build_gptj_engines.py @@ -32,7 +32,7 @@ def get_ckpt_without_quatization(model_dir, output_dir): run_command(build_args) -def get_ckpt_with_ammo_quant(model_dir, output_dir): +def get_ckpt_with_modelopt_quant(model_dir, output_dir): build_args = [_sys.executable, "examples/quantization/quantize.py"] + [ '--model_dir={}'.format(model_dir), '--output_dir={}'.format(output_dir), '--qformat=fp8', @@ -121,9 +121,9 @@ def build_engines(model_cache: _tp.Optional[str] = None, only_fp8=False): ) # TODO: use dummy scales atm; to re-enable when data is uploaded to the model cache # quantized_fp8_model_arg = '--quantized_fp8_model_path=' + \ - # str(_pl.Path(model_cache) / 'fp8-quantized-ammo' / 'gptj_tp1_rank0.npz') + # str(_pl.Path(model_cache) / 'fp8-quantized-modelopt' / 'gptj_tp1_rank0.npz') fp8_ckpt_path = engine_dir / 'fp8' / tp_pp_dir - get_ckpt_with_ammo_quant(hf_dir, fp8_ckpt_path) + get_ckpt_with_modelopt_quant(hf_dir, fp8_ckpt_path) build_engine(fp8_ckpt_path, engine_dir / 'fp8-plugin' / tp_pp_dir, '--gpt_attention_plugin=float16', '--paged_kv_cache=enable', '--remove_input_padding=enable', diff --git a/cpp/tests/resources/scripts/build_medusa_engines.py b/cpp/tests/resources/scripts/build_medusa_engines.py index 02ee8dca0..bb26ba709 100755 --- a/cpp/tests/resources/scripts/build_medusa_engines.py +++ b/cpp/tests/resources/scripts/build_medusa_engines.py @@ -36,10 +36,15 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path, build_args = ["trtllm-build"] + ( ['--checkpoint_dir', str(engine_dir)] if engine_dir else []) + [ '--output_dir', - str(engine_dir), '--gpt_attention_plugin=float16', - '--gemm_plugin=float16', '--max_batch_size=8', - '--max_input_len=512', '--max_output_len=20', '--log_level=error', - '--paged_kv_cache=enable', '--remove_input_padding=enable' + str(engine_dir), + '--gemm_plugin=float16', + '--max_batch_size=8', + '--max_input_len=512', + '--max_output_len=20', + '--log_level=error', + '--paged_kv_cache=enable', + '--remove_input_padding=enable', + '--speculative_decoding_mode=medusa', ] run_command(build_args) diff --git a/cpp/tests/resources/scripts/build_recurrentgemma_engines.py b/cpp/tests/resources/scripts/build_recurrentgemma_engines.py new file mode 100644 index 
000000000..21c10ef3f --- /dev/null +++ b/cpp/tests/resources/scripts/build_recurrentgemma_engines.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse as _arg +import os as _os +import pathlib as _pl +import platform as _pf +import sys as _sys +import typing as _tp + +from build_engines_utils import run_command, wincopy + + +def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path, + *args): + convert_args = [ + _sys.executable, "examples/recurrentgemma/convert_checkpoint.py" + ] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [ + '--output_dir', + str(ckpt_dir), + '--ckpt_type=hf', + '--dtype=float16', + ] + run_command(convert_args) + build_args = ["trtllm-build"] + ['--checkpoint_dir', + str(ckpt_dir)] + [ + '--output_dir', + str(engine_dir), + '--gpt_attention_plugin=float16', + '--paged_kv_cache=enable', + '--gemm_plugin=float16', + '--max_batch_size=8', + '--max_input_len=924', + '--max_output_len=100', + '--max_beam_width=1', + ] + list(args) + run_command(build_args) + + +def build_engines(model_cache: _tp.Optional[str] = None): + resources_dir = _pl.Path(__file__).parent.resolve().parent + models_dir = resources_dir / 'models' + model_name = 'recurrentgemma-2b' + hf_dir = models_dir / model_name + + # Clone or update the model directory without lfs + if model_cache: + print("Copy model from model_cache") + model_cache_dir = _pl.Path(model_cache) / 'recurrentgemma' / model_name + print(model_cache_dir) + assert (model_cache_dir.is_dir()) + if _pf.system() == "Windows": + wincopy(source=str(model_cache_dir), + dest=model_name, + isdir=True, + cwd=models_dir) + else: + run_command( + ["rsync", "-av", str(model_cache_dir), "."], cwd=models_dir) + else: + if not hf_dir.is_dir(): + if _pf.system() == "Windows": + url_prefix = "" + else: + url_prefix = "file://" + model_url = "https://huggingface.co/google/recurrentgemma-2b" + run_command([ + "git", "clone", model_url, "--single-branch", "--no-local", + model_name + ], + cwd=models_dir, + env={ + **_os.environ, "GIT_LFS_SKIP_SMUDGE": "1" + }) + + assert (hf_dir.is_dir()) + + # Download the model file + model_file_name = "*" + if not model_cache: + run_command(["git", "lfs", "pull", "--include", model_file_name], + cwd=hf_dir) + + tp_size = 1 + pp_size = 1 + tp_pp_dir = f"tp{tp_size}-pp{pp_size}-gpu" + + ckpt_dir = models_dir / 'rt_ckpt' / model_name + engine_dir = models_dir / 'rt_engine' / model_name + + python_exe = _sys.executable + run_command([python_exe, "-m", "pip", "install", "transformers>=4.40.0"], + env=_os.environ, + timeout=300) + + print("\nBuilding fp16-plugin-packed-paged engine") + build_engine(hf_dir, ckpt_dir / 'fp16-plugin-packed-paged' / tp_pp_dir, + engine_dir / 'fp16-plugin-packed-paged' / tp_pp_dir, + '--remove_input_padding=enable', '--paged_state=enable') + + # Restore transformers 
version + run_command([python_exe, "-m", "pip", "uninstall", "transformers", "-y"], + env=_os.environ, + timeout=300) + run_command([python_exe, "-m", "pip", "install", "transformers==4.38.2"], + env=_os.environ, + timeout=300) + + print("Done.") + + +if __name__ == "__main__": + parser = _arg.ArgumentParser() + parser.add_argument("--model_cache", + type=str, + help="Directory where models are stored") + + build_engines(**vars(parser.parse_args())) diff --git a/cpp/tests/resources/scripts/generate_expected_chatglm_output.py b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py index e1a6c625a..ad6203a11 100755 --- a/cpp/tests/resources/scripts/generate_expected_chatglm_output.py +++ b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py @@ -61,7 +61,7 @@ def generate_output( str(max_output_len), '--num_beams', str(num_beams), - #'--use_py_session', + '--use_py_session', ] output_logits_npy = None diff --git a/cpp/tests/resources/scripts/generate_expected_gpt_output.py b/cpp/tests/resources/scripts/generate_expected_gpt_output.py index ee48ef242..642f20737 100755 --- a/cpp/tests/resources/scripts/generate_expected_gpt_output.py +++ b/cpp/tests/resources/scripts/generate_expected_gpt_output.py @@ -110,7 +110,9 @@ def generate_outputs(num_beams): num_beams=num_beams, input_name='input_tokens', output_name='output_tokens_fp16_plugin_packed_paged_gather', - output_logits=True) + output_logits=True, + output_log_probs=True, + output_cum_log_probs=True) generate_output( engine='fp16-plugin-packed-paged-context-fmha-for-gen', num_beams=num_beams, diff --git a/cpp/tests/resources/scripts/generate_expected_recurrentgemma_output.py b/cpp/tests/resources/scripts/generate_expected_recurrentgemma_output.py new file mode 100644 index 000000000..82b8f2bef --- /dev/null +++ b/cpp/tests/resources/scripts/generate_expected_recurrentgemma_output.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path + +import run + + +def generate_output(engine: str, + num_beams: int, + input_name: str, + output_name: str, + max_output_len: int = 8, + output_logits: bool = False): + tp_size = 1 + pp_size = 1 + model = 'recurrentgemma-2b' + resources_dir = Path(__file__).parent.resolve().parent + models_dir = resources_dir / 'models' + tp_pp_dir = 'tp' + str(tp_size) + '-pp' + str(pp_size) + '-gpu/' + engine_dir = models_dir / 'rt_engine' / model / engine / tp_pp_dir + + data_dir = resources_dir / 'data' + input_file = data_dir / (input_name + '.npy') + model_data_dir = data_dir / model + if num_beams <= 1: + output_dir = model_data_dir / 'sampling' + else: + output_dir = model_data_dir / ('beam_search_' + str(num_beams)) + + output_name += '_tp' + str(tp_size) + '_pp' + str(pp_size) + + output_logits_npy = None + if output_logits: + output_logits_npy = str(output_dir / (output_name + '_logits' + '.npy')) + + args = run.parse_arguments([ + '--engine_dir', + str(engine_dir), '--input_file', + str(input_file), '--tokenizer_dir', + str(models_dir / model), '--output_npy', + str(output_dir / (output_name + '.npy')), '--output_csv', + str(output_dir / (output_name + '.csv')), '--max_output_len', + str(max_output_len), '--num_beams', + str(num_beams), '--output_logits_npy', + str(output_logits_npy), '--use_py_session' + ]) + run.main(args) + + +def generate_outputs(num_beams): + print('Generating RecurrentGemma FP16-plugin-packed-paged outputs') + generate_output(engine='fp16-plugin-packed-paged', + num_beams=num_beams, + input_name='input_tokens', + output_name='output_tokens_fp16_plugin_packed_paged') + + +if __name__ == '__main__': + generate_outputs(num_beams=1) diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 7841f0bfd..80fa6d150 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -98,6 +98,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, run_chatglm=False, run_medusa=False, run_mamba=False, + run_recurrentgemma=False, run_encoder=False, run_fp8=False, only_multi_gpu=False, @@ -137,6 +138,20 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, env=_os.environ, timeout=300) + if run_recurrentgemma: + run_command([ + "git", "clone", + "https://github.com/google-deepmind/recurrentgemma.git" + ], + cwd=root_dir, + env=_os.environ, + timeout=300) + run_command( + [python_exe, "-m", "pip", "install", "./recurrentgemma[full]"], + cwd=root_dir, + env=_os.environ, + timeout=300) + build_dir = build_dir if build_dir.is_absolute() else root_dir / build_dir resources_dir = _pl.Path("cpp") / "tests" / "resources" @@ -182,6 +197,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, run_chatglm=run_chatglm, run_medusa=run_medusa, run_mamba=run_mamba, + run_recurrentgemma=run_recurrentgemma, run_encoder=run_encoder, run_fp8=run_fp8) @@ -195,6 +211,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, run_chatglm=run_chatglm, run_medusa=run_medusa, run_mamba=run_mamba, + run_recurrentgemma=run_recurrentgemma, run_encoder=run_encoder, run_fp8=run_fp8, timeout=test_timeout) @@ -229,6 +246,7 @@ def prepare_all_model_tests(python_exe: str, run_chatglm=False, run_medusa=False, run_mamba=False, + run_recurrentgemma=False, run_encoder=False, run_fp8=False): model_cache_arg = ["--model_cache", model_cache] if model_cache else [] @@ -295,6 +313,15 @@ def prepare_all_model_tests(python_exe: str, else: _log.info("Skipping Mamba tests") + if 
run_recurrentgemma: + prepare_model_tests(model_name="recurrentgemma", + python_exe=python_exe, + root_dir=root_dir, + resources_dir=resources_dir, + model_cache_arg=model_cache_arg) + else: + _log.info("Skipping RecurrentGemma tests") + if run_encoder: prepare_model_tests(model_name="enc_dec", python_exe=python_exe, @@ -376,6 +403,7 @@ def run_unit_tests(build_dir: _pl.Path, timeout=1800): excluded_tests.append("ChatGlm") excluded_tests.append("Medusa") excluded_tests.append("Mamba") + excluded_tests.append("RecurrentGemma") excluded_tests.append("Encoder") ctest.extend(["-E", "|".join(excluded_tests)]) run_command(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) @@ -388,6 +416,7 @@ def run_single_gpu_tests(build_dir: _pl.Path, run_chatglm, run_medusa, run_mamba, + run_recurrentgemma, run_encoder, run_fp8, timeout=3600): @@ -412,6 +441,8 @@ def run_single_gpu_tests(build_dir: _pl.Path, included_tests.append("Medusa") if run_mamba: included_tests.append("Mamba") + if run_recurrentgemma: + included_tests.append("RecurrentGemma") if run_encoder: included_tests.append("EncoderModelTestSingleGPU") @@ -613,6 +644,9 @@ def run_benchmarks(python_exe: str, root_dir: _pl.Path, build_dir: _pl.Path, parser.add_argument("--run_mamba", action="store_true", help="Run the tests for Mamba") + parser.add_argument("--run_recurrentgemma", + action="store_true", + help="Run the tests for RecurrentGemma") parser.add_argument("--run_encoder", action="store_true", help="Run the tests for BART encoder") @@ -637,6 +671,7 @@ def run_benchmarks(python_exe: str, root_dir: _pl.Path, build_dir: _pl.Path, args.run_llama = True args.run_chatglm = True args.run_mamba = True + args.run_recurrentgemma = True args.run_encoder = True del args.run_all_models diff --git a/cpp/tests/runtime/loraCacheTest.cpp b/cpp/tests/runtime/loraCacheTest.cpp index 6fd4196ea..7ec7142b0 100644 --- a/cpp/tests/runtime/loraCacheTest.cpp +++ b/cpp/tests/runtime/loraCacheTest.cpp @@ -23,11 +23,14 @@ #include "tensorrt_llm/runtime/loraUtils.h" #include "tensorrt_llm/runtime/utils/numpyUtils.h" #include "tensorrt_llm/runtime/worldConfig.h" -#include -#include + +#include + #include #include #include + +#include #include namespace fs = std::filesystem; diff --git a/cpp/tests/runtime/loraUtilsTest.cpp b/cpp/tests/runtime/loraUtilsTest.cpp index dc882df3e..b6cdd15f8 100644 --- a/cpp/tests/runtime/loraUtilsTest.cpp +++ b/cpp/tests/runtime/loraUtilsTest.cpp @@ -24,7 +24,8 @@ #include "tensorrt_llm/runtime/loraUtils.h" #include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" -#include + +#include #include #include diff --git a/cpp/tests/runtime/medusaModuleTest.cpp b/cpp/tests/runtime/medusaModuleTest.cpp index 63a036563..6c08ba948 100644 --- a/cpp/tests/runtime/medusaModuleTest.cpp +++ b/cpp/tests/runtime/medusaModuleTest.cpp @@ -18,15 +18,14 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/runtime/modelConfig.h" -#include "tensorrt_llm/runtime/worldConfig.h" -#include -#include + +#include + #include #include #include -#include -#include + +#include namespace tensorrt_llm::runtime { diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 7f256b79b..6627cd451 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,6 +1,6 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=24.02-py3 +ARG BASE_TAG=24.03-py3 ARG DEVEL_IMAGE=devel FROM 
${BASE_IMAGE}:${BASE_TAG} as base @@ -48,7 +48,9 @@ RUN bash ./install_mpi4py.sh && rm install_mpi4py.sh # Install PyTorch ARG TORCH_INSTALL_TYPE="skip" COPY docker/common/install_pytorch.sh install_pytorch.sh -RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh +# Apply PyTorch patch for supporting compiling with CUDA 12.4 from source codes +COPY docker/common/pytorch_pr_116072.patch /tmp/pytorch_pr_116072.patch +RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh /tmp/pytorch_pr_116072.patch FROM ${DEVEL_IMAGE} as wheel WORKDIR /src/tensorrt_llm diff --git a/docker/Makefile b/docker/Makefile index e219a9df2..0157ce695 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -145,21 +145,12 @@ jenkins-aarch64_%: STAGE = devel centos7_%: IMAGE_WITH_TAG = $(shell grep 'LLM_CENTOS7_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"') centos7_%: STAGE = devel centos7_%: BASE_IMAGE = nvidia/cuda -centos7_%: BASE_TAG = 12.3.2-devel-centos7 +centos7_%: BASE_TAG = 12.4.0-devel-centos7 # For x86_64 and aarch64 ubuntu22_%: STAGE = devel ubuntu22_%: BASE_IMAGE = nvidia/cuda -ubuntu22_%: BASE_TAG = 12.3.2-devel-ubuntu22.04 - -# For x86_64 -old-cuda_%: IMAGE_WITH_TAG = $(shell grep 'LLM_OLD_CUDA_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"') -old-cuda_%: BASE_TAG = 23.07-py3 -old-cuda_%: STAGE = devel -old-cuda_%: CUDA_VERSION = 12.1 -old-cuda_%: CUDNN_VERSION = 8.9.3.28-1+cuda12.1 -old-cuda_%: NCCL_VERSION = 2.18.3-1+cuda12.1 -old-cuda_%: CUBLAS_VERSION = 12.1.3.1-1 +ubuntu22_%: BASE_TAG = 12.4.0-devel-ubuntu22.04 trtllm_%: STAGE = release trtllm_%: PUSH_TO_STAGING := 0 diff --git a/docker/common/install_polygraphy.sh b/docker/common/install_polygraphy.sh index f3ae75a18..922f7430a 100644 --- a/docker/common/install_polygraphy.sh +++ b/docker/common/install_polygraphy.sh @@ -2,4 +2,4 @@ set -ex -pip3 install polygraphy==0.49.0 +pip3 install polygraphy==0.49.9 diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 1536bd514..0a42ae444 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,8 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html#rel-24-02 -TORCH_VERSION="2.2.1" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-03.html#rel-24-03 +TORCH_VERSION="2.2.2" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { @@ -42,6 +42,8 @@ install_from_source() { cd /tmp git clone --depth 1 --branch v$TORCH_VERSION https://github.com/pytorch/pytorch cd pytorch + # Apply PyTorch patch for supporting compiling with CUDA 12.4 from source codes. 
+ git apply /tmp/pytorch_pr_116072.patch git submodule sync && git submodule update --init --recursive pip3 install -r requirements.txt python3 setup.py install diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index 20ea5c62c..991b7d726 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,16 +2,17 @@ set -ex -# Use https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html#rel-24-02 -TRT_VER="9.3.0.1" -CUDA_VER="12.3" +TRT_VER="10.0.1.6" +# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-03.html#rel-24-03 +CUDA_VER="12.4" # 12.4.0 +# cuDNN v8 is still needed by PyTorch v2.2.2 CUDNN_VER="8.9.7.29-1+cuda12.2" -# v2.19.4 doesn't exist in https://developer.download.nvidia.cn/compute/cuda/repos/ -NCCL_VER="2.19.3-1+cuda12.3" -CUBLAS_VER="12.3.4.1-1" -# Align with the pre-installed CUDA / NVCC version. -# https://docs.nvidia.com/cuda/archive/12.3.2/cuda-toolkit-release-notes/index.html -NVRTC_VER="12.3.107-1" +NCCL_VER="2.20.5-1+cuda12.4" +CUBLAS_VER="12.4.2.65-1" +# Align with the pre-installed CUDA / NVCC / NVRTC versions from +# https://docs.nvidia.com/cuda/archive/12.4.0/cuda-toolkit-release-notes/index.html +NVRTC_VER="12.4.99-1" for i in "$@"; do case $i in @@ -77,7 +78,7 @@ install_centos_requirements() { install_tensorrt() { PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") - TRT_CUDA_VERSION="12.2" + TRT_CUDA_VERSION="12.4" if [ -z "$RELEASE_URL_TRT" ];then ARCH=${TRT_TARGETARCH} @@ -86,7 +87,8 @@ install_tensorrt() { if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi - RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-${TRT_VER}.${OS}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz; + RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz + fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ diff --git a/docker/common/pytorch_pr_116072.patch b/docker/common/pytorch_pr_116072.patch new file mode 100644 index 000000000..05485d3e4 --- /dev/null +++ b/docker/common/pytorch_pr_116072.patch @@ -0,0 +1,31 @@ +From 2a440348958b3f0a2b09458bd76fe5959b371c0c Mon Sep 17 00:00:00 2001 +From: eqy +Date: Tue, 19 Dec 2023 05:56:48 +0000 +Subject: [PATCH] [CUDA] Include `` in `LinearAlgebra.cu` + (#116072) + +Fixes build against the latest `NVIDIA/cccl`. 
+ +CC @malfet @xwang233 @ptrblck + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/116072 +Approved by: https://github.com/malfet, https://github.com/xwang233 +--- + aten/src/ATen/native/cuda/LinearAlgebra.cu | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu +index fb59f976041..a6a566a5de2 100644 +--- a/aten/src/ATen/native/cuda/LinearAlgebra.cu ++++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu +@@ -9,6 +9,8 @@ + #include + #include + ++#include ++ + namespace at::native { + + namespace { +-- +2.34.1 diff --git a/docs/source/architecture/checkpoint.md b/docs/source/architecture/checkpoint.md index 5044c4bbb..dc269c5f3 100644 --- a/docs/source/architecture/checkpoint.md +++ b/docs/source/architecture/checkpoint.md @@ -15,7 +15,7 @@ NeMo ------------- | HuggingFace ------ | convert build load -AMMO ------------- ----------> TensorRT-LLM Checkpoint --------> TensorRT Engine ------> TensorRT-LLM ModelRunner +Modelopt --------- ----------> TensorRT-LLM Checkpoint --------> TensorRT Engine ------> TensorRT-LLM ModelRunner | JAX -------------- | @@ -27,7 +27,7 @@ DeepSpeed -------- TensorRT-LLM aims at supporting different sources: 1. Trained models from NVIDIA NeMo, Microsoft DeepSpeed, and JAX -2. Quantized models from NVIDIA AMMO +2. Quantized models from NVIDIA Modelopt 3. Popular models from HuggingFace TensorRT-LLM defines its own checkpoint format. A checkpoint directory includes: diff --git a/docs/source/architecture/workflow.md b/docs/source/architecture/workflow.md index 1d1a3a096..d4cbea643 100644 --- a/docs/source/architecture/workflow.md +++ b/docs/source/architecture/workflow.md @@ -91,14 +91,14 @@ Though there are some limitations and pitfalls of doing these custom weights loa ## Quantization APIs -TensorRT-LLM relies on NVIDIA AMMO toolkit to support some of the quantization like: FP8, W4A16_AWQ, W4A8_AWQ, while it also has some its own quantization implementation for Smooth Quant, INT8 KV cache, and INT4/INT8 weight only. +TensorRT-LLM relies on NVIDIA Modelopt toolkit to support some of the quantization like: FP8, W4A16_AWQ, W4A8_AWQ, while it also has some its own quantization implementation for Smooth Quant, INT8 KV cache, and INT4/INT8 weight only. In TensorRT-LLM 0.8 version: -* For AMMO-supported quantization algorithms, a standalone script in the example folder [quantize.py](../../examples/quantization/quantize.py) shall be executed to export TensorRT-LLM checkpoints, and the trtllm-build command needs to be executed to build the checkpoints to engines. +* For Modelopt-supported quantization algorithms, a standalone script in the example folder [quantize.py](../../examples/quantization/quantize.py) shall be executed to export TensorRT-LLM checkpoints, and the trtllm-build command needs to be executed to build the checkpoints to engines. -* For the non-AMMO quantization algorithms, users need to use the per-model convert_checkpoint.py scripts to export TensorRT-LLM checkpoints. +* For the non-Modelopt quantization algorithms, users need to use the per-model convert_checkpoint.py scripts to export TensorRT-LLM checkpoints. Use the `quantize()` interface to unify the different quantization flows. The default implementation is added in the `PretrainedModel` class. 
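For reference, the two 0.8-style command-line flows listed above roughly correspond to the sketch below. The model path, output directories, dtype, and calibration size are illustrative placeholders rather than values taken from this repository; the flag spellings mirror the quantization examples elsewhere in these docs.

```bash
# Modelopt-supported algorithm (FP8): export a TensorRT-LLM checkpoint with the
# standalone quantization script, then build engines with trtllm-build.
# All paths below are hypothetical placeholders.
python examples/quantization/quantize.py --model_dir ./hf_models/my-model \
                                         --dtype float16 \
                                         --qformat fp8 \
                                         --kv_cache_dtype fp8 \
                                         --calib_size 512 \
                                         --output_dir ./tllm_ckpt/my-model-fp8

# Build TensorRT engines from the exported checkpoint.
trtllm-build --checkpoint_dir ./tllm_ckpt/my-model-fp8 \
             --output_dir ./trt_engines/my-model-fp8 \
             --gemm_plugin float16 \
             --strongly_typed
```

For the non-Modelopt algorithms, the per-model `convert_checkpoint.py` replaces the `quantize.py` step, while the `trtllm-build` step stays the same.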
@@ -112,14 +112,14 @@ class PretrainedModel: output_dir, quant_config: QuantConfig, mapping: Optional[Mapping] = None): #some args are omitted here - # Internally quantize the given hugging face models using AMMO + # Internally quantize the given hugging face models using Modelopt # and save the checkpoint to output_dir ``` ```{note} -* The default implementation only handles the AMMO supported quantization. The LLaMA class then inherits this `PretrainedModel` and dispatches the AMMO quantization to the super class's default implementation. -* The model developer raises errors in the sub-class implementation if the new model is not supported by AMMO yet. +* The default implementation only handles the Modelopt supported quantization. The LLaMA class then inherits this `PretrainedModel` and dispatches the Modelopt quantization to the super class's default implementation. +* The model developer raises errors in the sub-class implementation if the new model is not supported by Modelopt yet. ```python @@ -131,8 +131,8 @@ class LLaMAForCausalLM: output_dir, quant_config: QuantiConfig, mapping: Optional[Mapping] = None): #some args are omitted here - use_ammo_quantization = ... # determine if to use AMMO or use native - if use_ammo_quantization: + use_modelopt_quantization = ... # determine if to use Modelopt or use native + if use_modelopt_quantization: super().quantize(hf_model_dir, output_dir, quant_config) diff --git a/docs/source/installation/build-from-source-windows.md b/docs/source/installation/build-from-source-windows.md index 76a793957..9435ae54c 100644 --- a/docs/source/installation/build-from-source-windows.md +++ b/docs/source/installation/build-from-source-windows.md @@ -10,7 +10,7 @@ This section is for advanced users. Skip this section if you plan to use the pre 1. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path. 2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). -3. Download and unzip [TensorRT 9.3.0.1 for TensorRT-LLM](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.windows10.win10.cuda-12.2.llm.beta.zip). +3. Download and unzip [TensorRT 10.0.1.6](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip). ## Building a TensorRT-LLM Docker Image @@ -35,7 +35,13 @@ After building, copy the files out of your container. `docker cp` is not support ### Acquire an Image -The Docker container will be hosted for public download in a future release. At this time, it must be built manually. Refer to [windows/docker/README.md](/windows/docker/README.md) for the image build instructions. +The Docker container will be hosted for public download in a future release. At this time, it must be built manually. From the `TensorRT-LLM\windows\` folder, run the build command: + +```bash +docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest . +``` + +And your image is now ready for use. ### Run the Container @@ -58,7 +64,7 @@ git submodule update --init --recursive 2. Build TensorRT-LLM. This command generates `build\tensorrt_llm-*.whl`. ```bash -python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-9.2.0.5\ +python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-10.0.1.6\ ``` 3. Copy or move `build\tensorrt_llm-*.whl` into your mounted folder so it can be accessed on your host machine. 
If you intend to use the C++ runtime, you'll also need to gather various DLLs from the build into your mounted folder. For more information, refer to [C++ Runtime Usage](#c-runtime-usage). @@ -95,7 +101,7 @@ python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-9. 1. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path. 2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). When prompted to select more Workloads, check **Desktop development with C++**. - 3. Download and unzip [TensorRT 9.2.0.5 for TensorRT-LLM](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.windows10.x86_64.cuda-12.2.llm.beta.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`. + 3. Download and unzip [TensorRT 10.0.1.6](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`. 1. Add the libraries for TensorRT to your system's `Path` environment variable. Your `Path` should include a line like this: diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md index 842dd2af8..096d5573a 100644 --- a/docs/source/installation/linux.md +++ b/docs/source/installation/linux.md @@ -2,13 +2,18 @@ # Installing on Linux -1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit). -2. Install TensorRT-LLM. +1. Retrieve and launch the docker container (optional). + + You can pre-install the environment using the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit) to avoid manual environment configuration. ```bash # Obtain and start the basic docker image environment (optional). docker run --rm --runtime=nvidia --gpus all --entrypoint /bin/bash -it nvidia/cuda:12.1.0-devel-ubuntu22.04 + ``` +2. Install TensorRT-LLM. + + ```bash # Install dependencies, TensorRT-LLM requires Python 3.10 apt-get update && apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev git @@ -21,7 +26,7 @@ python3 -c "import tensorrt_llm" ``` -3. Install the requirements inside the Docker container. +3. Install the requirements for running the example. ```bash git clone https://github.com/NVIDIA/TensorRT-LLM.git diff --git a/docs/source/installation/windows.md b/docs/source/installation/windows.md index 393c9fb77..94a8a76fb 100644 --- a/docs/source/installation/windows.md +++ b/docs/source/installation/windows.md @@ -12,11 +12,22 @@ The Windows release of TensorRT-LLM is currently in beta. We recommend using the 2. Install the dependencies one of two ways: - 1. Run the provided PowerShell script; `setup_env.ps1`, which installs Python, CUDA 12.2, and Microsoft MPI automatically with default settings. Run PowerShell as Administrator to use the script. + 1. Install all dependencies together. + + 1. Run the provided PowerShell script `setup_env.ps1` located under `/windows/` folder which installs Python, CUDA 12.4 and cuDNN automatically with default settings. Run PowerShell as Administrator to use the script. Note that cuDNN is installed in the current working directory in which the script is launched. + + ```bash + ./setup_env.ps1 [-skipCUDA] [-skipPython] [-skipCUDNN] + ``` + + 2. 
Add cuDNN to your system's `Path` environment variable by executing + + ```powershell + [Environment]::SetEnvironmentVariable('Path', $Env:Path + ';' + $Env:CUDNN, [EnvironmentVariableTarget]::Machine) + ``` + + and closing followed by re-opening any existing PowerShell or Git Bash windows so they pick up the new `Path`. - ```bash - ./setup_env.ps1 [-skipCUDA] [-skipPython] [-skipMPI] - ``` 2. Install the dependencies one at a time. @@ -25,26 +36,26 @@ The Windows release of TensorRT-LLM is currently in beta. We recommend using the 1. Select **Add python.exe to PATH** at the start of the installation. The installation may only add the `python` command, but not the `python3` command. 2. Navigate to the installation path `%USERPROFILE%\AppData\Local\Programs\Python\Python310` (`AppData` is a hidden folder) and copy `python.exe` to `python3.exe`. - 3. Install [CUDA 12.2 Toolkit](https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart. + 1. Install [CUDA 12.4 Toolkit](https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart. - 4. Download and install [Microsoft MPI](https://www.microsoft.com/en-us/download/details.aspx?id=57467). You will be prompted to choose between an `exe`, which installs the MPI executable, and an `msi`, which installs the MPI SDK. Download and install both. + 2. [Optional] Download and install [Microsoft MPI](https://www.microsoft.com/en-us/download/details.aspx?id=57467). You will be prompted to choose between an `exe`, which installs the MPI executable, and an `msi`, which installs the MPI SDK. Download and install both. -3. Download and unzip [cuDNN](https://developer.nvidia.com/cudnn). + 3. Download and unzip [cuDNN](https://developer.nvidia.com/cudnn). - 1. Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\cuDNN`. - 2. Add the libraries and binaries for cuDNN to your system's `Path` environment variable. + 1. Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\cuDNN`. + 2. Add the libraries and binaries for cuDNN to your system's `Path` environment variable. - 1. Click the Windows button and search for *environment variables*. - 2. Click **Edit the system environment variables** > **Environment Variables**. - 3. In the new window under *System variables*, click **Path** > **Edit**. Add **New** lines for the `bin` and `lib` directories of cuDNN. Your `Path` should include lines like this: + 1. Click the Windows button and search for *environment variables*. + 2. Click **Edit the system environment variables** > **Environment Variables**. + 3. In the new window under *System variables*, click **Path** > **Edit**. Add **New** lines for the `bin` and `lib` directories of cuDNN. Your `Path` should include lines like this: - ```bash - %USERPROFILE%\inference\cuDNN\bin - %SERPROFILE%\inference\cuDNN\lib - ``` + ```bash + %USERPROFILE%\inference\cuDNN\bin + %SERPROFILE%\inference\cuDNN\lib + ``` - 4. Click **OK** on all the open dialog windows. - 5. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. + 4. Click **OK** on all the open dialog windows. + 5. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. 
**Steps** diff --git a/docs/source/reference/precision.md b/docs/source/reference/precision.md index f8e6cb665..2e29bd35b 100644 --- a/docs/source/reference/precision.md +++ b/docs/source/reference/precision.md @@ -121,13 +121,14 @@ This release of TensorRT-LLM contains the following examples: | Baichuan | Y | Y | Y | Y | Y | Y | Y | Y | Y | | BERT | Y | Y | Y | . | . | . | . | . | . | | BLIP-2 | Y | Y | Y | . | . | . | . | . | . | -| BLOOM | Y | Y | Y | . | Y | Y | Y | . | . | +| BLOOM | Y | Y | Y | Y | Y | Y | Y | . | . | | ChatGLM | Y | Y | Y | . | . | . | . | . | . | | ChatGLM-v2 | Y | Y | Y | . | . | . | . | . | . | | ChatGLM-v3 | Y | Y | Y | . | . | . | . | . | . | | DBRX | Y | Y | Y | . | . | Y | Y | . | . | | Falcon | Y | Y | Y | Y | . | Y | Y | Y | . | | Flan-T5 | Y | Y | Y | . | . | . | . | . | . | +| Gemma | Y | Y | Y | Y | Y | Y | Y | Y | . | | GPT | Y | Y | Y | Y | Y | Y | Y | . | . | | GPT-J | Y | Y | Y | Y | Y | Y | Y | Y | . | | GPT-NeMo | Y | Y | Y | . | . | . | . | . | . | diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index adc592520..9c877db55 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -56,11 +56,12 @@ The following table shows the supported software for TensorRT-LLM. - Volta (SM70) - FP32, FP16, INT8(1), INT4(2) * - Models - + - [Arctic](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/arctic) - [Baichuan](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/baichuan) - [BART](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec) - [BERT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/bert) - - [Blip2](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [BLOOM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/bloom) + - [ByT5](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec) - [ChatGLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/chatglm) - [DBRX](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/dbrx) - [FairSeq NMT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec) @@ -80,7 +81,7 @@ The following table shows the supported software for TensorRT-LLM. - [MPT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/mpt) - [mT5](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec) - [OPT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/opt) - - [Phi-1.5/Phi-2](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/phi) + - [Phi-1.5/Phi-2/Phi-3](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/phi) - [Qwen](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/qwen) - [Qwen-VL](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/qwenvl) - [Replit Code](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/mpt) @@ -95,8 +96,12 @@ The following table shows the supported software for TensorRT-LLM. 
- - [BLIP2 w/ OPT-2.7B](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [BLIP2 w/ T5-XL](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [CogVLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal)(6) + - [Deplot](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) + - [Fuyu](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [LLaVA-v1.5-7B](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) - [Nougat family](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) Nougat-small, Nougat-base + - [VILA](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal) ``` (1) INT8 SmoothQuant is not supported on SM70 and SM75.
@@ -104,6 +109,7 @@ The following table shows the supported software for TensorRT-LLM. (3) INT4 AWQ and GPTQ with FP8 activations require SM >= 89.
(4) [Encoder-Decoder](https://github.com/NVIDIA/TensorRT-LLM/tree/main/main/examples/enc_dec) provides general encoder-decoder functionality that supports many encoder-decoder models such as T5 family, BART family, Whisper family, NMT family, and so on. (5) Multi-modal provides general multi-modal functionality that supports many multi-modal architectures such as BLIP family, LLaVA family, and so on. +(6) Only supports bfloat16 precision. ```{note} diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index 0ab8195cf..ec450f564 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -5,6 +5,39 @@ All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/). + +## TensorRT-LLM Release Next + +### Announcements +- TensorRT-LLM supports TensorRT 10.0.1 and NVIDIA NGC 24.03 containers. + +### Key Features and Enhancements + +- Infrastructure features + - Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.03-py3`. + - Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.03-py3`. + - The dependent TensorRT version is updated to 10.0.1. + - The dependent CUDA version is updated to 12.4.0. + - The dependent PyTorch version is updated to 2.2.2. + +### API Changes + +- TBD + +### Model Updates + +- TBD + +### Limitations + +- TBD + +### Fixed Issues + +- TBD + + + ## TensorRT-LLM Release 0.9.0 ### Announcements - TensorRT-LLM requires TensorRT 9.3 and 24.02 containers. diff --git a/docs/source/speculative_decoding.md b/docs/source/speculative_decoding.md index a245d89c1..5359e2193 100644 --- a/docs/source/speculative_decoding.md +++ b/docs/source/speculative_decoding.md @@ -47,7 +47,7 @@ Configuring and executing the Draft model within the Inflight Fused Batching (IF follows the same procedure as for any other model within IFB. The `maxNewTokens` parameter should be set to the number of draft tokens in the `LlmRequest` for the Draft model query. -When building the Target model, it is necessary to specify the `--max_draft_len ` option to the `trtllm-build` command. +When building the Target model, it is necessary to specify the `--max_draft_len --speculative_decoding_mode draft_tokens_external` option to the `trtllm-build` command. During the Target model's inference phase in IFB, `maxNewTokens` should be set to `1`, and the draft tokens must be set in the `draftTokens` field of the `LlmRequest` for the Target model query. diff --git a/examples/arctic/README.md b/examples/arctic/README.md new file mode 100644 index 000000000..af24794ee --- /dev/null +++ b/examples/arctic/README.md @@ -0,0 +1,89 @@ +# Arctic + +This document shows how to build and run a [Arctic](https://huggingface.co/Snowflake/snowflake-arctic-instruct) model in TensorRT-LLM. + +The TensorRT-LLM Arctic implementation is based on the LLaMA model, with Mixture of Experts (MoE) enabled. The implementation can +be found in [llama/model.py](../../tensorrt_llm/models/llama/model.py). +See the LLaMA example [`examples/llama`](../llama) for details. 
+
+- [Arctic](#arctic)
+  - [Download model checkpoints](#download-model-checkpoints)
+  - [TensorRT-LLM workflow](#tensorrt-llm-workflow)
+    - [Apply FP8 PTQ](#apply-fp8-ptq)
+    - [Build TensorRT engine](#build-tensorrt-engine)
+    - [Run Engine](#run-engine)
+  - [OOTB](#ootb)
+
+## Download model checkpoints
+
+First, download the HuggingFace BF16 checkpoints of the Arctic model.
+
+**CAVEAT: this model is a very large Mixture-of-Experts (MoE) model, which has nearly 500B parameters and requires around 900GB of disk space. Please make sure you have enough space before proceeding.**
+
+```bash
+HF_MODEL="arctic"
+git clone https://huggingface.co/Snowflake/snowflake-arctic-instruct tmp/hf_checkpoints/${HF_MODEL}
+```
+
+## TensorRT-LLM workflow
+Next, we use the general quantization script `quantize.py` to convert the checkpoints to FP8 and build the model with `trtllm-build` across multiple GPUs. In the example below, we use Tensor Parallelism (TP) across 8 GPUs.
+
+**Note: for such a large model, Post-Training Quantization (PTQ) of the model weights is necessary to deploy it on a cluster node, e.g., 8xH100 GPUs. In this example, we demonstrate the FP8 quantization workflow, which is supported on Hopper and later GPU architectures. For PTQ methods other than FP8, please refer to the LLaMA or Mixtral examples.**
+
+Set environment variables and create the necessary directory:
+
+```bash
+PREC_RAW="bfloat16"
+PREC_QUANT="fp8"
+TP=8
+ENGINE="${HF_MODEL}_${PREC_QUANT}_tp${TP}"
+
+mkdir -p tmp/trt_engines
+```
+
+### Apply FP8 PTQ
+
+Notes:
+- quantize.py does not support Expert Parallelism (EP) mode yet. If EP is needed, use `../llama/convert_checkpoint.py` and specify `--moe_tp_mode 1` (1 for EP, 2 for TP) instead.
+- TensorRT-LLM uses static quantization methods, which are expected to be faster at runtime than dynamic quantization methods. This comes at the cost of an offline calibration step during quantization. `batch_size` and `calib_size` can be adjusted to shorten the calibration time. Please refer to ../quantization/README.md for details.
+- **Due to the large model size and the calibration step (which has to load the HuggingFace model and run forward passes), you will likely need more GPUs for the quantization step than for engine building and final deployment.
For example, using 16xH100 or 8xH200 for quantization & 8xH100 for deployment.**
+
+```bash
+python ../quantization/quantize.py --model_dir tmp/hf_checkpoints/${HF_MODEL} \
+                                   --dtype ${PREC_RAW} \
+                                   --qformat ${PREC_QUANT} \
+                                   --kv_cache_dtype ${PREC_QUANT} \
+                                   --output_dir tmp/tllm_checkpoints/${ENGINE} \
+                                   --batch_size 1 \
+                                   --calib_size 128 \
+                                   --tp_size ${TP} |& tee tmp/trt_engines/${ENGINE}_quantize.log
+```
+
+### Build TensorRT engine
+```bash
+# Enable fp8 context fmha to get further acceleration by setting `--use_fp8_context_fmha enable`
+# Use --workers to enable parallel build
+trtllm-build --checkpoint_dir ./tmp/tllm_checkpoints/${ENGINE} \
+             --output_dir ./tmp/trt_engines/${ENGINE} \
+             --gpt_attention_plugin ${PREC_RAW} \
+             --gemm_plugin ${PREC_RAW} \
+             --strongly_typed \
+             --workers ${TP} |& tee tmp/trt_engines/${ENGINE}_build.log
+```
+
+### Run Engine
+Test your engine with the [run.py](../run.py) script:
+
+```bash
+mpirun -n ${TP} --allow-run-as-root python ../run.py --engine_dir ./tmp/trt_engines/${ENGINE} --tokenizer_dir tmp/hf_checkpoints/${HF_MODEL} --max_output_len 20 --input_text "The future of AI is" |& tee tmp/trt_engines/${ENGINE}_run.log
+```
+
+For more examples, see [`examples/llama/README.md`](../llama/README.md).
+
+### OOTB
+
+Arctic supports OOTB operation without the plugin; however, this comes at a significant performance cost. Users should prefer the plugin path whenever possible.
diff --git a/examples/baichuan/README.md b/examples/baichuan/README.md
index 2ef3dcc03..37ef12218 100644
--- a/examples/baichuan/README.md
+++ b/examples/baichuan/README.md
@@ -137,9 +137,9 @@ python convert_checkpoint.py --model_version v1_13b \
 
 #### FP8 Post-Training Quantization
 
-The examples below uses the NVIDIA AMMO (AlgorithMic Model Optimization) toolkit for the model quantization process.
+The examples below use the NVIDIA Modelopt toolkit for the model quantization process.
 
-First make sure AMMO toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
+First make sure the Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
 
 ```bash
 # Quantize HF Baichuan v2 13B into FP8 and export a single-rank checkpoint
@@ -155,7 +155,7 @@ Note that you can enable fp8 context fmha to get further acceleration by setting
 
 #### Groupwise quantization (AWQ/GPTQ)
 ##### AWQ
-NVIDIA AMMO toolkit is used for AWQ weight quantization. Please see [examples/quantization/README.md](/examples/quantization/README.md#preparation) for AMMO installation instructions.
+The NVIDIA Modelopt toolkit is used for AWQ weight quantization. Please see [examples/quantization/README.md](/examples/quantization/README.md#preparation) for Modelopt installation instructions.
 ```bash
 # Quantize HF Baichuan 13B checkpoint into INT4 AWQ format
 python ../quantization/quantize.py --model_dir /code/model/Baichuan2-13B-Chat/ \
@@ -195,7 +195,7 @@ To run the GPTQ Baichuan example, the following steps are required:
 #### INT8 KV cache
 INT8 KV cache could be enabled to reduce memory footprint. It will bring more performance gains when batch size gets larger.
 
-You can get the INT8 scale of KV cache through NVIDIA AMMO (AlgorithMic Model Optimization) toolkit, which features a
+You can get the INT8 scale of the KV cache through the NVIDIA Modelopt toolkit, which features a
 `--kv_cache_dtype` option.
Example: diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index a203bafaa..35edb068e 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bloom/README.md b/examples/bloom/README.md index 13b040ee7..7571a89ab 100644 --- a/examples/bloom/README.md +++ b/examples/bloom/README.md @@ -12,7 +12,8 @@ This document shows how to build and run a BLOOM model in TensorRT-LLM on both s - [Build TensorRT engine(s)](#build-tensorrt-engines) - [INT8 weight only + INT8 KV cache](#int8-weight-only--int8-kv-cache) - [SmoothQuant](#smoothquant) - - [4. Run](#4-run) + - [FP8 Post-Training Quantization](#fp8-post-training-quantization) + - [Run](#run) ## Overview @@ -31,6 +32,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f * INT8 KV CACHE * Smooth Quant * Tensor Parallel + * FP8 and FP8 KV cache ## Usage @@ -188,7 +190,27 @@ Note that GPT attention plugin is required to be enabled for SmoothQuant for now Note we use `--bin_model_dir` instead of `--model_dir` since SmoothQuant model needs INT8 weights and various scales from the binary files. -### 4. Run +#### FP8 Post-Training Quantization + +``` +# Quantize HF Bloom 3B into FP8 and export trtllm checkpoint +python ../quantization/quantize.py --model_dir /home/scratch.trt_llm_data/llm-models/bloom-3b \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir /tmp/bloom/3b/trt_ckpts/fp8/1-gpu/ \ + --calib_size 512 \ + --tp_size 1 + +trtllm-build --checkpoint_dir /tmp/bloom/3b/trt_ckpts/fp8/1-gpu/ \ + --output_dir /tmp/bloom/3b/trt_engines/fp8/1-gpu/ \ + --gemm_plugin float16 \ + --use_fp8_context_fmha enable \ + --strongly_typed \ + --workers 1 +``` + +### Run ```bash python ../summarize.py --test_trt_llm \ @@ -212,4 +234,9 @@ mpirun -n 8 --allow-run-as-root \ --hf_model_dir ./bloom/176B/ \ --data_type fp16 \ --engine_dir ./bloom/176B/trt_engines/fp16/8-gpu/ + +python ../summarize.py --test_trt_llm \ + --hf_model_dir /home/scratch.trt_llm_data/llm-models/bloom-3b \ + --data_type fp16 \ + --engine_dir /tmp/bloom/3b/trt_engines/fp8/1-gpu/ ``` diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index 4f2d482cf..1c14f7186 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index 7f2622107..9d81c4a39 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/cogvlm/convert_checkpoint.py b/examples/cogvlm/convert_checkpoint.py new file mode 100644 index 000000000..236d08bd7 --- /dev/null +++ b/examples/cogvlm/convert_checkpoint.py @@ -0,0 +1,641 @@ +import argparse +import copy +import json +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Optional + +import safetensors +import torch +from datasets 
import load_dataset +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +import tensorrt_llm +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.logger import logger +from tensorrt_llm.lora_manager import LoraConfig +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models.cogvlm.convert import convert_hf_cogvlm +from tensorrt_llm.models.llama.convert import (capture_activation_range, + smooth_llama_model) +from tensorrt_llm.models.llama.weight import (load_from_gptq_llama, + load_from_hf_checkpoint, + load_from_meta_llama) +from tensorrt_llm.models.modeling_utils import PretrainedConfig + +try: + from transformers import LlavaConfig, LlavaForConditionalGeneration +except ImportError: + pass + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--inter_size', type=int, default=11008) + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + "--smoothquant", + "-sq", + type=float, + default=None, + help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)" + " to Smoothquant the model, and output int8 weights." + " A good first try is 0.5. Must be in [0, 1]") + parser.add_argument( + '--per_channel', + action="store_true", + default=False, + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. ' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + action="store_true", + default=False, + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. 
' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--ammo_quant_ckpt_path', + type=str, + default=None, + help='Path of a quantized model checkpoint in .npz format') + + parser.add_argument( + '--per_group', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale weights in the int4 range. ' + 'per_group chooses at run time, and for each group, a custom scaling factor. ' + 'The flag is built for GPTQ/AWQ quantization.') + + parser.add_argument( + '--enable_fp8', + default=False, + action='store_true', + help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') + parser.add_argument( + '--fp8_kv_cache', + default=False, + action="store_true", + help='By default, we use dtype for KV cache. fp8_kv_cache chooses int8 ' + 'quantization for KV') + parser.add_argument('--load_by_shard', + action='store_true', + help='Load a pretrained model shard-by-shard.') + parser.add_argument('--hidden_act', type=str, default='silu') + + parser.add_argument('--rotary_base', type=float, default=10000.0) + parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) + + parser.add_argument('--group_size', + type=int, + default=128, + help='Group size used in GPTQ/AWQ quantization.') + + parser.add_argument("--storage-type", + "-t", + type=str, + default="fp32", + choices=["fp32", "fp16"]) + parser.add_argument("--dataset-cache-dir", + type=str, + default=None, + help="cache dir to load the hugging face dataset") + parser.add_argument("--load_model_on_cpu", action="store_true") + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=0, + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--use_embedding_sharing', + action="store_true", + default=False, + help= + 'Try to reduce the engine size by sharing the embedding lookup table between two layers.' + 'Note: the flag might not take effect when the criteria are not met.') + parser.add_argument('--use_prompt_tuning', + action="store_true", + default=False) + parser.add_argument('--output_dir', + type=str, + default='tllm_checkpoint', + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') + parser.add_argument( + '--moe_num_experts', + default=0, + type=int, + help='Specify the number of experts to use for MOE layers') + parser.add_argument( + '--moe_top_k', + default=0, + type=int, + help= + 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' + ) + parser.add_argument( + '--moe_tp_mode', + default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, + type=int, + help= + 'Controls how to distribute experts in TP. 
Check layers/moe.py for accepted values', + ) + parser.add_argument( + '--moe_renorm_mode', + default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + type=int, + help= + 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', + ) + parser.add_argument('--enable_pos_shift', + default=False, + action='store_true', + help='Enable position shift for streamingllm method') + parser.add_argument( + '--dense_context_fmha', + default=False, + action='store_true', + help= + 'Enable dense fmha in context phase, otherwise sliding window attention.' + 'If dense_context_fmha=False, the sliding window size is the max attention window size.' + ) + parser.add_argument('--hf_lora_dir', type=str, default=None) + parser.add_argument( + '--lora_target_modules', + nargs='+', + default=None, + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], + help= + "Add lora in which modules. Only be activated when use_lora_plugin is enabled." + ) + parser.add_argument( + '--max_lora_rank', + type=int, + default=64, + help='maximum lora rank for different lora modules. ' + 'It is used to compute the workspace size of lora plugin.') + args = parser.parse_args() + return args + + +def update_quantization_from_args(config: dict, args: argparse.Namespace): + '''update the given config dict in-place based on the command line args + ''' + if args.use_weight_only: + if args.weight_only_precision == 'int8': + config['quantization']['quant_algo'] = 'W8A16' + elif args.weight_only_precision == 'int4': + config['quantization']['quant_algo'] = 'W4A16' + elif args.smoothquant: + config['quantization']['sq_use_plugin'] = True + if args.per_channel: + if args.per_token: + config['quantization'][ + 'quant_algo'] = 'W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN' + else: + config['quantization'][ + 'quant_algo'] = 'W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN' + else: + if args.per_token: + config['quantization'][ + 'quant_algo'] = 'W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN' + else: + config['quantization'][ + 'quant_algo'] = 'W8A8_SQ_PER_TENSOR_PLUGIN' + + if args.use_weight_only and args.moe_config.has_moe(): + config['quantization']['exclude_modules'].append('router') + + if args.int8_kv_cache: + config['quantization']['kv_cache_quant_algo'] = 'INT8' + + if args.weight_only_precision == 'int4_gptq': + config['quantization'].update({ + "group_size": args.group_size, + "has_zero_point": True, + "pre_quant_scale": False, + 'quant_algo': 'W4A16_GPTQ' + }) + + +def create_config_from_args(args: argparse.Namespace, + lora_config: Optional[LoraConfig] = None): + config = { + 'architecture': args.architecture, + 'dtype': args.dtype, + 'logits_dtype': 'float32', + 'num_hidden_layers': args.n_layer, + 'num_attention_heads': args.n_head, + 'hidden_size': args.n_embd, + 'intermediate_size': args.inter_size, + 'num_key_value_heads': args.n_kv_head, + 'vocab_size': args.vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': args.n_positions, + 'hidden_act': args.hidden_act, + 'rotary_base': args.rotary_base, + 'rotary_scaling': args.rotary_scaling, + 'norm_epsilon': args.rms_norm_eps, + 'vision_start': args.vision_start, + 'vision_length': args.vision_length, + 'quantization': { + 'quant_algo': None, + 'kv_cache_quant_algo': None, + 'exclude_modules': ['lm_head'], + }, + 'mapping': { + 'world_size': args.tp_size * args.pp_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + }, + 'use_parallel_embedding': 
args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'use_prompt_tuning': args.use_prompt_tuning, + 'moe_num_experts': args.moe_num_experts, + 'moe_top_k': args.moe_top_k, + 'moe_tp_mode': args.moe_tp_mode, + 'moe_normalization_mode': args.moe_renorm_mode, + 'enable_pos_shift': args.enable_pos_shift, + 'dense_context_fmha': args.dense_context_fmha, + } + if lora_config is not None: + config.update({ + 'max_lora_rank': + args.max_lora_rank, + 'lora_target_modules': + lora_config.lora_target_modules, + 'hf_modules_to_trtllm_modules': + lora_config.hf_modules_to_trtllm_modules, + 'trtllm_modules_to_hf_modules': + lora_config.trtllm_modules_to_hf_modules, + 'disable_weight_only_quant_plugin': + args.disable_weight_only_quant_plugin + }) + # the lora checkpoint might finetune the embedding + if lora_config.vocab_size != 0: + config['vocab_size'] = lora_config.vocab_size + update_quantization_from_args(config, args) + return config + + +def create_lora_config(args: argparse.Namespace): + '''update args based on lora dir + ''' + hf_modules_to_trtllm_modules = { + "q_proj": "attn_q", + "k_proj": "attn_k", + "v_proj": "attn_v", + "o_proj": "attn_dense", + "gate_proj": "mlp_h_to_4h", + "down_proj": "mlp_4h_to_h", + "up_proj": "mlp_gate" + } # lora modules on llama + + trtllm_modules_to_hf_modules = { + "attn_q": "q_proj", + "attn_k": "k_proj", + "attn_v": "v_proj", + "attn_dense": "o_proj", + "mlp_h_to_4h": "gate_proj", + "mlp_4h_to_h": "down_proj", + "mlp_gate": "up_proj", + } + + lora_config = LoraConfig.from_hf(args.hf_lora_dir, + hf_modules_to_trtllm_modules, + trtllm_modules_to_hf_modules) + + if lora_config.is_valid and lora_config.vocab_size != 0: + if args.lora_target_modules is not None: + # command line options is preferred over the modules in the lora dir + lora_config.lora_target_modules = args.lora_target_modules + # can be invalid + return lora_config + + +def smooth_quant(model, args): + assert model is not None + act_range = {} + llama_qkv_para = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + llama_smoother = {} + + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + if args.load_model_on_cpu: + logger.warning( + "Note that running capture_activation_range on cpu would be very small." + ) + dataset = load_dataset("ccdv/cnn_dailymail", + '3.0.0', + cache_dir=args.dataset_cache_dir) + + act_range = capture_activation_range( + model, + AutoTokenizer.from_pretrained(args.model_dir, + trust_remote_code=True, + use_fast=False, + padding_side='left'), dataset) + if args.smoothquant is not None: + smooth_llama_model(model, act_range, args.smoothquant, llama_qkv_para, + llama_smoother) + return act_range, llama_qkv_para, llama_smoother + + +def main(): + # TODO(qijun): Currently, the convert script depends on a torch op: + # torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix, + # which is included in tensorrt_llm Python package. Otherwise, the convert + # script does not need to import tensorrt_llm. Will remove it after reimplementing + # the op with PyTorch. 
+ logger.info(tensorrt_llm.__version__) + args = parse_arguments() + if args.model_dir is None and args.meta_ckpt_dir is None: + raise AssertionError( + "One of the model_dir or meta_ckpt_dir must be specified to generate the checkpoint" + ) + + world_size = args.tp_size * args.pp_size + + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + hf_config = None + if args.model_dir is not None: + hf_config = AutoConfig.from_pretrained(args.model_dir, + trust_remote_code=True) + if hf_config.model_type == "llava": + # LLaVA = Vision model + Llama LLM + # We load a llava config and use its' text config as llama config + hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config + hf_config.model_type = "llava" # Replace llama with llava + + if hf_config.architectures[0] == "CogVLMForCausalLM": + hf_config.model_type = 'cogvlm' + args.model_type = hf_config.model_type + args.n_head = hf_config.num_attention_heads + args.inter_size = hf_config.intermediate_size + args.n_layer = hf_config.num_hidden_layers + args.n_embd = hf_config.hidden_size + if hasattr(hf_config, "num_key_value_heads"): + args.n_kv_head = hf_config.num_key_value_heads + if args.n_kv_head is None: + args.n_kv_head = args.n_head + args.rms_norm_eps = hf_config.rms_norm_eps + args.vocab_size = hf_config.vocab_size + args.n_positions = hf_config.max_position_embeddings + if hf_config.model_type == "mixtral": + # HF LLaMA-type models are implicitly using gated activation. + # With our MoE implementation, we must make it explicit + args.hidden_act = "swiglu" + args.moe_num_experts = getattr(hf_config, "num_local_experts", + args.moe_num_experts) + args.moe_top_k = getattr(hf_config, "num_experts_per_tok", + args.moe_top_k) + args.rotary_base = getattr(hf_config, "rope_theta", + args.rotary_base) + args.architecture = hf_config.architectures[0] + args.vision_start = 1 + args.vision_length = hf_config.vision_config['num_positions'] - 1 + + elif args.meta_ckpt_dir is not None: + with open(Path(args.meta_ckpt_dir, "params.json")) as fp: + meta_config: dict = json.load(fp) + args.n_embd = meta_config["dim"] + args.n_head = meta_config["n_heads"] + args.n_layer = meta_config["n_layers"] + args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) + + if "hidden_dim" in meta_config: + args.inter_size = meta_config["hidden_dim"] + else: + args.multiple_of = meta_config.get("multiple_of", 1) + n_embd = int(4 * args.n_embd * 2 / 3) + args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) + // args.multiple_of) + args.rms_norm_eps = meta_config["norm_eps"] + args.moe_num_experts = meta_config.get("moe", {}).get("num_experts", 0) + args.moe_top_k = meta_config.get("moe", {}).get("num_experts_per_tok", + 0) + args.architecture = "LlamaForCausalLM" + else: + args.n_kv_head = args.n_kv_head or args.n_head + args.architecture = "LlamaForCausalLM" + + if args.moe_num_experts and args.moe_top_k == 0: + args.moe_top_k = 1 + args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, + args.moe_tp_mode, + args.moe_renorm_mode).validate() + + if args.rotary_scaling is not None: + # assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." 
+ rotary_scaling = { + "type": args.rotary_scaling[0], + "factor": float(args.rotary_scaling[1]) + } + assert rotary_scaling["type"] in ["linear", "dynamic"] + assert rotary_scaling["factor"] > 1.0 + args.rotary_scaling = rotary_scaling + + lora_config = create_lora_config(args) + config = create_config_from_args(args, lora_config) + + with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + + act_range = {} + llama_qkv_para = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + llama_smoother = {} + model = None + if args.model_dir is not None: + + if args.model_type == "llava": + hf_llava = LlavaForConditionalGeneration.from_pretrained( + args.model_dir, torch_dtype="auto") + model = hf_llava.language_model + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_dir, + device_map='auto' if not args.load_model_on_cpu else 'cpu', + torch_dtype='auto' if not args.smoothquant else torch.float16, + trust_remote_code=True, + ) + if args.smoothquant is not None or args.int8_kv_cache: + act_range, llama_qkv_para, llama_smoother = smooth_quant( + model, args) + + def covert_and_save(rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + + if args.use_weight_only and args.weight_only_precision == 'int4_gptq': + weights = load_from_gptq_llama(args.ammo_quant_ckpt_path, + args.n_layer, + args.vocab_size, + mapping, + dtype=args.dtype) + + elif args.meta_ckpt_dir is not None: + weights = load_from_meta_llama( + args.meta_ckpt_dir, mapping, + PretrainedConfig.from_dict(copy.deepcopy(config))) + + else: + if args.load_by_shard: + weights = load_from_hf_checkpoint( + args.model_dir, mapping, + PretrainedConfig.from_dict(copy.deepcopy(config)), + lora_config) + + else: + if args.weight_only_precision == 'int8': + plugin_weight_only_quant_type = torch.int8 + elif args.weight_only_precision == 'int4': + plugin_weight_only_quant_type = torch.quint4x2 + weights = convert_hf_cogvlm( + model, + mapping, + vocab_size=args.vocab_size, + dtype=args.dtype, + use_weight_only=args.use_weight_only, + use_gemm_woq_plugin=not args. + disable_weight_only_quant_plugin, + plugin_weight_only_quant_type=plugin_weight_only_quant_type, + use_parallel_embedding=args.use_parallel_embedding, + sharding_dim=args.embedding_sharding_dim, + share_embedding_table=args.use_embedding_sharing, + use_smooth_quant=args.smoothquant, + per_channel=args.per_channel, + per_token=args.per_token, + int8_kv_cache=args.int8_kv_cache, + act_range=act_range, + qkv_para=llama_qkv_para, + smoother=llama_smoother, + moe_config=args.moe_config, + lora_config=lora_config) + + safetensors.torch.save_file( + weights, os.path.join(args.output_dir, f'rank{rank}.safetensors')) + + if args.workers == 1: + + for rank in range(world_size): + covert_and_save(rank) + else: + with ThreadPoolExecutor(max_workers=args.workers) as p: + futures = [ + p.submit(covert_and_save, rank) for rank in range(world_size) + ] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
+ + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Total time of converting checkpoints: {t}') + + +if __name__ == '__main__': + main() diff --git a/examples/cpp/executor/CMakeLists.txt b/examples/cpp/executor/CMakeLists.txt index 9aa1b0eea..8ba10c2ee 100644 --- a/examples/cpp/executor/CMakeLists.txt +++ b/examples/cpp/executor/CMakeLists.txt @@ -62,7 +62,13 @@ message(STATUS " include path: ${CUDAToolkit_INCLUDE_DIRS}") set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) set(TRT_LIB nvinfer) -find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) +# On Windows major version is appended to nvinfer libs. +if(WIN32) + set(TRT_LIB_NAME nvinfer_10) +else() + set(TRT_LIB_NAME nvinfer) +endif() +find_library_create_target(${TRT_LIB} ${TRT_LIB_NAME} SHARED ${TRT_LIB_DIR}) message(${TRT_INCLUDE_DIR}) include_directories("${TRT_INCLUDE_DIR}") diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index b86a0433f..f1ecf78c3 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/enc_dec/README.md b/examples/enc_dec/README.md index 2a0756d66..fd6b1802a 100644 --- a/examples/enc_dec/README.md +++ b/examples/enc_dec/README.md @@ -25,7 +25,7 @@ The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc * `trtllm-build` to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Enc-Dec model, * [`run.py`](./run.py) to run the inference on an example input text. - * Enc-Dec models can have specific implementations, such as the popular T5 family (T5, mT5, Flan-T5), BART family (BART, mBART), and FairSeq family (WMTs). They are now merged into a single convert script: + * Enc-Dec models can have specific implementations, such as the popular T5 family (T5, mT5, Flan-T5, ByT5), BART family (BART, mBART), and FairSeq family (WMTs). They are now merged into a single convert script: * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert weights from HuggingFace or FairSeq format to TRT-LLM format, and split weights for multi-GPU inference, ## Usage @@ -40,6 +40,7 @@ The implementation is designed to support generic encoder-decoder models by abst - [BART](https://huggingface.co/docs/transformers/model_doc/bart) - [mBART](https://huggingface.co/docs/transformers/model_doc/mbart) - [FairSeq NMT](https://pytorch.org/hub/pytorch_fairseq_translation/) +- [ByT5](https://huggingface.co/docs/transformers/main/en/model_doc/byt5) - [UL2 (coming)](https://huggingface.co/docs/transformers/model_doc/ul2) and [Flan-UL2 (coming)](https://huggingface.co/docs/transformers/model_doc/flan-ul2) It also supports full Tensor Parallelism (TP), Pipeline Parallelism (PP), and a hybrid of the two. Currently, Fused Multi-Head Attention (FMHA) is not yet enabled for T5 family due to its relative attention design. 
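Since ByT5 is newly listed above, note that it tokenizes raw UTF-8 bytes rather than word pieces, so the same prompt yields a much longer sequence (the run.py change later in this patch warns about adjusting `--max_input_len` for this reason). A small sketch for estimating the difference, assuming `transformers` is installed and the `t5-small` / `google/byt5-small` checkpoints are reachable:

```python
# Compare tokenized lengths for a word-piece T5 tokenizer vs. the byte-level
# ByT5 tokenizer, to help pick --max_input_len for trtllm-build.
from transformers import AutoTokenizer

text = "Summarize this article in one sentence."

t5_tok = AutoTokenizer.from_pretrained("t5-small")
byt5_tok = AutoTokenizer.from_pretrained("google/byt5-small")

print("t5-small tokens:  ", len(t5_tok(text).input_ids))
print("byt5-small tokens:", len(byt5_tok(text).input_ids))
# ByT5 produces roughly one token per input byte, several times the
# word-piece count, so engines built with T5-sized limits may truncate input.
```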
@@ -53,6 +54,7 @@ git clone https://huggingface.co/t5-small tmp/hf_models/t5-small git clone https://huggingface.co/google/flan-t5-small tmp/hf_models/flan-t5-small git clone https://huggingface.co/facebook/bart-large-cnn tmp/hf_models/bart-large-cnn git clone https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt tmp/hf_models/mbart-large-50-many-to-one-mmt +git clone https://huggingface.co/google/byt5-small tmp/hf_models/byt5-small ``` ### Convert and Split Weights diff --git a/examples/enc_dec/convert_checkpoint.py b/examples/enc_dec/convert_checkpoint.py index 3eb9990b3..5845b0045 100755 --- a/examples/enc_dec/convert_checkpoint.py +++ b/examples/enc_dec/convert_checkpoint.py @@ -4,6 +4,7 @@ import json import logging import os +import types from ast import literal_eval from datetime import datetime from pathlib import Path @@ -11,6 +12,7 @@ import safetensors from helper import convert_weight_to_dtype, fuse_qkv_one_layer, reshape, split from transformers import (AutoModelForSeq2SeqLM, MBartForConditionalGeneration, + Pix2StructForConditionalGeneration, T5ForConditionalGeneration, VisionEncoderDecoderModel) from tensorrt_llm._utils import str_dtype_to_torch @@ -26,6 +28,12 @@ mlp_type_map = {i.name: i.value for i in MLPType} +def copy_args_to_component_config(component_config, args): + for arg in vars(args): + setattr(component_config, arg, getattr(args, arg)) + return component_config + + def parse_t5_config(args, hf_model): config = configparser.ConfigParser() @@ -36,24 +44,15 @@ def parse_t5_config(args, hf_model): # manually set q_scaling to offset attention scaling's effect. # TODO: modify kernels to control whether to disable attention scaling - def get_offset_q_scaling(config) -> str: - d_model = config.d_model - num_heads = config.num_heads - head_size = d_model / num_heads - scaling = 1 / head_size**.5 - return str(scaling) - - config["encoder"]["q_scaling"] = get_offset_q_scaling( - hf_model.encoder.config) + def get_offset_q_scaling(config): + scaling = 1 / config.head_size**.5 + return scaling config["decoder"] = {} for key, val in hf_model.decoder.config.to_dict().items(): config["decoder"][key] = f"{val}" config["decoder"]["weight_data_type"] = args.weight_data_type - config["decoder"]["q_scaling"] = get_offset_q_scaling( - hf_model.decoder.config) - config["structure"] = dict() config["structure"]["t5_with_bias"] = "false" config["structure"]["use_gated_activation"] = str( @@ -62,133 +61,96 @@ def get_offset_q_scaling(config) -> str: config["structure"]["model_type"] = args.model_type def parse_t5_config_by_component(config, component, args): + component_config = types.SimpleNamespace() + component_config = copy_args_to_component_config(component_config, args) + component_config.n_head = config.getint(component, 'num_heads') + component_config.head_size = config.getint(component, 'd_kv') + component_config.hidden_size = config.getint(component, 'd_model') + component_config.ffn_hidden_size = config.getint(component, 'd_ff') + component_config.vocab_size = config.getint(component, 'vocab_size') + component_config.n_positions = config.getint(component, + 'n_positions', + fallback=512) + component_config.has_position_embedding = config.getboolean( + component, 'has_position_embedding', + fallback=False) # TODO: hardcoded here + + component_config.has_token_type_embedding = config.getboolean( + component, 'has_token_type_embedding', fallback=False) + component_config.has_embedding_layernorm = config.getboolean( + component, 'has_embedding_layernorm', fallback=False) 
+ component_config.has_embedding_scale = config.getboolean( + component, 'has_embedding_scale', fallback=False) + component_config.q_scaling = get_offset_q_scaling(component_config) + component_config.has_attention_qkvo_bias = config.getboolean( + component, 'has_attention_qkvo_bias', + fallback=False) # TODO: hardcoded here + component_config.has_mlp_bias = config.getboolean(component, + 'has_mlp_bias', + fallback=False) + component_config.has_model_final_layernorm = config.getboolean( + component, 'has_model_final_layernorm', fallback=True) + component_config.layernorm_eps = config.getfloat( + component, 'layer_norm_epsilon') + component_config.layernorm_position = layernorm_position_map[config.get( + component, 'layernorm_position', + fallback='pre_layernorm')] # TODO: hardcoded here + component_config.layernorm_type = layernorm_type_map[config.get( + component, 'layernorm_type', fallback='RmsNorm')] + component_config.hidden_act = config.get(component, 'dense_act_fn') + component_config.gated_act = config.getboolean(component, + 'is_gated_act') + component_config.mlp_type = mlp_type_map['GatedMLP' if component_config. + gated_act else 'MLP'] + component_config.num_buckets = config.getint( + component, 'relative_attention_num_buckets') + component_config.max_distance = config.getint( + component, 'relative_attention_max_distance') + component_config.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + component_config.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + component_config.ckpt_weight_dtype = config.get(component, + 'weight_data_type') + if component == 'encoder': - args.n_layer = config.getint(component, 'num_layers') - args.n_head = config.getint(component, 'num_heads') - args.head_size = config.getint(component, 'd_kv') - args.hidden_size = config.getint(component, 'd_model') - args.ffn_hidden_size = config.getint(component, 'd_ff') - args.vocab_size = config.getint(component, 'vocab_size') - args.n_positions = config.getint(component, - 'n_positions', - fallback=512) - args.has_position_embedding = config.getboolean( - component, 'has_position_embedding', - fallback=False) # TODO: hardcoded here - args.has_token_type_embedding = config.getboolean( - component, 'has_token_type_embedding', fallback=False) - args.has_embedding_layernorm = config.getboolean( - component, 'has_embedding_layernorm', fallback=False) - args.has_embedding_scale = config.getboolean(component, - 'has_embedding_scale', - fallback=False) - args.q_scaling = config.getfloat(component, - 'q_scaling', - fallback=1.0) - args.has_attention_qkvo_bias = config.getboolean( - component, 'has_attention_qkvo_bias', - fallback=False) # TODO: hardcoded here - args.has_mlp_bias = config.getboolean(component, - 'has_mlp_bias', - fallback=False) - args.has_model_final_layernorm = config.getboolean( - component, 'has_model_final_layernorm', fallback=True) - args.layernorm_eps = config.getfloat(component, - 'layer_norm_epsilon') - args.layernorm_position = layernorm_position_map[config.get( - component, 'layernorm_position', - fallback='pre_layernorm')] # TODO: hardcoded here - args.layernorm_type = layernorm_type_map[config.get( - component, 'layernorm_type', - fallback='RmsNorm')] # TODO: hardcoded here - args.hidden_act = config.get(component, 'dense_act_fn') - args.gated_act = config.getboolean(component, 'is_gated_act') - args.mlp_type = mlp_type_map['GatedMLP' if args. 
- gated_act else 'MLP'] - args.relative_attention = config.get( + component_config.n_layer = config.getint(component, 'num_layers') + + component_config.relative_attention = config.get( 'structure', 'position_embedding_type') == 'relative' - args.num_buckets = config.getint(component, - 'relative_attention_num_buckets') - args.max_distance = config.getint( - component, 'relative_attention_max_distance') - args.ckpt_weight_dtype = config.get(component, 'weight_data_type') - args.position_embedding_type = config.get( - 'structure', 'position_embedding_type') + + component_config.ckpt_weight_dtype = config.get( + component, 'weight_data_type') elif component == 'decoder': - args.n_layer = config.getint(component, 'num_decoder_layers') - args.n_head = config.getint(component, 'num_heads') - args.head_size = config.getint(component, 'd_kv') - args.hidden_size = config.getint(component, 'd_model') - args.ffn_hidden_size = config.getint(component, 'd_ff') - args.vocab_size = config.getint(component, 'vocab_size') - args.n_positions = config.getint(component, - 'n_positions', - fallback=512) - args.has_position_embedding = config.getboolean( - component, 'has_position_embedding', - fallback=False) # TODO: hardcoded here - args.has_token_type_embedding = config.getboolean( - component, 'has_token_type_embedding', fallback=False) - args.has_embedding_layernorm = config.getboolean( - component, 'has_embedding_layernorm', fallback=False) - args.has_embedding_scale = config.getboolean(component, - 'has_embedding_scale', - fallback=False) - args.q_scaling = config.getfloat(component, - 'q_scaling', - fallback=1.0) - args.has_attention_qkvo_bias = config.getboolean( - component, 'has_attention_qkvo_bias', fallback=False) - args.has_mlp_bias = config.getboolean(component, - 'has_mlp_bias', - fallback=False) - args.has_model_final_layernorm = config.getboolean( - component, 'has_model_final_layernorm', fallback=True) - args.layernorm_eps = config.getfloat(component, - 'layer_norm_epsilon') - args.layernorm_position = layernorm_position_map[config.get( - component, 'layernorm_position', - fallback='pre_layernorm')] # TODO: hardcoded here - args.layernorm_type = layernorm_type_map[config.get( - component, 'layernorm_type', fallback='RmsNorm')] - args.hidden_act = config.get(component, 'dense_act_fn') - args.gated_act = config.getboolean(component, 'is_gated_act') - args.mlp_type = mlp_type_map['GatedMLP' if args. 
- gated_act else 'MLP'] - args.has_lm_head_bias = config.getboolean( + component_config.n_layer = config.getint(component, + 'num_decoder_layers') + component_config.has_lm_head_bias = config.getboolean( component, # TODO: T5 with bias 'has_lm_head_bias', fallback=False) - args.relative_attention = config.getboolean(component, - 'relative_attention', - fallback=True) - args.num_buckets = config.getint(component, - 'relative_attention_num_buckets') - args.max_distance = config.getint( - component, 'relative_attention_max_distance') - args.logits_dtype = config.get(component, - 'logits_dtype', - fallback='float32') - args.rescale_before_lm_head = config.getboolean( + component_config.relative_attention = config.getboolean( + component, 'relative_attention', fallback=True) + component_config.rescale_before_lm_head = config.getboolean( component, 'tie_word_embeddings' ) # default is True (for T5), but False for Flan-T5 - args.encoder_hidden_size = config.getint('encoder', 'd_model') - args.encoder_num_heads = config.getint('encoder', 'num_heads') - args.encoder_head_size = config.getint('encoder', 'd_kv') - args.ckpt_weight_dtype = config.get(component, 'weight_data_type') - args.position_embedding_type = config.get( - 'structure', 'position_embedding_type') + component_config.encoder_hidden_size = config.getint( + 'encoder', 'd_model') + component_config.encoder_num_heads = config.getint( + 'encoder', 'num_heads') + component_config.encoder_head_size = config.getint( + 'encoder', 'd_kv') else: assert False, 'Unsupported component!' - return args + return component_config - encoder_args = parse_t5_config_by_component(config, "encoder", args) - decoder_args = parse_t5_config_by_component(config, "decoder", args) + encoder_config = parse_t5_config_by_component(config, "encoder", args) + decoder_config = parse_t5_config_by_component(config, "decoder", args) - return encoder_args, decoder_args + return encoder_config, decoder_config def convert_t5_weights_to_tllm_safetensors(config, component, params): @@ -380,87 +342,93 @@ def parse_nmt_config(args, model): def parse_nmt_config_by_component(config, component, args): assert component in ('encoder', 'decoder'), 'Unsupported component!' 
- args.n_layer = config.getint(component, f'{component}_layers') - args.n_head = config.getint(component, f'{component}_attention_heads') - args.hidden_size = config.getint( + component_config = types.SimpleNamespace() + component_config = copy_args_to_component_config(component_config, args) + component_config.n_layer = config.getint(component, + f'{component}_layers') + component_config.n_head = config.getint(component, + f'{component}_attention_heads') + component_config.hidden_size = config.getint( component, f'{component}_embed_dim') # fairseq naming - args.head_size = config.getint(component, - 'd_kv', - fallback=args.hidden_size // args.n_head) - args.ffn_hidden_size = config.getint( + component_config.head_size = config.getint( + component, + 'd_kv', + fallback=component_config.hidden_size // component_config.n_head) + component_config.ffn_hidden_size = config.getint( component, f'{component}_ffn_embed_dim') # fairseq naming - args.vocab_size = config.getint(component, 'vocab_size') - args.n_positions = config.getint( + component_config.vocab_size = config.getint(component, 'vocab_size') + component_config.n_positions = config.getint( component, 'max_source_positions') # fairseq naming - args.has_position_embedding = not config.getboolean( + component_config.has_position_embedding = not config.getboolean( component, 'no_token_positional_embeddings', fallback=False) # fairseq naming - args.has_token_type_embedding = config.getboolean( + component_config.has_token_type_embedding = config.getboolean( component, 'has_token_type_embedding', fallback=False) - args.has_embedding_layernorm = config.getboolean( + component_config.has_embedding_layernorm = config.getboolean( component, 'layernorm_embedding', fallback=True) # fairseq naming - args.has_embedding_scale = not config.getboolean( + component_config.has_embedding_scale = not config.getboolean( component, 'no_scale_embedding') # fairseq naming - args.q_scaling = config.getfloat(component, 'q_scaling', fallback=1.0) - args.has_attention_qkvo_bias = config.getboolean('structure', - 't5_with_bias', - fallback=True) - args.has_mlp_bias = config.getboolean('structure', - 't5_with_bias', - fallback=True) - args.has_model_final_layernorm = config.getboolean( + component_config.q_scaling = config.getfloat(component, + 'q_scaling', + fallback=1.0) + component_config.has_attention_qkvo_bias = config.getboolean( + 'structure', 't5_with_bias', fallback=True) + component_config.has_mlp_bias = config.getboolean('structure', + 't5_with_bias', + fallback=True) + component_config.has_model_final_layernorm = config.getboolean( component, 'has_model_final_layernorm') - args.layernorm_eps = config.getfloat(component, - 'layer_norm_epsilon', - fallback=1e-5) # fairseq naming + component_config.layernorm_eps = config.getfloat( + component, 'layer_norm_epsilon', fallback=1e-5) # fairseq naming normalize_before = config.getboolean( component, f'{component}_normalize_before') # fairseq naming - args.layernorm_position = layernorm_position_map[ + component_config.layernorm_position = layernorm_position_map[ 'pre_layernorm' if normalize_before else 'post_layernorm'] - args.layernorm_type = layernorm_type_map[config.get( + component_config.layernorm_type = layernorm_type_map[config.get( component, 'layernorm_type', fallback='LayerNorm')] - args.hidden_act = config.get(component, - 'activation_fn') # fairseq naming - args.gated_act = config.getboolean(component, - 'is_gated_act', - fallback=False) - args.mlp_type = mlp_type_map['GatedMLP' if args.gated_act else 
'MLP'] - args.relative_attention = config.get( + component_config.hidden_act = config.get( + component, 'activation_fn') # fairseq naming + component_config.gated_act = config.getboolean(component, + 'is_gated_act', + fallback=False) + component_config.mlp_type = mlp_type_map['GatedMLP' if component_config. + gated_act else 'MLP'] + component_config.relative_attention = config.get( 'structure', 'position_embedding_type') == 'relative' - args.num_buckets = config.getint(component, - 'relative_attention_num_buckets', - fallback=0) - args.max_distance = config.getint(component, - 'relative_attention_max_distance', - fallback=0) - args.ckpt_weight_dtype = config.get(component, 'weight_data_type') - args.position_embedding_type = config.get('structure', - 'position_embedding_type') - + component_config.num_buckets = config.getint( + component, 'relative_attention_num_buckets', fallback=0) + component_config.max_distance = config.getint( + component, 'relative_attention_max_distance', fallback=0) + component_config.ckpt_weight_dtype = config.get(component, + 'weight_data_type') + component_config.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + component_config.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') if component == 'decoder': - args.rescale_before_lm_head = config.getboolean( + component_config.rescale_before_lm_head = config.getboolean( component, 'rescale_before_lm_head') - args.logits_dtype = config.get(component, - 'logits_dtype', - fallback='float32') - args.encoder_hidden_size = config.getint( + + component_config.encoder_hidden_size = config.getint( 'encoder', 'encoder_embed_dim') # fairseq naming - args.encoder_num_heads = config.getint('encoder', - 'encoder_attention_heads') - args.encoder_head_size = config.getint( + component_config.encoder_num_heads = config.getint( + 'encoder', 'encoder_attention_heads') + component_config.encoder_head_size = config.getint( 'encoder', 'd_kv', - fallback=args.encoder_hidden_size // args.encoder_num_heads) + fallback=component_config.encoder_hidden_size // + component_config.encoder_num_heads) - return args + return component_config - encoder_args = parse_nmt_config_by_component(config, "encoder", args) - decoder_args = parse_nmt_config_by_component(config, "decoder", args) + encoder_config = parse_nmt_config_by_component(config, "encoder", args) + decoder_config = parse_nmt_config_by_component(config, "decoder", args) - return encoder_args, decoder_args + return encoder_config, decoder_config def convert_nmt_weights_to_tllm_safetensors(config, component, params, @@ -659,95 +627,106 @@ def parse_bart_config(args, hf_model): def parse_bart_config_by_component(config, component, args): assert component in ('encoder', 'decoder'), 'Unsupported component!' 
- args.n_layer = config.getint(component, f'{component}_layers') - args.n_head = config.getint(component, f'{component}_attention_heads') - args.hidden_size = config.getint(component, 'd_model') - args.head_size = config.getint(component, - 'd_kv', - fallback=args.hidden_size // args.n_head) - args.ffn_hidden_size = config.getint(component, f'{component}_ffn_dim') - args.vocab_size = config.getint(component, 'vocab_size') - args.n_positions = config.getint(component, 'max_position_embeddings') - args.has_position_embedding = config.getboolean( + component_config = types.SimpleNamespace() + component_config = copy_args_to_component_config(component_config, args) + component_config.n_layer = config.getint(component, + f'{component}_layers') + component_config.n_head = config.getint(component, + f'{component}_attention_heads') + component_config.hidden_size = config.getint(component, 'd_model') + component_config.head_size = config.getint( + component, + 'd_kv', + fallback=component_config.hidden_size // component_config.n_head) + component_config.ffn_hidden_size = config.getint( + component, f'{component}_ffn_dim') + component_config.vocab_size = config.getint(component, 'vocab_size') + component_config.n_positions = config.getint(component, + 'max_position_embeddings') + component_config.has_position_embedding = config.getboolean( component, 'has_position_embedding', fallback=True) # TODO: hardcoded here - args.has_token_type_embedding = config.getboolean( + component_config.has_token_type_embedding = config.getboolean( component, 'has_token_type_embedding', fallback=False) - args.has_embedding_layernorm = config.getboolean( + component_config.has_embedding_layernorm = config.getboolean( component, 'has_embedding_layernorm', fallback=True) - args.has_embedding_scale = config.getboolean(component, - 'scale_embedding') - args.q_scaling = config.getfloat(component, 'q_scaling', fallback=1.0) - args.has_attention_qkvo_bias = config.getboolean('structure', - 't5_with_bias', - fallback=True) - args.has_mlp_bias = config.getboolean('structure', - 't5_with_bias', - fallback=True) - args.has_model_final_layernorm = config.getboolean( + component_config.has_embedding_scale = config.getboolean( + component, 'scale_embedding') + component_config.q_scaling = config.getfloat(component, + 'q_scaling', + fallback=1.0) + component_config.has_attention_qkvo_bias = config.getboolean( + 'structure', 't5_with_bias', fallback=True) + component_config.has_mlp_bias = config.getboolean('structure', + 't5_with_bias', + fallback=True) + component_config.has_model_final_layernorm = config.getboolean( component, 'has_model_final_layernorm') - args.layernorm_eps = config.getfloat(component, - 'layer_norm_epsilon', - fallback=False) + component_config.layernorm_eps = config.getfloat(component, + 'layer_norm_epsilon', + fallback=False) normalize_before = config.getboolean(component, 'normalize_before') - args.layernorm_position = layernorm_position_map[ + component_config.layernorm_position = layernorm_position_map[ 'pre_layernorm' if normalize_before else 'post_layernorm'] - args.layernorm_type = layernorm_type_map[config.get( + component_config.layernorm_type = layernorm_type_map[config.get( component, 'layernorm_type', fallback='LayerNorm')] - args.hidden_act = config.get(component, 'activation_function') - args.gated_act = config.getboolean(component, - 'is_gated_act', - fallback=False) - args.mlp_type = mlp_type_map['GatedMLP' if args.gated_act else 'MLP'] - args.relative_attention = config.get( + 
component_config.hidden_act = config.get(component, + 'activation_function') + component_config.gated_act = config.getboolean(component, + 'is_gated_act', + fallback=False) + component_config.mlp_type = mlp_type_map['GatedMLP' if component_config. + gated_act else 'MLP'] + component_config.relative_attention = config.get( 'structure', 'position_embedding_type') == 'relative' - args.num_buckets = config.getint(component, - 'relative_attention_num_buckets', - fallback=0) - args.max_distance = config.getint(component, - 'relative_attention_max_distance', - fallback=0) - args.ckpt_weight_dtype = config.get(component, 'weight_data_type') - args.max_lora_rank = config.getint(component, - 'max_lora_rank', - fallback=0) - args.lora_target_modules = literal_eval( + component_config.num_buckets = config.getint( + component, 'relative_attention_num_buckets', fallback=0) + component_config.max_distance = config.getint( + component, 'relative_attention_max_distance', fallback=0) + component_config.ckpt_weight_dtype = config.get(component, + 'weight_data_type') + component_config.max_lora_rank = config.getint(component, + 'max_lora_rank', + fallback=0) + component_config.lora_target_modules = literal_eval( config.get(component, 'lora_target_modules', fallback="[]")) - args.hf_modules_to_trtllm_modules = literal_eval( + component_config.hf_modules_to_trtllm_modules = literal_eval( config.get(component, 'hf_modules_to_trtllm_modules', fallback="{}")) - args.trtllm_modules_to_hf_modules = literal_eval( + component_config.trtllm_modules_to_hf_modules = literal_eval( config.get(component, 'trtllm_modules_to_hf_modules', fallback="{}")) - args.logits_dtype = config.get(component, - 'logits_dtype', - fallback='float32') - args.position_embedding_type = config.get('structure', - 'position_embedding_type') + component_config.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + component_config.position_embedding_type = config.get( + 'structure', 'position_embedding_type') if component == 'decoder': - args.rescale_before_lm_head = config.getboolean( + component_config.rescale_before_lm_head = config.getboolean( component, 'rescale_before_lm_head') - args.encoder_hidden_size = config.getint('encoder', 'd_model') - args.encoder_num_heads = config.getint('encoder', - 'encoder_attention_heads') - args.encoder_head_size = config.getint( + component_config.encoder_hidden_size = config.getint( + 'encoder', 'd_model') + component_config.encoder_num_heads = config.getint( + 'encoder', 'encoder_attention_heads') + component_config.encoder_head_size = config.getint( 'encoder', 'd_kv', - fallback=args.encoder_hidden_size // args.encoder_num_heads) + fallback=component_config.encoder_hidden_size // + component_config.encoder_num_heads) - return args + return component_config - encoder_args = None + encoder_config = None if not args.nougat: - encoder_args = parse_bart_config_by_component(config, "encoder", args) - decoder_args = parse_bart_config_by_component(config, "decoder", args) + encoder_config = parse_bart_config_by_component(config, "encoder", args) + decoder_config = parse_bart_config_by_component(config, "decoder", args) - return encoder_args, decoder_args + return encoder_config, decoder_config def convert_bart_weights_to_tllm_safetensors(config, component, params): @@ -940,6 +919,247 @@ def get_attn_module_name(component, layer, attn_type): return weights +def parse_pix2struct_config(args, hf_model): + # manually set q_scaling to offset attention scaling's effect. 
+ # TODO: modify kernels to control whether to disable attention scaling + config = configparser.ConfigParser() + + def get_offset_q_scaling(config) -> str: + d_model = config.hidden_size + num_heads = config.num_heads + head_size = d_model / num_heads + scaling = 1 / head_size**.5 + return str(scaling) + + config["decoder"] = {} + for key, val in hf_model.decoder.config.to_dict().items(): + config["decoder"][key] = f"{val}" + config["decoder"]["weight_data_type"] = args.weight_data_type + + config["decoder"]["q_scaling"] = get_offset_q_scaling( + hf_model.decoder.config) + + config["structure"] = dict() + config["structure"]["pix2struct_with_bias"] = "false" + config["structure"]["use_gated_activation"] = "false" + config["structure"]["position_embedding_type"] = "relative" + config["structure"]["model_type"] = args.model_type + + def parse_pix2struct_config_by_component(config, component, args): + if component == 'decoder': + args.n_layer = config.getint(component, 'num_layers') + args.n_head = config.getint(component, 'num_heads') + args.head_size = config.getint(component, 'd_kv') + args.hidden_size = config.getint(component, 'hidden_size') + args.ffn_hidden_size = config.getint(component, 'd_ff') + args.vocab_size = config.getint(component, 'vocab_size') + args.n_positions = config.getint(component, + 'n_positions', + fallback=512) + args.has_position_embedding = config.getboolean( + component, 'has_position_embedding', + fallback=False) # TODO: hardcoded here + args.has_token_type_embedding = config.getboolean( + component, 'has_token_type_embedding', fallback=False) + args.has_embedding_layernorm = config.getboolean( + component, 'has_embedding_layernorm', fallback=False) + args.has_embedding_scale = config.getboolean(component, + 'has_embedding_scale', + fallback=False) + args.q_scaling = config.getfloat(component, + 'q_scaling', + fallback=1.0) + args.has_attention_qkvo_bias = config.getboolean( + component, 'has_attention_qkvo_bias', fallback=False) + args.has_mlp_bias = config.getboolean(component, + 'has_mlp_bias', + fallback=False) + args.has_model_final_layernorm = config.getboolean( + component, 'has_model_final_layernorm', fallback=True) + args.layernorm_eps = config.getfloat(component, + 'layer_norm_epsilon') + args.layernorm_position = layernorm_position_map[config.get( + component, 'layernorm_position', + fallback='pre_layernorm')] # TODO: hardcoded here + args.layernorm_type = layernorm_type_map[config.get( + component, 'layernorm_type', fallback='RmsNorm')] + args.hidden_act = config.get(component, 'dense_act_fn') + args.gated_act = True + args.mlp_type = mlp_type_map['GatedMLP' if args. 
+ gated_act else 'MLP'] + args.has_lm_head_bias = config.getboolean( + component, # TODO: T5 with bias + 'has_lm_head_bias', + fallback=False) + args.relative_attention = config.getboolean(component, + 'relative_attention', + fallback=True) + args.num_buckets = config.getint(component, + 'relative_attention_num_buckets') + args.max_distance = config.getint( + component, 'relative_attention_max_distance') + args.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + args.rescale_before_lm_head = config.getboolean( + component, 'tie_word_embeddings' + ) # default is True (for T5), but False for Flan-T5 + args.encoder_hidden_size = config.getint('decoder', 'hidden_size') + args.encoder_num_heads = config.getint('decoder', 'num_heads') + args.encoder_head_size = config.getint('decoder', 'd_kv') + args.ckpt_weight_dtype = config.get(component, 'weight_data_type') + args.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + + else: + assert False, 'Unsupported component!' + return args + + decoder_args = parse_pix2struct_config_by_component(config, "decoder", args) + return None, decoder_args + + +def convert_pix2struct_weights_to_tllm_safetensors(config, component, params): + weights = {} + + mapping = config.mapping + + convert_weight_to_dtype(params, config.dtype) + hidden_size = config.hidden_size + ffn_hidden_size = config.ffn_hidden_size + num_layers = config.num_hidden_layers + n_head = config.num_attention_heads + head_size = config.head_size + attention_hidden_size = n_head * head_size # head size * num_heads not necessarily equals hidden_dim, such as Flan-T5 + + hf_param_prefix = f'{component}' + trtllm_layer_name = f'{component}_layers' + trtllm_attn_layer_name = 'self_attention' + trtllm_attn_layernorm_name = 'self_attention_layernorm' + + def get_attn_module_name(component, layer, attn_type): + return f'{component}.layer.{int(layer)}.{attn_type}.attention' + + weights['embedding.vocab_embedding.weight'] = reshape( + params[f'{hf_param_prefix}.embed_tokens.weight'].clone(), None) + + layers_range = mapping.pp_layers(num_layers) + for layer_idx in layers_range: + local_layer_idx = layer_idx - layers_range[0] + trtllm_layer_name_prefix = f'{trtllm_layer_name}.{local_layer_idx}' + hf_layer_name_prefix = f'{hf_param_prefix}.layer.{layer_idx}' + + hidden_layer_name_split = { + f'{hf_layer_name_prefix}.self_attention.attention.output.weight': { + "name": + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}.dense.weight', + "shape": + (hidden_size, attention_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + f'{hf_layer_name_prefix}.mlp.DenseReluDense.wo.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp.proj.weight', + "shape": (hidden_size, ffn_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + f'{hf_layer_name_prefix}.mlp.DenseReluDense.wi_0.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp.fc.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + } + + hidden_layer_name_no_split = { + f'{hf_layer_name_prefix}.self_attention.layer_norm.weight': { + "name": + f'{trtllm_layer_name_prefix}.{trtllm_attn_layernorm_name}.weight', + "shape": None + }, + f'{hf_layer_name_prefix}.mlp.layer_norm.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp_layernorm.weight', + "shape": None + }, + } + + if config.gated_act: + hidden_layer_name_split.update({ + f'{hf_layer_name_prefix}.mlp.DenseReluDense.wi_1.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp.gate.weight', + "shape": 
(ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + }) + + hidden_layer_name_split.update({ + f'{hf_layer_name_prefix}.encoder_decoder_attention.attention.output.weight': + { + "name": + f'{trtllm_layer_name_prefix}.cross_attention.dense.weight', + "shape": + (hidden_size, attention_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + }) + hidden_layer_name_no_split.update({ + f'{hf_layer_name_prefix}.encoder_decoder_attention.layer_norm.weight': + { + "name": + f'{trtllm_layer_name_prefix}.cross_attention_layernorm.weight', + "shape": None + }, + }) + self_attn_module_name = get_attn_module_name( + component, layer_idx, 'encoder_decoder_attention') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.cross_attention', mapping.tp_size, + mapping.tp_rank, config.model_type, + (attention_hidden_size * 3 // mapping.tp_size, hidden_size), + None)) + + self_attn_module_name = get_attn_module_name(component, layer_idx, + 'self_attention') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}', + mapping.tp_size, mapping.tp_rank, config.model_type, + (attention_hidden_size * 3 // mapping.tp_size, hidden_size), + None)) + + weights[ + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}.rel_attn_table'] = reshape( + split( + params[ + f'{component}.layer.0.self_attention.attention.relative_attention_bias.weight'] + .T, mapping.tp_size, mapping.tp_rank, 0), + (n_head // mapping.tp_size, config.num_buckets)) + + for hf_weight_name, weight_info in hidden_layer_name_split.items(): + if hf_weight_name in params.keys(): + weights[weight_info["name"]] = reshape( + split(params[hf_weight_name], + mapping.tp_size, + mapping.tp_rank, + dim=weight_info["split_dim"]), weight_info["shape"]) + for hf_weight_name, weight_info in hidden_layer_name_no_split.items(): + if hf_weight_name in params.keys(): + weights[weight_info["name"]] = reshape( + params[hf_weight_name].clone(), shape=weight_info["shape"]) + + weights[f'final_layernorm.weight'] = reshape( + params[f'{component}.final_layer_norm.weight'].clone(), None) + + weights['lm_head.weight'] = reshape( + split(params[f'{component}.lm_head.weight'], + mapping.tp_size, + mapping.tp_rank, + dim=0), (config.vocab_size // mapping.tp_size, hidden_size)) + + return weights + + def get_model(args): if args.model_type == "t5": model = T5ForConditionalGeneration.from_pretrained(args.model_dir) @@ -952,6 +1172,9 @@ def get_model(args): model = model.get_decoder() else: model = AutoModelForSeq2SeqLM.from_pretrained(args.model_dir) + elif args.model_type == "pix2struct": + model = Pix2StructForConditionalGeneration.from_pretrained( + args.model_dir) return model @@ -979,7 +1202,7 @@ def convert_checkpoint(args): additional_settings = ["gated_act"] - if not args.nougat: + if not args.nougat and args.model_type != "pix2struct": tllm_encoder_config = { 'architecture': "EncoderModel", 'dtype': args.dtype, @@ -1113,7 +1336,7 @@ def convert_checkpoint(args): decoder_convert_args["sin_pos_embedding"] = sin_pos_embedding if args.workers == 1: - if not args.nougat: + if not args.nougat and args.model_type != "pix2struct": convert(0, world_size, args, tllm_encoder_config, encoder_convert_args, encoder_saved_dir) convert(0, world_size, args, tllm_decoder_config, decoder_convert_args, @@ -1123,7 +1346,7 @@ def convert_checkpoint(args): args.workers = world_size LOGGER.info(f'Convert checkpoint using {args.workers} workers.') import 
torch.multiprocessing as mp - if not args.nougat: + if not args.nougat and args.model_type != "pix2struct": mp.spawn(convert, nprocs=args.workers, args=(world_size, args, tllm_encoder_config, @@ -1152,7 +1375,7 @@ def convert(worker_rank, world_size, args, model_config, convert_args, parser.add_argument('--model_type', type=str, default='t5', - choices=['t5', 'nmt', 'bart'], + choices=['t5', 'nmt', 'bart', 'pix2struct'], help='Model to be converted.') parser.add_argument('--world_size', type=int, diff --git a/examples/enc_dec/helper.py b/examples/enc_dec/helper.py index a4a9a07c0..2b4f44cb7 100755 --- a/examples/enc_dec/helper.py +++ b/examples/enc_dec/helper.py @@ -77,6 +77,10 @@ def get_qkv_module_name(model_type): q = "q_proj" k = "k_proj" v = "v_proj" + elif model_type == "pix2struct": + q = "query" + k = "key" + v = "value" return {"q": q, "k": k, "v": v} diff --git a/examples/enc_dec/run.py b/examples/enc_dec/run.py index ee75c8c80..83b1ee4e7 100644 --- a/examples/enc_dec/run.py +++ b/examples/enc_dec/run.py @@ -737,9 +737,14 @@ def test_fairseq_models(args): inference_dtype = tllm_model.encoder_model_config.dtype if inference_dtype == 'float32': - input_text.append( - "Summarize this article in one sentence.\n\nKristine Watts (Molie Weeks) is broken apart, missing her lover; she is not able to overcome her love for him that is lost in the past. She hires a stranger (Douglas Davis) and gives a list of her mistakes to him with things to fix. But time is irreversible and sometimes the cure for the pain is a tragic end.\n\nThe first point that impresses in \"The Cure\" is the stylish cinematography that alternates black and white with color. The concise and sharp screenplay is capable to develop a tragic and bleak tale of love with an unexpected plot point in the very end in less than eight minutes. The soundtrack is beautiful but the volume is a little loud and associated to the fact that English is not my native language, in some moments I needed to repeat some words whispered by the narrator. The unknown lead actress has magnificent performance and is extremely gorgeous. I hope to have a chance to see her again on the screen. Last but not the least, the debut of the director and writer Ryan Jafri could not be better. My vote is nine.\n\nTitle (Brazil): Not Available", - ) + if "byt5" in args.model_name: + print( + "ByT5 models tokenize input by bytes instead of words, causing the input text in this example to be longer than the default value during build stage. Please adjust --max_input_len during trtllm-build to select the right length limit for ByT5 models." + ) + else: + input_text.append( + "Summarize this article in one sentence.\n\nKristine Watts (Molie Weeks) is broken apart, missing her lover; she is not able to overcome her love for him that is lost in the past. She hires a stranger (Douglas Davis) and gives a list of her mistakes to him with things to fix. But time is irreversible and sometimes the cure for the pain is a tragic end.\n\nThe first point that impresses in \"The Cure\" is the stylish cinematography that alternates black and white with color. The concise and sharp screenplay is capable to develop a tragic and bleak tale of love with an unexpected plot point in the very end in less than eight minutes. The soundtrack is beautiful but the volume is a little loud and associated to the fact that English is not my native language, in some moments I needed to repeat some words whispered by the narrator. 
The unknown lead actress has magnificent performance and is extremely gorgeous. I hope to have a chance to see her again on the screen. Last but not the least, the debut of the director and writer Ryan Jafri could not be better. My vote is nine.\n\nTitle (Brazil): Not Available", + ) tokenizer = AutoTokenizer.from_pretrained( args.model_name) # TODO: use model path instead diff --git a/examples/falcon/README.md b/examples/falcon/README.md index ffda75de8..aa6f535c8 100644 --- a/examples/falcon/README.md +++ b/examples/falcon/README.md @@ -230,9 +230,9 @@ If the engines are run successfully, you will see output like (falcon-rw-1b as t ### FP8 Post-Training Quantization -The examples below use the NVIDIA AMMO (AlgorithMic Model Optimization) toolkit for the model quantization process. +The examples below use the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. -First make sure AMMO toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) Now quantize HF Falcon weights and export trtllm checkpoint. @@ -263,9 +263,9 @@ Note that you can enable fp8 context fmha to get further acceleration by setting ### Groupwise quantization (AWQ) -The examples below use the NVIDIA AMMO (AlgorithMic Model Optimization) toolkit for the model quantization process. +The examples below use the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. -First make sure AMMO toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) Now quantize HF Falcon weights and export trtllm checkpoint. @@ -291,7 +291,7 @@ mpirun -n 2 --allow-run-as-root --oversubscribe \ ``` #### W4A16 AWQ with FP8 GEMM (W4A8 AWQ) -For Hopper GPUs, TRT-LLM also supports employing FP8 GEMM for accelerating linear layers. This mode is noted with `w4a8_awq` for AMMO and TRT-LLM, in which both weights and activations are converted from W4A16 to FP8 for GEMM calculation. +For Hopper GPUs, TRT-LLM also supports employing FP8 GEMM for accelerating linear layers. This mode is noted with `w4a8_awq` for Modelopt and TRT-LLM, in which both weights and activations are converted from W4A16 to FP8 for GEMM calculation. Please make sure your system contains a Hopper GPU before trying the commands below. 
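To make the `int4_awq` / `w4a8_awq` terminology above concrete, here is a minimal, self-contained sketch of group-wise INT4 weight quantization with per-group scales. It illustrates the general idea only, not the ModelOpt implementation; the group size of 128 is an assumed typical value.

```python
# Illustration of group-wise INT4 weight quantization (per-group scales), the
# storage scheme behind int4_awq-style modes. Not the ModelOpt implementation.
import torch

def quantize_int4_groupwise(w: torch.Tensor, group_size: int = 128):
    out_features, in_features = w.shape
    wg = w.reshape(out_features, in_features // group_size, group_size)
    scale = wg.abs().amax(dim=-1, keepdim=True) / 7.0  # INT4 range is [-8, 7]
    q = torch.clamp(torch.round(wg / scale), -8, 7)
    return q.to(torch.int8), scale  # 4-bit values kept in int8 for simplicity

def dequantize_int4_groupwise(q: torch.Tensor, scale: torch.Tensor):
    return (q.float() * scale).reshape(q.shape[0], -1)

w = torch.randn(16, 256)
q, s = quantize_int4_groupwise(w)
w_hat = dequantize_int4_groupwise(q, s)
print("max abs reconstruction error:", (w - w_hat).abs().max().item())
```

In the w4a8_awq mode described above, the dequantized weights and the activations are additionally cast to FP8 before the GEMM; that step is omitted from this sketch.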
diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 0b01a27f6..6fb8bee82 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/README.md b/examples/gemma/README.md index c585ef127..c06d53cf1 100644 --- a/examples/gemma/README.md +++ b/examples/gemma/README.md @@ -22,7 +22,7 @@ - [Run 7B inference under SmoothQuant for jax checkpoint](#run-7b-inference-under-smoothquant-for-jax-checkpoint) - [Run inference under weight only for keras checkpoint](#run-inference-under-weight-only-for-keras-checkpoint) - [Run inference under INT8 KV caches for keras checkpoint](#run-inference-under-int8-kv-caches-for-keras-checkpoint) - - [Run AMMO Quantization](#run-ammo-quantization) + - [Run Modelopt Quantization](#run-modelopt-quantization) - [Requirements](#requirements) - [Quantize Checkpoints](#quantize-checkpoints) - [Build Engines](#build-engines) @@ -187,7 +187,7 @@ python3 ../summarize.py --test_trt_llm \ #### Run inference under FP8 for keras checkpoint -WARNING: This way of running FP8 will introduce noticeable accuracy drop. To avoid that, use AMMO quantization mentioned in this readme. +WARNING: This way of running FP8 will introduce noticeable accuracy drop. To avoid that, use Modelopt quantization mentioned in this readme. In this example, we demonstrate how to run FP8 inference on Gemma. Note that `convert_checkpoint.py` only uses identity activation scales, so the accuracy might be little worse than higher precision in some cases, but it is still very good because we don't do any calibration. This also shows the stability of FP8 compared to INT8. @@ -466,7 +466,7 @@ Average accuracy: 0.630 #### Run inference under FP8 for jax checkpoint -WARNING: This way of running FP8 will introduce noticeable accuracy drop. To avoid that, use AMMO quantization mentioned in this readme. +WARNING: This way of running FP8 will introduce noticeable accuracy drop. To avoid that, use Modelopt quantization mentioned in this readme. In this example, we demonstrate how to run FP8 inference on Gemma. Note that `convert_checkpoint.py` only uses identity activation scales, so the accuracy might be little worse than higher precision in some cases, but it is still very good because we don't do any calibration. This also shows the stability of FP8 compared to INT8. @@ -689,11 +689,11 @@ python3 ../summarize.py --test_trt_llm \ [02/08/2024-07:51:11] [TRT-LLM] [I] rougeLsum : 17.94213019528988 ``` -### Run AMMO Quantization +### Run Modelopt Quantization #### Requirements -AMMO toolkit also provides quantization solutions. To enable it, have the latest ammo and transformers Python package installed to support Gemma. Then run the following commands. +Modelopt toolkit also provides quantization solutions. To enable it, have the latest modelopt and transformers Python package installed to support Gemma. Then run the following commands. 
#### Quantize Checkpoints @@ -736,7 +736,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ #### Accuracy Results on MMLU -| Model | fp8 | int4_awq | int8_sq (AMMO) | int8_sq (Native per-channel) | +| Model | fp8 | int4_awq | int8_sq (Modelopt) | int8_sq (Native per-channel) | |---------------|-------|----------|----------------|------------------| | 2B Pretrained | 0.407 | 0.378 | 0.338 | 0.338 | | 7B Pretrained | 0.643 | 0.615 | 0.448 | 0.595 | diff --git a/examples/gemma/convert_checkpoint.py b/examples/gemma/convert_checkpoint.py index c3f5295a6..410231b8d 100644 --- a/examples/gemma/convert_checkpoint.py +++ b/examples/gemma/convert_checkpoint.py @@ -65,7 +65,7 @@ def parse_arguments(): "By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV", ) parser.add_argument( - "--ammo_quant_ckpt_path", + "--modelopt_quant_ckpt_path", default=None, help= "Path of a directory to quantized model checkpoints in .safetensors format or \ @@ -904,7 +904,7 @@ def convert(worker_rank, args, convert_kwargs): weight_scales = quantize_fp8_weights( weights, trt_llm_config.num_hidden_layers, trt_llm_config.mapping) - scales = load_from_fp8_gemma(args.ammo_quant_ckpt_path, + scales = load_from_fp8_gemma(args.modelopt_quant_ckpt_path, trt_llm_config.num_hidden_layers, trt_llm_config.mapping, args.fp8_kv_cache, weight_scales) diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index b281d7392..f802ea2dc 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,7 +3,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/convert_checkpoint.py b/examples/gpt/convert_checkpoint.py index bb131490a..f2754bc47 100644 --- a/examples/gpt/convert_checkpoint.py +++ b/examples/gpt/convert_checkpoint.py @@ -44,7 +44,9 @@ def parse_arguments(): parser.add_argument( '--gpt_variant', default=None, - choices=[None, 'gpt2', 'santacoder', 'starcoder', 'starcoder2'], + choices=[ + None, 'gpt2', 'santacoder', 'starcoder', 'starcoder2', 'persimmon' + ], help= "By default the script will try to infer the gpt_variant from model_dir. 
" "Or users may overwrite gpt_variant by explicitly passing the variant.") @@ -160,26 +162,30 @@ def load_gpt_config(model_dir: str, if gpt_variant is None: print("Inferring gpt variant from path...") - for v in ['starcoder2', 'starcoder', 'santacoder', 'gpt2']: - if v in config._name_or_path: + for v in ['starcoder2', 'starcoder', 'santacoder', 'gpt2', 'persimmon']: + if v in config._name_or_path or ('fuyu' in config._name_or_path + and v == 'persimmon'): gpt_variant = v break - assert gpt_variant in ['gpt2', 'santacoder', 'starcoder', 'starcoder2'] + assert gpt_variant in [ + 'gpt2', 'santacoder', 'starcoder', 'starcoder2', 'persimmon' + ] print(f"Gpt variant: {gpt_variant}") - if gpt_variant == 'starcoder2': + if gpt_variant in ['starcoder2', 'persimmon']: config.n_embd = config.hidden_size config.n_inner = config.intermediate_size config.n_head = config.num_attention_heads - config.n_kv_head = config.num_key_value_heads + config.n_kv_head = config.num_key_value_heads if hasattr( + config, 'num_key_value_heads') else config.n_head config.n_layer = config.num_hidden_layers config.n_positions = config.max_position_embeddings - config.activation_function = 'gelu' - config.layer_norm_epsilon = config.norm_epsilon - config.bias = config.use_bias + config.activation_function = 'gelu' if gpt_variant == 'starcoder2' else 'squared-relu' + config.layer_norm_epsilon = config.norm_epsilon if gpt_variant == 'starcoder2' else config.layer_norm_eps + config.bias = config.use_bias if gpt_variant == 'starcoder2' else True config.position_embedding_type = 'rope_gpt_neox' config.rotary_base = config.rope_theta - config.rotary_pct = 1.0 + config.rotary_pct = getattr(config, 'partial_rotary_factor', 1.0) else: if config.n_inner is None: config.n_inner = config.n_embd * 4 @@ -345,6 +351,9 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, for l in layers_range: if gpt_variant == 'starcoder2': prefix = f'model.layers.{l}' + elif gpt_variant == 'persimmon': + is_fuyu = f'language_model.model.embed_tokens.weight' in model_params + prefix = f'language_model.model.layers.{l}' if is_fuyu else f'model.layers.{l}' else: prefix = f'transformer.h.{l}' tllm_prex = f'transformer.layers.{l-layers_range[0]}' @@ -364,15 +373,30 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, f'{prefix}.self_attn.v_proj', dtype) qkv_w = torch.cat([q_w, k_w, v_w], dim=0) qkv_b = torch.cat([q_b, k_b, v_b], dim=0) + elif gpt_variant == 'persimmon': + qkv_w, qkv_b = get_weight_and_bias( + model_params, f'{prefix}.self_attn.query_key_value', dtype) else: qkv_w, qkv_b = get_weight_and_bias(model_params, f'{prefix}.attn.c_attn', dtype) if gpt_variant in ['gpt2', 'santacoder']: qkv_w = qkv_w.t().contiguous() # transpose for Conv1D - qkv_w = split_qkv(qkv_w, mapping.tp_rank, mapping.tp_size, hidden_size, - num_attention_heads, num_kv_heads) - qkv_b = split_qkv(qkv_b, mapping.tp_rank, mapping.tp_size, hidden_size, - num_attention_heads, num_kv_heads) + + if gpt_variant == 'persimmon': + qkv_w = split(qkv_w, + mapping.tp_rank, + mapping.tp_size, + is_column=True) + + qkv_b = split(qkv_b, + mapping.tp_rank, + mapping.tp_size, + is_column=True) + else: + qkv_w = split_qkv(qkv_w, mapping.tp_rank, mapping.tp_size, + hidden_size, num_attention_heads, num_kv_heads) + qkv_b = split_qkv(qkv_b, mapping.tp_rank, mapping.tp_size, + hidden_size, num_attention_heads, num_kv_heads) weights.update( get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', qkv_b, @@ -382,6 +406,9 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, if gpt_variant == 
'starcoder2': attn_dense_w, attn_dense_b = get_weight_and_bias( model_params, f'{prefix}.self_attn.o_proj', dtype) + elif gpt_variant == 'persimmon': + attn_dense_w, attn_dense_b = get_weight_and_bias( + model_params, f'{prefix}.self_attn.dense', dtype) else: attn_dense_w, attn_dense_b = get_weight_and_bias( model_params, f'{prefix}.attn.c_proj', dtype) @@ -396,8 +423,13 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, attn_dense_b, use_weight_only, plugin_weight_only_quant_type)) - mlp_fc_w, mlp_fc_b = get_weight_and_bias(model_params, - f'{prefix}.mlp.c_fc', dtype) + if gpt_variant == 'persimmon': + mlp_fc_w, mlp_fc_b = get_weight_and_bias( + model_params, f'{prefix}.mlp.dense_h_to_4h', dtype) + else: + mlp_fc_w, mlp_fc_b = get_weight_and_bias(model_params, + f'{prefix}.mlp.c_fc', + dtype) if gpt_variant in ['gpt2', 'santacoder']: mlp_fc_w = mlp_fc_w.t().contiguous() # transpose for Conv1D mlp_fc_w = split(mlp_fc_w, @@ -413,9 +445,12 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, use_weight_only, plugin_weight_only_quant_type)) - mlp_proj_w, mlp_proj_b = get_weight_and_bias(model_params, - f'{prefix}.mlp.c_proj', - dtype) + if gpt_variant == 'persimmon': + mlp_proj_w, mlp_proj_b = get_weight_and_bias( + model_params, f'{prefix}.mlp.dense_4h_to_h', dtype) + else: + mlp_proj_w, mlp_proj_b = get_weight_and_bias( + model_params, f'{prefix}.mlp.c_proj', dtype) if gpt_variant in ['gpt2', 'santacoder']: mlp_proj_w = mlp_proj_w.t().contiguous() # transpose for Conv1D mlp_proj_w = split(mlp_proj_w, @@ -427,7 +462,7 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, mlp_proj_b, use_weight_only, plugin_weight_only_quant_type)) - if gpt_variant == 'starcoder2': + if gpt_variant in ['starcoder2', 'persimmon']: input_ln_w, input_ln_b = get_weight_and_bias( model_params, f'{prefix}.input_layernorm', dtype) else: @@ -437,7 +472,7 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, if input_ln_b is not None: weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_b - if gpt_variant == 'starcoder2': + if gpt_variant in ['starcoder2', 'persimmon']: post_ln_w, post_ln_b = get_weight_and_bias( model_params, f'{prefix}.post_attention_layernorm', dtype) else: @@ -447,9 +482,26 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, if post_ln_b is not None: weights[f'{tllm_prex}.post_layernorm.bias'] = post_ln_b + if gpt_variant == 'persimmon': + q_layernorm_w, q_layernorm_b = get_weight_and_bias( + model_params, f'{prefix}.self_attn.q_layernorm', dtype) + + weights[f'{tllm_prex}.attention.q_layernorm.weight'] = q_layernorm_w + weights[f'{tllm_prex}.attention.q_layernorm.bias'] = q_layernorm_b + + k_layernorm_w, k_layernorm_b = get_weight_and_bias( + model_params, f'{prefix}.self_attn.k_layernorm', dtype) + + weights[f'{tllm_prex}.attention.k_layernorm.weight'] = k_layernorm_w + weights[f'{tllm_prex}.attention.k_layernorm.bias'] = k_layernorm_b + if mapping.is_first_pp_rank(): if gpt_variant == 'starcoder2': embed_w = get_weight(model_params, 'model.embed_tokens', dtype) + elif gpt_variant == 'persimmon': + embed_w = get_weight(model_params, + ('language_model.' 
if is_fuyu else '') + + 'model.embed_tokens', dtype) else: embed_w = get_weight(model_params, 'transformer.wte', dtype) weights['transformer.vocab_embedding.weight'] = split_embedding( @@ -473,6 +525,10 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, embed_w = get_weight(model_params, 'lm_head', dtype) if embed_w is None: embed_w = get_weight(model_params, 'model.embed_tokens', dtype) + elif gpt_variant == 'persimmon': + embed_w = get_weight(model_params, + ('language_model.' if is_fuyu else '') + + 'lm_head', dtype) else: embed_w = get_weight(model_params, 'transformer.wte', dtype) if not share_embedding_table: @@ -488,6 +544,10 @@ def convert_hf_gpt(hf_model: AutoModelForCausalLM, if gpt_variant == 'starcoder2': ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'model.norm', dtype) + elif gpt_variant == 'persimmon': + ln_f_w, ln_f_b = get_weight_and_bias( + model_params, ('language_model.' if is_fuyu else '') + + 'model.final_layernorm', dtype) else: ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f', dtype) @@ -1769,6 +1829,8 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): getattr(hf_config, 'rotary_base', 10000.0), 'rotary_scaling': getattr(hf_config, 'rotary_scaling', None), + 'qk_layernorm': + args.model_dir is not None and gpt_variant == 'persimmon' } with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 9949e34c8..2ade0b706 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/README.md b/examples/gptj/README.md index 0b66ac02a..3a826eb24 100644 --- a/examples/gptj/README.md +++ b/examples/gptj/README.md @@ -112,9 +112,9 @@ Building command is identical to the common one above. #### FP8 Post-Training Quantization -The examples below uses the NVIDIA AMMO (AlgorithMic Model Optimization) toolkit for the model quantization process. +The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. -First make sure AMMO toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) One can quantize HF GPT-J weights in FP8 as follows. @@ -180,7 +180,7 @@ Note `--context_fmha` / `--context_fmha_fp32_acc` has to be used together with ` #### INT8 KV cache INT8 KV cache could be enabled to reduce memory footprint. It will bring more performance gains when batch size gets larger. -You can get the INT8 scale of KV cache through AMMO: +You can get the INT8 scale of KV cache through Modelopt: ```bash # INT8 calibration diff --git a/examples/gptneox/README.md b/examples/gptneox/README.md index 9cb484830..ac1c4329e 100644 --- a/examples/gptneox/README.md +++ b/examples/gptneox/README.md @@ -167,7 +167,7 @@ sh gptq_convert.sh ### 3. Convert weights from HF Transformers to TensorRT-LLM format To apply groupwise quantization GPTQ, addition command-line flags need to be passed to `convert_checkpoint.py`: -Here `--ammo_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script. +Here `--modelopt_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script. 
```bash # Single GPU @@ -175,7 +175,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \ --dtype float16 \ --use_weight_only \ --weight_only_precision int4_gptq \ - --ammo_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ + --modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ --output_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/ # With 2-way Tensor Parallel python3 convert_checkpoint.py --model_dir ./gptneox_model \ @@ -184,7 +184,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \ --weight_only_precision int4_gptq \ --tp_size 2 \ --workers 2 \ - --ammo_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ + --modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ --output_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/ ``` diff --git a/examples/gptneox/convert_checkpoint.py b/examples/gptneox/convert_checkpoint.py index 07a30c267..b89390a48 100644 --- a/examples/gptneox/convert_checkpoint.py +++ b/examples/gptneox/convert_checkpoint.py @@ -50,7 +50,7 @@ def parse_arguments(): 'Define the precision for the weights when using weight-only quantization.' 'You must also use --use_weight_only for that argument to have an impact.' ) - parser.add_argument('--ammo_quant_ckpt_path', + parser.add_argument('--modelopt_quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint') @@ -707,7 +707,8 @@ def convert_hf_gptneox(hf_model, 'has_zero_point': True, 'group_size': - get_gptq_gptneox_group_size(args.ammo_quant_ckpt_path, hf_config) + get_gptq_gptneox_group_size(args.modelopt_quant_ckpt_path, + hf_config) }) with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: @@ -721,7 +722,7 @@ def covert_and_save(rank): if args.use_weight_only and args.weight_only_precision == 'int4_gptq': weights = load_from_gptq_gptneox( - args.ammo_quant_ckpt_path, + args.modelopt_quant_ckpt_path, hf_config, use_parallel_embedding=args.use_parallel_embedding, sharding_dim=args.embedding_sharding_dim, diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index c41409253..db72e9e63 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/high-level-api/README.md b/examples/high-level-api/README.md index 82432fe51..540810470 100644 --- a/examples/high-level-api/README.md +++ b/examples/high-level-api/README.md @@ -13,7 +13,14 @@ pip install -r requirements.txt You can refer to [llm_examples.py](llm_examples.py) for all of the examples, and run it with the [run_examples.py](./run_examples.py) script, the command is as follows: ```sh -python3 ./run_examples.py +# To run examples with single GPU: +python3 ./run_examples.py run_single_gpu --model_dir + +# Run the multi-GPU examples +python3 ./run_examples.py run_multi_gpu --model_dir + +# Run the quantization examples +python3 ./run_examples.py run_quant --model_dir ``` For 7B, 13B models those could be held in a single GPU, it should run all the examples automatically and print the results. 
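For orientation, here is a minimal sketch of the high-level API calls these example commands exercise, based on the `llm_examples.py` code changed in this patch; the model path is a placeholder and exact behavior depends on the installed `tensorrt_llm` version.

```python
# Minimal sketch of the high-level API used by llm_examples.py; the path is a placeholder.
from tensorrt_llm import LLM, ModelConfig

config = ModelConfig("/path/to/llama")     # HF model dir, TRT-LLM checkpoint, or built engine
llm = LLM(config)
llm.save("/tmp/hlapi.engine.example")      # optionally dump the built engine for later reuse

for output in llm.generate(["What is LLM?"]):
    print(output)
```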
diff --git a/examples/high-level-api/llm_examples.py b/examples/high-level-api/llm_examples.py index eb4a632f0..33433bc4f 100644 --- a/examples/high-level-api/llm_examples.py +++ b/examples/high-level-api/llm_examples.py @@ -1,13 +1,12 @@ #!/usr/bin/env python3 import asyncio -import inspect -import sys -from argparse import ArgumentParser -from typing import List, Optional +import os +from typing import List, Optional, Union +import click import torch -from tensorrt_llm import LLM, ModelConfig, logger +from tensorrt_llm import LLM, ModelConfig from tensorrt_llm.hlapi.llm import KvCacheConfig, SamplingConfig from tensorrt_llm.hlapi.utils import get_device_count from tensorrt_llm.quantization import QuantAlgo @@ -15,62 +14,101 @@ # NOTE, Currently, the following examples are only available for LLaMA models. -def run_llm_from_huggingface_model(prompts: List[str], - llama_model_dir: str, - dump_engine_dir: Optional[str] = None, - tp_size: int = 1): - ''' Loading a HuggingFace model. ''' - if get_device_count() < tp_size: +@click.group() +def cli(): + pass + + +@click.command('run_llm_generate') +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--engine_dir', + type=str, + help='The directory of the engine.', + default=None) +@click.option('--tp_size', + type=int, + default=1, + help='The number of GPUs for Tensor Parallel.') +@click.option('--pp_size', + type=int, + default=1, + help='The number of GPUs for Pipeline Parallel.') +@click.option('--prompt_is_digit', + type=bool, + default=False, + help='Whether the prompt is a list of integers.') +def run_llm_generate( + prompt: str, + model_dir: str, + engine_dir: Optional[str] = None, + tp_size: int = 1, + pp_size: int = 1, + prompt_is_digit: bool = False, + end_id: int = 2, +): + ''' Running LLM with arbitrary model formats including: + - HF model + - TRT-LLM checkpoint + - TRT-LLM engine + + It will dump the engine to `engine_dir` if specified. + + Args: + prompts: A list of prompts. Each prompt can be either a string or a list of integers when tokenizer is disabled. + model_dir: The directory of the model. + engine_dir: The directory of the engine, if specified different than model_dir then it will save the engine to `engine_dir`. + tp_size: The number of GPUs for Tensor Parallel. + pp_size: The number of GPUs for Pipeline Parallel. + ''' + + config = ModelConfig(model_dir) + # Avoid the tp_size and pp_size setting override the ones loaded from built engine + if tp_size > 1: config.parallel_config.tp_size = tp_size + if pp_size > 1: config.parallel_config.pp_size = pp_size + + if get_device_count() < config.parallel_config.world_size: print( "Skip the example for TP!!! Since the number of GPUs is less than required" ) return - if tp_size > 1: + if config.parallel_config.world_size > 1: print(f'Running LLM with Tensor Parallel on {tp_size} GPUs.') - config = ModelConfig(llama_model_dir) - config.parallel_config.tp_size = tp_size - llm = LLM(config) - if dump_engine_dir: - llm.save(dump_engine_dir) - for output in llm.generate(prompts): - print(output) - - -def run_llm_from_tllm_engine(prompts: List[str], - llama_engine_dir: str, - tp_size: int = 1): - ''' Loading a built TensorRT-LLM engine. 
''' + if engine_dir and os.path.abspath(model_dir) != os.path.abspath(engine_dir): + print(f"Saving engine to {engine_dir}...") + llm.save(engine_dir) - config = ModelConfig(llama_engine_dir) - config.parallel_config.tp_size = tp_size - llm = LLM(config) + prompts = parse_prompts(prompt, prompt_is_digit) - for output in llm.generate(prompts): - print(output) - - -def run_llm_without_tokenizer_from_engine_or_ckpt(engine_or_ckpt_dir: str): - ''' Loading a TensorRT-LLM engine built by trtllm-build or a TensorRT-LLM checkpoint generated by convert_checkpoint.py, and the tokenizer is missing too. ''' - - config = ModelConfig(engine_or_ckpt_dir) - llm = LLM(config) - - # since tokenizer is missing, so we cannot get a default sampling config, create one manually - sampling_config = SamplingConfig(end_id=2, pad_id=2) - - prompts = [[23, 14, 3]] + sampling_config = SamplingConfig(end_id=end_id, + pad_id=end_id) if prompt_is_digit else None for output in llm.generate(prompts, sampling_config=sampling_config): - print(output) - - -def run_llm_generate_async_example(prompts: List[str], - llama_model_dir: str, + print("OUTPUT:", output) + + +@click.command('run_llm_generate_async_example') +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--streaming', + is_flag=True, + help='Whether to enable streaming generation.') +@click.option('--tp_size', + type=int, + default=1, + help='The number of GPUs for Tensor Parallel.') +@click.option('--pp_size', + type=int, + default=1, + help='The number of GPUs for Pipeline Parallel.') +def run_llm_generate_async_example(prompt: str, + model_dir: str, streaming: bool = False, - tp_size: int = 1): + tp_size: int = 1, + pp_size: int = 1): ''' Running LLM generation asynchronously. ''' if get_device_count() < tp_size: @@ -81,11 +119,14 @@ def run_llm_generate_async_example(prompts: List[str], if tp_size > 1: print(f'Running LLM with Tensor Parallel on {tp_size} GPUs.') - config = ModelConfig(llama_model_dir) - config.parallel_config.tp_size = tp_size + config = ModelConfig(model_dir) + # Avoid the tp_size and pp_size setting override the ones loaded from built engine + if tp_size > 1: config.parallel_config.tp_size = tp_size + if pp_size > 1: config.parallel_config.pp_size = pp_size llm = LLM(config, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4)) + prompts = parse_prompts(prompt, False) async def task(prompt: str): outputs = [] @@ -100,9 +141,14 @@ async def main(): asyncio.run(main()) -def run_llm_with_quantization(prompts: List[str], - llama_model_dir: str, - quant_type: str = 'int4_awq'): +@click.command('run_llm_with_quantization') +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--quant_type', + type=str, + default='int4_awq', + help='The quantization type.') +def run_llm_with_quantization(prompt: str, model_dir: str, quant_type: str): ''' Running LLM with quantization. quant_type could be 'int4_awq' or 'fp8'. 
''' @@ -117,7 +163,7 @@ def run_llm_with_quantization(prompts: List[str], print("Hopper GPUs are required for fp8 quantization") return - config = ModelConfig(llama_model_dir) + config = ModelConfig(model_dir) if quant_type == 'int4_awq': config.quant_config.quant_algo = QuantAlgo.W4A16_AWQ else: @@ -126,16 +172,21 @@ def run_llm_with_quantization(prompts: List[str], config.quant_config.exclude_modules = ["lm_head"] llm = LLM(config) + prompts = parse_prompts(prompt, False) for output in llm.generate(prompts): print(output) -def run_llm_with_async_future(prompts: List[str], llama_model_dir: str): - config = ModelConfig(llama_model_dir) +@click.command('run_llm_with_async_future') +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +def run_llm_with_async_future(prompt: str, model_dir: str): + config = ModelConfig(model_dir) llm = LLM(config, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4)) + prompts = parse_prompts(prompt) # The result of generate() is similar to a Future, it won't block the main thread, call .result() to explicitly wait for the result for generation in llm.generate_async(prompts): # .result() is a blocking call, call it when you want to wait for the result @@ -155,8 +206,15 @@ async def main(): asyncio.run(main()) -def run_llm_with_auto_parallel(prompts: List[str], - llama_model_dir: str, +@click.command('run_llm_with_auto_parallel') +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--world_size', + type=int, + default=1, + help='The number of GPUs for Auto Parallel.') +def run_llm_with_auto_parallel(prompt: str, + model_dir: str, world_size: int = 1): ''' Running LLM with auto parallel enabled. ''' if get_device_count() < world_size: @@ -167,115 +225,29 @@ def run_llm_with_auto_parallel(prompts: List[str], if world_size > 1: print(f'Running LLM with Auto Parallel on {world_size} GPUs.') - config = ModelConfig(llama_model_dir) + config = ModelConfig(model_dir) config.parallel_config.auto_parallel = True config.parallel_config.world_size = world_size llm = LLM(config) + prompts = parse_prompts(prompt) for output in llm.generate(prompts): print(output) -def run_llm_with_auto_parallel_async(prompts: List[str], - llama_model_dir: str, - world_size: int = 1, - streaming: bool = False): - ''' Running LLM asynchronously with auto parallel enabled. ''' - if get_device_count() < world_size: - print( - "Skip the example for auto parallel!!! 
Since the number of GPUs is less than required" - ) - return - if world_size > 1: - print(f'Running LLM with Auto Parallel on {world_size} GPUs.') - - config = ModelConfig(llama_model_dir) - config.parallel_config.auto_parallel = True - config.parallel_config.world_size = world_size - - llm = LLM(config) - - async def task(prompt: str): - outputs = [] - async for output in llm.generate_async(prompt, streaming=streaming): - outputs.append(output.text) - print(' '.join(outputs)) - - async def main(): - tasks = [task(prompt) for prompt in prompts] - await asyncio.gather(*tasks) - - asyncio.run(main()) - - -def _parse_arguments(): - parser = ArgumentParser() - parser.add_argument('--task', type=str, choices=_get_functions()) - parser.add_argument('--hf_model_dir', - type=str, - help='The directory of the model.') - parser.add_argument('--dump_engine_dir', - type=str, - help='The directory to dump the engine.', - default=None) - parser.add_argument('--ckpt_dir', - type=str, - help='The directory of the TRT-LLM checkpoint.', - default=None) - parser.add_argument('--quant_type', type=str, choices=['int4_awq', 'fp8']) - parser.add_argument('--prompt', type=str, default="What is LLM?") - parser.add_argument('--world_size', type=int, default=1) - parser.add_argument('--tp_size', type=int, default=1) - parser.add_argument('--streaming', action='store_true') - parser.add_argument('--log_level', type=str, default='info') - return parser.parse_args() - - -def _get_functions(): - cur_module = sys.modules[__name__] - function_names = [ - name for name, _ in inspect.getmembers(cur_module, inspect.isfunction) - if not name.startswith('_') - ] - return function_names +def parse_prompts(prompt: str, is_digit: bool = False) -> Union[str, List[int]]: + ''' Process a single prompt. 
''' + if is_digit: + return [[int(i) for i in prompt.split()]] + else: + return [prompt] if __name__ == '__main__': - args = _parse_arguments() - logger.set_level(args.log_level) - assert args.dump_engine_dir is None or args.ckpt_dir is None - engine_or_ckpt_dir = args.dump_engine_dir or args.ckpt_dir - tasks = dict( - run_llm_from_huggingface_model=lambda: run_llm_from_huggingface_model( - [args.prompt], - args.hf_model_dir, - args.dump_engine_dir, - tp_size=args.tp_size), - run_llm_from_tllm_engine=lambda: run_llm_from_tllm_engine( - [args.prompt], - args.dump_engine_dir, - tp_size=args.tp_size, - ), - run_llm_generate_async_example=lambda: run_llm_generate_async_example( - [args.prompt], - args.hf_model_dir, - tp_size=args.tp_size, - streaming=args.streaming), - run_llm_with_quantization=lambda: run_llm_with_quantization( - [args.prompt], args.hf_model_dir, args.quant_type), - run_llm_with_auto_parallel=lambda: run_llm_with_auto_parallel( - [args.prompt], args.hf_model_dir, args.world_size), - run_llm_with_auto_parallel_async=lambda: - run_llm_with_auto_parallel_async([args.prompt], - args.hf_model_dir, - args.world_size, - streaming=args.streaming), - run_llm_without_tokenizer_from_engine_or_ckpt=lambda: - run_llm_without_tokenizer_from_engine_or_ckpt(engine_or_ckpt_dir), - run_llm_with_async_future=lambda: run_llm_with_async_future( - [args.prompt], args.hf_model_dir)) - - print(f'Running {args.task} ...') - - tasks[args.task]() + cli.add_command(run_llm_generate) + cli.add_command(run_llm_generate_async_example) + cli.add_command(run_llm_with_quantization) + cli.add_command(run_llm_with_async_future) + cli.add_command(run_llm_with_auto_parallel) + cli() diff --git a/examples/high-level-api/requirements.txt b/examples/high-level-api/requirements.txt index c598c3720..eff840a1e 100644 --- a/examples/high-level-api/requirements.txt +++ b/examples/high-level-api/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 diff --git a/examples/high-level-api/run_auto_parallel_examples.sh b/examples/high-level-api/run_auto_parallel_examples.sh deleted file mode 100644 index 46bbc24c3..000000000 --- a/examples/high-level-api/run_auto_parallel_examples.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -set -ex - -PROMPT="Tell a story" -LLAMA_MODEL_DIR=$1 -WORLD_SIZE=${2:-2} - -dir=$(dirname "$0") - -python3 $dir/llm_examples.py --task run_llm_with_auto_parallel \ - --prompt="$PROMPT" \ - --world_size=$WORLD_SIZE \ - --hf_model_dir=$LLAMA_MODEL_DIR - -python3 $dir/llm_examples.py --task run_llm_with_auto_parallel_async \ - --prompt="$PROMPT" \ - --world_size=$WORLD_SIZE \ - --hf_model_dir=$LLAMA_MODEL_DIR diff --git a/examples/high-level-api/run_examples.py b/examples/high-level-api/run_examples.py index 8685a95e2..13e18abb8 100644 --- a/examples/high-level-api/run_examples.py +++ b/examples/high-level-api/run_examples.py @@ -1,45 +1,121 @@ #!/usr/bin/env python -import os import subprocess import sys -PROMPT = "Tell a story" -LLAMA_MODEL_DIR = sys.argv[1] -TMP_ENGINE_DIR = sys.argv[2] if len(sys.argv) > 2 else "./tllm.engine.example" -EXAMPLES_ROOT = sys.argv[3] if len(sys.argv) > 3 else "" -LLM_EXAMPLES = os.path.join(EXAMPLES_ROOT, 'llm_examples.py') - -run_cmd = [ - sys.executable, LLM_EXAMPLES, "--task=run_llm_from_huggingface_model", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", - f"--dump_engine_dir={TMP_ENGINE_DIR}" -] -subprocess.run(run_cmd, check=True) - -# TP enabled -run_cmd = [ - 
sys.executable, LLM_EXAMPLES, "--task=run_llm_from_huggingface_model", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", "--tp_size=2" -] -subprocess.run(run_cmd, check=True) - -run_cmd = [ - sys.executable, LLM_EXAMPLES, "--task=run_llm_from_tllm_engine", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", - f"--dump_engine_dir={TMP_ENGINE_DIR}" -] -subprocess.run(run_cmd, check=True) - -run_cmd = [ - sys.executable, LLM_EXAMPLES, "--task=run_llm_generate_async_example", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}" -] -subprocess.run(run_cmd, check=True) - -# Both TP and streaming enabled -run_cmd = [ - sys.executable, LLM_EXAMPLES, "--task=run_llm_generate_async_example", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", "--streaming", - "--tp_size=2" -] -subprocess.run(run_cmd, check=True) +from llm_examples import * + +from tensorrt_llm.hlapi.utils import print_colored + + +@click.group() +def cli(): + pass + + +@click.command('run_single_gpu') +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--examples_root', + type=str, + help='The root directory of the examples.') +@click.option('--llm_examples', + type=str, + help='The path to the llm_examples.py.', + default='llm_examples.py') +@click.option('--engine_dir', + type=str, + help='The directory of the engine.', + default="/tmp/hlapi.engine.example") +def run_single_gpu( + prompt: str, + model_dir: str, + examples_root: str, + llm_examples: str, + engine_dir: str, +): + run_example( + "Running LLM from HuggingFace model", + f"{sys.executable} {llm_examples} run_llm_generate --prompt=\"{prompt}\" --model_dir={model_dir} --engine_dir={engine_dir}" + ) + + run_example( + "Running LLM from built engine with streaming enabled", + f"{sys.executable} {llm_examples} run_llm_generate_async_example --prompt=\"{prompt}\" --model_dir={engine_dir} --streaming" + ) + + run_example( + "Running LLM with async future", + f"{sys.executable} {llm_examples} run_llm_with_async_future --prompt=\"{prompt}\" --model_dir={engine_dir}" + ) + + +@click.command("run_multi_gpu") +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--examples_root', + type=str, + help='The root directory of the examples.') +@click.option('--llm_examples', + type=str, + help='The path to the llm_examples.py.', + default='llm_examples.py') +@click.option('--engine_dir', + type=str, + help='The directory of the engine.', + default="/tmp/hlapi.engine.example") +def run_multi_gpu( + prompt: str, + model_dir: str, + examples_root: str, + llm_examples: str, + engine_dir: str, +): + run_example( + "Running LLM from HuggingFace model with TP enabled", + f"{sys.executable} {llm_examples} run_llm_generate --prompt=\"{prompt}\" --model_dir={model_dir} --tp_size=2 --engine_dir={engine_dir}.tp2" + ) + + run_example( + "Running LLM from built engine with streaming enabled and TP=2", + f"{sys.executable} {llm_examples} run_llm_generate_async_example --prompt=\"{prompt}\" --model_dir={engine_dir}.tp2 --streaming" + ) # Loading the engine with TP=2. 
+ + run_example( + "Running LLM with auto parallel", + f"{sys.executable} {llm_examples} run_llm_with_auto_parallel --prompt=\"{prompt}\" --model_dir={model_dir} --world_size=2" + ) + + +@click.command("run_quant") +@click.option('--prompt', type=str, default="What is LLM?") +@click.option('--model_dir', type=str, help='The directory of the model.') +@click.option('--examples_root', + type=str, + help='The root directory of the examples.') +@click.option('--llm_examples', + type=str, + help='The path to the llm_examples.py.', + default='llm_examples.py') +def run_quant( + prompt: str, + model_dir: str, + examples_root: str, + llm_examples: str, +): + run_example( + "Running LLM with quantization", + f"{sys.executable} {llm_examples} run_llm_with_quantization --quant_type=int4_awq --prompt=\"{prompt}\" --model_dir={model_dir}" + ) + + +def run_example(hint: str, command: str): + print_colored(hint + "\n", "bold_green") + print(command) + subprocess.run(command, shell=True, check=True) + + +if __name__ == '__main__': + cli.add_command(run_single_gpu) + cli.add_command(run_multi_gpu) + cli.add_command(run_quant) + cli() diff --git a/examples/high-level-api/run_quant_examples.py b/examples/high-level-api/run_quant_examples.py deleted file mode 100644 index 3cead2d95..000000000 --- a/examples/high-level-api/run_quant_examples.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -import os -import subprocess -import sys - -PROMPT = "Tell a story" -LLAMA_MODEL_DIR = sys.argv[1] -EXAMPLES_ROOT = sys.argv[2] if len(sys.argv) > 2 else "" -LLM_EXAMPLES = os.path.join(EXAMPLES_ROOT, 'llm_examples.py') - -run_cmd = [ - sys.executable, LLM_EXAMPLES, "--task=run_llm_with_quantization", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", - "--quant_type=int4_awq" -] -subprocess.run(run_cmd, check=True) - -run_cmd = [ - sys.executable, LLM_EXAMPLES, "--task=run_llm_with_quantization", - f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", - "--quant_type=fp8" -] -subprocess.run(run_cmd, check=True) diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 8cb1e1e60..84ff0df40 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets==2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/llama/README.md b/examples/llama/README.md index 7039f4758..7febdbf45 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -8,6 +8,7 @@ This document shows how to build and run a LLaMA model in TensorRT-LLM on both s - [Usage](#usage) - [Build TensorRT engine(s)](#build-tensorrt-engines) - [LLaMA v2 Updates](#llama-v2-updates) + - [LLaMA v3 Updates](#llama-v3-updates) - [Using RoPE Scaling](#using-rope-scaling) - [Long context length](#long-context-length) - [INT8 KV cache](#int8-kv-cache) @@ -66,7 +67,7 @@ TensorRT-LLM LLaMA builds TensorRT engine(s) from HF checkpoint. If no checkpoin Normally `trtllm-build` only requires single GPU, but if you've already got all the GPUs needed for inference, you could enable parallel building to make the engine building process faster by adding `--workers` argument. Please note that currently `workers` feature only supports single node. -`--use_fused_mlp` enables GEMM horizontal fusion in gated MLP layer, which reduces input traffic and potentially improves performance. 
For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded (accuracy 0.45734 vs 0.45755 for LLaMA-v2 7B using ammo/examples/hf/instruct_eval/mmlu.py).
+`--use_fused_mlp` enables GEMM horizontal fusion in the gated MLP layer, which reduces input traffic and potentially improves performance. For FP8 PTQ, the downside is a slight reduction of accuracy because one of the quantization scaling factors is discarded (accuracy 0.45734 vs 0.45755 for LLaMA-v2 7B using modelopt/examples/hf/instruct_eval/mmlu.py).

Here're some examples:

@@ -190,6 +191,69 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_8gpu_tp8 \

Same instructions can be applied to fine-tuned versions of the LLaMA v2 models (e.g. 7Bf or llama-2-7b-chat).

+#### LLaMA v3 Updates
+The LLaMA v3 8B and 70B models are compatible with the LLaMA v2 implementation. The above
+commands still work.
+
+Note that the `rope_theta` and `vocab_size` are larger in LLaMA v3 models and these values are now inferred
+or picked up from the `params.json` when using the `meta_ckpt_dir`.
+
+```bash
+# Build LLaMA v3 8B TP=1 using HF checkpoints directly.
+python convert_checkpoint.py --model_dir ./tmp/llama/8B/hf/ \
+                             --output_dir ./tllm_checkpoint_1gpu_tp1 \
+                             --dtype float16 \
+                             --tp_size 1
+
+trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_tp1 \
+             --output_dir ./tmp/llama/8B/trt_engines/fp16/1-gpu/ \
+             --gemm_plugin float16
+
+# Build LLaMA v3 8B TP=1 using Meta checkpoints directly.
+python convert_checkpoint.py --meta_ckpt_dir ./tmp/llama/8B/ \
+                             --output_dir ./tllm_checkpoint_1gpu_tp1 \
+                             --dtype float16 \
+                             --tp_size 1
+
+trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_tp1 \
+             --output_dir ./tmp/llama/8B/trt_engines/fp16/1-gpu/ \
+             --gemm_plugin float16
+
+# Build LLaMA v3 70B using 8-way tensor parallelism.
+python convert_checkpoint.py --model_dir ./tmp/llama/70B/hf/ \
+                             --output_dir ./tllm_checkpoint_8gpu_tp8 \
+                             --dtype float16 \
+                             --tp_size 8
+
+trtllm-build --checkpoint_dir ./tllm_checkpoint_8gpu_tp8 \
+             --output_dir ./tmp/llama/70B/trt_engines/fp16/8-gpu/ \
+             --gemm_plugin float16
+
+# Build LLaMA v3 70B using 4-way tensor parallelism and 2-way pipeline parallelism.
+python convert_checkpoint.py --model_dir ./tmp/llama/70B/hf/ \
+                             --output_dir ./tllm_checkpoint_8gpu_tp4_pp2 \
+                             --dtype float16 \
+                             --tp_size 4 \
+                             --pp_size 2
+
+trtllm-build --checkpoint_dir ./tllm_checkpoint_8gpu_tp4_pp2 \
+             --output_dir ./tmp/llama/70B/trt_engines/fp16/8-gpu/ \
+             --gemm_plugin float16
+
+# Build LLaMA v3 70B TP=8 using Meta checkpoints directly.
+python convert_checkpoint.py --meta_ckpt_dir ./tmp/llama/70B/ \
+                             --output_dir ./tllm_checkpoint_8gpu_tp8 \
+                             --dtype float16 \
+                             --tp_size 8
+
+trtllm-build --checkpoint_dir ./tllm_checkpoint_8gpu_tp8 \
+             --output_dir ./tmp/llama/70B/trt_engines/fp16/8-gpu/ \
+             --gemm_plugin float16
+```
+
+The same instructions can be applied to fine-tuned versions of the LLaMA v3 models.
+
+
### Using RoPE Scaling
RoPE scaling is supported through GPT Attention Plugin. You can add `--rotary_scaling ` during the build command to enable it.
- The value of `type` can be either `linear` and `dynamic`.
@@ -369,9 +433,9 @@ trtllm-build --checkpoint_dir /tmp/tllm_checkpoint_1gpu_sq \

#### FP8 Post-Training Quantization

-The examples below uses the NVIDIA AMMO (AlgorithMic Model Optimization) toolkit for the model quantization process.
+The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. -First make sure AMMO toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) ```bash @@ -403,7 +467,7 @@ One can enable AWQ/GPTQ INT4 weight only quantization with these options when bu - `--use_weight_only` enables weight only GEMMs in the network. - `--per_group` enable groupwise weight only quantization, for GPT-J example, we support AWQ with the group size default as 128. - `--weight_only_precision` should specify the weight only quantization format. Supported formats are `int4_awq` or `int4_gptq`. -- `--ammo_quant_ckpt_path` passes the quantized checkpoint to build the engine. +- `--modelopt_quant_ckpt_path` passes the quantized checkpoint to build the engine. AWQ/GPTQ examples below involves 2 steps: 1. Weight quantization @@ -412,7 +476,7 @@ AWQ/GPTQ examples below involves 2 steps: ##### AWQ 1. Weight quantization: - NVIDIA AMMO toolkit is used for AWQ weight quantization. Please see [examples/quantization/README.md](/examples/quantization/README.md#preparation) for AMMO installation instructions. + NVIDIA Modelopt toolkit is used for AWQ weight quantization. Please see [examples/quantization/README.md](/examples/quantization/README.md#preparation) for Modelopt installation instructions. ```bash # Quantize HF LLaMA 7B checkpoint into INT4 AWQ format @@ -459,7 +523,7 @@ To run the GPTQ LLaMa example, the following steps are required: python convert_checkpoint.py --model_dir /tmp/llama-7b-hf \ --output_dir ./tllm_checkpoint_2gpu_gptq \ --dtype float16 \ - --ammo_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \ + --modelopt_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \ --use_weight_only \ --weight_only_precision int4_gptq \ --per_group \ diff --git a/examples/llama/convert_checkpoint.py b/examples/llama/convert_checkpoint.py index 04c2ca27d..2ead48fa0 100644 --- a/examples/llama/convert_checkpoint.py +++ b/examples/llama/convert_checkpoint.py @@ -99,7 +99,7 @@ def parse_arguments(): 'By default, we use dtype for KV cache. 
int8_kv_cache chooses int8 quantization for KV' ) parser.add_argument( - '--ammo_quant_ckpt_path', + '--modelopt_quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint in .npz format') @@ -388,7 +388,7 @@ def convert_and_save_gptq(args, rank): mapping=mapping, quantization=args_to_quantization(args), skip_loading_weights=True) - weights = load_from_gptq_llama(llama.config, args.ammo_quant_ckpt_path) + weights = load_from_gptq_llama(llama.config, args.modelopt_quant_ckpt_path) llama.load(weights) llama.save_checkpoint(args.output_dir, rank == 0) @@ -432,11 +432,11 @@ def main(): execute(args.workers, [convert_and_save_meta] * world_size, args) elif args.weight_only_precision == 'int4_gptq': assert args.model_dir is not None - assert args.ammo_quant_ckpt_path is not None + assert args.modelopt_quant_ckpt_path is not None execute(args.workers, [convert_and_save_gptq] * world_size, args) else: # all other non-gptq paths from hf model assert args.model_dir is not None - assert args.ammo_quant_ckpt_path is None, "only gptq weights only needs this option" + assert args.modelopt_quant_ckpt_path is None, "only gptq weights only needs this option" convert_and_save_hf(args) tok = time.time() diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 55fc46c1c..74ed83f02 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index c598c3720..eff840a1e 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 diff --git a/examples/medusa/README.md b/examples/medusa/README.md index 9a151759b..8d915de22 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -44,6 +44,7 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \ --output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \ --gemm_plugin float16 \ + --speculative_decoding_mode medusa \ --max_batch_size 8 # Convert and Build Medusa decoding support for vicuna-13b-v1.3 with 4-way tensor parallelism. @@ -58,6 +59,7 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \ --output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \ --gemm_plugin float16 \ + --speculative_decoding_mode medusa \ --max_batch_size 8 ``` diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index fb4cde001..476ce8250 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -102,7 +102,7 @@ def parse_arguments(): 'By default, we use dtype for KV cache. 
int8_kv_cache chooses int8 quantization for KV' ) parser.add_argument( - '--ammo_quant_ckpt_path', + '--modelopt_quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint in .npz format') diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index 5dda27046..22b957fb6 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md index c61349709..e0c41296e 100644 --- a/examples/mixtral/README.md +++ b/examples/mixtral/README.md @@ -116,7 +116,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \ ### FP8 Post-Training Quantization -Mixtral supports FP8 quantization, using AMMO. See [`examples/llama/README.md`](../llama/README.md#fp8-post-training-quantization) for full details on installing AMMO +Mixtral supports FP8 quantization, using Modelopt. See [`examples/llama/README.md`](../llama/README.md#fp8-post-training-quantization) for full details on installing Modelopt ```bash # Quantize HF Mixtral into FP8 and export trtllm checkpoint diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 952089319..35e75a1c8 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/mpt/README.md b/examples/mpt/README.md index af37380dc..ea05b002a 100644 --- a/examples/mpt/README.md +++ b/examples/mpt/README.md @@ -10,10 +10,10 @@ This document explains how to build the [MPT](https://huggingface.co/mosaicml/mp - [1.2 Convert from HF Transformers with weight-only quantization](#12-convert-from-hf-transformers-with-weight-only-quantization) - [1.3 Convert from HF Transformers with SmoothQuant quantization](#13-convert-from-hf-transformers-with-smoothquant-quantization) - [1.4 Convert from HF Transformers with INT8 KV cache quantization](#14-convert-from-hf-transformers-with-int8-kv-cache-quantization) - - [1.5 AWQ weight-only quantization with AMMO](#15-awq-weight-only-quantization-with-ammo) - - [1.6 FP8 Post-Training Quantization with AMMO](#16-fp8-post-training-quantization-with-ammo) - - [1.6 Weight-only quantization with AMMO](#16-weight-only-quantization-with-ammo) - - [1.7 SmoothQuant and INT8 KV cache with AMMO](#17-smoothquant-and-int8-kv-cache-with-ammo) + - [1.5 AWQ weight-only quantization with Modelopt](#15-awq-weight-only-quantization-with-modelopt) + - [1.6 FP8 Post-Training Quantization with Modelopt](#16-fp8-post-training-quantization-with-modelopt) + - [1.6 Weight-only quantization with Modelopt](#16-weight-only-quantization-with-modelopt) + - [1.7 SmoothQuant and INT8 KV cache with Modelopt](#17-smoothquant-and-int8-kv-cache-with-modelopt) - [2.1 Build TensorRT engine(s)](#21-build-tensorrt-engines) - [MPT 30B](#mpt-30b) - [1. 
Convert weights from HF Transformers to TRTLLM format](#1-convert-weights-from-hf-transformers-to-trtllm-format) @@ -91,34 +91,34 @@ python convert_checkpoint.py --model_dir mosaicml/mpt-7b --output_dir ./ckpts/mp ***INT8-KV-cache can be used with SQ and Weight-only at the same time*** -***We now introduce AMMO to do all quantization*** -First make sure AMMO toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) +***We now introduce Modelopt to do all quantization*** +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) -#### 1.5 AWQ weight-only quantization with AMMO +#### 1.5 AWQ weight-only quantization with Modelopt ```bash -# INT4 AWQ quantization using AMMO. +# INT4 AWQ quantization using Modelopt. python ../quantization/quantize.py --model_dir mosaicml/mpt-7b --output_dir ./ckpts/mpt-7b/int4_awq/ --qformat int4_awq ``` -#### 1.6 FP8 Post-Training Quantization with AMMO +#### 1.6 FP8 Post-Training Quantization with Modelopt ```bash -# FP8 quantization using AMMO. +# FP8 quantization using Modelopt. python ../quantization/quantize.py --model_dir mosaicml/mpt-7b --output_dir ./ckpts/mpt-7b/fp8/ --qformat fp8 --kv_cache_dtype fp8 ``` -#### 1.6 Weight-only quantization with AMMO +#### 1.6 Weight-only quantization with Modelopt ```bash -# INT8 Weight-only quantization using AMMO with TP=2. +# INT8 Weight-only quantization using Modelopt with TP=2. python ../quantization/quantize.py --model_dir mosaicml/mpt-7b --output_dir ./ckpts/mpt-7b/int8_wo/ --qformat int8_wo --tp_size 2 -# INT4 Weight-only quantization using AMMO. +# INT4 Weight-only quantization using Modelopt. python ../quantization/quantize.py --model_dir mosaicml/mpt-7b --output_dir ./ckpts/mpt-7b/int4_wo/ --qformat int4_wo ``` -#### 1.7 SmoothQuant and INT8 KV cache with AMMO +#### 1.7 SmoothQuant and INT8 KV cache with Modelopt ```bash # Use int4 awq quantization. @@ -129,7 +129,7 @@ python ../quantization/quantize.py --model_dir mosaicml/mpt-7b --output_dir ./ck ### 2.1 Build TensorRT engine(s) -All of the checkpoint generated by `convert_checkpoint.py` or `quantize.py` (AMMO) can share the same building commands. +All of the checkpoint generated by `convert_checkpoint.py` or `quantize.py` (Modelopt) can share the same building commands. ```bash # Build a single-GPU float16 engine using TRTLLM checkpoints. diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 3c2b69615..c635af18f 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index 7d581dbfb..439c35373 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -5,10 +5,13 @@ This document shows how to run multimodal pipelines with TensorRT-LLM, e.g. from Multimodal models' LLM part has an additional parameter `--max_multimodal_len` compared to LLM-only build commands. Under the hood, `max_multimodal_len` and `max_prompt_embedding_table_size` are effectively the same concept, i.e., prepended/concatenated embeddings (either multimodal feature embeddings or prompt tuning embeddings) to the LLM input embeddings. 
The multimodal features from the visual encoder of shape `[batch_size, num_visual_features, visual_hidden_dim]` is flattened as `[batch_size * num_visual_features, visual_hidden_dim]` and passed like a prompt embedding table. -We first describe how to run each model on a single GPU. We then provide general guidelines on using tensor parallelism for LLM part of the pipeline. +We first describe how to run each model on a single GPU. We then provide general guidelines on using tensor parallelism for the LLM part of the pipeline. - [BLIP2-T5](#blip2-t5) - [BLIP2-OPT](#blip2-opt) +- [CogVLM](#cogvlm) +- [Deplot](#deplot) +- [Fuyu](#fuyu) - [LLaVA and VILA](#llava-and-vila) - [Nougat](#nougat) - [Enabling tensor parallelism for multi-GPU](#enabling-tensor-parallelism-for-multi-gpu) @@ -75,7 +78,7 @@ We first describe how to run each model on a single GPU. We then provide general The built T5 engines are located in `./tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16/tp1`. -3. Build TensorRT engines for visual components +3. Build TensorRT engines for visual components ```bash python build_visual_engine.py --model_type ${MODEL_NAME} --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 8 @@ -169,12 +172,14 @@ OPT pipeline needs few minor changes from T5 pipeline unlike BLIP2 example which downloads only LLM components from Huggingface. For LLaVA, + ```bash export MODEL_NAME="llava-1.5-7b-hf" # also llava-1.5-13b-hf git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} ``` For VILA, we need a few more steps until it is added to HF model zoo + ```bash # clone original VILA repo export VILA_PATH="tmp/hf_models/VILA" @@ -209,9 +214,10 @@ OPT pipeline needs few minor changes from T5 pipeline --max_output_len 512 \ --max_multimodal_len 576 # 1 (max_batch_size) * 576 (num_visual_features) ``` + Note: do not use `--use_fused_mlp` flag in quantization mode. -3. Build TensorRT engines for visual components +3. Build TensorRT engines for visual components ```bash python build_visual_engine.py --model_path tmp/hf_models/${MODEL_NAME} --model_type llava # for LLaVA @@ -226,6 +232,7 @@ OPT pipeline needs few minor changes from T5 pipeline --llm_engine_dir trt_engines/${MODEL_NAME}/fp16/1-gpu \ --input_text "Question: which city is this? Answer:" # or "Please describe the traffic condition." for VILA ``` + Note: use `--run_profiling` for performance measurement, use `--check_accuracy` for accuracy check. 4. (Optional) INT8/INT4 weight-only quantization for LLaMA can be enabled as follows (take `INT4` as an example, while `INT8` is the default precision for weight-only quantization): @@ -255,6 +262,7 @@ OPT pipeline needs few minor changes from T5 pipeline quantized TRT engines for LLM component of LLaVA/VILA. For example, + ```bash python ../quantization/quantize.py \ --model_dir tmp/hf_models/${MODEL_NAME} \ @@ -325,8 +333,171 @@ OPT pipeline needs few minor changes from T5 pipeline --visual_engine_dir visual_engines/${MODEL_NAME} \ --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16/tp1 \ ``` + Note: Nougat models usually do not need a text prompt. +## CogVLM + +Currently, CogVLM only support bfloat16 precision and doesn't support `remove_input_padding` feature. + +1. 
Download Huggingface weights
+
+    ```bash
+    export MODEL_NAME="cogvlm-chat-hf"
+    git clone https://huggingface.co/THUDM/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
+    export TOKENIZER_NAME="vicuna-7b-v1.5"
+    git clone https://huggingface.co/lmsys/${TOKENIZER_NAME} tmp/hf_models/${TOKENIZER_NAME}
+    ```
+
+    Because ONNX currently doesn't support `xops.memory_efficient_attention`, we need to modify some source code of the Hugging Face CogVLM model.
+    ```
+    cd tmp/hf_models/${MODEL_NAME}
+    sed -i '4s/.*//;40s/.*/ out = self.attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2).contiguous()/;41s/.*//;42s/.*//' visual.py # It will replace memory_efficient_attention with some basic ops
+    ```
+
+2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/cogvlm`.
+
+    CogVLM uses a ViT as the visual encoder and a modified LLaMA as the LLM decoder.
+
+    ```bash
+    python ../cogvlm/convert_checkpoint.py --model_dir tmp/hf_models/${MODEL_NAME} --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16 --use_prompt_tuning
+
+    trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_bf16 \
+    --output_dir ./tmp/cogvlm/trt_engines/bf16/1-gpu \
+    --gemm_plugin bfloat16 \
+    --gpt_attention_plugin bfloat16 \
+    --context_fmha_fp32_acc enable \
+    --remove_input_padding disable \
+    --max_batch_size 48 \
+    --max_input_len 2048 \
+    --max_output_len 1024 \
+    --paged_kv_cache disable \
+    --use_custom_all_reduce disable \
+    --enable_xqa disable \
+    --bert_attention_plugin disable \
+    --moe_plugin disable \
+    --max_multimodal_len 61440 # 48 (max_batch_size) * 1280 (max_num_visual_features)
+    ```
+
+3. Generate TensorRT engines for visual components and combine everything into the final pipeline.
+
+    ```bash
+    python build_visual_engine.py --model_type cogvlm --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 48
+
+    python run.py \
+    --max_new_tokens 1000 \
+    --input_text " [INST] please describe this image in detail [/INST] " \
+    --hf_model_dir tmp/hf_models/${TOKENIZER_NAME} \
+    --visual_engine_dir visual_engines/${MODEL_NAME} \
+    --llm_engine_dir tmp/cogvlm/trt_engines/bf16/1-gpu \
+    --batch_size 1 \
+    --top_p 0.4 \
+    --top_k 1 \
+    --temperature 0.2 \
+    --repetition_penalty 1.2
+    ```
+
+## Fuyu
+
+1. Download Huggingface weights
+
+    ```bash
+    export MODEL_NAME="fuyu-8b"
+    git clone https://huggingface.co/adept/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
+    ```
+
+2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/gpt`.
+    The LLM portion of Fuyu uses a Persimmon model.
+    ```bash
+    python ../gpt/convert_checkpoint.py \
+        --model_dir tmp/hf_models/${MODEL_NAME} \
+        --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \
+        --dtype float16 \
+        --gpt_variant persimmon
+
+    trtllm-build \
+        --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \
+        --output_dir trt_engines/${MODEL_NAME}/fp16/1-gpu \
+        --gemm_plugin float16 \
+        --use_fused_mlp \
+        --max_batch_size 1 \
+        --max_input_len 2048 \
+        --max_output_len 512 \
+        --max_multimodal_len 2048
+    ```
+
+3. Generate TensorRT engines for visual components and combine everything into the final pipeline.
+
+    ```bash
+    python build_visual_engine.py --model_type fuyu --model_path tmp/hf_models/${MODEL_NAME}
+
+    python run.py \
+        --hf_model_dir tmp/hf_models/${MODEL_NAME} \
+        --visual_engine_dir visual_engines/${MODEL_NAME} \
+        --llm_engine_dir trt_engines/${MODEL_NAME}/fp16/1-gpu
+    ```
+
+## Deplot
+
+1.
Download Huggingface weights and convert original checkpoint to TRT-LLM checkpoint format + following example in `examples/enc_dec/README.md`. + + ```bash + export MODEL_NAME="deplot" + git clone https://huggingface.co/google/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + python ../enc_dec/convert_checkpoint.py --model_type pix2struct \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/float16 \ + --tp_size 1 \ + --pp_size 1 \ + --weight_data_type float32 \ + --dtype float16 + ``` + +2. Build TRT-LLM engine from TRT-LLM checkpoint + + ```bash + trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/float16/tp1/pp1/decoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/float16/tp1/decoder \ + --paged_kv_cache disable \ + --moe_plugin disable \ + --enable_xqa disable \ + --use_custom_all_reduce disable \ + --gemm_plugin float16 \ + --bert_attention_plugin float16 \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --context_fmha disable \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_output_len 510 \ + --max_encoder_input_len 2048 \ + --max_input_len 1 + ``` + + The built deplot engines are located in `./tmp/trt_engines/${MODEL_NAME}/1-gpu/float16/tp1`. + +3. Build TensorRT engines for visual components + + ```bash + python build_visual_engine.py --model_type pix2struct --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 8 + ``` + + The built engines are located in `./visual_engines/${MODEL_NAME}`. + + To run the deplot pipeline with batch size > 1, change `--max_batch_size` argument to `build_visual_engine.py` accordingly. + +4. Assemble everything into deplot pipeline + + ```bash + python run.py \ + --max_new_tokens 100 \ + --input_text "" \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir visual_engines/${MODEL_NAME} \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/float16/tp1 + ``` ## Enabling tensor parallelism for multi-GPU The LLM part of the pipeline can be run on multiple GPUs using tensor parallelism. 
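Before the `build_visual_engine.py` changes below, here is a minimal sketch of the embedding layout described in this README, i.e., how the visual features are flattened into the prompt-embedding table that `--max_multimodal_len` has to cover. The sizes are illustrative (576 features is the LLaVA figure quoted above); only the shapes follow the text.

```python
# Sketch only: shapes follow the README description above; the sizes are illustrative.
import torch

batch_size, num_visual_features, visual_hidden_dim = 1, 576, 4096  # e.g. LLaVA: 576 features
visual_features = torch.randn(batch_size, num_visual_features, visual_hidden_dim,
                              dtype=torch.float16)

# Flatten to [batch_size * num_visual_features, visual_hidden_dim] and pass it as the
# prompt embedding table prepended to the LLM input embeddings.
prompt_table = visual_features.view(-1, visual_hidden_dim)

# max_multimodal_len must be at least batch_size * num_visual_features (1 * 576 here).
max_multimodal_len = batch_size * num_visual_features
print(prompt_table.shape, max_multimodal_len)
```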
diff --git a/examples/multimodal/build_visual_engine.py b/examples/multimodal/build_visual_engine.py index c7c181b95..9ede35420 100644 --- a/examples/multimodal/build_visual_engine.py +++ b/examples/multimodal/build_visual_engine.py @@ -11,9 +11,13 @@ # isort: on from PIL import Image -from transformers import (AutoProcessor, Blip2ForConditionalGeneration, - Blip2Processor, LlavaForConditionalGeneration, - NougatProcessor, VisionEncoderDecoderModel) +from torchvision import transforms +from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor, + Blip2ForConditionalGeneration, Blip2Processor, + FuyuForCausalLM, FuyuProcessor, + LlavaForConditionalGeneration, NougatProcessor, + Pix2StructForConditionalGeneration, + VisionEncoderDecoderModel) def parse_arguments(): @@ -23,7 +27,8 @@ def parse_arguments(): default=None, choices=[ 'opt-2.7b', 'opt-6.7b', 'flan-t5-xl', 'flan-t5-xxl', - 'llava', 'vila', 'nougat' + 'llava', 'vila', 'nougat', 'cogvlm', 'fuyu', + 'pix2struct' ], help="Model type") parser.add_argument('--model_path', @@ -62,6 +67,8 @@ def build(self): args = self.args if 'opt' in args.model_type or 't5' in args.model_type: build_blip2_engine(args) + elif args.model_type == 'pix2struct': + build_pix2struct_engine(args) elif args.model_type == 'llava': build_llava_engine(args) elif args.model_type == 'vila': @@ -69,26 +76,37 @@ def build(self): build_vila_engine(args) elif args.model_type == 'nougat': build_nougat_engine(args) + elif args.model_type == 'cogvlm': + build_cogvlm_engine(args) + elif args.model_type == 'fuyu': + build_fuyu_engine(args) else: raise RuntimeError(f"Invalid model type {args.model_type}") -def export_visual_wrapper_onnx(visual_wrapper, image, output_dir): +def export_visual_wrapper_onnx(visual_wrapper, + input, + output_dir, + input_names=['input'], + dynamic_axes={'input': { + 0: 'batch' + }}): logger.log(trt.Logger.INFO, "Exporting onnx") os.makedirs(f'{output_dir}/onnx', exist_ok=True) torch.onnx.export(visual_wrapper, - image, + input, f'{output_dir}/onnx/visual_encoder.onnx', opset_version=17, - input_names=['input'], + input_names=input_names, output_names=['output'], - dynamic_axes={'input': { - 0: 'batch' - }}) + dynamic_axes=dynamic_axes) -def build_trt_engine(model_type, img_height, img_width, output_dir, - max_batch_size): +def build_trt_engine(model_type, + input_sizes, + output_dir, + max_batch_size, + dtype=torch.float16): part_name = 'visual_encoder' onnx_file = '%s/onnx/%s.onnx' % (output_dir, part_name) engine_file = '%s/%s.engine' % (output_dir, part_name) @@ -99,8 +117,9 @@ def build_trt_engine(model_type, img_height, img_width, output_dir, network = builder.create_network( 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) profile = builder.create_optimization_profile() - config_wrapper = Builder().create_builder_config(precision="float16", - model_type=model_type) + config_wrapper = Builder().create_builder_config( + precision="float16" if dtype == torch.float16 else "bfloat16", + model_type=model_type) config = config_wrapper.trt_builder_config parser = trt.OnnxParser(network, logger) @@ -120,13 +139,31 @@ def build_trt_engine(model_type, img_height, img_width, output_dir, nOptBS = max(nMinBS, int(max_batch_size / 2)) nMaxBS = max_batch_size - logger.log(trt.Logger.INFO, - f"Processed image dims {img_height}x{img_width}") - H, W = img_height, img_width inputT = network.get_input(0) - inputT.shape = [nBS, 3, H, W] - profile.set_shape(inputT.name, [nMinBS, 3, H, W], [nOptBS, 3, H, W], - [nMaxBS, 3, H, W]) + + # input 
sizes can be a list of ints (e.g., [3, H, W]) when inputs are images, + # or a list of three int lists (e.g., [[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]]). + assert isinstance(input_sizes, list), "input_sizes must be a list" + if isinstance(input_sizes[0], int): + logger.log(trt.Logger.INFO, f"Processed input sizes {input_sizes}") + inputT.shape = [nBS, *input_sizes] + min_size = opt_size = max_size = input_sizes + elif len(input_sizes) == 3 and isinstance(input_sizes[0], list): + min_size, opt_size, max_size = input_sizes + logger.log( + trt.Logger.INFO, + f"Processed min/opt/max input sizes {min_size}/{opt_size}/{max_size}" + ) + else: + raise ValueError(f"invalid input sizes: {input_sizes}") + + profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], + [nMaxBS, *max_size]) + if model_type == "pix2struct": + inputT = network.get_input(1) + P = input_sizes[0] # Number of patches + inputT.shape = [nBS, P] + profile.set_shape(inputT.name, [nMinBS, P], [nOptBS, P], [nMaxBS, P]) config.add_optimization_profile(profile) t0 = time() @@ -176,8 +213,61 @@ def forward(self, image): wrapper.to(args.device) export_visual_wrapper_onnx(wrapper, image, args.output_dir) - build_trt_engine(model_type, image.shape[2], image.shape[3], - args.output_dir, args.max_batch_size) + build_trt_engine( + model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + args.output_dir, + args.max_batch_size) + + +def build_pix2struct_engine(args): + processor = AutoProcessor.from_pretrained(args.model_path) + raw_image = Image.new('RGB', [10, 10]) # dummy image + dtype = torch.float16 + inputs = processor(text="dummy", images=raw_image, return_tensors="pt") + image = inputs['flattened_patches'].to(args.device, dtype) + attention_mask = inputs['attention_mask'].to(args.device, torch.int) + + class pix2structVisionWrapper(torch.nn.Module): + + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, image, attention_mask): + vision_x = self.encoder.embeddings(image) + img_features = self.encoder.encoder(vision_x, + attention_mask=attention_mask) + img_features = self.encoder.layernorm(img_features[0]) + return img_features + + model = Pix2StructForConditionalGeneration.from_pretrained( + args.model_path, torch_dtype=dtype) + + wrapper = pix2structVisionWrapper(model.encoder.to(args.device)) + # input shape: batch size, number of patches, hidden dimension + # attention mask shape: batch size, number of patches + # The number of image patches can vary depending on the image size, but it typically + # falls within a relatively narrow range. To improve performance, we can avoid using + # dynamic axis for the input patches and instead use a fixed number of patches along + # with an attention mask. 
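+    # For reference (assumed HF processor behavior): the Pix2Struct processor pads every image
+    # to a fixed number of flattened patches, so the exported input keeps a static patch
+    # dimension and `attention_mask` marks which patches are real rather than padding.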
+ export_visual_wrapper_onnx(wrapper, (image, attention_mask), + args.output_dir, + input_names=['input', 'attention_mask'], + dynamic_axes={ + 'input': { + 0: 'batch' + }, + 'attention_mask': { + 0: 'batch' + } + }) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2]], # Number of Patches, Hidden Dimension + args.output_dir, + args.max_batch_size, + torch.bfloat16) def build_llava_engine(args): @@ -208,8 +298,11 @@ def forward(self, image): model.config.vision_feature_layer) export_visual_wrapper_onnx(wrapper, image, args.output_dir) - build_trt_engine(args.model_type, image.shape[2], image.shape[3], - args.output_dir, args.max_batch_size) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + args.output_dir, + args.max_batch_size) def build_vila_engine(args): @@ -244,8 +337,11 @@ def forward(self, image): model.get_model().mm_projector.to(args.device)) export_visual_wrapper_onnx(wrapper, image, args.output_dir) - build_trt_engine(args.model_type, image.shape[2], image.shape[3], - args.output_dir, args.max_batch_size) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + args.output_dir, + args.max_batch_size) def build_nougat_engine(args): @@ -269,8 +365,90 @@ def forward(self, image): wrapper = SwinEncoderWrapper(swin_encoder) export_visual_wrapper_onnx(wrapper, image, args.output_dir) - build_trt_engine(args.model_type, image.shape[2], image.shape[3], - args.output_dir, args.max_batch_size) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + args.output_dir, + args.max_batch_size) + + +def build_cogvlm_engine(args): + raw_image = Image.new('RGB', [10, 10]) # dummy image + hf_config = AutoConfig.from_pretrained(args.model_path, + trust_remote_code=True) + image_size = hf_config.vision_config['image_size'] + dtype = hf_config.torch_dtype + transform = transforms.Compose([ + transforms.Resize((image_size, image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + image = transform(raw_image).unsqueeze(0).to(args.device, dtype) + + class CogVlmVisionWrapper(torch.nn.Module): + + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, image): + return self.encoder(image) + + cogvlm = AutoModelForCausalLM.from_pretrained(args.model_path, + torch_dtype=dtype, + trust_remote_code=True) + vit_encoder = cogvlm.model.vision.to(args.device).eval() + + wrapper = CogVlmVisionWrapper(vit_encoder) + export_visual_wrapper_onnx(wrapper, image, args.output_dir) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + args.output_dir, + args.max_batch_size, + dtype) + + +def build_fuyu_engine(args): + processor = FuyuProcessor.from_pretrained(args.model_path) + raw_image = Image.new('RGB', [10, 10]) + image = processor(text="dummy", images=raw_image, + return_tensors="pt")['image_patches'][0].to( + args.device, torch.float16).unsqueeze(0) + + class FuyuEncoderWrapper(torch.nn.Module): + + def __init__(self, linear): + super().__init__() + self.linear = linear.to(torch.float16) + + def forward(self, patches): + return self.linear(patches).flatten(0, 1) + + model = FuyuForCausalLM.from_pretrained(args.model_path, + torch_dtype=torch.float16) + + vision_encoder = model.vision_embed_tokens + wrapper = 
FuyuEncoderWrapper(vision_encoder).to(args.device) + + export_visual_wrapper_onnx(wrapper, + image, + args.output_dir, + dynamic_axes={'input': { + 0: 'batch', + 2: 'patch' + }}) + build_trt_engine( + args.model_type, + # [nImgs, nImgPatches, nDims] + # nImgs is always one since each query has exactly one image + # nImgPatches depends on image size (patch size: 30x30) + # nDims is 30x30x3=2700 (patch size x color channels) + [[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]], + args.output_dir, + args.max_batch_size) if __name__ == '__main__': diff --git a/examples/multimodal/run.py b/examples/multimodal/run.py index 93354d3f1..f8f8221a7 100644 --- a/examples/multimodal/run.py +++ b/examples/multimodal/run.py @@ -13,12 +13,14 @@ from huggingface_hub import hf_hub_download from PIL import Image +from torchvision import transforms from transformers import (AutoConfig, AutoProcessor, AutoTokenizer, Blip2Processor, NougatProcessor, NougatTokenizerFast) import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm import logger +from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.runtime import ModelRunner, Session, TensorInfo sys.path.append(str(Path(__file__).parent.parent)) @@ -51,6 +53,9 @@ def parse_arguments(): help="Use beam search if num_beams >1", default=1) parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) parser.add_argument('--run_profiling', action='store_true', help='Profile runtime over several iterations') @@ -68,6 +73,8 @@ def trt_dtype_to_torch(dtype): return torch.float32 elif dtype == trt.int32: return torch.int32 + elif dtype == trt.bfloat16: + return torch.bfloat16 else: raise TypeError("%s is not supported" % dtype) @@ -90,9 +97,13 @@ def __init__(self, args): "r") as f: config = json.load(f) self.model_type = config['builder_config']['model_type'] + self.vision_precision = config['builder_config']['precision'] + if self.model_type == 'pix2struct': + self.vision_precision = 'float16' self.decoder_llm = not ( - 't5' in self.model_type or 'nougat' in self.model_type - ) # BLIP2-T5 and Nougat are using encoder-decoder models as LLMs + 't5' in self.model_type + or self.model_type in ['nougat', 'pix2struct'] + ) # BLIP2-T5, pix2struct and Nougat are using encoder-decoder models as LLMs self.profiling_iterations = 20 @@ -132,44 +143,73 @@ def init_llm(self): self.model = TRTLLMEncDecModel.from_engine( os.path.basename(self.args.hf_model_dir), self.args.llm_engine_dir, - skip_encoder=(self.model_type == 'nougat'), + skip_encoder=self.model_type in ['nougat', 'pix2struct'], debug_mode=False, stream=self.stream) - - if self.model_type == 'nougat': + if self.model_type in ['nougat', 'pix2struct']: self.model_config = self.model.decoder_model_config self.runtime_mapping = self.model.decoder_runtime_mapping else: self.model_config = self.model.encoder_model_config self.runtime_mapping = self.model.encoder_runtime_mapping - def generate(self, pre_prompt, post_prompt, image, decoder_input_ids, - max_new_tokens, warmup): - if not warmup: - profiler.start("Generate") - profiler.start("Vision") - visual_features, visual_atts = self.get_visual_features(image) + def preprocess(self, warmup, pre_prompt, post_prompt, image, + attention_mask): + visual_features, visual_atts = self.get_visual_features( + torch.stack(image['image_patches'], dim=0) + if self.model_type == 
'fuyu' else image, attention_mask) if not warmup: profiler.stop("Vision") - pre_input_ids = self.tokenizer(pre_prompt, - return_tensors="pt", - padding=True).input_ids - if post_prompt[0] is not None: - post_input_ids = self.tokenizer(post_prompt, - return_tensors="pt", - padding=True).input_ids - length = pre_input_ids.shape[1] + post_input_ids.shape[ - 1] + visual_atts.shape[1] + if self.model_type == 'fuyu': + visual_features = visual_features.squeeze() + input_ids = image['input_ids'].to(torch.int32) + image_patches_indices = image['image_patches_indices'].to( + torch.int32) + + input_ids = input_ids.expand(self.args.batch_size, + *input_ids.shape[1:]) + image_patches_indices = image_patches_indices.expand( + self.args.batch_size, *image_patches_indices.shape[1:]) + + input_ids = self.ptuning_setup_fuyu(input_ids, + image_patches_indices) + input_ids = torch.stack(input_ids, dim=0).to('cpu') + length = input_ids.shape[1] else: - post_input_ids = None - length = pre_input_ids.shape[1] + visual_atts.shape[1] + pre_input_ids = self.tokenizer(pre_prompt, + return_tensors="pt", + padding=True).input_ids + if post_prompt[0] is not None: + post_input_ids = self.tokenizer(post_prompt, + return_tensors="pt", + padding=True).input_ids + length = pre_input_ids.shape[1] + post_input_ids.shape[ + 1] + visual_atts.shape[1] + else: + post_input_ids = None + length = pre_input_ids.shape[1] + visual_atts.shape[1] input_lengths = torch.IntTensor([length] * args.batch_size).to( torch.int32) + + if self.model_type == 'fuyu': + return input_ids, input_lengths, [visual_features], visual_features + input_ids, ptuning_args = self.setup_fake_prompts( visual_features, pre_input_ids, post_input_ids, input_lengths) + return input_ids, input_lengths, ptuning_args, visual_features + + def generate(self, pre_prompt, post_prompt, image, decoder_input_ids, + max_new_tokens, attention_mask, warmup): + if not warmup: + profiler.start("Generate") + profiler.start("Vision") + + input_ids, input_lengths, ptuning_args, visual_features = self.preprocess( + warmup, pre_prompt, post_prompt, image, attention_mask) + if warmup: return None profiler.start("LLM") @@ -182,21 +222,30 @@ def generate(self, pre_prompt, post_prompt, image, decoder_input_ids, add_special_tokens=False)[0] ptuning_args[0] = torch.stack([ptuning_args[0]]) - output_ids = self.model.generate(input_ids, - sampling_config=None, - prompt_table=ptuning_args[0], - max_new_tokens=max_new_tokens, - end_id=end_id, - pad_id=self.tokenizer.pad_token_id, - top_k=self.args.top_k, - num_beams=self.args.num_beams, - output_sequence_lengths=False, - return_dict=False) + output_ids = self.model.generate( + input_ids, + sampling_config=None, + prompt_table=ptuning_args[0], + max_new_tokens=max_new_tokens, + end_id=end_id, + pad_id=self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id else + self.tokenizer.all_special_ids[0], + top_k=self.args.top_k, + top_p=self.args.top_p, + temperature=self.args.temperature, + repetition_penalty=self.args.repetition_penalty, + num_beams=self.args.num_beams, + output_sequence_lengths=False, + return_dict=False) else: - if self.model_type == 'nougat': + if self.model_type in ['nougat', 'pix2struct']: # Trim encoder input_ids to match visual features shape ids_shape = (self.args.batch_size, visual_features.shape[1]) - input_ids = torch.zeros(ids_shape, dtype=torch.int32) + if self.model_type == 'nougat': + input_ids = torch.zeros(ids_shape, dtype=torch.int32) + elif self.model_type == 'pix2struct': + input_ids = torch.ones(ids_shape, 
dtype=torch.int32) output_ids = self.model.generate( input_ids, @@ -209,7 +258,8 @@ def generate(self, pre_prompt, post_prompt, image, decoder_input_ids, debug_mode=False, prompt_embedding_table=ptuning_args[0], prompt_tasks=ptuning_args[1], - prompt_vocab_size=ptuning_args[2]) + prompt_vocab_size=ptuning_args[2], + attention_mask=attention_mask) # Reset input_lengths to match decoder_input_ids input_lengths = torch.ones(input_lengths.shape, @@ -235,10 +285,24 @@ def generate(self, pre_prompt, post_prompt, image, decoder_input_ids, profiler.stop("Generate") return None - def get_visual_features(self, image): - visual_features = {'input': image.half()} + def get_visual_features(self, image, attention_mask): + visual_features = { + 'input': + image.to( + tensorrt_llm._utils.str_dtype_to_torch(self.vision_precision)) + } + if attention_mask is not None: + visual_features['attention_mask'] = attention_mask + tensor_info = [ + TensorInfo('input', str_dtype_to_trt(self.vision_precision), + image.shape) + ] + if attention_mask is not None: + tensor_info.append( + TensorInfo('attention_mask', trt.DataType.INT32, + attention_mask.shape)) visual_output_info = self.visual_encoder_session.infer_shapes( - [TensorInfo('input', trt.DataType.HALF, image.shape)]) + tensor_info) visual_outputs = { t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), @@ -266,11 +330,16 @@ def setup_fake_prompts(self, visual_features, pre_input_ids, post_input_ids, fake_prompt_id = fake_prompt_id.reshape(visual_features.shape[0], visual_features.shape[1]) - if post_input_ids is not None: - input_ids = [pre_input_ids, fake_prompt_id, post_input_ids] + if 'cogvlm' in self.model_type: + input_ids = torch.cat( + [pre_input_ids[:, 0:1], fake_prompt_id, pre_input_ids[:, 1:]], + dim=1).contiguous().to(torch.int32) else: - input_ids = [fake_prompt_id, pre_input_ids] - input_ids = torch.cat(input_ids, dim=1).contiguous().to(torch.int32) + if post_input_ids is not None: + input_ids = [pre_input_ids, fake_prompt_id, post_input_ids] + else: + input_ids = [fake_prompt_id, pre_input_ids] + input_ids = torch.cat(input_ids, dim=1).contiguous().to(torch.int32) if self.decoder_llm or self.runtime_mapping.is_first_pp_rank(): ptuning_args = self.ptuning_setup(visual_features, input_ids, @@ -280,6 +349,24 @@ def setup_fake_prompts(self, visual_features, pre_input_ids, post_input_ids, return input_ids, ptuning_args + def ptuning_setup_fuyu(self, input_ids, image_patches_indices): + res_input_ids = [] + for cur_input_ids, cur_image_patches_indices in zip( + input_ids, image_patches_indices): + # Truncate input_ids to the length of image_patches_indices + cur_image_patches_indices = cur_image_patches_indices[:len( + cur_input_ids)] + # Get ids of the image_patches + non_zero_mask = cur_image_patches_indices != -1 + # Replace input_ids with image_patches_indices values (where the patches are placed) + cur_input_ids = cur_input_ids.masked_scatter( + non_zero_mask, + cur_image_patches_indices[non_zero_mask] + + self.model_config.vocab_size, + ) + res_input_ids.append(cur_input_ids) + return res_input_ids + def ptuning_setup(self, prompt_table, input_ids, input_lengths): hidden_size = self.model_config.hidden_size * self.runtime_mapping.tp_size if prompt_table is not None: @@ -321,6 +408,15 @@ def load_test_image(self): filename="nougat_paper.png", repo_type="dataset") image = Image.open(filepath) + elif "fuyu" in self.model_type: + filepath = hf_hub_download(repo_id="adept/fuyu-8b", + filename="skateboard.png", + repo_type='model') + 
image = Image.open(filepath) + elif "pix2struct" in self.model_type: + img_url = 'https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/multi_col_40963.png' + image = Image.open(requests.get(img_url, + stream=True).raw).convert('RGB') else: img_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' image = Image.open(requests.get(img_url, @@ -329,6 +425,7 @@ def load_test_image(self): return image def setup_inputs(self, input_text, raw_image): + attention_mask = None if 'blip2' in self.model_type: processor = Blip2Processor.from_pretrained(self.model_type) image = processor(raw_image, input_text, @@ -349,7 +446,41 @@ def setup_inputs(self, input_text, raw_image): pre_prompt = input_text post_prompt = None - elif 'llava' in self.model_type or 'vila' in self.model_type: + elif 'cogvlm' in self.model_type: + image_size = 490 + dtype = torch.bfloat16 + transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + image = transform(raw_image).to(dtype).unsqueeze(0) + + if input_text is None: + input_text = " [INST] which city is this? [/INST] " + pre_prompt = input_text + post_prompt = None + elif self.model_type == "pix2struct": + image_processor = AutoProcessor.from_pretrained(args.hf_model_dir) + if input_text is None: + input_text = "" + inputs = image_processor( + images=raw_image, + text=input_text, + return_tensors="pt", + ) + image = inputs['flattened_patches'] + image = image.expand(self.args.batch_size, -1, -1).contiguous() + attention_mask = inputs['attention_mask'].to(self.device).to( + torch.int) + attention_mask = attention_mask.expand(args.batch_size, + -1).contiguous() + pre_prompt = "" + post_prompt = None + elif 'llava' in self.model_type or 'vila' in self.model_type or 'fuyu' in self.model_type: # LLaVA and VILA if self.model_type == "llava": pre_prompt = "USER:\n" @@ -359,7 +490,14 @@ def setup_inputs(self, input_text, raw_image): pre_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: " if input_text is None: input_text = "Please describe the traffic condition." 
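The `expand(...).contiguous()` calls in the pix2struct branch above replicate a single processed sample across the batch dimension. A minimal, self-contained illustration of that pattern (tensor sizes are made up): `expand` broadcasts the size-1 batch dimension without copying, `-1` keeps the remaining dimensions unchanged, and `contiguous()` materializes the broadcast view so downstream code can reshape or transfer it safely.

```python
import torch

batch_size = 4
flattened_patches = torch.randn(1, 2048, 768)        # [1, num_patches, hidden]
attention_mask = torch.ones(1, 2048, dtype=torch.int32)

# Broadcast the singleton batch dim, then materialize the view.
patches_b = flattened_patches.expand(batch_size, -1, -1).contiguous()
mask_b = attention_mask.expand(batch_size, -1).contiguous()

assert patches_b.shape == (batch_size, 2048, 768)
assert mask_b.shape == (batch_size, 2048)
```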
- post_prompt = input_text + " ASSISTANT:" + elif self.model_type == 'fuyu': + pre_prompt = "Describe this image:" + if input_text is None: + input_text = "Answer the following VQAv2 question based on the image: How many people are in the image?\n" + if self.model_type != 'fuyu': + post_prompt = input_text + " ASSISTANT:" + else: + post_prompt = None if self.model_type == "vila": sys.path.append(self.args.hf_model_dir + "/../VILA") @@ -373,14 +511,20 @@ def setup_inputs(self, input_text, raw_image): else: processor = AutoProcessor.from_pretrained( self.args.hf_model_dir) - image = processor(text=input_text, - images=raw_image, - return_tensors="pt")['pixel_values'] + if self.model_type == 'fuyu': + image = processor(text=input_text, + images=raw_image, + return_tensors='pt') + else: + image = processor(text=input_text, + images=raw_image, + return_tensors="pt")['pixel_values'] # Repeat inputs to match batch size pre_prompt = [pre_prompt] * self.args.batch_size post_prompt = [post_prompt] * self.args.batch_size - image = image.expand(self.args.batch_size, -1, -1, -1).contiguous() + if self.model_type not in ['fuyu', 'pix2struct']: + image = image.expand(args.batch_size, -1, -1, -1).contiguous() image = image.to(self.device) # Generate decoder_input_ids for enc-dec models @@ -397,17 +541,18 @@ def setup_inputs(self, input_text, raw_image): decoder_input_ids = torch.IntTensor([[decoder_start_id]]) decoder_input_ids = decoder_input_ids.repeat((args.batch_size, 1)) - return input_text, pre_prompt, post_prompt, image, decoder_input_ids + return input_text, pre_prompt, post_prompt, image, decoder_input_ids, attention_mask def run(self, input_text, input_image, max_new_tokens): - input_text, pre_prompt, post_prompt, processed_image, decoder_input_ids = model.setup_inputs( - input_text, raw_image) + input_text, pre_prompt, post_prompt, processed_image, decoder_input_ids, attention_mask = model.setup_inputs( + input_text, input_image) model.generate(pre_prompt, post_prompt, processed_image, decoder_input_ids, max_new_tokens, + attention_mask=attention_mask, warmup=True) num_iters = self.profiling_iterations if self.args.run_profiling else 1 for _ in range(num_iters): @@ -416,6 +561,7 @@ def run(self, input_text, input_image, max_new_tokens): processed_image, decoder_input_ids, max_new_tokens, + attention_mask=attention_mask, warmup=False) if self.runtime_rank == 0: self.print_result(input_text, output_text) @@ -441,6 +587,11 @@ def print_result(self, input_text, output_text): if self.model_type == "vila": assert output_text[0][0].lower( ) == 'the traffic condition in the image is quite busy, with multiple cars and bicycles sharing the road. 
there are also pedestrians walking on' + elif self.model_type == 'fuyu': + assert output_text[0][0].lower() == '4' + elif self.model_type == "pix2struct": + assert "characteristic | cat food, day | cat food, wet | cat treats" in output_text[ + 0][0].lower() else: assert output_text[0][0].lower() == 'singapore' diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index 3c2b69615..c635af18f 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/README.md b/examples/phi/README.md index 2c929cfd5..59a672ea0 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -1,6 +1,6 @@ # Phi -This document explains how to build the [Phi](https://huggingface.co/microsoft/phi-2) model using TensorRT-LLM and run on a single GPU. +This document explains how to build the [phi-2](https://huggingface.co/microsoft/phi-2), [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) models using TensorRT-LLM and run on a single GPU. - [Phi](#phi) - [Overview](#overview) @@ -13,7 +13,7 @@ This document explains how to build the [Phi](https://huggingface.co/microsoft/p ## Overview -The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There is one file: +The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There is one file: * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format @@ -26,6 +26,16 @@ In addition, there are two shared files in the parent folder [`examples`](../) f * FP16 * BF16 * Tensor Parallel + ## Support Matrix + +| Model Name | FP16 | BF16 | TP | +| :--------------: | :---: | :---: | :---: | +| phi-2 | Y | Y | Y | +| Phi-3-mini-4k-instruct | Y | Y | | +| Phi-3-mini-128k-instruct | Y | Y | | + +* Model Name: the name of the model, the same as the name on HuggingFace +* TP: Tensor Parallel ## Usage @@ -38,7 +48,11 @@ pip install -r requirements.txt ``` ```bash -python ./convert_checkpoint.py --model_dir "microsoft/phi-2" --output_dir ./phi-2-checkpoint --dtype float16 +export MODEL_TYPE="phi-2" # or Phi-3-mini-4k-instruct, Phi-3-mini-128k-instruct +python ./convert_checkpoint.py --model_type ${MODEL_TYPE} \ + --model_dir "microsoft/${MODEL_TYPE}" \ + --output_dir ./phi-checkpoint \ + --dtype float16 ``` ### 2. Build TensorRT engine(s) @@ -52,8 +66,8 @@ Examples of build invocations: # Enable several TensorRT-LLM plugins to increase runtime performance. It also helps with build time. 
# --tp_size and --pp_size are the model shard size trtllm-build \ - --checkpoint_dir ./phi-2-checkpoint \ - --output_dir ./phi-2-engine \ + --checkpoint_dir ./phi-checkpoint \ + --output_dir ./phi-engine \ --gemm_plugin float16 \ --max_batch_size 8 \ --max_input_len 1024 \ @@ -85,8 +99,8 @@ The summarization can be done using the [`../summarize.py`](../summarize.py) scr ```bash # Run the summarization task using a TensorRT-LLM model and a single GPU. -python3 ../summarize.py --engine_dir ./phi-2-engine \ - --hf_model_dir "microsoft/phi-2" \ +python3 ../summarize.py --engine_dir ./phi-engine \ + --hf_model_dir "microsoft/$(MODEL_TYPE)" \ --batch_size 1 \ --test_trt_llm \ --test_hf \ @@ -96,8 +110,8 @@ python3 ../summarize.py --engine_dir ./phi-2-engine \ # Run the summarization task using a TensorRT-LLM model and 2-way tensor parallelism. mpirun -n 2 --allow-run-as-root \ -python3 ../summarize.py --engine_dir ./phi-2-engine-tp2 \ - --hf_model_dir "microsoft/phi-2" \ +python3 ../summarize.py --engine_dir ./phi-engine-tp2 \ + --hf_model_dir "microsoft/$(MODEL_TYPE)" \ --batch_size 1 \ --test_hf \ --test_trt_llm \ diff --git a/examples/phi/convert_checkpoint.py b/examples/phi/convert_checkpoint.py index d7f2bdb76..be66fb6e1 100644 --- a/examples/phi/convert_checkpoint.py +++ b/examples/phi/convert_checkpoint.py @@ -17,7 +17,7 @@ import time import tensorrt_llm -from tensorrt_llm.models.phi.model import PhiForCausalLM +from tensorrt_llm.models import Phi3ForCausalLM, PhiForCausalLM def parse_arguments(): @@ -31,6 +31,12 @@ def parse_arguments(): type=str, default='tllm_checkpoint', help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--model_type', + type=str, + default='phi-2', + choices=['phi-2', 'Phi-3-mini-4k-instruct', 'Phi-3-mini-128k-instruct'], + help='Model to be converted.') args = parser.parse_args() return args @@ -44,9 +50,10 @@ def parse_arguments(): if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - PhiForCausalLM.convert_hf_checkpoint(args.model_dir, - dtype=args.dtype, - output_dir=args.output_dir) + modelForCausalLM = PhiForCausalLM if args.model_type == "phi-2" else Phi3ForCausalLM + modelForCausalLM.convert_hf_checkpoint(args.model_dir, + dtype=args.dtype, + output_dir=args.output_dir) tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index d2e170646..b926b6bfa 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/README.md b/examples/quantization/README.md index 51998f571..fe88afa49 100644 --- a/examples/quantization/README.md +++ b/examples/quantization/README.md @@ -17,11 +17,11 @@ The detailed LLM quantization recipe is distributed to the README.md of the corr docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -it bash ``` -2. Install the quantization toolkit `ammo` and the related dependencies on top of the TensorRT-LLM installation or docker file. +2. Install the quantization toolkit `modelopt` and the related dependencies on top of the TensorRT-LLM installation or docker file. 
```bash -# Install AMMO -pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo==0.9.3 +# Install Modelopt +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-modelopt==0.9.3 # Install the additional requirements cd pip install -r requirements.txt @@ -44,7 +44,7 @@ PTQ can be achieved with simple calibration on a small set of training or evalua import torch from torch.utils.data import DataLoader from transformers import AutoModelForCausalLM -import ammo.torch.quantization as atq +import modelopt.torch.quantization as atq model = AutoModelForCausalLM.from_pretrained(...) @@ -74,7 +74,7 @@ After the model is quantized, it can be exported to a TensorRT-LLM checkpoint, w The export API is ```python -from ammo.torch.export import export_tensorrt_llm_checkpoint +from modelopt.torch.export import export_tensorrt_llm_checkpoint with torch.inference_mode(): export_tensorrt_llm_checkpoint( diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index 7a8ef2f54..a7ad0591e 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git a/examples/qwen/README.md b/examples/qwen/README.md index 49e836a99..ee8bb6957 100644 --- a/examples/qwen/README.md +++ b/examples/qwen/README.md @@ -265,7 +265,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_gptq \ To run the AWQ Qwen example, the following steps are required: 1. Weight quantization - NVIDIA AMMO toolkit is used for AWQ weight quantization. Please see [examples/quantization/README.md](/examples/quantization/README.md#preparation) for AMMO installation instructions. + NVIDIA Modelopt toolkit is used for AWQ weight quantization. Please see [examples/quantization/README.md](/examples/quantization/README.md#preparation) for Modelopt installation instructions. 
```bash # Quantize Qwen-7B-Chat checkpoint into INT4 AWQ format diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index e7a3ce081..4cb22ede7 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 41987be9a..27299941b 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index c598c3720..eff840a1e 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index bad7e4fc9..822b153d4 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index 55fc46c1c..74ed83f02 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index ecc960449..3178b8104 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.10.0.dev2024043000 +tensorrt_llm==0.10.0.dev2024050700 tiktoken datasets kaldialign diff --git a/requirements-windows.txt b/requirements-windows.txt index 190c131fc..25909cf73 100644 --- a/requirements-windows.txt +++ b/requirements-windows.txt @@ -2,7 +2,7 @@ accelerate==0.25.0 build colored -cuda-python==12.3.0 +cuda-python==12.4.0 diffusers==0.27.0 numpy<2 onnx>=1.12.0 @@ -15,10 +15,7 @@ h5py==3.10.0 pywin32 StrEnum sentencepiece>=0.1.99 -# WAR the new posting of "nvidia-cudnn-cu12~=9.0". -# "tensorrt==9.3.0.post12.dev1" specifies "nvidia-cudnn-cu12" but actually requires "nvidia-cudnn-cu12~=8.9". -nvidia-cudnn-cu12~=8.9; platform_machine == "AMD64" -tensorrt==9.3.0.post12.dev1 +tensorrt==10.0.1 tokenizers>=0.14 # Default torch is CPU-only on Windows, so need to specify a torch version with GPU support torch==2.2.0+cu121 diff --git a/requirements.txt b/requirements.txt index 3e7702be4..c20f6bdf7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,10 @@ --extra-index-url https://pypi.nvidia.com -accelerate==0.25.0 +--extra-index-url https://gitlab-master.nvidia.com/api/v4/projects/95421/packages/pypi/simple +accelerate>=0.25.0 build colored cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. 
-diffusers==0.27.0 +diffusers>=0.27.0 lark mpi4py numpy<2 @@ -16,17 +17,14 @@ pandas h5py==3.10.0 StrEnum sentencepiece>=0.1.99 -# WAR the new posting of "nvidia-cudnn-cu12~=9.0". -# "tensorrt==9.3.0.post12.dev1" specifies "nvidia-cudnn-cu12" but actually requires "nvidia-cudnn-cu12~=8.9". -nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt==9.3.0.post12.dev1 -# https://github.com/pytorch/pytorch/blob/v2.2.1/version.txt still uses 2.2.0a0. -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html#rel-24-02 uses 2.3.0a0. +tensorrt==10.0.1 +# https://github.com/pytorch/pytorch/blob/v2.2.2/version.txt still uses 2.2.0a0. +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-03.html#rel-24-03 uses 2.3.0a0. torch>=2.2.0a,<=2.3.0a -nvidia-ammo==0.9.3 -transformers==4.38.2 +nvidia-modelopt~=0.11 +transformers>=4.38.2 wheel optimum evaluate janus -mpmath==1.3.0 +mpmath>=1.3.0 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 39be95c40..bea3cf2d9 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -186,6 +186,10 @@ def main(build_type: str = "Release", copy( build_dir / f"tensorrt_llm/plugins/nvinfer_plugin_tensorrt_llm.dll", lib_dir / "nvinfer_plugin_tensorrt_llm.dll") + copy( + build_dir / + "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/tensorrt_llm_nvrtc_wrapper.dll", + lib_dir / "tensorrt_llm_nvrtc_wrapper.dll") else: copy(build_dir / "tensorrt_llm/libtensorrt_llm.so", lib_dir / "libtensorrt_llm.so") @@ -195,6 +199,10 @@ def main(build_type: str = "Release", build_dir / "tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so", lib_dir / "libnvinfer_plugin_tensorrt_llm.so") + copy( + build_dir / + "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so", + lib_dir / "libtensorrt_llm_nvrtc_wrapper.so") if not cpp_only: diff --git a/setup.py b/setup.py index f553f1fc0..4cd1b9cf7 100644 --- a/setup.py +++ b/setup.py @@ -94,11 +94,13 @@ def has_ext_modules(self): package_data={ 'tensorrt_llm': ([ 'libs/th_common.dll', 'libs/tensorrt_llm.dll', - 'libs/nvinfer_plugin_tensorrt_llm.dll', 'bindings.*.pyd' + 'libs/nvinfer_plugin_tensorrt_llm.dll', + 'libs/tensorrt_llm_nvrtc_wrapper.dll', 'bindings.*.pyd' ] if on_windows else [ 'libs/libtensorrt_llm.so', 'libs/libth_common.so', 'libs/libnvinfer_plugin_tensorrt_llm.so', + 'libs/libtensorrt_llm_nvrtc_wrapper.so', 'bindings.*.so', ]) + ['bindings/*.pyi', 'tools/plugin_gen/templates/*'], }, diff --git a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py index 45d3d17ad..873f7831a 100644 --- a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py +++ b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py @@ -35,8 +35,8 @@ class IdxEntry(Enum): ENCODER_INPUT_LENGTH = auto() HOST_CONTEXT_LENGTH = auto() QKV_BIAS_TENSOR = auto() - MEDUSA_PACKED_MASK = auto() - MEDUSA_POSITION_OFFSETS = auto() + SPEC_DECODING_PACKED_MASK = auto() + SPEC_DECODING_POSITION_OFFSETS = auto() class IdxEntryParser: @@ -57,8 +57,8 @@ def __init__(self, plugin_info): plugin_info.pfc_as_list['kv_cache_quant_mode'][0]) self.position_embedding_type = PositionEmbeddingType( plugin_info.pfc_as_list['position_embedding_type'][0]) - self.is_medusa_enabled = bool( - plugin_info.pfc_as_list['is_medusa_enabled'][0]) + self.is_spec_decoding_enabled 
= bool( + plugin_info.pfc_as_list['is_spec_decoding_enabled'][0]) self.init_entry_to_index() # WARNING: Must in sync with GPTAttentionPlugin::isEntryUsed in cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp @@ -113,10 +113,10 @@ def is_entry_used(self, entry: IdxEntry) -> bool: return self.remove_input_padding elif entry == IdxEntry.QKV_BIAS_TENSOR: return self.qkv_bias_enabled - elif entry == IdxEntry.MEDUSA_PACKED_MASK: - return self.is_medusa_enabled - elif entry == IdxEntry.MEDUSA_POSITION_OFFSETS: - return self.is_medusa_enabled + elif entry == IdxEntry.SPEC_DECODING_PACKED_MASK: + return self.is_spec_decoding_enabled + elif entry == IdxEntry.SPEC_DECODING_POSITION_OFFSETS: + return self.is_spec_decoding_enabled else: return False diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index e32834fcb..20df6c621 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -33,7 +33,7 @@ from .logger import logger from .lora_manager import LoraBuildConfig from .models import PretrainedConfig, PretrainedModel -from .models.modeling_utils import optimize_model +from .models.modeling_utils import SpeculativeDecodingMode, optimize_model from .network import Network, net_guard from .plugin import PluginConfig from .quantization import QuantAlgo, QuantMode @@ -414,6 +414,7 @@ def save_config(builder_config: BuilderConfig, config_path: str): class BuildConfig: max_input_len: int = 256 max_output_len: int = 256 + opt_batch_size: int = 8 max_batch_size: int = 8 max_beam_width: int = 1 max_num_tokens: Optional[int] = None @@ -426,6 +427,7 @@ class BuildConfig: profiling_verbosity: str = 'layer_names_only' enable_debug_output: bool = False max_draft_len: int = 0 + speculative_decoding_mode: SpeculativeDecodingMode = SpeculativeDecodingMode.NONE use_refit: bool = False input_timing_cache: str = None output_timing_cache: str = None @@ -446,6 +448,7 @@ def from_dict(cls, config, plugin_config=None): max_beam_width = config.pop('max_beam_width') max_num_tokens = config.pop('max_num_tokens') opt_num_tokens = config.pop('opt_num_tokens') + opt_batch_size = config.pop('opt_batch_size', None) max_prompt_embedding_table_size = config.pop( 'max_prompt_embedding_table_size', 0) gather_context_logits = config.pop('gather_context_logits', False) @@ -457,6 +460,8 @@ def from_dict(cls, config, plugin_config=None): 'layer_names_only') enable_debug_output = config.pop('enable_debug_output', False) max_draft_len = config.pop('max_draft_len', 0) + speculative_decoding_mode = config.pop('speculative_decoding_mode', + SpeculativeDecodingMode.NONE) use_refit = config.pop('use_refit', False) input_timing_cache = config.pop('input_timing_cache', None) output_timing_cache = config.pop('output_timing_cache', None) @@ -480,6 +485,7 @@ def from_dict(cls, config, plugin_config=None): max_beam_width=max_beam_width, max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, + opt_batch_size=opt_batch_size, max_prompt_embedding_table_size=max_prompt_embedding_table_size, gather_context_logits=gather_context_logits, gather_generation_logits=gather_generation_logits, @@ -488,6 +494,7 @@ def from_dict(cls, config, plugin_config=None): profiling_verbosity=profiling_verbosity, enable_debug_output=enable_debug_output, max_draft_len=max_draft_len, + speculative_decoding_mode=speculative_decoding_mode, use_refit=use_refit, input_timing_cache=input_timing_cache, output_timing_cache=output_timing_cache, @@ -515,6 +522,16 @@ def to_dict(self): ) return output + def update_from_dict(self, config: dict): 
+ for name, value in config.items(): + if not hasattr(self, name): + raise AttributeError( + f"{self.__class__} object has no attribute {name}") + setattr(self, name, value) + + def update(self, **kwargs): + self.update_from_dict(kwargs) + class EngineConfig: @@ -625,6 +642,11 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: if hasattr(model.config, 'max_medusa_token_len'): build_config.max_draft_len = model.config.max_medusa_token_len + if build_config.speculative_decoding_mode != SpeculativeDecodingMode.MEDUSA: + logger.warn( + 'speculative_decoding_mode is not Medusa for Medusa model. Overwriting speculative_decoding_mode' + ) + build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA use_auto_parallel = build_config.auto_parallel_config.enabled @@ -704,20 +726,33 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: # Forward prepare_input_args = { - "max_batch_size": build_config.max_batch_size, - "max_input_len": build_config.max_input_len, + "max_batch_size": + build_config.max_batch_size, + "max_input_len": + build_config.max_input_len, "max_seq_len": build_config.max_input_len + build_config.max_output_len, - "use_cache": True, - "max_beam_width": build_config.max_beam_width, - "max_num_tokens": build_config.max_num_tokens, - "opt_num_tokens": build_config.opt_num_tokens, + "use_cache": + True, + "max_beam_width": + build_config.max_beam_width, + "max_num_tokens": + build_config.max_num_tokens, + "opt_num_tokens": + build_config.opt_num_tokens, "prompt_embedding_table_size": build_config.max_prompt_embedding_table_size, - "max_draft_len": build_config.max_draft_len, - "gather_context_logits": build_config.gather_context_logits, - "gather_generation_logits": build_config.gather_generation_logits, - "lora_target_modules": build_config.lora_config.lora_target_modules + "max_draft_len": + build_config.max_draft_len, + "speculative_decoding_draft_tokens_external": + build_config.speculative_decoding_mode == + SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL, + "gather_context_logits": + build_config.gather_context_logits, + "gather_generation_logits": + build_config.gather_generation_logits, + "lora_target_modules": + build_config.lora_config.lora_target_modules } if model.config.architecture == "DecoderModel": diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index 023c731f3..7c8776294 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -32,7 +32,7 @@ from ..lora_manager import LoraBuildConfig from ..models import PretrainedConfig from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, QuantConfig, - load_model) + SpeculativeDecodingMode, load_model) from ..plugin import PluginConfig, add_plugin_argument from ..quantization import QuantAlgo @@ -105,7 +105,7 @@ def parse_arguments(): help= 'Enable horizontal fusion in GatedMLP, reduces layer input traffic and potentially improves performance. ' 'For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded. ' - '(An example for reference only: 0.45734 vs 0.45755 for LLaMA-v2 7B using `ammo/examples/hf/instruct_eval/mmlu.py`).' + '(An example for reference only: 0.45734 vs 0.45755 for LLaMA-v2 7B using `modelopt/examples/hf/instruct_eval/mmlu.py`).' ) parser.add_argument( '--gather_all_token_logits', @@ -224,6 +224,13 @@ def parse_arguments(): help= 'Run through the build process except the actual Engine build for debugging. 
' ) + parser.add_argument('--speculative_decoding_mode', + default=None, + choices=[ + "draft_tokens_external", + "medusa", + ], + help='Mode of speculative decoding.') plugin_config_parser = parser.add_argument_group("plugin_config") add_plugin_argument(plugin_config_parser) @@ -233,6 +240,11 @@ def parse_arguments(): args.gather_context_logits = True args.gather_generation_logits = True + if args.gather_context_logits and args.max_draft_len > 0: + raise RuntimeError( + "Gather context logits is not support with draft len > 0. " + "If want to get the accepted tokens' logits from target model, please just enable gather_generation_logits" + ) return args @@ -407,6 +419,7 @@ def main(): 'max_lora_rank': args.max_lora_rank, 'lora_target_modules': args.lora_target_modules, } + speculative_decoding_mode = SpeculativeDecodingMode.from_arguments(args) if args.build_config is None: if args.multiple_profiles == "enable" and args.opt_num_tokens is not None: raise RuntimeError( @@ -443,6 +456,7 @@ def main(): 'profiling_verbosity': args.profiling_verbosity, 'enable_debug_output': args.enable_debug_output, 'max_draft_len': args.max_draft_len, + 'speculative_decoding_mode': speculative_decoding_mode, 'input_timing_cache': args.input_timing_cache, 'output_timing_cache': args.output_timing_cache, 'auto_parallel_config': { diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 943762410..8a4bf4ea4 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -630,13 +630,14 @@ class PositionEmbeddingType(IntEnum): learned_absolute = 0 rope_gptj = 1 rope_gpt_neox = 2 - alibi = 3 - alibi_with_scale = 4 - relative = 5 - chatglm = 6 + long_rope = 3 + alibi = 4 + alibi_with_scale = 5 + relative = 6 + chatglm = 7 def is_rope(self) -> bool: - return self in [self.rope_gptj, self.rope_gpt_neox] + return self in [self.rope_gptj, self.rope_gpt_neox, self.long_rope] def is_alibi(self) -> bool: return self in [self.alibi, self.alibi_with_scale] @@ -2133,38 +2134,66 @@ def cumsum(input: Tensor, dim: int) -> Tensor: dim = dim_resolve_negative(dim, input.ndim())[0] - slice_shape = [] - for i in range(input.ndim()): - if i != dim: - slice_shape.append(shape(input, i)) + if (dim == input.ndim() - 1) and input.size(-1) > 0: + old_shape = shape(input) + if input.ndim() != 2: + input_2d = input.view([-1, input.size(-1)]) + else: + input_2d = input + cumsum_last_dim_plg_creator = trt.get_plugin_registry( + ).get_plugin_creator('CumsumLastDim', '1', TRT_LLM_PLUGIN_NAMESPACE) + assert cumsum_last_dim_plg_creator is not None + input_length = trt.PluginField( + "input_length", np.array(input_2d.size(-1), dtype=np.int32), + trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", + np.array([int(input_2d.dtype)], np.int32), + trt.PluginFieldType.INT32) + pfc = trt.PluginFieldCollection([input_length, pf_type]) + cumsum_last_dim_plug = cumsum_last_dim_plg_creator.create_plugin( + "cumsum_last_dim", pfc) + plug_inputs = [input_2d] + plug_inputs = [i.trt_tensor for i in plug_inputs] + layer = default_trtnet().add_plugin_v2(plug_inputs, + cumsum_last_dim_plug) + _add_plugin_info(layer, cumsum_last_dim_plg_creator, "cumsum_last_dim", + pfc) + output = _create_tensor(layer.get_output(0), layer) + output = output.view(old_shape) + return output + else: + slice_shape = [] + for i in range(input.ndim()): + if i != dim: + slice_shape.append(shape(input, i)) - zero_tensor = constant_to_tensor_(0, input.dtype, False) - if len(slice_shape) > 0: - zero_tensor = expand_dims(zero_tensor, - [i for i in 
range(len(slice_shape))]) - slice_shape = concat(slice_shape) - zero_tensor = expand(zero_tensor, slice_shape) + zero_tensor = constant_to_tensor_(0, input.dtype, False) + if len(slice_shape) > 0: + zero_tensor = expand_dims(zero_tensor, + [i for i in range(len(slice_shape))]) + slice_shape = concat(slice_shape) + zero_tensor = expand(zero_tensor, slice_shape) - loop_layer = default_trtnet().add_loop() - trip_limit = shape(input, dim).trt_tensor - loop_layer.add_trip_limit(trip_limit, trt.TripLimit.COUNT) + loop_layer = default_trtnet().add_loop() + trip_limit = shape(input, dim).trt_tensor + loop_layer.add_trip_limit(trip_limit, trt.TripLimit.COUNT) - iterator_layer = loop_layer.add_iterator(input.trt_tensor, dim) - cur_slice = iterator_layer.get_output(0) + iterator_layer = loop_layer.add_iterator(input.trt_tensor, dim) + cur_slice = iterator_layer.get_output(0) - running_sum_layer = loop_layer.add_recurrence(zero_tensor.trt_tensor) - running_sum = running_sum_layer.get_output(0) + running_sum_layer = loop_layer.add_recurrence(zero_tensor.trt_tensor) + running_sum = running_sum_layer.get_output(0) - cur_sum_layer = default_trtnet().add_elementwise( - cur_slice, running_sum, trt.ElementWiseOperation.SUM) - cur_sum = cur_sum_layer.get_output(0) - running_sum_layer.set_input(1, cur_sum) + cur_sum_layer = default_trtnet().add_elementwise( + cur_slice, running_sum, trt.ElementWiseOperation.SUM) + cur_sum = cur_sum_layer.get_output(0) + running_sum_layer.set_input(1, cur_sum) - loop_output_layer = loop_layer.add_loop_output(cur_sum, - trt.LoopOutput.CONCATENATE, - dim) - loop_output_layer.set_input(1, trip_limit) - return _create_tensor(loop_output_layer.get_output(0), loop_output_layer) + loop_output_layer = loop_layer.add_loop_output( + cur_sum, trt.LoopOutput.CONCATENATE, dim) + loop_output_layer.set_input(1, trip_limit) + return _create_tensor(loop_output_layer.get_output(0), + loop_output_layer) def masked_scatter(input: Tensor, mask: Tensor, source: Tensor) -> Tensor: @@ -3841,6 +3870,77 @@ def create_sinusoidal_positions_for_attention_plugin( return concat.reshape(1, -1).astype(dtype) + @staticmethod + def create_sinusoidal_positions_for_cogvlm_attention_plugin( + num_pos: int, + dim: int, + theta: float = 10000.0, + scale: float = 1.0, + scale_type: RotaryScalingType = RotaryScalingType.none, + vision_start: int = 1, + vision_length: int = 1225, + dtype=np.float32): + if scale_type == RotaryScalingType.linear: + scale = 1.0 / scale + inv_freq = scale / (theta**(np.arange(0, dim, 2) / dim)).astype(dtype) + position_id = np.hstack([ + np.arange(0, vision_start + 1, dtype=dtype), + np.full(vision_length, vision_start + 1, dtype=dtype), + np.arange(vision_start + 2, + num_pos - (vision_length - 1), + dtype=dtype) + ]) + sinusoid_inp = np.expand_dims(np.einsum("i , j -> i j", + position_id, + inv_freq, + dtype=dtype), + axis=-1) + # fuse cos/sin into float2 (cos, sin). 
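Before the cos/sin fusion on the next line, it may help to see what the CogVLM position ids computed above look like: every vision token is pinned to the single rotary position `vision_start + 1`, while the surrounding text tokens keep consecutive positions, so the total length still equals `num_pos`. A small NumPy check with illustrative sizes:

```python
import numpy as np

# Illustrative sizes only; mirrors the position_id construction above.
num_pos, vision_start, vision_length = 64, 1, 16
position_id = np.hstack([
    np.arange(0, vision_start + 1, dtype=np.float32),
    np.full(vision_length, vision_start + 1, dtype=np.float32),
    np.arange(vision_start + 2, num_pos - (vision_length - 1), dtype=np.float32),
])

# Total length is preserved, and all vision tokens share one position.
assert position_id.shape[0] == num_pos
assert np.all(position_id[vision_start + 1:vision_start + 1 + vision_length]
              == vision_start + 1)
```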
+ concat = np.concatenate((np.cos(sinusoid_inp), np.sin(sinusoid_inp)), + axis=-1) + + return concat.reshape(1, -1).astype(dtype) + + def create_sinusoidal_positions_long_rope( + num_pos: int, + num_orig_pos: int, + dim: int, + theta: float = 10000.0, + scaling_short_factors: Tensor = 1.0, + scaling_long_factors: Tensor = 1.0, + dtype=np.float32): + + def _calc_mscale(scale): + if scale <= 1.0: + return 1.0 + return math.sqrt(1 + math.log(scale) / math.log(num_orig_pos)) + + mscale = _calc_mscale(num_pos / num_orig_pos) + + def _compute_sinusoidal_positions(scale_factors, + for_attention_plugin=True): + inv_freq = 1 / (scale_factors * + (theta**(np.arange(0, dim, 2) / dim)).astype(dtype)) + sinusoid_inp = np.einsum("i , j -> i j", + np.arange(num_pos, dtype=dtype), + inv_freq, + dtype=dtype) + if for_attention_plugin: + sinusoid_inp = np.expand_dims(sinusoid_inp, axis=-1) + concat = np.concatenate( + (np.cos(sinusoid_inp), np.sin(sinusoid_inp)), axis=-1) + else: + concat = np.concatenate( + (np.sin(sinusoid_inp), np.cos(sinusoid_inp)), axis=1) + concat = np.expand_dims(concat, axis=0) + return concat.astype(dtype) * mscale + + return _compute_sinusoidal_positions( + scaling_short_factors, False), _compute_sinusoidal_positions( + scaling_long_factors, False), _compute_sinusoidal_positions( + scaling_short_factors, True), _compute_sinusoidal_positions( + scaling_long_factors, True), mscale + @staticmethod def rotate_every_two(tensor: Tensor) -> Tensor: assert tensor.ndim() == 4 @@ -3893,7 +3993,7 @@ def apply_rotary_pos_emb( ) -> Tensor: rotate_func = None - if pos_emb_type == PositionEmbeddingType.rope_gpt_neox: + if pos_emb_type == PositionEmbeddingType.rope_gpt_neox or pos_emb_type == PositionEmbeddingType.long_rope: assert len(position_embedding) == 2 cos, sin = position_embedding sin = expand_dims(sin, 2) @@ -4048,6 +4148,8 @@ def gpt_attention( rotary_embedding_dim: int = 0, rotary_embedding_base: float = 10000.0, rotary_embedding_scale_type: RotaryScalingType = RotaryScalingType.none, + rotary_embedding_scaling_factors: Optional[Tensor] = None, + rotary_embedding_m_scale: Optional[float] = None, rotary_embedding_scale: float = 1.0, rotary_embedding_max_positions: int = 1024, position_embedding_type: PositionEmbeddingType = PositionEmbeddingType. @@ -4062,6 +4164,8 @@ def gpt_attention( alibi_slopes: Optional[Tensor] = None, tp_size: int = 1, tp_rank: int = 0, + vision_start: int = -1, + vision_length: int = -1, kv_cache_block_offsets: Optional[Tensor] = None, host_kv_cache_block_offsets: Tensor = None, host_kv_cache_pool_pointers: Tensor = None, @@ -4074,8 +4178,8 @@ def gpt_attention( host_context_lengths: Optional[Tensor] = None, # for pad-free input mode qkv_bias: Optional[Tensor] = None, use_cache: bool = True, - medusa_position_offsets: Tensor = None, - medusa_packed_mask: Tensor = None, + spec_decoding_position_offsets: Tensor = None, + spec_decoding_packed_mask: Tensor = None, ) -> Tuple[Tensor, Optional[Tensor]]: ''' Add an operation that performs the multi-head attention in GPT-like models. @@ -4261,15 +4365,15 @@ def gpt_attention( medusa_generation_lengths: Tensor = None, The generation phase tokens' lengths for each sequence. - Shape: [Batch_size] + Shape: [batch_size] - medusa_position_offsets: Tensor = None, - The generation phase tokens's position offsets (can be different for each sequences). - Shape: [bs, max_num_medusa_tokens + 1]. + spec_decoding_position_offsets: Tensor = None, + The speculative decoding tokens's position offsets (shared by all sequences). 
+ Shape: [batch_size, num_draft_tokens + 1]. - medusa_packed_mask: Tensor = None, - The generation phase tokens's packed attention mask (can be different for each sequences). - Shape: [bs*(max_num_medusa_tokens+1), divUp(max_num_medusa_tokens + 1, 32)]. + spec_decoding_packed_mask: Tensor = None, + The speculative decoding tokens's attention mask (packed into uint32_t bits). + Shape: [batch_size, num_draft_tokens + 1, divUp(num_draft_tokens + 1, 32)]. Returns: The tensor produced by that layer. @@ -4299,6 +4403,12 @@ def gpt_attention( trt.PluginFieldType.INT32) nheads = trt.PluginField("num_heads", np.array(num_heads, dtype=np.int32), trt.PluginFieldType.INT32) + vision_start = trt.PluginField("vision_start", + np.array(vision_start, dtype=np.int32), + trt.PluginFieldType.INT32) + vision_length = trt.PluginField("vision_length", + np.array(vision_length, dtype=np.int32), + trt.PluginFieldType.INT32) num_kv_heads = trt.PluginField("num_kv_heads", np.array(num_kv_heads, dtype=np.int32), trt.PluginFieldType.INT32) @@ -4326,6 +4436,10 @@ def gpt_attention( "rotary_embedding_scale", np.array(rotary_embedding_scale, dtype=np.float32), trt.PluginFieldType.FLOAT32) + rotary_embedding_m_scale = trt.PluginField( + "rotary_embedding_m_scale", + np.array(rotary_embedding_m_scale, dtype=np.float32), + trt.PluginFieldType.FLOAT32) rotary_embedding_max_positions = trt.PluginField( "rotary_embedding_max_positions", np.array(rotary_embedding_max_positions, dtype=np.int32), @@ -4342,9 +4456,9 @@ def gpt_attention( "remove_input_padding", np.array(np.int8(default_net().plugin_config.remove_input_padding), dtype=np.int8), trt.PluginFieldType.INT8) - is_medusa_enabled = trt.PluginField( - "is_medusa_enabled", - np.array(np.int8(medusa_packed_mask is not None), dtype=np.int8), + is_spec_decoding_enabled = trt.PluginField( + "is_spec_decoding_enabled", + np.array(np.int8(spec_decoding_packed_mask is not None), dtype=np.int8), trt.PluginFieldType.INT8) p_dtype = default_net().plugin_config.gpt_attention_plugin pf_type = trt.PluginField( @@ -4415,16 +4529,17 @@ def gpt_attention( trt.PluginFieldType.INT32) pfc = trt.PluginFieldCollection([ - layer_idx, nheads, num_kv_heads, head_size, unidirectional, q_scaling, - position_embedding_type, rotary_embedding_dim, rotary_embedding_base, + layer_idx, nheads, vision_start, vision_length, num_kv_heads, head_size, + unidirectional, q_scaling, position_embedding_type, + rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale_type, rotary_embedding_scale, - rotary_embedding_max_positions, tp_size, tp_rank, unfuse_qkv_gemm, - context_fmha_type, multi_block_mode, enable_xqa, - kv_cache_quant_mode_field, remove_input_padding, mask_type, + rotary_embedding_m_scale, rotary_embedding_max_positions, tp_size, + tp_rank, unfuse_qkv_gemm, context_fmha_type, multi_block_mode, + enable_xqa, kv_cache_quant_mode_field, remove_input_padding, mask_type, paged_kv_cache, tokens_per_block, pf_type, max_context_length, qkv_bias_enabled, do_cross_attention_field, max_distance, pos_shift_enabled, dense_context_fmha, use_paged_context_fmha_field, - use_fp8_context_fmha_field, use_cache_pf, is_medusa_enabled + use_fp8_context_fmha_field, use_cache_pf, is_spec_decoding_enabled ]) attn_plug = attn_plg_creator.create_plugin("causal_attn", pfc) @@ -4468,6 +4583,8 @@ def gpt_attention( if rotary_cos_sin is not None: plug_inputs += [rotary_cos_sin] + if rotary_embedding_scaling_factors is not None: + plug_inputs += [rotary_embedding_scaling_factors] if alibi_slopes is not None: plug_inputs 
+= [alibi_slopes] @@ -4484,10 +4601,12 @@ def gpt_attention( if qkv_bias is not None: plug_inputs += [qkv_bias] - if medusa_packed_mask is not None: - # add position_ids as well only if medusa mode - assert medusa_position_offsets is not None - plug_inputs += [medusa_packed_mask, medusa_position_offsets] + if spec_decoding_packed_mask is not None: + # add position_ids as well only if speculative decoding mode + assert spec_decoding_position_offsets is not None + plug_inputs += [ + spec_decoding_packed_mask, spec_decoding_position_offsets + ] for idx, i in enumerate(plug_inputs): assert i is not None, f"Found None input for {idx} th item in plugin inputs {plug_inputs}" diff --git a/tensorrt_llm/hlapi/llm.py b/tensorrt_llm/hlapi/llm.py index a5aa2300d..af7ed9236 100644 --- a/tensorrt_llm/hlapi/llm.py +++ b/tensorrt_llm/hlapi/llm.py @@ -35,13 +35,52 @@ class ParallelConfig: ''' The model distribution configs for LLM. ''' tp_size: int = 1 pp_size: int = 1 - world_size: int = 1 - devices: List[int] = field(default_factory=list) auto_parallel: bool = False + _world_size: int = field(default=1, init=False) + _devices: Optional[List[int]] = field(default=None, init=False) - def get_devices(self) -> List[int]: - ''' Get the devices for the model. ''' - return self.devices if self.devices else list(range(self.tp_size)) + @property + def devices(self) -> List[int]: + if self._devices is None: + return list(range(self.world_size)) + return self._devices + + @devices.setter + def devices(self, devices: List[int]): + if len(devices) != self.world_size: + raise ValueError( + f"devices {devices} should have the same length as world_size {self.world_size}" + ) + self._devices = devices + + @property + def world_size(self) -> bool: + if self.auto_parallel: + if self.tp_size > 1 or self.pp_size > 1: + raise RuntimeError( + "manually TP and PP are not supported in auto parallel mode." + ) + return self._world_size + + if self._world_size > 1: + raise RuntimeError( + "world_size > 1 is only supported in auto parallel mode.") + return self.tp_size * self.pp_size + + @world_size.setter + def world_size(self, world_size: int): + if self.auto_parallel: + self._world_size = world_size + elif (not self.auto_parallel + ) and world_size != self.tp_size * self.pp_size: + raise ValueError( + f"world_size {world_size} should be equal to tp_size * pp_size {self.tp_size * self.pp_size} in non-auto_parallel mode.\n" + "For non-auto-parallel mode, the world_size is not needed to set" + ) + + @property + def is_multi_gpu(self) -> bool: + return self.world_size > 1 @dataclass @@ -69,27 +108,6 @@ class ModelConfig: # This is not suggested to be used directly, ideally the HLAPI will deduce all of options automatically. plugin_config: Union[PluginConfig, Dict[str, Any], None] = None - @property - def is_multi_gpu(self) -> bool: - if self.parallel_config.auto_parallel: - return self.parallel_config.world_size > 1 - else: - return self.parallel_config.tp_size > 1 or self.parallel_config.pp_size > 1 - - @property - def world_size(self) -> bool: - if self.parallel_config.auto_parallel: - if self.parallel_config.tp_size > 1 or self.parallel_config.pp_size > 1: - raise RuntimeError( - "manually TP and PP are not supported in auto parallel mode." 
- ) - return self.parallel_config.world_size - - if self.parallel_config.world_size > 1: - raise RuntimeError( - "world_size > 1 is only supported in auto parallel mode.") - return self.parallel_config.tp_size * self.parallel_config.pp_size - def _set_additional_options(self, max_batch_size: Optional[int] = None, max_input_len: Optional[int] = None, @@ -188,7 +206,7 @@ def _update_plugin_config(self, key: str, value: Any): def _validate_gpu_for_paged_context(self, value: bool): if value: - devices = self.parallel_config.get_devices() + devices = self.parallel_config.devices if torch.cuda.get_device_properties(devices[0]).major < 8: raise ValueError( "Paged context is only supported on post Volta GPUs") @@ -308,6 +326,7 @@ def __init__(self, multi_block_mode(bool): Switch the optimization on multi-head attention optimization for long context decoding. enable_chunked_context(bool): Whether to enable the chunked context for the generation. scheduling_policy(SchedulerPolicy): The scheduling policy for the generation. + trt_strongly_typed(bool): Whether to create a strongly typed TensorRT plan where tensor data types are inferred from network input types and operator type specification. Enabling this option will reduce the engine building time. ''' self.config = config @@ -339,25 +358,31 @@ def __init__(self, 'enable_trt_overlap', None) self.scheduling_policy = _additional_options.pop( 'scheduling_policy', SchedulerPolicy.GUARANTEED_NO_EVICT) + + self._build_config = BuildConfig() + self._build_config.strongly_typed = _additional_options.pop( + 'trt_strongly_typed', True) if _additional_options: raise ValueError(f"Unknown options {_additional_options}") - devices = self.config.parallel_config.get_devices() + devices = self.config.parallel_config.devices if torch.cuda.get_device_properties(devices[0]).major < 8: logger.info( f"Disable the chunked context on GPUs that predate the Volta architecture." ) self.enable_chunked_context = False - if self.config.is_multi_gpu: - if get_device_count() < self.config.world_size: + if self.config.parallel_config.is_multi_gpu: + if get_device_count() < self.config.parallel_config.world_size: raise RuntimeError( - f"Only {get_device_count()} GPUs are available, but {self.config.world_size} are required." + f"Only {get_device_count()} GPUs are available, but {self.config.parallel_config.world_size} are required." ) logger.info( - f'start MpiSession with {self.config.world_size} workers') - self.mpi_session = MpiSession(n_workers=self.config.world_size) + f'start MpiSession with {self.config.parallel_config.world_size} workers' + ) + self.mpi_session = MpiSession( + n_workers=self.config.parallel_config.world_size) # Due to the gptManager can only accept a engine path, we need to save the engine to a directory self._engine_dir: Union[tempfile.TemporaryDirectory, str, Path, @@ -371,6 +396,7 @@ def __init__(self, # When got an engine, the plugin config are fixed, shouldn't be altered. 
# TODO[chunweiy]: Refine the rules here and make them easy to be updated through versions # TODO[chunweiy]: Deal with the rules those depend on each other + if self.config.model_format is not _ModelFormatKind.TLLM_ENGINE: if self.kv_cache_config is not None: if self.kv_cache_config.enable_block_reuse: @@ -381,6 +407,8 @@ def __init__(self, True) if self.config.quant_config.quant_algo is QuantAlgo.FP8: self.enable_chunked_context = False + self.config._update_plugin_config("use_paged_context_fmha", + False) if self.enable_chunked_context is not None: self.config._update_plugin_config("enable_chunked_context", self.enable_chunked_context) @@ -516,7 +544,7 @@ def __exit__(self, exc_type, exc_value, traceback) -> bool: def _save_engine(self, engine_dir: str): logger.info(f"Save model to {engine_dir}") - if self.config.is_multi_gpu: + if self.config.parallel_config.is_multi_gpu: if self._executor is not None: self._executor.shutdown() self.mpi_session.submit_sync(LLM._node_save_task, engine_dir, @@ -563,7 +591,7 @@ def get_engine_dir(): if self._engine_dir is None: self._engine_dir = tempfile.TemporaryDirectory() - if self.config.is_multi_gpu: + if self.config.parallel_config.is_multi_gpu: self.mpi_session.submit_sync( LLM._node_build_task, self.config, @@ -580,6 +608,7 @@ def get_engine_dir(): self.config, tokenizer=self._tokenizer, workspace=self._workspace.name, + build_config=self._build_config, ) as model_loader: runtime_context = model_loader() @@ -612,7 +641,7 @@ def get_engine_dir(): max_beam_width=self.config.max_beam_width, executor_config=executor_config, executor_policy=self.scheduling_policy, - model_world_size=self.config.world_size, + model_world_size=self.config.parallel_config.world_size, mpi_session=self.mpi_session, executor_type=tllm.TrtGptModelType.InflightFusedBatching, ) @@ -727,17 +756,20 @@ class ModelLoader: def __init__(self, config: ModelConfig, tokenizer: Optional[TokenizerBase], - workspace: Optional[str] = None): + workspace: Optional[str] = None, + build_config: Optional[BuildConfig] = None): self.config = config self.tokenizer = tokenizer self.workspace = workspace - self.rank = mpi_rank() if config.is_multi_gpu else 0 - if config.is_multi_gpu and not config.parallel_config.auto_parallel: + + self.build_config = build_config or BuildConfig() + self.rank = mpi_rank() if config.parallel_config.is_multi_gpu else 0 + if config.parallel_config.is_multi_gpu and not config.parallel_config.auto_parallel: self.mapping = Mapping( tp_size=config.parallel_config.tp_size, pp_size=config.parallel_config.pp_size, rank=self.rank, - world_size=config.world_size, + world_size=config.parallel_config.world_size, ) else: self.mapping = Mapping() @@ -748,7 +780,8 @@ def __init__(self, self._model_info: Optional[_ModelInfo] = None self._model_name = self.config.model self.auto_parallel_config = AutoParallelConfig( - world_size=config.parallel_config.world_size) + world_size=config.parallel_config.world_size if config. 
+ parallel_config.auto_parallel else 1) default_config = self.config.auto_parallel_config self.auto_parallel_config.set_defaults( cluster_key=default_config.cluster_key, @@ -799,7 +832,7 @@ def __init__(self, ("Initialize tokenizer", self._load_hf_tokenizer)) def __call__(self) -> _ModelRuntimeContext: - if self.config.is_multi_gpu: + if self.config.parallel_config.is_multi_gpu: torch.cuda.set_device(self.rank) n_steps = len(self._model_pipeline) @@ -939,7 +972,7 @@ def _load_model_from_hf(self): self.config.quant_config, mapping=self.mapping, ) - if self.config.is_multi_gpu: + if self.config.parallel_config.is_multi_gpu: mpi_barrier() self.model = model2struct[model_arch].from_checkpoint( checkpoint_dir, rank=self.mapping.rank) @@ -980,19 +1013,18 @@ def _build_engine(self): for k, v in self.config.plugin_config.items(): setattr(plugin_config, k, v) - build_config = BuildConfig( + self.build_config.update( max_input_len=self.config.max_input_len, max_output_len=self.config.max_output_len, max_batch_size=self.config.max_batch_size, max_beam_width=self.config.max_beam_width, max_num_tokens=self.config.max_num_tokens, - strongly_typed=True, auto_parallel_config=self.auto_parallel_config, plugin_config=plugin_config, ) if self.auto_parallel_config.enabled: self.model.config.mapping.rank = self.rank - engine = build(self.model, build_config) + engine = build(self.model, self.build_config) self._engine_buffer = engine.engine self._engine_config = engine.config diff --git a/tensorrt_llm/layers/__init__.py b/tensorrt_llm/layers/__init__.py index 0885ce528..aac3779bc 100644 --- a/tensorrt_llm/layers/__init__.py +++ b/tensorrt_llm/layers/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. from .activation import Mish from .attention import (Attention, AttentionMaskType, AttentionParams, - BertAttention, KeyValueCacheParams, + BertAttention, CogVLMAttention, KeyValueCacheParams, PositionEmbeddingType) from .cast import Cast from .conv import Conv1d, Conv2d, ConvTranspose2d @@ -41,6 +41,7 @@ 'PositionEmbeddingType', 'Attention', 'BertAttention', + 'CogVLMAttention', 'GroupNorm', 'Embedding', 'PromptTuningEmbedding', diff --git a/tensorrt_llm/layers/attention.py b/tensorrt_llm/layers/attention.py index 4cedc59be..5f7225479 100644 --- a/tensorrt_llm/layers/attention.py +++ b/tensorrt_llm/layers/attention.py @@ -27,14 +27,19 @@ bert_attention, cast, clip, concat, conditional, constant, embedding, expand, expand_dims, expand_mask, generate_alibi_biases, generate_alibi_slopes, - gpt_attention, matmul, minimum, repeat_interleave, - shape, slice, softmax, split, unsqueeze, where) + gpt_attention, matmul) +from ..functional import max as fmax +from ..functional import (minimum, repeat_interleave, shape, slice, softmax, + split, unsqueeze, where) from ..module import Module from ..parameter import Parameter from ..quantization import QuantMode from ..quantization.functional import dequantize, quantize from .linear import ColumnLinear, QKVColumnLinear, RowLinear from .lora import LoraRuntimeParams +from .normalization import LayerNorm + +from ..functional import maximum # isort:skip def make_causal_mask(bsz, tgt_len, past_key_values_length, dtype): @@ -233,6 +238,7 @@ def __init__( num_layers=1, apply_query_key_layer_scaling=False, attention_head_size=None, + qk_layernorm=False, attention_mask_type=AttentionMaskType.padding, bias=True, dtype=None, @@ -240,6 +246,9 @@ def __init__( rotary_embedding_base=10000.0, rotary_embedding_scaling=None, rotary_embedding_percentage=1.0, + 
rope_scaling_short_factors=None, + rope_scaling_long_factors=None, + original_max_position_embeddings=1024, tp_group=None, tp_size=1, tp_rank=0, @@ -311,7 +320,7 @@ def __init__( self.embed_positions = None self.rotary_enabled = False self.rotary_embedding_dim = 0 - + self.mscale = None if self.position_embedding_type.is_rope(): self.rotary_embedding_dim = int(self.attention_head_size * rotary_embedding_percentage) @@ -319,11 +328,25 @@ def __init__( self.embed_positions = RopeEmbeddingUtils.create_sinusoidal_positions( self.max_position_embeddings, self.rotary_embedding_dim, - ) + theta=self.rotary_embedding_base) self.embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( self.max_position_embeddings, self.rotary_embedding_dim, self.rotary_embedding_base, self.rotary_embedding_scale, self.rotary_embedding_scale_type) + if self.position_embedding_type == PositionEmbeddingType.long_rope: + self.embed_positions_short_factors, self.embed_positions_long_factors, \ + self.embed_positions_short_factors_for_attention_plugin, \ + self.embed_positions_long_factors_for_attention_plugin, self.mscale \ + = RopeEmbeddingUtils.create_sinusoidal_positions_long_rope( + self.max_position_embeddings, + original_max_position_embeddings, self.rotary_embedding_dim, + self.rotary_embedding_base, rope_scaling_short_factors, + rope_scaling_long_factors) + self.rope_scaling_short_factors = np.array( + rope_scaling_short_factors).reshape(1, -1) + self.rope_scaling_long_factors = np.array( + rope_scaling_long_factors).reshape(1, -1) + self.original_max_position_embeddings = original_max_position_embeddings self.quant_mode = quant_mode self.register_parameter('kv_cache_scaling_factor', None) @@ -358,6 +381,10 @@ def __init__( self.rel_attn_table = Parameter(shape=(num_attention_heads // tp_size, num_buckets), dtype=dtype) + self.qk_layernorm = qk_layernorm + if self.qk_layernorm: + self.q_layernorm = LayerNorm(self.attention_head_size, dtype=dtype) + self.k_layernorm = LayerNorm(self.attention_head_size, dtype=dtype) if clip_qkv is not None: self.clip_qkv = fp32_array([clip_qkv]) @@ -369,8 +396,8 @@ def __init__( def forward(self, hidden_states: Tensor, attention_mask=None, - medusa_packed_mask=None, - medusa_position_offsets=None, + spec_decoding_packed_mask=None, + spec_decoding_position_offsets=None, use_cache=False, kv_cache_params=None, attention_params=None, @@ -476,7 +503,29 @@ def forward(self, qkv_lora = concat([q_lora, k_lora, v_lora], dim=q_lora.rank() - 1) qkv = qkv + qkv_lora + if self.qk_layernorm: + base_shape = shape(qkv, 0) if qkv.ndim() == 2 else concat( + [shape(qkv, 0), shape(qkv, 1)]) + # here we assume that q, k and v have the same number of attention heads + # TODO: allow different number of attention heads for q, k and v. 
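The new qk_layernorm option attaches one LayerNorm of size attention_head_size to the query heads and another to the key heads; in the forward pass the fused QKV activation is reshaped into per-head [Q, K, V] triples, Q and K are normalized over the head dimension, and the tensor is fused back before RoPE and the attention kernel (values are left untouched, and equal Q/K/V head counts are assumed, as the TODO above notes). A hedged PyTorch sketch of the same reshape/normalize/re-fuse step:

import torch
import torch.nn as nn

def qk_layernorm(qkv: torch.Tensor, num_heads: int, head_size: int,
                 q_ln: nn.LayerNorm, k_ln: nn.LayerNorm) -> torch.Tensor:
    # qkv: [batch, seq, 3 * num_heads * head_size], equal Q/K/V head counts.
    b, s, _ = qkv.shape
    qkv = qkv.view(b, s, num_heads, 3, head_size)
    q, k, v = qkv.unbind(dim=3)
    q = q_ln(q)  # normalize every query head over head_size
    k = k_ln(k)  # normalize every key head over head_size
    # This sketch round-trips the input layout; the in-tree code instead
    # concatenates the Q, K and V head groups back together for the attention path.
    return torch.stack((q, k, v), dim=3).reshape(b, s, 3 * num_heads * head_size)

num_heads, head_size = 8, 64
qkv = torch.randn(2, 16, 3 * num_heads * head_size)
out = qk_layernorm(qkv, num_heads, head_size,
                   nn.LayerNorm(head_size), nn.LayerNorm(head_size))
assert out.shape == qkv.shape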
+ qkv = qkv.view( + concat([ + base_shape, self.num_attention_heads, 3, + self.attention_head_size + ])) + query, key, value = split(qkv, 1, dim=qkv.ndim() - 2) + q_shape = concat([ + base_shape, self.num_attention_heads, self.attention_head_size + ]) + query = query.view(q_shape) + key = key.view(q_shape) + value = value.view(q_shape) + + query = self.q_layernorm(query) + key = self.k_layernorm(key) + qkv = concat([query, key, value], dim=query.ndim() - 2) + qkv = qkv.view(concat([base_shape, self.attention_hidden_size * 3])) if self.position_embedding_type == PositionEmbeddingType.chatglm: qkv = RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm( qkv, @@ -571,10 +620,48 @@ def forward(self, else: attention_output_orig_quant_scale = None - # Rotary cos/sin cache. - rotary_cos_sin = constant( - self.embed_positions_for_gpt_attention - ) if self.position_embedding_type.is_rope() else None + if self.position_embedding_type == PositionEmbeddingType.long_rope: + short = slice( + constant( + self.embed_positions_short_factors_for_attention_plugin + ), concat([0, 0, 0]), + concat([ + max(attention_params.sequence_length, + self.original_max_position_embeddings), + self.rotary_embedding_dim // 2, 2 + ])) + long = slice( + constant( + self.embed_positions_long_factors_for_attention_plugin), + concat([0, 0, 0]), + concat([ + max(attention_params.sequence_length, + self.original_max_position_embeddings), + self.rotary_embedding_dim // 2, 2 + ])) + short = short.view((1, -1)) + long = long.view((1, -1)) + embed_positions = concat([short, long], dim=0) + select = where( + fmax(attention_params.sequence_length, dim=0) <= + self.original_max_position_embeddings, 0, 1) + rotary_cos_sin = slice(embed_positions, + concat([select, 0]), + sizes=concat([1, shape(long, 1)])) + short_factors = constant(self.rope_scaling_short_factors) + long_factors = constant(self.rope_scaling_long_factors) + scale_factors = concat([short_factors, long_factors], dim=0) + rope_scaling_factors = slice(scale_factors, + concat([select, 0]), + sizes=concat( + [1, shape(long_factors, 1)])) + rope_scaling_factors = rope_scaling_factors.view((-1, )) + else: + # Rotary cos/sin cache. 
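For PositionEmbeddingType.long_rope the layer keeps two precomputed rotary cos/sin tables and two factor vectors, one pair built from rope_scaling_short_factors and one from rope_scaling_long_factors; at runtime the branch above compares the largest sequence length in the batch against original_max_position_embeddings and feeds either the short or the long pair to the attention plugin. A hedged numpy sketch of just that selection rule (table contents are placeholders):

import numpy as np

def select_long_rope_table(seq_lens, original_max_pos, short_table, long_table):
    # Index 0 (short factors) while every sequence still fits inside the
    # original context window, otherwise index 1 (long factors).
    select = 0 if int(np.max(seq_lens)) <= original_max_pos else 1
    return (short_table, long_table)[select]

short = np.zeros((1, 8))  # placeholder rows built with the short factors
long_ = np.ones((1, 8))   # placeholder rows built with the long factors
table = select_long_rope_table([3000, 4096], 4096, short, long_)
assert table is short     # a 4096-token request still fits the original window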
+ rotary_cos_sin = constant( + self.embed_positions_for_gpt_attention + ) if self.position_embedding_type.is_rope() else None + rope_scaling_factors = None context, past_key_value = gpt_attention( qkv=qkv, past_key_value=past_key_value, @@ -595,6 +682,8 @@ def forward(self, rotary_embedding_dim=self.rotary_embedding_dim, rotary_embedding_base=self.rotary_embedding_base, rotary_embedding_scale_type=self.rotary_embedding_scale_type, + rotary_embedding_scaling_factors=rope_scaling_factors, + rotary_embedding_m_scale=self.mscale, rotary_embedding_scale=self.rotary_embedding_scale, rotary_embedding_max_positions=self.max_position_embeddings, position_embedding_type=self.position_embedding_type, @@ -627,8 +716,8 @@ def forward(self, max_distance=self.max_distance, host_context_lengths=attention_params.host_context_lengths, use_cache=use_cache, - medusa_position_offsets=medusa_position_offsets, - medusa_packed_mask=medusa_packed_mask, + spec_decoding_position_offsets=spec_decoding_position_offsets, + spec_decoding_packed_mask=spec_decoding_packed_mask, ) else: @@ -675,7 +764,35 @@ def transpose_for_scores(x, value = transpose_for_scores(value, is_kv=True) if self.rotary_enabled: - if is_same_dtype(self.dtype, trt.bfloat16): + if self.position_embedding_type == PositionEmbeddingType.long_rope: + sequence_length = shape(hidden_states, 1) + short = slice( + constant(self.embed_positions_short_factors), + concat([0, 0, 0]), + concat([ + 1, + max(sequence_length, + self.original_max_position_embeddings), + self.rotary_embedding_dim + ])) + long = slice( + constant(self.embed_positions_long_factors), + concat([0, 0, 0]), + concat([ + 1, + max(sequence_length, + self.original_max_position_embeddings), + self.rotary_embedding_dim + ])) + embed_positions = concat([short, long], dim=0) + select = where( + sequence_length <= + self.original_max_position_embeddings, 0, 1) + self.embed_positions = slice(embed_positions, + concat([select, 0, 0]), + sizes=shape(short)) + embed_positions = cast(self.embed_positions, self.dtype) + elif is_same_dtype(self.dtype, trt.bfloat16): embed_positions = numpy_fp32_to_bf16( self.embed_positions.astype(np.float32)) embed_positions = constant(embed_positions) @@ -818,12 +935,14 @@ def transpose_for_scores(x, query_length = shape(query, 2) starts = concat([0, 0, key_length - query_length, 0]) sizes = concat([1, 1, query_length, key_length]) + if self.position_embedding_type == PositionEmbeddingType.long_rope: + buf_shape = (self.original_max_position_embeddings, + self.original_max_position_embeddings) + else: + buf_shape = (self.max_position_embeddings, + self.max_position_embeddings) select_buf = np.expand_dims( - np.tril( - np.ones( - (self.max_position_embeddings, - self.max_position_embeddings))).astype(bool), - (0, 1)) + np.tril(np.ones(buf_shape)).astype(bool), (0, 1)) select_buf = np.logical_not(select_buf) mask_buf = np.zeros_like(select_buf, np.float32) @@ -1150,3 +1269,218 @@ def transpose_for_scores(x): context = self.dense(context, lora_runtime_params=dense_lora_params) return context + + +class CogVLMAttention(Attention): + + def __init__( + self, + *, + local_layer_idx, + hidden_size, + num_attention_heads, + num_kv_heads=None, + max_position_embeddings=1024, + attention_mask_type=AttentionMaskType.causal, + bias=True, + dtype=None, + rotary_embedding_base=10000.0, + rotary_embedding_scaling=None, + tp_group=None, + tp_size=1, + tp_rank=0, + vision_start=1, + vision_length=1225, + quant_mode: QuantMode = QuantMode(0), + dense_bias=None, + ): + super().__init__( 
+ local_layer_idx=local_layer_idx, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_kv_heads=num_kv_heads, + max_position_embeddings=max_position_embeddings, + dtype=dtype, + attention_mask_type=attention_mask_type, + bias=bias, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_embedding_base=rotary_embedding_base, + rotary_embedding_scaling=rotary_embedding_scaling, + tp_group=tp_group, + tp_size=tp_size, + tp_rank=tp_rank, + quant_mode=quant_mode) + + self.vision_length = vision_length + self.vision_start = vision_start + + self.vis_qkv = QKVColumnLinear( + hidden_size, + tp_size * self.num_attention_heads * self.attention_head_size + + (2 * tp_size * self.num_attention_kv_heads * + self.attention_head_size), + bias=bias, + dtype=dtype, + tp_group=tp_group, + tp_size=tp_size, + gather_output=False) + self.vis_dense = RowLinear(tp_size * self.num_attention_heads * + self.attention_head_size, + hidden_size, + bias=dense_bias, + dtype=dtype, + tp_group=tp_group, + tp_size=tp_size) + self.embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_cogvlm_attention_plugin( + self.max_position_embeddings, self.rotary_embedding_dim, + self.rotary_embedding_base, self.rotary_embedding_scale, + self.rotary_embedding_scale_type, self.vision_start, + self.vision_length) + + def forward(self, + hidden_states: Tensor, + use_cache=False, + kv_cache_params=None, + attention_params=None): + + assert isinstance(hidden_states, Tensor) + assert (not default_net().plugin_config.remove_input_padding) + assert (default_net().plugin_config.gpt_attention_plugin) + + bs = shape(hidden_states, 0) + seq_length = shape(hidden_states, 1) + bos = slice(hidden_states, [0, 0, 0], + concat([bs, self.vision_start, self.hidden_size])) + vis_seq_length = minimum(self.vision_length + 1, seq_length - 1) + vision_hidden_states = slice( + hidden_states, [0, self.vision_start, 0], + concat([bs, vis_seq_length, self.hidden_size])) + text_seq_length = maximum( + 0, seq_length - (self.vision_length + 1 + self.vision_start)) + language_hidden_states = slice( + hidden_states, [0, self.vision_length + 1 + self.vision_start, 0], + concat([bs, text_seq_length, self.hidden_size])) + bos_qkv = self.qkv(bos) + language_qkv = self.qkv(language_hidden_states) + vision_qkv = self.vis_qkv(vision_hidden_states) + qkv = concat([bos_qkv, vision_qkv, language_qkv], dim=1) + + assert attention_params is None or attention_params.is_valid( + default_net().plugin_config.gpt_attention_plugin, + default_net().plugin_config.remove_input_padding) + assert kv_cache_params is None or kv_cache_params.is_valid( + default_net().plugin_config.gpt_attention_plugin) + + past_key_value = None if kv_cache_params is None else kv_cache_params.get_first_past_key_value( + ) + + if default_net().plugin_config.gpt_attention_plugin: + if self.cross_attention and (past_key_value is not None): + past_key_value = kv_cache_params.past_key_value[1] + assert self.attention_mask_type in [ + AttentionMaskType.causal, AttentionMaskType.bidirectional, + AttentionMaskType.bidirectionalglm + ], 'Plugin only support masked MHA.' + + # KV cache scales. 
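CogVLMAttention keeps two sets of projection weights: the inherited qkv/dense pair serves the BOS and text tokens (the language expert) while vis_qkv/vis_dense serve the image tokens (the vision expert). The forward pass above slices the padded sequence into [BOS | vision | text] segments, runs each segment through its own QKV projection, and concatenates the results before the shared attention kernel; the same split is applied again to the attention output further below. A hedged PyTorch sketch of that routing with one linear per expert:

import torch
import torch.nn as nn

def route_by_modality(x, vision_start, vision_length, lang_proj, vis_proj):
    # x: [batch, seq, hidden], sequence layout assumed to be [BOS | vision | text].
    seq = x.shape[1]
    vis_len = min(vision_length + 1, seq - 1)
    text_len = max(0, seq - (vision_length + 1 + vision_start))
    text_start = vision_start + vision_length + 1
    bos = x[:, :vision_start]
    vision = x[:, vision_start:vision_start + vis_len]
    text = x[:, text_start:text_start + text_len]
    # Language expert handles BOS and text, vision expert handles image tokens.
    return torch.cat([lang_proj(bos), vis_proj(vision), lang_proj(text)], dim=1)

hidden = 32
lang, vis = nn.Linear(hidden, hidden), nn.Linear(hidden, hidden)
x = torch.randn(1, 1 + 9 + 6, hidden)  # 1 BOS token, 9 vision tokens, 6 text tokens
y = route_by_modality(x, vision_start=1, vision_length=8, lang_proj=lang, vis_proj=vis)
assert y.shape == x.shape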
+ kv_orig_quant_scale = constant( + fp32_array([1.0]) + ) / self.kv_cache_scaling_factor.value if self.quant_mode.has_kv_cache_quant( + ) else None + kv_quant_orig_scale = self.kv_cache_scaling_factor.value if self.quant_mode.has_kv_cache_quant( + ) else None + + # Attention output scales + assert ( + not default_net().plugin_config.use_fp8_context_fmha + ) or self.quant_mode.has_fp8_qdq( + ), "FP8 Context FMHA must be used together with the fp8 quantization workflow." + + if self.quant_mode.has_fp8_qdq() and default_net( + ).plugin_config.use_fp8_context_fmha: + # the attention plugin only quantizes the output when fp8 context fmha is enabled. + attention_output_orig_quant_scale = constant( + fp32_array([1.0] / + self.dense.activation_scaling_factor.raw_value)) + else: + attention_output_orig_quant_scale = None + + rotary_cos_sin = constant(self.embed_positions_for_gpt_attention) + + context, past_key_value = gpt_attention( + qkv=qkv, + past_key_value=past_key_value, + sequence_length=attention_params.sequence_length, + host_past_key_value_lengths=kv_cache_params. + host_past_key_value_lengths, + host_max_attention_window_sizes=kv_cache_params. + host_max_attention_window_sizes, + host_sink_token_length=kv_cache_params.host_sink_token_length, + context_lengths=attention_params.context_lengths, + cache_indirection=kv_cache_params.cache_indirection, + host_request_types=attention_params.host_request_types, + layer_idx=self.layer_idx, + num_heads=self.num_attention_heads, + num_kv_heads=self.num_attention_kv_heads, + hidden_size_per_head=self.attention_head_size, + q_scaling=self.q_scaling, + rotary_embedding_dim=self.rotary_embedding_dim, + rotary_embedding_base=self.rotary_embedding_base, + rotary_embedding_scale_type=self.rotary_embedding_scale_type, + rotary_embedding_scale=self.rotary_embedding_scale, + rotary_embedding_max_positions=self.max_position_embeddings, + position_embedding_type=self.position_embedding_type, + rotary_cos_sin=rotary_cos_sin, + kv_orig_quant_scale=kv_orig_quant_scale, + kv_quant_orig_scale=kv_quant_orig_scale, + attention_output_orig_quant_scale= + attention_output_orig_quant_scale, + kv_cache_quant_mode=self.quant_mode, + max_context_length=attention_params.max_context_length, + mask_type=self.attention_mask_type, + alibi_slopes=None, + tp_size=self.tp_size, + tp_rank=self.tp_rank, + vision_start=self.vision_start, + vision_length=self.vision_length, + kv_cache_block_offsets=kv_cache_params.kv_cache_block_offsets, + host_kv_cache_block_offsets=kv_cache_params. + host_kv_cache_block_offsets, + host_kv_cache_pool_pointers=kv_cache_params. 
+ host_kv_cache_pool_pointers, + do_cross_attention=self.cross_attention, + cross_qkv=None, + cross_qkv_length=attention_params.encoder_max_input_length, + encoder_input_lengths=attention_params.encoder_input_lengths, + relative_attention_bias=self.rel_attn_table.value + if self.relative_attention else None, + max_distance=self.max_distance, + host_context_lengths=attention_params.host_context_lengths, + use_cache=use_cache, + spec_decoding_position_offsets=None, + spec_decoding_packed_mask=None, + ) + + bs = shape(context, 0) + seq_length = shape(context, 1) + bos = slice(context, [0, 0, 0], + concat([bs, self.vision_start, self.hidden_size])) + vis_seq_length = minimum(self.vision_length + 1, seq_length - 1) + vision_hidden_states = slice( + context, [0, self.vision_start, 0], + concat([bs, vis_seq_length, self.hidden_size])) + text_seq_length = maximum( + 0, seq_length - (self.vision_length + 1 + self.vision_start)) + language_hidden_states = slice( + context, [0, self.vision_length + 1 + self.vision_start, 0], + concat([bs, text_seq_length, self.hidden_size])) + + bos_dense = self.dense(bos) + language_dense = self.dense(language_hidden_states) + vision_dense = self.vis_dense(vision_hidden_states) + context = concat([bos_dense, vision_dense, language_dense], dim=1) + + if use_cache: + return (context, past_key_value) + else: + return context diff --git a/tensorrt_llm/layers/moe.py b/tensorrt_llm/layers/moe.py index 1fce360c5..80718fe51 100644 --- a/tensorrt_llm/layers/moe.py +++ b/tensorrt_llm/layers/moe.py @@ -191,7 +191,7 @@ def from_parameter(x): plugin_inputs += [finished] # Add conditional inputs - if quant_mode.has_any_quant(): + if quant_mode.is_weight_only() or quant_mode.has_fp8_qdq(): assert expert_scale_1 assert expert_scale_2 plugin_inputs += [expert_scale_1, expert_scale_2] @@ -319,6 +319,7 @@ def __init__(self, # all is more efficient as no allreduce required in the end. # Note that if we see models that have large number of experts, we may # need to consider add TP back here. + # TODO: Arctic has large # experts, we may need to add TP back here. 
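In moe.py the condition for appending the per-expert scale tensors to the MoE plugin inputs is narrowed from has_any_quant() to is_weight_only() or has_fp8_qdq(), presumably because quant modes that do not quantize the expert GEMMs (for example KV-cache-only quantization) have no expert scales to supply. A minimal hedged mirror of the new gating (append semantics only, plugin wiring omitted):

def add_expert_scales(plugin_inputs, quant_mode, expert_scale_1, expert_scale_2):
    # Expert scales are only expected for weight-only and FP8 QDQ quantization.
    if quant_mode.is_weight_only() or quant_mode.has_fp8_qdq():
        assert expert_scale_1 is not None and expert_scale_2 is not None
        plugin_inputs += [expert_scale_1, expert_scale_2]
    return plugin_inputs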
self.router = RowLinear( hidden_size, self.num_experts, diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index 0c624a81b..f6df2bc00 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -17,6 +17,7 @@ BertForSequenceClassification, BertModel) from .bloom.model import BloomForCausalLM, BloomModel from .chatglm.model import ChatGLMForCausalLM, ChatGLMModel +from .cogvlm.model import CogVLMForCausalLM from .dbrx.model import DbrxForCausalLM from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder from .falcon.model import FalconForCausalLM, FalconModel @@ -27,9 +28,11 @@ from .llama.model import LLaMAForCausalLM, LLaMAModel from .mamba.model import MambaLMHeadModel from .medusa.model import MedusaForCausalLm -from .modeling_utils import PretrainedConfig, PretrainedModel +from .modeling_utils import (PretrainedConfig, PretrainedModel, + SpeculativeDecodingMode) from .mpt.model import MPTForCausalLM, MPTModel from .opt.model import OPTForCausalLM, OPTModel +from .phi3.model import Phi3ForCausalLM, Phi3Model from .phi.model import PhiForCausalLM, PhiModel from .qwen.model import QWenForCausalLM from .recurrentgemma.model import RecurrentGemmaForCausalLM @@ -54,7 +57,9 @@ 'GPTNeoXModel', 'GPTNeoXForCausalLM', 'PhiModel', + 'Phi3Model', 'PhiForCausalLM', + 'Phi3ForCausalLM', 'ChatGLMForCausalLM', 'ChatGLMModel', 'BaichuanForCausalLM', @@ -71,6 +76,8 @@ 'GemmaForCausalLM', 'DbrxForCausalLM', 'RecurrentGemmaForCausalLM', + 'CogVLMForCausalLM', + 'SpeculativeDecodingMode', ] MODEL_MAP = { @@ -79,6 +86,7 @@ 'BloomForCausalLM': BloomForCausalLM, 'FalconForCausalLM': FalconForCausalLM, 'PhiForCausalLM': PhiForCausalLM, + 'Phi3ForCausalLM': Phi3ForCausalLM, 'MambaLMHeadModel': MambaLMHeadModel, 'GPTNeoXForCausalLM': GPTNeoXForCausalLM, 'GPTJForCausalLM': GPTJForCausalLM, @@ -87,6 +95,7 @@ 'LlamaForCausalLM': LLaMAForCausalLM, 'MistralForCausalLM': LLaMAForCausalLM, 'MixtralForCausalLM': LLaMAForCausalLM, + 'ArcticForCausalLM': LLaMAForCausalLM, 'InternLMForCausalLM': LLaMAForCausalLM, 'MedusaForCausalLM': MedusaForCausalLm, 'BaichuanForCausalLM': BaichuanForCausalLM, @@ -97,4 +106,5 @@ 'DecoderModel': DecoderModel, 'DbrxForCausalLM': DbrxForCausalLM, 'RecurrentGemmaForCausalLM': RecurrentGemmaForCausalLM, + 'CogVLMForCausalLM': CogVLMForCausalLM, } diff --git a/tensorrt_llm/models/cogvlm/__init__.py b/tensorrt_llm/models/cogvlm/__init__.py new file mode 100644 index 000000000..71bf6d298 --- /dev/null +++ b/tensorrt_llm/models/cogvlm/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
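The registry updates above add Phi3ForCausalLM and CogVLMForCausalLM and route ArcticForCausalLM through the existing LLaMAForCausalLM implementation (Arctic reuses the LLaMA decoder plus the residual MLP handled in llama/convert.py further below). A hedged usage sketch of the architecture-to-class lookup; the checkpoint path is a placeholder:

from transformers import AutoConfig
from tensorrt_llm.models import MODEL_MAP

hf_config = AutoConfig.from_pretrained("/path/to/hf_checkpoint")  # placeholder path
architecture = hf_config.architectures[0]        # e.g. "ArcticForCausalLM"
model_cls = MODEL_MAP[architecture]              # resolves to LLaMAForCausalLM for Arctic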
diff --git a/tensorrt_llm/models/cogvlm/convert.py b/tensorrt_llm/models/cogvlm/convert.py new file mode 100644 index 000000000..65b20aa92 --- /dev/null +++ b/tensorrt_llm/models/cogvlm/convert.py @@ -0,0 +1,250 @@ +import time + +import numpy as np +import torch + +from tensorrt_llm.logger import logger + +from ..._utils import pad_vocab_size +from ..llama.convert import (get_tllm_linear_weight, get_weight, split, + split_matrix_tp, split_qkv_tp) + + +def convert_hf_cogvlm(hf_model, + mapping, + vocab_size=32000, + dtype='float32', + use_parallel_embedding=False, + sharding_dim=0, + use_weight_only=False, + share_embedding_table=False, + use_gemm_woq_plugin=False, + plugin_weight_only_quant_type=torch.int8, + use_smooth_quant=False, + per_channel=False, + per_token=False, + int8_kv_cache=False, + act_range=[], + qkv_para=[], + smoother=[], + moe_config=None, + lora_config=None): + + weights = {} + tik = time.time() + tensor_parallel = mapping.tp_size + model_params = dict(hf_model.named_parameters()) + dtype = getattr(torch, dtype) + num_attention_heads = hf_model.config.num_attention_heads + hidden_size = hf_model.config.hidden_size + if hasattr(hf_model.config, "num_key_value_heads"): + num_key_value_heads = hf_model.config.num_key_value_heads + else: + num_key_value_heads = num_attention_heads + mha_mode = (num_key_value_heads == num_attention_heads) + layers_range = mapping.pp_layers(hf_model.config.num_hidden_layers) + assert mha_mode, "CogVLM only supports mha mode" + assert not use_smooth_quant, "CogVLM currently doesn't support smooth quant" + assert not int8_kv_cache, "CogVLM currently doesn't support int8 kv cache" + + for l in layers_range: + prefix = f'model.layers.{l}.' + tllm_prex = f'transformer.layers.{l - layers_range[0]}.' + + qkv_weight = get_weight( + model_params, prefix + 'self_attn.language_expert_query_key_value', + dtype) + split_v = split_qkv_tp(qkv_weight, num_attention_heads, hidden_size, + tensor_parallel, mapping.tp_rank) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'attention.qkv.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + vis_qkv_weight = get_weight( + model_params, prefix + 'self_attn.vision_expert_query_key_value', + dtype) + split_v = split_qkv_tp(vis_qkv_weight, num_attention_heads, hidden_size, + tensor_parallel, mapping.tp_rank) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'attention.vis_qkv.', + None, use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + attn_dense_weight = get_weight( + model_params, prefix + 'self_attn.language_expert_dense', dtype) + split_v = split_matrix_tp(attn_dense_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'attention.dense.', + None, use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + attn_vision_dense_weight = get_weight( + model_params, prefix + 'self_attn.vision_expert_dense', dtype) + split_v = split_matrix_tp(attn_vision_dense_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'attention.vis_dense.', + None, use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + mlp_gate_weight = get_weight(model_params, + prefix + 'mlp.language_mlp.up_proj', dtype) + split_v = split_matrix_tp(mlp_gate_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + weights.update( + get_tllm_linear_weight(split_v, 
tllm_prex + 'mlp.gate.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + vision_mlp_gate_weight = get_weight(model_params, + prefix + 'mlp.vision_mlp.up_proj', + dtype) + split_v = split_matrix_tp(vision_mlp_gate_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'vis_mlp.gate.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + mlp_fc_weight = get_weight(model_params, + prefix + 'mlp.language_mlp.gate_proj', dtype) + split_v = split_matrix_tp(mlp_fc_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.fc.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + vision_mlp_fc_weight = get_weight(model_params, + prefix + 'mlp.vision_mlp.gate_proj', + dtype) + split_v = split_matrix_tp(vision_mlp_fc_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'vis_mlp.fc.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + mlp_proj_weight = get_weight(model_params, + prefix + 'mlp.language_mlp.down_proj', + dtype) + split_v = split_matrix_tp(mlp_proj_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.proj.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + vision_mlp_proj_weight = get_weight(model_params, + prefix + 'mlp.vision_mlp.down_proj', + dtype) + split_v = split_matrix_tp(vision_mlp_proj_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'vis_mlp.proj.', None, + use_weight_only, + plugin_weight_only_quant_type, dtype, + use_gemm_woq_plugin)) + + # Layer norms do not use tensor parallelism + input_ln_weight = get_weight(model_params, prefix + 'input_layernorm', + dtype) + weights[tllm_prex + 'input_layernorm.weight'] = input_ln_weight + + post_ln_weight = get_weight(model_params, + prefix + 'post_attention_layernorm', dtype) + weights[tllm_prex + 'post_layernorm.weight'] = post_ln_weight + cur_block_weights = [ + weight_name for weight_name in model_params + if weight_name.find(prefix) != -1 + ] + for weight_name in cur_block_weights: + model_params[weight_name] = None + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + v = get_weight(model_params, 'model.embed_tokens', dtype) + if lora_config.is_valid and lora_config.embedding_weight is not None: + v = lora_config.embedding_weight + if hf_model.config.tie_word_embeddings: + # lm_head.weight has the same weights as embedding + if mapping.is_last_pp_rank(): + if vocab_size % mapping.tp_size != 0: + # padding + vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) + pad_width = vocab_size_padded - vocab_size + + v = torch.from_numpy( + np.pad(v.detach().cpu().numpy(), ((0, pad_width), (0, 0)), + 'constant', + constant_values=0)) + weights['lm_head.weight'] = split(v, mapping.tp_size, + mapping.tp_rank) + + if use_parallel_embedding: + v = split_matrix_tp(v, + mapping.tp_size, + mapping.tp_rank, + dim=sharding_dim) + + if mapping.is_first_pp_rank(): + weights['transformer.vocab_embedding.weight'] = v + + lm_head_weights = get_weight(model_params, 'lm_head', dtype) + + if mapping.is_last_pp_rank(): + + if lora_config.is_valid and 
lora_config.lm_head_weight is not None: + + lm_head_weights = lora_config.lm_head_weight + + if vocab_size % mapping.tp_size != 0: + # padding + vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) + pad_width = vocab_size_padded - vocab_size + + lm_head_weights = torch.from_numpy( + np.pad(lm_head_weights.detach().cpu().numpy(), + ((0, pad_width), (0, 0)), + 'constant', + constant_values=0)) + weights['lm_head.weight'] = split_matrix_tp(lm_head_weights, + tensor_parallel, + mapping.tp_rank, + dim=0) + ln_f_w = get_weight(model_params, 'model.norm', dtype) + weights['transformer.ln_f.weight'] = ln_f_w + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Weights loaded. Total time: {t}') + return weights diff --git a/tensorrt_llm/models/cogvlm/model.py b/tensorrt_llm/models/cogvlm/model.py new file mode 100644 index 000000000..31c33b8f2 --- /dev/null +++ b/tensorrt_llm/models/cogvlm/model.py @@ -0,0 +1,304 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +from ..._utils import pad_vocab_size +from ...functional import (Tensor, concat, maximum, minimum, recv, send, shape, + slice) +from ...layers import (MOE, AttentionMaskType, CogVLMAttention, ColumnLinear, + Embedding, GatedMLP, MoeConfig, PromptTuningEmbedding, + RmsNorm) +from ...mapping import Mapping +from ...module import Module +from ...plugin import init_all_reduce_helper +# this is to use to module global algo string with a quant_algo prefix +from ...quantization import QuantMode +from ...top_model_mixin import TopModelMixin +from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, + PretrainedConfig, QuantConfig) + + +class CogvlmDecoderLayer(Module): + + def __init__(self, config: PretrainedConfig, layer_idx: int): + super().__init__() + self.layer_idx = layer_idx + self.config = config + + self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + + layers_range = config.mapping.pp_layers(config.num_hidden_layers) + local_layer_idx = layer_idx - layers_range[0] + self.attention = CogVLMAttention( + local_layer_idx=local_layer_idx, + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + max_position_embeddings=config.max_position_embeddings, + dtype=config.dtype, + attention_mask_type=AttentionMaskType.causal, + bias=config.attn_bias, + rotary_embedding_base=config.rotary_base, + rotary_embedding_scaling=config.rotary_scaling, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + tp_rank=config.mapping.tp_rank, + vision_start=config.vision_start, + vision_length=config.vision_length, + quant_mode=config.quant_mode) + + mlp_hidden_size = config.hidden_size * 4 if config.intermediate_size is None else config.intermediate_size + + ClsMLP = GatedMLP 
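convert_hf_cogvlm above walks the HF checkpoint layer by layer and renames the dual-expert weights onto the TRT-LLM module tree. Summarized as a hedged sketch, the per-layer mapping it implements is:

# Per-layer HF -> TRT-LLM prefix mapping used by convert_hf_cogvlm
# (TP splitting via split_qkv_tp / split_matrix_tp omitted).
COGVLM_WEIGHT_MAP = {
    "self_attn.language_expert_query_key_value": "attention.qkv",
    "self_attn.vision_expert_query_key_value":   "attention.vis_qkv",
    "self_attn.language_expert_dense":           "attention.dense",
    "self_attn.vision_expert_dense":             "attention.vis_dense",
    "mlp.language_mlp.up_proj":                  "mlp.gate",
    "mlp.vision_mlp.up_proj":                    "vis_mlp.gate",
    "mlp.language_mlp.gate_proj":                "mlp.fc",
    "mlp.vision_mlp.gate_proj":                  "vis_mlp.fc",
    "mlp.language_mlp.down_proj":                "mlp.proj",
    "mlp.vision_mlp.down_proj":                  "vis_mlp.proj",
    "input_layernorm":                           "input_layernorm.weight",
    "post_attention_layernorm":                  "post_layernorm.weight",
}

Top-level weights follow the usual LLaMA-style mapping: model.embed_tokens becomes transformer.vocab_embedding.weight (with optional vocab padding and parallel-embedding sharding), model.norm becomes transformer.ln_f.weight, and lm_head becomes lm_head.weight.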
+ mlp_kwargs = {} + if config.moe_num_experts > 1: + ClsMLP = MOE + mlp_kwargs = { + "moe_config": + MoeConfig( + config.moe_num_experts, + config.moe_top_k, + config.moe_tp_mode, + config.moe_normalization_mode, + ), + "tp_rank": + config.mapping.tp_rank, + } + self.vision_start = config.vision_start + self.vision_length = config.vision_length + self.hidden_size = config.hidden_size + self.mlp = ClsMLP(hidden_size=config.hidden_size, + ffn_hidden_size=mlp_hidden_size, + hidden_act=config.hidden_act, + dtype=config.dtype, + bias=config.mlp_bias, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + quant_mode=config.quant_mode, + **mlp_kwargs) + self.vis_mlp = ClsMLP(hidden_size=config.hidden_size, + ffn_hidden_size=mlp_hidden_size, + hidden_act=config.hidden_act, + dtype=config.dtype, + bias=config.mlp_bias, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + quant_mode=config.quant_mode, + **mlp_kwargs) + self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + + def forward(self, + hidden_states, + attention_mask=None, + spec_decoding_packed_mask=None, + spec_decoding_position_offsets=None, + use_cache=False, + kv_cache_params=None, + attention_params=None, + lora_layer_params=None): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + attention_output = self.attention(hidden_states, + use_cache=use_cache, + kv_cache_params=kv_cache_params, + attention_params=attention_params) + + if use_cache: + attention_output, presents = attention_output + + hidden_states = residual + attention_output + + residual = hidden_states + hidden_states = self.post_layernorm(hidden_states) + + bs = shape(hidden_states, 0) + seq_length = shape(hidden_states, 1) + bos = slice(hidden_states, [0, 0, 0], + concat([bs, self.vision_start, self.hidden_size])) + vis_seq_length = minimum(self.vision_length + 1, seq_length - 1) + vision_hidden_states = slice( + hidden_states, [0, self.vision_start, 0], + concat([bs, vis_seq_length, self.hidden_size])) + text_seq_length = maximum( + 0, seq_length - (self.vision_length + 1 + self.vision_start)) + language_hidden_states = slice( + hidden_states, [0, self.vision_length + 1 + self.vision_start, 0], + concat([bs, text_seq_length, self.hidden_size])) + + bos_qkv = self.mlp(bos) + language_qkv = self.mlp(language_hidden_states) + vision_qkv = self.vis_mlp(vision_hidden_states) + hidden_states = concat([bos_qkv, vision_qkv, language_qkv], dim=1) + + # hidden_states = self.mlp(hidden_states, + # lora_layer_params=lora_layer_params) + + hidden_states = residual + hidden_states + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class CogvlmModel(Module): + + def __init__(self, config: PretrainedConfig) -> None: + super().__init__() + init_all_reduce_helper() + + self.mapping = config.mapping + self.use_prompt_tuning = config.use_prompt_tuning + EmbeddingCls = PromptTuningEmbedding if config.use_prompt_tuning else Embedding + if self.mapping.is_first_pp_rank(): + self.vocab_embedding = EmbeddingCls( + num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size, + dtype=config.dtype, + tp_size=self.mapping.tp_size + if config.use_parallel_embedding else 1, + tp_group=self.mapping.tp_group + if config.use_parallel_embedding else None, + sharding_dim=config.embedding_sharding_dim, + tp_rank=self.mapping.tp_rank, + ) + + self.layers = DecoderLayerList(CogvlmDecoderLayer, config) + + if self.mapping.is_last_pp_rank(): + 
self.ln_f = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + + def forward(self, + input_ids, + position_ids=None, + use_cache=False, + attention_mask=None, + spec_decoding_position_offsets=None, + spec_decoding_packed_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + prompt_embedding_table: Optional[Tensor] = None, + prompt_tasks: Optional[Tensor] = None, + prompt_vocab_size: Optional[Tensor] = None, + lora_params=None): + + kv_cache_params.fill_none_tensor_list(len(self.layers)) + + if use_cache: + presents = [] + + ptuning_args = [ + prompt_embedding_table, prompt_tasks, prompt_vocab_size + ] if self.use_prompt_tuning else [] + + if self.mapping.is_first_pp_rank(): + hidden_states = self.vocab_embedding(input_ids, *ptuning_args) + else: + hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) + + hidden_states = self.layers.forward( + hidden_states, + use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + lora_params=lora_params, + spec_decoding_position_offsets=spec_decoding_position_offsets, + spec_decoding_packed_mask=spec_decoding_packed_mask) + + if use_cache: + hidden_states, presents = hidden_states + + if self.mapping.is_last_pp_rank(): + hidden_states = self.ln_f(hidden_states) + else: + hidden_states = send(hidden_states, self.mapping.next_pp_rank()) + + if use_cache: + return (hidden_states, tuple(presents)) + return hidden_states + + +class CogVLMForCausalLM(DecoderModelForCausalLM, TopModelMixin): + + def __init__(self, config: PretrainedConfig): + self.check_config(config) + transformer = CogvlmModel(config) + vocab_size_padded = pad_vocab_size(config.vocab_size, + config.mapping.tp_size) + if config.mapping.is_last_pp_rank(): + lm_head = ColumnLinear(config.hidden_size, + vocab_size_padded, + bias=False, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + gather_output=True) + else: + lm_head = None + self.quant_mode = config.quant_mode + self.mapping = config.mapping + super().__init__(config, transformer, lm_head) + + def check_config(self, config): + config.set_if_not_exist('mlp_bias', False) + config.set_if_not_exist('attn_bias', False) + config.set_if_not_exist('rotary_base', 10000.0) + config.set_if_not_exist('rotary_scaling', None) + config.set_if_not_exist('moe_num_experts', 0) + config.set_if_not_exist('moe_top_k', 0) + config.set_if_not_exist('moe_tp_mode', + MoeConfig.ParallelismMode.TENSOR_PARALLEL) + config.set_if_not_exist( + 'moe_normalization_mode', + MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE) + + @classmethod + def from_hugging_face(cls, + hf_model_dir, + dtype='float16', + mapping: Optional[Mapping] = None, + quant_mode: Optional[QuantMode] = None, + **kwargs): + pass + + def default_plugin_config(self, **kwargs): + plugin_config = super().default_plugin_config(**kwargs) + if self.quant_mode.is_int4_weight_only_per_group(): + plugin_config.set_weight_only_groupwise_quant_matmul_plugin() + return plugin_config + + @classmethod + def quantize( + cls, + hf_model_dir, + output_dir, + quant_config: QuantConfig, + *, + dtype='float16', + mapping: Optional[Mapping] = None, + calib_batches=512, + calib_batch_size=1, + random_seed=1234, + tokenizer_max_seq_length=2048, + **kwargs, + ): + pass diff --git a/tensorrt_llm/models/enc_dec/model.py b/tensorrt_llm/models/enc_dec/model.py index 30340e8b2..985635189 100644 --- 
a/tensorrt_llm/models/enc_dec/model.py +++ b/tensorrt_llm/models/enc_dec/model.py @@ -964,7 +964,8 @@ def __init__(self, config: PretrainedConfig): self.has_token_type_embedding = type_vocab_size is not None self.fp16_clamping = (self.config.dtype - == 'float16') and (self.config.model_type == 't5') + == 'float16') and (self.config.model_type + in ['t5', 'pix2struct']) self.skip_cross_qkv = self.config.skip_cross_qkv self.mlp_type = MLPType.MLP if not hasattr( diff --git a/tensorrt_llm/models/gemma/model.py b/tensorrt_llm/models/gemma/model.py index d1a5c059b..e98223b18 100644 --- a/tensorrt_llm/models/gemma/model.py +++ b/tensorrt_llm/models/gemma/model.py @@ -17,8 +17,9 @@ from ..._utils import pad_vocab_size from ...functional import Tensor, cast, recv, send -from ...layers import (Attention, AttentionMaskType, ColumnLinear, Embedding, - GatedMLP, PositionEmbeddingType, RmsNorm) +from ...layers import (Attention, AttentionMaskType, AttentionParams, + ColumnLinear, Embedding, GatedMLP, KeyValueCacheParams, + LoraParams, PositionEmbeddingType, RmsNorm) from ...mapping import Mapping from ...module import Module from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, @@ -71,24 +72,24 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): eps=config.norm_epsilon, dtype=config.dtype) - def forward( - self, - hidden_states, - attention_mask=None, - medusa_packed_mask=None, # For Medusa support - medusa_position_offsets=None, - use_cache=False, - kv_cache_params=None, - attention_params=None, - lora_layer_params=None): + def forward(self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + spec_decoding_packed_mask: Optional[Tensor] = None, + spec_decoding_position_offsets: Optional[Tensor] = None, + use_cache: bool = False, + kv_cache_params: Optional[KeyValueCacheParams] = None, + attention_params: Optional[AttentionParams] = None, + lora_layer_params: Optional[LoraParams] = None): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) attention_output = self.attention( hidden_states, attention_mask=attention_mask, - medusa_packed_mask=medusa_packed_mask, # For Medusa support - medusa_position_offsets=medusa_position_offsets, + spec_decoding_packed_mask= + spec_decoding_packed_mask, # For Medusa support + spec_decoding_position_offsets=spec_decoding_position_offsets, use_cache=use_cache, kv_cache_params=kv_cache_params, attention_params=attention_params, @@ -187,15 +188,19 @@ def __init__(self, config: PretrainedConfig): config.mapping.tp_size) try: - import ammo - major, minor, patch = ammo.__version__.split(".") + import modelopt + major, minor, patch = modelopt.__version__.split(".") major = int(major) minor = int(minor) patch = int(patch) - if minor > 9 or (minor == 9 and patch > 4): - assert config.share_embedding_table, "Gemma only supports share_embedding_table" + if major == 0 and minor == 11 and patch < 1: + # modelopt=0.11.0 won't force this field to True, this is a hot fix + # TODO: can remove after modelop=0.11.1 is out + # TRT LLM forces the embedding table to be shared for gemma. 
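The Gemma change above replaces the old ammo import with modelopt and adds a narrow hotfix: modelopt 0.11.0 does not force share_embedding_table in the checkpoint config, so it is set to True here before the existing assertion runs (to be removed once 0.11.1 is out, per the TODO). A hedged sketch of the version gate in isolation:

def needs_share_embedding_hotfix(version: str) -> bool:
    # modelopt 0.11.0 does not set share_embedding_table itself, so the
    # config is patched; 0.11.x with x >= 1 is expected to behave.
    major, minor, patch = (int(part) for part in version.split("."))
    return major == 0 and minor == 11 and patch < 1

assert needs_share_embedding_hotfix("0.11.0")
assert not needs_share_embedding_hotfix("0.11.1")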
+ config.share_embedding_table = True + assert config.share_embedding_table, "Gemma only supports share_embedding_table" except: - # Not find ammo, assume not use ammo quantized model + # Not find modelopt, assume not use modelopt quantized model assert config.share_embedding_table, "Gemma only supports share_embedding_table" if config.mapping.is_last_pp_rank(): diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index 41324677c..a18cdbb9d 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -60,11 +60,21 @@ def prepare_attention_inputs(self, mapping=Mapping(), use_cache=True, streamingllm=False, - attn_layer_idx=None): + attn_layer_idx=None, + opt_batch_size=None): default_range = GenerationMixin.default_range - bb_range_cxt = default_range(max_batch_size) - bb_range_gen = default_range(max_batch_size * max_beam_width) + + if opt_batch_size: + bb_range_cxt = [1, opt_batch_size, max_batch_size] + bb_range_gen = [ + 1, opt_batch_size * max_beam_width, + max_batch_size * max_beam_width + ] + else: + bb_range_cxt = default_range(max_batch_size) + bb_range_gen = default_range(max_batch_size * max_beam_width) + _bs_range = default_range(max_batch_size) _beam_width_range = default_range(max_beam_width) _max_len_range = default_range(max_seq_len) @@ -277,51 +287,65 @@ def prepare_attention_inputs(self, 'host_request_types': host_request_types, } - def prepare_basic_inputs(self, - *, - max_batch_size, - max_beam_width, - max_input_len, - max_seq_len, - num_kv_heads, - head_size, - num_layers, - kv_dtype, - remove_input_padding=False, - use_gpt_attention_plugin=False, - use_gemm_plugin=False, - use_custom_all_reduce=False, - paged_kv_cache=False, - tokens_per_block=64, - gather_context_logits=False, - gather_generation_logits=False, - dtype=None, - num_heads=None, - mapping=Mapping(), - max_num_tokens=None, - opt_num_tokens=None, - prompt_embedding_table_size: int = 0, - position_encoding_2d=False, - use_lora_plugin: bool = False, - lora_target_modules: List[str] = None, - max_draft_len=0, - multiple_profiles: bool = False, - streamingllm: bool = False): + def prepare_basic_inputs( + self, + *, + max_batch_size, + max_beam_width, + max_input_len, + max_seq_len, + num_kv_heads, + head_size, + num_layers, + kv_dtype, + remove_input_padding=False, + use_gpt_attention_plugin=False, + use_gemm_plugin=False, + use_custom_all_reduce=False, + paged_kv_cache=False, + tokens_per_block=64, + gather_context_logits=False, + gather_generation_logits=False, + dtype=None, + num_heads=None, + mapping=Mapping(), + max_num_tokens=None, + opt_num_tokens=None, + prompt_embedding_table_size: int = 0, + position_encoding_2d=False, + use_lora_plugin: bool = False, + lora_target_modules: List[str] = None, + speculative_decoding_draft_tokens_external: bool = False, + max_draft_len=0, + multiple_profiles: bool = False, + streamingllm: bool = False, + opt_batch_size=None): default_range = GenerationMixin.default_range - last_token_range = [1, max_draft_len + 1, max_draft_len + 1] - bb_range_cxt = default_range(max_batch_size) - bb_range_gen = default_range(max_batch_size * max_beam_width) + tokens_per_engine_step = max_draft_len + 1 + [1, tokens_per_engine_step, tokens_per_engine_step] + tokens_per_engine_step_range = [ + 1, tokens_per_engine_step, tokens_per_engine_step + ] + if opt_batch_size: + bb_range_cxt = [1, opt_batch_size, max_batch_size] + bb_range_gen = [ + 1, opt_batch_size * max_beam_width, + max_batch_size * 
max_beam_width + ] + else: + bb_range_cxt = default_range(max_batch_size) + bb_range_gen = default_range(max_batch_size * max_beam_width) bbd_range_ctx = [ - bb_range_cxt[i] * ((max_draft_len + 1) if i != 0 else 1) + bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1) for i in range(len(bb_range_cxt)) ] bbd_range_gen = [ - bb_range_gen[i] * ((max_draft_len + 1) if i != 0 else 1) + bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1) for i in range(len(bb_range_gen)) ] inlen_range_cxt = default_range(max_input_len) - inlen_range_gen = [1, 1, max_draft_len + 1] + inlen_range_gen = [1, 1, tokens_per_engine_step] enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding, @@ -330,7 +354,7 @@ def prepare_basic_inputs(self, # Draft tokens cannot be combined with beam search max_num_tokens = max( max_batch_size * max_input_len, - max_batch_size * max(1 + max_draft_len, max_beam_width)) + max_batch_size * max(tokens_per_engine_step, max_beam_width)) if enable_ctx_gen_opt_profiles: num_profiles = 2 bb_range = [bb_range_cxt, bb_range_gen] @@ -340,7 +364,7 @@ def prepare_basic_inputs(self, num_tokens_range_ctx = default_range(max_batch_size * max_input_len) # Draft tokens cannot be combined with beam search num_tokens_range_gen = default_range( - max_batch_size * max(1 + max_draft_len, max_beam_width)) + max_batch_size * max(tokens_per_engine_step, max_beam_width)) num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] else: max_bs_x_max_bw = max_batch_size * max_beam_width @@ -362,7 +386,8 @@ def prepare_basic_inputs(self, bbd_range = [bbd_range_gen] * num_profiles inlen_range = [[1, 1, max_input_len]] * num_profiles position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles - last_token_range = [last_token_range] * num_profiles + tokens_per_engine_step_range = [tokens_per_engine_step_range + ] * num_profiles position_ids_num_tokens_range = num_tokens_range input_ids = None @@ -546,7 +571,7 @@ def prepare_basic_inputs(self, shape=[-1, -1], dim_range=OrderedDict([ ('batch_size_beam_width', bb_range), - ('last_token_ids', last_token_range), + ('last_token_ids', tokens_per_engine_step_range), ]), ) else: @@ -559,6 +584,38 @@ def prepare_basic_inputs(self, ]), ) + speculative_decoding_position_offsets = None + speculative_decoding_packed_mask = None + # Use positional offsets and packed mask only when not in SpS spec decoding + if speculative_decoding_draft_tokens_external == False and max_draft_len > 0: + # 32 bits packed mask aligned. + num_packed_masks = (tokens_per_engine_step + 32 - 1) // 32 + packed_mask_len_range = [[0, 1, num_packed_masks]] * num_profiles + # position offsets that are fixed during the whole session. + # it will be shared among all sequences. 
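The two new optional inputs expose speculative decoding to the engine: spec_decoding_position_offsets holds one position offset per token processed in an engine step (shared across all sequences), and spec_decoding_packed_mask stores each token's attention mask over the draft tokens packed 32 bits per int32 word, which is why num_packed_masks is (tokens_per_engine_step + 32 - 1) // 32. A hedged numpy sketch of the packing for one sequence; the LSB-first bit order is an assumption for illustration, the diff does not spell it out:

import numpy as np

def pack_spec_decoding_mask(bool_mask: np.ndarray) -> np.ndarray:
    # bool_mask: [tokens_per_engine_step, tokens_per_engine_step] boolean
    # attention mask over the draft-token tree; each row is packed into
    # ceil(tokens / 32) 32-bit words.
    rows, tokens = bool_mask.shape
    num_words = (tokens + 32 - 1) // 32
    packed = np.zeros((rows, num_words), dtype=np.uint32)
    for r in range(rows):
        for c in range(tokens):
            if bool_mask[r, c]:
                packed[r, c // 32] |= np.uint32(1 << (c % 32))
    return packed.view(np.int32)  # the plugin input tensor is int32

mask = np.tril(np.ones((5, 5), dtype=bool))   # e.g. a linear chain of 4 draft tokens
print(pack_spec_decoding_mask(mask)[:, 0])    # -> [1 3 7 15 31]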
+ speculative_decoding_position_offsets = Tensor( + name='spec_decoding_position_offsets', + dtype=trt.int32, + shape=[-1, -1], + dim_range=OrderedDict([ + ('batch_size_beam_width', bb_range), + ('spec_decoding_position_ids_dim0', + tokens_per_engine_step_range), + ]), + ) + + speculative_decoding_packed_mask = Tensor( + name='spec_decoding_packed_mask', + dtype=trt.int32, + shape=[-1, -1, -1], + dim_range=OrderedDict([ + ('batch_size_beam_width', bb_range), + ('spec_decoding_packed_mask_dim0', + tokens_per_engine_step_range), + ('spec_decoding_packed_mask_dim1', packed_mask_len_range), + ]), + ) + basic_inputs = { 'input_ids': input_ids, 'hidden_states_input': hidden_states, @@ -569,6 +626,9 @@ def prepare_basic_inputs(self, 'prompt_vocab_size': prompt_vocab_size, 'lora_ranks': lora_ranks, 'lora_weights_pointers': lora_weights_pointers, + 'spec_decoding_position_offsets': + speculative_decoding_position_offsets, + 'spec_decoding_packed_mask': speculative_decoding_packed_mask } attention_inputs = self.prepare_attention_inputs( @@ -587,7 +647,8 @@ def prepare_basic_inputs(self, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, mapping=mapping, - streamingllm=streamingllm) + streamingllm=streamingllm, + opt_batch_size=opt_batch_size) for key, value in attention_inputs.items(): basic_inputs[key] = value diff --git a/tensorrt_llm/models/gpt/model.py b/tensorrt_llm/models/gpt/model.py index daac9e5f6..87ed44b47 100644 --- a/tensorrt_llm/models/gpt/model.py +++ b/tensorrt_llm/models/gpt/model.py @@ -96,7 +96,8 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): tp_group=tp_group, tp_size=tp_size, tp_rank=tp_rank, - quant_mode=config.quant_mode) + quant_mode=config.quant_mode, + qk_layernorm=config.qk_layernorm) mlp_hidden_size = config.hidden_size * 4 if config.intermediate_size is None else config.intermediate_size @@ -129,7 +130,9 @@ def forward(self, use_cache=False, kv_cache_params=None, attention_params=None, - lora_layer_params=None): + lora_layer_params=None, + spec_decoding_position_offsets=None, + spec_decoding_packed_mask=None): assert isinstance(hidden_states, Tensor) @@ -137,12 +140,15 @@ def forward(self, hidden_states = self.input_layernorm(hidden_states) - attention_output = self.attention(hidden_states, - attention_mask=attention_mask, - use_cache=use_cache, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - lora_layer_params=lora_layer_params) + attention_output = self.attention( + hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + lora_layer_params=lora_layer_params, + spec_decoding_position_offsets=spec_decoding_position_offsets, + spec_decoding_packed_mask=spec_decoding_packed_mask) if use_cache: attention_output, presents = attention_output @@ -197,7 +203,9 @@ def forward(self, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, - lora_params=None): + lora_params=None, + spec_decoding_position_offsets=None, + spec_decoding_packed_mask=None): if self.mapping.is_first_pp_rank(): ptuning_args = [ prompt_embedding_table, prompt_tasks, prompt_vocab_size @@ -209,12 +217,15 @@ def forward(self, else: hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) - hidden_states = self.layers(hidden_states, - use_cache=use_cache, - attention_mask=attention_mask, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - lora_params=lora_params) + hidden_states = self.layers( + hidden_states, + 
use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + lora_params=lora_params, + spec_decoding_position_offsets=spec_decoding_position_offsets, + spec_decoding_packed_mask=spec_decoding_packed_mask) if use_cache: hidden_states, presents = hidden_states diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index 2238ab38f..d11f6d253 100644 --- a/tensorrt_llm/models/llama/convert.py +++ b/tensorrt_llm/models/llama/convert.py @@ -304,6 +304,35 @@ def smooth_llama_model(model, scales, alpha, llama_qkv_para, llama_smoother): scales[layer_name]["w"] = module.mlp.down_proj.weight.abs().max( dim=1)[0] + # ================================================================== + if hasattr(module, 'residual_mlp'): + fc1_layer_name = name + ".residual_mlp.w1" + gate_layer_name = name + ".residual_mlp.w3" + + smoother = smooth_gemm_fc1_gate(module.residual_mlp.w1.weight, + module.residual_mlp.w3.weight, + scales[fc1_layer_name]["x"], + module.residual_layernorm.weight, + None, alpha) + + scales[fc1_layer_name]["x"] = scales[fc1_layer_name]["x"] / smoother + scales[fc1_layer_name]["w"] = module.residual_mlp.w1.weight.abs( + ).max(dim=1)[0] + + scales[gate_layer_name][ + "x"] = scales[gate_layer_name]["x"] / smoother + scales[gate_layer_name]["w"] = module.residual_mlp.w3.weight.abs( + ).max(dim=1)[0] + + # ================================================================== + layer_name = name + ".residual_mlp.w2" + smoother = smooth_gemm(module.residual_mlp.w2.weight, + scales[layer_name]["x"], None, None, alpha) + llama_smoother[layer_name] = smoother.float() + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.residual_mlp.w2.weight.abs().max( + dim=1)[0] + @torch.no_grad() def capture_activation_range(model, @@ -615,6 +644,7 @@ def convert_hf_llama(hf_model, sharding_dim=0, use_weight_only=False, share_embedding_table=False, + residual_mlp=False, use_gemm_woq_plugin=False, plugin_weight_only_quant_type=torch.int8, use_smooth_quant=False, @@ -827,6 +857,112 @@ def convert_layer(l): moe_experts_gate_weights = get_weight( model_params, prefix + 'block_sparse_moe.gate', torch.float32) + + if residual_mlp: + residual_mlp_gate_weights = get_weight( + model_params, prefix + 'residual_mlp.w3', dtype) + if use_smooth_quant: + residual_mlp_gate_weights = residual_mlp_gate_weights.t() + int8_weights = generate_int8( + residual_mlp_gate_weights, + act_range.get(prefix + 'residual_mlp.w3')) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'residual_mlp.gate.', + [1, hidden_size // tensor_parallel], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + + 'post_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1)) + else: + split_v = split_matrix_tp(residual_mlp_gate_weights, + tensor_parallel, + mapping.tp_rank, + dim=0) + weights.update( + get_tllm_linear_weight(split_v, + tllm_prex + 'residual_mlp.gate.', + None, use_weight_only, + plugin_weight_only_quant_type, + dtype, use_gemm_woq_plugin)) + + residual_mlp_fc_weight = get_weight(model_params, + prefix + 'residual_mlp.w1', + dtype) + if use_smooth_quant: + residual_mlp_fc_weight = residual_mlp_fc_weight.t( + ) #verified + int8_weights = generate_int8( + residual_mlp_fc_weight, + act_range.get(prefix + 'residual_mlp.w1')) + weights.update( + get_tllm_linear_sq_weight( + 
int8_weights, + tllm_prex + 'residual_mlp.fc.', + [1, hidden_size // tensor_parallel], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + + 'post_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1)) + else: + split_v = split_matrix_tp(residual_mlp_fc_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + weights.update( + get_tllm_linear_weight(split_v, + tllm_prex + 'residual_mlp.fc.', + None, use_weight_only, + plugin_weight_only_quant_type, + dtype, use_gemm_woq_plugin)) + + residual_mlp_proj_weight = get_weight( + model_params, prefix + 'residual_mlp.w2', dtype) + + if use_smooth_quant: + residual_mlp_proj_weight = residual_mlp_proj_weight.t() + int8_weights = generate_int8( + residual_mlp_proj_weight, + act_range.get(prefix + 'residual_mlp.w2')) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'residual_mlp.proj.', [1, hidden_size], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + + 'residual_mlp.quantization_scaling_factor', + smoother_value=smoother[prefix + 'residual_mlp.w2'], + smoother_shape=[1, hidden_size // tensor_parallel], + rank=mapping.tp_rank, + cat_dim=0)) + else: + split_v = split_matrix_tp(residual_mlp_proj_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + weights.update( + get_tllm_linear_weight(split_v, + tllm_prex + 'residual_mlp.proj.', + None, use_weight_only, + plugin_weight_only_quant_type, + dtype, use_gemm_woq_plugin)) + weights.update( get_tllm_linear_weight( moe_experts_gate_weights.to(torch.float32), @@ -943,6 +1079,14 @@ def convert_layer(l): post_ln_weight = get_weight(model_params, prefix + 'post_attention_layernorm', dtype) weights[tllm_prex + 'post_layernorm.weight'] = post_ln_weight + + if residual_mlp: + residual_ln_weight = get_weight(model_params, + prefix + 'residual_layernorm', + dtype) + weights[tllm_prex + + 'residual_layernorm.weight'] = residual_ln_weight + cur_block_weights = [ weight_name for weight_name in model_params if weight_name.find(prefix) != -1 @@ -1064,7 +1208,8 @@ def create_config_from_hugging_face(hf_model, hidden_act = hf_config.hidden_act config['rotary_scaling'] = getattr(hf_config, "rope_scaling", None) rotary_base = getattr(hf_config, "rope_theta", 10000.0) - if hf_config.model_type == "mixtral": + config['residual_mlp'] = getattr(hf_config, "parallel_attn_mlp_res", False) + if hf_config.model_type == "mixtral" or hf_config.model_type == "arctic": # HF LLaMA-type models are implicitly using gated activation. 
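The `smooth_gemm` / `smooth_gemm_fc1_gate` calls above migrate activation outliers into the weights before the INT8 SmoothQuant conversion of the residual MLP; the per-channel activation scales are then divided by the returned smoother, just as for the regular MLP projections. A rough sketch of the SmoothQuant-style scale computation those helpers are based on, assuming per-channel maxima and a migration strength alpha (the in-tree helpers additionally fold the scale into the preceding layernorm):

import torch

def smoothquant_smoother(act_absmax: torch.Tensor, weight: torch.Tensor, alpha: float = 0.5):
    # act_absmax: [in_features] per-channel max(|X|); weight: [out_features, in_features]
    w_absmax = weight.abs().max(dim=0)[0].clamp(min=1e-5)
    a_absmax = act_absmax.clamp(min=1e-5)
    # s_j = max|X_j|^alpha / max|W_j|^(1 - alpha): activations are divided by s and
    # weights are multiplied by s, so X @ W.T is unchanged while activation outliers shrink.
    return (a_absmax.pow(alpha) / w_absmax.pow(1.0 - alpha)).clamp(min=1e-5)

weight = torch.randn(1024, 4096)
act_absmax = torch.rand(4096) * 20.0
smoother = smoothquant_smoother(act_absmax, weight, alpha=0.5)
smoothed_act_absmax = act_absmax / smoother   # analogous to scales[...]["x"] / smoother above
smoothed_weight = weight * smoother           # broadcast over the in_features dimension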
# With our MoE implementation, we must make it explicit hidden_act = "swiglu" @@ -1198,7 +1343,7 @@ def quantize(dtype, ''' Quantize the save the model as TRT-LLM checkpoint to output_dir ''' - #TODO: currently only smooth quant and kv cache quantization are supported, needs to support mode quant algorithm calling ammo + #TODO: currently only smooth quant and kv cache quantization are supported, needs to support mode quant algorithm calling modelopt config = create_config_from_hugging_face(model_dir, dtype, mapping, @@ -1295,6 +1440,7 @@ def load_weights_from_hf(*, use_parallel_embedding=config.get('use_parallel_embedding', False), sharding_dim=config.get('embedding_sharding_dim', 0), share_embedding_table=config.get('share_embedding_table', False), + residual_mlp=config['residual_mlp'], use_smooth_quant=use_smooth_quant, per_channel=per_channel_sq, per_token=per_token_sq, diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index da9eb01a2..07b31b25a 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -17,7 +17,7 @@ from typing import Optional from ..._utils import pad_vocab_size -from ...functional import Tensor, recv, send +from ...functional import Tensor, non_gated_version, recv, send from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear, Embedding, GatedMLP, MoeConfig, PositionEmbeddingType, RmsNorm) @@ -92,12 +92,37 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): eps=config.norm_epsilon, dtype=config.dtype) + # Residual MLP that applies on pre-attention input + # TODO: change to self.has_residual_mlp = self.config.residual_mlp after ModelOpt quantize config is updated + self.has_residual_mlp = False + if hasattr(self.config, + "residual_mlp") and self.config.residual_mlp is True: + self.has_residual_mlp = True + + if self.has_residual_mlp: + self.residual_layernorm = RmsNorm( + normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + ClsMLP = GatedMLP # TODO: may use FusedGatedMLP to further speedup + self.residual_mlp = ClsMLP( + hidden_size=config.hidden_size, + ffn_hidden_size=config. 
+ hidden_size, # residual mlp uses hidden_size + hidden_act=non_gated_version( + config.hidden_act), # back to non-gated + dtype=config.dtype, + bias=config.mlp_bias, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + quant_mode=config.quant_mode) + def forward( self, hidden_states, attention_mask=None, - medusa_packed_mask=None, # For Medusa support - medusa_position_offsets=None, + spec_decoding_packed_mask=None, # For Medusa support + spec_decoding_position_offsets=None, use_cache=False, kv_cache_params=None, attention_params=None, @@ -108,8 +133,9 @@ def forward( attention_output = self.attention( hidden_states, attention_mask=attention_mask, - medusa_packed_mask=medusa_packed_mask, # For Medusa support - medusa_position_offsets=medusa_position_offsets, + spec_decoding_packed_mask= + spec_decoding_packed_mask, # For Medusa support + spec_decoding_position_offsets=spec_decoding_position_offsets, use_cache=use_cache, kv_cache_params=kv_cache_params, attention_params=attention_params, @@ -120,13 +146,29 @@ def forward( hidden_states = residual + attention_output - residual = hidden_states - hidden_states = self.post_layernorm(hidden_states) + residual_attn = hidden_states - hidden_states = self.mlp(hidden_states, - lora_layer_params=lora_layer_params) + if self.has_residual_mlp: + # arctic layer w/ residual mlp + + # residual mlp + hidden_states = self.residual_layernorm(hidden_states) + hidden_states = self.residual_mlp(hidden_states) + residual_mlp = residual_attn + hidden_states + + # parallel moe + # parallel moe layers applies on PRE-ATTENTION input residual, therefore achieving pre-fetching and better parallelism + hidden_states = self.post_layernorm(residual) + hidden_states = self.mlp(hidden_states, + lora_layer_params=lora_layer_params) + hidden_states = residual_mlp + hidden_states + else: + # regular llama/mixtral layers + hidden_states = self.post_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states, + lora_layer_params=lora_layer_params) + hidden_states = residual_attn + hidden_states - hidden_states = residual + hidden_states if use_cache: return (hidden_states, presents) return hidden_states @@ -157,8 +199,8 @@ def forward( position_ids=None, use_cache=False, attention_mask=None, - medusa_position_offsets=None, # For Medusa support - medusa_packed_mask=None, # For Medusa support + spec_decoding_position_offsets=None, # For Medusa support + spec_decoding_packed_mask=None, # For Medusa support kv_cache_params=None, attention_params=None, hidden_states=None, @@ -183,8 +225,8 @@ def forward( kv_cache_params=kv_cache_params, attention_params=attention_params, lora_params=lora_params, - medusa_position_offsets=medusa_position_offsets, - medusa_packed_mask=medusa_packed_mask) + spec_decoding_position_offsets=spec_decoding_position_offsets, + spec_decoding_packed_mask=spec_decoding_packed_mask) if use_cache: hidden_states, presents = hidden_states @@ -276,6 +318,12 @@ def from_meta_ckpt(cls, n_embd = meta_config["dim"] n_head = meta_config["n_heads"] n_kv_head = meta_config.get("n_kv_heads", n_head) + vocab_size = meta_config.get("vocab_size", 32000) + + # Reset vocab_size to 32000 for LLama v2 checkpoint. 
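The `has_residual_mlp` branch added above is the Arctic-style parallel-residual layer: the dense residual MLP consumes the post-attention hidden states, while the MoE MLP consumes the pre-attention input so it can be scheduled in parallel with attention. A compact restatement of that dataflow (illustrative only; module names shortened):

def arctic_layer(x, input_ln, attention, residual_ln, residual_mlp, post_ln, moe_mlp):
    # x is the pre-attention layer input ("residual" in the hunk above)
    h = x + attention(input_ln(x))          # attention branch plus residual add
    r = h + residual_mlp(residual_ln(h))    # dense residual-MLP branch on post-attention states
    return r + moe_mlp(post_ln(x))          # parallel MoE branch reads the PRE-attention input

The non-Arctic path keeps the usual post_layernorm -> mlp -> residual-add sequence, as in the else branch above.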
+ if vocab_size == -1: + vocab_size = 32000 + if "hidden_dim" in meta_config: inter_size = meta_config["hidden_dim"] else: @@ -295,11 +343,11 @@ def from_meta_ckpt(cls, 'hidden_size': n_embd, 'intermediate_size': inter_size, 'num_key_value_heads': n_kv_head, - 'vocab_size': 32000, + 'vocab_size': vocab_size, 'position_embedding_type': 'rope_gpt_neox', 'max_position_embeddings': 2048, 'hidden_act': 'silu', - 'rotary_base': 10000.0, + 'rotary_base': meta_config.get('rope_theta', 10000), 'norm_epsilon': meta_config["norm_eps"], 'mapping': { 'world_size': mapping.tp_size * mapping.pp_size, @@ -334,12 +382,12 @@ def quantize( tokenizer_max_seq_length=2048, **kwargs, ): - DEFAULT_AMMO_FLOW = [ + DEFAULT_Modelopt_FLOW = [ QuantAlgo.W4A16_AWQ, QuantAlgo.FP8, QuantAlgo.W8A8_SQ_PER_CHANNEL, QuantAlgo.W4A8_AWQ ] - use_ammo_quantization = quant_config.quant_algo in DEFAULT_AMMO_FLOW - if use_ammo_quantization: + use_modelopt_quantization = quant_config.quant_algo in DEFAULT_Modelopt_FLOW + if use_modelopt_quantization: super().quantize(hf_model_dir, output_dir, quant_config, @@ -350,7 +398,7 @@ def quantize( random_seed=random_seed, tokenizer_max_seq_length=tokenizer_max_seq_length) else: - # non-ammo, the legacy TRT-LLM native quantization algorithm: + # non-modelopt, the legacy TRT-LLM native quantization algorithm: # sq, int4/int8 weights only, int8 kv cache NATIVE_QUANT_FLOW = [QuantAlgo.W4A16, QuantAlgo.W8A16, None ] + W8A8_SQ_PLUGIN_LIST @@ -358,7 +406,7 @@ def quantize( (quant_config.kv_cache_quant_algo in [QuantAlgo.INT8, None]) assert quant_config.quant_algo is not None or quant_config.kv_cache_quant_algo is not None, \ "There is no point to call the quantize function if both quant_algo and kv_cache_quant_algo is None" - assert is_valid_native_quant, f"Internal error: shall call AMMO for this quantization {quant_config}" + assert is_valid_native_quant, f"Internal error: shall call Modelopt for this quantization {quant_config}" from . 
import convert convert.quantize( diff --git a/tensorrt_llm/models/llama/weight.py b/tensorrt_llm/models/llama/weight.py index 43eb31967..995761806 100644 --- a/tensorrt_llm/models/llama/weight.py +++ b/tensorrt_llm/models/llama/weight.py @@ -895,6 +895,22 @@ def extract_layer_idx(name): if not hasattr(load_from_meta_llama, "saved_embed"): load_from_meta_llama.saved_embed = None + def combine_embeddings(embeds, num_ckpts): + if len(embeds) == 1: + return embeds[0] + assert [ + embeds[i].shape == embeds[i + 1].shape + for i in range(len(embeds) - 1) + ] + if embeds[0].shape[0] == config.vocab_size // num_ckpts: + merge_dim = 0 + elif embeds[0].shape[1] == config.hidden_size // num_ckpts: + merge_dim = 1 + else: + logger.error("Unable to infer embedding split dimension") + assert False, "Unable to infer embedding split dimension" + return torch.cat(embeds, dim=merge_dim) + def gather_embedding(cur_embed, name: str, num_ckpts): if mapping.tp_size == 1: # even if num_ckpts > 1, get_current_weights will already have it gathered @@ -906,7 +922,7 @@ def gather_embedding(cur_embed, name: str, num_ckpts): f"consolidated.{i:02d}.pth"), map_location="cpu") embeds[i] = ckpt[name] - embed = torch.cat(embeds, dim=1).to(torch_dtype) + embed = combine_embeddings(embeds, num_ckpts).to(torch_dtype) load_from_meta_llama.saved_embed = embed return load_from_meta_llama.saved_embed diff --git a/tensorrt_llm/models/mamba/model.py b/tensorrt_llm/models/mamba/model.py index e6732943f..6ea4e85c7 100644 --- a/tensorrt_llm/models/mamba/model.py +++ b/tensorrt_llm/models/mamba/model.py @@ -213,24 +213,27 @@ def forward(self, return (lm_logits, present_convs, present_ssms) - def prepare_inputs(self, - max_batch_size, - max_input_len, - max_seq_len, - use_cache, - max_beam_width: int = 1, - max_num_tokens: int = None, - opt_num_tokens: int = None, - prompt_embedding_table_size: int = 0, - max_draft_len: int = 0, - gather_context_logits: bool = False, - gather_generation_logits: bool = False, - lora_target_modules: List[str] = None): + def prepare_inputs( + self, + max_batch_size, + max_input_len, + max_seq_len, + use_cache, + max_beam_width: int = 1, + max_num_tokens: int = None, + opt_num_tokens: int = None, + prompt_embedding_table_size: int = 0, + max_draft_len: int = 0, + gather_context_logits: bool = False, + gather_generation_logits: bool = False, + lora_target_modules: List[str] = None, + speculative_decoding_draft_tokens_external: bool = False): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. @return: a list contains values which can be fed into the self.forward() ''' + assert speculative_decoding_draft_tokens_external == False, "Speculative decoding is not supported in Mamba" remove_input_padding = default_net().plugin_config.remove_input_padding use_mamba_conv1d_plugin = default_net( ).plugin_config.mamba_conv1d_plugin diff --git a/tensorrt_llm/models/medusa/model.py b/tensorrt_llm/models/medusa/model.py index 93c49260a..016442657 100644 --- a/tensorrt_llm/models/medusa/model.py +++ b/tensorrt_llm/models/medusa/model.py @@ -12,19 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
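`combine_embeddings` above infers whether the Meta-checkpoint embedding shards were split along the vocab or the hidden dimension purely from their shapes before concatenating. A small illustration of the two cases it distinguishes (the sizes are hypothetical):

import torch

vocab_size, hidden_size, num_ckpts = 32000, 4096, 4
# Vocab-parallel shards: [vocab_size // num_ckpts, hidden_size] -> concatenate on dim 0.
vocab_shards = [torch.zeros(vocab_size // num_ckpts, hidden_size) for _ in range(num_ckpts)]
assert torch.cat(vocab_shards, dim=0).shape == (vocab_size, hidden_size)
# Hidden-parallel shards: [vocab_size, hidden_size // num_ckpts] -> concatenate on dim 1.
hidden_shards = [torch.zeros(vocab_size, hidden_size // num_ckpts) for _ in range(num_ckpts)]
assert torch.cat(hidden_shards, dim=1).shape == (vocab_size, hidden_size)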
-from collections import OrderedDict - -import tensorrt as trt from tensorrt_llm.models.llama.model import LLaMAForCausalLM from ..._common import default_net from ..._utils import pad_vocab_size -from ...functional import ACT2FN, Tensor, stack +from ...functional import ACT2FN, stack from ...layers import ColumnLinear from ...mapping import Mapping from ...module import Module, ModuleList -from ..generation_mixin import GenerationMixin class MedusaLayer(Module): @@ -141,51 +137,6 @@ def forward(self, *args, **kwargs): return hidden_states def prepare_inputs(self, *args, **kwargs): + kwargs['speculative_decoding_draft_tokens_external'] = False kwargs['max_draft_len'] = self.max_medusa_token_len - inputs = super().prepare_inputs(*args, **kwargs) - num_profiles = len(inputs['input_ids'].profiles) - max_gen_token_len = self.max_medusa_token_len + 1 - medusa_mask_len_range = [[0, max_gen_token_len, max_gen_token_len] - ] * num_profiles - medusa_position_len_range = [[0, max_gen_token_len, max_gen_token_len] - ] * num_profiles - # # 32 bits packed mask aligned. - num_packed_medusa_masks = (self.max_medusa_token_len + 1 + 32 - 1) // 32 - packed_medusa_mask_len_range = [[0, 1, num_packed_medusa_masks] - ] * num_profiles - - # batch beam range (different sequence may have different medusa offsets or packed masks). - bb_range_cxt = GenerationMixin.default_range(kwargs['max_batch_size']) - bb_range_gen = GenerationMixin.default_range(kwargs['max_batch_size'] * - kwargs['max_beam_width']) - # enable_two_optimization_profiles - if num_profiles == 2: - bb_range = [bb_range_cxt, bb_range_gen] - else: - bb_range = [bb_range_gen] - - # medusa position offsets that are fixed during the whole session. - # it will be shared among all sequences. - medusa_position_offsets = Tensor( - name='medusa_position_offsets', - dtype=trt.int32, - shape=[-1, -1], - dim_range=OrderedDict([ - ('batch_size_beam_width', bb_range), - ('medusa_position_ids_dim0', medusa_position_len_range), - ]), - ) - - medusa_packed_mask = Tensor( - name='medusa_packed_mask', - dtype=trt.int32, - shape=[-1, -1, -1], - dim_range=OrderedDict([ - ('batch_size_beam_width', bb_range), - ('medusa_packed_mask_dim0', medusa_mask_len_range), - ('medusa_packed_mask_dim1', packed_medusa_mask_len_range), - ]), - ) - inputs['medusa_packed_mask'] = medusa_packed_mask - inputs['medusa_position_offsets'] = medusa_position_offsets - return inputs + return super().prepare_inputs(*args, **kwargs) diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index 3ba746dd2..3e00e5a7b 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -1,7 +1,9 @@ +import argparse import copy import dataclasses import json import os +from enum import IntFlag, auto from functools import cached_property from typing import Dict, List, Optional, Union @@ -31,6 +33,27 @@ WEIGHT_LOADER_MODELS = {"PhiForCausalLM"} +class SpeculativeDecodingMode(IntFlag): + # [WARNING] KEEP BELOW DEFINITION IN SYNC WITH cpp/tensorrt_llm/runtime/speculativeDecodingMode.h + NONE = auto() + DRAFT_TOKENS_EXTERNAL = auto() + MEDUSA = auto() + LOOKAHEAD_DECODING = auto() + + @staticmethod + def from_arguments(args: argparse.Namespace): + if args.speculative_decoding_mode is None: + return SpeculativeDecodingMode.NONE + elif args.speculative_decoding_mode == "draft_tokens_external": + return SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL + elif args.speculative_decoding_mode == "medusa": + return SpeculativeDecodingMode.MEDUSA + elif 
args.speculative_decoding_mode == "lookahead_decoding": + return SpeculativeDecodingMode.LOOKAHEAD_DECODING + else: + assert False, "Unknown speculative_decoding_mode " + args.speculative_decoding_mode + + @dataclasses.dataclass class QuantConfig: '''Serializable quantization configuration class, part of the PretrainedConfig @@ -55,8 +78,8 @@ def quant_mode(self) -> QuantMode: self.kv_cache_quant_algo, ) - def quant_algo_to_ammo_qformat(self): - algo_to_ammo_map = { + def quant_algo_to_modelopt_qformat(self): + algo_to_modelopt_map = { QuantAlgo.W8A16: "int8_wo", QuantAlgo.W4A16: "int4_wo", QuantAlgo.W4A16_AWQ: "int4_awq", @@ -65,8 +88,8 @@ def quant_algo_to_ammo_qformat(self): QuantAlgo.W8A8_SQ_PER_CHANNEL: 'int8_sq', } if self.quant_algo is not None: - assert self.quant_algo in algo_to_ammo_map, f"We don't use AMMO for quantization algorithm {self.quant_algo}, you probably shall not call this" - qformat = algo_to_ammo_map[self.quant_algo] + assert self.quant_algo in algo_to_modelopt_map, f"We don't use Modelopt for quantization algorithm {self.quant_algo}, you probably shall not call this" + qformat = algo_to_modelopt_map[self.quant_algo] else: qformat = 'full_prec' return qformat @@ -114,6 +137,7 @@ def __init__(self, embedding_sharding_dim: int = 0, share_embedding_table: bool = False, head_size: int = None, + qk_layernorm: bool = False, **kwargs): self.architecture = architecture self.dtype = dtype @@ -126,6 +150,7 @@ def __init__(self, self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.head_size = hidden_size // num_attention_heads if head_size is None else head_size + self.qk_layernorm = qk_layernorm self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.norm_epsilon = norm_epsilon @@ -270,8 +295,8 @@ def forward(self, attention_params=None, position_ids=None, lora_params=None, - medusa_position_offsets=None, - medusa_packed_mask=None): + spec_decoding_position_offsets=None, + spec_decoding_packed_mask=None): kv_cache_params.fill_none_tensor_list(len(self.layer_list)) if use_cache: @@ -289,10 +314,11 @@ def forward(self, kwargs['position_ids'] = position_ids if lora_layer_params is not None: kwargs['lora_layer_params'] = lora_layer_params - if medusa_position_offsets is not None: - kwargs['medusa_position_offsets'] = medusa_position_offsets - if medusa_packed_mask is not None: - kwargs['medusa_packed_mask'] = medusa_packed_mask + if spec_decoding_position_offsets is not None: + kwargs[ + 'spec_decoding_position_offsets'] = spec_decoding_position_offsets + if spec_decoding_packed_mask is not None: + kwargs['spec_decoding_packed_mask'] = spec_decoding_packed_mask hidden_states = layer( hidden_states, @@ -443,9 +469,11 @@ def prepare_inputs(self, prompt_embedding_table_size: int = 0, position_encoding_2d: bool = False, max_draft_len: int = 0, + speculative_decoding_draft_tokens_external: bool = False, gather_context_logits: bool = False, gather_generation_logits: bool = False, - lora_target_modules: List[str] = None): + lora_target_modules: List[str] = None, + opt_batch_size: int = 0): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. 
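A short usage sketch for the `SpeculativeDecodingMode` flag defined above, driven from a hand-built namespace carrying the same attribute that `from_arguments` reads (the real value normally comes from the build CLI):

import argparse

from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode

mode = SpeculativeDecodingMode.from_arguments(
    argparse.Namespace(speculative_decoding_mode="medusa"))
assert mode == SpeculativeDecodingMode.MEDUSA
# IntFlag members support bitwise tests, mirroring the C++ SpeculativeDecodingMode.
assert bool(mode & SpeculativeDecodingMode.MEDUSA)
assert SpeculativeDecodingMode.from_arguments(
    argparse.Namespace(speculative_decoding_mode=None)) == SpeculativeDecodingMode.NONE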
@@ -491,9 +519,12 @@ def prepare_inputs(self, use_custom_all_reduce=use_custom_all_reduce, use_lora_plugin=use_lora_plugin, max_draft_len=max_draft_len, + speculative_decoding_draft_tokens_external= + speculative_decoding_draft_tokens_external, lora_target_modules=lora_target_modules, multiple_profiles=multiple_profiles, - streamingllm=streamingllm) + streamingllm=streamingllm, + opt_batch_size=opt_batch_size) result = { 'input_ids': @@ -544,6 +575,11 @@ def prepare_inputs(self, host_context_lengths=model_inputs['host_context_lengths'], max_context_length=max_input_len, host_request_types=model_inputs['host_request_types']) + if model_inputs['spec_decoding_packed_mask'] is not None: + result['spec_decoding_position_offsets'] = model_inputs[ + 'spec_decoding_position_offsets'] + result['spec_decoding_packed_mask'] = model_inputs[ + 'spec_decoding_packed_mask'] return result @@ -563,9 +599,9 @@ def quantize( ): if mapping is None: # single gpu mapping = Mapping() - ammo_qformat = quant_config.quant_algo_to_ammo_qformat() + modelopt_qformat = quant_config.quant_algo_to_modelopt_qformat() kv_cache_dtype = quant_config.kv_cache_quant_algo - assert ammo_qformat is not None + assert modelopt_qformat is not None from ..quantization import quantize_and_export hf_model_dir = str( hf_model_dir) # quantize_and_export has some code can not take Path @@ -573,7 +609,7 @@ def quantize( model_dir=hf_model_dir, dtype=dtype, device='cuda', - qformat=ammo_qformat, + qformat=modelopt_qformat, kv_cache_dtype=kv_cache_dtype, calib_size=calib_batches, batch_size=calib_batch_size, @@ -606,8 +642,8 @@ def forward(self, prompt_tasks: Optional[Tensor] = None, prompt_vocab_size: Optional[Tensor] = None, lora_params=None, - medusa_position_offsets=None, - medusa_packed_mask=None): + spec_decoding_position_offsets=None, + spec_decoding_packed_mask=None): kwargs = { 'input_ids': input_ids, 'position_ids': position_ids, @@ -627,10 +663,11 @@ def forward(self, if prompt_vocab_size is not None: kwargs['prompt_vocab_size'] = prompt_vocab_size - if medusa_position_offsets is not None: - kwargs['medusa_position_offsets'] = medusa_position_offsets - if medusa_packed_mask is not None: - kwargs['medusa_packed_mask'] = medusa_packed_mask + if spec_decoding_position_offsets is not None: + kwargs[ + 'spec_decoding_position_offsets'] = spec_decoding_position_offsets + if spec_decoding_packed_mask is not None: + kwargs['spec_decoding_packed_mask'] = spec_decoding_packed_mask hidden_states = self.transformer.forward(**kwargs) diff --git a/tensorrt_llm/models/phi3/__init__.py b/tensorrt_llm/models/phi3/__init__.py new file mode 100644 index 000000000..71bf6d298 --- /dev/null +++ b/tensorrt_llm/models/phi3/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
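A minimal usage sketch of the renamed `quant_algo_to_modelopt_qformat` helper that `quantize()` above relies on, assuming the `QuantConfig` dataclass fields default to `None` when not set; the expected strings follow the `algo_to_modelopt_map` shown earlier:

from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

assert QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ).quant_algo_to_modelopt_qformat() == "int4_awq"
assert QuantConfig(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL).quant_algo_to_modelopt_qformat() == "int8_sq"
# With no quantization algorithm configured, the helper falls back to full precision.
assert QuantConfig().quant_algo_to_modelopt_qformat() == "full_prec"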
diff --git a/tensorrt_llm/models/phi3/convert.py b/tensorrt_llm/models/phi3/convert.py new file mode 100644 index 000000000..344076994 --- /dev/null +++ b/tensorrt_llm/models/phi3/convert.py @@ -0,0 +1,88 @@ +import torch + +from ..._utils import str_dtype_to_torch + + +def convert_hf_weights(hf_model, dtype, **kwargs): + torch_dtype = str_dtype_to_torch(dtype) + hf_state_dict = hf_model.state_dict() + weights = {} + + # replace key name + for key, value in hf_state_dict.items(): + # Decoder Layers + orig_key = key + if "model.layers." in key: + key = key.replace("model.layers.", "transformer.layers.") + #Attention + key = key.replace("self_attn.", "attention.") + key = key.replace("Wqkv.weight", "qkv.weight") + key = key.replace("qkv_proj.", "qkv.") #128k + #MLP + key = key.replace("mlp.fc1.", "mlp.fc.") + key = key.replace("mlp.fc2.", "mlp.proj.") + key = key.replace("mlp.gate_up_proj.", "mlp.fc.") + key = key.replace("mlp.up_proj.", "mlp.gate.") #128k + key = key.replace("mlp.down_proj.", "mlp.proj.") #128k + key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k + key = key.replace("o_proj.", "dense.") #128k + #Layer norm + key = key.replace("post_attention_layernorm.", + "post_layernorm.") #128k + + # Embedding + key = key.replace("model.embed_tokens.weight", + "transformer.vocab_embedding.weight") + # Final Layer norm + key = key.replace("model.final_layernorm.", "transformer.ln_f.") + key = key.replace("model.norm.", "transformer.ln_f.") #128k + + if "mlp.gate_up_proj." in orig_key: #4k + original_weights = value.contiguous().clone() + half_split = original_weights.shape[0] // 2 + first_half, second_half = original_weights[: + half_split, :], original_weights[ + half_split:, :] + # Swap the halves + value = torch.cat((second_half, first_half), dim=0) + + if "q_proj" in key: #128k + q_param = value + k_param = hf_state_dict[orig_key.replace("q_proj", "k_proj")] + v_param = hf_state_dict[orig_key.replace("q_proj", "v_proj")] + value = torch.cat([q_param, k_param, v_param], dim=0) + key = key.replace("q_proj.weight", "qkv.weight") + elif "k_proj" in key or "v_proj" in key: + continue + weights[key] = value.to(torch_dtype).cpu() + + return weights + + +def convert_hf_config(hf_config, dtype, **kwargs): + config = { + 'architecture': "Phi3ForCausalLM", + 'dtype': dtype, + 'num_hidden_layers': hf_config.num_hidden_layers, + 'num_attention_heads': hf_config.num_key_value_heads, + 'rope_theta': hf_config.rope_theta, + 'hidden_size': hf_config.hidden_size, + 'intermediate_size': hf_config.intermediate_size, + 'vocab_size': hf_config.vocab_size, + 'max_position_embeddings': hf_config.max_position_embeddings, + 'hidden_act': hf_config.hidden_act, + 'share_embedding_table': False, + 'layer_norm_eps': hf_config.rms_norm_eps, + } + if hf_config.max_position_embeddings >= 128000: + config.update({ + 'original_max_position_embeddings': + hf_config.original_max_position_embeddings, + 'longrope_scaling_short_factors': + hf_config.rope_scaling["short_factor"], + 'longrope_scaling_long_factors': + hf_config.rope_scaling["long_factor"] + }) + if config["hidden_act"] == "silu": + config["hidden_act"] = "swiglu" + return config diff --git a/tensorrt_llm/models/phi3/model.py b/tensorrt_llm/models/phi3/model.py new file mode 100644 index 000000000..f32d34401 --- /dev/null +++ b/tensorrt_llm/models/phi3/model.py @@ -0,0 +1,190 @@ +from typing import Optional + +import numpy as np +from transformers import AutoModelForCausalLM + +from ..._utils import pad_vocab_size +from ...functional import 
PositionEmbeddingType, Tensor +from ...layers import (MLP, Attention, AttentionMaskType, Embedding, + ParallelLMHead, RmsNorm) +from ...module import Module +from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, + PretrainedConfig, save_checkpoint) +from .convert import convert_hf_config, convert_hf_weights + + +class Phi3DecoderLayer(Module): + + def __init__(self, config: PretrainedConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + tp_group = config.mapping.tp_group + tp_size = config.mapping.tp_size + + self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.layer_norm_eps, + dtype=config.dtype) + self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.layer_norm_eps, + dtype=config.dtype) + + layers_range = config.mapping.pp_layers(config.num_hidden_layers) + local_layer_idx = layer_idx - layers_range[0] + position_embedding_type = PositionEmbeddingType.rope_gpt_neox + + rope_scaling_short_factors = 1.0 + rope_scaling_long_factors = 1.0 + original_max_position_embeddings = config.max_position_embeddings + if hasattr(config, "longrope_scaling_short_factors"): + rope_scaling_short_factors = np.asarray( + config.longrope_scaling_short_factors).astype(np.float32) + rope_scaling_long_factors = np.asarray( + config.longrope_scaling_long_factors).astype(np.float32) + original_max_position_embeddings = config.original_max_position_embeddings + position_embedding_type = PositionEmbeddingType.long_rope + + self.attention = Attention( + local_layer_idx=local_layer_idx, + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + position_embedding_type=position_embedding_type, + rotary_embedding_base=config.rotary_base, + max_position_embeddings=config.max_position_embeddings, + dtype=config.dtype, + attention_mask_type=AttentionMaskType.causal, + bias=False, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=config.quant_mode, + rope_scaling_short_factors=rope_scaling_short_factors, + rope_scaling_long_factors=rope_scaling_long_factors, + original_max_position_embeddings=original_max_position_embeddings, + ) + + self.mlp = MLP(hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + hidden_act=config.hidden_act, + dtype=config.dtype, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=config.quant_mode, + bias=False) + + def forward( + self, + hidden_states: Tensor, + attention_mask=None, + use_cache=False, + kv_cache_params=None, + attention_params=None, + ): + + input_layernorm_output = self.input_layernorm(hidden_states) + attention_output = self.attention( + input_layernorm_output, + attention_mask=attention_mask, + use_cache=use_cache, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + norm_before_bmm1=True, + ) + + if use_cache: + attention_output, presents = attention_output + + post_attention_input = hidden_states + attention_output + post_attention_output = self.post_layernorm(post_attention_input) + feed_forward_hidden_states = self.mlp(post_attention_output, ) + hidden_states = post_attention_input + feed_forward_hidden_states + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class Phi3Model(Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.vocab_embedding = Embedding(num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size, + dtype=config.dtype) + + self.layers = DecoderLayerList(Phi3DecoderLayer, config) + self.ln_f = 
RmsNorm(normalized_shape=config.hidden_size, + eps=config.layer_norm_eps, + dtype=config.dtype) + + def forward( + self, + input_ids: Tensor, + position_ids=None, + use_cache=False, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + prompt_embedding_table=None, + prompt_tasks=None, + prompt_vocab_size=None, + ): + args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size + ] if prompt_embedding_table is not None else [] + hidden_states = self.vocab_embedding(input_ids, *args) + + hidden_states = self.layers( + hidden_states, + use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + ) + if use_cache: + hidden_states, presents = hidden_states + + hidden_states = self.ln_f(hidden_states) + + if use_cache: + return (hidden_states, tuple(presents)) + return hidden_states + + +class Phi3ForCausalLM(DecoderModelForCausalLM): + + def __init__(self, config: PretrainedConfig): + self.check_config(config) + transformer = Phi3Model(config) + vocab_size_padded = pad_vocab_size(config.vocab_size, + config.mapping.tp_size) + + lm_head = ParallelLMHead(config.hidden_size, + vocab_size_padded, + bias=False, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + gather_output=True) + + super().__init__(config, transformer, lm_head) + + def check_config(self, config): + config.set_if_not_exist('rotary_base', 10000.0) + + @classmethod + def convert_hf_checkpoint(cls, + hf_model_dir: str, + dtype: Optional[str] = "float16", + output_dir: Optional[str] = None, + **kwargs): + ''' + Convert Huggingface checkpoint to TRT-LLM checkpoint + ''' + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_dir, + torch_dtype="auto", + trust_remote_code=True) + config = convert_hf_config(hf_model.config, dtype=dtype, **kwargs) + weights = convert_hf_weights(hf_model, dtype=dtype, **kwargs) + + if output_dir: + save_checkpoint(output_dir, config=config, weights=weights) + + return {"weights": weights, "config": config} diff --git a/tensorrt_llm/models/qwen/convert.py b/tensorrt_llm/models/qwen/convert.py index 80b54f998..3f623bd91 100644 --- a/tensorrt_llm/models/qwen/convert.py +++ b/tensorrt_llm/models/qwen/convert.py @@ -1104,7 +1104,7 @@ def quantize(dtype, ''' Quantize the save the model as TRT-LLM checkpoint to output_dir ''' - #TODO: currently only smooth quant and kv cache quantization are supported, needs to support mode quant algorithm calling ammo + #TODO: currently only smooth quant and kv cache quantization are supported, needs to support mode quant algorithm calling modelopt config = create_config_from_hugging_face(model_dir, dtype, mapping, diff --git a/tensorrt_llm/models/recurrentgemma/model.py b/tensorrt_llm/models/recurrentgemma/model.py index bc4b7b68f..30b88187d 100644 --- a/tensorrt_llm/models/recurrentgemma/model.py +++ b/tensorrt_llm/models/recurrentgemma/model.py @@ -399,24 +399,28 @@ def prepare_recurrent_inputs(self, max_batch_size, num_profiles, mapping): } return return_dict - def prepare_inputs(self, - max_batch_size, - max_input_len, - max_seq_len, - use_cache, - max_beam_width: int = 1, - max_num_tokens: int = None, - opt_num_tokens: int = None, - prompt_embedding_table_size: int = 0, - max_draft_len: int = 0, - gather_context_logits: bool = False, - gather_generation_logits: bool = False, - lora_target_modules: List[str] = None): + def prepare_inputs( + self, + max_batch_size, + max_input_len, + max_seq_len, + use_cache, + max_beam_width: int = 1, + 
max_num_tokens: int = None, + opt_num_tokens: int = None, + prompt_embedding_table_size: int = 0, + max_draft_len: int = 0, + gather_context_logits: bool = False, + gather_generation_logits: bool = False, + lora_target_modules: List[str] = None, + speculative_decoding_draft_tokens_external: bool = False): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. @return: a list contains values which can be fed into the self.forward() ''' + assert speculative_decoding_draft_tokens_external == False, \ + "We don't support speculative decoding for the RecurrentGemma model." assert max_beam_width == 1, "We don't support beam search for the RecurrentGemma model." remove_input_padding = default_net().plugin_config.remove_input_padding diff --git a/tensorrt_llm/plugin/plugin.py b/tensorrt_llm/plugin/plugin.py index 4b4e025b4..bdaac3961 100644 --- a/tensorrt_llm/plugin/plugin.py +++ b/tensorrt_llm/plugin/plugin.py @@ -196,6 +196,10 @@ def enable_paged_kv_cache(self, tokens_per_block=128): self.set_plugin("tokens_per_block", tokens_per_block) return self + def enable_paged_state(self): + self.set_plugin("paged_state", True) + return self + def set_gpt_attention_plugin(self, dtype='float16'): self.set_plugin("gpt_attention_plugin", dtype) return self diff --git a/tensorrt_llm/quantization/__init__.py b/tensorrt_llm/quantization/__init__.py index 9e0a7cf3d..0fbf758cb 100644 --- a/tensorrt_llm/quantization/__init__.py +++ b/tensorrt_llm/quantization/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. from .mode import (KV_CACHE_QUANT_ALGO_LIST, QUANT_ALGO_LIST, W8A8_SQ_PLUGIN_LIST, QuantAlgo, QuantMode) -from .quantize_by_ammo import quantize_and_export +from .quantize_by_modelopt import quantize_and_export __all__ = [ 'QUANT_ALGO_LIST', diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py index be45d6059..4ca9c1f2a 100644 --- a/tensorrt_llm/quantization/layers.py +++ b/tensorrt_llm/quantization/layers.py @@ -1061,8 +1061,8 @@ def forward( self, hidden_states: Tensor, attention_mask=None, - medusa_packed_mask=None, - medusa_position_offsets=None, + spec_decoding_packed_mask=None, + spec_decoding_position_offsets=None, use_cache=False, kv_cache_params=None, attention_params=None, @@ -1149,8 +1149,8 @@ def forward( host_kv_cache_pool_pointers=kv_cache_params. 
host_kv_cache_pool_pointers, host_context_lengths=attention_params.host_context_lengths, - medusa_position_offsets=medusa_position_offsets, - medusa_packed_mask=medusa_packed_mask) + spec_decoding_position_offsets=spec_decoding_position_offsets, + spec_decoding_packed_mask=spec_decoding_packed_mask) else: assert self.paged_kv_cache == False diff --git a/tensorrt_llm/quantization/quantize_by_ammo.py b/tensorrt_llm/quantization/quantize_by_modelopt.py similarity index 94% rename from tensorrt_llm/quantization/quantize_by_ammo.py rename to tensorrt_llm/quantization/quantize_by_modelopt.py index c071fd404..13eb13057 100644 --- a/tensorrt_llm/quantization/quantize_by_ammo.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -88,7 +88,7 @@ def quant_cfg_choices(): - import ammo.torch.quantization as atq + import modelopt.torch.quantization as atq QUANT_CFG_CHOICES = { "int8_sq": atq.INT8_SMOOTHQUANT_CFG, "fp8": atq.FP8_DEFAULT_CFG, @@ -116,6 +116,7 @@ def quant_cfg_choices(): "QWen": "qwen", "Gemma": "gemma", "MixtralForCausalLM": "llama", + "ArcticForCausalLM": "llama", } @@ -161,10 +162,11 @@ def get_model(ckpt_path, dtype="fp16", device="cuda"): AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) model_kwargs = {"torch_dtype": "auto"} - model = AutoModelForCausalLM.from_pretrained(ckpt_path, - device_map="auto", - **model_kwargs, - trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + ckpt_path, + device_map="auto" if device != "cpu" else "cpu", + **model_kwargs, + trust_remote_code=True) model.eval() model_dtype = next(model.parameters()).dtype @@ -219,7 +221,7 @@ def get_calib_dataloader(data="cnn_dailymail", def quantize_model(model, quant_cfg, calib_dataloader=None): - import ammo.torch.quantization as atq + import modelopt.torch.quantization as atq def calibrate_loop(): if calib_dataloader is None: @@ -245,19 +247,18 @@ def quantize_and_export(*, model_dir, dtype, device, qformat, kv_cache_dtype, calib_size, batch_size, awq_block_size, output_dir, tp_size, pp_size, seed, max_seq_length): ''' - Load model from the model_dir, call AMMO to quantize the model, and then export + Load model from the model_dir, call Modelopt to quantize the model, and then export the quantized model as TRT-LLM checkpoint ''' try: - import ammo # noqa + import modelopt # noqa except ImportError as e: logger.error( - "Failed to import ammo, pls check the AMMO installation. Currently it is known to be unsupported on Windows OS" + "Failed to import modelopt, pls check the Modelopt installation. 
Currently it is known to be unsupported on Windows OS" ) raise e - from ammo.torch.export import export_tensorrt_llm_checkpoint - from ammo.torch.export.tensorrt_llm_utils import MODEL_NAME_TO_HF_ARCH_MAP - MODEL_NAME_TO_HF_ARCH_MAP.update({"gpt2": "GPTForCausalLM"}) + + from modelopt.torch.export import export_tensorrt_llm_checkpoint if not torch.cuda.is_available(): raise EnvironmentError("GPU is required for inference.") @@ -360,7 +361,7 @@ def quantize_and_export(*, model_dir, dtype, device, qformat, kv_cache_dtype, with open(f"{export_path}/config.json", "w") as f: json.dump(tensorrt_llm_config, f, indent=4) - # Workaround for AMMO 0.9.x fp8_kv_cache knob issue + # Workaround for Modelopt 0.9.x fp8_kv_cache knob issue if qformat == 'fp8' and kv_cache_dtype is None: with open(f"{export_path}/config.json", "r") as f: tensorrt_llm_config = json.load(f) diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py index b2808ed18..013bee48a 100755 --- a/tensorrt_llm/runtime/generation.py +++ b/tensorrt_llm/runtime/generation.py @@ -775,7 +775,8 @@ def __init__(self, if model_config.num_medusa_heads > 0: expected_tensor_names += [ - 'medusa_position_offsets', 'medusa_packed_mask', 'medusa_logits' + 'spec_decoding_position_offsets', 'spec_decoding_packed_mask', + 'medusa_logits' ] found_tensor_names = [ @@ -1629,9 +1630,9 @@ def setup(self, atten_idx += 1 if self.is_medusa_mode: - self.buffer['medusa_packed_mask'] = self.medusa_packed_mask + self.buffer['spec_decoding_packed_mask'] = self.medusa_packed_mask self.buffer[ - 'medusa_position_offsets'] = self.medusa_position_offsets + 'spec_decoding_position_offsets'] = self.medusa_position_offsets self.buffer_allocated = True if self.is_medusa_mode: return self.num_medusa_tokens @@ -1892,9 +1893,10 @@ def add_tensor_with_shape(x, name, shape): 'host_encoder_input_lengths') if self.is_medusa_mode: # Medusa mask and position offsets are fixed for the whole session. - add_tensor(self.buffer['medusa_packed_mask'], 'medusa_packed_mask') - add_tensor(self.buffer['medusa_position_offsets'], - 'medusa_position_offsets') + add_tensor(self.buffer['spec_decoding_packed_mask'], + 'spec_decoding_packed_mask') + add_tensor(self.buffer['spec_decoding_position_offsets'], + 'spec_decoding_position_offsets') return tensors @@ -2196,9 +2198,10 @@ def add_tensor_with_shape(x, name, shape): if self.is_medusa_mode: # Medusa mask and position offsets are fixed for the whole session. - add_tensor(self.buffer['medusa_packed_mask'], 'medusa_packed_mask') - add_tensor(self.buffer['medusa_position_offsets'], - 'medusa_position_offsets') + add_tensor(self.buffer['spec_decoding_packed_mask'], + 'spec_decoding_packed_mask') + add_tensor(self.buffer['spec_decoding_position_offsets'], + 'spec_decoding_position_offsets') return tensors diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py index b02e29e48..6ee92051e 100644 --- a/tensorrt_llm/runtime/model_runner.py +++ b/tensorrt_llm/runtime/model_runner.py @@ -23,6 +23,8 @@ import tensorrt as trt import torch +from tensorrt_llm._utils import numpy_to_torch + from .. 
import profiler from .._utils import mpi_comm, mpi_world_size from ..bindings import GptSession @@ -334,6 +336,18 @@ def _prepare_outputs(self, outputs: Optional[dict], return outputs + def _prepare_embedding_table(self, prompt_table: Union[str, torch.Tensor]): + if isinstance(prompt_table, str): + prompt_table_data = numpy_to_torch( + np.load(prompt_table)).to(dtype=self.dtype) + else: + assert isinstance( + prompt_table, + torch.Tensor), "Prompt table should be str or torch.Tensor" + prompt_table_data = prompt_table.to(dtype=self.dtype) + + return prompt_table_data.cuda() + def _prepare_ptuning(self, prompt_table: Union[str, torch.Tensor], tasks: str, batch_size: int): if self.max_prompt_embedding_table_size == 0: @@ -341,7 +355,7 @@ def _prepare_ptuning(self, prompt_table: Union[str, torch.Tensor], if prompt_table is not None: if isinstance(prompt_table, str): - prompt_table_data = torch.from_numpy( + prompt_table_data = numpy_to_torch( np.load(prompt_table)).to(dtype=self.dtype) else: assert isinstance( diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 23d60cb1e..ae06c1776 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.0.dev2024043000" +__version__ = "0.10.0.dev2024050700" diff --git a/tests/hlapi/grid_searcher.py b/tests/hlapi/grid_searcher.py index 09241abd3..2b61fabd7 100644 --- a/tests/hlapi/grid_searcher.py +++ b/tests/hlapi/grid_searcher.py @@ -108,7 +108,7 @@ def tunable_space(self): ], enable_chunked_context=[False, True], ) - if self.model_config.is_multi_gpu: + if self.model_config.parallel_config.is_multi_gpu: tunable_options["use_custom_all_reduce"] = [False, True] self.space_size = reduce(operator.mul, diff --git a/tests/hlapi/hlapi_evaluator.py b/tests/hlapi/hlapi_evaluator.py index 3c73d3dd0..b8abd340b 100644 --- a/tests/hlapi/hlapi_evaluator.py +++ b/tests/hlapi/hlapi_evaluator.py @@ -8,6 +8,7 @@ from tensorrt_llm.hlapi import ModelConfig from tensorrt_llm.hlapi._perf_evaluator import LLMPerfEvaluator +from tensorrt_llm.hlapi.llm import ModelLoader, _ModelFormatKind from tensorrt_llm.hlapi.utils import print_colored try: @@ -66,8 +67,12 @@ def benchmark_main(model_path: str, if engine_output_dir: engine_output_dir = Path(engine_output_dir) elif cpp_executable: - temp_dir = tempfile.TemporaryDirectory() - engine_output_dir = Path(temp_dir.name) + if ModelLoader.get_model_format( + model_path) is _ModelFormatKind.TLLM_ENGINE: + engine_output_dir = model_path + else: + temp_dir = tempfile.TemporaryDirectory() + engine_output_dir = Path(temp_dir.name) def run_hlapi(): print_colored(f"Running HLAPI benchmark ...\n", "bold_green") diff --git a/tests/hlapi/run_llm.py b/tests/hlapi/run_llm.py new file mode 100644 index 000000000..8e654853d --- /dev/null +++ b/tests/hlapi/run_llm.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import click + +from tensorrt_llm.hlapi.llm import LLM, ModelConfig, ModelLoader + + +@click.command() +@click.option('--model_dir', + type=str, + required=True, + help='Path to the model directory') +@click.option('--tokenizer_dir', + type=str, + default=None, + help='Path to the tokenizer directory') +@click.option('--prompt', + type=str, + default="Tell a story", + help='Prompt to generate text from') +def main(model_dir: str, tokenizer_dir: str, prompt: str): + config = ModelConfig(model_dir) + + if 
tokenizer_dir is None: + tokenizer_dir = model_dir + + tokenizer = ModelLoader.load_hf_tokenizer(tokenizer_dir) + + llm = LLM(config, tokenizer=tokenizer) + + for output in llm.generate([prompt]): + print("OUTPUT:", output.text) + + +if __name__ == '__main__': + main() diff --git a/tests/hlapi/test_llm.py b/tests/hlapi/test_llm.py index d66af5ba1..a4d88603e 100644 --- a/tests/hlapi/test_llm.py +++ b/tests/hlapi/test_llm.py @@ -7,17 +7,18 @@ import pytest import torch +from parameterized import parameterized from transformers import AutoTokenizer from tensorrt_llm.hlapi.llm import (LLM, KvCacheConfig, ModelConfig, - SamplingConfig, StreamingLLMParam, - TokenizerBase) + ParallelConfig, SamplingConfig, + StreamingLLMParam, TokenizerBase) from tensorrt_llm.hlapi.tokenizer import TransformersTokenizer from tensorrt_llm.hlapi.utils import get_total_gpu_memory sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.llm_data import llm_models_root -from utils.util import force_ampere +from utils.util import force_ampere, unittest_name_func from tensorrt_llm.models.llama.model import LLaMAForCausalLM @@ -31,10 +32,17 @@ def get_model_path(model_name): default_model_name = "llama-models/llama-7b-hf" mixtral_model_name = "Mixtral-8x7B-v0.1" +tinyllama_model_name = "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" llama_model_path = get_model_path(default_model_name) llm_engine_dir = os.environ.get('LLM_ENGINE_DIR', './tmp.engine') prompts = ["A B C"] +output_text_refs = { + default_model_name: + " A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\nA B C D E F G H", + tinyllama_model_name: + " A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\n\nI hope this helps! Let me", +} cur_dir = os.path.dirname(os.path.abspath(__file__)) models_root = os.path.join(cur_dir, '../../models') @@ -48,7 +56,7 @@ def test_llm_loading_from_hf(): config = ModelConfig(llama_model_path) # The performance-related flags are turned on eagerly to check the functionality - devices = config.parallel_config.get_devices() + devices = config.parallel_config.devices if torch.cuda.get_device_properties(devices[0]).major >= 8: # only available for A100 or newer GPUs config.multi_block_mode = True @@ -65,7 +73,7 @@ def test_llm_loading_from_hf(): for output in llm.generate(prompts): print(output) - assert output.text == " A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\nA B C D E F G H" + assert output.text == output_text_refs[default_model_name] @force_ampere @@ -89,7 +97,31 @@ def test_llm_loading_from_ckpt(): for output in llm.generate(prompts): print(output) - assert output.text == " A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\nA B C D E F G H" + assert output.text == output_text_refs[default_model_name] + + +def llm_end2end_cases(): + yield ({}, ) # Default options + yield ({'trt_strongly_typed': False}, ) + + +@parameterized.expand(llm_end2end_cases(), name_func=unittest_name_func) +def test_llm_end2end(llm_additional_options): + model_path = get_model_path(tinyllama_model_name) + config = ModelConfig(model_path) + llm = LLM(config, **llm_additional_options) + + if 'trt_strongly_typed' in llm_additional_options: + assert llm._build_config.strongly_typed == llm_additional_options.pop( + 'trt_strongly_typed') + else: + assert llm._build_config.strongly_typed is True + + assert len(llm_additional_options) == 0 + + for output in llm.generate(prompts): + print(output) + assert output.text == output_text_refs[tinyllama_model_name] class MyTokenizer(TokenizerBase): @@ -172,13 +204,13 @@ def 
test_llm_generate_async(model_name=default_model_name, pytest.skip("Auto parallel is not supported for Mixtral models") config = ModelConfig(llama_model_path) if use_auto_parallel: - config.parallel_config.world_size = tp_size config.parallel_config.auto_parallel = True + config.parallel_config.world_size = tp_size else: config.parallel_config.tp_size = tp_size kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) - devices = config.parallel_config.get_devices() + devices = config.parallel_config.devices if torch.cuda.get_device_properties(devices[0]).major >= 8: kv_cache_config.enable_block_reuse = True @@ -380,13 +412,21 @@ def test_sampling_config(): assert sc0.max_new_tokens == 1024 +def test_parallel_config(): + config = ParallelConfig() + config.tp_size = 2 + config.pp_size = 2 + assert config.world_size == 4 + config.world_size = 4 # should not raise exception + + # TODO[chunweiy]: Add test for loading inmemory model if __name__ == '__main__': + test_llm_loading_from_hf() + test_llm_generate_async() test_llm_without_tokenizer() test_generate_with_streaming_llm() test_generate_with_sampling_config() test_llm_loading_from_hf() - test_llm_generate_async_tp2(use_auto_parallel=True) - test_llm_generate_async_tp2(use_auto_parallel=False) test_sampling_config() diff --git a/tests/hlapi/test_llm_multi_gpu.py b/tests/hlapi/test_llm_multi_gpu.py index d23e03611..738588373 100644 --- a/tests/hlapi/test_llm_multi_gpu.py +++ b/tests/hlapi/test_llm_multi_gpu.py @@ -127,6 +127,19 @@ def test_llm_generate_mixtral_for_tp2(): print(output) +def test_llm_pp2(): + config = ModelConfig(llama_model_path) + config.parallel_config.pp_size = 2 + config.parallel_config.auto_parallel = False + llm = LLM( + config, + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), + ) + for output in llm.generate(prompts): + assert output.text == " A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\nA B C D E F G H" + + if __name__ == '__main__': test_llm_generate_async_tp2(use_auto_parallel=True) test_llm_generate_async_tp2(use_auto_parallel=False) + test_llm_pp2() diff --git a/tests/hlapi/test_llm_perf_evaluator.py b/tests/hlapi/test_llm_perf_evaluator.py index 874d8c7e3..e0720680a 100644 --- a/tests/hlapi/test_llm_perf_evaluator.py +++ b/tests/hlapi/test_llm_perf_evaluator.py @@ -104,4 +104,4 @@ def test_grid_search_tester(sample_length: int = 16, if __name__ == '__main__': test_perf_evaluator() - #test_grid_search_tester() + test_grid_search_tester() diff --git a/tests/model/test_arctic.py b/tests/model/test_arctic.py new file mode 100644 index 000000000..3f53de32c --- /dev/null +++ b/tests/model/test_arctic.py @@ -0,0 +1,416 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import random +import sys +import tempfile +import unittest + +import numpy as np +import torch +from parameterized import parameterized +from transformers import MistralConfig, MistralForCausalLM + +import tensorrt_llm +from tensorrt_llm import Builder +from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm.models.llama.weight import load_from_hf_llama +from tensorrt_llm.models.modeling_utils import PretrainedConfig +from tensorrt_llm.network import net_guard +from tensorrt_llm.plugin.plugin import ContextFMHAType + +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from utils.util import (skip_bf16_pre_ampere, skip_fp32_accum_pre_ampere, + unittest_name_func) + + +class TestArctic(unittest.TestCase): + EOS_TOKEN = 2 + PAD_TOKEN = 2 + + def _gen_tensorrt_llm_network(self, network, hf_mistral, + mistral_config: MistralConfig, batch_size, + beam_width, input_len, output_len, dtype, + rank, tensor_parallel): + list(range(tensor_parallel)) + + with net_guard(network): + str_dtype_to_trt(dtype) + + config = { + 'architecture': "LlamaForCausalLM", + 'dtype': dtype, + 'logits_dtype': 'float32', + 'num_hidden_layers': mistral_config.num_hidden_layers, + 'num_attention_heads': mistral_config.num_attention_heads, + 'hidden_size': mistral_config.hidden_size, + 'intermediate_size': mistral_config.intermediate_size, + 'num_key_value_heads': mistral_config.num_key_value_heads, + 'vocab_size': mistral_config.vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': + mistral_config.max_position_embeddings, + 'hidden_act': mistral_config.hidden_act, + 'rotary_base': getattr(mistral_config, 'rotary_base', 10000.0), + 'rotary_scaling': getattr(mistral_config, 'rotary_scaling', + None), + 'norm_epsilon': mistral_config.rms_norm_eps, + 'residual_mlp': mistral_config.residual_mlp, + 'mapping': { + 'world_size': tensor_parallel, + 'tp_size': tensor_parallel, + }, + 'use_parallel_embedding': False, + 'embedding_sharding_dim': 0, + 'moe_num_experts': 0, + 'moe_top_k': 0, + 'moe_tp_mode': 1, + 'moe_normalization_mode': 1, + 'use_fused_mlp': False, + } + + # Initialize model + tensorrt_llm_mistral = tensorrt_llm.models.LLaMAForCausalLM( + PretrainedConfig.from_dict(config)) + if not mistral_config.residual_mlp: + weights = load_from_hf_llama(tensorrt_llm_mistral, + hf_mistral, + dtype=dtype, + mapping=tensorrt_llm.Mapping( + world_size=tensor_parallel, + rank=rank, + tp_size=tensor_parallel)) + tensorrt_llm_mistral.load(weights) + # Prepare + network.set_named_parameters( + tensorrt_llm_mistral.named_parameters()) + inputs = tensorrt_llm_mistral.prepare_inputs( + max_batch_size=batch_size, + max_input_len=input_len, + max_seq_len=input_len + output_len, + use_cache=True, + max_beam_width=beam_width) + # Forward + tensorrt_llm_mistral(**inputs) + + return network + + def _gen_tensorrt_llm_engine(self, + dtype, + rank, + world_size, + llama_config, + hf_llama, + model_name, + use_plugin, + batch_size, + beam_width, + input_len, + output_len, + use_refit, + fast_building=False, + context_fmha_flag=ContextFMHAType.disabled, + enable_remove_input_padding=False): + + builder = Builder() + + with tempfile.TemporaryDirectory() as tmpdirname: + builder_config = builder.create_builder_config( + name=model_name, + precision=dtype, + timing_cache='model.cache', + tensor_parallel=world_size, # TP only + use_refit=use_refit, + strongly_typed=(dtype in ["float16", "bfloat16"]), + ) + network = builder.create_network() + network.plugin_config.to_legacy_setting() 
+ if use_plugin: + network.plugin_config.set_gpt_attention_plugin(dtype) + if fast_building: + network.plugin_config.set_gemm_plugin(dtype) + if enable_remove_input_padding: + network.plugin_config.enable_remove_input_padding() + network.plugin_config.set_context_fmha(context_fmha_flag) + + self._gen_tensorrt_llm_network(network, hf_llama, llama_config, + batch_size, beam_width, input_len, + output_len, dtype, rank, world_size) + + engine_buffer = builder.build_engine(network, builder_config) + return engine_buffer + + def _gen_tensorrt_llm_runtime(self, + log_level, + dtype, + world_size, + rank, + llama_config, + hf_llama, + model_name, + use_plugin, + batch_size, + beam_width, + input_len, + output_len, + use_refit, + fast_building=False, + context_fmha_flag=ContextFMHAType.disabled, + enable_remove_input_padding=False): + tensorrt_llm.logger.set_level(log_level) + mapping = tensorrt_llm.Mapping(world_size, rank, tp_size=world_size) + engine_buffer = self._gen_tensorrt_llm_engine( + dtype, rank, world_size, llama_config, hf_llama, model_name, + use_plugin, batch_size, beam_width, input_len, output_len, + use_refit, fast_building, context_fmha_flag, + enable_remove_input_padding) + runtime = tensorrt_llm.runtime.generation._Runtime( + engine_buffer, mapping) + return runtime, engine_buffer + + def load_test_cases(): + test_cases = [] + test_cases.append((False, True, ContextFMHAType.disabled, False, + 'bfloat16', 56, True)) # arctic MHA + return test_cases + + @parameterized.expand(load_test_cases, name_func=unittest_name_func) + def test_arctic(self, use_refit, fast_building, context_fmha_flag, + enable_remove_input_padding, dtype, num_kv_heads, + residual_mlp): + # Simplified from Mistral test + # - Arctic is not officially supported in HuggingFace yet, so skipping results comparison + # - Skip model loader tests + skip_hf = True + + # Skip tests that are not supported in pre-ampere architecture + skip_bf16_pre_ampere(dtype) + skip_fp32_accum_pre_ampere(context_fmha_flag) + + PRECHECKED_GOOD_RANDOM_SEEDS = [1, 4, 5, 8] + model = 'llama' + log_level = 'error' + use_plugin = True # gpt plugin + batch_size = 4 + beam_width = 1 + input_len = 4 + output_len = 2 + max_seq_len = input_len + output_len + world_size = 1 + head_size = 32 + rank = 0 + mistral_config = MistralConfig() + mistral_config.hidden_act = 'silu' + mistral_config.num_hidden_layers = 2 + mistral_config.max_position_embeddings = 64 + mistral_config.vocab_size = 128 + mistral_config.num_attention_heads = num_kv_heads + mistral_config.hidden_size = mistral_config.num_attention_heads * head_size + mistral_config.intermediate_size = (( + (mistral_config.hidden_size * 4 * 2 // 3) + head_size - 1) // + head_size) * head_size + mistral_config.num_key_value_heads = num_kv_heads + assert (mistral_config.num_attention_heads % + mistral_config.num_key_value_heads) == 0 + mistral_config.pad_token_id = self.PAD_TOKEN + mistral_config.eos_token_id = self.EOS_TOKEN + mistral_config.residual_mlp = residual_mlp + seed_idx = random.randint(0, len(PRECHECKED_GOOD_RANDOM_SEEDS) - 1) + torch.manual_seed(PRECHECKED_GOOD_RANDOM_SEEDS[seed_idx]) + if not skip_hf: + hf_mistral = MistralForCausalLM(mistral_config).cuda() + runtime, _ = self._gen_tensorrt_llm_runtime( + log_level, dtype, world_size, rank, mistral_config, None, model, + use_plugin, batch_size, beam_width, input_len, output_len, + use_refit, fast_building, context_fmha_flag, + enable_remove_input_padding) + key_value_cache_buffers = [] + head_size = mistral_config.hidden_size // 
mistral_config.num_attention_heads + for i in range(mistral_config.num_hidden_layers): + key_value_cache_buffers.append( + torch.zeros(( + batch_size, + 2, + mistral_config.num_key_value_heads, + max_seq_len, + head_size, + ), + dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype), + device='cuda')) + + # compare context + step = 0 + ctx_ids = torch.randint(100, (batch_size, input_len)).int().cuda() + ctx_context_lengths = input_len * torch.ones( + (batch_size), dtype=torch.int32, device='cuda') + ctx_position_ids = torch.tensor(range(input_len), + dtype=torch.int32).reshape([ + 1, input_len + ]).expand([batch_size, + input_len]).cuda() + ctx_last_token_ids = ctx_context_lengths.clone() + ctx_host_request_types = torch.tensor([0] * batch_size, + dtype=torch.int32) + + # We need sequence_lengths start as context_lengths for step 0, + # and it will be added one after each step. + sequence_length_buffer = ctx_context_lengths.detach().clone() + + if not skip_hf: + with torch.no_grad(): + hf_outputs = hf_mistral.forward(ctx_ids) + torch.cuda.synchronize() + ref = hf_outputs.logits[:, -1, :] + + if enable_remove_input_padding: + ctx_ids = ctx_ids.view([batch_size * input_len]) + ctx_position_ids = ctx_position_ids.view([batch_size * input_len]) + ctx_last_token_ids = torch.cumsum(ctx_last_token_ids, dim=0).int() + + cache_indirections = [ + torch.full(( + batch_size, + beam_width, + max_seq_len, + ), + 0, + dtype=torch.int32, + device='cuda'), + torch.full(( + batch_size, + beam_width, + max_seq_len, + ), + 0, + dtype=torch.int32, + device='cuda') + ] # ping-pong buffers + + ctx_buffer = { + 'input_ids': ctx_ids, + 'context_lengths': ctx_context_lengths, + 'position_ids': ctx_position_ids, + 'last_token_ids': ctx_last_token_ids, + 'cache_indirection': cache_indirections[0], + 'host_request_types': ctx_host_request_types, + } + if enable_remove_input_padding: + ctx_buffer['host_context_lengths'] = ctx_context_lengths.cpu() + + ctx_shape = {k: v.shape for k, v in ctx_buffer.items()} + + kv_shape = (batch_size, 2, mistral_config.num_key_value_heads, + max_seq_len, head_size) + ctx_buffer[f'host_max_attention_window_sizes'] = torch.tensor( + [max_seq_len] * mistral_config.num_hidden_layers, dtype=torch.int32) + ctx_shape[f'host_max_attention_window_sizes'] = ( + mistral_config.num_hidden_layers, ) + for i in range(mistral_config.num_hidden_layers): + ctx_shape[f'past_key_value_{i}'] = kv_shape + ctx_buffer[f'past_key_value_{i}'] = key_value_cache_buffers[i] + ctx_buffer[f'present_key_value_{i}'] = key_value_cache_buffers[i] + ctx_buffer['sequence_length'] = sequence_length_buffer + ctx_shape['sequence_length'] = ctx_buffer['sequence_length'].shape + ctx_shape['host_past_key_value_lengths'] = (batch_size, ) + ctx_buffer['host_past_key_value_lengths'] = torch.tensor( + [0] * batch_size, dtype=torch.int32) + ctx_shape['host_sink_token_length'] = (1, ) + ctx_buffer['host_sink_token_length'] = torch.tensor([0], + dtype=torch.int32) + + context = runtime.ctx_context + runtime._set_shape(context, ctx_shape) + runtime._set_buffer(context, ctx_buffer) + runtime._run(context) + torch.cuda.synchronize() + res = ctx_buffer['logits'] + + if not skip_hf: + np.testing.assert_allclose(ref.to(torch.float32).cpu().numpy(), + res.to(torch.float32).cpu().numpy(), + atol=0.12) + + # compare generation + step = 1 + step1_id = torch.randint(100, (batch_size, 1)).int().cuda() + gen_context_lengths = ctx_context_lengths.clone() + gen_position_ids = torch.ones_like(step1_id).int().cuda() * input_len + gen_last_token_ids = 
torch.zeros_like(gen_context_lengths).int().cuda() + gen_host_request_types = torch.tensor([1] * batch_size, + dtype=torch.int32) + + if not skip_hf: + with torch.no_grad(): + hf_outputs = hf_mistral.forward( + step1_id, + past_key_values=hf_outputs.past_key_values, + use_cache=True) + torch.cuda.synchronize() + ref = hf_outputs.logits[:, -1, :] + + if enable_remove_input_padding: + step1_id = step1_id.view([batch_size]) + gen_position_ids = gen_position_ids.view([batch_size]) + gen_last_token_ids = torch.ones_like( + gen_context_lengths).int().cuda() + gen_last_token_ids = torch.cumsum(gen_last_token_ids, dim=0).int() + + step1_buffer = { + 'input_ids': step1_id, + 'context_lengths': gen_context_lengths, + 'position_ids': gen_position_ids, + 'last_token_ids': gen_last_token_ids, + 'host_request_types': gen_host_request_types, + 'cache_indirection': cache_indirections[1], + } + if enable_remove_input_padding: + step1_buffer['host_context_lengths'] = gen_context_lengths.cpu() + + step1_shape = {k: v.shape for k, v in step1_buffer.items()} + + step1_shape[f'host_max_attention_window_sizes'] = ( + mistral_config.num_hidden_layers, ) + step1_buffer[f'host_max_attention_window_sizes'] = torch.tensor( + [max_seq_len] * mistral_config.num_hidden_layers, dtype=torch.int32) + for i in range(mistral_config.num_hidden_layers): + step1_shape[f'past_key_value_{i}'] = kv_shape + step1_shape['sequence_length'] = (batch_size, ) + step1_shape['host_past_key_value_lengths'] = (batch_size, ) + step1_shape['host_sink_token_length'] = (1, ) + for i in range(mistral_config.num_hidden_layers): + step1_buffer[f'past_key_value_{i}'] = key_value_cache_buffers[i] + step1_buffer[f'present_key_value_{i}'] = key_value_cache_buffers[i] + step1_buffer[ + 'host_past_key_value_lengths'] = sequence_length_buffer.cpu() + sequence_length_buffer = torch.add(sequence_length_buffer, step) + step1_buffer['sequence_length'] = sequence_length_buffer + step1_buffer['host_sink_token_length'] = torch.tensor([0], + dtype=torch.int32) + + context = runtime.context_1 + runtime._set_shape(context, step1_shape) + runtime._set_buffer(context, step1_buffer) + runtime._run(context) + torch.cuda.synchronize() + res = step1_buffer['logits'] + + if not skip_hf: + np.testing.assert_allclose(ref.to(torch.float32).cpu().numpy(), + res.to(torch.float32).cpu().numpy(), + atol=0.12) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/model/test_gpt_e2e.py b/tests/model/test_gpt_e2e.py index c7a72d194..f375eed9a 100644 --- a/tests/model/test_gpt_e2e.py +++ b/tests/model/test_gpt_e2e.py @@ -31,9 +31,9 @@ work_dir = Path(__file__).parent.resolve() / 'check_gpt' -sys.path.append(os.path.join(os.path.dirname(__file__), '../utils')) -from llm_data import llm_models_root -from util import getSMVersion +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from utils.llm_data import llm_models_root +from utils.util import getSMVersion gpt_example_root = os.path.join(os.path.dirname(__file__), '../../examples/gpt') diff --git a/tests/model/test_phi.py b/tests/model/test_phi.py index 6791aa047..842eeaf11 100644 --- a/tests/model/test_phi.py +++ b/tests/model/test_phi.py @@ -61,8 +61,7 @@ def generate_hf_model(self, dtype: str): trust_remote_code=True) gpt_config.num_hidden_layers = 2 model = AutoModelForCausalLM.from_config( - gpt_config, code_revision=HF_CODE_REVISION, - trust_remote_code=True).cuda().to( + gpt_config, trust_remote_code=True).cuda().to( tensorrt_llm._utils.str_dtype_to_torch(dtype)).eval() return gpt_config, model 
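The Arctic test above exercises one context step and one generation step against the TensorRT-LLM runtime, and most of its setup is bookkeeping: `sequence_length` starts out equal to the context lengths and grows by one per generation step, while `last_token_ids` switches from per-request lengths to a running cumulative offset once input padding is removed. The snippet below is a minimal sketch of that bookkeeping, separate from the patch; only `torch` is assumed, and every other name is local to the sketch.

```python
# Minimal sketch (not part of the patch) of the sequence-length and
# last-token bookkeeping used by the context/generation steps above.
import torch

batch_size, input_len = 4, 4

# Context step: every request has processed `input_len` tokens so far.
context_lengths = torch.full((batch_size,), input_len, dtype=torch.int32)
sequence_length = context_lengths.clone()   # starts equal to the context lengths
last_token_ids = context_lengths.clone()    # padded layout: last token index per request

# With remove_input_padding, the [batch, seq] inputs are flattened to one
# dimension and last_token_ids becomes a running offset into that flat buffer.
ctx_ids = torch.randint(100, (batch_size, input_len), dtype=torch.int32)
flat_ids = ctx_ids.view(batch_size * input_len)
flat_last_token_ids = torch.cumsum(context_lengths, dim=0).int()
assert int(flat_last_token_ids[-1]) == flat_ids.numel()

# Generation step: one new token per request, so sequence_length grows by one
# and each request contributes exactly one "last token".
step = 1
sequence_length = sequence_length + step
gen_last_token_ids = torch.cumsum(
    torch.ones(batch_size, dtype=torch.int32), dim=0).int()

print(flat_last_token_ids.tolist())  # [4, 8, 12, 16]
print(sequence_length.tolist())      # [5, 5, 5, 5]
```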
diff --git a/tests/model_api/test_model_quantization.py b/tests/model_api/test_model_quantization.py index c19e54920..4f4399d5f 100644 --- a/tests/model_api/test_model_quantization.py +++ b/tests/model_api/test_model_quantization.py @@ -12,13 +12,13 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.llm_data import llm_models_root -from utils.util import force_ampere, skip_no_ammo, skip_pre_ada +from utils.util import force_ampere, skip_no_modelopt, skip_pre_ada tensorrt_llm.logger.set_level('info') @force_ampere -@skip_no_ammo +@skip_no_modelopt def test_int4_awq_quantization(): input_text = [ 'Born in north-east France, Soyer trained as a', @@ -56,7 +56,7 @@ def test_int4_awq_quantization(): @skip_pre_ada -@skip_no_ammo +@skip_no_modelopt def test_fp8_quantization(): input_text = [ 'Born in north-east France, Soyer trained as a', diff --git a/tests/test_layer.py b/tests/test_layer.py index 6e7a99951..31a7dc4b9 100644 --- a/tests/test_layer.py +++ b/tests/test_layer.py @@ -1667,15 +1667,24 @@ def test_recurrent(self, batch_size, in_seq_len, out_seq_len, width, conv_state_trt_llm = conv_state_trt_llm.permute(0, 2, 1) conv_state_trt_llm = conv_state_trt_llm.to(torch.float32).cpu().numpy() + # https://nvbugs/4619116 + # test_layer.py::TestLayer::test_recurrent_3_16_1_1280_1280_10_generation_float16_False_False + # only failed on V100 due the strict default tolerance setting. + atol = 1e-2 if getSMVersion() == 70 else dtype_atol[dtype] + rtol = 10 if getSMVersion() == 70 else 1e-7 + np.testing.assert_allclose(output_torch.numpy(), output_trt_llm.numpy(), - atol=dtype_atol[dtype]) + atol=atol, + rtol=rtol) np.testing.assert_allclose(lru_state_torch.numpy(), lru_state_trt_llm.numpy(), - atol=dtype_atol[dtype]) + atol=atol, + rtol=rtol) np.testing.assert_allclose(conv_state_ref, conv_state_trt_llm, - atol=dtype_atol[dtype]) + atol=atol, + rtol=rtol) if __name__ == '__main__': diff --git a/tests/test_llama_conversion.sh b/tests/test_llama_conversion.sh index 6bffc23fc..8b55cd7a8 100755 --- a/tests/test_llama_conversion.sh +++ b/tests/test_llama_conversion.sh @@ -80,7 +80,7 @@ test_gptq() { python convert_checkpoint.py --model_dir ${MODEL} \ --output_dir ./tllm_checkpoint/2gpu_gptq \ --dtype float16 \ - --ammo_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \ + --modelopt_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \ --use_weight_only \ --weight_only_precision int4_gptq \ --per_group \ diff --git a/tests/utils/util.py b/tests/utils/util.py index df34cb299..b840eff10 100644 --- a/tests/utils/util.py +++ b/tests/utils/util.py @@ -114,19 +114,19 @@ def skip_bf16_fp32_accum(dtype, context_fmha_type): ) -def ammo_installed(): +def modelopt_installed(): try: # isort: off - import ammo.torch.quantization as atq # NOQA - from ammo.torch.export import export_tensorrt_llm_checkpoint # NOQA + import modelopt.torch.quantization as atq # NOQA + from modelopt.torch.export import export_tensorrt_llm_checkpoint # NOQA # isort: on return True except Exception: return False -skip_no_ammo = unittest.skipIf(not ammo_installed(), - reason="AMMO is not installed") +skip_no_modelopt = unittest.skipIf(not modelopt_installed(), + reason="Modelopt is not installed") # This function names will make all unit tests names to show the values of all parameters in @parameterized.expand diff --git a/windows/README.md b/windows/README.md index 06127995b..46748425b 100644 --- 
a/windows/README.md +++ b/windows/README.md @@ -13,7 +13,8 @@ The release wheel for Windows can be installed with `pip`. Alternatively, you ca To get started with TensorRT-LLM on Windows, visit our documentation: -- [Quick Start Guide](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html) -- [Release Notes](https://nvidia.github.io/TensorRT-LLM/release-notes.html) - [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) +- [Release Notes](https://nvidia.github.io/TensorRT-LLM/release-notes.html) +- [Quick Start Guide](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html) - [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) +- [Source build on Windows](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-windows.html) diff --git a/windows/destruct_env.ps1 b/windows/destruct_env.ps1 index b5372dc2d..8f91624a5 100644 --- a/windows/destruct_env.ps1 +++ b/windows/destruct_env.ps1 @@ -14,6 +14,7 @@ foreach($line in Get-Content -Path $env:LOCALAPPDATA\trt_env_outlog.txt) { #3 = Python #4 = MPI Presence #5 = CUDNN +#6 = TRT if ($defaultEnv[0].Equals("0")) { Write-Output "Removing CUDA" @@ -39,8 +40,12 @@ if ($defaultEnv[4].Equals("0")) { [System.Environment]::SetEnvironmentVariable("path", $path,'Machine') } - if ($defaultEnv[5].Equals("0")) { Write-Output "Removing CUDNN" [Environment]::SetEnvironmentVariable('CUDNN', '', [EnvironmentVariableTarget]::Machine) } + +if ($defaultEnv[6].Equals("0")) { + Write-Output "Removing TRT" + [Environment]::SetEnvironmentVariable('TRT', '', [EnvironmentVariableTarget]::Machine) +} diff --git a/windows/docker/Dockerfile b/windows/docker/Dockerfile index 2a7e22493..479dcd981 100644 --- a/windows/docker/Dockerfile +++ b/windows/docker/Dockerfile @@ -1,206 +1,79 @@ # https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 # Use the Windows Server Core 2019 image. -FROM mcr.microsoft.com/windows/servercore:ltsc2019 +FROM mcr.microsoft.com/windows/servercore:ltsc2019 AS devel -# Restore the default Windows shell for correct batch processing. 
-# (Used for VS Build Tools installation) -SHELL ["cmd", "/S", "/C"] +SHELL ["powershell", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] # ----------------------------------------------------------------------------- +# Create a working directory -# Install CUDA 12.2 - -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_546.12_windows.exe \ - -OutFile "cuda_installer.exe"; \ - Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ - Remove-Item cuda_installer.exe -Force - -# ----------------------------------------------------------------------------- - -# Install Python 3.10.11 - -# Download and install Python -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe -OutFile python-3.10.11.exe ; \ - Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ - Remove-Item python-3.10.11.exe -Force - -# Add python3 command -RUN powershell -Command \ - cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" +WORKDIR "C:\\\\workspace" # ----------------------------------------------------------------------------- +# Install runtime dependencies -# Install Microsoft MPI - -# The latest version is 10.1.3, but it requires you to get a temporary download -# link. -# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes -# We use 10.1.1 which has a release on the GitHub page -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisetup.exe \ - -OutFile "msmpisetup.exe"; \ - Start-Process .\msmpisetup.exe -Wait ; \ - Remove-Item msmpisetup.exe -Force - -# Add MPI binaries to Path -RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" - -# Download the MSMPI SDK -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisdk.msi \ - -OutFile "msmpisdk.msi"; \ - Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ - Remove-Item msmpisdk.msi -Force +COPY setup_env.ps1 C:\\workspace\\setup_env.ps1 +# TRT is installed along with build-time dependencies +RUN C:\workspace\setup_env.ps1 -skipTRT +RUN Remove-Item "C:\workspace\setup_env.ps1" -Force +# CUDNN paths are populated in the env variable CUDNN, add it to PATH +RUN [Environment]::SetEnvironmentVariable('Path', $Env:Path + ';' + $Env:CUDNN, [EnvironmentVariableTarget]::Machine) # ----------------------------------------------------------------------------- +# Install build-time dependencies -# Install CMake - -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://github.com/Kitware/CMake/releases/download/v3.27.7/cmake-3.27.7-windows-x86_64.msi \ - -OutFile "cmake.msi"; \ - Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ - Remove-Item cmake.msi -Force - -# Add CMake binaries to Path -RUN setx Path "%Path%;C:\Program Files\CMake\bin" +COPY 
setup_build_env.ps1 C:\\workspace\\setup_build_env.ps1 +# TRT is installed in workspace +RUN C:\workspace\setup_build_env.ps1 -TRTPath 'C:\\workspace' +RUN Remove-Item "C:\workspace\setup_build_env.ps1" -Force -# ----------------------------------------------------------------------------- - -# Install VS Build Tools - -RUN \ - # Download the Build Tools bootstrapper. - curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ - \ - # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. - && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ - --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ - --includeRecommended \ - --add Microsoft.VisualStudio.Workload.MSBuildTools \ - --add Microsoft.VisualStudio.Workload.VCTools \ - --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ - --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ - --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ - --remove Microsoft.VisualStudio.Component.Windows81SDK \ - || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ - \ - # Cleanup - && del /q vs_buildtools.exe +# Add binaries to Path +RUN [Environment]::SetEnvironmentVariable('Path', $Env:Path + ';C:\Program Files\CMake\bin', [EnvironmentVariableTarget]::Machine) # ----------------------------------------------------------------------------- # Install Vim (can delete this but it's nice to have) +# and add binaries to Path -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://ftp.nluug.nl/pub/vim/pc/gvim90.exe \ +RUN Invoke-WebRequest -Uri https://ftp.nluug.nl/pub/vim/pc/gvim90.exe \ -OutFile "install_vim.exe"; \ Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ - Remove-Item install_vim.exe -Force - -# Add Vim binaries to Path -RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" - + Remove-Item install_vim.exe -Force ; \ + [Environment]::SetEnvironmentVariable('Path', $Env:Path + ';C:\Program Files (x86)\Vim\vim90', [EnvironmentVariableTarget]::Machine) # ----------------------------------------------------------------------------- # Install Chocolatey # Chocolatey is a package manager for Windows -# I probably could've used it to install some of the above, but I didn't... 
# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation # https://stackoverflow.com/a/76470753 ENV chocolateyVersion=1.4.0 -# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ - -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ - iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ - SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" +# https://docs.chocolatey.org/en-us/choco/setup#install-with-powershell.exe +RUN Set-ExecutionPolicy Bypass -Scope Process -Force; \ + [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) # ----------------------------------------------------------------------------- # Install Git via Chocolatey -RUN powershell -Command \ - choco install git -y +RUN choco install git -y # ----------------------------------------------------------------------------- - # Install CUDA 11.8 NVTX -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ +RUN Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ -OutFile cuda_11.8.0_windows_network.exe; \ Invoke-WebRequest -Uri https://7-zip.org/a/7zr.exe \ -OutFile 7zr.exe -RUN \ - 7zr.exe e -i!"nsight_nvtx\nsight_nvtx\NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" cuda_11.8.0_windows_network.exe &&\ - msiexec.exe /i "NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" /norestart /quiet &&\ - del "NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" &&\ - del 7zr.exe &&\ - del cuda_11.8.0_windows_network.exe +RUN .\7zr.exe e -i!'nsight_nvtx\nsight_nvtx\NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi' cuda_11.8.0_windows_network.exe ; -# ----------------------------------------------------------------------------- +RUN cmd.exe /S /C "msiexec.exe /i 'NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi' /norestart /quiet" -# Create a working directory -WORKDIR "C:\\\\workspace" - -# ----------------------------------------------------------------------------- - -# Download and unzip TensorrRT 9.3.0.1 for TensorRT-LLM -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.windows10.win10.cuda-12.2.llm.beta.zip \ - -OutFile TensorRT-9.3.0.1.zip; \ - Expand-Archive .\TensorRT-9.3.0.1.zip -DestinationPath .; \ - Move-Item -Path .\TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta\TensorRT-9.3.0.1 -Destination .; \ - Remove-Item TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta -Force; \ - Remove-Item TensorRT-9.3.0.1.zip -Force - -# Add TensorRT libs to Path -RUN setx Path "%Path%;C:\workspace\TensorRT-9.3.0.1\lib" - -# Install TensorRT Python wheel -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - pip install TensorRT-9.3.0.1\python\tensorrt-9.3.0.post12.dev1-cp310-none-win_amd64.whl - -# 
----------------------------------------------------------------------------- - -# Download and unzip cuDNN 8.9.7.29 for TensorRT-LLM -# https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip -RUN powershell -Command \ - $ErrorActionPreference = 'Stop'; \ - $ProgressPreference = 'SilentlyContinue'; \ - Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip \ - -OutFile cuDNN.zip; \ - Expand-Archive .\cuDNN.zip -DestinationPath .; \ - New-Item -Path cuDNN -ItemType Directory; \ - Move-Item -Path .\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\* -Destination .\cuDNN; \ - Remove-Item cudnn-windows-x86_64-8.9.7.29_cuda12-archive -Force; \ - Remove-Item cuDNN.zip -Force - -# Add cuDNN libs and bin to Path. -RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" +RUN Remove-Item 'NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi' -Force ; \ + Remove-Item 7zr.exe -Force ; \ + Remove-Item cuda_11.8.0_windows_network.exe -Force # ----------------------------------------------------------------------------- @@ -208,3 +81,10 @@ RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" # This entry point launches the 64-bit PowerShell developer shell. # We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +# ----------------------------------------------------------------------------- +# COPY requirements-windows.txt C:\\workspace\\requirements-windows.txt +# COPY requirements-dev-windows.txt C:\\workspace\\requirements-dev-windows.txt +# RUN python3 -m pip --no-cache-dir install -r C:\workspace\requirements-dev-windows.txt +# RUN Remove-Item "C:\workspace\requirements-windows.txt" -Force +# RUN Remove-Item "C:\workspace\requirements-dev-windows.txt" -Force diff --git a/windows/docker/README.md b/windows/docker/README.md index afb6e0715..48f2a3ff2 100644 --- a/windows/docker/README.md +++ b/windows/docker/README.md @@ -2,30 +2,12 @@ These instructions provide details on how to build the TensorRT-LLM Windows Docker image manually from source. -You should already have set up Docker Desktop based on the top-level [Windows README instructions](/windows/README.md#docker-desktop). +You should already have set up Docker Desktop based on the [Windows source build instructions](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-windows.html#docker-desktop). -## Set up Build Context +From the `TensorRT-LLM\windows\` folder, run the build command: -cuDNN and NvToolsExt cannot be installed via the command line, so you'll need to manually install them and copy them to the build context in order to build this container. - -### cuDNN - -If you followed the top-level [Windows README](/windows/README.md), you'll already have a copy of cuDNN. If not, download and unzip [cuDNN](https://developer.nvidia.com/cudnn). - -Copy the entire `cuDNN` folder into `TensorRT-LLM/windows/docker`. - -### NvToolsExt - -TensorRT-LLM on Windows currently depends on NVTX assets that do not come packaged with the CUDA12.2 installer. 
To install these assets, download the [CUDA11.8 Toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows&target_arch=x86_64). During installation, select "Advanced installation." Nsight NVTX is located in the CUDA drop down. Deselect all packages, and then select Nsight NVTX. - -You will now have `C:\Program Files\NVIDIA Corporation\NvToolsExt`. Copy the entire `NvToolsExt` folder into `TensorRT-LLM/windows/docker` - -### Build - -Now that `TensorRT-LLM\windows\docker` contains `cuDNN\` and `NvToolsExt\`, run the build command: - -``` -docker build -t tensorrt-llm-windows-build:latest . +```bash +docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest . ``` -Your image is now ready for use. Return to [Running the Container](/windows/README.md#running-the-container) to proceed with your TensorRT-LLM build using Docker. +Your image is now ready for use. Return to [Run the Container](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-windows.html#run-the-container) to proceed with your TensorRT-LLM build using Docker. diff --git a/windows/setup_build_env.ps1 b/windows/setup_build_env.ps1 index ed26ad722..4983ae7e8 100644 --- a/windows/setup_build_env.ps1 +++ b/windows/setup_build_env.ps1 @@ -45,21 +45,21 @@ if (-not $skipVSBuildTools) { Write-Output "Skipping Visual Studio Build Tools installation" } -# Install TensorRT 9.3.0.1 for TensorRT-LLM +# Install TensorRT 10.0.1 for TensorRT-LLM if (-not $skipTRT) { Write-Output "Downloading TensorRT" - Invoke-WebRequest -Uri 'https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.windows10.x86_64.cuda-12.2.llm.beta.zip' -OutFile 'TensorRT-9.3.0.1.zip' + Invoke-WebRequest -Uri 'https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip' -OutFile 'TensorRT-10.0.1.6.zip' Write-Output "Extracting TensorRT" # Get path $absolutePath = Resolve-Path $TRTPath - Expand-Archive -Path '.\TensorRT-9.3.0.1.zip' -DestinationPath $absolutePath + Expand-Archive -Path '.\TensorRT-10.0.1.6.zip' -DestinationPath $absolutePath Write-Output "Removing TensorRT zip" - Remove-Item -Path 'TensorRT-9.3.0.1.zip' -Force + Remove-Item -Path 'TensorRT-10.0.1.6.zip' -Force Write-Output "Adding TensorRT to system Path" - [Environment]::SetEnvironmentVariable('Path', "$env:Path;$absolutePath\TensorRT-9.3.0.1\lib", [EnvironmentVariableTarget]::Machine) + [Environment]::SetEnvironmentVariable('Path', "$env:Path;$absolutePath\TensorRT-10.0.1.6\lib", [EnvironmentVariableTarget]::Machine) Write-Output "Installing TensorRT Python wheel" - pip install $absolutePath\TensorRT-9.3.0.1\python\tensorrt-9.3.0.post12.dev1-cp310-none-win_amd64.whl - Write-Output "Done TensorRT installation at '$absolutePath\TensorRT-9.3.0.1'" + python3 -m pip install $absolutePath\TensorRT-10.0.1.6\python\tensorrt-10.0.1-cp310-none-win_amd64.whl + Write-Output "Done TensorRT installation at '$absolutePath\TensorRT-10.0.1.6'" } else { Write-Output "Skipping TensorRT installation" } diff --git a/windows/setup_env.ps1 b/windows/setup_env.ps1 index 86707b9d3..f763374a0 100644 --- a/windows/setup_env.ps1 +++ b/windows/setup_env.ps1 @@ -4,7 +4,8 @@ param ( [switch]$skipPython, [switch]$skipMPI = $true, [switch]$skipCUDNN, - [string]$cudaVersion #CUDA version defaults to 12.3, specify otherwise, however only 12.2 and 12.3 have uris + [string]$cudaVersion, #CUDA version defaults to 12.4, specify otherwise + [switch]$skipTRT = $true ) # Set the 
error action preference to 'Stop' for the entire script. @@ -19,13 +20,13 @@ $ErrorActionPreference = 'Stop' New-Item -Path "$($env:LOCALAPPDATA)\trt_env_outlog.txt" -Force -# Install CUDA, default to 12.3 +# Install CUDA, default to 12.4 if (-not $skipCUDA){ if($cudaVersion){ $cudaVer = "NVIDIA CUDA Toolkit " + $cudaVersion } else { - $cudaVersion = 12.3 - $cudaVer = "NVIDIA CUDA Toolkit 12.3" + $cudaVersion = 12.4 + $cudaVer = "NVIDIA CUDA Toolkit 12.4" } if (-not (Get-Package -Name $cudaVer -EA Ignore)) { @@ -35,6 +36,8 @@ if (-not $skipCUDA){ Invoke-WebRequest -Uri 'https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_537.13_windows.exe' -OutFile 'cuda_installer.exe' } elseif ($cudaVersion -eq 12.3){ Invoke-WebRequest -Uri 'https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_546.12_windows.exe' -OutFile 'cuda_installer.exe' + } elseif ($cudaVersion -eq 12.4){ + Invoke-WebRequest -Uri 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe' -OutFile 'cuda_installer.exe' } else { $cudaUri = Read-Host "Please go to https://developer.nvidia.com/cuda-downloads and input the url of the CUDA version you wish to use" Invoke-WebRequest -Uri $cudaUri -OutFile 'cuda_installer.exe' @@ -71,7 +74,7 @@ if(-not $skipPython){ Write-Output "Creating python3 alias executable" Copy-Item -Path 'C:\Program Files\Python310\python.exe' -Destination 'C:\Program Files\Python310\python3.exe' Write-Output "Done Python installation at 'C:\Program Files\Python310'" - [Environment]::SetEnvironmentVariable('Path', "C:\Program Files\Python310;$env:Path", [EnvironmentVariableTarget]::Machine) + [Environment]::SetEnvironmentVariable('Path', "C:\Program Files\Python310;C:\Program Files\Python310\Scripts;$env:Path", [EnvironmentVariableTarget]::Machine) Add-Content -Path $env:LOCALAPPDATA\trt_env_outlog.txt -Value "0" } else { Write-Output "Python installation already exists" @@ -87,6 +90,10 @@ if (-not ($skipMPI)) { if (-not (Test-Path -Path 'C:\Program Files\Microsoft MPI\Bin')) { Write-Output "Downloading Microsoft MPI not detected" Add-Content -Path $env:LOCALAPPDATA\trt_env_outlog.txt -Value "0" + # The latest version is 10.1.3, but it requires you to get a temporary download + # link. + # https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes + # We use 10.1.1 which has a release on the GitHub page Write-Output "Downloading Microsoft MPI installer" Invoke-WebRequest -Uri 'https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisetup.exe' -OutFile 'msmpisetup.exe' Write-Output "Installing Microsoft MPI" @@ -162,15 +169,42 @@ if(-not $skipCUDNN){ Add-Content -Path $env:LOCALAPPDATA\trt_env_outlog.txt -Value "1" } -Write-Output "Grabbing TensorRT..." 
-$ProgressPreference = 'SilentlyContinue' -New-Item -Path .\TensorRT -ItemType Directory -Invoke-WebRequest -Uri 'https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.windows10.win10.cuda-12.2.llm.beta.zip' -OutFile .\TensorRT\trt.zip -Expand-Archive -Path .\TensorRT\trt.zip -DestinationPath .\TensorRT\Temp -Move-Item -Path .\TensorRT\Temp\TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta\TensorRT-9.3.0.1 -Destination .\TensorRT -Remove-Item -Path .\TensorRT\trt.zip -Force -Remove-Item .\TensorRT\Temp -Force -Recurse -Write-Output "TensorRT installed at .\TensorRT\TensorRT-9.3.0.1" +# Install TensorRT +if (-not ($skipTRT)) { + $TRT_BASE = Join-Path $PWD \TensorRT + if (-not (Test-Path -Path $TRT_BASE)) { + Write-Output "Grabbing TensorRT..." + $ProgressPreference = 'SilentlyContinue' + New-Item -Path .\TensorRT -ItemType Directory + Invoke-WebRequest -Uri 'https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip' -OutFile .\TensorRT\trt.zip + Expand-Archive -Path .\TensorRT\trt.zip -DestinationPath .\TensorRT\ + Remove-Item -Path .\TensorRT\trt.zip -Force + $trtPath = Join-Path $TRT_BASE TensorRT-10.0.1.6 + Write-Output "TensorRT installed at ${trtPath}" + + $trtSubPaths = @{ + "bin" = Join-Path $trtPath bin + "include" = Join-Path $trtPath include + "lib" = Join-Path $trtPath lib + } + + foreach ($key in $trtSubPaths.Keys) { + $subPath = $trtSubPaths[$key] + if (-not (Test-Path -Path $subPath)) { + Write-Error "TensorRT ${key} path ${subPath} does not exist!" + } + } + $TRTEnvVar = $trtSubPaths.Values -join ";" + + [Environment]::SetEnvironmentVariable("TRT", "$TRTEnvVar", [EnvironmentVariableTarget]::Machine) + } else { + Write-Output "TensorRT already present" + Add-Content -Path $env:LOCALAPPDATA\trt_env_outlog.txt -Value "1" + } +} else { + Write-Output "Skipping TRT installation" + Add-Content -Path $env:LOCALAPPDATA\trt_env_outlog.txt -Value "1" +} return $env:Path
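The reworked `setup_env.ps1` above records the TensorRT `bin`, `include`, and `lib` directories in a machine-level `TRT` environment variable joined by `;`, and `destruct_env.ps1` clears it through the new index-6 flag. Below is an illustrative check of that layout, not part of the patch; the helper name is chosen here for the sketch and only the Python standard library is used.

```python
# Illustrative check (not part of the patch) of the TRT environment variable
# layout written by setup_env.ps1: TensorRT bin/include/lib joined by ';'.
import os
from pathlib import Path


def check_trt_env(var_name: str = "TRT") -> bool:
    value = os.environ.get(var_name)
    if not value:
        print(f"{var_name} is not set")
        return False
    paths = [Path(p) for p in value.split(";") if p]
    expected = {"bin", "include", "lib"}
    names = {p.name.lower() for p in paths}
    # The variable should name all three subdirectories and they should exist.
    ok = expected.issubset(names) and all(p.is_dir() for p in paths)
    print(f"{var_name} -> {paths} ({'ok' if ok else 'incomplete'})")
    return ok


if __name__ == "__main__":
    check_trt_env()
```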