Update TensorRT-LLM (#2582)
kaiyux authored Dec 17, 2024
1 parent b171e87 commit be17881
Showing 267 changed files with 4,505 additions and 4,898 deletions.
.github/workflows/auto-assign.yml (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v2

-      - name: Get Assignee
+      - name: Get assignee
         uses: actions/github-script@v6
         id: get-assignee
         with:
.github/workflows/module-owners.json (3 changes: 2 additions & 1 deletion)
@@ -9,5 +9,6 @@
"Performance": ["kaiyux", "jiahanc", "hypdeb"],
"Lora/P-tuning":["byshiue", "Naveassaf"],
"Disaggregated Serving":["Shixiaowei02", "joyang-nv", "chuangz0", "schetlur-nv"],
"Documentation":["nv-guomingz", "mikemckiernan"]
"Documentation":["nv-guomingz", "mikemckiernan"],
"Windows":["pamelap-nvidia"]
}
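
The ownership map above is what the reworked "Get assignee" step in auto-assign.yml consumes via actions/github-script. The script body is not part of this diff, so the following is only an illustrative sketch under the assumption that the step matches issue labels against module-owners.json and returns the first registered owner; only the file path and the step metadata come from the diff.

```yaml
# Hypothetical sketch of the "Get assignee" step; the actual script is not shown in this diff.
- name: Get assignee
  uses: actions/github-script@v6
  id: get-assignee
  with:
    result-encoding: string
    script: |
      const fs = require('fs');
      // Module label -> owner list, e.g. "Windows": ["pamelap-nvidia"]
      const owners = JSON.parse(
        fs.readFileSync('.github/workflows/module-owners.json', 'utf8'));
      // Labels on the triggering issue (empty if the event carries no issue payload)
      const labels = (context.payload.issue?.labels ?? []).map(l => l.name);
      // Return the first owner registered for the first matching module label
      for (const label of labels) {
        if (owners[label]?.length) return owners[label][0];
      }
      return '';
```

A later step would then typically read `steps.get-assignee.outputs.result` to apply the assignment.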
.pre-commit-config.yaml (4 changes: 2 additions & 2 deletions)
@@ -7,8 +7,8 @@ repos:
     rev: v1.1.13
     hooks:
       - id: remove-crlf
-  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.32.0
+  - repo: https://github.com/google/yapf
+    rev: v0.43.0
     hooks:
       - id: yapf
   - repo: https://github.com/pre-commit/pre-commit-hooks
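With this change, the yapf hook is pinned to the upstream google/yapf repository, which ships its own pre-commit hook definition, rather than the pre-commit/mirrors-yapf mirror. Below is a minimal excerpt of how the updated entry sits in .pre-commit-config.yaml, with the parts not visible in the diff elided.

```yaml
# Excerpt of .pre-commit-config.yaml after this change (other hooks unchanged).
repos:
  # ...remove-crlf and other hooks...
  - repo: https://github.com/google/yapf   # was: https://github.com/pre-commit/mirrors-yapf
    rev: v0.43.0                           # was: v0.32.0
    hooks:
      - id: yapf
  - repo: https://github.com/pre-commit/pre-commit-hooks
    # ...
```

Running `pre-commit run yapf --all-files` reformats the tree against the new pin, and `pre-commit autoupdate` would advance `rev` to later yapf tags.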
README.md (32 changes: 26 additions & 6 deletions)
@@ -8,7 +8,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
 [![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.16.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.17.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/architecture/overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
@@ -17,12 +17,31 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/11/19] Llama 3.2 Full-Stack Optimizations Unlock High Performance on NVIDIA GPUs
[➡️ link](https://developer.nvidia.com/blog/llama-3-2-full-stack-optimizations-unlock-high-performance-on-nvidia-gpus/?ncid=so-link-721194)
+* [2024/12/10] Llama 3.3 70B from AI at Meta is accelerated by TensorRT-LLM. 🌟 A state-of-the-art model on par with Llama 3.1 405B for reasoning, math, instruction following, and tool use. Explore the preview
+[➡️ link](https://build.nvidia.com/meta/llama-3_3-70b-instruct)
<div align="center">
<img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/11/three-llamas-holding-number-10-signs-1.jpg" width="50%">
<img src="https://media.licdn.com/dms/image/v2/D4E10AQEMmE0xCJ6QvQ/image-shrink_800/image-shrink_800/0/1733853602424?e=1734912000&v=beta&t=mjW-P7skGnxGSgMx07IWczyYYC_05RYypB2mEmbgqZQ" width="50%">
<div align="left">

+* [2024/12/03] 🌟 Boost your AI inference throughput by up to 3.6x. NVIDIA TensorRT-LLM now supports speculative decoding, which can triple token throughput. Perfect for your generative AI apps. ⚡ Learn how in this technical deep dive
+[➡️ link](https://nvda.ws/3ZCZTzD)

+* [2024/12/02] Working on deploying ONNX models for performance-critical applications? Try NVIDIA Nsight Deep Learning Designer ⚡ a user-friendly GUI with tight NVIDIA TensorRT integration that offers:
+✅ Intuitive visualization of ONNX model graphs
+✅ Quick tweaking of model architecture and parameters
+✅ Detailed performance profiling with either ORT or TensorRT
+✅ Easy building of TensorRT engines
+[➡️ link](https://developer.nvidia.com/nsight-dl-designer?ncid=so-link-485689&linkId=100000315016072)
+
+* [2024/11/26] 📣 Introducing TensorRT-LLM for Jetson AGX Orin, with initial support in JetPack 6.1 via the v0.12.0-jetson branch of the TensorRT-LLM repo. ✅ Pre-compiled TensorRT-LLM wheels & containers for easy integration ✅ Comprehensive guides & docs to get you started
+[➡️ link](https://forums.developer.nvidia.com/t/tensorrt-llm-for-jetson/313227?linkId=100000312718869)
+
+* [2024/11/21] NVIDIA TensorRT-LLM Multiblock Attention Boosts Throughput by More Than 3x for Long Sequence Lengths on NVIDIA HGX H200
+[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-llm-multiblock-attention-boosts-throughput-by-more-than-3x-for-long-sequence-lengths-on-nvidia-hgx-h200/)
+
+* [2024/11/19] Llama 3.2 Full-Stack Optimizations Unlock High Performance on NVIDIA GPUs
+[➡️ link](https://developer.nvidia.com/blog/llama-3-2-full-stack-optimizations-unlock-high-performance-on-nvidia-gpus/?ncid=so-link-721194)

 * [2024/11/09] 🚀🚀🚀 3x Faster AllReduce with NVSwitch and TensorRT-LLM MultiShot
 [➡️ link](https://developer.nvidia.com/blog/3x-faster-allreduce-with-nvswitch-and-tensorrt-llm-multishot/)

@@ -46,6 +65,9 @@ TensorRT-LLM
 * [2024/10/07] 🚀🚀🚀Optimizing Microsoft Bing Visual Search with NVIDIA Accelerated Libraries
 [➡️ link](https://developer.nvidia.com/blog/optimizing-microsoft-bing-visual-search-with-nvidia-accelerated-libraries/)

+<details close>
+<summary>Previous News</summary>
+
 * [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
 [➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)

@@ -61,8 +83,6 @@ TensorRT-LLM
 * [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
 [➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)

-<details close>
-<summary>Previous News</summary>

 * [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
 [➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)
(Diff for the remaining changed files is not shown here.)
