Commit

update gh-pages (#2530)
Shixiaowei02 authored Dec 4, 2024
1 parent f8fa42a commit b79ef8a
Showing 205 changed files with 20,373 additions and 14,657 deletions.
7,656 changes: 4,253 additions & 3,403 deletions _cpp_gen/executor.html


5,685 changes: 3,091 additions & 2,594 deletions _cpp_gen/runtime.html


120 changes: 120 additions & 0 deletions _downloads/29c17f8c7171976309d720e2b031e77e/test_debugging_api.py
@@ -0,0 +1,120 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
import torch
from polygraphy.backend.trt import EngineFromNetwork, TrtRunner
from torch import nn

import tensorrt_llm
from tensorrt_llm import Module, Tensor


class TorchMLP(nn.Module):

def __init__(self, hidden_size, ffn_hidden_size, bias=True):
super().__init__()
self.fc = nn.Linear(hidden_size, ffn_hidden_size, bias=bias)
self.proj = nn.Linear(ffn_hidden_size, hidden_size, bias=bias)

def forward(self, hidden_states):
inter = self.fc(hidden_states)
inter = nn.functional.relu(inter)
output = self.proj(inter)
return output, inter


class MLP(Module):

def __init__(self,
hidden_size,
ffn_hidden_size,
bias=True,
tp_group=None,
tp_size=1):
super().__init__()
self.fc = tensorrt_llm.layers.ColumnLinear(hidden_size,
ffn_hidden_size,
bias=bias,
tp_group=tp_group,
tp_size=tp_size,
gather_output=False)
self.proj = tensorrt_llm.layers.RowLinear(ffn_hidden_size,
hidden_size,
bias=bias,
tp_group=tp_group,
tp_size=tp_size)

def forward(self, hidden_states):
inter = self.fc(hidden_states)
inter = tensorrt_llm.functional.relu(inter)
self.register_network_output('inter', inter)
output = self.proj(inter)
return output


class TestDebuggingAPI(unittest.TestCase):

def setUp(self):
tensorrt_llm.logger.set_level('error')

def test_debugging_api(self):
# test data
dtype = 'float32'
hidden_size = 768
x_data = torch.randn(2, 16, hidden_size)

tm = TorchMLP(hidden_size=hidden_size,
ffn_hidden_size=hidden_size * 4,
bias=False)

# construct trt network
builder = tensorrt_llm.Builder()
net = builder.create_network()
with tensorrt_llm.net_guard(net):
x = Tensor(name='x',
shape=x_data.shape,
dtype=tensorrt_llm.str_dtype_to_trt(dtype))

gm = MLP(hidden_size=hidden_size,
ffn_hidden_size=4 * hidden_size,
bias=False)
gm.fc.weight.value = tm.fc.weight.detach().cpu().numpy()
gm.proj.weight.value = tm.proj.weight.detach().cpu().numpy()

output = gm.forward(x)
net._mark_output(output, 'output',
tensorrt_llm.str_dtype_to_trt(dtype))

for k, v in gm.named_network_outputs():
net._mark_output(v, k, tensorrt_llm.str_dtype_to_trt(dtype))

# trt run
build_engine = EngineFromNetwork((builder.trt_builder, net.trt_network))
with TrtRunner(build_engine) as runner:
outputs = runner.infer(feed_dict={'x': x_data.numpy()})

# pytorch run
with torch.no_grad():
ref1, ref2 = tm(x_data)

# compare diff
np.testing.assert_allclose(ref1.cpu().numpy(),
outputs['output'],
atol=1e-5)
np.testing.assert_allclose(ref2.cpu().numpy(),
outputs['inter'],
atol=1e-5)
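
For anyone running this test outside the repository's own harness, a minimal runner sketch follows. It is not part of the committed file and assumes the code above is saved as test_debugging_api.py in the current directory, with tensorrt_llm, polygraphy, and a TensorRT-capable GPU environment installed.

# Hypothetical standalone runner for the test above; not part of the commit.
# Assumes test_debugging_api.py sits in the current working directory.
import unittest

if __name__ == "__main__":
    suite = unittest.TestLoader().discover(".", pattern="test_debugging_api.py")
    unittest.TextTestRunner(verbosity=2).run(suite)
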
86 changes: 69 additions & 17 deletions _downloads/408e9af6e2b04a79e78215bde246e8bc/model.py
@@ -20,7 +20,7 @@
from ..._common import default_net
from ..._utils import pad_vocab_size
from ...functional import (AllReduceFusionOp, AllReduceFusionParams, Tensor,
non_gated_version, recv, send)
allgather, concat, non_gated_version, recv, send)
from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear,
Embedding, GatedMLP, PositionEmbeddingType, RmsNorm)
from ...lora_manager import LoraConfig, use_lora
@@ -34,22 +34,27 @@
from .convert import (load_hf_llama, load_weights_from_gptq,
load_weights_from_hf_by_shard, load_weights_from_hf_model,
load_weights_from_hf_safetensors,
load_weights_from_meta_ckpt)
load_weights_from_lmquant, load_weights_from_meta_ckpt)


class LLaMADecoderLayer(Module):

def __init__(self, config: LLaMAConfig, layer_idx: int):
super().__init__()
self.layer_idx = layer_idx
layer_idx += config.layer_idx_offset
self.config = config
self.mapping = config.mapping

self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
if (self.config.use_input_layernorm_in_first_layer
and self.layer_idx == 0) or self.layer_idx > 0:
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)

layers_range = config.mapping.pp_layers(config.num_hidden_layers)
self.local_layer_idx = layer_idx - layers_range[0]
self.is_last_local_layer = layer_idx == layers_range[-1]
self.attention = Attention(
local_layer_idx=self.local_layer_idx,
hidden_size=config.hidden_size,
@@ -134,7 +139,9 @@ def forward(self,
hidden_states, residual = hidden_states
else:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
if (self.config.use_input_layernorm_in_first_layer
and self.layer_idx == 0) or self.layer_idx > 0:
hidden_states = self.input_layernorm(hidden_states)

attention_output = self.attention(
hidden_states,
@@ -190,9 +197,18 @@ def forward(self,
norm_weight=next_layer_input_layernorm_args[0],
eps=next_layer_input_layernorm_args[1]))
else:
hidden_states = self.mlp(hidden_states,
lora_layer_params=lora_layer_params)
hidden_states = residual + hidden_states
if default_net(
).plugin_config.pp_reduce_scatter and self.is_last_local_layer and not self.mapping.is_last_pp_rank(
):
hidden_states = self.mlp(
hidden_states,
lora_layer_params=lora_layer_params,
last_local_layer_residual=residual)
else:
hidden_states = self.mlp(
hidden_states, lora_layer_params=lora_layer_params)
hidden_states = residual + hidden_states

if use_cache:
return (hidden_states, presents)
return hidden_states
@@ -204,17 +220,29 @@ def __init__(self, config: LLaMAConfig) -> None:
super().__init__()

self.mapping = config.mapping
self.hidden_size = config.hidden_size
if self.mapping.is_first_pp_rank():
self.vocab_embedding = Embedding(config.vocab_size,
config.hidden_size,
dtype=config.dtype)

self.layers = DecoderLayerList(LLaMADecoderLayer, config)

if config.fc_after_embed:
self.fc = ColumnLinear(2 * config.hidden_size,
config.hidden_size,
bias=True,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)

if self.mapping.is_last_pp_rank():
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.ln_f = None
if config.use_last_layernorm:
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)

def forward(self,
input_ids,
@@ -225,6 +253,7 @@ def forward(self,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
hidden_states_for_embed=None,
prompt_embedding_table: Optional[Tensor] = None,
prompt_tasks: Optional[Tensor] = None,
prompt_vocab_size: Optional[Tensor] = None,
@@ -238,6 +267,18 @@ def forward(self,
hidden_states = self.vocab_embedding(input_ids, *ptuning_args)
else:
hidden_states = recv(hidden_states, self.mapping.prev_pp_rank())
if default_net().plugin_config.pp_reduce_scatter:
hidden_states = allgather(hidden_states,
self.mapping.tp_group,
gather_dim=0)
# reshape to (-1, hidden_size)
hidden_states = hidden_states.view(
concat([-1, self.hidden_size]))

if hidden_states_for_embed is not None:
hidden_states = concat([hidden_states, hidden_states_for_embed],
dim=-1)
hidden_states = self.fc(hidden_states)

hidden_states = self.layers.forward(
hidden_states,
Expand All @@ -252,7 +293,8 @@ def forward(self,
hidden_states, presents = hidden_states

if self.mapping.is_last_pp_rank():
hidden_states = self.ln_f(hidden_states)
if self.ln_f:
hidden_states = self.ln_f(hidden_states)
else:
hidden_states = send(hidden_states, self.mapping.next_pp_rank())

@@ -303,9 +345,9 @@ def from_hugging_face(
if "vila" in hf_model_or_dir or "llava" in hf_model_or_dir:
hf_model_or_dir = load_hf_llama(hf_model_or_dir,
load_model_on_cpu)
elif not (load_by_shard or
(has_safetensors(hf_model_or_dir)
and not quant_config.quant_mode.has_any_quant())):
elif not load_by_shard and not has_safetensors(
hf_model_or_dir
) and not quant_config.quant_mode.has_any_quant():
hf_model_or_dir = load_hf_llama(hf_model_or_dir,
load_model_on_cpu)

@@ -351,6 +393,9 @@ def from_hugging_face(
"input_layernorm": "ln_1",
"post_layernorm": "ln_2",
}
elif config.tie_word_embeddings:
custom_dict = {"lm_head": "model.embed_tokens"}

if quant_ckpt_path is not None:
hf_model_dir = quant_ckpt_path

@@ -368,7 +413,14 @@ def from_hugging_face(
hf_model_dir) and not config.quant_mode.has_any_quant():
weights = load_weights_from_hf_safetensors(hf_model_dir, config)
elif quant_ckpt_path is not None:
weights = load_weights_from_gptq(quant_ckpt_path, config)
if quant_config.quant_mode.is_int4_weight_only():
weights = load_weights_from_gptq(quant_ckpt_path, config)
elif quant_config.quant_mode.is_qserve_w4a8():
weights = load_weights_from_lmquant(quant_ckpt_path, config)
else:
raise ValueError(
"quant_ckpt_path should be specified only for GPTQ or QServe"
)
else:
hf_model = load_hf_llama(hf_model_dir, load_model_on_cpu)
weights = load_weights_from_hf_model(hf_model, config)
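
The pp_reduce_scatter path added above replaces the usual all-reduce at the last local decoder layer of a pipeline stage with a reduce-scatter, and the next stage rebuilds the full hidden states with the allgather plus reshape added in the model's forward(). A rough standalone sketch of why that round trip is equivalent (plain NumPy, not TensorRT-LLM code; the TP group size and tensor shapes are assumptions):

# Conceptual sketch only: reduce-scatter on the sending pipeline stage plus
# all-gather on the receiving stage reproduces what a plain all-reduce gives.
import numpy as np

tp_size = 4
hidden_size = 16
partial = np.random.randn(tp_size, 8, hidden_size)  # per-TP-rank partial sums

full = partial.sum(axis=0)                    # all-reduce result, shape (8, 16)

shards = np.split(full, tp_size, axis=0)      # reduce-scatter: each rank keeps one (2, 16) shard
gathered = np.concatenate(shards, axis=0)     # all-gather on the next pipeline stage, gather_dim=0
gathered = gathered.reshape(-1, hidden_size)  # mirrors view(concat([-1, self.hidden_size]))

assert np.allclose(gathered, full)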
