Commit

update gh-pages (#2530)
Shixiaowei02 authored Dec 4, 2024
1 parent f8fa42a commit b79ef8a
Showing 205 changed files with 20,373 additions and 14,657 deletions.
7,656 changes: 4,253 additions & 3,403 deletions _cpp_gen/executor.html


5,685 changes: 3,091 additions & 2,594 deletions _cpp_gen/runtime.html


120 changes: 120 additions & 0 deletions _downloads/29c17f8c7171976309d720e2b031e77e/test_debugging_api.py
@@ -0,0 +1,120 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
import torch
from polygraphy.backend.trt import EngineFromNetwork, TrtRunner
from torch import nn

import tensorrt_llm
from tensorrt_llm import Module, Tensor


class TorchMLP(nn.Module):

def __init__(self, hidden_size, ffn_hidden_size, bias=True):
super().__init__()
self.fc = nn.Linear(hidden_size, ffn_hidden_size, bias=bias)
self.proj = nn.Linear(ffn_hidden_size, hidden_size, bias=bias)

def forward(self, hidden_states):
inter = self.fc(hidden_states)
inter = nn.functional.relu(inter)
output = self.proj(inter)
return output, inter


class MLP(Module):

def __init__(self,
hidden_size,
ffn_hidden_size,
bias=True,
tp_group=None,
tp_size=1):
super().__init__()
self.fc = tensorrt_llm.layers.ColumnLinear(hidden_size,
ffn_hidden_size,
bias=bias,
tp_group=tp_group,
tp_size=tp_size,
gather_output=False)
self.proj = tensorrt_llm.layers.RowLinear(ffn_hidden_size,
hidden_size,
bias=bias,
tp_group=tp_group,
tp_size=tp_size)

def forward(self, hidden_states):
inter = self.fc(hidden_states)
inter = tensorrt_llm.functional.relu(inter)
self.register_network_output('inter', inter)
output = self.proj(inter)
return output


class TestDebuggingAPI(unittest.TestCase):

def setUp(self):
tensorrt_llm.logger.set_level('error')

def test_debugging_api(self):
# test data
dtype = 'float32'
hidden_size = 768
x_data = torch.randn(2, 16, hidden_size)

tm = TorchMLP(hidden_size=hidden_size,
ffn_hidden_size=hidden_size * 4,
bias=False)

# construct trt network
builder = tensorrt_llm.Builder()
net = builder.create_network()
with tensorrt_llm.net_guard(net):
x = Tensor(name='x',
shape=x_data.shape,
dtype=tensorrt_llm.str_dtype_to_trt(dtype))

gm = MLP(hidden_size=hidden_size,
ffn_hidden_size=4 * hidden_size,
bias=False)
gm.fc.weight.value = tm.fc.weight.detach().cpu().numpy()
gm.proj.weight.value = tm.proj.weight.detach().cpu().numpy()

output = gm.forward(x)
net._mark_output(output, 'output',
tensorrt_llm.str_dtype_to_trt(dtype))

for k, v in gm.named_network_outputs():
net._mark_output(v, k, tensorrt_llm.str_dtype_to_trt(dtype))

# trt run
build_engine = EngineFromNetwork((builder.trt_builder, net.trt_network))
with TrtRunner(build_engine) as runner:
outputs = runner.infer(feed_dict={'x': x_data.numpy()})

# pytorch run
with torch.no_grad():
ref1, ref2 = tm(x_data)

# compare diff
np.testing.assert_allclose(ref1.cpu().numpy(),
outputs['output'],
atol=1e-5)
np.testing.assert_allclose(ref2.cpu().numpy(),
outputs['inter'],
atol=1e-5)
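
For anyone running this test outside the repository's own harness, a minimal runner sketch follows. It is not part of the committed file and assumes the code above is saved as test_debugging_api.py in the current directory, with tensorrt_llm, polygraphy, and a TensorRT-capable GPU environment installed.

# Hypothetical standalone runner for the test above; not part of the commit.
# Assumes test_debugging_api.py sits in the current working directory.
import unittest

if __name__ == "__main__":
    suite = unittest.TestLoader().discover(".", pattern="test_debugging_api.py")
    unittest.TextTestRunner(verbosity=2).run(suite)
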
86 changes: 69 additions & 17 deletions _downloads/408e9af6e2b04a79e78215bde246e8bc/model.py
@@ -20,7 +20,7 @@
from ..._common import default_net
from ..._utils import pad_vocab_size
from ...functional import (AllReduceFusionOp, AllReduceFusionParams, Tensor,
non_gated_version, recv, send)
allgather, concat, non_gated_version, recv, send)
from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear,
Embedding, GatedMLP, PositionEmbeddingType, RmsNorm)
from ...lora_manager import LoraConfig, use_lora
@@ -34,22 +34,27 @@
from .convert import (load_hf_llama, load_weights_from_gptq,
load_weights_from_hf_by_shard, load_weights_from_hf_model,
load_weights_from_hf_safetensors,
load_weights_from_meta_ckpt)
load_weights_from_lmquant, load_weights_from_meta_ckpt)


class LLaMADecoderLayer(Module):

def __init__(self, config: LLaMAConfig, layer_idx: int):
super().__init__()
self.layer_idx = layer_idx
layer_idx += config.layer_idx_offset
self.config = config
self.mapping = config.mapping

self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
if (self.config.use_input_layernorm_in_first_layer
and self.layer_idx == 0) or self.layer_idx > 0:
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)

layers_range = config.mapping.pp_layers(config.num_hidden_layers)
self.local_layer_idx = layer_idx - layers_range[0]
self.is_last_local_layer = layer_idx == layers_range[-1]
self.attention = Attention(
local_layer_idx=self.local_layer_idx,
hidden_size=config.hidden_size,
@@ -134,7 +139,9 @@ def forward(self,
hidden_states, residual = hidden_states
else:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
if (self.config.use_input_layernorm_in_first_layer
and self.layer_idx == 0) or self.layer_idx > 0:
hidden_states = self.input_layernorm(hidden_states)

attention_output = self.attention(
hidden_states,
@@ -190,9 +197,18 @@ def forward(self,
norm_weight=next_layer_input_layernorm_args[0],
eps=next_layer_input_layernorm_args[1]))
else:
hidden_states = self.mlp(hidden_states,
lora_layer_params=lora_layer_params)
hidden_states = residual + hidden_states
if default_net(
).plugin_config.pp_reduce_scatter and self.is_last_local_layer and not self.mapping.is_last_pp_rank(
):
hidden_states = self.mlp(
hidden_states,
lora_layer_params=lora_layer_params,
last_local_layer_residual=residual)
else:
hidden_states = self.mlp(
hidden_states, lora_layer_params=lora_layer_params)
hidden_states = residual + hidden_states

if use_cache:
return (hidden_states, presents)
return hidden_states
@@ -204,17 +220,29 @@ def __init__(self, config: LLaMAConfig) -> None:
super().__init__()

self.mapping = config.mapping
self.hidden_size = config.hidden_size
if self.mapping.is_first_pp_rank():
self.vocab_embedding = Embedding(config.vocab_size,
config.hidden_size,
dtype=config.dtype)

self.layers = DecoderLayerList(LLaMADecoderLayer, config)

if config.fc_after_embed:
self.fc = ColumnLinear(2 * config.hidden_size,
config.hidden_size,
bias=True,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)

if self.mapping.is_last_pp_rank():
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.ln_f = None
if config.use_last_layernorm:
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)

def forward(self,
input_ids,
@@ -225,6 +253,7 @@ def forward(self,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
hidden_states_for_embed=None,
prompt_embedding_table: Optional[Tensor] = None,
prompt_tasks: Optional[Tensor] = None,
prompt_vocab_size: Optional[Tensor] = None,
@@ -238,6 +267,18 @@ def forward(self,
hidden_states = self.vocab_embedding(input_ids, *ptuning_args)
else:
hidden_states = recv(hidden_states, self.mapping.prev_pp_rank())
if default_net().plugin_config.pp_reduce_scatter:
hidden_states = allgather(hidden_states,
self.mapping.tp_group,
gather_dim=0)
# reshape to (-1, hidden_size)
hidden_states = hidden_states.view(
concat([-1, self.hidden_size]))

if hidden_states_for_embed is not None:
hidden_states = concat([hidden_states, hidden_states_for_embed],
dim=-1)
hidden_states = self.fc(hidden_states)

hidden_states = self.layers.forward(
hidden_states,
Expand All @@ -252,7 +293,8 @@ def forward(self,
hidden_states, presents = hidden_states

if self.mapping.is_last_pp_rank():
hidden_states = self.ln_f(hidden_states)
if self.ln_f:
hidden_states = self.ln_f(hidden_states)
else:
hidden_states = send(hidden_states, self.mapping.next_pp_rank())

@@ -303,9 +345,9 @@ def from_hugging_face(
if "vila" in hf_model_or_dir or "llava" in hf_model_or_dir:
hf_model_or_dir = load_hf_llama(hf_model_or_dir,
load_model_on_cpu)
elif not (load_by_shard or
(has_safetensors(hf_model_or_dir)
and not quant_config.quant_mode.has_any_quant())):
elif not load_by_shard and not has_safetensors(
hf_model_or_dir
) and not quant_config.quant_mode.has_any_quant():
hf_model_or_dir = load_hf_llama(hf_model_or_dir,
load_model_on_cpu)

@@ -351,6 +393,9 @@ def from_hugging_face(
"input_layernorm": "ln_1",
"post_layernorm": "ln_2",
}
elif config.tie_word_embeddings:
custom_dict = {"lm_head": "model.embed_tokens"}

if quant_ckpt_path is not None:
hf_model_dir = quant_ckpt_path

@@ -368,7 +413,14 @@ def from_hugging_face(
hf_model_dir) and not config.quant_mode.has_any_quant():
weights = load_weights_from_hf_safetensors(hf_model_dir, config)
elif quant_ckpt_path is not None:
weights = load_weights_from_gptq(quant_ckpt_path, config)
if quant_config.quant_mode.is_int4_weight_only():
weights = load_weights_from_gptq(quant_ckpt_path, config)
elif quant_config.quant_mode.is_qserve_w4a8():
weights = load_weights_from_lmquant(quant_ckpt_path, config)
else:
raise ValueError(
"quant_ckpt_path should be specified only for GPTQ or QServe"
)
else:
hf_model = load_hf_llama(hf_model_dir, load_model_on_cpu)
weights = load_weights_from_hf_model(hf_model, config)
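
The pp_reduce_scatter path added above replaces the usual all-reduce at the last local decoder layer of a pipeline stage with a reduce-scatter, and the next stage rebuilds the full hidden states with the allgather plus reshape added in the model's forward(). A rough standalone sketch of why that round trip is equivalent (plain NumPy, not TensorRT-LLM code; the TP group size and tensor shapes are assumptions):

# Conceptual sketch only: reduce-scatter on the sending pipeline stage plus
# all-gather on the receiving stage reproduces what a plain all-reduce gives.
import numpy as np

tp_size = 4
hidden_size = 16
partial = np.random.randn(tp_size, 8, hidden_size)  # per-TP-rank partial sums

full = partial.sum(axis=0)                    # all-reduce result, shape (8, 16)

shards = np.split(full, tp_size, axis=0)      # reduce-scatter: each rank keeps one (2, 16) shard
gathered = np.concatenate(shards, axis=0)     # all-gather on the next pipeline stage, gather_dim=0
gathered = gathered.reshape(-1, hidden_size)  # mirrors view(concat([-1, self.hidden_size]))

assert np.allclose(gathered, full)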
