From aa9176c004db7da4897c563117bc75a7b7356f89 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 19:19:42 -0700 Subject: [PATCH 01/24] commit --- .../layers/attention_layers/LlamaContextAttentionLayer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index 977216650..f5c4fd9d1 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -318,6 +318,7 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten (size_t)layer_id * 2 * local_head_num_ * size_per_head_}; if (padding_offset != nullptr) { + printf("padding_offset is not null\n"); // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous cudaMemsetAsync( q_buf_2_, 0, request_batch_size * request_seq_len * 3 * local_hidden_units_ * sizeof(T), stream_); From b3e68ec6ff2846ccf01650f1cee81e6a1c8a7033 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 19:20:28 -0700 Subject: [PATCH 02/24] commit --- .../layers/attention_layers/LlamaContextAttentionLayer.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index f5c4fd9d1..977216650 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -318,7 +318,6 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten (size_t)layer_id * 2 * local_head_num_ * size_per_head_}; if (padding_offset != nullptr) { - printf("padding_offset is not null\n"); // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous cudaMemsetAsync( q_buf_2_, 0, request_batch_size * request_seq_len * 3 * local_hidden_units_ * sizeof(T), stream_); From 7645f040f10c9e1fc277c79cc4c953616bd556b1 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 19:20:59 -0700 Subject: [PATCH 03/24] commit --- .../LlamaContextAttentionLayer.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index 977216650..f12ee36c0 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -140,15 +140,15 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten hidden_units_, // k qkv_buf_tmp_, local_qkv_size /* n */); - if (local_kv_head_num_ != local_head_num_) { - invokeRepeatKv(qkv_buf_, - qkv_buf_tmp_, - local_head_num_, - local_kv_head_num_, - size_per_head_, - m, - stream_); - } + // if (local_kv_head_num_ != local_head_num_) { + // invokeRepeatKv(qkv_buf_, + // qkv_buf_tmp_, + // local_head_num_, + // local_kv_head_num_, + // size_per_head_, + // m, + // stream_); + // } // { // const int head_num = 6; From 2bdbed50c90e6fd417e32c310632158f71a2396d Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 19:33:06 -0700 Subject: [PATCH 04/24] commit --- .../layers/attention_layers/LlamaContextAttentionLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index f12ee36c0..16cae4462 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -138,7 +138,7 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten local_qkv_size, // n attention_input, hidden_units_, // k - qkv_buf_tmp_, + qkv_buf_, local_qkv_size /* n */); // if (local_kv_head_num_ != local_head_num_) { // invokeRepeatKv(qkv_buf_, From aaec0dedb4da4b9723de24be4ba54c5472952ec7 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 21:43:34 -0700 Subject: [PATCH 05/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 1 + src/fastertransformer/kernels/unfused_attention_kernels.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 89e04ba89..732f8726f 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1512,6 +1512,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, const int seq_len, const int token_num, const int head_num, + const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, const int neox_rotary_style, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index baa074839..a3bf1e005 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -138,6 +138,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, seq_len, token_num, head_num, + head_num, size_per_head, 0, false, @@ -177,6 +178,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, seq_len, token_num, head_num, + head_num, size_per_head, rotary_embedding_dim, neox_rotary_style, From bbf87916074603824076b528831002c939d0f4ca Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 21:43:52 -0700 Subject: [PATCH 06/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index a3bf1e005..b67caf140 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -200,6 +200,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, const int seq_len, const int token_num, const int head_num, + const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, const int neox_rotary_style, From d81a7dfdd2e52aba30cec9209a2cfeffbf69af7a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 21:55:34 -0700 Subject: [PATCH 07/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 732f8726f..0944ffde4 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1572,6 +1572,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, const int seq_len, \ const int token_num, \ const int head_num, \ + const int kv_head_num, \ const int size_per_head, \ const int rotary_embedding_dim, \ const int neox_rotary_style, \ From ed1e2c7d51c94137a93bcbf345dd16aeaab35512 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 21:59:14 -0700 Subject: [PATCH 08/24] commit --- .../layers/attention_layers/LlamaContextAttentionLayer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index 16cae4462..27f2ba35d 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -333,6 +333,7 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten request_seq_len, m, local_head_num_, + local_kv_head_num_, size_per_head_, rotary_embedding_dim_, neox_rotary_style_, From 8f0292763a36a4cd3be00773a0da5bf92ed92b41 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 22:04:02 -0700 Subject: [PATCH 09/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 0944ffde4..b3075d0e6 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1334,6 +1334,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int batch_size, const int seq_len, const int head_num, + const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, const bool neox_rotary_style, @@ -1495,6 +1496,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* batch_size, \ seq_len, \ head_num, \ + kv_head_num, \ size_per_head, \ rotary_embedding_dim, \ neox_rotary_style, \ From c4705f675352a565b79b9573af35530891bfdf1a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 22:54:00 -0700 Subject: [PATCH 10/24] commit --- .../kernels/unfused_attention_kernels.cu | 28 +++++++++++-------- .../LlamaContextAttentionLayer.cc | 12 ++------ .../LlamaContextAttentionLayer.h | 1 - 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index b3075d0e6..67f85d98e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1396,7 +1396,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int prefix_prompt_length = PREFIX_PROMPT ? param.d_prefix_prompt_lengths[batch_idx] : 0; const int hidden_idx = head_idx * size_per_head + tidx * vec_size; - const int n = head_num * size_per_head; + const int qkv_size = head_num * size_per_head + 2 * kv_head_num * size_per_head; + const int k_offset = head_num * size_per_head; + const int v_offset = k_offset + kv_head_num * size_per_head; // the [0..seq_len) indices really handle KV [max_pp_len..seq_len+max_pp_len) // and Q [0..seq_len) @@ -1404,21 +1406,23 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int dst_kv_seq_idx = seq_idx + prefix_prompt_length; // NOTE: q has seq len excluding prefix prompt - // src QKV: [batch, time, 3, head, hidden] - const int src_q_idx = token_idx * 3 * n + hidden_idx; - const int src_k_idx = token_idx * 3 * n + hidden_idx + n; - const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + // src QKV: [batch, time, head+2*kv_head, hidden] + const int src_q_idx = token_idx * qkv_size + hidden_idx; + const int src_k_idx = token_idx * qkv_size + hidden_idx + k_offset; + const int src_v_idx = token_idx * qkv_size + hidden_idx + v_offset; Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; if (!is_masked) { q = *reinterpret_cast(&QKV[src_q_idx]); - k = *reinterpret_cast(&QKV[src_k_idx]); - v = *reinterpret_cast(&QKV[src_v_idx]); - q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); - k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n]); - v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + 2 * n]); + + if (head_idx < kv_head_num) { + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n]); + v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + 2 * n]); + } } q = mmha::add(q, q_bias); @@ -1477,7 +1481,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int dest_kv_idx = batch_idx * size_per_head * total_seq_len * head_num + head_idx * size_per_head * total_seq_len + dst_kv_seq_idx * size_per_head + tidx * vec_size; - + const int dest_kv_idx = batch_idx * size_per_head * total_seq_len * kv_head_num + + head_idx * size_per_head * total_seq_len + dst_kv_seq_idx * size_per_head + + tidx * vec_size; if (!is_masked) { *reinterpret_cast(&q_buf[dest_q_idx]) = q; *reinterpret_cast(&k_buf[dest_kv_idx]) = k; diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index 27f2ba35d..06babcaef 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -730,13 +730,8 @@ void LlamaContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq // const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T); // NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels. const auto type_size = sizeof(T); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true); - if (local_kv_head_num_ != local_head_num_) { - size_t local_qkv_size = local_hidden_units_ + 2 * local_kv_head_num_ * size_per_head_; - qkv_buf_tmp_ = (T*)allocator_->reMalloc(qkv_buf_tmp_, type_size * batch_size * seq_len * local_qkv_size, true); - } else { - qkv_buf_tmp_ = qkv_buf_; - } + size_t local_qkv_size = local_hidden_units_ + 2 * local_kv_head_num_ * size_per_head_; + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * batch_size * seq_len * local_qkv_size, true); q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true); k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_; v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_; @@ -790,9 +785,6 @@ void LlamaContextAttentionLayer::freeBuffer() if (is_allocate_buffer_) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); allocator_->free((void**)(&qkv_buf_)); - if (local_kv_head_num_ != local_head_num_) { - allocator_->free((void**)(&qkv_buf_tmp_)); - } allocator_->free((void**)(&q_buf_2_)); allocator_->free((void**)(&qk_buf_)); allocator_->free((void**)(&qkv_buf_2_)); diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h index e26aa44f7..d8ab17f86 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h @@ -65,7 +65,6 @@ class LlamaContextAttentionLayer: public BaseAttentionLayer { using BaseAttentionLayer::stream_; using BaseAttentionLayer::sparse_; T* qkv_buf_ = nullptr; - T* qkv_buf_tmp_ = nullptr; T* q_buf_2_ = nullptr; T* k_buf_2_ = nullptr; T* v_buf_2_ = nullptr; From 5c338ffeda1d363db254184a1f706c00839ae970 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 22:59:16 -0700 Subject: [PATCH 11/24] commit --- .../kernels/unfused_attention_kernels.cu | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 67f85d98e..db3dca2c0 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1415,14 +1415,12 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* Vec_t q_bias, k_bias, v_bias; if (!is_masked) { q = *reinterpret_cast(&QKV[src_q_idx]); - q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); - if (head_idx < kv_head_num) { - k = *reinterpret_cast(&QKV[src_k_idx]); - v = *reinterpret_cast(&QKV[src_v_idx]); - k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n]); - v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + 2 * n]); - } + q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); + k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + k_offset]); + v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + v_offset]); } q = mmha::add(q, q_bias); @@ -1481,9 +1479,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int dest_kv_idx = batch_idx * size_per_head * total_seq_len * head_num + head_idx * size_per_head * total_seq_len + dst_kv_seq_idx * size_per_head + tidx * vec_size; - const int dest_kv_idx = batch_idx * size_per_head * total_seq_len * kv_head_num - + head_idx * size_per_head * total_seq_len + dst_kv_seq_idx * size_per_head - + tidx * vec_size; + if (!is_masked) { *reinterpret_cast(&q_buf[dest_q_idx]) = q; *reinterpret_cast(&k_buf[dest_kv_idx]) = k; From 21167b24df0d124855ae9918acb46295a985b9b6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:00:55 -0700 Subject: [PATCH 12/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index db3dca2c0..3ef1d1fe3 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1546,6 +1546,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, int8_mode); } else { + printf("head_num: %d kv_head_num: %d\n", head_num, kv_head_num); FT_CHECK_WITH_INFO(int8_mode != 2, "w8a8 not yet implemented with prefix prompt"); // TODO(mseznec) // To implement rotary embeddings, each thread processes two QKV elems: dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); From 6c4524b6963244b576b9c5a7af471643b5d5c92b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:06:23 -0700 Subject: [PATCH 13/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 3ef1d1fe3..bbcf2955e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1396,6 +1396,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int prefix_prompt_length = PREFIX_PROMPT ? param.d_prefix_prompt_lengths[batch_idx] : 0; const int hidden_idx = head_idx * size_per_head + tidx * vec_size; + const int kv_hidden_idx = head_idx / 8 * size_per_head + tidx * vec_size; const int qkv_size = head_num * size_per_head + 2 * kv_head_num * size_per_head; const int k_offset = head_num * size_per_head; const int v_offset = k_offset + kv_head_num * size_per_head; From 335936283efe392993d065f7f3aca2718a92ec0d Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:06:57 -0700 Subject: [PATCH 14/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index bbcf2955e..0b3fa249a 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1409,8 +1409,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* // NOTE: q has seq len excluding prefix prompt // src QKV: [batch, time, head+2*kv_head, hidden] const int src_q_idx = token_idx * qkv_size + hidden_idx; - const int src_k_idx = token_idx * qkv_size + hidden_idx + k_offset; - const int src_v_idx = token_idx * qkv_size + hidden_idx + v_offset; + const int src_k_idx = token_idx * qkv_size + kv_hidden_idx + k_offset; + const int src_v_idx = token_idx * qkv_size + kv_hidden_idx + v_offset; Vec_t q, k, v; Vec_t q_bias, k_bias, v_bias; From 6d14988dcd39bf9f058941cd1c019226fa65165a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:09:05 -0700 Subject: [PATCH 15/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 0b3fa249a..d61b2d31c 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1396,7 +1396,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int prefix_prompt_length = PREFIX_PROMPT ? param.d_prefix_prompt_lengths[batch_idx] : 0; const int hidden_idx = head_idx * size_per_head + tidx * vec_size; - const int kv_hidden_idx = head_idx / 8 * size_per_head + tidx * vec_size; + const int kv_repeat_num = head_num / kv_head_num; + const int kv_hidden_idx = head_idx / kv_repeat_num * size_per_head + tidx * vec_size; const int qkv_size = head_num * size_per_head + 2 * kv_head_num * size_per_head; const int k_offset = head_num * size_per_head; const int v_offset = k_offset + kv_head_num * size_per_head; From 7aa3f45bf7e732f9c4016a8bbed0231cc6ef1430 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:13:03 -0700 Subject: [PATCH 16/24] commit --- examples/cpp/llama/llama_example.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index ce761f75f..db8ccf480 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -463,7 +463,12 @@ void llama_example(const INIReader reader) cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount); cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount); printf("seqlBuf: %d\n", seqlBuf[0]); - +/* +golden request: +1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +golden result: +1 18637 29892 526 366 1136 455 2470 29973 1815 366 5193 304 592 29973 31489 25709 29251 25143 9777 24957 12623 29013 25302 11973 886 29457 6626 13638 10893 26609 25049 15066 29013 1927 27436 28754 1740 698 24551 25482 31552 22617 1140 293 10146 912 +*/ { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; From b3585346c3f6bfde8d80e1be5d929f37fddb13f1 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:14:45 -0700 Subject: [PATCH 17/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index d61b2d31c..1d75e51a1 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1548,7 +1548,6 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, int8_mode); } else { - printf("head_num: %d kv_head_num: %d\n", head_num, kv_head_num); FT_CHECK_WITH_INFO(int8_mode != 2, "w8a8 not yet implemented with prefix prompt"); // TODO(mseznec) // To implement rotary embeddings, each thread processes two QKV elems: dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); From 5523f1e7aef803f6101d2c1072583e03f7251e82 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:16:29 -0700 Subject: [PATCH 18/24] commit --- src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 34ad480cf..d190d7e71 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -144,7 +144,7 @@ void LlamaDecoderLayerWeight::copyFrom(const LlamaDecoderLayerWeight& other) cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); - // TODO: 不太清楚这里存的缩放因子对应的是gate_pro_weight 还是给 up_proj/down_proj用的,后面做一下验证,回来再改一下 + // TODO: not sure gate_pro_weight corresponds to up_proj or down_proj cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_); } From 81a856cdb46c64a962d71a4b4c75a1fa5a76601b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:23:21 -0700 Subject: [PATCH 19/24] commit --- examples/cpp/llama/llama_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index db8ccf480..4075976ee 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -164,7 +164,7 @@ void llama_example(const INIReader reader) // Handle bad_words dictionary std::vector bad_words; - read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/bad_words.csv", bad_words); + read_word_list("./bad_words.csv", bad_words); int* d_bad_words = nullptr; deviceMalloc(&d_bad_words, bad_words.size(), false); From 8aa0208bf57d8dbce3ace2846539fc950fb30be6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:24:09 -0700 Subject: [PATCH 20/24] commit --- examples/cpp/llama/llama_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 4075976ee..f26e21dc8 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -172,7 +172,7 @@ void llama_example(const INIReader reader) // Handle stop_words dictionary std::vector stop_words; - read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/stop_words.csv", stop_words); + read_word_list("./stop_words.csv", stop_words); const size_t stop_words_len = stop_words.size() / 2; // Tile with same dict for each element @@ -196,7 +196,7 @@ void llama_example(const INIReader reader) max_input_len, end_id, 1, - "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"); + "./start_ids.csv"); int* d_input_ids; From c74dd65bbe8f6638d5fbf85213361430d8066ba5 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 30 Sep 2023 23:24:41 -0700 Subject: [PATCH 21/24] commit --- examples/cpp/llama/llama_example.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index f26e21dc8..db8ccf480 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -164,7 +164,7 @@ void llama_example(const INIReader reader) // Handle bad_words dictionary std::vector bad_words; - read_word_list("./bad_words.csv", bad_words); + read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/bad_words.csv", bad_words); int* d_bad_words = nullptr; deviceMalloc(&d_bad_words, bad_words.size(), false); @@ -172,7 +172,7 @@ void llama_example(const INIReader reader) // Handle stop_words dictionary std::vector stop_words; - read_word_list("./stop_words.csv", stop_words); + read_word_list("/notebooks/FasterTransformer/examples/cpp/llama/stop_words.csv", stop_words); const size_t stop_words_len = stop_words.size() / 2; // Tile with same dict for each element @@ -196,7 +196,7 @@ void llama_example(const INIReader reader) max_input_len, end_id, 1, - "./start_ids.csv"); + "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv"); int* d_input_ids; From 24d66020b511f85cbac082cc677fc98e5e548204 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sun, 1 Oct 2023 17:10:54 -0700 Subject: [PATCH 22/24] commit --- src/fastertransformer/kernels/unfused_attention_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 1d75e51a1..266d4fc93 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1421,8 +1421,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* v = *reinterpret_cast(&QKV[src_v_idx]); q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); - k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + k_offset]); - v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + v_offset]); + k_bias = *reinterpret_cast(&qkv_bias[kv_hidden_idx + k_offset]); + v_bias = *reinterpret_cast(&qkv_bias[kv_hidden_idx + v_offset]); } q = mmha::add(q, q_bias); From c05154751eb13c5ce7369c5946550e80a9e7e9d6 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sun, 1 Oct 2023 17:18:56 -0700 Subject: [PATCH 23/24] commit --- .../kernels/unfused_attention_kernels.cu | 17 ++++++++++------- .../LlamaContextAttentionLayer.cc | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index 266d4fc93..8d9a65f9e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1420,14 +1420,17 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* k = *reinterpret_cast(&QKV[src_k_idx]); v = *reinterpret_cast(&QKV[src_v_idx]); - q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); - k_bias = *reinterpret_cast(&qkv_bias[kv_hidden_idx + k_offset]); - v_bias = *reinterpret_cast(&qkv_bias[kv_hidden_idx + v_offset]); + if (qkv_bias) { + q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); + k_bias = *reinterpret_cast(&qkv_bias[kv_hidden_idx + k_offset]); + v_bias = *reinterpret_cast(&qkv_bias[kv_hidden_idx + v_offset]); + } + } + if (qkv_bias) { + q = mmha::add(q, q_bias); + k = mmha::add(k, k_bias); + v = mmha::add(v, v_bias); } - - q = mmha::add(q, q_bias); - k = mmha::add(k, k_bias); - v = mmha::add(v, v_bias); if (!neox_rotary_style) { mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rope_theta, dst_kv_seq_idx); diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index 06babcaef..2d96d2600 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -327,7 +327,7 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten v_buf_2_, param, // prefix prompt qkv_buf_, - attention_weights->query_weight.bias, + nullptr, padding_offset, request_batch_size, request_seq_len, From 0854463a17b6a5646f138cff62a176bc754e5d2d Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sun, 1 Oct 2023 21:14:25 -0700 Subject: [PATCH 24/24] commit --- .vscode/settings.json | 5 +++-- .../layers/attention_layers/LlamaContextAttentionLayer.cc | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6f535da99..180377d5d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -67,6 +67,7 @@ "unordered_set": "cpp", "future": "cpp", "cfenv": "cpp", - "typeindex": "cpp" + "typeindex": "cpp", + "__config": "cpp" } -} \ No newline at end of file +} diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc index 2d96d2600..0531728d6 100644 --- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc @@ -327,7 +327,7 @@ void LlamaContextAttentionLayer::forward(TensorMap* output_ten v_buf_2_, param, // prefix prompt qkv_buf_, - nullptr, + (T*)(nullptr), padding_offset, request_batch_size, request_seq_len,