diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6f535da99..180377d5d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -67,6 +67,7 @@
         "unordered_set": "cpp",
         "future": "cpp",
         "cfenv": "cpp",
-        "typeindex": "cpp"
+        "typeindex": "cpp",
+        "__config": "cpp"
     }
-}
\ No newline at end of file
+}
diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index ce761f75f..db8ccf480 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -463,7 +463,12 @@ void llama_example(const INIReader reader)
     cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
     cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
     printf("seqlBuf: %d\n", seqlBuf[0]);
-
+/*
+golden request:
+1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+golden result:
+1 18637 29892 526 366 1136 455 2470 29973 1815 366 5193 304 592 29973 31489 25709 29251 25143 9777 24957 12623 29013 25302 11973 886 29457 6626 13638 10893 26609 25049 15066 29013 1927 27436 28754 1740 698 24551 25482 31552 22617 1140 293 10146 912
+*/
     {
         std::cout << "Writing " << outCount << " elements\n";
         int zeroCount = 0;
diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu
index 89e04ba89..8d9a65f9e 100644
--- a/src/fastertransformer/kernels/unfused_attention_kernels.cu
+++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu
@@ -1334,6 +1334,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
                                                    const int batch_size,
                                                    const int seq_len,
                                                    const int head_num,
+                                                   const int kv_head_num,
                                                    const int size_per_head,
                                                    const int rotary_embedding_dim,
                                                    const bool neox_rotary_style,
@@ -1395,7 +1396,11 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
     const int prefix_prompt_length = PREFIX_PROMPT ? param.d_prefix_prompt_lengths[batch_idx] : 0;
     const int hidden_idx           = head_idx * size_per_head + tidx * vec_size;
-    const int n                    = head_num * size_per_head;
+    const int kv_repeat_num        = head_num / kv_head_num;
+    const int kv_hidden_idx        = head_idx / kv_repeat_num * size_per_head + tidx * vec_size;
+    const int qkv_size             = head_num * size_per_head + 2 * kv_head_num * size_per_head;
+    const int k_offset             = head_num * size_per_head;
+    const int v_offset             = k_offset + kv_head_num * size_per_head;
     // the [0..seq_len) indices really handle KV [max_pp_len..seq_len+max_pp_len)
     // and Q [0..seq_len)
@@ -1403,10 +1408,10 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
     const int dst_kv_seq_idx = seq_idx + prefix_prompt_length;

     // NOTE: q has seq len excluding prefix prompt
-    // src QKV: [batch, time, 3, head, hidden]
-    const int src_q_idx = token_idx * 3 * n + hidden_idx;
-    const int src_k_idx = token_idx * 3 * n + hidden_idx + n;
-    const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n;
+    // src QKV: [batch, time, head+2*kv_head, hidden]
+    const int src_q_idx = token_idx * qkv_size + hidden_idx;
+    const int src_k_idx = token_idx * qkv_size + kv_hidden_idx + k_offset;
+    const int src_v_idx = token_idx * qkv_size + kv_hidden_idx + v_offset;

     Vec_t q, k, v;
     Vec_t q_bias, k_bias, v_bias;
@@ -1415,14 +1420,17 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
         k = *reinterpret_cast<Vec_t*>(&QKV[src_k_idx]);
         v = *reinterpret_cast<Vec_t*>(&QKV[src_v_idx]);

-        q_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx]);
-        k_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + n]);
-        v_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + 2 * n]);
+        if (qkv_bias) {
+            q_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx]);
+            k_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[kv_hidden_idx + k_offset]);
+            v_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[kv_hidden_idx + v_offset]);
+        }
+    }
+    if (qkv_bias) {
+        q = mmha::add(q, q_bias);
+        k = mmha::add(k, k_bias);
+        v = mmha::add(v, v_bias);
     }
-
-    q = mmha::add(q, q_bias);
-    k = mmha::add(k, k_bias);
-    v = mmha::add(v, v_bias);

     if (!neox_rotary_style) {
         mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rope_theta, dst_kv_seq_idx);
@@ -1495,6 +1503,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
                                        batch_size,                \
                                        seq_len,                   \
                                        head_num,                  \
+                                       kv_head_num,               \
                                        size_per_head,             \
                                        rotary_embedding_dim,      \
                                        neox_rotary_style,         \
@@ -1512,6 +1521,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                     const int seq_len,
                                     const int token_num,
                                     const int head_num,
+                                    const int kv_head_num,
                                     const int size_per_head,
                                     const int rotary_embedding_dim,
                                     const int neox_rotary_style,
@@ -1571,6 +1581,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                         const int seq_len,                  \
                                         const int token_num,                \
                                         const int head_num,                 \
+                                        const int kv_head_num,              \
                                         const int size_per_head,            \
                                         const int rotary_embedding_dim,     \
                                         const int neox_rotary_style,        \
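Note on the index math above (an illustrative sketch, not part of the patch): with grouped-query attention the fused QKV row shrinks from 3 * head_num * size_per_head to (head_num + 2 * kv_head_num) * size_per_head, and every group of head_num / kv_head_num query heads reads the same K/V head through kv_hidden_idx. The host-side sketch below just replays that arithmetic; head_num, kv_head_num, size_per_head, vec_size and the coordinates are made-up example values, and it assumes head_num is divisible by kv_head_num, as the kernel does.

// Illustrative host-side replay of the kernel's GQA index math; example values only.
#include <cassert>
#include <cstdio>

int main()
{
    const int head_num      = 32;   // query heads (example)
    const int kv_head_num   = 8;    // shared K/V heads (example)
    const int size_per_head = 128;
    const int vec_size      = 4;    // elements handled per thread, as in the kernel
    assert(head_num % kv_head_num == 0);  // the kernel assumes an exact multiple

    const int kv_repeat_num = head_num / kv_head_num;  // query heads sharing one K/V head
    const int qkv_size      = head_num * size_per_head + 2 * kv_head_num * size_per_head;
    const int k_offset      = head_num * size_per_head;
    const int v_offset      = k_offset + kv_head_num * size_per_head;

    const int token_idx = 3, head_idx = 13, tidx = 5;  // arbitrary example coordinates

    const int hidden_idx    = head_idx * size_per_head + tidx * vec_size;
    const int kv_hidden_idx = head_idx / kv_repeat_num * size_per_head + tidx * vec_size;

    // Q reads its own head; K and V read the group's shared head inside the same fused row.
    const int src_q_idx = token_idx * qkv_size + hidden_idx;
    const int src_k_idx = token_idx * qkv_size + kv_hidden_idx + k_offset;
    const int src_v_idx = token_idx * qkv_size + kv_hidden_idx + v_offset;

    printf("row width %d: q=%d k=%d v=%d\n", qkv_size, src_q_idx, src_k_idx, src_v_idx);
    return 0;
}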
diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h
index baa074839..b67caf140 100644
--- a/src/fastertransformer/kernels/unfused_attention_kernels.h
+++ b/src/fastertransformer/kernels/unfused_attention_kernels.h
@@ -138,6 +138,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                     seq_len,
                                     token_num,
                                     head_num,
+                                    head_num,
                                     size_per_head,
                                     0,
                                     false,
@@ -177,6 +178,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                     seq_len,
                                     token_num,
                                     head_num,
+                                    head_num,
                                     size_per_head,
                                     rotary_embedding_dim,
                                     neox_rotary_style,
@@ -198,6 +200,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
                                     const int seq_len,
                                     const int token_num,
                                     const int head_num,
+                                    const int kv_head_num,
                                     const int size_per_head,
                                     const int rotary_embedding_dim,
                                     const int neox_rotary_style,
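The two inline wrappers above pass head_num a second time for the new kv_head_num argument, so existing multi-head callers keep identical Q and KV head counts and fall back to the original layout. A minimal sketch of that forwarding pattern follows; gqa_transpose and its argument list are hypothetical stand-ins, not the real invokeAddFusedQKVBiasTranspose signature.

// Hypothetical stand-in showing the header's backward-compatible forwarding:
// the old entry point simply reuses head_num as kv_head_num (classic MHA).
#include <cstdio>

void gqa_transpose(int head_num, int kv_head_num, int size_per_head)
{
    printf("head_num=%d kv_head_num=%d size_per_head=%d\n", head_num, kv_head_num, size_per_head);
}

void gqa_transpose(int head_num, int size_per_head)  // legacy overload
{
    gqa_transpose(head_num, head_num, size_per_head);  // Q and KV head counts stay equal
}

int main()
{
    gqa_transpose(32, 128);     // existing MHA call sites are untouched
    gqa_transpose(32, 8, 128);  // GQA-aware call site passes a smaller kv_head_num
    return 0;
}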
diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc
index 977216650..0531728d6 100644
--- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc
+++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.cc
@@ -138,17 +138,17 @@ void LlamaContextAttentionLayer<T>::forward(TensorMap* output_ten
                               local_qkv_size,  // n
                               attention_input,
                               hidden_units_,  // k
-                              qkv_buf_tmp_,
+                              qkv_buf_,
                               local_qkv_size /* n */);
-        if (local_kv_head_num_ != local_head_num_) {
-            invokeRepeatKv(qkv_buf_,
-                           qkv_buf_tmp_,
-                           local_head_num_,
-                           local_kv_head_num_,
-                           size_per_head_,
-                           m,
-                           stream_);
-        }
+        // if (local_kv_head_num_ != local_head_num_) {
+        //     invokeRepeatKv(qkv_buf_,
+        //                    qkv_buf_tmp_,
+        //                    local_head_num_,
+        //                    local_kv_head_num_,
+        //                    size_per_head_,
+        //                    m,
+        //                    stream_);
+        // }

        // {
        //     const int head_num = 6;
@@ -327,12 +327,13 @@ void LlamaContextAttentionLayer<T>::forward(TensorMap* output_ten
                                            v_buf_2_,
                                            param,  // prefix prompt
                                            qkv_buf_,
-                                           attention_weights->query_weight.bias,
+                                           (T*)(nullptr),
                                            padding_offset,
                                            request_batch_size,
                                            request_seq_len,
                                            m,
                                            local_head_num_,
+                                           local_kv_head_num_,
                                            size_per_head_,
                                            rotary_embedding_dim_,
                                            neox_rotary_style_,
@@ -729,13 +730,8 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size, size_t seq
     // const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T);
     // NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels.
     const auto type_size = sizeof(T);
-    qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true);
-    if (local_kv_head_num_ != local_head_num_) {
-        size_t local_qkv_size = local_hidden_units_ + 2 * local_kv_head_num_ * size_per_head_;
-        qkv_buf_tmp_ = (T*)allocator_->reMalloc(qkv_buf_tmp_, type_size * batch_size * seq_len * local_qkv_size, true);
-    } else {
-        qkv_buf_tmp_ = qkv_buf_;
-    }
+    size_t local_qkv_size = local_hidden_units_ + 2 * local_kv_head_num_ * size_per_head_;
+    qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * batch_size * seq_len * local_qkv_size, true);
     q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true);
     k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_;
     v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_;
@@ -789,9 +785,6 @@ void LlamaContextAttentionLayer<T>::freeBuffer()
     if (is_allocate_buffer_) {
         FT_LOG_DEBUG(__PRETTY_FUNCTION__);
         allocator_->free((void**)(&qkv_buf_));
-        if (local_kv_head_num_ != local_head_num_) {
-            allocator_->free((void**)(&qkv_buf_tmp_));
-        }
         allocator_->free((void**)(&q_buf_2_));
         allocator_->free((void**)(&qk_buf_));
         allocator_->free((void**)(&qkv_buf_2_));
diff --git a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h
index e26aa44f7..d8ab17f86 100644
--- a/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h
+++ b/src/fastertransformer/layers/attention_layers/LlamaContextAttentionLayer.h
@@ -65,7 +65,6 @@ class LlamaContextAttentionLayer: public BaseAttentionLayer<T> {
     using BaseAttentionLayer<T>::stream_;
     using BaseAttentionLayer<T>::sparse_;
     T* qkv_buf_     = nullptr;
-    T* qkv_buf_tmp_ = nullptr;
     T* q_buf_2_     = nullptr;
     T* k_buf_2_     = nullptr;
     T* v_buf_2_     = nullptr;
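Rough sizing check for the allocateBuffer() change above (example numbers, not taken from the patch): with 32 query heads, 8 KV heads and size_per_head = 128, local_hidden_units_ is 4096, so qkv_buf_ goes from 3 * 4096 = 12288 to 4096 + 2 * 8 * 128 = 6144 elements per token, and the separate qkv_buf_tmp_ staging buffer for repeated KV disappears entirely. The sketch below just replays that arithmetic.

// Example-only comparison of the old and new qkv_buf_ element counts.
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t local_head_num    = 32;   // example values, not from the patch
    const std::size_t local_kv_head_num = 8;
    const std::size_t size_per_head     = 128;
    const std::size_t batch_size = 1, seq_len = 512;

    const std::size_t local_hidden_units = local_head_num * size_per_head;

    // before: always 3 * hidden per token, plus a qkv_buf_tmp_ copy when KV heads differ
    const std::size_t old_elems = 3 * batch_size * seq_len * local_hidden_units;

    // after: one buffer sized for head_num Q heads plus 2 * kv_head_num K/V heads
    const std::size_t local_qkv_size = local_hidden_units + 2 * local_kv_head_num * size_per_head;
    const std::size_t new_elems      = batch_size * seq_len * local_qkv_size;

    printf("qkv_buf_: %zu -> %zu elements per batch\n", old_elems, new_elems);
    return 0;
}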
diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc
index 34ad480cf..d190d7e71 100644
--- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc
@@ -144,7 +144,7 @@ void LlamaDecoderLayerWeight<T>::copyFrom(const LlamaDecoderLayerWeight& other)
         cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_);
         cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_);
-        // TODO: 不太清楚这里存的缩放因子对应的是gate_pro_weight 还是给 up_proj/down_proj用的,后面做一下验证,回来再改一下
+        // TODO: unclear whether the scale factor stored here corresponds to gate_pro_weight or is used by up_proj/down_proj; verify and revisit later
         cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_);
         cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_);
     }