From dadb1044438cc919e07572809ac5a710653462a8 Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sat, 15 Jul 2023 18:34:38 +0800
Subject: [PATCH 01/10] add Falcon 40B model support

---
 crates/ggml/src/context.rs      |  6 +++
 crates/llm/Cargo.toml           |  2 +-
 crates/models/falcon/src/lib.rs | 91 +++++++++++++++++++++------------
 3 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index e3d24017..763f707a 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -473,6 +473,12 @@ impl Context {
         let tensor = unsafe { sys::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) };
         self.new_tensor_raw(tensor)
     }
+
+    /// flash attention.
+    pub fn op_flash_attn(&self, q: &Tensor, k: &Tensor, v: &Tensor, masked: bool) -> Tensor {
+        let tensor = unsafe { sys::ggml_flash_attn(self.ptr.as_ptr(), q.ptr.as_ptr(), k.ptr.as_ptr(), v.ptr.as_ptr(), masked) };
+        self.new_tensor_raw(tensor)
+    }
 }
 
 impl Drop for Context {
diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml
index 1d7f688f..83960b06 100644
--- a/crates/llm/Cargo.toml
+++ b/crates/llm/Cargo.toml
@@ -33,7 +33,7 @@ default = ["models", "tokenizers-remote"]
 tokenizers-remote = ["llm-base/tokenizers-remote"]
 
-models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"]
+models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"]
 llama = ["dep:llm-llama"]
 gpt2 = ["dep:llm-gpt2"]
 gptj = ["dep:llm-gptj"]
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 2ac269ef..eae7250b 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -64,10 +64,30 @@ impl KnownModel for Falcon {
         let lm_head = tl.load("lm_head.weight")?;
 
         let mut layers = Vec::new();
+        // utilizing n_head_kv to determine the model version (parameters)
+        let Hyperparameters {
+            n_head_kv,
+            ..
+        } = hyperparameters;
         for i in 0..hyperparameters.n_layer {
+            let (input_layernorm_name, attention_norm_name) = if n_head_kv == 1 {
+                // falcon 7b
+                (
+                    format!("transformer.h.{i}.input_layernorm"),
+                    None,
+                )
+            } else {
+                // falcon 40b
+                (
+                    format!("transformer.h.{i}.ln_mlp"),
+                    Some(format!("transformer.h.{i}.ln_attn")),
+                )
+            };
             let layer = Layer {
-                attention_norm: tl.load(&format!("transformer.h.{i}.input_layernorm.weight"))?,
-                attention_norm_b: tl.load(&format!("transformer.h.{i}.input_layernorm.bias"))?,
+                input_layernorm: tl.load(&format!("{}.weight", input_layernorm_name))?,
+                input_layernorm_b: tl.load(&format!("{}.bias", input_layernorm_name))?,
+                attention_norm: attention_norm_name.as_ref().map(|path| tl.load(&format!("{}.bias", path))).transpose()?,
+                attention_norm_b: attention_norm_name.map(|path| tl.load(&format!("{}.bias", path))).transpose()?,
 
                 query_key_value: tl.load(&format!(
                     "transformer.h.{i}.self_attention.query_key_value.weight"
@@ -123,6 +143,7 @@ impl KnownModel for Falcon {
         let Hyperparameters {
             n_embd,
             n_head,
+            n_head_kv,
             n_vocab,
             n_layer,
             ..
@@ -163,18 +184,29 @@ impl KnownModel for Falcon {
             current = ctx0.op_norm(&input_layer);
             current = ctx0.op_add(
                 &ctx0.op_mul(
-                    &ctx0.op_repeat(&self.layers[il].attention_norm, &current),
+                    &ctx0.op_repeat(&self.layers[il].input_layernorm, &current),
                     &current,
                 ),
-                &ctx0.op_repeat(&self.layers[il].attention_norm_b, &current),
+                &ctx0.op_repeat(&self.layers[il].input_layernorm_b, &current),
             );
 
             layernorm_output = current.share();
 
+            // Falcon-40B only
+            if n_head_kv != 1 {
+                current = ctx0.op_add(
+                    &ctx0.op_mul(
+                        &ctx0.op_repeat(&self.layers[il].attention_norm.as_ref().unwrap(), &current),
+                        &current,
+                    ),
+                    &ctx0.op_repeat(&self.layers[il].attention_norm_b.as_ref().unwrap(), &current),
+                );
+            }
+
             // compute QKV
             current = ctx0.op_mul_mat(&self.layers[il].query_key_value, &current);
 
-            let fused_qkv_row_nb = (n_embd + 2 * (n_embd / n_head)) * f32_size;
+            let fused_qkv_row_nb = head_dim * (n_head + 2 * n_head_kv) * f32_size;
 
             let mut qcur = ctx0.op_view_3d(
                 &current,
@@ -185,16 +217,16 @@ impl KnownModel for Falcon {
 
             let mut kcur = ctx0.op_view_3d(
                 &current,
-                (head_dim, 1, n),
+                (head_dim, n_head_kv, n),
                 (head_dim * f32_size, fused_qkv_row_nb),
-                n_embd * f32_size,
+                head_dim * n_head * f32_size,
             );
 
             let vcur = ctx0.op_view_3d(
                 &current,
-                (head_dim, 1, n),
+                (head_dim, n_head_kv, n),
                 (head_dim * f32_size, fused_qkv_row_nb),
-                (n_embd + head_dim) * f32_size,
+                head_dim * (n_head + n_head_kv) * f32_size,
             );
 
             // using mode = 2 for neox mode
@@ -205,13 +237,13 @@ impl KnownModel for Falcon {
 
             let k = ctx0.op_view_1d(
                 memory_k,
-                n * head_dim,
-                (memory_k_size * head_dim) * (il * ctx_size + session_len),
+                n * n_head_kv * head_dim,
+                (memory_k_size * n_head_kv * head_dim) * (il * ctx_size + session_len),
             );
             let v = ctx0.op_view_1d(
                 memory_v,
-                n * head_dim,
-                (memory_v_size * head_dim) * (il * ctx_size + session_len),
+                n * n_head_kv * head_dim,
+                (memory_v_size * n_head_kv * head_dim) * (il * ctx_size + session_len),
             );
 
             gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k));
@@ -224,28 +256,16 @@ impl KnownModel for Falcon {
                 &ctx0.op_reshape_3d(
                     &ctx0.op_view_1d(
                         memory_k,
-                        (session_len + n) * head_dim,
-                        il * ctx_size * memory_k_size * head_dim,
+                        (session_len + n) * n_head_kv * head_dim,
+                        il * ctx_size * memory_k_size * n_head_kv * head_dim,
                     ),
                     head_dim,
-                    1,
+                    n_head_kv,
                     session_len + n,
                 ),
                 (0, 2, 1, 3),
             );
-            // K * Q
             bigk = ctx0.op_cont(&ctx0.op_repeat(&bigk, &repeat_dummy));
-            let big_kq = ctx0.op_mul_mat(&bigk, &bigq);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            let big_kq_scaled = ctx0.op_scale_inplace(
-                &big_kq,
-                &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)),
-            );
-
-            let big_kq_masked = ctx0.op_diag_mask_inf_inplace(&big_kq_scaled, session_len);
-
-            let big_kq_softmax = ctx0.op_soft_max_inplace(&big_kq_masked);
 
             let mut bigv = ctx0.op_permute(
                 &ctx0.op_reshape_3d(
@@ -262,8 +282,7 @@ impl KnownModel for Falcon {
             );
             bigv = ctx0.op_cont(&ctx0.op_transpose(&ctx0.op_repeat(&bigv, &repeat_dummy)));
 
-            // KQV = transpose(V) * KQ_soft_max
-            let big_kqv = ctx0.op_mul_mat(&bigv, &big_kq_softmax);
+            let big_kqv = ctx0.op_flash_attn(&bigq, &bigk, &bigv, true);
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             let big_kqv_merged = ctx0.op_permute(&big_kqv, (0, 2, 1, 3));
@@ -366,6 +385,8 @@ pub struct Hyperparameters {
     n_embd: usize,
     /// n_heads
     n_head: usize,
+    // Number of heads for key-value pairs
+    n_head_kv: usize,
     /// Number of layers in the model
     n_layer: usize,
     /// file_type
@@ -378,6 +399,7 @@ impl llm_base::Hyperparameters for Hyperparameters {
             n_vocab: util::read_i32(reader)?.try_into()?,
             n_embd: util::read_i32(reader)?.try_into()?,
             n_head: util::read_i32(reader)?.try_into()?,
+            n_head_kv: util::read_i32(reader)?.try_into()?,
             n_layer: util::read_i32(reader)?.try_into()?,
             file_type: util::read_filetype(reader)?,
         };
@@ -389,6 +411,7 @@ impl llm_base::Hyperparameters for Hyperparameters {
         util::write_i32(writer, self.n_vocab.try_into()?)?;
         util::write_i32(writer, self.n_embd.try_into()?)?;
         util::write_i32(writer, self.n_head.try_into()?)?;
+        util::write_i32(writer, self.n_head_kv.try_into()?)?;
         util::write_i32(writer, self.n_layer.try_into()?)?;
         util::write_i32(writer, self.file_type.into())?;
         Ok(())
@@ -409,8 +432,12 @@ impl llm_base::Hyperparameters for Hyperparameters {
 
 struct Layer {
     // normalization
-    attention_norm: Tensor,
-    attention_norm_b: Tensor,
+    input_layernorm: Tensor,
+    input_layernorm_b: Tensor,
+
+    // Falcon-40B only
+    attention_norm: Option<Tensor>,
+    attention_norm_b: Option<Tensor>,
 
     // attention
     query_key_value: Tensor,

From e28e0ef9c4aab6aa74fa411bf83c6fcf2d8aee7c Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sat, 15 Jul 2023 18:53:16 +0800
Subject: [PATCH 02/10] disable falcon by default

---
 crates/llm/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml
index 83960b06..1d7f688f 100644
--- a/crates/llm/Cargo.toml
+++ b/crates/llm/Cargo.toml
@@ -33,7 +33,7 @@ default = ["models", "tokenizers-remote"]
 tokenizers-remote = ["llm-base/tokenizers-remote"]
 
-models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"]
+models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"]
 llama = ["dep:llm-llama"]
 gpt2 = ["dep:llm-gpt2"]
 gptj = ["dep:llm-gptj"]

From 5b757aaedff7d79cbec2e75ba55c0b469301d7c9 Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sat, 15 Jul 2023 19:29:17 +0800
Subject: [PATCH 03/10] fix formatting

---
 crates/ggml/src/context.rs      | 10 +++++++++-
 crates/models/falcon/src/lib.rs | 29 +++++++++++++++++------------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 763f707a..b6d81e1d 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -476,7 +476,15 @@ impl Context {
 
     /// flash attention.
     pub fn op_flash_attn(&self, q: &Tensor, k: &Tensor, v: &Tensor, masked: bool) -> Tensor {
-        let tensor = unsafe { sys::ggml_flash_attn(self.ptr.as_ptr(), q.ptr.as_ptr(), k.ptr.as_ptr(), v.ptr.as_ptr(), masked) };
+        let tensor = unsafe {
+            sys::ggml_flash_attn(
+                self.ptr.as_ptr(),
+                q.ptr.as_ptr(),
+                k.ptr.as_ptr(),
+                v.ptr.as_ptr(),
+                masked,
+            )
+        };
         self.new_tensor_raw(tensor)
     }
 }
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index eae7250b..c7950bc9 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -65,17 +65,11 @@ impl KnownModel for Falcon {
 
         let mut layers = Vec::new();
         // utilizing n_head_kv to determine the model version (parameters)
-        let Hyperparameters {
-            n_head_kv,
-            ..
-        } = hyperparameters;
+        let Hyperparameters { n_head_kv, .. } = hyperparameters;
         for i in 0..hyperparameters.n_layer {
             let (input_layernorm_name, attention_norm_name) = if n_head_kv == 1 {
                 // falcon 7b
-                (
-                    format!("transformer.h.{i}.input_layernorm"),
-                    None,
-                )
+                (format!("transformer.h.{i}.input_layernorm"), None)
             } else {
                 // falcon 40b
                 (
@@ -86,8 +80,13 @@ impl KnownModel for Falcon {
             let layer = Layer {
                 input_layernorm: tl.load(&format!("{}.weight", input_layernorm_name))?,
                 input_layernorm_b: tl.load(&format!("{}.bias", input_layernorm_name))?,
-                attention_norm: attention_norm_name.as_ref().map(|path| tl.load(&format!("{}.bias", path))).transpose()?,
-                attention_norm_b: attention_norm_name.map(|path| tl.load(&format!("{}.bias", path))).transpose()?,
+                attention_norm: attention_norm_name
+                    .as_ref()
+                    .map(|path| tl.load(&format!("{}.bias", path)))
+                    .transpose()?,
+                attention_norm_b: attention_norm_name
+                    .map(|path| tl.load(&format!("{}.bias", path)))
+                    .transpose()?,
 
                 query_key_value: tl.load(&format!(
                     "transformer.h.{i}.self_attention.query_key_value.weight"
@@ -196,10 +195,16 @@ impl KnownModel for Falcon {
             if n_head_kv != 1 {
                 current = ctx0.op_add(
                     &ctx0.op_mul(
-                        &ctx0.op_repeat(&self.layers[il].attention_norm.as_ref().unwrap(), &current),
+                        &ctx0.op_repeat(
+                            &self.layers[il].attention_norm.as_ref().unwrap(),
+                            &current,
+                        ),
+                        &current,
+                    ),
+                    &ctx0.op_repeat(
+                        &self.layers[il].attention_norm_b.as_ref().unwrap(),
                         &current,
                     ),
-                    &ctx0.op_repeat(&self.layers[il].attention_norm_b.as_ref().unwrap(), &current),
                 );
             }
 

From 334e8beb4c7e5b627d67a37c46d1a2c84f0a4db5 Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sat, 15 Jul 2023 19:40:15 +0800
Subject: [PATCH 04/10] remove needless borrow

---
 crates/models/falcon/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index c7950bc9..a6d43137 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -196,13 +196,13 @@ impl KnownModel for Falcon {
                 current = ctx0.op_add(
                     &ctx0.op_mul(
                         &ctx0.op_repeat(
-                            &self.layers[il].attention_norm.as_ref().unwrap(),
+                            self.layers[il].attention_norm.as_ref().unwrap(),
                             &current,
                         ),
                         &current,
                     ),
                     &ctx0.op_repeat(
-                        &self.layers[il].attention_norm_b.as_ref().unwrap(),
+                        self.layers[il].attention_norm_b.as_ref().unwrap(),
                         &current,
                     ),
                 );

From 5c40a195d6c3b1f769de0dbb0a2686083b01e195 Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sun, 16 Jul 2023 13:22:17 +0800
Subject: [PATCH 05/10] fix bot token id

---
 crates/llm/Cargo.toml           | 2 +-
 crates/models/falcon/src/lib.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml
index 1d7f688f..83960b06 100644
--- a/crates/llm/Cargo.toml
+++ b/crates/llm/Cargo.toml
@@ -33,7 +33,7 @@ default = ["models", "tokenizers-remote"]
 tokenizers-remote = ["llm-base/tokenizers-remote"]
 
-models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"]
+models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"]
 llama = ["dep:llm-llama"]
 gpt2 = ["dep:llm-gpt2"]
 gptj = ["dep:llm-gptj"]
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index a6d43137..110f1e37 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -365,7 +365,7 @@ impl KnownModel for Falcon {
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
-        None
+        self.tokenizer.id(">>ABSTRACT<<".as_bytes())
     }
 
     fn eot_token_id(&self) -> TokenId {

From 71c3273f61aa09c1d714250aa2a02fe8ff2d62e9 Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sun, 16 Jul 2023 17:17:41 +0800
Subject: [PATCH 06/10] disable falcon by default

---
 crates/llm/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml
index 83960b06..1d7f688f 100644
--- a/crates/llm/Cargo.toml
+++ b/crates/llm/Cargo.toml
@@ -33,7 +33,7 @@ default = ["models", "tokenizers-remote"]
 tokenizers-remote = ["llm-base/tokenizers-remote"]
 
-models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"]
+models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"]
 llama = ["dep:llm-llama"]
 gpt2 = ["dep:llm-gpt2"]
 gptj = ["dep:llm-gptj"]

From 565ca6db4f381b3d6eed063799efc34a762c714c Mon Sep 17 00:00:00 2001
From: skirodev
Date: Sun, 16 Jul 2023 17:31:00 +0800
Subject: [PATCH 07/10] fix attention_norm weight error

---
 crates/models/falcon/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 110f1e37..103ede50 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -82,7 +82,7 @@ impl KnownModel for Falcon {
                 input_layernorm_b: tl.load(&format!("{}.bias", input_layernorm_name))?,
                 attention_norm: attention_norm_name
                     .as_ref()
-                    .map(|path| tl.load(&format!("{}.bias", path)))
+                    .map(|path| tl.load(&format!("{}.weight", path)))
                     .transpose()?,
                 attention_norm_b: attention_norm_name
                     .map(|path| tl.load(&format!("{}.bias", path)))

From a804a5f26edb424922657c434c5713a735aca62e Mon Sep 17 00:00:00 2001
From: skirodev
Date: Mon, 17 Jul 2023 11:49:13 +0800
Subject: [PATCH 08/10] fix bigv

---
 crates/ggml/src/context.rs      |  2 +-
 crates/models/falcon/src/lib.rs | 26 ++++++++++++++------------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 4f488ca2..33dbb00e 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -566,7 +566,7 @@ impl Context {
     pub fn op_flash_attn(&self, q: &Tensor, k: &Tensor, v: &Tensor, masked: bool) -> Tensor {
         let tensor = unsafe {
             sys::ggml_flash_attn(
-                self.ptr.as_ptr(),
+                self.as_ptr(),
                 q.ptr.as_ptr(),
                 k.ptr.as_ptr(),
                 v.ptr.as_ptr(),
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 8e116938..01c99e92 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -175,19 +175,21 @@ impl KnownModel for Falcon {
             ctx0.use_scratch(builder.get_scratch(0));
 
             // self-attention
-            current = ctx0.op_norm(&input_layer);
-            current = ctx0.op_add(
+            layernorm_output = ctx0.op_norm(&input_layer);
+            layernorm_output = ctx0.op_add(
                 &ctx0.op_mul(
-                    &ctx0.op_repeat(&self.layers[il].input_layernorm, &current),
-                    &current,
+                    &ctx0.op_repeat(&self.layers[il].input_layernorm, &layernorm_output),
+                    &layernorm_output,
                 ),
-                &ctx0.op_repeat(&self.layers[il].input_layernorm_b, &current),
+                &ctx0.op_repeat(&self.layers[il].input_layernorm_b, &layernorm_output),
             );
 
-            layernorm_output = current.share();
-
-            // Falcon-40B only
-            if n_head_kv != 1 {
+            if n_head_kv == 1 {
+                // Falcon-7B only
+                current = layernorm_output.share();
+            }else{
+                // Falcon-40B only
+                current = ctx0.op_norm(&input_layer);
                 current = ctx0.op_add(
                     &ctx0.op_mul(
                         &ctx0.op_repeat(
@@ -271,11 +273,11 @@ impl KnownModel for Falcon {
                 &ctx0.op_reshape_3d(
                     &ctx0.op_view_1d(
                         memory_v,
-                        (session_len + n) * head_dim,
-                        il * ctx_size * memory_v_size * head_dim,
+                        (session_len + n) * n_head_kv * head_dim,
+                        il * ctx_size * memory_v_size * n_head_kv * head_dim,
                     ),
                     head_dim,
-                    1,
+                    n_head_kv,
                     session_len + n,
                 ),
                 (0, 2, 1, 3),

From 191ec023b5e78f48fec85bd4fd8875e1f17cb21f Mon Sep 17 00:00:00 2001
From: skirodev
Date: Mon, 17 Jul 2023 12:10:21 +0800
Subject: [PATCH 09/10] fix formatting

---
 crates/models/falcon/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 01c99e92..362d3891 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -187,7 +187,7 @@ impl KnownModel for Falcon {
             if n_head_kv == 1 {
                 // Falcon-7B only
                 current = layernorm_output.share();
-            }else{
+            } else {
                 // Falcon-40B only
                 current = ctx0.op_norm(&input_layer);
                 current = ctx0.op_add(

From 0afd18ece35dec938ea4f306852d2105430e8609 Mon Sep 17 00:00:00 2001
From: skirodev
Date: Wed, 26 Jul 2023 23:22:14 +0800
Subject: [PATCH 10/10] remove bos token id and use float16 kv memory type

---
 crates/models/falcon/src/lib.rs | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 362d3891..b647b361 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -150,12 +150,6 @@ impl KnownModel for Falcon {
         let ctx0 = builder.ctx0.borrow();
         let embd = builder.embd;
         let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd);
-        let repeat_dummy = ctx0.new_tensor_3d(
-            input_layer.get_type(),
-            head_dim,
-            input_len + session_len,
-            n_head,
-        );
 
         let f32_size = std::mem::size_of::<f32>();
 
@@ -254,7 +248,7 @@ impl KnownModel for Falcon {
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             let bigq = ctx0.op_permute(&qcur, (0, 2, 1, 3));
 
-            let mut bigk = ctx0.op_permute(
+            let bigk = ctx0.op_permute(
                 &ctx0.op_reshape_3d(
                     &ctx0.op_view_1d(
                         memory_k,
@@ -267,19 +261,19 @@ impl KnownModel for Falcon {
                 ),
                 (0, 2, 1, 3),
             );
-            bigk = ctx0.op_cont(&ctx0.op_repeat(&bigk, &repeat_dummy));
+
+            // K * Q
+            let big_kq = ctx0.op_mul_mat(&bigk, &bigq);
+
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            let big_kq_scaled = ctx0.op_scale_inplace(
+                &big_kq,
+                &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)),
+            );
+
+            let big_kq_masked = ctx0.op_diag_mask_inf_inplace(&big_kq_scaled, session_len);
+
+            let big_kq_softmax = ctx0.op_soft_max_inplace(&big_kq_masked);
 
             let mut bigv = ctx0.op_permute(
                 &ctx0.op_reshape_3d(
@@ -282,9 +288,9 @@ impl KnownModel for Falcon {
                 ),
                 (0, 2, 1, 3),
             );
-            bigv = ctx0.op_cont(&ctx0.op_transpose(&ctx0.op_repeat(&bigv, &repeat_dummy)));
+            bigv = ctx0.op_cont(&ctx0.op_transpose(&bigv));
 
-            let big_kqv = ctx0.op_flash_attn(&bigq, &bigk, &bigv, true);
+            let big_kqv = ctx0.op_mul_mat(&bigv, &big_kq_softmax);
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             let big_kqv_merged = ctx0.op_permute(&big_kqv, (0, 2, 1, 3));
@@ -362,7 +368,7 @@ impl KnownModel for Falcon {
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
-        self.tokenizer.id(">>ABSTRACT<<".as_bytes())
+        None
     }
 
     fn eot_token_id(&self) -> TokenId {