Initial KV RingAttention code #684

Open · wants to merge 109 commits into base: master

Commits (109)

b186a77
test minimal changes
joshpopelka20 Aug 12, 2024
152a41c
add to struct
joshpopelka20 Aug 12, 2024
636a48f
Merge branch 'EricLBuehler:master' into master
joshpopelka20 Aug 14, 2024
3e1f47b
add chunks logic
joshpopelka20 Aug 14, 2024
bba0fdd
Merge branch 'master' of https://github.com/joshpopelka20/mistral.rs
joshpopelka20 Aug 14, 2024
04ee7de
clone chunks
joshpopelka20 Aug 14, 2024
83ab8f2
clone x for chunk
joshpopelka20 Aug 14, 2024
36d5eb2
remove chunk to device
joshpopelka20 Aug 14, 2024
c1a973a
push x
joshpopelka20 Aug 14, 2024
e5aee16
fix x move
joshpopelka20 Aug 14, 2024
7aeb80c
dont clone chunks
joshpopelka20 Aug 14, 2024
00f019f
unwrap chunk
joshpopelka20 Aug 14, 2024
4b836bc
change to reference
joshpopelka20 Aug 14, 2024
3763fee
iter
joshpopelka20 Aug 14, 2024
32d3b0d
pop chunks
joshpopelka20 Aug 14, 2024
40a502b
clone x
joshpopelka20 Aug 14, 2024
c0c87e4
change to vec new
joshpopelka20 Aug 14, 2024
728e838
store tensor reference
joshpopelka20 Aug 14, 2024
daf1028
extract by index
joshpopelka20 Aug 14, 2024
ccc6c50
remove unwrap
joshpopelka20 Aug 14, 2024
2ca9acc
clone
joshpopelka20 Aug 14, 2024
e822953
mutably borrow
joshpopelka20 Aug 14, 2024
086e76f
derefernce
joshpopelka20 Aug 14, 2024
ce3d418
create vec of tensors
joshpopelka20 Aug 14, 2024
ffbe3f9
make new vec
joshpopelka20 Aug 14, 2024
9f91594
type tensor
joshpopelka20 Aug 14, 2024
c88cdf5
push to chunks
joshpopelka20 Aug 14, 2024
0eb85f5
self chunks
joshpopelka20 Aug 14, 2024
b7edfbe
create vec of chunks
joshpopelka20 Aug 14, 2024
8732bbc
clone x
joshpopelka20 Aug 14, 2024
9c961f3
remove reference
joshpopelka20 Aug 14, 2024
1aaca72
clone for move
joshpopelka20 Aug 14, 2024
4ab98c8
remove clone
joshpopelka20 Aug 14, 2024
2e0b2fd
add back clone
joshpopelka20 Aug 14, 2024
7bb3cf7
change to copy
joshpopelka20 Aug 14, 2024
f433517
unwrap copy
joshpopelka20 Aug 14, 2024
cf5b204
remove copy
joshpopelka20 Aug 14, 2024
9e0e6c8
use my candle
joshpopelka20 Aug 14, 2024
03be02a
mvoe back to EricLBuehler
joshpopelka20 Aug 14, 2024
d75ee88
move back to josh
joshpopelka20 Aug 14, 2024
f79ef6f
revert candle
joshpopelka20 Aug 14, 2024
4b9ed28
remove copy mapper
joshpopelka20 Aug 14, 2024
b54a5af
clone chunks
joshpopelka20 Aug 14, 2024
a50883b
copy instead of clone
joshpopelka20 Aug 14, 2024
edf82da
move loggers
joshpopelka20 Aug 14, 2024
3e9cc26
add sequence parallelism
joshpopelka20 Aug 22, 2024
30f6b40
add IndexOp import
joshpopelka20 Aug 22, 2024
7e23976
only use chunk on first block index
joshpopelka20 Aug 26, 2024
f20005a
split input into multiple chunks
joshpopelka20 Aug 26, 2024
9d0b6ce
add missing variable block_chunks
joshpopelka20 Aug 26, 2024
0c6a64c
use each chunk first
joshpopelka20 Aug 26, 2024
86e1e54
clone x in accumulated attention
joshpopelka20 Aug 26, 2024
535e5c7
change mapper with block_chunks
joshpopelka20 Aug 26, 2024
8d55784
give block chunks a type
joshpopelka20 Aug 26, 2024
f738105
make as type tensor
joshpopelka20 Aug 26, 2024
4addbb5
move block chunks
joshpopelka20 Aug 26, 2024
d6ffb10
add to accumulated attention
joshpopelka20 Aug 26, 2024
61b9b8a
unwrap x
joshpopelka20 Aug 26, 2024
c1cc882
&tensor
joshpopelka20 Aug 26, 2024
f87ead1
fix block_chunks
joshpopelka20 Aug 26, 2024
f665810
make generic type
joshpopelka20 Aug 26, 2024
8140413
fix blocks_chunks to device
joshpopelka20 Aug 26, 2024
23af80c
another fix for concat block_chunks
joshpopelka20 Aug 26, 2024
dd689e3
remove ? operator
joshpopelka20 Aug 26, 2024
2106933
replace with try_collect
joshpopelka20 Aug 26, 2024
71fdd71
change type of block_chunks
joshpopelka20 Aug 26, 2024
0b129fa
clone to move blcok_chunks
joshpopelka20 Aug 26, 2024
c09b459
remove add
joshpopelka20 Aug 26, 2024
d201134
switch to four devices
joshpopelka20 Aug 27, 2024
c5b4fde
fix compile error with &
joshpopelka20 Aug 27, 2024
79f7606
uodate metadata device
joshpopelka20 Aug 28, 2024
f50a159
add kv cache rotation
joshpopelka20 Aug 28, 2024
b913fee
add missing num_caches
joshpopelka20 Aug 28, 2024
0a7b422
fix compile error
joshpopelka20 Aug 28, 2024
c98dcb7
clone mapper
joshpopelka20 Aug 28, 2024
962f744
remove clone
joshpopelka20 Sep 3, 2024
7cd3503
clone reference
joshpopelka20 Sep 3, 2024
ea04012
return tensor
joshpopelka20 Sep 3, 2024
a57e1c9
remove borrow
joshpopelka20 Sep 3, 2024
b69edcc
fix value moved
joshpopelka20 Sep 3, 2024
7cfb29d
borrow on accumulate
joshpopelka20 Sep 3, 2024
da65eb2
add logging
joshpopelka20 Sep 3, 2024
9c5cd38
more logging
joshpopelka20 Sep 3, 2024
cdd480d
fix chunk to device chunk
joshpopelka20 Sep 3, 2024
4eb4775
remove concat block_chunks
joshpopelka20 Sep 3, 2024
ee27e98
move cache to chunk device
joshpopelka20 Sep 3, 2024
57ae1d8
fix error in masker
joshpopelka20 Sep 3, 2024
34bf2d1
move all to block device
joshpopelka20 Sep 3, 2024
a20d7a4
change to block device
joshpopelka20 Sep 3, 2024
042c0a1
change block device args
joshpopelka20 Sep 3, 2024
770806a
add device to block
joshpopelka20 Sep 3, 2024
0b3c911
fix llama struct
joshpopelka20 Sep 3, 2024
bcf6f84
revert blocks device
joshpopelka20 Sep 3, 2024
9945a8d
revert to device chunk
joshpopelka20 Sep 3, 2024
8d0bc24
add block device
joshpopelka20 Sep 3, 2024
06525fe
add reference
joshpopelka20 Sep 3, 2024
a03670d
update tensor device
joshpopelka20 Sep 3, 2024
deabd31
borrow device chunk
joshpopelka20 Sep 3, 2024
3170304
more logging
joshpopelka20 Sep 3, 2024
1e3e55d
log logits
joshpopelka20 Sep 3, 2024
4fab476
try to clone out all caches
joshpopelka20 Sep 3, 2024
5863802
add logging in cacher
joshpopelka20 Sep 3, 2024
ddcd848
revert clone out cache
joshpopelka20 Sep 3, 2024
d9ac7ec
skip clone out
joshpopelka20 Sep 3, 2024
81cd584
have cache out do nothing
joshpopelka20 Sep 3, 2024
1456c72
fix syntax
joshpopelka20 Sep 3, 2024
d52cdd8
remove clone in cache
joshpopelka20 Sep 3, 2024
bf80940
remove loggers
joshpopelka20 Sep 3, 2024
a4dcd1e
test speculative
joshpopelka20 Sep 3, 2024
mistralrs-core/src/models/llama.rs (207 changes: 180 additions & 27 deletions)
@@ -1,6 +1,6 @@
#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]

use candle_core::{quantized::QMatMul, DType, Device, Result, Tensor};
use candle_core::{quantized::QMatMul, DType, Device, Result, Tensor, IndexOp};
use candle_nn::{embedding, Embedding, Module, VarBuilder};
use mistralrs_quant::{QuantMethod, QuantizedConfig};
use serde::Deserialize;
@@ -361,10 +361,15 @@ impl Block {
metadata,
)? + residual)?;
let residual = &x;
let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?;
// let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?;
let x = self.rms_2.forward(&x)?;
Ok(x)
}

fn get_device(&self) -> Device {
self.mlp.dtype_device().1
}

fn load(
vb: VarBuilder,
cfg: &Config,
@@ -405,7 +410,9 @@ pub struct Llama {
blocks: Vec<Block>,
ln_f: RmsNorm,
lm_head: QMatMul,
pub kv_cache: crate::pipeline::Cache,
// pub kv_cache: crate::pipeline::Cache,
pub kv_caches: Vec<crate::pipeline::Cache>,
cuda_devices: Vec<candle_core::Device>,
pub device: Device,
mapper: Box<dyn DeviceMapper + Send + Sync>,
cfg: ModelConfigMetadata,
@@ -421,30 +428,160 @@ impl Llama {
mut metadata: Option<(Vec<(Tensor, Tensor)>, &mut PagedAttentionInputMetadata)>,
) -> Result<Tensor> {
let mut x = self.wte.forward(input_ids)?;
let mut cache = self.kv_cache.lock();
let mask = CausalMasker.make_causal_mask_as_attn_bias(
input_ids,
metadata
.as_ref()
.map(|(_, _)| &seqlen_offsets as &dyn PastKvLenCache)
.unwrap_or(&*cache as &dyn PastKvLenCache),
x.dtype(),
self.blocks[0].attn.num_attention_heads,
)?;
let (batch_size, seq_len, hidden_size) = x.dims3()?;

let num_devices = 4;
let chunk_size = seq_len / num_devices;

let mut chunks: Vec<Tensor> = Vec::with_capacity(num_devices);
// chunks.push(x.copy().unwrap());

// Handle the case where sequence length is less than number of devices
if seq_len <= num_devices {
for j in 0..seq_len {
// let chunk = x.i((.., j..j+1, ..))?;
let chunk = x.clone();
chunks.push(chunk.to_device(&self.cuda_devices[j])?);
}
} else {
for j in 0..num_devices {
let start = j * chunk_size;
let end = if j == num_devices - 1 {
seq_len
} else {
(j+ 1) * chunk_size
};

let chunk = x.i((.., start..end,..))?;
let device = &self.cuda_devices[j];
chunks.push(chunk.to_device(&device)?);
}
}

// let mut cache = self.kv_caches[0].lock();
let mut processed_chunks = Vec::new();
let mut target_device = &self.cuda_devices[0];

let mut block_chunks: Vec<Tensor> = Vec::new();

for (block_idx, block) in self.blocks.iter().enumerate() {
x = self.mapper.map(x, block_idx)?;
x = block.forward(
&x,
&mask.clone().map(|m| m.to_device(x.device()).unwrap()),
seqlen_offsets,
start_offsets_kernel.clone(),
block_idx,
&mut cache,
metadata
.as_mut()
.map(|(kv_cache, metadata)| (kv_cache[block_idx].clone(), &mut **metadata)),
)?;

let device_chunk = block.get_device();
// x = self.mapper.map(x, block_idx)?;
// x = self.mapper.map(&chunks[0], block_idx)?;
// println!("block_idx {:?}", block_idx);
// println!("chunk device {:?}", chunks[0].device());
for (chunk_idx, chunk) in chunks.iter().enumerate() {
// println!("chunk_idx {:?}", chunk_idx);
let mut x = if block_idx == 0 {
let tensor = chunk.clone();
self.mapper.map(tensor.clone(), block_idx)?;
tensor.to_device(&device_chunk)?
} else {
let tensor = block_chunks[chunk_idx].clone();
self.mapper.map(tensor.clone(), block_idx)?;
tensor.to_device(&device_chunk)?
};

let num_caches = self.kv_caches.len();

for cache_rotation in 0..num_caches {
let cache_idx = (chunk_idx + cache_rotation) % num_caches;
let kv_cache = &self.kv_caches[cache_idx];
// println!("cache_idx {:?}", cache_idx);
let mut cache = kv_cache.lock();


// Determine the original device of the cache
let original_cache_device = cache.iter().find_map(|opt| {
opt.as_ref().map(|(k, _)| k.device().clone())
}).unwrap_or_else(|| device_chunk.clone());

// Move cache to chunk device
let mut cache_on_chunk_device: Vec<_> = cache.iter().map(|opt| {
opt.as_ref().map(|(k, v)| {
(k.to_device(&device_chunk).unwrap(), v.to_device(&device_chunk).unwrap())
})
}).collect();

let mask = CausalMasker.make_causal_mask_as_attn_bias(
input_ids,
metadata
.as_ref()
.map(|(_, _)| &seqlen_offsets as &dyn PastKvLenCache)
.unwrap_or(&*cache as &dyn PastKvLenCache),
// x.dtype(),
chunks[0].dtype(),
self.blocks[0].attn.num_attention_heads,
)?;




// x = block.forward(
// &x,
// &mask.clone().map(|m| m.to_device(x.device()).unwrap()),
// seqlen_offsets,
// start_offsets_kernel.clone(),
// block_idx,
// &mut cache,
// metadata
// .as_mut()
// .map(|(kv_cache, metadata)| (kv_cache[block_idx].clone(), &mut **metadata)),
// )?;

// println!("before block forward");
x = block.forward(
&x,
&mask.clone().map(|m| m.to_device(&device_chunk).unwrap()),
seqlen_offsets,
start_offsets_kernel.clone().to_device(&device_chunk)?,
block_idx,
// &mut cache_on_chunk_device,
&mut cache_on_chunk_device,
metadata
.as_mut()
.map(|(kv_cache, metadata)| {
let (tensor1, tensor2) = kv_cache[block_idx].clone();
(
(tensor1.to_device(&device_chunk).unwrap(), tensor2.to_device(&device_chunk).unwrap()),
&mut **metadata
)
}),
)?;

// println!("after block forward");

// Accumulate attention results
if block_chunks.len() <= chunk_idx {
block_chunks.push(x.clone());
} else {
block_chunks[chunk_idx] = x.clone();
}
}
}

// Concatenate chunks for this block
// let block_chunks: Result<Vec<Tensor>> = block_chunks
// .clone()
// .into_iter()
// .map(|chunk| chunk.to_device(&device_chunk))
// .collect();

// let block_chunks = block_chunks?; // Propagate any errors

// println!("concat block chunks");
let mut x = candle_core::Tensor::cat(&block_chunks, 1)?;

// do feedforward after attention has been run for each chunk
let residual = x.clone();
let mut x = block.mlp.forward(&x)?;
x = (x + &residual)?;
x = x.to_device(&target_device)?;
processed_chunks.push(x.clone());
}
// println!("concat processed chunks");
x = candle_core::Tensor::cat(&processed_chunks, 1)?;
let x = x.to_device(&self.device)?;
let mut x = self.ln_f.forward(&x)?;
if matches!(self.lm_head, QMatMul::QTensor(_)) {
@@ -468,6 +605,9 @@ impl Llama {
quant_cfg.bits
);
}

let num_devices = 4;
let mut cuda_devices = Vec::with_capacity(num_devices);
let mapper = normal_loading_metadata
.mapper
.into_mapper(cfg.num_hidden_layers, &normal_loading_metadata.real_device)?;
@@ -514,6 +654,9 @@ impl Llama {
.expect("Failed to create PagedAttention"),
),
};
if !cuda_devices.iter().any(|d| format!("{:?}", d) == format!("{:?}", device)) {
cuda_devices.push(device.clone());
}
Block::load(
vb.pp(&format!("model.layers.{i}")),
cfg,
@@ -527,12 +670,21 @@ impl Llama {
})
.collect();

let mut kv_caches: Vec<crate::pipeline::Cache> = Vec::with_capacity(num_devices);

for device_id in 0..num_devices {
let cache = crate::pipeline::Cache::new(cfg.num_hidden_layers , false);
kv_caches.push(cache);
};

Ok(Self {
wte,
blocks,
ln_f,
lm_head: QMatMul::Tensor(lm_head.weight().clone()),
kv_cache: crate::pipeline::Cache::new(cfg.num_hidden_layers, false),
// kv_cache: crate::pipeline::Cache::new(cfg.num_hidden_layers, false),
kv_caches,
cuda_devices,
device: normal_loading_metadata.real_device,
mapper,
cfg: ModelConfigMetadata {
@@ -623,7 +775,8 @@ impl NormalModel for Llama {
unimplemented!()
}
fn cache(&self) -> &crate::pipeline::Cache {
&self.kv_cache
&self.kv_caches[0]
// &self.kv_cache
}
fn device(&self) -> &Device {
&self.device
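
For orientation, a minimal standalone sketch of the schedule the new forward pass implements: the sequence is split into one chunk per device, the last chunk absorbing any remainder, and for every block each chunk visits every KV cache in ring order via (chunk_idx + cache_rotation) % num_caches. This is an illustration, not code from the PR; the example sequence length and the printed output are assumptions made for the sketch.

// Standalone sketch of the chunk/ring schedule from the forward pass above
// (assumes four devices, matching the hard-coded num_devices = 4 in the diff).
fn chunk_ranges(seq_len: usize, num_devices: usize) -> Vec<(usize, usize)> {
    if seq_len <= num_devices {
        // Short-sequence branch: one range per token position (the commented-out
        // x.i((.., j..j+1, ..)) line in the diff suggests this per-position slice).
        return (0..seq_len).map(|j| (j, j + 1)).collect();
    }
    let chunk_size = seq_len / num_devices;
    (0..num_devices)
        .map(|j| {
            let start = j * chunk_size;
            // The last chunk absorbs the remainder when seq_len % num_devices != 0.
            let end = if j == num_devices - 1 { seq_len } else { (j + 1) * chunk_size };
            (start, end)
        })
        .collect()
}

fn main() {
    let (seq_len, num_devices) = (10, 4);
    let ranges = chunk_ranges(seq_len, num_devices);
    println!("chunk ranges: {ranges:?}"); // [(0, 2), (2, 4), (4, 6), (6, 10)]

    // Ring rotation: for every block, each chunk visits every KV cache once,
    // starting at its own index, as in (chunk_idx + cache_rotation) % num_caches.
    let num_caches = num_devices;
    for chunk_idx in 0..ranges.len() {
        let visits: Vec<usize> = (0..num_caches)
            .map(|rotation| (chunk_idx + rotation) % num_caches)
            .collect();
        println!("chunk {chunk_idx} visits caches {visits:?}");
    }
}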
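
A second sketch, assuming candle_core as used in the repository, illustrates the tensor side of the same idea: slice the hidden states along the sequence dimension with IndexOp, do some per-chunk work, then re-join the pieces with Tensor::cat on dimension 1. The per-chunk work is a placeholder affine (identity) op rather than the block's attention, and everything stays on Device::Cpu so the sketch runs without GPUs.

use candle_core::{DType, Device, IndexOp, Result, Tensor};

fn main() -> Result<()> {
    // Stand-in for the per-chunk CUDA devices the PR collects in cuda_devices.
    let dev = Device::Cpu;
    let (batch, seq_len, hidden) = (1usize, 10usize, 8usize);
    let x = Tensor::zeros((batch, seq_len, hidden), DType::F32, &dev)?;

    let num_devices = 4;
    let chunk_size = seq_len / num_devices;

    // Slice the hidden states along the sequence dimension, one chunk per device.
    let mut chunks: Vec<Tensor> = Vec::with_capacity(num_devices);
    for j in 0..num_devices {
        let start = j * chunk_size;
        let end = if j == num_devices - 1 { seq_len } else { (j + 1) * chunk_size };
        let chunk = x.i((.., start..end, ..))?;
        chunks.push(chunk.to_device(&dev)?); // in the PR: to_device(&self.cuda_devices[j])
    }

    // Placeholder per-chunk work; the PR runs the attention half of each block here.
    let processed: Vec<Tensor> = chunks
        .iter()
        .map(|c| c.affine(1.0, 0.0)) // identity-like op as a stand-in
        .collect::<Result<Vec<_>>>()?;

    // Re-join along the sequence dimension, as the diff does with Tensor::cat(&block_chunks, 1).
    let y = Tensor::cat(&processed, 1)?;
    assert_eq!(y.dims3()?, (batch, seq_len, hidden));
    Ok(())
}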