From ce7ffd14a76c263ee39f89ccefc7e82c4f6faf54 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Thu, 22 Jun 2023 10:19:03 +0200 Subject: [PATCH 01/28] Update LLama.cpp --- crates/ggml/src/lib.rs | 2 +- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/lib.rs | 169 ++++++++++++++++++++++++++++--------- 3 files changed, 132 insertions(+), 41 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 79623a69..54bb3992 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -399,5 +399,5 @@ pub fn cpu_has_gpublas() -> bool { /// Sets the name of a tensor. pub fn set_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); - unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) } + unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()); } } diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 8ab8ba62..bbca06e2 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 8ab8ba62eb27cc340be2edf3418e051b1d967416 +Subproject commit bbca06e26949686d61a5126332680ba3cccf235c diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index a9a64bfd..c90b9874 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -100,42 +100,46 @@ pub const ggml_op_GGML_OP_NEG: ggml_op = 18; pub const ggml_op_GGML_OP_STEP: ggml_op = 19; pub const ggml_op_GGML_OP_RELU: ggml_op = 20; pub const ggml_op_GGML_OP_GELU: ggml_op = 21; -pub const ggml_op_GGML_OP_SILU: ggml_op = 22; -pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 23; -pub const ggml_op_GGML_OP_NORM: ggml_op = 24; -pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 25; -pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 26; -pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 27; -pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 28; -pub const ggml_op_GGML_OP_SCALE: ggml_op = 29; -pub const ggml_op_GGML_OP_SET: ggml_op = 30; -pub const ggml_op_GGML_OP_CPY: ggml_op = 31; -pub const ggml_op_GGML_OP_CONT: ggml_op = 32; -pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 33; -pub const ggml_op_GGML_OP_VIEW: ggml_op = 34; -pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 35; -pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 36; -pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 37; -pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 38; -pub const ggml_op_GGML_OP_DIAG: ggml_op = 39; -pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 40; -pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 41; -pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 42; -pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 43; -pub const ggml_op_GGML_OP_ROPE: ggml_op = 44; -pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 45; -pub const ggml_op_GGML_OP_ALIBI: ggml_op = 46; -pub const ggml_op_GGML_OP_CLAMP: ggml_op = 47; -pub const ggml_op_GGML_OP_CONV_1D_1S: ggml_op = 48; -pub const ggml_op_GGML_OP_CONV_1D_2S: ggml_op = 49; -pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 50; -pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 51; -pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 52; -pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 53; -pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 54; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 55; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 56; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 57; +pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 22; +pub const ggml_op_GGML_OP_SILU: ggml_op = 23; +pub const ggml_op_GGML_OP_SILU_BACK: ggml_op 
= 24; +pub const ggml_op_GGML_OP_NORM: ggml_op = 25; +pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 26; +pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 27; +pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 28; +pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 29; +pub const ggml_op_GGML_OP_SCALE: ggml_op = 30; +pub const ggml_op_GGML_OP_SET: ggml_op = 31; +pub const ggml_op_GGML_OP_CPY: ggml_op = 32; +pub const ggml_op_GGML_OP_CONT: ggml_op = 33; +pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 34; +pub const ggml_op_GGML_OP_VIEW: ggml_op = 35; +pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 36; +pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 37; +pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 38; +pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 39; +pub const ggml_op_GGML_OP_DIAG: ggml_op = 40; +pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 41; +pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 42; +pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 43; +pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 44; +pub const ggml_op_GGML_OP_ROPE: ggml_op = 45; +pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 46; +pub const ggml_op_GGML_OP_ALIBI: ggml_op = 47; +pub const ggml_op_GGML_OP_CLAMP: ggml_op = 48; +pub const ggml_op_GGML_OP_CONV_1D_S1_PH: ggml_op = 49; +pub const ggml_op_GGML_OP_CONV_1D_S2_PH: ggml_op = 50; +pub const ggml_op_GGML_OP_CONV_2D_SK_P0: ggml_op = 51; +pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 52; +pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 53; +pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 54; +pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 55; +pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 56; +pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 57; +pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 58; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 59; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 60; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 61; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -939,7 +943,10 @@ extern "C" { pub fn ggml_get_name(tensor: *const ggml_tensor) -> *const ::std::os::raw::c_char; } extern "C" { - pub fn ggml_set_name(tensor: *mut ggml_tensor, name: *const ::std::os::raw::c_char); + pub fn ggml_set_name( + tensor: *mut ggml_tensor, + name: *const ::std::os::raw::c_char, + ) -> *mut ggml_tensor; } extern "C" { pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; @@ -1001,6 +1008,13 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_sub_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_mul( ctx: *mut ggml_context, @@ -1008,6 +1022,13 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_mul_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_div( ctx: *mut ggml_context, @@ -1015,12 +1036,25 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_div_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_sqr(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_sqr_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_sqrt(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn 
ggml_sqrt_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_log(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1053,24 +1087,52 @@ extern "C" { extern "C" { pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_abs_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_sgn(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_sgn_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_neg(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_neg_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_step(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_step_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_relu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_gelu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_gelu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_gelu_quick(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_gelu_quick_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) + -> *mut ggml_tensor; +} extern "C" { pub fn ggml_silu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_silu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_silu_back( ctx: *mut ggml_context, @@ -1081,9 +1143,15 @@ extern "C" { extern "C" { pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_rms_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_rms_norm_back( ctx: *mut ggml_context, @@ -1394,14 +1462,21 @@ extern "C" { ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_1s( + pub fn ggml_conv_1d_s1_ph( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_2s( + pub fn ggml_conv_1d_s2_ph( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_conv_2d_sk_p0( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, @@ -1436,6 +1511,22 @@ extern "C" { c1: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_win_part( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + w: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_win_unpart( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + w0: ::std::os::raw::c_int, + h0: ::std::os::raw::c_int, + w: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} pub type ggml_unary_op_f32_t = ::std::option::Option< unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32), >; From 
46fb5e34f8305178020aad19ab08a3ddf51b2d3c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Thu, 22 Jun 2023 13:55:40 +0200 Subject: [PATCH 02/28] Made LLama work with CUDA --- binaries/llm-cli/src/cli_args.rs | 7 +- crates/ggml/src/context.rs | 25 +++- crates/ggml/src/lib.rs | 141 ++++++++++++++++++++++- crates/ggml/src/tensor.rs | 30 +++++ crates/llm-base/src/inference_session.rs | 72 +++++++++--- crates/llm-base/src/lib.rs | 10 -- crates/llm-base/src/loader.rs | 20 ++++ crates/llm-base/src/model/mod.rs | 16 +++ crates/models/bloom/src/lib.rs | 2 +- crates/models/gpt2/src/lib.rs | 12 +- crates/models/gptj/src/lib.rs | 2 +- crates/models/gptneox/src/lib.rs | 14 +-- crates/models/llama/src/lib.rs | 56 +++++---- crates/models/mpt/src/lib.rs | 10 +- 14 files changed, 347 insertions(+), 70 deletions(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 39a14fb4..cf6cc6ab 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -306,6 +306,7 @@ impl Generate { memory_k_type: mem_typ, memory_v_type: mem_typ, use_gpu: self.use_gpu, + n_batch: self.batch_size, } } @@ -320,7 +321,6 @@ impl Generate { pub fn inference_parameters(&self, eot: llm::TokenId) -> InferenceParameters { InferenceParameters { n_threads: self.num_threads(), - n_batch: self.batch_size, sampler: Arc::new(llm::samplers::TopPTopK { top_k: self.top_k, top_p: self.top_p, @@ -406,6 +406,10 @@ pub struct ModelLoad { /// LoRA adapter to use for the model #[arg(long, num_args(0..))] pub lora_paths: Option>, + + /// Number of layers to run on the GPU. If not specified, all layers will be run on the GPU. + #[arg(long)] + pub gpu_layers: Option, } impl ModelLoad { pub fn load(&self, use_gpu: bool) -> Result> { @@ -414,6 +418,7 @@ impl ModelLoad { context_size: self.num_ctx_tokens, lora_adapters: self.lora_paths.clone(), use_gpu, + gpu_layers: self.gpu_layers, }; let mut sp = Some(spinoff::Spinner::new( diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 9dd7d108..819a0ef8 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -18,6 +18,9 @@ pub struct Context { /// Backing buffer (in case we own it) pub buffer: Option, + + /// Whether the context can offload tensors to the GPU + pub can_offload: bool, } impl Context { @@ -35,6 +38,7 @@ impl Context { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), mmap: None, buffer: Some(buffer), + can_offload: false, } } @@ -52,6 +56,7 @@ impl Context { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), mmap: Some(mmap), buffer: None, + can_offload: false, } } @@ -70,15 +75,31 @@ impl Context { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), mmap: None, buffer: None, + can_offload: false, } } + /// If offloading is enabled, all tensors created by this context will be offloaded to the GPU + pub fn enable_offloading(&mut self) { + self.can_offload = true; + } + + /// Disables the offloading of tensors to the GPU + pub fn disable_offloading(&mut self) { + self.can_offload = false; + } + /// Wraps a raw tensor with a weak pointer to the context. fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { - Tensor { + let tensor = Tensor { ptr: NonNull::new(raw).expect("Should not be null"), ctx: Arc::downgrade(&self.ptr), + }; + + if self.can_offload { + crate::accelerator_offload_tensor(&tensor); } + tensor } /// Creates a new 1D tensor. 
@@ -429,7 +450,7 @@ impl Context {
     /// Sets the scratch buffer to be used by this [Context].
     ///
     /// If `scratch_buffer` is `None`, the scratch buffer will be disabled.
-    pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a mut Buffer>) {
+    pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) {
         let (size, data) = if let Some(buffer) = scratch_buffer {
             (buffer.size(), buffer.data)
         } else {
diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index 54bb3992..941e3e4e 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -29,6 +29,65 @@ mod tests;
 #[cfg(feature = "metal")]
 pub mod metal;
 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+///Accelerators supported by `ggml`.
+pub enum Accelerator {
+    ///CuBLAS accelerated
+    CuBLAS,
+    ///CLBlast accelerated
+    CLBlast,
+    ///Metal accelerated
+    Metal,
+    ///Cpu accelerated
+    None,
+}
+
+///Returns the accelerator `ggml` was compiled with.
+pub fn get_accelerator() -> Accelerator {
+    #[cfg(feature = "cublas")]
+    return Accelerator::CuBLAS;
+    #[cfg(feature = "clblast")]
+    return Accelerator::CLBlast;
+    #[cfg(feature = "metal")]
+    return Accelerator::Metal;
+    #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))]
+    return Accelerator::None;
+}
+
+#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
+///Backend to use for a tensor.
+pub enum Backend {
+    /// CPU backend
+    #[default]
+    Cpu,
+    /// GPU backend
+    Gpu,
+    ///Multi-GPU backend
+    GpuSplit,
+}
+
+impl From<Backend> for sys::ggml_backend {
+    fn from(b: Backend) -> Self {
+        match b {
+            Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU,
+            Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU,
+            Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT,
+        }
+    }
+}
+
+impl TryFrom<sys::ggml_backend> for Backend {
+    type Error = ();
+    fn try_from(b: sys::ggml_backend) -> Result<Self, Self::Error> {
+        match b {
+            sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu),
+            sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu),
+            sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit),
+            _ => Err(()),
+        }
+    }
+}
+
 /// The type of a tensor element.
 pub type ElementType = Type;
 
@@ -399,5 +458,85 @@ pub fn cpu_has_gpublas() -> bool {
 /// Sets the name of a tensor.
 pub fn set_name(tensor: &Tensor, name: &str) {
     let c_name = std::ffi::CString::new(name).unwrap();
-    unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()); }
+    unsafe {
+        sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr());
+    }
+}
+
+/// Gets the acceleration backend of a tensor.
+pub fn get_tensor_backend(tensor: &sys::ggml_tensor) -> Backend {
+    (tensor.backend as sys::ggml_backend).try_into().unwrap()
+}
+
+/// Sets the acceleration backend of a tensor.
+/// # Safety
+/// This function assumes that the tensor is valid.
+pub unsafe fn set_tensor_backend(tensor: *mut sys::ggml_tensor, backend: Backend) {
+    unsafe {
+        (*tensor).backend = backend.try_into().unwrap();
+    }
+}
+
+/// If ggml-sys is compiled with CUDA or ClBlast support, this function will transform and offload the tensor. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_transform_tensor(tensor: &mut Tensor) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_transform_tensor(tensor.data(), tensor.ptr.as_ptr());
+    }
+    #[cfg(feature = "clblast")]
+    unsafe {
+        sys::opencl::ggml_cl_transform_tensor(tensor.data(), tensor.ptr.as_ptr());
+    }
+}
+
+/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op.
+pub fn accelerator_offload_tensor(tensor: &Tensor) {
+    accelerator_offload_raw_tensor(tensor.ptr.as_ptr());
+}
+
+/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_offload_raw_tensor(tensor: *mut sys::ggml_tensor) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_assign_buffers(tensor);
+    }
+}
+
+/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_offload_tensor_no_scratch(tensor: &Tensor) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_assign_buffers_no_scratch(tensor.ptr.as_ptr());
+    }
+}
+
+/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_set_scratch_size(size: usize) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_set_scratch_size(size);
+    }
+}
+
+///Initialize the accelerator. If ggml-sys is compiled with CUDA or ClBlast support, this function will initialize the accelerator. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_initialize(device: i32) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        //TODO: Make this configurable
+        sys::cuda::ggml_init_cublas();
+        sys::cuda::ggml_cuda_set_main_device(device);
+        let split = 1.0f32;
+        sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
+    }
+
+    #[cfg(feature = "clblast")]
+    unsafe {
+        sys::opencl::ggml_cl_init();
+    }
+ }
diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs
index 39f67d67..947c1e71 100644
--- a/crates/ggml/src/tensor.rs
+++ b/crates/ggml/src/tensor.rs
@@ -15,6 +15,36 @@ impl Tensor {
     /// Exposed for purposes of determining context size.
     pub const C_TYPE_SIZE: usize = std::mem::size_of::<sys::ggml_tensor>();
 
+    ///Sets the name of the tensor
+    pub fn set_name(&mut self, name: &str) -> &Tensor {
+        assert!(name.len() <= 32, "Name is too long!");
+
+        let bytes = name.as_bytes();
+        let mut array = [0i8; 32];
+        array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::<Vec<i8>>());
+
+        unsafe { self.ptr.as_mut().name = array }
+        self
+    }
+
+    ///Gets the name of the tensor
+    pub fn get_name(&self) -> String {
+        let name = unsafe { self.ptr.as_ref().name };
+        let mut name = name.iter().map(|&x| x as u8).collect::<Vec<u8>>();
+        name.retain(|&x| x != 0);
+        String::from_utf8(name).unwrap()
+    }
+
+    ///Sets the acceleration backend of the tensor
+    pub fn set_backend(&mut self, backend: crate::Backend) {
+        unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) }
+    }
+
+    ///Gets the acceleration backend of the tensor
+    pub fn get_backend(&self) -> crate::Backend {
+        unsafe { crate::get_tensor_backend(self.ptr.as_ref()) }
+    }
+
     /// Creates a shared copy of this tensor pointer.
pub fn share(&self) -> Self { Tensor { diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 57ea0908..d6ef7eaa 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,5 +1,5 @@ use ggml::{Buffer, ComputationGraph, Context, Tensor}; -use std::{fmt::Display, sync::Arc}; +use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; #[cfg(feature = "metal")] @@ -25,6 +25,24 @@ fn scratch_buffers() -> ScratchBuffers { ] } +fn kv_memory( + context: &Context, + config: &InferenceSessionConfig, + n_elements: usize, +) -> (Tensor, Tensor) { + let memory_k = context.new_tensor_1d(config.memory_k_type.into(), n_elements); + let memory_v = context.new_tensor_1d(config.memory_v_type.into(), n_elements); + ggml::set_name(&memory_k, "memory_k"); + ggml::set_name(&memory_v, "memory_v"); + + if config.use_gpu { + ggml::accelerator_offload_tensor_no_scratch(&memory_k); + ggml::accelerator_offload_tensor_no_scratch(&memory_v); + } + + (memory_k, memory_v) +} + /// Result of graph building pub struct GraphOutputs { /// The output containing the model's result @@ -92,19 +110,27 @@ pub struct InferenceSession { } pub struct BuildContext<'session> { - pub ctx0: &'session Context, + //FIXME: Borrowing issue, dont know how to fix it + pub ctx0: RefCell<&'session mut Context>, pub embd: &'session Tensor, pub memory_k: &'session Tensor, pub memory_v: &'session Tensor, - pub scratch: &'session mut ScratchBuffers, + pub scratch: &'session ScratchBuffers, } impl<'session> BuildContext<'session> { - pub fn use_scratch(&mut self, idx: Option) { - self.ctx0.use_scratch(match idx { - None => None, - Some(idx) => Some(&mut self.scratch[idx]), - }) + pub fn get_scratch(&self, idx: usize) -> Option<&Buffer> { + Some(&self.scratch[idx]) + } + + pub fn enable_offloading(&self) { + let mut ctx0 = self.ctx0.borrow_mut(); + ctx0.enable_offloading(); + } + + pub fn disable_offloading(&self) { + let mut ctx0 = self.ctx0.borrow_mut(); + ctx0.disable_offloading(); } } @@ -137,15 +163,18 @@ impl InferenceSession { ctx_size }; + //TODO: check if this is needed and the right place to put it + if config.use_gpu { + ggml::accelerator_initialize(0); + ggml::accelerator_set_scratch_size(config.n_batch * 1024 * 1024); + } + let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); // Initialize key + value memory tensors let n_mem = n_layer * n_ctx; let n_elements = n_embd * n_mem; - let memory_k = session_ctx.new_tensor_1d(config.memory_k_type.into(), n_elements); - let memory_v = session_ctx.new_tensor_1d(config.memory_v_type.into(), n_elements); - ggml::set_name(&memory_k, "memory_k"); - ggml::set_name(&memory_v, "memory_v"); + let (memory_k, memory_v) = kv_memory(&session_ctx, &config, n_elements); let scratch = scratch_buffers(); @@ -216,12 +245,12 @@ impl InferenceSession { { // Build a graph self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); - let ctx0 = &self.ctx0; + let ctx0 = &mut self.ctx0; let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); ggml::set_name(&embd, "embd"); let bc = BuildContext { - ctx0, + ctx0: RefCell::new(ctx0), embd: &embd, memory_k: &self.memory_k, memory_v: &self.memory_v, @@ -296,7 +325,7 @@ impl InferenceSession { return Err(InferenceError::ContextFull); } - for batch in prompt_tokens.chunks(params.n_batch) { + for batch in prompt_tokens.chunks(self.config.n_batch) { model.evaluate(self, params, batch, output_request); for &tk in batch { let 
should_call_callback = Some(tk) != model.bot_token_id(); @@ -480,7 +509,7 @@ impl InferenceSession { let n_ctx = model.context_size(); let n_chunk = tokens.len() / n_ctx; let n_vocab = model.vocabulary().len(); - let n_batch = parameters.n_batch; + let n_batch = self.config.n_batch; let mut nll = 0.0; @@ -705,13 +734,24 @@ pub struct InferenceSessionConfig { /// Whether to use GPU acceleration pub use_gpu: bool, + /// Controls batch/chunk size for prompt ingestion in [InferenceSession::feed_prompt]. + /// + /// This is the number of tokens that will be ingested at once. This is useful for + /// trying to speed up the ingestion of prompts, as it allows for parallelization. + /// However, you will be fundamentally limited by your machine's ability to evaluate + /// the transformer model, so increasing the batch size will not always help. + /// + /// A reasonable default value is 8. + pub n_batch: usize, } + impl Default for InferenceSessionConfig { fn default() -> Self { Self { memory_k_type: ModelKVMemoryType::Float16, memory_v_type: ModelKVMemoryType::Float16, use_gpu: false, + n_batch: 8, } } } diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index 127fedf6..caa1fa55 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -65,15 +65,6 @@ pub struct InferenceParameters { /// A reasonable default value is 8, as most modern high-performance computers have /// 8 physical cores. Adjust to your needs. pub n_threads: usize, - /// Controls batch/chunk size for prompt ingestion in [InferenceSession::feed_prompt]. - /// - /// This is the number of tokens that will be ingested at once. This is useful for - /// trying to speed up the ingestion of prompts, as it allows for parallelization. - /// However, you will be fundamentally limited by your machine's ability to evaluate - /// the transformer model, so increasing the batch size will not always help. - /// - /// A reasonable default value is 8. - pub n_batch: usize, /// The sampler to use for sampling tokens from the model's probabilities. /// /// Each time the model runs, it generates a distribution of probabilities; each token @@ -94,7 +85,6 @@ impl Default for InferenceParameters { fn default() -> Self { Self { n_threads: 8, - n_batch: 8, sampler: Arc::new(samplers::TopPTopK::default()), } } diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 6524a198..a1254114 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -341,6 +341,8 @@ impl LoadError { pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; + /// Gets a tensor from the loader and tries to offload it to the specified backend. + fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result; /// Finish loading the model, and extract all of the state from the loader. fn finish(self) -> (Context, HashMap); } @@ -615,6 +617,15 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } + fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result { + let mut tensor = self.load(name)?; + if backend != ggml::Backend::Cpu { + tensor.set_backend(backend); + crate::ggml::accelerator_transform_tensor(&mut tensor); + } + Ok(tensor) + } + fn finish(self) -> (Context, HashMap) { (self.context, self.loaded_tensors) } @@ -686,6 +697,15 @@ impl<'a> FileContext<'a> { } } + // The tensor name is truncated to 32 characters. + + let tensor_name = if name.len() > 32 { + &name[name.len() - 32..] 
+ } else { + name + }; + tensor.set_name(tensor_name); + Ok(tensor) } } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 15bd8ee5..95b1c790 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -190,6 +190,8 @@ pub struct ModelParameters { pub lora_adapters: Option>, /// Whether to use GPU acceleration when available pub use_gpu: bool, + /// The number of layers to offload to the gpu. If `None`, all layers will be offloaded. + pub gpu_layers: Option, } impl Default for ModelParameters { @@ -199,6 +201,20 @@ impl Default for ModelParameters { context_size: 2048, lora_adapters: None, use_gpu: false, + gpu_layers: None, + } + } +} + +impl ModelParameters { + /// Returns true if the model should offload the given layer to the accelerator. + pub fn should_offload(&self, layer: usize) -> bool { + if !self.use_gpu { + false + } else if let Some(offloadable_layers) = self.gpu_layers { + layer < offloadable_layers + } else { + true } } } diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index d44f143e..4f256409 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -140,7 +140,7 @@ impl KnownModel for Bloom { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let ctx0 = builder.ctx0; + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index abc0726d..28911510 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -129,8 +129,8 @@ impl KnownModel for Gpt2 { .. } = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), @@ -149,7 +149,7 @@ impl KnownModel for Gpt2 { let mut gf = ggml::ComputationGraph::new(num_threads); for il in 0..n_layer { - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm let mut current = ctx0.op_norm(&input_layer); @@ -263,7 +263,7 @@ impl KnownModel for Gpt2 { // feed-forward let ff_in = current.share(); - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); // feed-forward normalization current = ctx0.op_norm(&ff_in); @@ -293,7 +293,7 @@ impl KnownModel for Gpt2 { input_layer = ctx0.op_add(¤t, &ff_in); } - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // normalization input_layer = ctx0.op_norm(&input_layer); @@ -302,7 +302,7 @@ impl KnownModel for Gpt2 { &ctx0.op_repeat(&self.ln_f_b, &input_layer), ); - builder.use_scratch(None); + ctx0.use_scratch(None); let embeddings_tensor: ggml::Tensor = input_layer.share(); diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 5ec7d5bc..9ae601b0 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -132,7 +132,7 @@ impl KnownModel for GptJ { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let ctx0 = builder.ctx0; + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), diff --git 
a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 5b4ea0c0..5fd0f059 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -147,8 +147,8 @@ impl KnownModel for GptNeoX { .. } = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); let (memory_k_size, memory_v_size) = ( @@ -160,7 +160,7 @@ impl KnownModel for GptNeoX { for il in 0..n_layer { // attention uses first scratch buffer - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // self-attention let mut current = ctx0.op_norm(&input_layer); @@ -280,12 +280,12 @@ impl KnownModel for GptNeoX { ); // use the second scratch for the feed forward - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); let feedforward_input: Tensor; if !use_parallel_residual { feedforward_input = ctx0.op_add(¤t, &input_layer); - current = feed_forward_network(ctx0, &self.layers[il], &feedforward_input); + current = feed_forward_network(&ctx0, &self.layers[il], &feedforward_input); // input for next layer input_layer = ctx0.op_add(¤t, &feedforward_input); } else { @@ -294,7 +294,7 @@ impl KnownModel for GptNeoX { // this is independent of the self-attention result, so it could be done in parallel to the self-attention // note here we pass inpL instead of cur - current = feed_forward_network(ctx0, &self.layers[il], &input_layer); + current = feed_forward_network(&ctx0, &self.layers[il], &input_layer); // layer input + FF current = ctx0.op_add(¤t, &feedforward_input); @@ -305,7 +305,7 @@ impl KnownModel for GptNeoX { } // use the first scratch for the norm - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(0)); // normalize the output input_layer = ctx0.op_norm(&input_layer); diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index d4abb2e1..734fdf76 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -4,7 +4,7 @@ use std::{error::Error, sync::Arc}; use llm_base::{ - ggml, + ggml::{self, Backend}, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, @@ -18,7 +18,7 @@ use llm_base::{ pub struct Llama { // the context size ("memory") the model should use when evaluating a prompt context_size: usize, - + model_params: ModelParameters, hyperparameters: Hyperparameters, vocabulary: Vocabulary, @@ -53,32 +53,39 @@ impl KnownModel for Llama { // model-global weights let wte = tl.load("tok_embeddings.weight")?; - let norm = tl.load("norm.weight")?; - let output = tl.load("output.weight")?; + let norm = tl.offload("norm.weight", Backend::Gpu)?; + + let output = tl.offload("output.weight", Backend::Gpu)?; let mut layers = Vec::new(); + for i in 0..hyperparameters.n_layer { + let backend = if params.should_offload(i) { + Backend::Gpu + } else { + Backend::Cpu + }; let layer = Layer { - attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"))?, - wq: tl.load(&format!("layers.{i}.attention.wq.weight"))?, - wk: tl.load(&format!("layers.{i}.attention.wk.weight"))?, - wv: tl.load(&format!("layers.{i}.attention.wv.weight"))?, - wo: 
tl.load(&format!("layers.{i}.attention.wo.weight"))?, - ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"))?, - w1: tl.load(&format!("layers.{i}.feed_forward.w1.weight"))?, - w2: tl.load(&format!("layers.{i}.feed_forward.w2.weight"))?, - w3: tl.load(&format!("layers.{i}.feed_forward.w3.weight"))?, + attention_norm: tl + .offload(&format!("layers.{i}.attention_norm.weight"), backend)?, + wq: tl.offload(&format!("layers.{i}.attention.wq.weight"), backend)?, + wk: tl.offload(&format!("layers.{i}.attention.wk.weight"), backend)?, + wv: tl.offload(&format!("layers.{i}.attention.wv.weight"), backend)?, + wo: tl.offload(&format!("layers.{i}.attention.wo.weight"), backend)?, + ffn_norm: tl.offload(&format!("layers.{i}.ffn_norm.weight"), backend)?, + w1: tl.offload(&format!("layers.{i}.feed_forward.w1.weight"), backend)?, + w2: tl.offload(&format!("layers.{i}.feed_forward.w2.weight"), backend)?, + w3: tl.offload(&format!("layers.{i}.feed_forward.w3.weight"), backend)?, }; - layers.push(layer); } - let (context, _tensors) = tl.finish(); let ModelParameters { context_size, .. } = params; Ok(Self { hyperparameters, + model_params: params, context_size, vocabulary, wte, @@ -122,8 +129,8 @@ impl KnownModel for Llama { file_type: _, } = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); @@ -137,10 +144,17 @@ impl KnownModel for Llama { }, ); for il in 0..n_layer { + //TODO: find a better way to do this + if self.model_params.should_offload(il) { + ctx0.enable_offloading(); + } else { + ctx0.disable_offloading(); + } + let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm current = ctx0.op_rms_norm(&input_layer); @@ -266,7 +280,7 @@ impl KnownModel for Llama { // projection (no bias) current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); @@ -293,7 +307,8 @@ impl KnownModel for Llama { // input for next layer input_layer = current; } - builder.use_scratch(Some(0)); + + ctx0.use_scratch(builder.get_scratch(0)); // norm input_layer = ctx0.op_rms_norm(&input_layer); @@ -303,6 +318,7 @@ impl KnownModel for Llama { let embedding_result: ggml::Tensor = input_layer.share(); + ctx0.disable_offloading(); // lm_head input_layer = ctx0.op_mul_mat(&self.output, &input_layer); diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 10ce78e9..e32b545e 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -116,8 +116,8 @@ impl KnownModel for Mpt { .. 
} = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), @@ -131,7 +131,7 @@ impl KnownModel for Mpt { let mut gf = ggml::ComputationGraph::new(num_threads); for il in 0..n_layer { // attention uses first scratch buffer - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); let mut current = ctx0.op_norm(&input_layer); current = ctx0.op_mul( @@ -224,7 +224,7 @@ impl KnownModel for Mpt { input_layer = ctx0.op_add(&input_layer, ¤t); // feed forward uses second scratch buffer - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); current = ctx0.op_norm(&input_layer); current = ctx0.op_mul( @@ -243,7 +243,7 @@ impl KnownModel for Mpt { } //use scratch buffer 0 for the rest - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm input_layer = ctx0.op_norm(&input_layer); From 2c90981d7d288bff6d4d0bd4cbaf681da394ec33 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Thu, 22 Jun 2023 14:17:30 +0200 Subject: [PATCH 03/28] formatting --- crates/ggml/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 941e3e4e..fe6f5364 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -538,5 +538,4 @@ pub fn accelerator_initialize(device: i32) { unsafe { sys::opencl::ggml_cl_init(); } - } From 93dac8f79ec9c31b18df17d0a0e22944fcddb269 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:36:36 +0200 Subject: [PATCH 04/28] Merge: Accelerator Functions --- crates/ggml/src/lib.rs | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index b2155c8f..4527a985 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -484,3 +484,81 @@ pub fn set_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; } + +/// Gets the acceleration backend of a tensor. +pub fn get_tensor_backend(tensor: &sys::ggml_tensor) -> Backend { + (tensor.backend as sys::ggml_backend).try_into().unwrap() +} + +/// Sets the acceleration backend of a tensor. +/// # Safety +/// This function assumes that the tensor is valid. +pub unsafe fn set_tensor_backend(tensor: *mut sys::ggml_tensor, backend: Backend) { + unsafe { + (*tensor).backend = backend.try_into().unwrap(); + } +} + +/// If ggml-sys is compiled with CUDA or ClBlast support, this function will tranform and offload the tensor. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_transform_tensor(tensor: &mut Tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); + } +} + +/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. 
+pub fn accelerator_offload_tensor(tensor: &Tensor) { + accelerator_offload_raw_tensor(tensor.ptr.as_ptr()); +} + +/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_offload_raw_tensor(tensor: *mut sys::ggml_tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers(tensor); + } +} + +/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_offload_tensor_no_scratch(tensor: &Tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers_no_scratch(tensor.ptr.as_ptr()); + } +} + +/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_set_scratch_size(size: usize) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_set_scratch_size(size); + } +} + +///Initialize the accelerator. If ggml-sys is compiled with CUDA or ClBlast support, this function will initialize the accelerator. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_initialize(device: i32) { + #[cfg(feature = "cublas")] + unsafe { + //TODO: Make this configurable + sys::cuda::ggml_init_cublas(); + sys::cuda::ggml_cuda_set_main_device(device); + let split = 1.0f32; + sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); + } + + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_init(); + } + +} \ No newline at end of file From 8499798a634b89bc39fe1341e516de1d48d705f8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:38:09 +0200 Subject: [PATCH 05/28] We need pre-commit hooks --- crates/ggml/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 4527a985..06ceb9cb 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -560,5 +560,4 @@ pub fn accelerator_initialize(device: i32) { unsafe { sys::opencl::ggml_cl_init(); } - -} \ No newline at end of file +} From a1f61b46a4e6f5c465928a0e21e0fff0b5eb9fc6 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:45:50 +0200 Subject: [PATCH 06/28] update llama.cpp --- binaries/generate-ggml-bindings/src/main.rs | 2 + crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/lib.rs | 96 ++++++++++++++++++++- crates/ggml/sys/src/llama.rs | 2 +- 4 files changed, 97 insertions(+), 5 deletions(-) diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs index 5bc5d8ca..432d7ffc 100644 --- a/binaries/generate-ggml-bindings/src/main.rs +++ b/binaries/generate-ggml-bindings/src/main.rs @@ -23,6 +23,8 @@ fn main() { fn generate_main(ggml_path: &Path, src_path: &Path) { let bindings = bindgen::Builder::default() + .header(ggml_path.join("ggml.h").to_str().unwrap().to_string()) + .allowlist_file(r".*ggml.h") .header(ggml_path.join("k_quants.h").to_string_lossy()) .allowlist_file(r".*k_quants.h") // Suppress some warnings diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index bbca06e2..447ccbe8 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit bbca06e26949686d61a5126332680ba3cccf235c +Subproject commit 
447ccbe8c39332fcdd0d98a041b6e2ff6f06219d diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index 01f1b52f..d5caebe9 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -140,9 +140,12 @@ pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 55; pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 56; pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 57; pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 58; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 59; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 60; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 61; +pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 59; +pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 60; +pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 61; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 62; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 63; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 64; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -951,6 +954,13 @@ extern "C" { name: *const ::std::os::raw::c_char, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_format_name( + tensor: *mut ggml_tensor, + fmt: *const ::std::os::raw::c_char, + ... + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1541,6 +1551,23 @@ pub type ggml_binary_op_f32_t = ::std::option::Option< arg4: *const f32, ), >; +pub type ggml_custom1_op_f32_t = + ::std::option::Option; +pub type ggml_custom2_op_f32_t = ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut ggml_tensor, + arg2: *const ggml_tensor, + arg3: *const ggml_tensor, + ), +>; +pub type ggml_custom3_op_f32_t = ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut ggml_tensor, + arg2: *const ggml_tensor, + arg3: *const ggml_tensor, + arg4: *const ggml_tensor, + ), +>; extern "C" { pub fn ggml_map_unary_f32( ctx: *mut ggml_context, @@ -1548,6 +1575,13 @@ extern "C" { fun: ggml_unary_op_f32_t, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_map_unary_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_unary_op_f32_t, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_map_binary_f32( ctx: *mut ggml_context, @@ -1556,6 +1590,62 @@ extern "C" { fun: ggml_binary_op_f32_t, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_map_binary_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_binary_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom1_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_custom1_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom1_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_custom1_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom2_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_custom2_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom2_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_custom2_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom3_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + c: *mut ggml_tensor, + fun: ggml_custom3_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom3_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + c: *mut ggml_tensor, 
+ fun: ggml_custom3_op_f32_t, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_cross_entropy_loss( ctx: *mut ggml_context, diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs index 4157472a..33e9e541 100644 --- a/crates/ggml/sys/src/llama.rs +++ b/crates/ggml/sys/src/llama.rs @@ -28,4 +28,4 @@ pub const LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15; pub const LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16; pub const LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17; pub const LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18; -pub type llama_ftype = ::std::os::raw::c_uint; +pub type llama_ftype = ::std::os::raw::c_int; From e1476a429145e4be6f76332cd686f876fe01a02c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:42:47 +0200 Subject: [PATCH 07/28] Update falcon --- crates/llm-base/src/loader.rs | 2 +- crates/models/falcon/src/lib.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 0c53e855..a5ac1c2a 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -30,7 +30,7 @@ pub struct FileType { impl From for i32 { fn from(value: FileType) -> Self { (value.quantization_version * ggml::QNT_VERSION_FACTOR) as i32 - + ggml::sys::llama::llama_ftype::from(value.format) as i32 + + ggml::sys::llama::llama_ftype::from(value.format) } } impl TryFrom for FileType { diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 8ee37453..b5d3ce4e 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -131,8 +131,8 @@ impl KnownModel for Falcon { let head_dim = n_embd / n_head; let n = input_len; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd); let repeat_dummy = ctx0.new_tensor_3d( @@ -157,7 +157,7 @@ impl KnownModel for Falcon { for il in 0..n_layer { // attention uses first scratch buffer - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // self-attention current = ctx0.op_norm(&input_layer); @@ -277,7 +277,7 @@ impl KnownModel for Falcon { current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); // feed forward uses second scratch buffer - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); let inp_ff = layernorm_output.share(); let attn_out = @@ -293,7 +293,7 @@ impl KnownModel for Falcon { input_layer = current.share(); } - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm input_layer = ctx0.op_norm(&input_layer); @@ -308,7 +308,7 @@ impl KnownModel for Falcon { let embeddings_tensor: ggml::Tensor = input_layer.share(); - builder.use_scratch(None); + ctx0.use_scratch(None); // lm_head input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); From 4a18bff6db799641178ee78453184b691c83d5ea Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 30 Jun 2023 17:09:03 +0200 Subject: [PATCH 08/28] Sync latest llama.cpp --- crates/ggml/src/context.rs | 2 ++ crates/ggml/src/tensor.rs | 4 ++-- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/cuda.rs | 3 +++ crates/ggml/sys/src/lib.rs | 21 +++++++++++++++------ crates/ggml/sys/src/llama.rs | 1 + 6 files changed, 24 
insertions(+), 9 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 819a0ef8..f05e918f 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -416,6 +416,7 @@ impl Context { usize_to_i32(npast), usize_to_i32(ndims), mode, + 0, ) }; self.new_tensor_raw(tensor) @@ -430,6 +431,7 @@ impl Context { usize_to_i32(npast), usize_to_i32(ndims), mode, + 0, ) }; self.new_tensor_raw(tensor) diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 947c1e71..c313eaf7 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -17,10 +17,10 @@ impl Tensor { ///Sets the name of the tensor pub fn set_name(&mut self, name: &str) -> &Tensor { - assert!(name.len() <= 32, "Name is too long!"); + assert!(name.len() <= 48, "Name is too long!"); let bytes = name.as_bytes(); - let mut array = [0i8; 32]; + let mut array = [0i8; 48]; array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::>()); unsafe { self.ptr.as_mut().name = array } diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 447ccbe8..b8c8dda7 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 447ccbe8c39332fcdd0d98a041b6e2ff6f06219d +Subproject commit b8c8dda75fdf5fdea49c80af36818e7c30fe0ddf diff --git a/crates/ggml/sys/src/cuda.rs b/crates/ggml/sys/src/cuda.rs index 7eff110b..b6a82745 100644 --- a/crates/ggml/sys/src/cuda.rs +++ b/crates/ggml/sys/src/cuda.rs @@ -85,6 +85,9 @@ extern "C" { extern "C" { pub fn ggml_cuda_assign_buffers_no_scratch(tensor: *mut ggml_tensor); } +extern "C" { + pub fn ggml_cuda_assign_buffers_force_inplace(tensor: *mut ggml_tensor); +} extern "C" { pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int); } diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index d5caebe9..dd30f4de 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -22,9 +22,10 @@ pub const GGML_MAX_NODES: u32 = 4096; pub const GGML_MAX_PARAMS: u32 = 256; pub const GGML_MAX_CONTEXTS: u32 = 64; pub const GGML_MAX_OPT: u32 = 4; -pub const GGML_MAX_NAME: u32 = 32; +pub const GGML_MAX_NAME: u32 = 48; pub const GGML_DEFAULT_N_THREADS: u32 = 4; pub const QK_K: u32 = 256; +pub const K_SCALE_SIZE: u32 = 12; pub type ggml_fp16_t = u16; extern "C" { pub fn ggml_fp16_to_fp32(x: ggml_fp16_t) -> f32; @@ -230,7 +231,7 @@ pub struct ggml_tensor { pub perf_cycles: i64, pub perf_time_us: i64, pub data: *mut ::std::os::raw::c_void, - pub name: [::std::os::raw::c_char; 32usize], + pub name: [::std::os::raw::c_char; 48usize], pub extra: *mut ::std::os::raw::c_void, pub padding: [::std::os::raw::c_char; 4usize], } @@ -240,7 +241,7 @@ fn bindgen_test_layout_ggml_tensor() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 224usize, + 240usize, concat!("Size of: ", stringify!(ggml_tensor)) ); assert_eq!( @@ -420,7 +421,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize }, - 208usize, + 224usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -430,7 +431,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, - 216usize, + 232usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -439,7 +440,7 @@ fn bindgen_test_layout_ggml_tensor() { ) ); } -pub const GGML_TENSOR_SIZE: usize = 224; +pub const GGML_TENSOR_SIZE: usize = 240; #[repr(C)] #[derive(Debug, Copy, Clone)] 
pub struct ggml_cgraph { @@ -777,6 +778,12 @@ extern "C" { extern "C" { pub fn ggml_cycles_per_ms() -> i64; } +extern "C" { + pub fn ggml_numa_init(); +} +extern "C" { + pub fn ggml_is_numa() -> bool; +} extern "C" { pub fn ggml_print_object(obj: *const ggml_object); } @@ -1437,6 +1444,7 @@ extern "C" { n_past: ::std::os::raw::c_int, n_dims: ::std::os::raw::c_int, mode: ::std::os::raw::c_int, + n_ctx: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { @@ -1446,6 +1454,7 @@ extern "C" { n_past: ::std::os::raw::c_int, n_dims: ::std::os::raw::c_int, mode: ::std::os::raw::c_int, + n_ctx: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs index 33e9e541..2d5a9a6f 100644 --- a/crates/ggml/sys/src/llama.rs +++ b/crates/ggml/sys/src/llama.rs @@ -11,6 +11,7 @@ pub const LLAMA_FILE_MAGIC: u32 = 1734830708; pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468; pub const LLAMA_SESSION_MAGIC: u32 = 1734833006; pub const LLAMA_SESSION_VERSION: u32 = 1; +pub const LLAMA_DEFAULT_SEED: u32 = 4294967295; pub const LLAMA_FTYPE_ALL_F32: llama_ftype = 0; pub const LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1; pub const LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2; From 4ff7f2139e1927225b037eb8059c4353a9f2ae8b Mon Sep 17 00:00:00 2001 From: Lukas Kreussel Date: Fri, 7 Jul 2023 09:44:54 +0200 Subject: [PATCH 09/28] Adjust to latest changes from main --- binaries/llm-test/src/main.rs | 1 - crates/llm-base/src/inference_session.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/binaries/llm-test/src/main.rs b/binaries/llm-test/src/main.rs index aa259779..4c59394c 100644 --- a/binaries/llm-test/src/main.rs +++ b/binaries/llm-test/src/main.rs @@ -418,7 +418,6 @@ fn run_inference( prompt: input.into(), parameters: &llm::InferenceParameters { n_threads: model_config.threads, - n_batch: 1, sampler: Arc::new(DeterministicSampler), }, play_back_previous_tokens: false, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 8b1010f9..c082b4b4 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,6 +1,6 @@ use ggml::{Buffer, ComputationGraph, Context, Tensor}; use serde::Serialize; -use std::{fmt::Display, sync::Arc}; +use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; #[cfg(feature = "metal")] From d933f1e8c219e7c654ca9db9960a5f0e1e25ef19 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:02:36 +0200 Subject: [PATCH 10/28] Make clblast buildable on windows --- crates/ggml/sys/build.rs | 7 +- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/cuda.rs | 31 ------ crates/ggml/sys/src/lib.rs | 211 +++++++++++++++++++----------------- 4 files changed, 120 insertions(+), 131 deletions(-) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 084f09de..38960b21 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -155,8 +155,11 @@ fn lib_path(prefix: &str) -> String { fn enable_clblast(build: &mut cc::Build) { println!("cargo:rustc-link-lib=clblast"); println!("cargo:rustc-link-lib=OpenCL"); - //enable dynamic linking against stdc++ - println!(r"cargo:rustc-link-lib=dylib=stdc++"); + + if cfg!(linux) { + //enable dynamic linking against stdc++ + println!(r"cargo:rustc-link-lib=dylib=stdc++"); + } build.file("llama-cpp/ggml-opencl.cpp"); build.flag("-DGGML_USE_CLBLAST"); diff --git 
a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index b8c8dda7..481f793a 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit b8c8dda75fdf5fdea49c80af36818e7c30fe0ddf +Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc diff --git a/crates/ggml/sys/src/cuda.rs b/crates/ggml/sys/src/cuda.rs index b6a82745..aeeff19a 100644 --- a/crates/ggml/sys/src/cuda.rs +++ b/crates/ggml/sys/src/cuda.rs @@ -4,37 +4,6 @@ use super::ggml_compute_params; use super::ggml_tensor; pub const GGML_CUDA_MAX_DEVICES: u32 = 16; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_tensor_extra_gpu { - pub data_device: [*mut ::std::os::raw::c_void; 16usize], -} -#[test] -fn bindgen_test_layout_ggml_tensor_extra_gpu() { - const UNINIT: ::std::mem::MaybeUninit = - ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 128usize, - concat!("Size of: ", stringify!(ggml_tensor_extra_gpu)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_tensor_extra_gpu)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).data_device) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor_extra_gpu), - "::", - stringify!(data_device) - ) - ); -} extern "C" { pub fn ggml_init_cublas(); } diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index dd30f4de..ce79bcc3 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -34,10 +34,10 @@ extern "C" { pub fn ggml_fp32_to_fp16(x: f32) -> ggml_fp16_t; } extern "C" { - pub fn ggml_fp16_to_fp32_row(x: *const ggml_fp16_t, y: *mut f32, n: usize); + pub fn ggml_fp16_to_fp32_row(x: *const ggml_fp16_t, y: *mut f32, n: ::std::os::raw::c_int); } extern "C" { - pub fn ggml_fp32_to_fp16_row(x: *const f32, y: *mut ggml_fp16_t, n: usize); + pub fn ggml_fp32_to_fp16_row(x: *const f32, y: *mut ggml_fp16_t, n: ::std::os::raw::c_int); } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -96,57 +96,59 @@ pub const ggml_op_GGML_OP_LOG: ggml_op = 10; pub const ggml_op_GGML_OP_SUM: ggml_op = 11; pub const ggml_op_GGML_OP_SUM_ROWS: ggml_op = 12; pub const ggml_op_GGML_OP_MEAN: ggml_op = 13; -pub const ggml_op_GGML_OP_REPEAT: ggml_op = 14; -pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 15; -pub const ggml_op_GGML_OP_ABS: ggml_op = 16; -pub const ggml_op_GGML_OP_SGN: ggml_op = 17; -pub const ggml_op_GGML_OP_NEG: ggml_op = 18; -pub const ggml_op_GGML_OP_STEP: ggml_op = 19; -pub const ggml_op_GGML_OP_RELU: ggml_op = 20; -pub const ggml_op_GGML_OP_GELU: ggml_op = 21; -pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 22; -pub const ggml_op_GGML_OP_SILU: ggml_op = 23; -pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 24; -pub const ggml_op_GGML_OP_NORM: ggml_op = 25; -pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 26; -pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 27; -pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 28; -pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 29; -pub const ggml_op_GGML_OP_SCALE: ggml_op = 30; -pub const ggml_op_GGML_OP_SET: ggml_op = 31; -pub const ggml_op_GGML_OP_CPY: ggml_op = 32; -pub const ggml_op_GGML_OP_CONT: ggml_op = 33; -pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 34; -pub const ggml_op_GGML_OP_VIEW: ggml_op = 35; -pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 36; -pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 37; -pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 38; -pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 39; -pub const 
ggml_op_GGML_OP_DIAG: ggml_op = 40; -pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 41; -pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 42; -pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 43; -pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 44; -pub const ggml_op_GGML_OP_ROPE: ggml_op = 45; -pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 46; -pub const ggml_op_GGML_OP_ALIBI: ggml_op = 47; -pub const ggml_op_GGML_OP_CLAMP: ggml_op = 48; -pub const ggml_op_GGML_OP_CONV_1D_S1_PH: ggml_op = 49; -pub const ggml_op_GGML_OP_CONV_1D_S2_PH: ggml_op = 50; -pub const ggml_op_GGML_OP_CONV_2D_SK_P0: ggml_op = 51; -pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 52; -pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 53; -pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 54; -pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 55; -pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 56; -pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 57; -pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 58; -pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 59; -pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 60; -pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 61; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 62; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 63; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 64; +pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14; +pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15; +pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16; +pub const ggml_op_GGML_OP_ABS: ggml_op = 17; +pub const ggml_op_GGML_OP_SGN: ggml_op = 18; +pub const ggml_op_GGML_OP_NEG: ggml_op = 19; +pub const ggml_op_GGML_OP_STEP: ggml_op = 20; +pub const ggml_op_GGML_OP_TANH: ggml_op = 21; +pub const ggml_op_GGML_OP_ELU: ggml_op = 22; +pub const ggml_op_GGML_OP_RELU: ggml_op = 23; +pub const ggml_op_GGML_OP_GELU: ggml_op = 24; +pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 25; +pub const ggml_op_GGML_OP_SILU: ggml_op = 26; +pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 27; +pub const ggml_op_GGML_OP_NORM: ggml_op = 28; +pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 29; +pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 30; +pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 31; +pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 32; +pub const ggml_op_GGML_OP_SCALE: ggml_op = 33; +pub const ggml_op_GGML_OP_SET: ggml_op = 34; +pub const ggml_op_GGML_OP_CPY: ggml_op = 35; +pub const ggml_op_GGML_OP_CONT: ggml_op = 36; +pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 37; +pub const ggml_op_GGML_OP_VIEW: ggml_op = 38; +pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 39; +pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 40; +pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 41; +pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 42; +pub const ggml_op_GGML_OP_DIAG: ggml_op = 43; +pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 44; +pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 45; +pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 46; +pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 47; +pub const ggml_op_GGML_OP_ROPE: ggml_op = 48; +pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 49; +pub const ggml_op_GGML_OP_ALIBI: ggml_op = 50; +pub const ggml_op_GGML_OP_CLAMP: ggml_op = 51; +pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 52; +pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 53; +pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 54; +pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 55; +pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 56; +pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 57; +pub const 
ggml_op_GGML_OP_WIN_UNPART: ggml_op = 58; +pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 59; +pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 60; +pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 61; +pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 62; +pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 63; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 64; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 65; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 66; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -1090,6 +1092,9 @@ extern "C" { extern "C" { pub fn ggml_mean(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_argmax(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_repeat( ctx: *mut ggml_context, @@ -1128,6 +1133,18 @@ extern "C" { extern "C" { pub fn ggml_step_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_tanh(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_tanh_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_elu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_elu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1484,24 +1501,35 @@ extern "C" { ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_s1_ph( + pub fn ggml_conv_1d( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + s0: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + d0: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_s2_ph( + pub fn ggml_conv_2d( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + s0: ::std::os::raw::c_int, + s1: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + p1: ::std::os::raw::c_int, + d0: ::std::os::raw::c_int, + d1: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_2d_sk_p0( + pub fn ggml_conv_1d_ph( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + s: ::std::os::raw::c_int, + d: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { @@ -2666,13 +2694,13 @@ extern "C" { extern "C" { pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int; } -pub type dequantize_row_q_t = ::std::option::Option< +pub type ggml_to_float_t = ::std::option::Option< unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int), >; -pub type quantize_row_q_t = ::std::option::Option< +pub type ggml_from_float_t = ::std::option::Option< unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int), >; -pub type vec_dot_q_t = ::std::option::Option< +pub type ggml_vec_dot_t = ::std::option::Option< unsafe extern "C" fn( n: ::std::os::raw::c_int, s: *mut f32, @@ -2682,91 +2710,80 @@ pub type vec_dot_q_t = ::std::option::Option< >; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct quantize_fns_t { - pub dequantize_row_q: dequantize_row_q_t, - pub quantize_row_q: quantize_row_q_t, - pub quantize_row_q_reference: quantize_row_q_t, - pub quantize_row_q_dot: quantize_row_q_t, - pub vec_dot_q: vec_dot_q_t, +pub struct ggml_type_traits_t { + pub to_float: ggml_to_float_t, + pub from_float: ggml_from_float_t, + pub from_float_reference: 
ggml_from_float_t, + pub vec_dot: ggml_vec_dot_t, pub vec_dot_type: ggml_type, } #[test] -fn bindgen_test_layout_quantize_fns_t() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); +fn bindgen_test_layout_ggml_type_traits_t() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( - ::std::mem::size_of::(), - 48usize, - concat!("Size of: ", stringify!(quantize_fns_t)) + ::std::mem::size_of::(), + 40usize, + concat!("Size of: ", stringify!(ggml_type_traits_t)) ); assert_eq!( - ::std::mem::align_of::(), + ::std::mem::align_of::(), 8usize, - concat!("Alignment of ", stringify!(quantize_fns_t)) + concat!("Alignment of ", stringify!(ggml_type_traits_t)) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).dequantize_row_q) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(dequantize_row_q) + stringify!(to_float) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize }, 8usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(quantize_row_q) + stringify!(from_float) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_reference) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize }, 16usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(quantize_row_q_reference) + stringify!(from_float_reference) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_dot) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize }, 24usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(quantize_row_q_dot) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_q) as usize - ptr as usize }, - 32usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(vec_dot_q) + stringify!(vec_dot) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize }, - 40usize, + 32usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", stringify!(vec_dot_type) ) ); } extern "C" { - pub fn ggml_internal_get_quantize_fn(i: usize) -> quantize_fns_t; + pub fn ggml_internal_get_type_traits(i: ggml_type) -> ggml_type_traits_t; } #[repr(C)] #[derive(Debug, Copy, Clone)] From 7ec3683d7b64b2507e1a301e2bdf1b48782a619e Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:25:07 +0200 Subject: [PATCH 11/28] Fix opencl inference --- crates/ggml/src/lib.rs | 5 ----- crates/llm-base/src/inference_session.rs | 10 ---------- crates/models/llama/src/lib.rs | 12 ++++++++++-- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index f6ab7f06..0cb6aa3d 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -559,9 +559,4 @@ pub fn accelerator_initialize(device: i32) { let split = 1.0f32; sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); } - - #[cfg(feature = "clblast")] - unsafe { 
- sys::opencl::ggml_cl_init(); - } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index c082b4b4..860795eb 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -123,16 +123,6 @@ impl<'session> BuildContext<'session> { pub fn get_scratch(&self, idx: usize) -> Option<&Buffer> { Some(&self.scratch[idx]) } - - pub fn enable_offloading(&self) { - let mut ctx0 = self.ctx0.borrow_mut(); - ctx0.enable_offloading(); - } - - pub fn disable_offloading(&self) { - let mut ctx0 = self.ctx0.borrow_mut(); - ctx0.disable_offloading(); - } } unsafe impl Send for InferenceSession {} diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 214ce96d..654f490b 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -52,9 +52,16 @@ impl KnownModel for Llama { // model-global weights let wte = tl.load("tok_embeddings.weight")?; - let norm = tl.offload("norm.weight", Backend::Gpu)?; - let output = tl.offload("output.weight", Backend::Gpu)?; + let backend = if params.should_offload(0) { + Backend::Gpu + } else { + Backend::Cpu + }; + + let norm = tl.offload("norm.weight", backend)?; + + let output = tl.offload("output.weight", backend)?; let mut layers = Vec::new(); @@ -131,6 +138,7 @@ impl KnownModel for Llama { let outputs = session.compute(self.context.clone(), input_tokens, |builder| { let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); // for big prompts, if BLAS is enabled, it is better to use only one thread From 37d6e4fee4be8a33362183e0a37c929d2ada1d4d Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:40:12 +0200 Subject: [PATCH 12/28] Only link clblast dynamically on linux --- crates/ggml/sys/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 38960b21..73fe6f20 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -156,7 +156,7 @@ fn enable_clblast(build: &mut cc::Build) { println!("cargo:rustc-link-lib=clblast"); println!("cargo:rustc-link-lib=OpenCL"); - if cfg!(linux) { + if cfg!(target_os = "linux") { //enable dynamic linking against stdc++ println!(r"cargo:rustc-link-lib=dylib=stdc++"); } From 1cf7e5c3f912a73e592fe331a6bc8c1c83454ed8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel Date: Mon, 10 Jul 2023 14:37:23 +0200 Subject: [PATCH 13/28] Update inference.rs --- binaries/llm-test/src/inference.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs index ec803cdd..95ce02e7 100644 --- a/binaries/llm-test/src/inference.rs +++ b/binaries/llm-test/src/inference.rs @@ -71,7 +71,6 @@ fn run_inference( prompt: input.into(), parameters: &llm::InferenceParameters { n_threads: model_config.threads, - n_batch: 1, sampler: Arc::new(DeterministicSampler), }, play_back_previous_tokens: false, From 7f767b807fccd978bd785f449f66e48f1dc9275e Mon Sep 17 00:00:00 2001 From: Lukas Kreussel Date: Mon, 10 Jul 2023 14:57:53 +0200 Subject: [PATCH 14/28] Free tensor and scratch memory if they are dropped --- crates/ggml/src/lib.rs | 21 +++++++++++++++++++++ crates/ggml/src/tensor.rs | 9 +++++++++ crates/llm-base/src/inference_session.rs | 7 +++++++ 3 files changed, 37 insertions(+) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 0cb6aa3d..540209b9 
100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -560,3 +560,24 @@ pub fn accelerator_initialize(device: i32) { sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); } } + +/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op. +pub fn accelerator_free_scratch() { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_scratch(); + } +} + +/// Frees the memory of a tensor. If ggml-sys is compiled with CUDA or ClBlast support, this function will free the memory of a tensor. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_free_tensor(tensor: &Tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_data(tensor.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::cuda::ggml_cl_free_data(tensor.ptr.as_ptr()); + } +} diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index c313eaf7..40bafe6e 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -155,3 +155,12 @@ impl Tensor { std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } } + +impl Drop for Tensor { + fn drop(&mut self) { + if self.get_backend() != crate::Backend::Cpu { + // if the tensor is not on the cpu, free it from the accelerator + crate::accelerator_free_tensor(self) + } + } +} diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 20625283..84922ab5 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -648,6 +648,13 @@ impl InferenceSession { } } +impl Drop for InferenceSession { + fn drop(&mut self) { + //if we are using an accelerator, we need to free the scratch memory + ggml::accelerator_free_scratch(); + } +} + fn get_newly_decoded_portion_huggingface( model: &dyn Model, tokens: Vec, From 83faca77d2b085daf7730b35282913dae56d54c6 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Mon, 10 Jul 2023 15:32:17 +0200 Subject: [PATCH 15/28] Explicitly free k/v memory --- crates/ggml/src/lib.rs | 2 +- crates/ggml/src/tensor.rs | 9 --------- crates/llm-base/src/inference_session.rs | 4 +++- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 540209b9..33f0cc03 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -578,6 +578,6 @@ pub fn accelerator_free_tensor(tensor: &Tensor) { } #[cfg(feature = "clblast")] unsafe { - sys::cuda::ggml_cl_free_data(tensor.ptr.as_ptr()); + sys::opencl::ggml_cl_free_data(tensor.ptr.as_ptr()); } } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 40bafe6e..c313eaf7 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -155,12 +155,3 @@ impl Tensor { std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } } - -impl Drop for Tensor { - fn drop(&mut self) { - if self.get_backend() != crate::Backend::Cpu { - // if the tensor is not on the cpu, free it from the accelerator - crate::accelerator_free_tensor(self) - } - } -} diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 84922ab5..ce9551f2 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -650,8 +650,10 @@ impl InferenceSession { impl Drop for InferenceSession { fn drop(&mut self) { - //if we are using an accelerator, we need to free the scratch 
memory + //if we are using an accelerator, we need to free the scratch memory and the k/v memory ggml::accelerator_free_scratch(); + ggml::accelerator_free_tensor(&self.memory_k); + ggml::accelerator_free_tensor(&self.memory_v); } } From dfaf7ae453cb858d05f6ca7cce82b25fed0212f8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:34:06 +0200 Subject: [PATCH 16/28] Explicitly drop model weights --- crates/llm/examples/drop_multiple_sessions.rs | 61 +++++++++++++++++++ crates/models/llama/src/lib.rs | 16 ++++- 2 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 crates/llm/examples/drop_multiple_sessions.rs diff --git a/crates/llm/examples/drop_multiple_sessions.rs b/crates/llm/examples/drop_multiple_sessions.rs new file mode 100644 index 00000000..44e18676 --- /dev/null +++ b/crates/llm/examples/drop_multiple_sessions.rs @@ -0,0 +1,61 @@ +use llm::ModelArchitecture; +use llm_base::{InferenceFeedback, InferenceParameters, ModelParameters}; +use std::{convert::Infallible, path::PathBuf}; + +fn main() { + let prompt = "What is the meaning of life?"; + let model_path = PathBuf::from(r"C:\Users\lkreu\Downloads\orca-mini-v2_7b.ggmlv3.q5_K_M.bin"); + let now = std::time::Instant::now(); + + let model = llm::load_dynamic( + Some(ModelArchitecture::Llama), + &model_path, + llm_base::TokenizerSource::Embedded, + ModelParameters { + use_gpu: true, + ..Default::default() + }, + llm::load_progress_callback_stdout, + ) + .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); + + println!( + "Model fully loaded! Elapsed: {}ms", + now.elapsed().as_millis() + ); + + for i in 0..10 { + println!("Starting session {i}"); + let mut session = model.start_session(Default::default()); + session + .feed_prompt( + model.as_ref(), + &InferenceParameters::default(), + prompt, + &mut Default::default(), + |_| Ok::(llm::InferenceFeedback::Continue), + ) + .unwrap(); + drop(session); + println!("Dropped session {i}"); + } + + drop(model); + + println!("Model dropped! Elapsed: {}ms", now.elapsed().as_millis()); + + for _ in 0..5 { + let model = llm::load_dynamic( + Some(ModelArchitecture::Llama), + &model_path, + llm_base::TokenizerSource::Embedded, + ModelParameters { + use_gpu: true, + ..Default::default() + }, + llm::load_progress_callback_stdout, + ) + .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); + drop(model); + } +} diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 8e5052bd..91822ec3 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -1,7 +1,7 @@ //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. 
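Taken together, the two patches above settle on explicit cleanup rather than a blanket Drop impl on Tensor: accelerator-side scratch memory and any offloaded tensors are freed when their owner goes away. A minimal sketch of that pattern, assuming the accelerator_free_scratch and accelerator_free_tensor helpers added in crates/ggml/src/lib.rs; GpuSession and its fields are placeholder names, not part of the patches.

struct GpuSession {
    memory_k: ggml::Tensor,
    memory_v: ggml::Tensor,
}

impl Drop for GpuSession {
    fn drop(&mut self) {
        // Free the CUDA scratch buffers first (a no-op unless built with the `cublas` feature)...
        ggml::accelerator_free_scratch();
        // ...then release the tensors that were offloaded to the accelerator.
        ggml::accelerator_free_tensor(&self.memory_k);
        ggml::accelerator_free_tensor(&self.memory_v);
    }
}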
#![deny(missing_docs)] -use std::{error::Error, sync::Arc}; +use std::{collections::HashMap, error::Error, sync::Arc}; use llm_base::{ ggml::{self, Backend}, @@ -34,11 +34,22 @@ pub struct Llama { // must be kept alive for the model context: Arc, + loaded_tensors: HashMap, } unsafe impl Send for Llama {} unsafe impl Sync for Llama {} +impl Drop for Llama { + fn drop(&mut self) { + for (_, tensor) in self.loaded_tensors.drain() { + if tensor.get_backend() != Backend::Cpu { + ggml::accelerator_free_tensor(&tensor); + } + } + } +} + impl KnownModel for Llama { type Hyperparameters = Hyperparameters; @@ -85,7 +96,7 @@ impl KnownModel for Llama { }; layers.push(layer); } - let (context, _tensors) = tl.finish(); + let (context, loaded_tensors) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -99,6 +110,7 @@ impl KnownModel for Llama { output, layers, context: Arc::new(context), + loaded_tensors, }) } From 53095b1c2b7068708c6852e44e5d42c9f05b6554 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Wed, 12 Jul 2023 12:25:52 +0200 Subject: [PATCH 17/28] Update `llama.cpp` and integrate graph "planning" --- binaries/llm-cli/src/cli_args.rs | 2 +- binaries/llm-cli/src/main.rs | 13 +- binaries/llm-test/src/delete.rs | 2 +- binaries/llm-test/src/inference.rs | 17 +- binaries/llm-test/src/tokens.rs | 4 +- crates/ggml/src/context.rs | 9 +- crates/ggml/src/lib.rs | 59 +++++- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/lib.rs | 196 +++++++++++------- crates/ggml/sys/src/metal.rs | 5 +- crates/llm-base/src/inference_session.rs | 29 ++- crates/llm-base/src/lib.rs | 16 -- crates/llm-base/src/lora.rs | 13 +- crates/llm-base/src/model/mod.rs | 13 +- crates/llm/examples/drop_multiple_sessions.rs | 10 +- crates/llm/examples/embeddings.rs | 7 +- crates/llm/examples/vicuna-chat.rs | 1 - crates/models/bloom/src/lib.rs | 8 +- crates/models/falcon/src/lib.rs | 8 +- crates/models/gpt2/src/lib.rs | 8 +- crates/models/gptj/src/lib.rs | 8 +- crates/models/gptneox/src/lib.rs | 8 +- crates/models/llama/src/lib.rs | 17 +- crates/models/mpt/src/lib.rs | 8 +- 24 files changed, 251 insertions(+), 212 deletions(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index bf8af97b..2c5ba985 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -336,6 +336,7 @@ impl Generate { memory_v_type: mem_typ, use_gpu: self.use_gpu, n_batch: self.batch_size, + n_threads: self.num_threads(), } } @@ -349,7 +350,6 @@ impl Generate { pub fn inference_parameters(&self, eot: llm::TokenId) -> InferenceParameters { InferenceParameters { - n_threads: self.num_threads(), sampler: Arc::new(llm::samplers::TopPTopK { top_k: self.top_k, top_p: self.top_p, diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 9cd8cb84..9815225a 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -111,16 +111,10 @@ fn perplexity(args: &cli_args::Perplexity) -> Result<()> { let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, _) = snapshot::read_or_create_session(model.as_ref(), None, None, inference_session_config); - let parameters = args.generate.inference_parameters(model.eot_token_id()); - session.perplexity( - model.as_ref(), - ¶meters, - prompt.as_str(), - |chunk, perplexity| { - println!("Perplexity[{chunk}]: {perplexity}"); - }, - )?; + session.perplexity(model.as_ref(), prompt.as_str(), |chunk, perplexity| { + println!("Perplexity[{chunk}]: 
{perplexity}"); + })?; Ok(()) } @@ -273,7 +267,6 @@ fn interactive( let sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None); if let Err(InferenceError::ContextFull) = session.feed_prompt( model.as_ref(), - ¶meters, &prompt, // OutputRequest &mut Default::default(), diff --git a/binaries/llm-test/src/delete.rs b/binaries/llm-test/src/delete.rs index 9ddbe7a8..7bcf81df 100644 --- a/binaries/llm-test/src/delete.rs +++ b/binaries/llm-test/src/delete.rs @@ -64,7 +64,7 @@ fn feed_prompt( model: &impl Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { - session.feed_prompt(model, &Default::default(), prompt, output, always_continue) + session.feed_prompt(model, prompt, output, always_continue) } fn always_continue(_: &[u8]) -> Result { diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs index 95ce02e7..5190bb9e 100644 --- a/binaries/llm-test/src/inference.rs +++ b/binaries/llm-test/src/inference.rs @@ -4,7 +4,7 @@ use std::{convert::Infallible, sync::Arc}; -use llm::InferenceStats; +use llm::{InferenceSessionConfig, InferenceStats}; use crate::{ModelConfig, TestCaseReport, TestCaseReportInner, TestCaseReportMeta}; @@ -15,14 +15,11 @@ pub(crate) fn can_infer( expected_output: Option<&str>, maximum_token_count: usize, ) -> anyhow::Result { - let mut session = model.start_session(Default::default()); - let (actual_output, res) = run_inference( - model, - model_config, - &mut session, - input, - maximum_token_count, - ); + let mut session = model.start_session(InferenceSessionConfig { + n_threads: model_config.threads, + ..Default::default() + }); + let (actual_output, res) = run_inference(model, &mut session, input, maximum_token_count); // Process the results Ok(TestCaseReport { @@ -58,7 +55,6 @@ pub(crate) fn can_infer( fn run_inference( model: &dyn llm::Model, - model_config: &ModelConfig, session: &mut llm::InferenceSession, input: &str, maximum_token_count: usize, @@ -70,7 +66,6 @@ fn run_inference( &llm::InferenceRequest { prompt: input.into(), parameters: &llm::InferenceParameters { - n_threads: model_config.threads, sampler: Arc::new(DeterministicSampler), }, play_back_previous_tokens: false, diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs index 260546b8..adddd678 100644 --- a/binaries/llm-test/src/tokens.rs +++ b/binaries/llm-test/src/tokens.rs @@ -65,9 +65,7 @@ fn feed_prompt( model: &impl Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { - session.feed_prompt(model, &Default::default(), prompt, output, |x| { - always_continue(x) - }) + session.feed_prompt(model, prompt, output, always_continue) } fn always_continue(_: &[u8]) -> Result { diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index d4e7c66c..cb7aa21b 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -2,7 +2,7 @@ use std::{os::raw::c_int, ptr::NonNull, sync::Arc}; use memmap2::Mmap; -use crate::{sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, Tensor, Type}; +use crate::{sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type}; /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via `ggml_free`. @@ -442,13 +442,6 @@ impl Context { self.new_tensor_raw(tensor) } - /// Computes the specified graph. Must be run in order to evaluate the graph. 
- pub fn graph_compute(&self, graph: &mut ComputationGraph) { - unsafe { - sys::ggml_graph_compute(self.ptr.as_ptr(), &mut graph.inner); - } - } - /// Retrieves the memory used by this [Context]. pub fn used_mem(&self) -> usize { unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 33f0cc03..3013254f 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -3,7 +3,7 @@ //! It exposes a subset of operations (currently used to implement the [llm](https://crates.io/crates/llm) library). //! Note that it does not expose a fully-idiomatic safe Rust interface; operations that could be potentially unsafe are marked as such. //! -//! `ggml` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. +//! `ggml` operates on a computational graph; no values will be computed until the [Context] is executed via an [GraphExecutionPlan]. //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. #![deny(missing_docs)] @@ -221,6 +221,8 @@ pub enum Type { F16, /// Float 32-bit. F32, + /// Integer 8-bit. + I8, } impl From for sys::ggml_type { fn from(t: Type) -> Self { @@ -239,6 +241,7 @@ impl From for sys::ggml_type { Type::I32 => sys::ggml_type_GGML_TYPE_I32, Type::F16 => sys::ggml_type_GGML_TYPE_F16, Type::F32 => sys::ggml_type_GGML_TYPE_F32, + Type::I8 => sys::ggml_type_GGML_TYPE_I8, } } } @@ -260,6 +263,7 @@ impl TryFrom for Type { sys::ggml_type_GGML_TYPE_I32 => Ok(Type::I32), sys::ggml_type_GGML_TYPE_F16 => Ok(Type::F16), sys::ggml_type_GGML_TYPE_F32 => Ok(Type::F32), + sys::ggml_type_GGML_TYPE_I8 => Ok(Type::I8), _ => Err(()), } @@ -282,6 +286,7 @@ impl std::fmt::Display for Type { Type::I32 => write!(f, "i32"), Type::F16 => write!(f, "f16"), Type::F32 => write!(f, "f32"), + Type::I8 => write!(f, "i8"), } } } @@ -303,6 +308,7 @@ impl Type { Type::I32 => false, Type::F16 => false, Type::F32 => false, + Type::I8 => false, } } } @@ -351,10 +357,9 @@ pub struct ComputationGraph { impl ComputationGraph { /// Create a new [ComputationGraph] with the specified `n_threads`. - pub fn new(n_threads: usize) -> Self { + pub fn new() -> Self { Self { inner: sys::ggml_cgraph { - n_threads: usize_to_i32(n_threads), // SAFETY: This should be safe to zero. The original C++ impl // just leaves it uninitialized ..unsafe { std::mem::zeroed::() } @@ -368,6 +373,54 @@ impl ComputationGraph { } } +impl Default for ComputationGraph { + fn default() -> Self { + Self::new() + } +} + +/// A `ggml` execution plan. Contains the information needed to execute a computation graph. +pub struct GraphExecutionPlan { + inner: sys::ggml_cplan, + inner_graph: sys::ggml_cgraph, +} + +impl GraphExecutionPlan { + /// Create a new [GraphExecutionPlan] from a [ComputationGraph] and the number of threads to use. + pub fn new(graph: &mut ComputationGraph, n_threads: usize) -> Self { + Self { + inner: unsafe { sys::ggml_graph_plan(&mut graph.inner, usize_to_i32(n_threads)) }, + inner_graph: graph.inner, + } + } + + /// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context]. + fn create_work_buffer(&mut self, context: &Context) -> Tensor { + context.new_tensor_1d(Type::I8, self.inner.work_size) + } + + /// Assign a work buffer to this [GraphExecutionPlan]. 
+ fn assign_work_buffer(&mut self, buffer: &mut Tensor) { + assert!( + buffer.get_type() == Type::I8, + "Work buffer must be of type i8" + ); + unsafe { + self.inner.work_data = buffer.data().cast(); + } + } + + /// Execute this [GraphExecutionPlan] in the given [Context]. + pub fn execute(&mut self, context: &Context) { + let mut work_buffer = self.create_work_buffer(context); + self.assign_work_buffer(&mut work_buffer); + + unsafe { + sys::ggml_graph_compute(&mut self.inner_graph, &mut self.inner); + } + } +} + /// The size of `t` as bytes. pub fn type_size(t: Type) -> usize { unsafe { sys::ggml_type_size(t.into()) } diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 481f793a..2b5eb72e 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc +Subproject commit 2b5eb72e109577ed84e44bb8fa47e4956f337300 diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index ce79bcc3..a9779fd1 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -21,9 +21,11 @@ pub const GGML_MAX_DIMS: u32 = 4; pub const GGML_MAX_NODES: u32 = 4096; pub const GGML_MAX_PARAMS: u32 = 256; pub const GGML_MAX_CONTEXTS: u32 = 64; -pub const GGML_MAX_OPT: u32 = 4; +pub const GGML_MAX_SRC: u32 = 6; pub const GGML_MAX_NAME: u32 = 48; pub const GGML_DEFAULT_N_THREADS: u32 = 4; +pub const GGML_EXIT_SUCCESS: u32 = 0; +pub const GGML_EXIT_ABORTED: u32 = 1; pub const QK_K: u32 = 256; pub const K_SCALE_SIZE: u32 = 12; pub type ggml_fp16_t = u16; @@ -225,17 +227,14 @@ pub struct ggml_tensor { pub op: ggml_op, pub is_param: bool, pub grad: *mut ggml_tensor, - pub src0: *mut ggml_tensor, - pub src1: *mut ggml_tensor, - pub opt: [*mut ggml_tensor; 4usize], - pub n_tasks: ::std::os::raw::c_int, + pub src: [*mut ggml_tensor; 6usize], pub perf_runs: ::std::os::raw::c_int, pub perf_cycles: i64, pub perf_time_us: i64, pub data: *mut ::std::os::raw::c_void, pub name: [::std::os::raw::c_char; 48usize], pub extra: *mut ::std::os::raw::c_void, - pub padding: [::std::os::raw::c_char; 4usize], + pub padding: [::std::os::raw::c_char; 8usize], } #[test] fn bindgen_test_layout_ggml_tensor() { @@ -332,48 +331,18 @@ fn bindgen_test_layout_ggml_tensor() { ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).src0) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).src) as usize - ptr as usize }, 96usize, concat!( "Offset of field: ", stringify!(ggml_tensor), "::", - stringify!(src0) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).src1) as usize - ptr as usize }, - 104usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(src1) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).opt) as usize - ptr as usize }, - 112usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(opt) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize }, - 144usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(n_tasks) + stringify!(src) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, - 148usize, + 144usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -445,86 +414,139 @@ fn bindgen_test_layout_ggml_tensor() { pub const GGML_TENSOR_SIZE: usize = 240; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct ggml_cgraph { - pub n_nodes: ::std::os::raw::c_int, - pub n_leafs: ::std::os::raw::c_int, - pub 
n_threads: ::std::os::raw::c_int, +pub struct ggml_cplan { pub work_size: usize, - pub work: *mut ggml_tensor, - pub nodes: [*mut ggml_tensor; 4096usize], - pub grads: [*mut ggml_tensor; 4096usize], - pub leafs: [*mut ggml_tensor; 4096usize], - pub perf_runs: ::std::os::raw::c_int, - pub perf_cycles: i64, - pub perf_time_us: i64, + pub work_data: *mut u8, + pub n_threads: ::std::os::raw::c_int, + pub n_tasks: [::std::os::raw::c_int; 4096usize], + pub abort_callback: + ::std::option::Option bool>, + pub abort_callback_data: *mut ::std::os::raw::c_void, } #[test] -fn bindgen_test_layout_ggml_cgraph() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); +fn bindgen_test_layout_ggml_cplan() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( - ::std::mem::size_of::(), - 98360usize, - concat!("Size of: ", stringify!(ggml_cgraph)) + ::std::mem::size_of::(), + 16424usize, + concat!("Size of: ", stringify!(ggml_cplan)) ); assert_eq!( - ::std::mem::align_of::(), + ::std::mem::align_of::(), 8usize, - concat!("Alignment of ", stringify!(ggml_cgraph)) + concat!("Alignment of ", stringify!(ggml_cplan)) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).work_size) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", - stringify!(ggml_cgraph), + stringify!(ggml_cplan), "::", - stringify!(n_nodes) + stringify!(work_size) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize }, - 4usize, + unsafe { ::std::ptr::addr_of!((*ptr).work_data) as usize - ptr as usize }, + 8usize, concat!( "Offset of field: ", - stringify!(ggml_cgraph), + stringify!(ggml_cplan), "::", - stringify!(n_leafs) + stringify!(work_data) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize }, - 8usize, + 16usize, concat!( "Offset of field: ", - stringify!(ggml_cgraph), + stringify!(ggml_cplan), "::", stringify!(n_threads) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).work_size) as usize - ptr as usize }, - 16usize, + unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize }, + 20usize, + concat!( + "Offset of field: ", + stringify!(ggml_cplan), + "::", + stringify!(n_tasks) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).abort_callback) as usize - ptr as usize }, + 16408usize, + concat!( + "Offset of field: ", + stringify!(ggml_cplan), + "::", + stringify!(abort_callback) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).abort_callback_data) as usize - ptr as usize }, + 16416usize, + concat!( + "Offset of field: ", + stringify!(ggml_cplan), + "::", + stringify!(abort_callback_data) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_cgraph { + pub n_nodes: ::std::os::raw::c_int, + pub n_leafs: ::std::os::raw::c_int, + pub nodes: [*mut ggml_tensor; 4096usize], + pub grads: [*mut ggml_tensor; 4096usize], + pub leafs: [*mut ggml_tensor; 4096usize], + pub perf_runs: ::std::os::raw::c_int, + pub perf_cycles: i64, + pub perf_time_us: i64, +} +#[test] +fn bindgen_test_layout_ggml_cgraph() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 98336usize, + concat!("Size of: ", stringify!(ggml_cgraph)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_cgraph)) + ); + assert_eq!( + 
unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize }, + 0usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), "::", - stringify!(work_size) + stringify!(n_nodes) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).work) as usize - ptr as usize }, - 24usize, + unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize }, + 4usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), "::", - stringify!(work) + stringify!(n_leafs) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).nodes) as usize - ptr as usize }, - 32usize, + 8usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -534,7 +556,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).grads) as usize - ptr as usize }, - 32800usize, + 32776usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -544,7 +566,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).leafs) as usize - ptr as usize }, - 65568usize, + 65544usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -554,7 +576,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, - 98336usize, + 98312usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -564,7 +586,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize }, - 98344usize, + 98320usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -574,7 +596,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize }, - 98352usize, + 98328usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -1715,11 +1737,27 @@ extern "C" { ) -> ggml_cgraph; } extern "C" { - pub fn ggml_graph_compute(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph); + pub fn ggml_graph_plan( + cgraph: *mut ggml_cgraph, + n_threads: ::std::os::raw::c_int, + ) -> ggml_cplan; +} +extern "C" { + pub fn ggml_graph_compute( + cgraph: *mut ggml_cgraph, + cplan: *mut ggml_cplan, + ) -> ::std::os::raw::c_int; } extern "C" { pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph); } +extern "C" { + pub fn ggml_graph_compute_with_ctx( + ctx: *mut ggml_context, + cgraph: *mut ggml_cgraph, + n_threads: ::std::os::raw::c_int, + ); +} extern "C" { pub fn ggml_graph_get_tensor( cgraph: *mut ggml_cgraph, diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs index 95b427c0..37d97e68 100644 --- a/crates/ggml/sys/src/metal.rs +++ b/crates/ggml/sys/src/metal.rs @@ -17,11 +17,14 @@ pub struct ggml_metal_context { _unused: [u8; 0], } extern "C" { - pub fn ggml_metal_init() -> *mut ggml_metal_context; + pub fn ggml_metal_init(n_cb: ::std::os::raw::c_int) -> *mut ggml_metal_context; } extern "C" { pub fn ggml_metal_free(ctx: *mut ggml_metal_context); } +extern "C" { + pub fn ggml_metal_set_n_cb(ctx: *mut ggml_metal_context, n_cb: ::std::os::raw::c_int); +} extern "C" { pub fn ggml_metal_add_buffer( ctx: *mut ggml_metal_context, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index ce9551f2..6e558d06 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,4 +1,4 @@ -use ggml::{Buffer, ComputationGraph, Context, Tensor}; +use ggml::{Buffer, ComputationGraph, Context, GraphExecutionPlan, Tensor}; use serde::Serialize; use std::{cell::RefCell, fmt::Display, 
sync::Arc}; use thiserror::Error; @@ -280,7 +280,8 @@ impl InferenceSession { } #[cfg(not(feature = "metal"))] { - ctx0.graph_compute(&mut built_gf); + let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads); + plan.execute(ctx0); } // Adjust the required memory per token if we didn't know that already @@ -302,7 +303,6 @@ impl InferenceSession { pub fn feed_prompt<'a, E: std::error::Error + 'static, P: Into>>( &mut self, model: &dyn Model, - params: &InferenceParameters, prompt: P, output_request: &mut OutputRequest, mut callback: impl FnMut(&[u8]) -> Result, @@ -317,7 +317,7 @@ impl InferenceSession { } for batch in prompt_tokens.chunks(self.config.n_batch) { - model.evaluate(self, params, batch, output_request); + model.evaluate(self, batch, output_request); for &tk in batch { let should_call_callback = Some(tk) != model.bot_token_id(); @@ -397,7 +397,7 @@ impl InferenceSession { self.tokens.push(next_token); // Then, evaluate the network again to compute the new last_logits - model.evaluate(self, params, &[next_token], output_request); + model.evaluate(self, &[next_token], output_request); // Return the next token if next_token as TokenId == model.eot_token_id() { @@ -460,7 +460,6 @@ impl InferenceSession { // context window with new data. self.feed_prompt( model, - parameters, request.prompt, output_request, feed_prompt_callback(&mut callback), @@ -509,7 +508,6 @@ impl InferenceSession { pub fn perplexity<'a, P: Into>>( &mut self, model: &dyn Model, - parameters: &InferenceParameters, prompt: P, mut perplexity_callback: impl FnMut(usize, f32), ) -> Result<(), TokenizationError> { @@ -554,7 +552,6 @@ impl InferenceSession { model.evaluate( self, - parameters, &tokens[batch_start..batch_start + batch_size], &mut output_request, ); @@ -804,6 +801,21 @@ pub struct InferenceSessionConfig { /// /// A reasonable default value is 8. pub n_batch: usize, + /// The number of threads to use. This is dependent on your user's system, + /// and should be selected accordingly. + /// + /// Note that you should aim for a value close to the number of physical cores + /// on the system, as this will give the best performance. This means that, for + /// example, on a 16-core system with hyperthreading, you should set this to 16. + /// + /// Also note that not all cores on a system are equal, and that you may need to + /// experiment with this value to find the optimal value for your use case. For example, + /// Apple Silicon and modern Intel processors have "performance" and "efficiency" cores, + /// and you may want to only use the performance cores. + /// + /// A reasonable default value is 8, as most modern high-performance computers have + /// 8 physical cores. Adjust to your needs. + pub n_threads: usize, } impl Default for InferenceSessionConfig { @@ -813,6 +825,7 @@ impl Default for InferenceSessionConfig { memory_v_type: ModelKVMemoryType::Float16, use_gpu: false, n_batch: 8, + n_threads: 8, } } } diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index b08e62cd..e7607888 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -49,21 +49,6 @@ pub use util::TokenUtf8Buffer; /// This needs to be provided during all inference calls, /// but can be changed between calls. pub struct InferenceParameters { - /// The number of threads to use. This is dependent on your user's system, - /// and should be selected accordingly. 
- /// - /// Note that you should aim for a value close to the number of physical cores - /// on the system, as this will give the best performance. This means that, for - /// example, on a 16-core system with hyperthreading, you should set this to 16. - /// - /// Also note that not all cores on a system are equal, and that you may need to - /// experiment with this value to find the optimal value for your use case. For example, - /// Apple Silicon and modern Intel processors have "performance" and "efficiency" cores, - /// and you may want to only use the performance cores. - /// - /// A reasonable default value is 8, as most modern high-performance computers have - /// 8 physical cores. Adjust to your needs. - pub n_threads: usize, /// The sampler to use for sampling tokens from the model's probabilities. /// /// Each time the model runs, it generates a distribution of probabilities; each token @@ -83,7 +68,6 @@ unsafe impl Sync for InferenceParameters {} impl Default for InferenceParameters { fn default() -> Self { Self { - n_threads: 8, sampler: Arc::new(samplers::TopPTopK::default()), } } diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index 6c378b25..8cdc2c88 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -3,7 +3,7 @@ use crate::{ LoadError, }; -use ggml::format::TensorLoadInfo; +use ggml::{format::TensorLoadInfo, GraphExecutionPlan}; use std::{ collections::{HashMap, HashSet}, fs::File, @@ -112,9 +112,9 @@ impl LoraAdapter { let a = patch_file.get_tensor(&a_info)?; let b = patch_file.get_tensor(&b_info)?; - // Build a ggml context and apply the patch - // TODO: maybe pass the model's thread count to this context - let mut gf = ggml::ComputationGraph::new(8); + //Build a ggml context and apply the patch + + let mut gf = ggml::ComputationGraph::new(); // LoRA formula: w = w + ba*s let mut ba = patch_context.op_mul_mat(&a, &b); @@ -126,7 +126,10 @@ impl LoraAdapter { // Compute the graph gf.build_forward_expand(&output); - patch_context.graph_compute(&mut gf); + + //TODO: maybe pass the model's thread count to this context + let mut plan = GraphExecutionPlan::new(&mut gf, 8); + plan.execute(&patch_context); // Overwrite the original tensor. // The `output` and the `target_tensor` are not from the same context, diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index fd5d01bc..c8799cdc 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -11,8 +11,8 @@ use regex::Regex; use thiserror::Error; use crate::{ - loader::TensorLoader, tokenizer::TokenId, FileType, InferenceParameters, InferenceSession, - InferenceSessionConfig, LoadError, LoadProgress, Tokenizer, TokenizerSource, + loader::TensorLoader, tokenizer::TokenId, FileType, InferenceSession, InferenceSessionConfig, + LoadError, LoadProgress, Tokenizer, TokenizerSource, }; /// Common functions for model evaluation @@ -54,13 +54,12 @@ pub trait KnownModel: Send + Sync { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession; /// This function is called by the provided [InferenceSession]; it will use this model - /// and the [InferenceParameters] to generate output by evaluating the `input_tokens`. + /// to generate output by evaluating the `input_tokens`. /// The [OutputRequest] is used to specify additional data to fetch from the /// model. 
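The removal of Context::graph_compute follows from upstream ggml splitting graph construction from execution: a ggml_cplan now carries the thread count and a work buffer, wrapped here as GraphExecutionPlan. A minimal sketch of the new calling convention, as used in lora.rs above; ctx and result are placeholder names for an existing ggml Context and the tensor whose value should be computed.

let mut graph = ggml::ComputationGraph::new();
graph.build_forward_expand(&result);

// The plan captures the thread count; execute() allocates an i8 work buffer
// in the context and then runs ggml_graph_compute with it.
let mut plan = ggml::GraphExecutionPlan::new(&mut graph, 8);
plan.execute(&ctx);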
fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ); @@ -101,13 +100,12 @@ pub trait Model: Send + Sync { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession; /// This function is called by the provided [InferenceSession]; it will use this model - /// and the [InferenceParameters] to generate output by evaluating the `input_tokens`. + /// to generate output by evaluating the `input_tokens`. /// The [OutputRequest] is used to specify additional data to fetch from the /// model. fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ); @@ -136,11 +134,10 @@ impl> Model for M { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - KnownModel::evaluate(self, session, params, input_tokens, output_request) + KnownModel::evaluate(self, session, input_tokens, output_request) } fn tokenizer(&self) -> &Tokenizer { diff --git a/crates/llm/examples/drop_multiple_sessions.rs b/crates/llm/examples/drop_multiple_sessions.rs index 44e18676..219e2a6d 100644 --- a/crates/llm/examples/drop_multiple_sessions.rs +++ b/crates/llm/examples/drop_multiple_sessions.rs @@ -28,13 +28,9 @@ fn main() { println!("Starting session {i}"); let mut session = model.start_session(Default::default()); session - .feed_prompt( - model.as_ref(), - &InferenceParameters::default(), - prompt, - &mut Default::default(), - |_| Ok::(llm::InferenceFeedback::Continue), - ) + .feed_prompt(model.as_ref(), prompt, &mut Default::default(), |_| { + Ok::(llm::InferenceFeedback::Continue) + }) .unwrap(); drop(session); println!("Dropped session {i}"); diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index 0a6a999a..427c9e48 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -125,12 +125,7 @@ fn get_embeddings( .iter() .map(|(_, tok)| *tok) .collect::>(); - model.evaluate( - &mut session, - inference_parameters, - &query_token_ids, - &mut output_request, - ); + model.evaluate(&mut session, &query_token_ids, &mut output_request); output_request.embeddings.unwrap() } diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index 7cdeb1d1..6b96d756 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -57,7 +57,6 @@ fn main() { session .feed_prompt( model.as_ref(), - &inference_parameters, format!("{persona}\n{history}").as_str(), &mut Default::default(), llm::feed_prompt_callback(|resp| match resp { diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index ce781fc8..3f29c1ed 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -7,8 +7,8 @@ use std::sync::Arc; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The BLOOM model. 
Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) @@ -121,13 +121,11 @@ impl KnownModel for Bloom { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -153,7 +151,7 @@ impl KnownModel for Bloom { input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_bias, &input_layer), &input_layer); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 9f44b055..15a77527 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -13,8 +13,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) @@ -111,13 +111,11 @@ impl KnownModel for Falcon { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -150,7 +148,7 @@ impl KnownModel for Falcon { let memory_v = builder.memory_v; let memory_v_size = memory_v.element_size(); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); let mut current: Tensor; let mut layernorm_output: Tensor; diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index 0387df1a..e19264b1 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -7,8 +7,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The GPT-2 model. 
Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) @@ -117,13 +117,11 @@ impl KnownModel for Gpt2 { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -152,7 +150,7 @@ impl KnownModel for Gpt2 { &ctx0.op_get_rows(&self.wpe, &position), ); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { ctx0.use_scratch(builder.get_scratch(0)); diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 5c7a0897..85864fce 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -7,8 +7,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-J model. Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) @@ -112,13 +112,11 @@ impl KnownModel for GptJ { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -140,7 +138,7 @@ impl KnownModel for GptJ { let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { // norm let mut current = ctx0.op_norm(&input_layer); diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 80d0b7b8..12fb42a0 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -8,8 +8,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-NeoX model. 
Ref: [GitHub](https://github.com/EleutherAI/gpt-neox) @@ -127,13 +127,11 @@ impl KnownModel for GptNeoX { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let n = input_tokens.len(); let n_past = session.n_past; - let n_threads = params.n_threads; let n_ctx = self.context_size; let Hyperparameters { @@ -155,7 +153,7 @@ impl KnownModel for GptNeoX { builder.memory_v.element_size(), ); - let mut gf = ggml::ComputationGraph::new(n_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { // attention uses first scratch buffer diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 91822ec3..f612d84f 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -6,8 +6,8 @@ use std::{collections::HashMap, error::Error, sync::Arc}; use llm_base::{ ggml::{self, Backend}, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The LLaMA model. Ref: [Introducing LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) @@ -128,13 +128,11 @@ impl KnownModel for Llama { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -153,15 +151,8 @@ impl KnownModel for Llama { let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - let mut gf = ggml::ComputationGraph::new( - if input_len >= 32 && ggml::cpu_has_blas() && !ggml::cpu_has_gpublas() { - 1 - } else { - num_threads - }, - ); + let mut gf = ggml::ComputationGraph::new(); + for il in 0..n_layer { //TODO: find a better way to do this if self.model_params.should_offload(il) { diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 69484bcd..fa66eac2 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -7,8 +7,8 @@ use ggml::Tensor; use llm_base::{ ggml::{self}, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The MosaicML Pretrained Transformer (MPT) model. 
Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) @@ -98,13 +98,11 @@ impl KnownModel for Mpt { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let n = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -128,7 +126,7 @@ impl KnownModel for Mpt { let f32_size = std::mem::size_of::(); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { // attention uses first scratch buffer ctx0.use_scratch(builder.get_scratch(0)); From 351a0f53f085653225c4c8721f431aec4fcbcc08 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Wed, 12 Jul 2023 14:17:31 +0200 Subject: [PATCH 18/28] potentially fixed metal --- crates/ggml/src/metal.rs | 10 ++-------- crates/llm-base/src/inference_session.rs | 8 +++++--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/ggml/src/metal.rs b/crates/ggml/src/metal.rs index a86e9273..cbfdbcc6 100644 --- a/crates/ggml/src/metal.rs +++ b/crates/ggml/src/metal.rs @@ -14,8 +14,8 @@ pub struct MetalContext { impl MetalContext { /// Create a new Metal context - pub fn new() -> Self { - let raw = unsafe { metal::ggml_metal_init() }; + pub fn new(n_threads: usize) -> Self { + let raw = unsafe { metal::ggml_metal_init(n_threads.try_into().unwrap()) }; MetalContext { contexts: vec![], @@ -76,12 +76,6 @@ impl MetalContext { } } -impl Default for MetalContext { - fn default() -> Self { - Self::new() - } -} - impl MetalContext { /// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before. 
     fn ref_context(&mut self, context: Arc) -> bool {
diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 6e558d06..2998f0e3 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -192,7 +192,7 @@ impl InferenceSession {
         #[cfg(feature = "metal")]
         let metal_context = {
             if config.use_gpu {
-                let mut metal_context = MetalContext::new();
+                let mut metal_context = MetalContext::new(config.n_threads);
                 metal_context.add_scratch_buffer(ctx0.buffer.as_ref().unwrap());
 
                 for buf in scratch.iter() {
@@ -272,10 +272,12 @@ impl InferenceSession {
                     metal_context.graph_compute(&mut built_gf);
                     metal_context.get_tensor(&built_result.result);
                 } else {
-                    ctx0.graph_compute(&mut built_gf);
+                    let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
+                    plan.execute(ctx0);
                 }
             } else {
-                ctx0.graph_compute(&mut built_gf);
+                let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
+                plan.execute(ctx0);
             }
         }
         #[cfg(not(feature = "metal"))]

From a8986b89b7a34126d2d4201a9f1931b09c8314dc Mon Sep 17 00:00:00 2001
From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com>
Date: Thu, 13 Jul 2023 18:06:03 +0200
Subject: [PATCH 19/28] Sync latest `llama.cpp`

---
 crates/ggml/sys/llama-cpp  |  2 +-
 crates/ggml/sys/src/lib.rs | 55 +++++++++++++++++++++++++++---------
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index 2b5eb72e..32c54116 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit 2b5eb72e109577ed84e44bb8fa47e4956f337300
+Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36
diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs
index a9779fd1..f3d9192d 100644
--- a/crates/ggml/sys/src/lib.rs
+++ b/crates/ggml/sys/src/lib.rs
@@ -138,19 +138,21 @@ pub const ggml_op_GGML_OP_ALIBI: ggml_op = 50;
 pub const ggml_op_GGML_OP_CLAMP: ggml_op = 51;
 pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 52;
 pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 53;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 54;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 55;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 56;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 57;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 58;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 59;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 60;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 61;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 62;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 63;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 64;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 65;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 66;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 54;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 55;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 56;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 57;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 58;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 59;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 60;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 61;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 62;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
+pub const
ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 68; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -1554,6 +1556,33 @@ extern "C" { d: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } +pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0; +pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1; +pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2; +pub type ggml_op_pool = ::std::os::raw::c_int; +extern "C" { + pub fn ggml_pool_1d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + op: ggml_op_pool, + k0: ::std::os::raw::c_int, + s0: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_pool_2d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + op: ggml_op_pool, + k0: ::std::os::raw::c_int, + k1: ::std::os::raw::c_int, + s0: ::std::os::raw::c_int, + s1: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + p1: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_flash_attn( ctx: *mut ggml_context, From cea02f2a3feb84887003cdedd50f558d843ea041 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 15 Jul 2023 16:10:02 +0200 Subject: [PATCH 20/28] Review fixes --- crates/ggml/src/context.rs | 29 +++-- crates/ggml/src/lib.rs | 21 ++-- crates/ggml/src/tensor.rs | 64 +++++++++-- crates/llm-base/src/inference_session.rs | 41 +++---- crates/llm-base/src/loader.rs | 22 +--- crates/llm-base/src/model/mod.rs | 11 +- crates/llm/examples/drop_multiple_sessions.rs | 57 ---------- crates/models/llama/src/lib.rs | 105 ++++++++---------- 8 files changed, 174 insertions(+), 176 deletions(-) delete mode 100644 crates/llm/examples/drop_multiple_sessions.rs diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index cb7aa21b..4a37f5d6 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -1,4 +1,9 @@ -use std::{os::raw::c_int, ptr::NonNull, sync::Arc}; +use std::{ + collections::HashMap, + os::raw::c_int, + ptr::NonNull, + sync::{Arc, Mutex}, +}; use memmap2::Mmap; @@ -21,6 +26,9 @@ pub struct Context { /// Whether the context can offload tensors to the GPU pub can_offload: bool, + + /// Offloaded tensors + offloaded_tensors: Arc>>, } impl Context { @@ -39,6 +47,7 @@ impl Context { mmap: None, buffer: Some(buffer), can_offload: false, + offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -57,6 +66,7 @@ impl Context { mmap: Some(mmap), buffer: None, can_offload: false, + offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -76,17 +86,13 @@ impl Context { mmap: None, buffer: None, can_offload: false, + offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } /// If offloading is enabled, all tensors created by this context will be offloaded to the GPU - pub fn enable_offloading(&mut self) { - self.can_offload = true; - } - - /// Disables the offloading of tensors to the GPU - pub fn disable_offloading(&mut self) { - self.can_offload = false; + pub fn set_offloading(&mut self, can_offload: bool) { + self.can_offload = can_offload; } /// Wraps a raw tensor with a weak pointer to the context. 
@@ -94,6 +100,7 @@ impl Context { let tensor = Tensor { ptr: NonNull::new(raw).expect("Should not be null"), ctx: Arc::downgrade(&self.ptr), + offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), }; if self.can_offload { @@ -495,6 +502,12 @@ impl Drop for Context { fn drop(&mut self) { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. unsafe { + // if we moved tensors to an acceleratoor we need to free them + for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { + if tensor.backend() != crate::Backend::Cpu { + crate::accelerator_free_tensor(&tensor); + } + } sys::ggml_free(self.ptr.as_ptr()); } } diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 3013254f..77cccb06 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -30,19 +30,19 @@ mod tests; pub mod metal; #[derive(Debug, Copy, Clone, PartialEq, Eq)] -///Accelerators supported by `ggml`. +/// Accelerators supported by `ggml`. pub enum Accelerator { - ///CuBLAS accelerated + /// CuBLAS accelerated CuBLAS, - ///CLBlast accelerated + /// CLBlast accelerated CLBlast, - ///Metal accelerated + /// Metal accelerated Metal, - ///Cpu accelerated + /// Cpu accelerated None, } -///Returns the accelerator `ggml` was compiled with. +/// Returns the accelerator `ggml` was compiled with. pub fn get_accelerator() -> Accelerator { #[cfg(feature = "cublas")] return Accelerator::CLBlast; @@ -55,14 +55,14 @@ pub fn get_accelerator() -> Accelerator { } #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] -///Backend to use for a tensor. +/// Backend to use for a tensor. pub enum Backend { /// CPU backend #[default] Cpu, /// GPU backend Gpu, - ///Multi-GPU backend + /// Multi-GPU backend GpuSplit, } @@ -184,6 +184,9 @@ pub const QNT_VERSION_FACTOR: u32 = sys::GGML_QNT_VERSION_FACTOR; /// The size of a `ggml` object. pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE; +/// The maximum length of a `ggml` tensor-name. +pub const MAX_NAME_LENGTH: u32 = sys::GGML_MAX_NAME; + #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] /// The type of a value in `ggml`. pub enum Type { @@ -537,7 +540,7 @@ pub fn cpu_has_gpublas() -> bool { } /// Sets the name of a tensor. -pub fn set_name(tensor: &Tensor, name: &str) { +pub fn set_tensor_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index c313eaf7..3ade3c91 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -1,4 +1,9 @@ -use std::{os::raw::c_void, ptr::NonNull, sync::Weak}; +use std::{ + collections::HashMap, + os::raw::c_void, + ptr::NonNull, + sync::{Mutex, Weak}, +}; use crate::{i64_to_usize, sys, Type}; @@ -7,6 +12,7 @@ use crate::{i64_to_usize, sys, Type}; pub struct Tensor { pub(crate) ptr: NonNull, pub(crate) ctx: Weak>, + pub(crate) offloaded_tensors: Weak>>, } impl Tensor { @@ -15,9 +21,18 @@ impl Tensor { /// Exposed for purposes of determining context size. pub const C_TYPE_SIZE: usize = std::mem::size_of::(); - ///Sets the name of the tensor - pub fn set_name(&mut self, name: &str) -> &Tensor { - assert!(name.len() <= 48, "Name is too long!"); + /// Sets the name of the tensor. + /// + /// # Safety + /// + /// The name must be a valid UTF-8 string and must not be longer than [crate::MAX_NAME_LENGTH] characters. 
+ pub fn set_name(mut self, name: &str) -> Tensor { + assert!( + name.len() <= crate::MAX_NAME_LENGTH.try_into().unwrap(), + "Name '{}' is too long, max length is {} characters", + name, + crate::MAX_NAME_LENGTH + ); let bytes = name.as_bytes(); let mut array = [0i8; 48]; @@ -27,29 +42,60 @@ impl Tensor { self } - ///Gets the name of the tensor - pub fn get_name(&self) -> String { + /// Gets the name of the tensor + pub fn name(&self) -> String { let name = unsafe { self.ptr.as_ref().name }; let mut name = name.iter().map(|&x| x as u8).collect::>(); name.retain(|&x| x != 0); String::from_utf8(name).unwrap() } - ///Sets the acceleration backend of the tensor + /// Sets the acceleration backend of the tensor. + /// + /// # Caution + /// + /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend. pub fn set_backend(&mut self, backend: crate::Backend) { unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) } } - ///Gets the acceleration backend of the tensor - pub fn get_backend(&self) -> crate::Backend { + /// Gets the acceleration backend of the tensor + pub fn backend(&self) -> crate::Backend { unsafe { crate::get_tensor_backend(self.ptr.as_ref()) } } + /// Sets the tensors acceleration backend and moves the tensors data to the new backend. + pub fn transfer_to(mut self, backend: crate::Backend) -> Result { + let current_backend = self.backend(); + self.set_backend(backend); + + if backend != crate::Backend::Cpu { + crate::accelerator_transform_tensor(&mut self); + if current_backend == crate::Backend::Cpu { + // tensor was moved from cpu to accelerator => We need to keep track of the data to free it later from the accelerator + self.with_alive_ctx_mut(|| { + if let Some(offloaded_tensors) = self.offloaded_tensors.upgrade() { + //TODO: Do we need to check if the tensor is already in the map? + offloaded_tensors + .lock() + .unwrap() + .insert(self.name(), self.share()); + } else { + panic!("Using a context after it was dropped!") + } + }) + } + } + + Ok(self) + } + /// Creates a shared copy of this tensor pointer. 
pub fn share(&self) -> Self { Tensor { ptr: self.ptr, ctx: Weak::clone(&self.ctx), + offloaded_tensors: Weak::clone(&self.offloaded_tensors), } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 2998f0e3..7a76a9aa 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -26,24 +26,6 @@ fn scratch_buffers() -> ScratchBuffers { ] } -fn kv_memory( - context: &Context, - config: &InferenceSessionConfig, - n_elements: usize, -) -> (Tensor, Tensor) { - let memory_k = context.new_tensor_1d(config.memory_k_type.into(), n_elements); - let memory_v = context.new_tensor_1d(config.memory_v_type.into(), n_elements); - ggml::set_name(&memory_k, "memory_k"); - ggml::set_name(&memory_v, "memory_v"); - - if config.use_gpu { - ggml::accelerator_offload_tensor_no_scratch(&memory_k); - ggml::accelerator_offload_tensor_no_scratch(&memory_v); - } - - (memory_k, memory_v) -} - /// Result of graph building pub struct GraphOutputs { /// The output containing the model's result @@ -238,7 +220,7 @@ impl InferenceSession { self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); let ctx0 = &mut self.ctx0; let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); - ggml::set_name(&embd, "embd"); + ggml::set_tensor_name(&embd, "embd"); let bc = BuildContext { ctx0: RefCell::new(ctx0), @@ -946,3 +928,24 @@ pub fn feed_prompt_callback<'a, E: std::error::Error + 'static>( None => Ok(InferenceFeedback::Continue), } } + +/// Create the memory K/V tensors for the inference-session. +fn kv_memory( + context: &Context, + config: &InferenceSessionConfig, + n_elements: usize, +) -> (Tensor, Tensor) { + let memory_k = context + .new_tensor_1d(config.memory_k_type.into(), n_elements) + .set_name("memory_k"); + let memory_v = context + .new_tensor_1d(config.memory_v_type.into(), n_elements) + .set_name("memory_v"); + + if config.use_gpu { + ggml::accelerator_offload_tensor_no_scratch(&memory_k); + ggml::accelerator_offload_tensor_no_scratch(&memory_v); + } + + (memory_k, memory_v) +} diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 16d16e12..bfd5b26c 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -396,8 +396,6 @@ impl LoadError { pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; - /// Gets a tensor from the loader and tries to offload it to the specified backend. - fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result; /// Finish loading the model, and extract all of the state from the loader. fn finish(self) -> (Context, HashMap); } @@ -672,15 +670,6 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } - fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result { - let mut tensor = self.load(name)?; - if backend != ggml::Backend::Cpu { - tensor.set_backend(backend); - crate::ggml::accelerator_transform_tensor(&mut tensor); - } - Ok(tensor) - } - fn finish(self) -> (Context, HashMap) { (self.context, self.loaded_tensors) } @@ -752,16 +741,15 @@ impl<'a> FileContext<'a> { } } - // The tensor name is truncated to 32 characters. - - let tensor_name = if name.len() > 32 { - &name[name.len() - 32..] + // The tensor name is truncated to it's maximum length. + let max_name_length: usize = ggml::MAX_NAME_LENGTH.try_into().unwrap(); + let tensor_name = if name.len() >= max_name_length { + &name[name.len() - max_name_length..] 
} else { name }; - tensor.set_name(tensor_name); - Ok(tensor) + Ok(tensor.set_name(tensor_name)) } } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index c8799cdc..75a070f1 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -204,7 +204,7 @@ pub struct ModelParameters { pub lora_adapters: Option>, /// Whether to use GPU acceleration when available pub use_gpu: bool, - /// The number of layers to offload to the gpu. If `None`, all layers will be offloaded. + /// If `use_gpu` is active this defines the number of layers to offload to the gpu. If `None`, all layers will be offloaded. pub gpu_layers: Option, } @@ -231,6 +231,15 @@ impl ModelParameters { true } } + + /// Returns the backend to use for the given layer. + pub fn backend(&self, layer: usize) -> ggml::Backend { + if self.should_offload(layer) { + ggml::Backend::Gpu + } else { + ggml::Backend::Cpu + } + } } /// Used in a call to [Model::evaluate] or [InferenceSession::infer] to request diff --git a/crates/llm/examples/drop_multiple_sessions.rs b/crates/llm/examples/drop_multiple_sessions.rs deleted file mode 100644 index 219e2a6d..00000000 --- a/crates/llm/examples/drop_multiple_sessions.rs +++ /dev/null @@ -1,57 +0,0 @@ -use llm::ModelArchitecture; -use llm_base::{InferenceFeedback, InferenceParameters, ModelParameters}; -use std::{convert::Infallible, path::PathBuf}; - -fn main() { - let prompt = "What is the meaning of life?"; - let model_path = PathBuf::from(r"C:\Users\lkreu\Downloads\orca-mini-v2_7b.ggmlv3.q5_K_M.bin"); - let now = std::time::Instant::now(); - - let model = llm::load_dynamic( - Some(ModelArchitecture::Llama), - &model_path, - llm_base::TokenizerSource::Embedded, - ModelParameters { - use_gpu: true, - ..Default::default() - }, - llm::load_progress_callback_stdout, - ) - .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); - - println!( - "Model fully loaded! Elapsed: {}ms", - now.elapsed().as_millis() - ); - - for i in 0..10 { - println!("Starting session {i}"); - let mut session = model.start_session(Default::default()); - session - .feed_prompt(model.as_ref(), prompt, &mut Default::default(), |_| { - Ok::(llm::InferenceFeedback::Continue) - }) - .unwrap(); - drop(session); - println!("Dropped session {i}"); - } - - drop(model); - - println!("Model dropped! Elapsed: {}ms", now.elapsed().as_millis()); - - for _ in 0..5 { - let model = llm::load_dynamic( - Some(ModelArchitecture::Llama), - &model_path, - llm_base::TokenizerSource::Embedded, - ModelParameters { - use_gpu: true, - ..Default::default() - }, - llm::load_progress_callback_stdout, - ) - .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); - drop(model); - } -} diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index f612d84f..4011bea3 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -1,10 +1,10 @@ //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. 
#![deny(missing_docs)] -use std::{collections::HashMap, error::Error, sync::Arc}; +use std::{error::Error, sync::Arc}; use llm_base::{ - ggml::{self, Backend}, + ggml::{self}, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, @@ -34,22 +34,11 @@ pub struct Llama { // must be kept alive for the model context: Arc, - loaded_tensors: HashMap, } unsafe impl Send for Llama {} unsafe impl Sync for Llama {} -impl Drop for Llama { - fn drop(&mut self) { - for (_, tensor) in self.loaded_tensors.drain() { - if tensor.get_backend() != Backend::Cpu { - ggml::accelerator_free_tensor(&tensor); - } - } - } -} - impl KnownModel for Llama { type Hyperparameters = Hyperparameters; @@ -64,39 +53,49 @@ impl KnownModel for Llama { // model-global weights let wte = tl.load("tok_embeddings.weight")?; - let backend = if params.should_offload(0) { - Backend::Gpu - } else { - Backend::Cpu - }; + let backend = params.backend(0); - let norm = tl.offload("norm.weight", backend)?; + let norm = tl.load("norm.weight")?.transfer_to(backend)?; - let output = tl.offload("output.weight", backend)?; + let output = tl.load("output.weight")?.transfer_to(backend)?; let mut layers = Vec::new(); for i in 0..hyperparameters.n_layer { - let backend = if params.should_offload(i) { - Backend::Gpu - } else { - Backend::Cpu - }; + let backend = params.backend(i); + let layer = Layer { attention_norm: tl - .offload(&format!("layers.{i}.attention_norm.weight"), backend)?, - wq: tl.offload(&format!("layers.{i}.attention.wq.weight"), backend)?, - wk: tl.offload(&format!("layers.{i}.attention.wk.weight"), backend)?, - wv: tl.offload(&format!("layers.{i}.attention.wv.weight"), backend)?, - wo: tl.offload(&format!("layers.{i}.attention.wo.weight"), backend)?, - ffn_norm: tl.offload(&format!("layers.{i}.ffn_norm.weight"), backend)?, - w1: tl.offload(&format!("layers.{i}.feed_forward.w1.weight"), backend)?, - w2: tl.offload(&format!("layers.{i}.feed_forward.w2.weight"), backend)?, - w3: tl.offload(&format!("layers.{i}.feed_forward.w3.weight"), backend)?, + .load(&format!("layers.{i}.attention_norm.weight"))? + .transfer_to(backend)?, + wq: tl + .load(&format!("layers.{i}.attention.wq.weight"))? + .transfer_to(backend)?, + wk: tl + .load(&format!("layers.{i}.attention.wk.weight"))? + .transfer_to(backend)?, + wv: tl + .load(&format!("layers.{i}.attention.wv.weight"))? + .transfer_to(backend)?, + wo: tl + .load(&format!("layers.{i}.attention.wo.weight"))? + .transfer_to(backend)?, + ffn_norm: tl + .load(&format!("layers.{i}.ffn_norm.weight"))? + .transfer_to(backend)?, + w1: tl + .load(&format!("layers.{i}.feed_forward.w1.weight"))? + .transfer_to(backend)?, + w2: tl + .load(&format!("layers.{i}.feed_forward.w2.weight"))? + .transfer_to(backend)?, + w3: tl + .load(&format!("layers.{i}.feed_forward.w3.weight"))? + .transfer_to(backend)?, }; layers.push(layer); } - let (context, loaded_tensors) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. 
} = params; @@ -110,7 +109,6 @@ impl KnownModel for Llama { output, layers, context: Arc::new(context), - loaded_tensors, }) } @@ -154,12 +152,7 @@ impl KnownModel for Llama { let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { - //TODO: find a better way to do this - if self.model_params.should_offload(il) { - ctx0.enable_offloading(); - } else { - ctx0.disable_offloading(); - } + ctx0.set_offloading(self.model_params.should_offload(il)); let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; @@ -185,7 +178,7 @@ impl KnownModel for Llama { n_rot, 0, ); - ggml::set_name(&q_current, "Qcur"); + ggml::set_tensor_name(&q_current, "Qcur"); let k_current = ctx0.op_rope_inplace( &ctx0.op_reshape_3d( &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), @@ -197,7 +190,7 @@ impl KnownModel for Llama { n_rot, 0, ); - ggml::set_name(&k_current, "Kcur"); + ggml::set_tensor_name(&k_current, "Kcur"); // store key and value to memory // compute the transposed [N, n_embd] V matrix @@ -226,7 +219,7 @@ impl KnownModel for Llama { gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); - ggml::set_name(&q, "Q"); + ggml::set_tensor_name(&q, "Q"); let k = ctx0.op_permute( &ctx0.op_reshape_3d( @@ -241,25 +234,25 @@ impl KnownModel for Llama { ), (0, 2, 1, 3), ); - ggml::set_name(&k, "K"); + ggml::set_tensor_name(&k, "K"); // K * Q let k_q = ctx0.op_mul_mat(&k, &q); - ggml::set_name(&k_q, "KQ"); + ggml::set_tensor_name(&k_q, "KQ"); // KQ_scaled = KQ / sqrt(n_embd/n_head) let kq_scale = ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())); - ggml::set_name(&kq_scale, "1/sqrt(n_embd/n_head)"); + ggml::set_tensor_name(&kq_scale, "1/sqrt(n_embd/n_head)"); let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale); - ggml::set_name(&k_q_scaled, "KQ_scaled"); + ggml::set_tensor_name(&k_q_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) let k_q_masked = ctx0.op_diag_mask_inf_inplace(&k_q_scaled, session_len); - ggml::set_name(&k_q_masked, "KQ_masked"); + ggml::set_tensor_name(&k_q_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) let k_q_soft_max = ctx0.op_soft_max_inplace(&k_q_masked); - ggml::set_name(&k_q_soft_max, "KQ_soft_max"); + ggml::set_tensor_name(&k_q_soft_max, "KQ_soft_max"); // split cached V into n_head heads let v = ctx0.op_view_3d( @@ -271,21 +264,21 @@ impl KnownModel for Llama { ), il * ctx_size * builder.memory_v.element_size() * n_embd, ); - ggml::set_name(&v, "V"); + ggml::set_tensor_name(&v, "V"); let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); - ggml::set_name(&k_q_v, "KQV"); + ggml::set_tensor_name(&k_q_v, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); - ggml::set_name(&k_q_v_merged, "KQV_merged"); + ggml::set_tensor_name(&k_q_v_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &k_q_v_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), ); - ggml::set_name(¤t, "KQV_merged_contiguous"); + ggml::set_tensor_name(¤t, "KQV_merged_contiguous"); // projection (no bias) current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); @@ -328,7 +321,7 @@ impl KnownModel for Llama { let embedding_result: ggml::Tensor = input_layer.share(); - ctx0.disable_offloading(); + ctx0.set_offloading(false); // lm_head input_layer = ctx0.op_mul_mat(&self.output, &input_layer); From e09b937bccfeaf77986469ec06334f8242346e01 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 15 
Jul 2023 20:36:28 +0200
Subject: [PATCH 21/28] Typos and small review changes

---
 crates/ggml/src/context.rs               |  2 +-
 crates/ggml/src/tensor.rs                | 18 ++++++++++++------
 crates/llm-base/src/inference_session.rs |  3 ++-
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 4a37f5d6..5bbea9a1 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -502,7 +502,7 @@ impl Drop for Context {
     fn drop(&mut self) {
         // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call.
         unsafe {
-            // if we moved tensors to an acceleratoor we need to free them
+            // if we moved tensors to an accelerator we need to free them
             for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() {
                 if tensor.backend() != crate::Backend::Cpu {
                     crate::accelerator_free_tensor(&tensor);
diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs
index 3ade3c91..cb8bb982 100644
--- a/crates/ggml/src/tensor.rs
+++ b/crates/ggml/src/tensor.rs
@@ -7,6 +7,8 @@ use std::{
 
 use crate::{i64_to_usize, sys, Type};
 
+const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize;
+
 /// Tensors are owned by the context. A tensor is alive as long as the
 /// underlying context it was created with is alive.
 pub struct Tensor {
@@ -25,17 +27,17 @@ impl Tensor {
     ///
     /// # Safety
     ///
-    /// The name must be a valid UTF-8 string and must not be longer than [crate::MAX_NAME_LENGTH] characters.
+    /// The name must be a valid UTF-8 string and must not be longer than `MAX_NAME_LENGTH` characters.
     pub fn set_name(mut self, name: &str) -> Tensor {
         assert!(
-            name.len() <= crate::MAX_NAME_LENGTH.try_into().unwrap(),
+            name.len() <= MAX_NAME_LENGTH,
             "Name '{}' is too long, max length is {} characters",
             name,
-            crate::MAX_NAME_LENGTH
+            MAX_NAME_LENGTH
         );
 
         let bytes = name.as_bytes();
-        let mut array = [0i8; 48];
+        let mut array = [0i8; MAX_NAME_LENGTH];
         array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::>());
 
         unsafe { self.ptr.as_mut().name = array }
@@ -55,7 +57,7 @@ impl Tensor {
     /// # Caution
     ///
    /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend.
-    pub fn set_backend(&mut self, backend: crate::Backend) {
+    pub(crate) fn set_backend(&mut self, backend: crate::Backend) {
         unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) }
     }
 
@@ -64,9 +66,13 @@ impl Tensor {
         unsafe { crate::get_tensor_backend(self.ptr.as_ref()) }
     }
 
-    /// Sets the tensors acceleration backend and moves the tensors data to the new backend.
+    /// Sets the tensor's acceleration backend and moves the tensors data to the new backend.
pub fn transfer_to(mut self, backend: crate::Backend) -> Result { let current_backend = self.backend(); + + if current_backend != crate::Backend::Cpu && backend == crate::Backend::Cpu { + panic!("Currently there is no way to move data from an accelerator to the cpu") + } self.set_backend(backend); if backend != crate::Backend::Cpu { diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 7a76a9aa..c8bb2ade 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -136,7 +136,6 @@ impl InferenceSession { ctx_size }; - //TODO: check if this is needed and the right place to put it if config.use_gpu { ggml::accelerator_initialize(0); ggml::accelerator_set_scratch_size(config.n_batch * 1024 * 1024); @@ -943,6 +942,8 @@ fn kv_memory( .set_name("memory_v"); if config.use_gpu { + // CUDA requires the K/V-Memory to be on the GPU but excluded from the scratch buffer. + // For OpenCL this is a no-op. ggml::accelerator_offload_tensor_no_scratch(&memory_k); ggml::accelerator_offload_tensor_no_scratch(&memory_v); } From 67bbaf9d578da2994b83704bf2e1d75719574eb9 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sat, 15 Jul 2023 22:13:39 +0200 Subject: [PATCH 22/28] refactor: remove unused tensorloader tensors --- crates/llm-base/src/loader.rs | 8 ++++---- crates/models/bloom/src/lib.rs | 2 +- crates/models/falcon/src/lib.rs | 2 +- crates/models/gpt2/src/lib.rs | 2 +- crates/models/gptj/src/lib.rs | 2 +- crates/models/gptneox/src/lib.rs | 2 +- crates/models/llama/src/lib.rs | 2 +- crates/models/mpt/src/lib.rs | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index bfd5b26c..8c5d2653 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -396,8 +396,8 @@ impl LoadError { pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; - /// Finish loading the model, and extract all of the state from the loader. - fn finish(self) -> (Context, HashMap); + /// Finish loading the model, returning the context. + fn finish(self) -> Context; } /// Load a GGML model from the `path` and configure it per the `params`. The status @@ -670,8 +670,8 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } - fn finish(self) -> (Context, HashMap) { - (self.context, self.loaded_tensors) + fn finish(self) -> Context { + self.context } } diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index 3f29c1ed..78c9cec6 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -89,7 +89,7 @@ impl KnownModel for Bloom { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 15a77527..02fcd933 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -81,7 +81,7 @@ impl KnownModel for Falcon { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. 
} = params; diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index e19264b1..534f46e7 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -86,7 +86,7 @@ impl KnownModel for Gpt2 { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 85864fce..487123e0 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -81,7 +81,7 @@ impl KnownModel for GptJ { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 12fb42a0..83187273 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -95,7 +95,7 @@ impl KnownModel for GptNeoX { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 4011bea3..b99de13f 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -95,7 +95,7 @@ impl KnownModel for Llama { }; layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index fa66eac2..107c47aa 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -70,7 +70,7 @@ impl KnownModel for Mpt { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; From e7ac55b09a32272f37b42f1aaa0b43ff341d99aa Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 01:25:49 +0200 Subject: [PATCH 23/28] refactor(ggml): accelerator/tensor/context --- crates/ggml/src/{ => accelerator}/metal.rs | 0 crates/ggml/src/accelerator/mod.rs | 94 +++++++++++ crates/ggml/src/context.rs | 101 +++++++----- crates/ggml/src/lib.rs | 157 +----------------- crates/ggml/src/tensor.rs | 180 ++++++++++++++------- crates/llm-base/src/inference_session.rs | 22 ++- crates/llm-base/src/model/mod.rs | 17 +- crates/models/llama/src/lib.rs | 23 ++- 8 files changed, 308 insertions(+), 286 deletions(-) rename crates/ggml/src/{ => accelerator}/metal.rs (100%) create mode 100644 crates/ggml/src/accelerator/mod.rs diff --git a/crates/ggml/src/metal.rs b/crates/ggml/src/accelerator/metal.rs similarity index 100% rename from crates/ggml/src/metal.rs rename to crates/ggml/src/accelerator/metal.rs diff --git a/crates/ggml/src/accelerator/mod.rs b/crates/ggml/src/accelerator/mod.rs new file mode 100644 index 00000000..a4132583 --- /dev/null +++ b/crates/ggml/src/accelerator/mod.rs @@ -0,0 +1,94 @@ +//! Functionality related to hardware acceleration of GGML (GPU, etc.) +use crate::sys; + +#[cfg(feature = "metal")] +pub mod metal; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +/// Accelerators supported by `ggml`. +pub enum Accelerator { + /// CuBLAS accelerated + CuBLAS, + /// CLBlast accelerated + CLBlast, + /// Metal accelerated + Metal, + /// Cpu accelerated + None, +} + +/// Returns the accelerator `ggml` was compiled with. 
+pub fn get_accelerator() -> Accelerator { + #[cfg(feature = "cublas")] + return Accelerator::CLBlast; + #[cfg(feature = "clblast")] + return Accelerator::CuBLAS; + #[cfg(feature = "metal")] + return Accelerator::Metal; + #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))] + return Accelerator::None; +} + +#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] +/// Backend to use for a tensor. +pub enum Backend { + /// CPU backend + #[default] + Cpu, + /// GPU backend + Gpu, + /// Multi-GPU backend + GpuSplit, +} + +impl From for sys::ggml_backend { + fn from(b: Backend) -> Self { + match b { + Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU, + Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU, + Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT, + } + } +} + +impl TryFrom for Backend { + type Error = (); + fn try_from(b: sys::ggml_backend) -> Result { + match b { + sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu), + sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu), + sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit), + _ => Err(()), + } + } +} + +/// Initialize the accelerator. If ggml-sys is compiled with CUDA or CLBlast support, this function will initialize the accelerator. If not this is a no-op. +#[allow(unused_variables)] +pub fn initialize(device: i32) { + #[cfg(feature = "cublas")] + unsafe { + //TODO: Make this configurable + sys::cuda::ggml_init_cublas(); + sys::cuda::ggml_cuda_set_main_device(device); + let split = 1.0f32; + sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); + } +} + +/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op. +#[allow(unused_variables)] +pub fn set_scratch_size(size: usize) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_set_scratch_size(size); + } +} + +/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op. +pub fn free_scratch() { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_scratch(); + } +} diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 5bbea9a1..616edb32 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -7,7 +7,7 @@ use std::{ use memmap2::Mmap; -use crate::{sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type}; +use crate::{accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type}; /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via `ggml_free`. @@ -27,7 +27,18 @@ pub struct Context { /// Whether the context can offload tensors to the GPU pub can_offload: bool, - /// Offloaded tensors + /// Offloaded tensors. Used to free them when the context is dropped. + // TODO: revisit this. What it means for a tensor to be "offloaded", + // "transferred", etc. is not clear. This map is necessary because + // there is no obvious heuristic for whether a given `Tensor` + // should have the accelerator free method called on it. + // + // This is because tensors can be present on the accelerator without + // having data (i.e. compute nodes), or they can refer to the scratch buffers. + // Freeing these offloaded-but-not-allocated tensors will lead to crashes. + // + // Hopefully, this is resolved by GGML redesigning both its accelerator + // interface and its scratch buffer solution. 
offloaded_tensors: Arc>>, } @@ -95,18 +106,31 @@ impl Context { self.can_offload = can_offload; } - /// Wraps a raw tensor with a weak pointer to the context. - fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { - let tensor = Tensor { - ptr: NonNull::new(raw).expect("Should not be null"), - ctx: Arc::downgrade(&self.ptr), - offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), - }; + /// Retrieves the memory used by this [Context]. + pub fn used_mem(&self) -> usize { + unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } + } - if self.can_offload { - crate::accelerator_offload_tensor(&tensor); + /// Sets the scratch buffer to be used by this [Context]. + /// + /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. + pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) { + let (size, data) = if let Some(buffer) = scratch_buffer { + (buffer.size(), buffer.data) + } else { + (0, std::ptr::null_mut()) + }; + // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API + unsafe { + sys::ggml_set_scratch( + self.ptr.as_ptr(), + sys::ggml_scratch { + offs: 0, + size, + data, + }, + ); } - tensor } /// Creates a new 1D tensor. @@ -148,7 +172,9 @@ impl Context { let raw = unsafe { sys::ggml_new_f32(self.ptr.as_ptr(), x) }; self.new_tensor_raw(raw) } - +} +// Operations +impl Context { /// Unknown, aside from the obvious. It's transposing something! pub fn op_transpose(&self, a: &Tensor) -> Tensor { let tensor = unsafe { sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; @@ -449,33 +475,6 @@ impl Context { self.new_tensor_raw(tensor) } - /// Retrieves the memory used by this [Context]. - pub fn used_mem(&self) -> usize { - unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } - } - - /// Sets the scratch buffer to be used by this [Context]. - /// - /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. - pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) { - let (size, data) = if let Some(buffer) = scratch_buffer { - (buffer.size(), buffer.data) - } else { - (0, std::ptr::null_mut()) - }; - // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API - unsafe { - sys::ggml_set_scratch( - self.ptr.as_ptr(), - sys::ggml_scratch { - offs: 0, - size, - data, - }, - ); - } - } - /// Attention with LInear BIases (Ref: ) pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize, bias_max: f32) -> Tensor { let tensor = unsafe { @@ -497,15 +496,31 @@ impl Context { self.new_tensor_raw(tensor) } } +// Private methods +impl Context { + /// Wraps a raw tensor with a weak pointer to the context. + fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { + let tensor = Tensor { + ptr: NonNull::new(raw).expect("Should not be null"), + ctx: Arc::downgrade(&self.ptr), + offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), + }; + + if self.can_offload { + tensor.offload(); + } + tensor + } +} impl Drop for Context { fn drop(&mut self) { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. 
unsafe { // if we moved tensors to an accelerator we need to free them - for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { - if tensor.backend() != crate::Backend::Cpu { - crate::accelerator_free_tensor(&tensor); + for (_, mut tensor) in self.offloaded_tensors.lock().unwrap().drain() { + if tensor.backend() != Backend::Cpu { + tensor.free_accelerator(); } } sys::ggml_free(self.ptr.as_ptr()); diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 77cccb06..168a292a 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -18,6 +18,8 @@ mod tensor; pub mod format; pub mod util; +pub mod accelerator; + pub use context::Context; pub use tensor::Tensor; @@ -26,68 +28,6 @@ pub use ggml_sys as sys; #[cfg(test)] mod tests; -#[cfg(feature = "metal")] -pub mod metal; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -/// Accelerators supported by `ggml`. -pub enum Accelerator { - /// CuBLAS accelerated - CuBLAS, - /// CLBlast accelerated - CLBlast, - /// Metal accelerated - Metal, - /// Cpu accelerated - None, -} - -/// Returns the accelerator `ggml` was compiled with. -pub fn get_accelerator() -> Accelerator { - #[cfg(feature = "cublas")] - return Accelerator::CLBlast; - #[cfg(feature = "clblast")] - return Accelerator::CuBLAS; - #[cfg(feature = "metal")] - return Accelerator::Metal; - #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))] - return Accelerator::None; -} - -#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] -/// Backend to use for a tensor. -pub enum Backend { - /// CPU backend - #[default] - Cpu, - /// GPU backend - Gpu, - /// Multi-GPU backend - GpuSplit, -} - -impl From for sys::ggml_backend { - fn from(b: Backend) -> Self { - match b { - Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU, - Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU, - Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT, - } - } -} - -impl TryFrom for Backend { - type Error = (); - fn try_from(b: sys::ggml_backend) -> Result { - match b { - sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu), - sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu), - sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit), - _ => Err(()), - } - } -} - /// The type of a tensor element. pub type ElementType = Type; @@ -544,96 +484,3 @@ pub fn set_tensor_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; } - -/// Gets the acceleration backend of a tensor. -pub fn get_tensor_backend(tensor: &sys::ggml_tensor) -> Backend { - (tensor.backend as sys::ggml_backend).try_into().unwrap() -} - -/// Sets the acceleration backend of a tensor. -/// # Safety -/// This function assumes that the tensor is valid. -pub unsafe fn set_tensor_backend(tensor: *mut sys::ggml_tensor, backend: Backend) { - unsafe { - (*tensor).backend = backend.try_into().unwrap(); - } -} - -/// If ggml-sys is compiled with CUDA or ClBlast support, this function will tranform and offload the tensor. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_transform_tensor(tensor: &mut Tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); - } - #[cfg(feature = "clblast")] - unsafe { - sys::opencl::ggml_cl_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); - } -} - -/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. 
-pub fn accelerator_offload_tensor(tensor: &Tensor) { - accelerator_offload_raw_tensor(tensor.ptr.as_ptr()); -} - -/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_offload_raw_tensor(tensor: *mut sys::ggml_tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_assign_buffers(tensor); - } -} - -/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_offload_tensor_no_scratch(tensor: &Tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_assign_buffers_no_scratch(tensor.ptr.as_ptr()); - } -} - -/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_set_scratch_size(size: usize) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_set_scratch_size(size); - } -} - -///Initialize the accelerator. If ggml-sys is compiled with CUDA or ClBlast support, this function will initialize the accelerator. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_initialize(device: i32) { - #[cfg(feature = "cublas")] - unsafe { - //TODO: Make this configurable - sys::cuda::ggml_init_cublas(); - sys::cuda::ggml_cuda_set_main_device(device); - let split = 1.0f32; - sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); - } -} - -/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op. -pub fn accelerator_free_scratch() { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_free_scratch(); - } -} - -/// Frees the memory of a tensor. If ggml-sys is compiled with CUDA or ClBlast support, this function will free the memory of a tensor. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_free_tensor(tensor: &Tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_free_data(tensor.ptr.as_ptr()); - } - #[cfg(feature = "clblast")] - unsafe { - sys::opencl::ggml_cl_free_data(tensor.ptr.as_ptr()); - } -} diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index cb8bb982..ee5370b7 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -5,7 +5,7 @@ use std::{ sync::{Mutex, Weak}, }; -use crate::{i64_to_usize, sys, Type}; +use crate::{accelerator::Backend, i64_to_usize, sys, Type}; const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; @@ -40,60 +40,87 @@ impl Tensor { let mut array = [0i8; MAX_NAME_LENGTH]; array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::>()); - unsafe { self.ptr.as_mut().name = array } + self.with_alive_ctx_mut(|t| unsafe { t.ptr.as_mut().name = array }); self } /// Gets the name of the tensor pub fn name(&self) -> String { - let name = unsafe { self.ptr.as_ref().name }; - let mut name = name.iter().map(|&x| x as u8).collect::>(); - name.retain(|&x| x != 0); - String::from_utf8(name).unwrap() - } - - /// Sets the acceleration backend of the tensor. - /// - /// # Caution - /// - /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend. 
- pub(crate) fn set_backend(&mut self, backend: crate::Backend) { - unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) } + self.with_alive_ctx(|| { + let name = unsafe { self.ptr.as_ref().name }; + let mut name = name.iter().map(|&x| x as u8).collect::>(); + name.retain(|&x| x != 0); + String::from_utf8(name).unwrap() + }) } /// Gets the acceleration backend of the tensor - pub fn backend(&self) -> crate::Backend { - unsafe { crate::get_tensor_backend(self.ptr.as_ref()) } + pub fn backend(&self) -> Backend { + self.with_alive_ctx(|| unsafe { + (self.ptr.as_ref().backend as sys::ggml_backend) + .try_into() + .unwrap() + }) } - /// Sets the tensor's acceleration backend and moves the tensors data to the new backend. - pub fn transfer_to(mut self, backend: crate::Backend) -> Result { - let current_backend = self.backend(); + /// Sets the tensor's acceleration backend and moves the tensor's data to the new backend. + pub fn transfer_to(mut self, backend: Backend) -> Tensor { + self.with_alive_ctx_mut(|t| { + let current_backend = t.backend(); - if current_backend != crate::Backend::Cpu && backend == crate::Backend::Cpu { - panic!("Currently there is no way to move data from an accelerator to the cpu") - } - self.set_backend(backend); - - if backend != crate::Backend::Cpu { - crate::accelerator_transform_tensor(&mut self); - if current_backend == crate::Backend::Cpu { - // tensor was moved from cpu to accelerator => We need to keep track of the data to free it later from the accelerator - self.with_alive_ctx_mut(|| { - if let Some(offloaded_tensors) = self.offloaded_tensors.upgrade() { - //TODO: Do we need to check if the tensor is already in the map? - offloaded_tensors - .lock() - .unwrap() - .insert(self.name(), self.share()); - } else { - panic!("Using a context after it was dropped!") - } - }) + if current_backend != Backend::Cpu && backend == Backend::Cpu { + unimplemented!("Tensors cannot be moved from an accelerator to the CPU at present"); } - } + t.set_backend(backend); + if backend == Backend::Cpu { + return; + } + + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_transform_tensor(t.data(), t.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_transform_tensor(t.data(), t.ptr.as_ptr()); + } + + t.offloaded_tensors + .upgrade() + .expect("Attempted to update a dropped context's offloaded tensors") + .lock() + .unwrap() + .insert(t.name(), t.share()); + }); + self + } - Ok(self) + /// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. + /// If not, this is a no-op. + /// + /// It will not transfer the data. Use `transfer_to` for that. + #[allow(unused_variables)] + pub fn offload(&self) { + self.with_alive_ctx(|| { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers(self.ptr.as_ptr()); + } + }) + } + + /// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU without using the scratch buffer. + /// If not, this is a no-op. + /// + /// It will not transfer the data. Use `transfer_to` for that. + #[allow(unused_variables)] + pub fn offload_no_scratch(&self) { + self.with_alive_ctx(|| { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers_no_scratch(self.ptr.as_ptr()); + } + }) } /// Creates a shared copy of this tensor pointer. 
@@ -105,22 +132,6 @@ impl Tensor { } } - fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } - } - - fn with_alive_ctx_mut(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } - } - /// Number of bytes used by this tensor. pub fn nbytes(&self) -> usize { self.with_alive_ctx(|| { @@ -147,8 +158,8 @@ impl Tensor { /// /// The memory region from `data_ptr` to `data_ptr.offset(tensor.nbytes())` will be read from. pub unsafe fn set_data(&mut self, data_ptr: *mut c_void) { - let tensor = self.ptr.as_mut(); - self.with_alive_ctx_mut(|| { + self.with_alive_ctx_mut(|t| { + let tensor = t.ptr.as_mut(); // SAFETY: The with_alive_call guarantees the context is alive tensor.data = data_ptr; }) @@ -206,4 +217,53 @@ impl Tensor { let data = unsafe { sys::ggml_get_data(self.ptr.as_ptr()).add(offset) }; std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } + + /// Frees the memory of a tensor on an accelerator if ggml-sys is compiled with CUDA or CLBlast support. + /// If not, this is a no-op. + /// + /// This is temporary while GGML improves their context memory management. This should only be called by + /// `Context` when it is dropped, as well as `llm`'s `InferenceSession`. + /// + /// # Safety + /// + /// This must be the last thing you do with this tensor. The only reason it's not `self` is because `Drop` + /// isn't `self`. + pub unsafe fn free_accelerator(&mut self) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_free_data(self.ptr.as_ptr()); + } + } +} +impl Tensor { + fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { + if let Some(_ctx) = self.ctx.upgrade() { + f() + } else { + panic!("Using a tensor after the context was dropped") + } + } + + fn with_alive_ctx_mut(&mut self, mut f: impl FnMut(&mut Tensor) -> U) -> U { + if let Some(_ctx) = self.ctx.upgrade() { + f(self) + } else { + panic!("Using a tensor after the context was dropped") + } + } + + /// Sets the acceleration backend of the tensor. + /// + /// # Caution + /// + /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend. 
+ fn set_backend(&mut self, backend: Backend) { + unsafe { + self.ptr.as_mut().backend = backend.try_into().unwrap(); + } + } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index d1bae54f..d1c03ba2 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -4,7 +4,7 @@ use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; #[cfg(feature = "metal")] -use ggml::metal::MetalContext; +use ggml::accelerator::metal::MetalContext; use crate::{ mulf, util, InferenceParameters, Model, OutputRequest, Prompt, TokenId, TokenUtf8Buffer, @@ -137,8 +137,8 @@ impl InferenceSession { }; if config.use_gpu { - ggml::accelerator_initialize(0); - ggml::accelerator_set_scratch_size(config.n_batch * 1024 * 1024); + ggml::accelerator::initialize(0); + ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024); } let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); @@ -633,9 +633,11 @@ impl InferenceSession { impl Drop for InferenceSession { fn drop(&mut self) { //if we are using an accelerator, we need to free the scratch memory and the k/v memory - ggml::accelerator_free_scratch(); - ggml::accelerator_free_tensor(&self.memory_k); - ggml::accelerator_free_tensor(&self.memory_v); + ggml::accelerator::free_scratch(); + unsafe { + self.memory_k.free_accelerator(); + self.memory_v.free_accelerator(); + } } } @@ -983,8 +985,12 @@ fn kv_memory( if config.use_gpu { // CUDA requires the K/V-Memory to be on the GPU but excluded from the scratch buffer. // For OpenCL this is a no-op. - ggml::accelerator_offload_tensor_no_scratch(&memory_k); - ggml::accelerator_offload_tensor_no_scratch(&memory_v); + // + // Note that these must be manually freed from the accelerator in the `InferenceSession` + // destructor. This is because `offload_no_scratch` does not update the `offloaded_tensors` + // map, because reasons. + memory_k.offload_no_scratch(); + memory_v.offload_no_scratch(); } (memory_k, memory_v) diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 75a070f1..2fffbace 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -7,6 +7,7 @@ use std::{ path::{Path, PathBuf}, }; +use ggml::accelerator::Backend; use regex::Regex; use thiserror::Error; @@ -224,20 +225,20 @@ impl ModelParameters { /// Returns true if the model should offload the given layer to the accelerator. pub fn should_offload(&self, layer: usize) -> bool { if !self.use_gpu { - false - } else if let Some(offloadable_layers) = self.gpu_layers { - layer < offloadable_layers - } else { - true + return false; } + + self.gpu_layers + .map(|gpu_layers| layer < gpu_layers) + .unwrap_or(true) } /// Returns the backend to use for the given layer. 
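For illustration, the offload decision above can be restated as a stand-alone function; `layer_backend` is a hypothetical helper, the real logic lives on `ModelParameters`:

```rust
use ggml::accelerator::Backend;

// Stand-alone restatement of the selection logic, for illustration only.
fn layer_backend(use_gpu: bool, gpu_layers: Option<usize>, layer: usize) -> Backend {
    let offload = use_gpu && gpu_layers.map(|n| layer < n).unwrap_or(true);
    if offload {
        Backend::Gpu
    } else {
        Backend::Cpu
    }
}
// With `gpu_layers = Some(20)`, layers 0..20 are offloaded and the rest stay
// on the CPU; with `gpu_layers = None`, every layer is offloaded.
```
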
- pub fn backend(&self, layer: usize) -> ggml::Backend { + pub fn backend(&self, layer: usize) -> Backend { if self.should_offload(layer) { - ggml::Backend::Gpu + Backend::Gpu } else { - ggml::Backend::Cpu + Backend::Cpu } } } diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index b99de13f..0c3623c8 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -55,9 +55,8 @@ impl KnownModel for Llama { let backend = params.backend(0); - let norm = tl.load("norm.weight")?.transfer_to(backend)?; - - let output = tl.load("output.weight")?.transfer_to(backend)?; + let norm = tl.load("norm.weight")?.transfer_to(backend); + let output = tl.load("output.weight")?.transfer_to(backend); let mut layers = Vec::new(); @@ -67,31 +66,31 @@ impl KnownModel for Llama { let layer = Layer { attention_norm: tl .load(&format!("layers.{i}.attention_norm.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wq: tl .load(&format!("layers.{i}.attention.wq.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wk: tl .load(&format!("layers.{i}.attention.wk.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wv: tl .load(&format!("layers.{i}.attention.wv.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wo: tl .load(&format!("layers.{i}.attention.wo.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), ffn_norm: tl .load(&format!("layers.{i}.ffn_norm.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), w1: tl .load(&format!("layers.{i}.feed_forward.w1.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), w2: tl .load(&format!("layers.{i}.feed_forward.w2.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), w3: tl .load(&format!("layers.{i}.feed_forward.w3.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), }; layers.push(layer); } From c74e159316e226f79454f7cf2e694765cf953008 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 01:33:49 +0200 Subject: [PATCH 24/28] refactor(ggml): offload_no_scratch auto-free --- crates/ggml/src/context.rs | 2 +- crates/ggml/src/tensor.rs | 31 ++++++++++++++---------- crates/llm-base/src/inference_session.rs | 7 ++---- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 616edb32..04d80a4b 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -518,7 +518,7 @@ impl Drop for Context { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. unsafe { // if we moved tensors to an accelerator we need to free them - for (_, mut tensor) in self.offloaded_tensors.lock().unwrap().drain() { + for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { if tensor.backend() != Backend::Cpu { tensor.free_accelerator(); } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index ee5370b7..49ec741b 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -85,12 +85,7 @@ impl Tensor { sys::opencl::ggml_cl_transform_tensor(t.data(), t.ptr.as_ptr()); } - t.offloaded_tensors - .upgrade() - .expect("Attempted to update a dropped context's offloaded tensors") - .lock() - .unwrap() - .insert(t.name(), t.share()); + t.mark_as_offloaded(); }); self } @@ -113,6 +108,9 @@ impl Tensor { /// If not, this is a no-op. /// /// It will not transfer the data. Use `transfer_to` for that. + /// + /// Unlike `offload`, this function will add the tensor to the offloaded tensors map. 
This is because the non-use of a scratch buffer + /// allows us to safely assume that this tensor will actually point to data. #[allow(unused_variables)] pub fn offload_no_scratch(&self) { self.with_alive_ctx(|| { @@ -120,6 +118,7 @@ impl Tensor { unsafe { sys::cuda::ggml_cuda_assign_buffers_no_scratch(self.ptr.as_ptr()); } + self.mark_as_offloaded(); }) } @@ -222,13 +221,9 @@ impl Tensor { /// If not, this is a no-op. /// /// This is temporary while GGML improves their context memory management. This should only be called by - /// `Context` when it is dropped, as well as `llm`'s `InferenceSession`. - /// - /// # Safety - /// - /// This must be the last thing you do with this tensor. The only reason it's not `self` is because `Drop` - /// isn't `self`. - pub unsafe fn free_accelerator(&mut self) { + /// `Context` when it is dropped. + pub(crate) fn free_accelerator(self) { + println!("Freeing tensor {}", self.name()); #[cfg(feature = "cublas")] unsafe { sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr()); @@ -266,4 +261,14 @@ impl Tensor { self.ptr.as_mut().backend = backend.try_into().unwrap(); } } + + /// Adds this tensor to the context's list of offloaded tensors, so that it will be automatically freed. + fn mark_as_offloaded(&self) { + self.offloaded_tensors + .upgrade() + .expect("Attempted to update a dropped context's offloaded tensors") + .lock() + .unwrap() + .insert(self.name(), self.share()); + } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index d1c03ba2..9b45687c 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -632,12 +632,9 @@ impl InferenceSession { impl Drop for InferenceSession { fn drop(&mut self) { - //if we are using an accelerator, we need to free the scratch memory and the k/v memory + // If we are using an accelerator, we need to free the scratch memory. + // The k/v memory is freed by the ctx0 destructor. ggml::accelerator::free_scratch(); - unsafe { - self.memory_k.free_accelerator(); - self.memory_v.free_accelerator(); - } } } From 55b2dc34276733a994bf495f507d5d5e54bfbf96 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 01:48:26 +0200 Subject: [PATCH 25/28] refactor(ggml): use ContextInner for shared state --- crates/ggml/src/context.rs | 121 ++++++++++++++++++++----------------- crates/ggml/src/tensor.rs | 39 +++++------- 2 files changed, 80 insertions(+), 80 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 04d80a4b..c367c83c 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -16,7 +16,7 @@ pub struct Context { /// allocated tensors. Tensors are owned by the object, so a [`Tensor`] /// contains a `Weak` reference underneath and doesn't let you do anything /// with it if the underlying context has been deallocated. - pub ptr: Arc>, + inner: Arc, /// Memory mapping information pub mmap: Option, @@ -26,6 +26,11 @@ pub struct Context { /// Whether the context can offload tensors to the GPU pub can_offload: bool, +} + +/// Contains state shared between a context and its tensors +pub(crate) struct ContextInner { + pub ptr: NonNull, /// Offloaded tensors. Used to free them when the context is dropped. // TODO: revisit this. What it means for a tensor to be "offloaded", @@ -39,7 +44,16 @@ pub struct Context { // // Hopefully, this is resolved by GGML redesigning both its accelerator // interface and its scratch buffer solution. 
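The ownership shape this refactor sets up is: `Context` holds an `Arc<ContextInner>`, each `Tensor` holds a `Weak` reference to the same inner state, and tensor methods upgrade that weak pointer before touching the raw ggml data. A simplified sketch with placeholder types (the real `ContextInner` also carries the raw context pointer and the offloaded-tensor map):

```rust
use std::sync::{Arc, Weak};

struct Inner; // stand-in for the shared context state

struct Ctx {
    inner: Arc<Inner>,
}

struct Tens {
    inner: Weak<Inner>,
}

impl Tens {
    fn with_alive_ctx<U>(&self, f: impl FnOnce() -> U) -> U {
        // Upgrading fails once the owning context has been dropped, which is
        // exactly the misuse the panic message guards against.
        let _ctx = self
            .inner
            .upgrade()
            .expect("Using a tensor after the context was dropped");
        f()
    }
}
```
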
- offloaded_tensors: Arc>>, + pub offloaded_tensors: Mutex>, +} + +impl ContextInner { + pub(crate) fn new(ptr: *mut ggml_sys::ggml_context) -> Arc { + Arc::new(Self { + ptr: NonNull::new(ptr).expect("Should not be null"), + offloaded_tensors: Default::default(), + }) + } } impl Context { @@ -54,11 +68,10 @@ impl Context { }; Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + inner: ContextInner::new(raw), mmap: None, buffer: Some(buffer), can_offload: false, - offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -73,11 +86,10 @@ impl Context { }; Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + inner: ContextInner::new(raw), mmap: Some(mmap), buffer: None, can_offload: false, - offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -93,11 +105,10 @@ impl Context { }; Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + inner: ContextInner::new(raw), mmap: None, buffer: None, can_offload: false, - offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -108,7 +119,7 @@ impl Context { /// Retrieves the memory used by this [Context]. pub fn used_mem(&self) -> usize { - unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } + unsafe { sys::ggml_used_mem(self.as_ptr()) } } /// Sets the scratch buffer to be used by this [Context]. @@ -123,7 +134,7 @@ impl Context { // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API unsafe { sys::ggml_set_scratch( - self.ptr.as_ptr(), + self.as_ptr(), sys::ggml_scratch { offs: 0, size, @@ -135,8 +146,7 @@ impl Context { /// Creates a new 1D tensor. pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor { - let raw = - unsafe { sys::ggml_new_tensor_1d(self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0)) }; + let raw = unsafe { sys::ggml_new_tensor_1d(self.as_ptr(), typ.into(), usize_to_i64(ne0)) }; self.new_tensor_raw(raw) } @@ -144,7 +154,7 @@ impl Context { pub fn new_tensor_2d(&self, typ: Type, ne0: usize, ne1: usize) -> Tensor { let raw = unsafe { sys::ggml_new_tensor_2d( - self.ptr.as_ptr(), + self.as_ptr(), typ.into(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -157,7 +167,7 @@ impl Context { pub fn new_tensor_3d(&self, typ: Type, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let raw = unsafe { sys::ggml_new_tensor_3d( - self.ptr.as_ptr(), + self.as_ptr(), typ.into(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -169,7 +179,7 @@ impl Context { /// Creates a new 1D tensor with the specified value. pub fn new_f32(&self, x: f32) -> Tensor { - let raw = unsafe { sys::ggml_new_f32(self.ptr.as_ptr(), x) }; + let raw = unsafe { sys::ggml_new_f32(self.as_ptr(), x) }; self.new_tensor_raw(raw) } } @@ -177,38 +187,37 @@ impl Context { impl Context { /// Unknown, aside from the obvious. It's transposing something! pub fn op_transpose(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_transpose(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Unknown. pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { sys::ggml_get_rows(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_get_rows(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the values of `a`, but normalized. 
pub fn op_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_norm(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the values of `a`, but normalized using RMSNorm. pub fn op_rms_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_rms_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_rms_norm(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the multiplication of `a` and `b`. pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_mul(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_mul(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Unknown. pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_repeat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_repeat(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -220,61 +229,59 @@ impl Context { /// /// Result is m columns, p rows pub fn op_mul_mat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { sys::ggml_mul_mat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_mul_mat(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the addition of `a` and `b`. pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_add(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_add(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the [SiLU](https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html) activation function applied to `a`. pub fn op_silu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_silu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_silu(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Scales `a` by the 1D tensor `b`. pub fn op_scale(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_scale(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_scale(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place, scales `a` by the 1D tensor `b`. pub fn op_scale_inplace(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = - unsafe { sys::ggml_scale_inplace(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + unsafe { sys::ggml_scale_inplace(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Sets the elements above the diagonal to -INF. pub fn op_diag_mask_inf(&self, a: &Tensor, n_past: usize) -> Tensor { - let tensor = unsafe { - sys::ggml_diag_mask_inf(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) - }; + let tensor = + unsafe { sys::ggml_diag_mask_inf(self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) }; self.new_tensor_raw(tensor) } /// In-place, sets the elements above the diagonal to -INF. 
pub fn op_diag_mask_inf_inplace(&self, a: &Tensor, n_past: usize) -> Tensor { let tensor = unsafe { - sys::ggml_diag_mask_inf_inplace(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) + sys::ggml_diag_mask_inf_inplace(self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) }; self.new_tensor_raw(tensor) } /// Applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. pub fn op_soft_max(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_soft_max(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_soft_max(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place, applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. pub fn op_soft_max_inplace(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_soft_max_inplace(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_soft_max_inplace(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -298,8 +305,7 @@ impl Context { a: &Tensor, fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32), ) -> Tensor { - let tensor = - unsafe { sys::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; + let tensor = unsafe { sys::ggml_map_unary_f32(self.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; self.new_tensor_raw(tensor) } @@ -325,7 +331,7 @@ impl Context { fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32), ) -> Tensor { let tensor = unsafe { - sys::ggml_map_binary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) + sys::ggml_map_binary_f32(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) }; self.new_tensor_raw(tensor) } @@ -337,9 +343,8 @@ impl Context { offset < a.nbytes(), "Cannot create tensor view with offset larger than tensor" ); - let tensor = unsafe { - sys::ggml_view_1d(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) - }; + let tensor = + unsafe { sys::ggml_view_1d(self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) }; self.new_tensor_raw(tensor) } @@ -348,7 +353,7 @@ impl Context { let (ne0, ne1) = ne; let tensor = unsafe { sys::ggml_view_2d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -371,7 +376,7 @@ impl Context { let (nb1, nb2) = nb; let tensor = unsafe { sys::ggml_view_3d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -386,7 +391,7 @@ impl Context { /// Copies `a` to `b` and returns `b`. 
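The public surface of these operation builders is unchanged by the `as_ptr` refactor. A small illustrative graph; the sizes are arbitrary, `Type::F32` is assumed to be available as elsewhere in the crate, and nothing is actually executed here:

```rust
use ggml::{Context, Tensor, Type};

// Attention-style toy graph built from the operation helpers above.
fn tiny_graph(ctx: &Context) -> Tensor {
    let a = ctx.new_tensor_2d(Type::F32, 4, 4);
    let b = ctx.new_tensor_2d(Type::F32, 4, 4);
    let scores = ctx.op_mul_mat(&a, &b);       // 4x4 result
    let masked = ctx.op_diag_mask_inf(&scores, 0);
    ctx.op_soft_max(&masked)
}
```
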
pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_cpy(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_cpy(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -394,7 +399,7 @@ impl Context { pub fn op_permute(&self, a: &Tensor, axes: (usize, usize, usize, usize)) -> Tensor { let tensor = unsafe { sys::ggml_permute( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(axes.0), usize_to_i32(axes.1), @@ -407,8 +412,7 @@ impl Context { /// In-place; reshapes `a` in accordance with the dimensions of `b` pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { sys::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_reshape(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -416,7 +420,7 @@ impl Context { pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { let tensor = unsafe { sys::ggml_reshape_2d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -429,7 +433,7 @@ impl Context { pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let tensor = unsafe { sys::ggml_reshape_3d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -441,7 +445,7 @@ impl Context { /// ggml_cont pub fn op_cont(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_cont(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_cont(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -449,7 +453,7 @@ impl Context { pub fn op_rope(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { let tensor = unsafe { sys::ggml_rope( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(npast), usize_to_i32(ndims), @@ -464,7 +468,7 @@ impl Context { pub fn op_rope_inplace(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { let tensor = unsafe { sys::ggml_rope_inplace( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(npast), usize_to_i32(ndims), @@ -479,7 +483,7 @@ impl Context { pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize, bias_max: f32) -> Tensor { let tensor = unsafe { sys::ggml_alibi( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past), usize_to_i32(n_head), @@ -492,7 +496,7 @@ impl Context { /// Gaussian Error Linear Units pub fn op_gelu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_gelu(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } } @@ -502,8 +506,7 @@ impl Context { fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { let tensor = Tensor { ptr: NonNull::new(raw).expect("Should not be null"), - ctx: Arc::downgrade(&self.ptr), - offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), + inner: Arc::downgrade(&self.inner), }; if self.can_offload { @@ -511,6 +514,10 @@ impl Context { } tensor } + + fn as_ptr(&self) -> *mut sys::ggml_context { + self.inner.ptr.as_ptr() + } } impl Drop for Context { @@ -518,12 +525,12 @@ impl Drop for Context { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. 
unsafe { // if we moved tensors to an accelerator we need to free them - for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { + for (_, tensor) in self.inner.offloaded_tensors.lock().unwrap().drain() { if tensor.backend() != Backend::Cpu { tensor.free_accelerator(); } } - sys::ggml_free(self.ptr.as_ptr()); + sys::ggml_free(self.as_ptr()); } } } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 49ec741b..25b8bf44 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -1,11 +1,6 @@ -use std::{ - collections::HashMap, - os::raw::c_void, - ptr::NonNull, - sync::{Mutex, Weak}, -}; +use std::{os::raw::c_void, ptr::NonNull, sync::Weak}; -use crate::{accelerator::Backend, i64_to_usize, sys, Type}; +use crate::{accelerator::Backend, context::ContextInner, i64_to_usize, sys, Type}; const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; @@ -13,8 +8,7 @@ const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; /// underlying context it was created with is alive. pub struct Tensor { pub(crate) ptr: NonNull, - pub(crate) ctx: Weak>, - pub(crate) offloaded_tensors: Weak>>, + pub(crate) inner: Weak, } impl Tensor { @@ -126,8 +120,7 @@ impl Tensor { pub fn share(&self) -> Self { Tensor { ptr: self.ptr, - ctx: Weak::clone(&self.ctx), - offloaded_tensors: Weak::clone(&self.offloaded_tensors), + inner: Weak::clone(&self.inner), } } @@ -223,7 +216,6 @@ impl Tensor { /// This is temporary while GGML improves their context memory management. This should only be called by /// `Context` when it is dropped. pub(crate) fn free_accelerator(self) { - println!("Freeing tensor {}", self.name()); #[cfg(feature = "cublas")] unsafe { sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr()); @@ -236,19 +228,19 @@ impl Tensor { } impl Tensor { fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } + let _ctx = self + .inner + .upgrade() + .expect("Using a tensor after the context was dropped"); + f() } fn with_alive_ctx_mut(&mut self, mut f: impl FnMut(&mut Tensor) -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f(self) - } else { - panic!("Using a tensor after the context was dropped") - } + let _ctx = self + .inner + .upgrade() + .expect("Using a tensor after the context was dropped"); + f(self) } /// Sets the acceleration backend of the tensor. @@ -264,9 +256,10 @@ impl Tensor { /// Adds this tensor to the context's list of offloaded tensors, so that it will be automatically freed. fn mark_as_offloaded(&self) { - self.offloaded_tensors + self.inner .upgrade() .expect("Attempted to update a dropped context's offloaded tensors") + .offloaded_tensors .lock() .unwrap() .insert(self.name(), self.share()); From 70d57dc17ff5e98a00f8dc7ec3631b77763401f4 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 02:23:49 +0200 Subject: [PATCH 26/28] refactor(ggml): unify context creation --- crates/ggml/src/context.rs | 104 +++++++++++++---------- crates/ggml/src/lib.rs | 2 +- crates/llm-base/src/inference_session.rs | 6 +- crates/llm-base/src/loader.rs | 6 +- crates/llm-base/src/lora.rs | 2 +- 5 files changed, 68 insertions(+), 52 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index c367c83c..dea550cd 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -18,11 +18,8 @@ pub struct Context { /// with it if the underlying context has been deallocated. 
inner: Arc, - /// Memory mapping information - pub mmap: Option, - - /// Backing buffer (in case we own it) - pub buffer: Option, + /// The storage for this context. This is stored so that the buffer can be dropped when the context is dropped. + storage: Option, /// Whether the context can offload tensors to the GPU pub can_offload: bool, @@ -46,7 +43,6 @@ pub(crate) struct ContextInner { // interface and its scratch buffer solution. pub offloaded_tensors: Mutex>, } - impl ContextInner { pub(crate) fn new(ptr: *mut ggml_sys::ggml_context) -> Arc { Arc::new(Self { @@ -56,60 +52,72 @@ impl ContextInner { } } +/// Controls how the context uses memory. +pub enum ContextStorage { + /// Use the provided buffer as memory. + Buffer(Buffer), + /// Use the provided memory mapped file as memory. + Mmap(Mmap), + /// Allocate `mem_size` bytes of memory. + Allocate { + /// The size, in bytes, of the memory in to allocate. + mem_size: usize, + }, +} + impl Context { - /// Creates a new [Context] using the buffer provided as memory - pub fn init_buffer(buffer: Buffer) -> Self { - let raw = unsafe { - sys::ggml_init(sys::ggml_init_params { + /// Creates a new [Context] with the given storage.. + pub fn new(storage: ContextStorage) -> Self { + let init_params = match &storage { + ContextStorage::Buffer(buffer) => sys::ggml_init_params { mem_size: buffer.size(), mem_buffer: buffer.data, no_alloc: false, - }) + }, + ContextStorage::Mmap(mmap) => sys::ggml_init_params { + mem_size: mmap.len(), + mem_buffer: std::ptr::null_mut(), + // We are mmapping so ggml does not need to allocate any memory for us + no_alloc: true, + }, + ContextStorage::Allocate { mem_size } => sys::ggml_init_params { + mem_size: *mem_size, + // Null here means we want ggml to own this memory. + mem_buffer: std::ptr::null_mut(), + // It doesn't make sense to `no_alloc` when passing in a `mem_size` in this mode. + no_alloc: false, + }, }; + let raw = unsafe { sys::ggml_init(init_params) }; Self { inner: ContextInner::new(raw), - mmap: None, - buffer: Some(buffer), + storage: Some(storage), can_offload: false, } } - /// Creates a new [Context] with the memory mapped file provided - pub fn init_mmap(mmap: Mmap) -> Self { - let raw = unsafe { - sys::ggml_init(sys::ggml_init_params { - mem_size: mmap.len(), - mem_buffer: std::ptr::null_mut(), - no_alloc: true, // We are mmapping so ggml does not need to allocate any memory for us - }) - }; + /// Creates a new [Context] with the specified buffer. + /// The buffer will be used by GGML. + pub fn new_with_buffer(buffer: Buffer) -> Self { + Self::new(ContextStorage::Buffer(buffer)) + } - Self { - inner: ContextInner::new(raw), - mmap: Some(mmap), - buffer: None, - can_offload: false, - } + /// Creates a new [Context] with the specified memory mapped file. + pub fn new_with_mmap(mmap: Mmap) -> Self { + Self::new(ContextStorage::Mmap(mmap)) } - /// Creates a new [Context] with the specified `mem_size` as a working area. - pub fn init(mem_size: usize, alloc: bool) -> Self { - let raw = unsafe { - sys::ggml_init(sys::ggml_init_params { - mem_size, - // Null here means we want ggml to own this memory. - mem_buffer: std::ptr::null_mut(), - no_alloc: !alloc, - }) - }; + /// Creates a new [Context] with the specified memory size. + /// The memory will be allocated by GGML. 
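With the storage variants unified, the old `init_*` constructors become thin wrappers over `Context::new`. A hedged sketch of the two storage paths that do not need a file; the sizes are arbitrary:

```rust
use ggml::{Buffer, Context};

fn make_contexts() -> (Context, Context) {
    // Let ggml allocate and own 16 MiB of working memory.
    let allocated = Context::new_with_allocate(16 * 1024 * 1024);

    // Hand ggml a caller-owned buffer, as the inference session does for its
    // per-evaluation context. This is shorthand for
    // `Context::new(ContextStorage::Buffer(buffer))`.
    let buffer = Buffer::new(4 * 1024 * 1024);
    let buffered = Context::new_with_buffer(buffer);

    (allocated, buffered)
}
```

Between evaluations, callers that previously re-borrowed `ctx0.buffer` can instead call `recreate()` to rebuild the context over the same storage.
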
+ pub fn new_with_allocate(mem_size: usize) -> Self { + Self::new(ContextStorage::Allocate { mem_size }) + } - Self { - inner: ContextInner::new(raw), - mmap: None, - buffer: None, - can_offload: false, - } + /// Recreates this context using the same storage. + pub fn recreate(&mut self) { + // This is the only operation that can consume the `self.storage`, so we can unwrap here. + *self = Self::new(self.storage.take().unwrap()); } /// If offloading is enabled, all tensors created by this context will be offloaded to the GPU @@ -182,6 +190,14 @@ impl Context { let raw = unsafe { sys::ggml_new_f32(self.as_ptr(), x) }; self.new_tensor_raw(raw) } + + /// Returns the mmap used by this [Context], if any. + pub fn mmap(&self) -> Option<&Mmap> { + match &self.storage { + Some(ContextStorage::Mmap(mmap)) => Some(mmap), + _ => None, + } + } } // Operations impl Context { diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 168a292a..af833b00 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -20,7 +20,7 @@ pub mod util; pub mod accelerator; -pub use context::Context; +pub use context::{Context, ContextStorage}; pub use tensor::Tensor; pub use ggml_sys as sys; diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 9b45687c..ee988eae 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -141,7 +141,7 @@ impl InferenceSession { ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024); } - let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); + let session_ctx = Arc::new(ggml::Context::new_with_allocate(ctx_size)); // Initialize key + value memory tensors let n_mem = n_layer * n_ctx; @@ -167,7 +167,7 @@ impl InferenceSession { }; let eval = Buffer::new(buf_size); - let ctx0 = ggml::Context::init_buffer(eval); + let ctx0 = ggml::Context::new_with_buffer(eval); // Set up Metal support #[cfg(feature = "metal")] @@ -216,7 +216,7 @@ impl InferenceSession { F: FnOnce(BuildContext) -> (ComputationGraph, GraphOutputs), { // Build a graph - self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); + self.ctx0.recreate(); let ctx0 = &mut self.ctx0; let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); ggml::set_tensor_name(&embd, "embd"); diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 8c5d2653..c3ff904b 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -527,10 +527,10 @@ pub fn load( unsafe { let mmap = Mmap::map(&file)?; let file_size = mmap.len() as u64; - (Context::init_mmap(mmap), file_size) + (Context::new_with_mmap(mmap), file_size) } } else { - (Context::init(ctx_size, true), file.metadata()?.len()) + (Context::new_with_allocate(ctx_size), file.metadata()?.len()) }; let tensors_len = tensors.len(); @@ -646,7 +646,7 @@ impl TensorLoader for MmapCompatibleLoader<'_> { &self.context, &mut self.file, &self.path, - self.context.mmap.as_ref(), + self.context.mmap(), ); let mut tensor = main_context.get_tensor(info)?; diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index 8cdc2c88..9dcba74a 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -105,7 +105,7 @@ impl LoraAdapter { // Create a temporary context for the patching operations // TODO: test if GPU can be enabled (make it configurable) - let patch_context = ggml::Context::init(patch_context_size, true); + let patch_context = 
ggml::Context::new_with_allocate(patch_context_size); let mut patch_file = FileContext::new(&patch_context, &mut self.file, &self.path, None); // Load the A and B tensors From 090735a7f42c628cf682a93c7c0c49b32b6870ce Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 02:45:57 +0200 Subject: [PATCH 27/28] fix(ggml): make metal work again --- crates/ggml/src/accelerator/metal.rs | 47 +++++++--------- crates/ggml/src/context.rs | 68 ++++++++++++++++++++---- crates/ggml/src/lib.rs | 1 + crates/llm-base/src/inference_session.rs | 2 +- crates/llm-base/src/loader.rs | 2 +- 5 files changed, 80 insertions(+), 40 deletions(-) diff --git a/crates/ggml/src/accelerator/metal.rs b/crates/ggml/src/accelerator/metal.rs index cbfdbcc6..3a471480 100644 --- a/crates/ggml/src/accelerator/metal.rs +++ b/crates/ggml/src/accelerator/metal.rs @@ -1,6 +1,6 @@ //! Metal support. use crate::{sys::metal, Buffer, ComputationGraph, Context, Tensor}; -use std::{ffi::c_void, ptr::NonNull, sync::Arc}; +use std::{ptr::NonNull, sync::Arc}; /// Acts as a RAII-guard over a `sys::metal::ggml_metal_context`, allocating via /// `ggml_metal_init` and dropping via `ggml_metal_free`. @@ -45,33 +45,24 @@ impl MetalContext { /// Add a context's memory as buffer to this Metal context pub fn add_context(&mut self, from_context: Arc) { - if self.ref_context(from_context.clone()) { - unsafe { - let raw_context = from_context.ptr.as_ptr(); - - let (data_ptr, data_size): (*mut c_void, usize) = - if let Some(ref mmap) = from_context.mmap { - // This is a bit naughty... - (mmap.as_ptr().cast_mut().cast(), mmap.len()) - } else { - ( - ggml_sys::ggml_get_mem_buffer(raw_context), - ggml_sys::ggml_get_mem_size(raw_context), - ) - }; + if !self.ref_context(from_context.clone()) { + return; + } - let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context); - assert!( - metal::ggml_metal_add_buffer( - self.ptr.as_ptr(), - "wt\0".as_ptr().cast(), // FIXME provide an actual name - data_ptr, - data_size, - max_size - ), - "Could not add weight buffer to metal context" - ); - } + unsafe { + let raw_context = from_context.as_ptr(); + let (data_ptr, data_size) = from_context.storage().as_ptr_and_size(&from_context); + let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context); + assert!( + metal::ggml_metal_add_buffer( + self.ptr.as_ptr(), + "wt\0".as_ptr().cast(), // FIXME provide an actual name + data_ptr, + data_size, + max_size + ), + "Could not add weight buffer to metal context" + ); } } } @@ -79,7 +70,7 @@ impl MetalContext { impl MetalContext { /// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before. fn ref_context(&mut self, context: Arc) -> bool { - if self.contexts.iter().any(|c| c.ptr == context.ptr) { + if self.contexts.iter().any(|c| *c == context) { false } else { self.contexts.push(context); diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index dea550cd..472b58c1 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -1,5 +1,6 @@ use std::{ collections::HashMap, + ffi::c_void, os::raw::c_int, ptr::NonNull, sync::{Arc, Mutex}, @@ -11,6 +12,7 @@ use crate::{accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, Tenso /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via `ggml_free`. +#[derive(PartialEq, Eq)] pub struct Context { /// An `Arc` is used to model the relation between the context and the /// allocated tensors. 
Tensors are owned by the object, so a [`Tensor`] @@ -43,6 +45,12 @@ pub(crate) struct ContextInner { // interface and its scratch buffer solution. pub offloaded_tensors: Mutex>, } +impl PartialEq for ContextInner { + fn eq(&self, other: &Self) -> bool { + self.ptr == other.ptr + } +} +impl Eq for ContextInner {} impl ContextInner { pub(crate) fn new(ptr: *mut ggml_sys::ggml_context) -> Arc { Arc::new(Self { @@ -64,6 +72,47 @@ pub enum ContextStorage { mem_size: usize, }, } +impl ContextStorage { + /// Returns the `Mmap` if this is a `Mmap` variant. + pub fn as_mmap(&self) -> Option<&Mmap> { + match self { + Self::Mmap(v) => Some(v), + _ => None, + } + } + + /// Returns the `Buffer` if this is a `Buffer` variant. + pub fn as_buffer(&self) -> Option<&Buffer> { + match self { + Self::Buffer(v) => Some(v), + _ => None, + } + } + + #[allow(dead_code)] + pub(crate) unsafe fn as_ptr_and_size(&self, ctx: &Context) -> (*mut c_void, usize) { + match self { + // This is a bit naughty... + Self::Mmap(mmap) => (mmap.as_ptr().cast_mut() as *mut c_void, mmap.len()), + _ => ( + ggml_sys::ggml_get_mem_buffer(ctx.as_ptr()), + ggml_sys::ggml_get_mem_size(ctx.as_ptr()), + ), + } + } +} +impl PartialEq for ContextStorage { + fn eq(&self, other: &Self) -> bool { + use ContextStorage::*; + match (self, other) { + (Buffer(l0), Buffer(r0)) => l0 == r0, + (Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(), + (Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r, + _ => false, + } + } +} +impl Eq for ContextStorage {} impl Context { /// Creates a new [Context] with the given storage.. @@ -191,12 +240,9 @@ impl Context { self.new_tensor_raw(raw) } - /// Returns the mmap used by this [Context], if any. - pub fn mmap(&self) -> Option<&Mmap> { - match &self.storage { - Some(ContextStorage::Mmap(mmap)) => Some(mmap), - _ => None, - } + /// Returns a reference to the [ContextStorage] used by this [Context]. + pub fn storage(&self) -> &ContextStorage { + self.storage.as_ref().unwrap() } } // Operations @@ -516,6 +562,12 @@ impl Context { self.new_tensor_raw(tensor) } } +// Public to this crate methods +impl Context { + pub(crate) fn as_ptr(&self) -> *mut sys::ggml_context { + self.inner.ptr.as_ptr() + } +} // Private methods impl Context { /// Wraps a raw tensor with a weak pointer to the context. @@ -530,10 +582,6 @@ impl Context { } tensor } - - fn as_ptr(&self) -> *mut sys::ggml_context { - self.inner.ptr.as_ptr() - } } impl Drop for Context { diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index af833b00..277c02c0 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -259,6 +259,7 @@ impl Type { /// A buffer of memory that can be used as a scratch buffer for a [Context]. /// /// See [Context::use_scratch]. 
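Callers that previously reached into `ctx.mmap` or `ctx.buffer` now go through the `ContextStorage` accessors. A small sketch mirroring the updated call sites:

```rust
// Illustrative only: report which storage backs a context.
fn describe(ctx: &ggml::Context) -> &'static str {
    match (ctx.storage().as_buffer(), ctx.storage().as_mmap()) {
        (Some(_), _) => "backed by a caller-provided Buffer",
        (_, Some(_)) => "backed by a memory-mapped file",
        _ => "backed by ggml-allocated memory",
    }
}
```
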
+#[derive(PartialEq, Eq)] pub struct Buffer { data: *mut c_void, layout: Layout, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index ee988eae..33f80f02 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -174,7 +174,7 @@ impl InferenceSession { let metal_context = { if config.use_gpu { let mut metal_context = MetalContext::new(config.n_threads); - metal_context.add_scratch_buffer(ctx0.buffer.as_ref().unwrap()); + metal_context.add_scratch_buffer(ctx0.storage().as_buffer().unwrap()); for buf in scratch.iter() { metal_context.add_scratch_buffer(buf); diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index c3ff904b..74544309 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -646,7 +646,7 @@ impl TensorLoader for MmapCompatibleLoader<'_> { &self.context, &mut self.file, &self.path, - self.context.mmap(), + self.context.storage().as_mmap(), ); let mut tensor = main_context.get_tensor(info)?; From d815857412581b1bd41cc78a2e8303f47dfa56b8 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 12:41:52 +0200 Subject: [PATCH 28/28] fix(ggml): only set backend after if guards --- crates/ggml/src/tensor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 25b8bf44..34c2ab2d 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -65,10 +65,10 @@ impl Tensor { if current_backend != Backend::Cpu && backend == Backend::Cpu { unimplemented!("Tensors cannot be moved from an accelerator to the CPU at present"); } - t.set_backend(backend); if backend == Backend::Cpu { return; } + t.set_backend(backend); #[cfg(feature = "cublas")] unsafe {