From ce7ffd14a76c263ee39f89ccefc7e82c4f6faf54 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Thu, 22 Jun 2023 10:19:03 +0200 Subject: [PATCH 01/28] Update LLama.cpp --- crates/ggml/src/lib.rs | 2 +- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/lib.rs | 169 ++++++++++++++++++++++++++++--------- 3 files changed, 132 insertions(+), 41 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 79623a69..54bb3992 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -399,5 +399,5 @@ pub fn cpu_has_gpublas() -> bool { /// Sets the name of a tensor. pub fn set_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); - unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) } + unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()); } } diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 8ab8ba62..bbca06e2 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 8ab8ba62eb27cc340be2edf3418e051b1d967416 +Subproject commit bbca06e26949686d61a5126332680ba3cccf235c diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index a9a64bfd..c90b9874 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -100,42 +100,46 @@ pub const ggml_op_GGML_OP_NEG: ggml_op = 18; pub const ggml_op_GGML_OP_STEP: ggml_op = 19; pub const ggml_op_GGML_OP_RELU: ggml_op = 20; pub const ggml_op_GGML_OP_GELU: ggml_op = 21; -pub const ggml_op_GGML_OP_SILU: ggml_op = 22; -pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 23; -pub const ggml_op_GGML_OP_NORM: ggml_op = 24; -pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 25; -pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 26; -pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 27; -pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 28; -pub const ggml_op_GGML_OP_SCALE: ggml_op = 29; -pub const ggml_op_GGML_OP_SET: ggml_op = 30; -pub const ggml_op_GGML_OP_CPY: ggml_op = 31; -pub const ggml_op_GGML_OP_CONT: ggml_op = 32; -pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 33; -pub const ggml_op_GGML_OP_VIEW: ggml_op = 34; -pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 35; -pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 36; -pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 37; -pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 38; -pub const ggml_op_GGML_OP_DIAG: ggml_op = 39; -pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 40; -pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 41; -pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 42; -pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 43; -pub const ggml_op_GGML_OP_ROPE: ggml_op = 44; -pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 45; -pub const ggml_op_GGML_OP_ALIBI: ggml_op = 46; -pub const ggml_op_GGML_OP_CLAMP: ggml_op = 47; -pub const ggml_op_GGML_OP_CONV_1D_1S: ggml_op = 48; -pub const ggml_op_GGML_OP_CONV_1D_2S: ggml_op = 49; -pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 50; -pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 51; -pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 52; -pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 53; -pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 54; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 55; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 56; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 57; +pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 22; +pub const ggml_op_GGML_OP_SILU: ggml_op = 23; +pub const ggml_op_GGML_OP_SILU_BACK: ggml_op 
= 24; +pub const ggml_op_GGML_OP_NORM: ggml_op = 25; +pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 26; +pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 27; +pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 28; +pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 29; +pub const ggml_op_GGML_OP_SCALE: ggml_op = 30; +pub const ggml_op_GGML_OP_SET: ggml_op = 31; +pub const ggml_op_GGML_OP_CPY: ggml_op = 32; +pub const ggml_op_GGML_OP_CONT: ggml_op = 33; +pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 34; +pub const ggml_op_GGML_OP_VIEW: ggml_op = 35; +pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 36; +pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 37; +pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 38; +pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 39; +pub const ggml_op_GGML_OP_DIAG: ggml_op = 40; +pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 41; +pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 42; +pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 43; +pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 44; +pub const ggml_op_GGML_OP_ROPE: ggml_op = 45; +pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 46; +pub const ggml_op_GGML_OP_ALIBI: ggml_op = 47; +pub const ggml_op_GGML_OP_CLAMP: ggml_op = 48; +pub const ggml_op_GGML_OP_CONV_1D_S1_PH: ggml_op = 49; +pub const ggml_op_GGML_OP_CONV_1D_S2_PH: ggml_op = 50; +pub const ggml_op_GGML_OP_CONV_2D_SK_P0: ggml_op = 51; +pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 52; +pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 53; +pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 54; +pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 55; +pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 56; +pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 57; +pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 58; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 59; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 60; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 61; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -939,7 +943,10 @@ extern "C" { pub fn ggml_get_name(tensor: *const ggml_tensor) -> *const ::std::os::raw::c_char; } extern "C" { - pub fn ggml_set_name(tensor: *mut ggml_tensor, name: *const ::std::os::raw::c_char); + pub fn ggml_set_name( + tensor: *mut ggml_tensor, + name: *const ::std::os::raw::c_char, + ) -> *mut ggml_tensor; } extern "C" { pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; @@ -1001,6 +1008,13 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_sub_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_mul( ctx: *mut ggml_context, @@ -1008,6 +1022,13 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_mul_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_div( ctx: *mut ggml_context, @@ -1015,12 +1036,25 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_div_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_sqr(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_sqr_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_sqrt(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn 
ggml_sqrt_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_log(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1053,24 +1087,52 @@ extern "C" { extern "C" { pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_abs_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_sgn(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_sgn_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_neg(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_neg_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_step(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_step_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_relu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_gelu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_gelu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_gelu_quick(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_gelu_quick_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) + -> *mut ggml_tensor; +} extern "C" { pub fn ggml_silu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_silu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_silu_back( ctx: *mut ggml_context, @@ -1081,9 +1143,15 @@ extern "C" { extern "C" { pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_rms_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_rms_norm_back( ctx: *mut ggml_context, @@ -1394,14 +1462,21 @@ extern "C" { ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_1s( + pub fn ggml_conv_1d_s1_ph( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_2s( + pub fn ggml_conv_1d_s2_ph( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_conv_2d_sk_p0( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, @@ -1436,6 +1511,22 @@ extern "C" { c1: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_win_part( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + w: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_win_unpart( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + w0: ::std::os::raw::c_int, + h0: ::std::os::raw::c_int, + w: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} pub type ggml_unary_op_f32_t = ::std::option::Option< unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32), >; From 
46fb5e34f8305178020aad19ab08a3ddf51b2d3c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Thu, 22 Jun 2023 13:55:40 +0200 Subject: [PATCH 02/28] Made LLama work with CUDA --- binaries/llm-cli/src/cli_args.rs | 7 +- crates/ggml/src/context.rs | 25 +++- crates/ggml/src/lib.rs | 141 ++++++++++++++++++++++- crates/ggml/src/tensor.rs | 30 +++++ crates/llm-base/src/inference_session.rs | 72 +++++++++--- crates/llm-base/src/lib.rs | 10 -- crates/llm-base/src/loader.rs | 20 ++++ crates/llm-base/src/model/mod.rs | 16 +++ crates/models/bloom/src/lib.rs | 2 +- crates/models/gpt2/src/lib.rs | 12 +- crates/models/gptj/src/lib.rs | 2 +- crates/models/gptneox/src/lib.rs | 14 +-- crates/models/llama/src/lib.rs | 56 +++++---- crates/models/mpt/src/lib.rs | 10 +- 14 files changed, 347 insertions(+), 70 deletions(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 39a14fb4..cf6cc6ab 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -306,6 +306,7 @@ impl Generate { memory_k_type: mem_typ, memory_v_type: mem_typ, use_gpu: self.use_gpu, + n_batch: self.batch_size, } } @@ -320,7 +321,6 @@ impl Generate { pub fn inference_parameters(&self, eot: llm::TokenId) -> InferenceParameters { InferenceParameters { n_threads: self.num_threads(), - n_batch: self.batch_size, sampler: Arc::new(llm::samplers::TopPTopK { top_k: self.top_k, top_p: self.top_p, @@ -406,6 +406,10 @@ pub struct ModelLoad { /// LoRA adapter to use for the model #[arg(long, num_args(0..))] pub lora_paths: Option>, + + /// Number of layers to run on the GPU. If not specified, all layers will be run on the GPU. + #[arg(long)] + pub gpu_layers: Option, } impl ModelLoad { pub fn load(&self, use_gpu: bool) -> Result> { @@ -414,6 +418,7 @@ impl ModelLoad { context_size: self.num_ctx_tokens, lora_adapters: self.lora_paths.clone(), use_gpu, + gpu_layers: self.gpu_layers, }; let mut sp = Some(spinoff::Spinner::new( diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 9dd7d108..819a0ef8 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -18,6 +18,9 @@ pub struct Context { /// Backing buffer (in case we own it) pub buffer: Option, + + /// Whether the context can offload tensors to the GPU + pub can_offload: bool, } impl Context { @@ -35,6 +38,7 @@ impl Context { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), mmap: None, buffer: Some(buffer), + can_offload: false, } } @@ -52,6 +56,7 @@ impl Context { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), mmap: Some(mmap), buffer: None, + can_offload: false, } } @@ -70,15 +75,31 @@ impl Context { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), mmap: None, buffer: None, + can_offload: false, } } + /// If offloading is enabled, all tensors created by this context will be offloaded to the GPU + pub fn enable_offloading(&mut self) { + self.can_offload = true; + } + + /// Disables the offloading of tensors to the GPU + pub fn disable_offloading(&mut self) { + self.can_offload = false; + } + /// Wraps a raw tensor with a weak pointer to the context. fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { - Tensor { + let tensor = Tensor { ptr: NonNull::new(raw).expect("Should not be null"), ctx: Arc::downgrade(&self.ptr), + }; + + if self.can_offload { + crate::accelerator_offload_tensor(&tensor); } + tensor } /// Creates a new 1D tensor. 
@@ -429,7 +450,7 @@ impl Context {
     /// Sets the scratch buffer to be used by this [Context].
     ///
     /// If `scratch_buffer` is `None`, the scratch buffer will be disabled.
-    pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a mut Buffer>) {
+    pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) {
         let (size, data) = if let Some(buffer) = scratch_buffer {
             (buffer.size(), buffer.data)
         } else {
diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index 54bb3992..941e3e4e 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -29,6 +29,65 @@ mod tests;
 #[cfg(feature = "metal")]
 pub mod metal;
 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+///Accelerators supported by `ggml`.
+pub enum Accelerator {
+    ///CuBLAS accelerated
+    CuBLAS,
+    ///CLBlast accelerated
+    CLBlast,
+    ///Metal accelerated
+    Metal,
+    ///Cpu accelerated
+    None,
+}
+
+///Returns the accelerator `ggml` was compiled with.
+pub fn get_accelerator() -> Accelerator {
+    #[cfg(feature = "cublas")]
+    return Accelerator::CuBLAS;
+    #[cfg(feature = "clblast")]
+    return Accelerator::CLBlast;
+    #[cfg(feature = "metal")]
+    return Accelerator::Metal;
+    #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))]
+    return Accelerator::None;
+}
+
+#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
+///Backend to use for a tensor.
+pub enum Backend {
+    /// CPU backend
+    #[default]
+    Cpu,
+    /// GPU backend
+    Gpu,
+    ///Multi-GPU backend
+    GpuSplit,
+}
+
+impl From<Backend> for sys::ggml_backend {
+    fn from(b: Backend) -> Self {
+        match b {
+            Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU,
+            Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU,
+            Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT,
+        }
+    }
+}
+
+impl TryFrom<sys::ggml_backend> for Backend {
+    type Error = ();
+    fn try_from(b: sys::ggml_backend) -> Result<Self, Self::Error> {
+        match b {
+            sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu),
+            sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu),
+            sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit),
+            _ => Err(()),
+        }
+    }
+}
+
 /// The type of a tensor element.
 pub type ElementType = Type;
 
@@ -399,5 +458,85 @@ pub fn cpu_has_gpublas() -> bool {
 /// Sets the name of a tensor.
 pub fn set_name(tensor: &Tensor, name: &str) {
     let c_name = std::ffi::CString::new(name).unwrap();
-    unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()); }
+    unsafe {
+        sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr());
+    }
+}
+
+/// Gets the acceleration backend of a tensor.
+pub fn get_tensor_backend(tensor: &sys::ggml_tensor) -> Backend {
+    (tensor.backend as sys::ggml_backend).try_into().unwrap()
+}
+
+/// Sets the acceleration backend of a tensor.
+/// # Safety
+/// This function assumes that the tensor is valid.
+pub unsafe fn set_tensor_backend(tensor: *mut sys::ggml_tensor, backend: Backend) {
+    unsafe {
+        (*tensor).backend = backend.try_into().unwrap();
+    }
+}
+
+/// If ggml-sys is compiled with CUDA or ClBlast support, this function will transform and offload the tensor. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_transform_tensor(tensor: &mut Tensor) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_transform_tensor(tensor.data(), tensor.ptr.as_ptr());
+    }
+    #[cfg(feature = "clblast")]
+    unsafe {
+        sys::opencl::ggml_cl_transform_tensor(tensor.data(), tensor.ptr.as_ptr());
+    }
+}
+
+/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op.
+pub fn accelerator_offload_tensor(tensor: &Tensor) {
+    accelerator_offload_raw_tensor(tensor.ptr.as_ptr());
+}
+
+/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_offload_raw_tensor(tensor: *mut sys::ggml_tensor) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_assign_buffers(tensor);
+    }
+}
+
+/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_offload_tensor_no_scratch(tensor: &Tensor) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_assign_buffers_no_scratch(tensor.ptr.as_ptr());
+    }
+}
+
+/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_set_scratch_size(size: usize) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_set_scratch_size(size);
+    }
+}
+
+///Initialize the accelerator. If ggml-sys is compiled with CUDA or ClBlast support, this function will initialize the accelerator. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn accelerator_initialize(device: i32) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        //TODO: Make this configurable
+        sys::cuda::ggml_init_cublas();
+        sys::cuda::ggml_cuda_set_main_device(device);
+        let split = 1.0f32;
+        sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
+    }
+
+    #[cfg(feature = "clblast")]
+    unsafe {
+        sys::opencl::ggml_cl_init();
+    }
+ }
diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs
index 39f67d67..947c1e71 100644
--- a/crates/ggml/src/tensor.rs
+++ b/crates/ggml/src/tensor.rs
@@ -15,6 +15,36 @@ impl Tensor {
     /// Exposed for purposes of determining context size.
     pub const C_TYPE_SIZE: usize = std::mem::size_of::<sys::ggml_tensor>();
 
+    ///Sets the name of the tensor
+    pub fn set_name(&mut self, name: &str) -> &Tensor {
+        assert!(name.len() <= 32, "Name is too long!");
+
+        let bytes = name.as_bytes();
+        let mut array = [0i8; 32];
+        array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::<Vec<i8>>());
+
+        unsafe { self.ptr.as_mut().name = array }
+        self
+    }
+
+    ///Gets the name of the tensor
+    pub fn get_name(&self) -> String {
+        let name = unsafe { self.ptr.as_ref().name };
+        let mut name = name.iter().map(|&x| x as u8).collect::<Vec<u8>>();
+        name.retain(|&x| x != 0);
+        String::from_utf8(name).unwrap()
+    }
+
+    ///Sets the acceleration backend of the tensor
+    pub fn set_backend(&mut self, backend: crate::Backend) {
+        unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) }
+    }
+
+    ///Gets the acceleration backend of the tensor
+    pub fn get_backend(&self) -> crate::Backend {
+        unsafe { crate::get_tensor_backend(self.ptr.as_ref()) }
+    }
+
     /// Creates a shared copy of this tensor pointer.
pub fn share(&self) -> Self { Tensor { diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 57ea0908..d6ef7eaa 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,5 +1,5 @@ use ggml::{Buffer, ComputationGraph, Context, Tensor}; -use std::{fmt::Display, sync::Arc}; +use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; #[cfg(feature = "metal")] @@ -25,6 +25,24 @@ fn scratch_buffers() -> ScratchBuffers { ] } +fn kv_memory( + context: &Context, + config: &InferenceSessionConfig, + n_elements: usize, +) -> (Tensor, Tensor) { + let memory_k = context.new_tensor_1d(config.memory_k_type.into(), n_elements); + let memory_v = context.new_tensor_1d(config.memory_v_type.into(), n_elements); + ggml::set_name(&memory_k, "memory_k"); + ggml::set_name(&memory_v, "memory_v"); + + if config.use_gpu { + ggml::accelerator_offload_tensor_no_scratch(&memory_k); + ggml::accelerator_offload_tensor_no_scratch(&memory_v); + } + + (memory_k, memory_v) +} + /// Result of graph building pub struct GraphOutputs { /// The output containing the model's result @@ -92,19 +110,27 @@ pub struct InferenceSession { } pub struct BuildContext<'session> { - pub ctx0: &'session Context, + //FIXME: Borrowing issue, dont know how to fix it + pub ctx0: RefCell<&'session mut Context>, pub embd: &'session Tensor, pub memory_k: &'session Tensor, pub memory_v: &'session Tensor, - pub scratch: &'session mut ScratchBuffers, + pub scratch: &'session ScratchBuffers, } impl<'session> BuildContext<'session> { - pub fn use_scratch(&mut self, idx: Option) { - self.ctx0.use_scratch(match idx { - None => None, - Some(idx) => Some(&mut self.scratch[idx]), - }) + pub fn get_scratch(&self, idx: usize) -> Option<&Buffer> { + Some(&self.scratch[idx]) + } + + pub fn enable_offloading(&self) { + let mut ctx0 = self.ctx0.borrow_mut(); + ctx0.enable_offloading(); + } + + pub fn disable_offloading(&self) { + let mut ctx0 = self.ctx0.borrow_mut(); + ctx0.disable_offloading(); } } @@ -137,15 +163,18 @@ impl InferenceSession { ctx_size }; + //TODO: check if this is needed and the right place to put it + if config.use_gpu { + ggml::accelerator_initialize(0); + ggml::accelerator_set_scratch_size(config.n_batch * 1024 * 1024); + } + let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); // Initialize key + value memory tensors let n_mem = n_layer * n_ctx; let n_elements = n_embd * n_mem; - let memory_k = session_ctx.new_tensor_1d(config.memory_k_type.into(), n_elements); - let memory_v = session_ctx.new_tensor_1d(config.memory_v_type.into(), n_elements); - ggml::set_name(&memory_k, "memory_k"); - ggml::set_name(&memory_v, "memory_v"); + let (memory_k, memory_v) = kv_memory(&session_ctx, &config, n_elements); let scratch = scratch_buffers(); @@ -216,12 +245,12 @@ impl InferenceSession { { // Build a graph self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); - let ctx0 = &self.ctx0; + let ctx0 = &mut self.ctx0; let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); ggml::set_name(&embd, "embd"); let bc = BuildContext { - ctx0, + ctx0: RefCell::new(ctx0), embd: &embd, memory_k: &self.memory_k, memory_v: &self.memory_v, @@ -296,7 +325,7 @@ impl InferenceSession { return Err(InferenceError::ContextFull); } - for batch in prompt_tokens.chunks(params.n_batch) { + for batch in prompt_tokens.chunks(self.config.n_batch) { model.evaluate(self, params, batch, output_request); for &tk in batch { let 
should_call_callback = Some(tk) != model.bot_token_id(); @@ -480,7 +509,7 @@ impl InferenceSession { let n_ctx = model.context_size(); let n_chunk = tokens.len() / n_ctx; let n_vocab = model.vocabulary().len(); - let n_batch = parameters.n_batch; + let n_batch = self.config.n_batch; let mut nll = 0.0; @@ -705,13 +734,24 @@ pub struct InferenceSessionConfig { /// Whether to use GPU acceleration pub use_gpu: bool, + /// Controls batch/chunk size for prompt ingestion in [InferenceSession::feed_prompt]. + /// + /// This is the number of tokens that will be ingested at once. This is useful for + /// trying to speed up the ingestion of prompts, as it allows for parallelization. + /// However, you will be fundamentally limited by your machine's ability to evaluate + /// the transformer model, so increasing the batch size will not always help. + /// + /// A reasonable default value is 8. + pub n_batch: usize, } + impl Default for InferenceSessionConfig { fn default() -> Self { Self { memory_k_type: ModelKVMemoryType::Float16, memory_v_type: ModelKVMemoryType::Float16, use_gpu: false, + n_batch: 8, } } } diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index 127fedf6..caa1fa55 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -65,15 +65,6 @@ pub struct InferenceParameters { /// A reasonable default value is 8, as most modern high-performance computers have /// 8 physical cores. Adjust to your needs. pub n_threads: usize, - /// Controls batch/chunk size for prompt ingestion in [InferenceSession::feed_prompt]. - /// - /// This is the number of tokens that will be ingested at once. This is useful for - /// trying to speed up the ingestion of prompts, as it allows for parallelization. - /// However, you will be fundamentally limited by your machine's ability to evaluate - /// the transformer model, so increasing the batch size will not always help. - /// - /// A reasonable default value is 8. - pub n_batch: usize, /// The sampler to use for sampling tokens from the model's probabilities. /// /// Each time the model runs, it generates a distribution of probabilities; each token @@ -94,7 +85,6 @@ impl Default for InferenceParameters { fn default() -> Self { Self { n_threads: 8, - n_batch: 8, sampler: Arc::new(samplers::TopPTopK::default()), } } diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 6524a198..a1254114 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -341,6 +341,8 @@ impl LoadError { pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; + /// Gets a tensor from the loader and tries to offload it to the specified backend. + fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result; /// Finish loading the model, and extract all of the state from the loader. fn finish(self) -> (Context, HashMap); } @@ -615,6 +617,15 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } + fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result { + let mut tensor = self.load(name)?; + if backend != ggml::Backend::Cpu { + tensor.set_backend(backend); + crate::ggml::accelerator_transform_tensor(&mut tensor); + } + Ok(tensor) + } + fn finish(self) -> (Context, HashMap) { (self.context, self.loaded_tensors) } @@ -686,6 +697,15 @@ impl<'a> FileContext<'a> { } } + // The tensor name is truncated to 32 characters. + + let tensor_name = if name.len() > 32 { + &name[name.len() - 32..] 
+ } else { + name + }; + tensor.set_name(tensor_name); + Ok(tensor) } } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 15bd8ee5..95b1c790 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -190,6 +190,8 @@ pub struct ModelParameters { pub lora_adapters: Option>, /// Whether to use GPU acceleration when available pub use_gpu: bool, + /// The number of layers to offload to the gpu. If `None`, all layers will be offloaded. + pub gpu_layers: Option, } impl Default for ModelParameters { @@ -199,6 +201,20 @@ impl Default for ModelParameters { context_size: 2048, lora_adapters: None, use_gpu: false, + gpu_layers: None, + } + } +} + +impl ModelParameters { + /// Returns true if the model should offload the given layer to the accelerator. + pub fn should_offload(&self, layer: usize) -> bool { + if !self.use_gpu { + false + } else if let Some(offloadable_layers) = self.gpu_layers { + layer < offloadable_layers + } else { + true } } } diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index d44f143e..4f256409 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -140,7 +140,7 @@ impl KnownModel for Bloom { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let ctx0 = builder.ctx0; + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index abc0726d..28911510 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -129,8 +129,8 @@ impl KnownModel for Gpt2 { .. } = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), @@ -149,7 +149,7 @@ impl KnownModel for Gpt2 { let mut gf = ggml::ComputationGraph::new(num_threads); for il in 0..n_layer { - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm let mut current = ctx0.op_norm(&input_layer); @@ -263,7 +263,7 @@ impl KnownModel for Gpt2 { // feed-forward let ff_in = current.share(); - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); // feed-forward normalization current = ctx0.op_norm(&ff_in); @@ -293,7 +293,7 @@ impl KnownModel for Gpt2 { input_layer = ctx0.op_add(¤t, &ff_in); } - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // normalization input_layer = ctx0.op_norm(&input_layer); @@ -302,7 +302,7 @@ impl KnownModel for Gpt2 { &ctx0.op_repeat(&self.ln_f_b, &input_layer), ); - builder.use_scratch(None); + ctx0.use_scratch(None); let embeddings_tensor: ggml::Tensor = input_layer.share(); diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 5ec7d5bc..9ae601b0 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -132,7 +132,7 @@ impl KnownModel for GptJ { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { - let ctx0 = builder.ctx0; + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), diff --git 
a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 5b4ea0c0..5fd0f059 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -147,8 +147,8 @@ impl KnownModel for GptNeoX { .. } = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); let (memory_k_size, memory_v_size) = ( @@ -160,7 +160,7 @@ impl KnownModel for GptNeoX { for il in 0..n_layer { // attention uses first scratch buffer - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // self-attention let mut current = ctx0.op_norm(&input_layer); @@ -280,12 +280,12 @@ impl KnownModel for GptNeoX { ); // use the second scratch for the feed forward - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); let feedforward_input: Tensor; if !use_parallel_residual { feedforward_input = ctx0.op_add(¤t, &input_layer); - current = feed_forward_network(ctx0, &self.layers[il], &feedforward_input); + current = feed_forward_network(&ctx0, &self.layers[il], &feedforward_input); // input for next layer input_layer = ctx0.op_add(¤t, &feedforward_input); } else { @@ -294,7 +294,7 @@ impl KnownModel for GptNeoX { // this is independent of the self-attention result, so it could be done in parallel to the self-attention // note here we pass inpL instead of cur - current = feed_forward_network(ctx0, &self.layers[il], &input_layer); + current = feed_forward_network(&ctx0, &self.layers[il], &input_layer); // layer input + FF current = ctx0.op_add(¤t, &feedforward_input); @@ -305,7 +305,7 @@ impl KnownModel for GptNeoX { } // use the first scratch for the norm - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(0)); // normalize the output input_layer = ctx0.op_norm(&input_layer); diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index d4abb2e1..734fdf76 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -4,7 +4,7 @@ use std::{error::Error, sync::Arc}; use llm_base::{ - ggml, + ggml::{self, Backend}, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, @@ -18,7 +18,7 @@ use llm_base::{ pub struct Llama { // the context size ("memory") the model should use when evaluating a prompt context_size: usize, - + model_params: ModelParameters, hyperparameters: Hyperparameters, vocabulary: Vocabulary, @@ -53,32 +53,39 @@ impl KnownModel for Llama { // model-global weights let wte = tl.load("tok_embeddings.weight")?; - let norm = tl.load("norm.weight")?; - let output = tl.load("output.weight")?; + let norm = tl.offload("norm.weight", Backend::Gpu)?; + + let output = tl.offload("output.weight", Backend::Gpu)?; let mut layers = Vec::new(); + for i in 0..hyperparameters.n_layer { + let backend = if params.should_offload(i) { + Backend::Gpu + } else { + Backend::Cpu + }; let layer = Layer { - attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"))?, - wq: tl.load(&format!("layers.{i}.attention.wq.weight"))?, - wk: tl.load(&format!("layers.{i}.attention.wk.weight"))?, - wv: tl.load(&format!("layers.{i}.attention.wv.weight"))?, - wo: 
tl.load(&format!("layers.{i}.attention.wo.weight"))?, - ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"))?, - w1: tl.load(&format!("layers.{i}.feed_forward.w1.weight"))?, - w2: tl.load(&format!("layers.{i}.feed_forward.w2.weight"))?, - w3: tl.load(&format!("layers.{i}.feed_forward.w3.weight"))?, + attention_norm: tl + .offload(&format!("layers.{i}.attention_norm.weight"), backend)?, + wq: tl.offload(&format!("layers.{i}.attention.wq.weight"), backend)?, + wk: tl.offload(&format!("layers.{i}.attention.wk.weight"), backend)?, + wv: tl.offload(&format!("layers.{i}.attention.wv.weight"), backend)?, + wo: tl.offload(&format!("layers.{i}.attention.wo.weight"), backend)?, + ffn_norm: tl.offload(&format!("layers.{i}.ffn_norm.weight"), backend)?, + w1: tl.offload(&format!("layers.{i}.feed_forward.w1.weight"), backend)?, + w2: tl.offload(&format!("layers.{i}.feed_forward.w2.weight"), backend)?, + w3: tl.offload(&format!("layers.{i}.feed_forward.w3.weight"), backend)?, }; - layers.push(layer); } - let (context, _tensors) = tl.finish(); let ModelParameters { context_size, .. } = params; Ok(Self { hyperparameters, + model_params: params, context_size, vocabulary, wte, @@ -122,8 +129,8 @@ impl KnownModel for Llama { file_type: _, } = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); @@ -137,10 +144,17 @@ impl KnownModel for Llama { }, ); for il in 0..n_layer { + //TODO: find a better way to do this + if self.model_params.should_offload(il) { + ctx0.enable_offloading(); + } else { + ctx0.disable_offloading(); + } + let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm current = ctx0.op_rms_norm(&input_layer); @@ -266,7 +280,7 @@ impl KnownModel for Llama { // projection (no bias) current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); @@ -293,7 +307,8 @@ impl KnownModel for Llama { // input for next layer input_layer = current; } - builder.use_scratch(Some(0)); + + ctx0.use_scratch(builder.get_scratch(0)); // norm input_layer = ctx0.op_rms_norm(&input_layer); @@ -303,6 +318,7 @@ impl KnownModel for Llama { let embedding_result: ggml::Tensor = input_layer.share(); + ctx0.disable_offloading(); // lm_head input_layer = ctx0.op_mul_mat(&self.output, &input_layer); diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 10ce78e9..e32b545e 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -116,8 +116,8 @@ impl KnownModel for Mpt { .. 
} = self.hyperparameters; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), builder.memory_v.element_size(), @@ -131,7 +131,7 @@ impl KnownModel for Mpt { let mut gf = ggml::ComputationGraph::new(num_threads); for il in 0..n_layer { // attention uses first scratch buffer - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); let mut current = ctx0.op_norm(&input_layer); current = ctx0.op_mul( @@ -224,7 +224,7 @@ impl KnownModel for Mpt { input_layer = ctx0.op_add(&input_layer, ¤t); // feed forward uses second scratch buffer - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); current = ctx0.op_norm(&input_layer); current = ctx0.op_mul( @@ -243,7 +243,7 @@ impl KnownModel for Mpt { } //use scratch buffer 0 for the rest - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm input_layer = ctx0.op_norm(&input_layer); From 2c90981d7d288bff6d4d0bd4cbaf681da394ec33 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Thu, 22 Jun 2023 14:17:30 +0200 Subject: [PATCH 03/28] formatting --- crates/ggml/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 941e3e4e..fe6f5364 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -538,5 +538,4 @@ pub fn accelerator_initialize(device: i32) { unsafe { sys::opencl::ggml_cl_init(); } - } From 93dac8f79ec9c31b18df17d0a0e22944fcddb269 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:36:36 +0200 Subject: [PATCH 04/28] Merge: Accelerator Functions --- crates/ggml/src/lib.rs | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index b2155c8f..4527a985 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -484,3 +484,81 @@ pub fn set_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; } + +/// Gets the acceleration backend of a tensor. +pub fn get_tensor_backend(tensor: &sys::ggml_tensor) -> Backend { + (tensor.backend as sys::ggml_backend).try_into().unwrap() +} + +/// Sets the acceleration backend of a tensor. +/// # Safety +/// This function assumes that the tensor is valid. +pub unsafe fn set_tensor_backend(tensor: *mut sys::ggml_tensor, backend: Backend) { + unsafe { + (*tensor).backend = backend.try_into().unwrap(); + } +} + +/// If ggml-sys is compiled with CUDA or ClBlast support, this function will tranform and offload the tensor. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_transform_tensor(tensor: &mut Tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); + } +} + +/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. 
+pub fn accelerator_offload_tensor(tensor: &Tensor) { + accelerator_offload_raw_tensor(tensor.ptr.as_ptr()); +} + +/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_offload_raw_tensor(tensor: *mut sys::ggml_tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers(tensor); + } +} + +/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_offload_tensor_no_scratch(tensor: &Tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers_no_scratch(tensor.ptr.as_ptr()); + } +} + +/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_set_scratch_size(size: usize) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_set_scratch_size(size); + } +} + +///Initialize the accelerator. If ggml-sys is compiled with CUDA or ClBlast support, this function will initialize the accelerator. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_initialize(device: i32) { + #[cfg(feature = "cublas")] + unsafe { + //TODO: Make this configurable + sys::cuda::ggml_init_cublas(); + sys::cuda::ggml_cuda_set_main_device(device); + let split = 1.0f32; + sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); + } + + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_init(); + } + +} \ No newline at end of file From 8499798a634b89bc39fe1341e516de1d48d705f8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:38:09 +0200 Subject: [PATCH 05/28] We need pre-commit hooks --- crates/ggml/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 4527a985..06ceb9cb 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -560,5 +560,4 @@ pub fn accelerator_initialize(device: i32) { unsafe { sys::opencl::ggml_cl_init(); } - -} \ No newline at end of file +} From a1f61b46a4e6f5c465928a0e21e0fff0b5eb9fc6 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:45:50 +0200 Subject: [PATCH 06/28] update llama.cpp --- binaries/generate-ggml-bindings/src/main.rs | 2 + crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/lib.rs | 96 ++++++++++++++++++++- crates/ggml/sys/src/llama.rs | 2 +- 4 files changed, 97 insertions(+), 5 deletions(-) diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs index 5bc5d8ca..432d7ffc 100644 --- a/binaries/generate-ggml-bindings/src/main.rs +++ b/binaries/generate-ggml-bindings/src/main.rs @@ -23,6 +23,8 @@ fn main() { fn generate_main(ggml_path: &Path, src_path: &Path) { let bindings = bindgen::Builder::default() + .header(ggml_path.join("ggml.h").to_str().unwrap().to_string()) + .allowlist_file(r".*ggml.h") .header(ggml_path.join("k_quants.h").to_string_lossy()) .allowlist_file(r".*k_quants.h") // Suppress some warnings diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index bbca06e2..447ccbe8 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit bbca06e26949686d61a5126332680ba3cccf235c +Subproject commit 
447ccbe8c39332fcdd0d98a041b6e2ff6f06219d diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index 01f1b52f..d5caebe9 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -140,9 +140,12 @@ pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 55; pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 56; pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 57; pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 58; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 59; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 60; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 61; +pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 59; +pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 60; +pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 61; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 62; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 63; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 64; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -951,6 +954,13 @@ extern "C" { name: *const ::std::os::raw::c_char, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_format_name( + tensor: *mut ggml_tensor, + fmt: *const ::std::os::raw::c_char, + ... + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1541,6 +1551,23 @@ pub type ggml_binary_op_f32_t = ::std::option::Option< arg4: *const f32, ), >; +pub type ggml_custom1_op_f32_t = + ::std::option::Option; +pub type ggml_custom2_op_f32_t = ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut ggml_tensor, + arg2: *const ggml_tensor, + arg3: *const ggml_tensor, + ), +>; +pub type ggml_custom3_op_f32_t = ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut ggml_tensor, + arg2: *const ggml_tensor, + arg3: *const ggml_tensor, + arg4: *const ggml_tensor, + ), +>; extern "C" { pub fn ggml_map_unary_f32( ctx: *mut ggml_context, @@ -1548,6 +1575,13 @@ extern "C" { fun: ggml_unary_op_f32_t, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_map_unary_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_unary_op_f32_t, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_map_binary_f32( ctx: *mut ggml_context, @@ -1556,6 +1590,62 @@ extern "C" { fun: ggml_binary_op_f32_t, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_map_binary_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_binary_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom1_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_custom1_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom1_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_custom1_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom2_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_custom2_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom2_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_custom2_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom3_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + c: *mut ggml_tensor, + fun: ggml_custom3_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom3_inplace_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + c: *mut ggml_tensor, 
+ fun: ggml_custom3_op_f32_t, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_cross_entropy_loss( ctx: *mut ggml_context, diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs index 4157472a..33e9e541 100644 --- a/crates/ggml/sys/src/llama.rs +++ b/crates/ggml/sys/src/llama.rs @@ -28,4 +28,4 @@ pub const LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15; pub const LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16; pub const LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17; pub const LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18; -pub type llama_ftype = ::std::os::raw::c_uint; +pub type llama_ftype = ::std::os::raw::c_int; From e1476a429145e4be6f76332cd686f876fe01a02c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:42:47 +0200 Subject: [PATCH 07/28] Update falcon --- crates/llm-base/src/loader.rs | 2 +- crates/models/falcon/src/lib.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 0c53e855..a5ac1c2a 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -30,7 +30,7 @@ pub struct FileType { impl From for i32 { fn from(value: FileType) -> Self { (value.quantization_version * ggml::QNT_VERSION_FACTOR) as i32 - + ggml::sys::llama::llama_ftype::from(value.format) as i32 + + ggml::sys::llama::llama_ftype::from(value.format) } } impl TryFrom for FileType { diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 8ee37453..b5d3ce4e 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -131,8 +131,8 @@ impl KnownModel for Falcon { let head_dim = n_embd / n_head; let n = input_len; - let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { - let ctx0 = builder.ctx0; + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0.borrow(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd); let repeat_dummy = ctx0.new_tensor_3d( @@ -157,7 +157,7 @@ impl KnownModel for Falcon { for il in 0..n_layer { // attention uses first scratch buffer - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // self-attention current = ctx0.op_norm(&input_layer); @@ -277,7 +277,7 @@ impl KnownModel for Falcon { current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); // feed forward uses second scratch buffer - builder.use_scratch(Some(1)); + ctx0.use_scratch(builder.get_scratch(1)); let inp_ff = layernorm_output.share(); let attn_out = @@ -293,7 +293,7 @@ impl KnownModel for Falcon { input_layer = current.share(); } - builder.use_scratch(Some(0)); + ctx0.use_scratch(builder.get_scratch(0)); // norm input_layer = ctx0.op_norm(&input_layer); @@ -308,7 +308,7 @@ impl KnownModel for Falcon { let embeddings_tensor: ggml::Tensor = input_layer.share(); - builder.use_scratch(None); + ctx0.use_scratch(None); // lm_head input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); From 4a18bff6db799641178ee78453184b691c83d5ea Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 30 Jun 2023 17:09:03 +0200 Subject: [PATCH 08/28] Sync latest llama.cpp --- crates/ggml/src/context.rs | 2 ++ crates/ggml/src/tensor.rs | 4 ++-- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/cuda.rs | 3 +++ crates/ggml/sys/src/lib.rs | 21 +++++++++++++++------ crates/ggml/sys/src/llama.rs | 1 + 6 files changed, 24 
insertions(+), 9 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 819a0ef8..f05e918f 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -416,6 +416,7 @@ impl Context { usize_to_i32(npast), usize_to_i32(ndims), mode, + 0, ) }; self.new_tensor_raw(tensor) @@ -430,6 +431,7 @@ impl Context { usize_to_i32(npast), usize_to_i32(ndims), mode, + 0, ) }; self.new_tensor_raw(tensor) diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 947c1e71..c313eaf7 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -17,10 +17,10 @@ impl Tensor { ///Sets the name of the tensor pub fn set_name(&mut self, name: &str) -> &Tensor { - assert!(name.len() <= 32, "Name is too long!"); + assert!(name.len() <= 48, "Name is too long!"); let bytes = name.as_bytes(); - let mut array = [0i8; 32]; + let mut array = [0i8; 48]; array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::>()); unsafe { self.ptr.as_mut().name = array } diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 447ccbe8..b8c8dda7 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 447ccbe8c39332fcdd0d98a041b6e2ff6f06219d +Subproject commit b8c8dda75fdf5fdea49c80af36818e7c30fe0ddf diff --git a/crates/ggml/sys/src/cuda.rs b/crates/ggml/sys/src/cuda.rs index 7eff110b..b6a82745 100644 --- a/crates/ggml/sys/src/cuda.rs +++ b/crates/ggml/sys/src/cuda.rs @@ -85,6 +85,9 @@ extern "C" { extern "C" { pub fn ggml_cuda_assign_buffers_no_scratch(tensor: *mut ggml_tensor); } +extern "C" { + pub fn ggml_cuda_assign_buffers_force_inplace(tensor: *mut ggml_tensor); +} extern "C" { pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int); } diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index d5caebe9..dd30f4de 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -22,9 +22,10 @@ pub const GGML_MAX_NODES: u32 = 4096; pub const GGML_MAX_PARAMS: u32 = 256; pub const GGML_MAX_CONTEXTS: u32 = 64; pub const GGML_MAX_OPT: u32 = 4; -pub const GGML_MAX_NAME: u32 = 32; +pub const GGML_MAX_NAME: u32 = 48; pub const GGML_DEFAULT_N_THREADS: u32 = 4; pub const QK_K: u32 = 256; +pub const K_SCALE_SIZE: u32 = 12; pub type ggml_fp16_t = u16; extern "C" { pub fn ggml_fp16_to_fp32(x: ggml_fp16_t) -> f32; @@ -230,7 +231,7 @@ pub struct ggml_tensor { pub perf_cycles: i64, pub perf_time_us: i64, pub data: *mut ::std::os::raw::c_void, - pub name: [::std::os::raw::c_char; 32usize], + pub name: [::std::os::raw::c_char; 48usize], pub extra: *mut ::std::os::raw::c_void, pub padding: [::std::os::raw::c_char; 4usize], } @@ -240,7 +241,7 @@ fn bindgen_test_layout_ggml_tensor() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 224usize, + 240usize, concat!("Size of: ", stringify!(ggml_tensor)) ); assert_eq!( @@ -420,7 +421,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize }, - 208usize, + 224usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -430,7 +431,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, - 216usize, + 232usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -439,7 +440,7 @@ fn bindgen_test_layout_ggml_tensor() { ) ); } -pub const GGML_TENSOR_SIZE: usize = 224; +pub const GGML_TENSOR_SIZE: usize = 240; #[repr(C)] #[derive(Debug, Copy, Clone)] 
pub struct ggml_cgraph { @@ -777,6 +778,12 @@ extern "C" { extern "C" { pub fn ggml_cycles_per_ms() -> i64; } +extern "C" { + pub fn ggml_numa_init(); +} +extern "C" { + pub fn ggml_is_numa() -> bool; +} extern "C" { pub fn ggml_print_object(obj: *const ggml_object); } @@ -1437,6 +1444,7 @@ extern "C" { n_past: ::std::os::raw::c_int, n_dims: ::std::os::raw::c_int, mode: ::std::os::raw::c_int, + n_ctx: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { @@ -1446,6 +1454,7 @@ extern "C" { n_past: ::std::os::raw::c_int, n_dims: ::std::os::raw::c_int, mode: ::std::os::raw::c_int, + n_ctx: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs index 33e9e541..2d5a9a6f 100644 --- a/crates/ggml/sys/src/llama.rs +++ b/crates/ggml/sys/src/llama.rs @@ -11,6 +11,7 @@ pub const LLAMA_FILE_MAGIC: u32 = 1734830708; pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468; pub const LLAMA_SESSION_MAGIC: u32 = 1734833006; pub const LLAMA_SESSION_VERSION: u32 = 1; +pub const LLAMA_DEFAULT_SEED: u32 = 4294967295; pub const LLAMA_FTYPE_ALL_F32: llama_ftype = 0; pub const LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1; pub const LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2; From 4ff7f2139e1927225b037eb8059c4353a9f2ae8b Mon Sep 17 00:00:00 2001 From: Lukas Kreussel Date: Fri, 7 Jul 2023 09:44:54 +0200 Subject: [PATCH 09/28] Adjust to latest changes from main --- binaries/llm-test/src/main.rs | 1 - crates/llm-base/src/inference_session.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/binaries/llm-test/src/main.rs b/binaries/llm-test/src/main.rs index aa259779..4c59394c 100644 --- a/binaries/llm-test/src/main.rs +++ b/binaries/llm-test/src/main.rs @@ -418,7 +418,6 @@ fn run_inference( prompt: input.into(), parameters: &llm::InferenceParameters { n_threads: model_config.threads, - n_batch: 1, sampler: Arc::new(DeterministicSampler), }, play_back_previous_tokens: false, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 8b1010f9..c082b4b4 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,6 +1,6 @@ use ggml::{Buffer, ComputationGraph, Context, Tensor}; use serde::Serialize; -use std::{fmt::Display, sync::Arc}; +use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; #[cfg(feature = "metal")] From d933f1e8c219e7c654ca9db9960a5f0e1e25ef19 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:02:36 +0200 Subject: [PATCH 10/28] Make clblast buildable on windows --- crates/ggml/sys/build.rs | 7 +- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/cuda.rs | 31 ------ crates/ggml/sys/src/lib.rs | 211 +++++++++++++++++++----------------- 4 files changed, 120 insertions(+), 131 deletions(-) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 084f09de..38960b21 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -155,8 +155,11 @@ fn lib_path(prefix: &str) -> String { fn enable_clblast(build: &mut cc::Build) { println!("cargo:rustc-link-lib=clblast"); println!("cargo:rustc-link-lib=OpenCL"); - //enable dynamic linking against stdc++ - println!(r"cargo:rustc-link-lib=dylib=stdc++"); + + if cfg!(linux) { + //enable dynamic linking against stdc++ + println!(r"cargo:rustc-link-lib=dylib=stdc++"); + } build.file("llama-cpp/ggml-opencl.cpp"); build.flag("-DGGML_USE_CLBLAST"); diff --git 
a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index b8c8dda7..481f793a 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit b8c8dda75fdf5fdea49c80af36818e7c30fe0ddf +Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc diff --git a/crates/ggml/sys/src/cuda.rs b/crates/ggml/sys/src/cuda.rs index b6a82745..aeeff19a 100644 --- a/crates/ggml/sys/src/cuda.rs +++ b/crates/ggml/sys/src/cuda.rs @@ -4,37 +4,6 @@ use super::ggml_compute_params; use super::ggml_tensor; pub const GGML_CUDA_MAX_DEVICES: u32 = 16; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_tensor_extra_gpu { - pub data_device: [*mut ::std::os::raw::c_void; 16usize], -} -#[test] -fn bindgen_test_layout_ggml_tensor_extra_gpu() { - const UNINIT: ::std::mem::MaybeUninit = - ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 128usize, - concat!("Size of: ", stringify!(ggml_tensor_extra_gpu)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_tensor_extra_gpu)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).data_device) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor_extra_gpu), - "::", - stringify!(data_device) - ) - ); -} extern "C" { pub fn ggml_init_cublas(); } diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index dd30f4de..ce79bcc3 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -34,10 +34,10 @@ extern "C" { pub fn ggml_fp32_to_fp16(x: f32) -> ggml_fp16_t; } extern "C" { - pub fn ggml_fp16_to_fp32_row(x: *const ggml_fp16_t, y: *mut f32, n: usize); + pub fn ggml_fp16_to_fp32_row(x: *const ggml_fp16_t, y: *mut f32, n: ::std::os::raw::c_int); } extern "C" { - pub fn ggml_fp32_to_fp16_row(x: *const f32, y: *mut ggml_fp16_t, n: usize); + pub fn ggml_fp32_to_fp16_row(x: *const f32, y: *mut ggml_fp16_t, n: ::std::os::raw::c_int); } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -96,57 +96,59 @@ pub const ggml_op_GGML_OP_LOG: ggml_op = 10; pub const ggml_op_GGML_OP_SUM: ggml_op = 11; pub const ggml_op_GGML_OP_SUM_ROWS: ggml_op = 12; pub const ggml_op_GGML_OP_MEAN: ggml_op = 13; -pub const ggml_op_GGML_OP_REPEAT: ggml_op = 14; -pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 15; -pub const ggml_op_GGML_OP_ABS: ggml_op = 16; -pub const ggml_op_GGML_OP_SGN: ggml_op = 17; -pub const ggml_op_GGML_OP_NEG: ggml_op = 18; -pub const ggml_op_GGML_OP_STEP: ggml_op = 19; -pub const ggml_op_GGML_OP_RELU: ggml_op = 20; -pub const ggml_op_GGML_OP_GELU: ggml_op = 21; -pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 22; -pub const ggml_op_GGML_OP_SILU: ggml_op = 23; -pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 24; -pub const ggml_op_GGML_OP_NORM: ggml_op = 25; -pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 26; -pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 27; -pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 28; -pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 29; -pub const ggml_op_GGML_OP_SCALE: ggml_op = 30; -pub const ggml_op_GGML_OP_SET: ggml_op = 31; -pub const ggml_op_GGML_OP_CPY: ggml_op = 32; -pub const ggml_op_GGML_OP_CONT: ggml_op = 33; -pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 34; -pub const ggml_op_GGML_OP_VIEW: ggml_op = 35; -pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 36; -pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 37; -pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 38; -pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 39; -pub const 
ggml_op_GGML_OP_DIAG: ggml_op = 40; -pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 41; -pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 42; -pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 43; -pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 44; -pub const ggml_op_GGML_OP_ROPE: ggml_op = 45; -pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 46; -pub const ggml_op_GGML_OP_ALIBI: ggml_op = 47; -pub const ggml_op_GGML_OP_CLAMP: ggml_op = 48; -pub const ggml_op_GGML_OP_CONV_1D_S1_PH: ggml_op = 49; -pub const ggml_op_GGML_OP_CONV_1D_S2_PH: ggml_op = 50; -pub const ggml_op_GGML_OP_CONV_2D_SK_P0: ggml_op = 51; -pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 52; -pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 53; -pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 54; -pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 55; -pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 56; -pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 57; -pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 58; -pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 59; -pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 60; -pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 61; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 62; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 63; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 64; +pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14; +pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15; +pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16; +pub const ggml_op_GGML_OP_ABS: ggml_op = 17; +pub const ggml_op_GGML_OP_SGN: ggml_op = 18; +pub const ggml_op_GGML_OP_NEG: ggml_op = 19; +pub const ggml_op_GGML_OP_STEP: ggml_op = 20; +pub const ggml_op_GGML_OP_TANH: ggml_op = 21; +pub const ggml_op_GGML_OP_ELU: ggml_op = 22; +pub const ggml_op_GGML_OP_RELU: ggml_op = 23; +pub const ggml_op_GGML_OP_GELU: ggml_op = 24; +pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 25; +pub const ggml_op_GGML_OP_SILU: ggml_op = 26; +pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 27; +pub const ggml_op_GGML_OP_NORM: ggml_op = 28; +pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 29; +pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 30; +pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 31; +pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 32; +pub const ggml_op_GGML_OP_SCALE: ggml_op = 33; +pub const ggml_op_GGML_OP_SET: ggml_op = 34; +pub const ggml_op_GGML_OP_CPY: ggml_op = 35; +pub const ggml_op_GGML_OP_CONT: ggml_op = 36; +pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 37; +pub const ggml_op_GGML_OP_VIEW: ggml_op = 38; +pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 39; +pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 40; +pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 41; +pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 42; +pub const ggml_op_GGML_OP_DIAG: ggml_op = 43; +pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 44; +pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 45; +pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 46; +pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 47; +pub const ggml_op_GGML_OP_ROPE: ggml_op = 48; +pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 49; +pub const ggml_op_GGML_OP_ALIBI: ggml_op = 50; +pub const ggml_op_GGML_OP_CLAMP: ggml_op = 51; +pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 52; +pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 53; +pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 54; +pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 55; +pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 56; +pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 57; +pub const 
ggml_op_GGML_OP_WIN_UNPART: ggml_op = 58; +pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 59; +pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 60; +pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 61; +pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 62; +pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 63; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 64; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 65; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 66; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -1090,6 +1092,9 @@ extern "C" { extern "C" { pub fn ggml_mean(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_argmax(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_repeat( ctx: *mut ggml_context, @@ -1128,6 +1133,18 @@ extern "C" { extern "C" { pub fn ggml_step_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_tanh(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_tanh_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_elu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_elu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1484,24 +1501,35 @@ extern "C" { ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_s1_ph( + pub fn ggml_conv_1d( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + s0: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + d0: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_s2_ph( + pub fn ggml_conv_2d( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + s0: ::std::os::raw::c_int, + s1: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + p1: ::std::os::raw::c_int, + d0: ::std::os::raw::c_int, + d1: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_2d_sk_p0( + pub fn ggml_conv_1d_ph( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + s: ::std::os::raw::c_int, + d: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } extern "C" { @@ -2666,13 +2694,13 @@ extern "C" { extern "C" { pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int; } -pub type dequantize_row_q_t = ::std::option::Option< +pub type ggml_to_float_t = ::std::option::Option< unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int), >; -pub type quantize_row_q_t = ::std::option::Option< +pub type ggml_from_float_t = ::std::option::Option< unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int), >; -pub type vec_dot_q_t = ::std::option::Option< +pub type ggml_vec_dot_t = ::std::option::Option< unsafe extern "C" fn( n: ::std::os::raw::c_int, s: *mut f32, @@ -2682,91 +2710,80 @@ pub type vec_dot_q_t = ::std::option::Option< >; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct quantize_fns_t { - pub dequantize_row_q: dequantize_row_q_t, - pub quantize_row_q: quantize_row_q_t, - pub quantize_row_q_reference: quantize_row_q_t, - pub quantize_row_q_dot: quantize_row_q_t, - pub vec_dot_q: vec_dot_q_t, +pub struct ggml_type_traits_t { + pub to_float: ggml_to_float_t, + pub from_float: ggml_from_float_t, + pub from_float_reference: 
ggml_from_float_t, + pub vec_dot: ggml_vec_dot_t, pub vec_dot_type: ggml_type, } #[test] -fn bindgen_test_layout_quantize_fns_t() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); +fn bindgen_test_layout_ggml_type_traits_t() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( - ::std::mem::size_of::(), - 48usize, - concat!("Size of: ", stringify!(quantize_fns_t)) + ::std::mem::size_of::(), + 40usize, + concat!("Size of: ", stringify!(ggml_type_traits_t)) ); assert_eq!( - ::std::mem::align_of::(), + ::std::mem::align_of::(), 8usize, - concat!("Alignment of ", stringify!(quantize_fns_t)) + concat!("Alignment of ", stringify!(ggml_type_traits_t)) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).dequantize_row_q) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(dequantize_row_q) + stringify!(to_float) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize }, 8usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(quantize_row_q) + stringify!(from_float) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_reference) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize }, 16usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(quantize_row_q_reference) + stringify!(from_float_reference) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_dot) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize }, 24usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(quantize_row_q_dot) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_q) as usize - ptr as usize }, - 32usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", - stringify!(vec_dot_q) + stringify!(vec_dot) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize }, - 40usize, + 32usize, concat!( "Offset of field: ", - stringify!(quantize_fns_t), + stringify!(ggml_type_traits_t), "::", stringify!(vec_dot_type) ) ); } extern "C" { - pub fn ggml_internal_get_quantize_fn(i: usize) -> quantize_fns_t; + pub fn ggml_internal_get_type_traits(i: ggml_type) -> ggml_type_traits_t; } #[repr(C)] #[derive(Debug, Copy, Clone)] From 7ec3683d7b64b2507e1a301e2bdf1b48782a619e Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:25:07 +0200 Subject: [PATCH 11/28] Fix opencl inference --- crates/ggml/src/lib.rs | 5 ----- crates/llm-base/src/inference_session.rs | 10 ---------- crates/models/llama/src/lib.rs | 12 ++++++++++-- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index f6ab7f06..0cb6aa3d 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -559,9 +559,4 @@ pub fn accelerator_initialize(device: i32) { let split = 1.0f32; sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); } - - #[cfg(feature = "clblast")] - unsafe { 
- sys::opencl::ggml_cl_init(); - } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index c082b4b4..860795eb 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -123,16 +123,6 @@ impl<'session> BuildContext<'session> { pub fn get_scratch(&self, idx: usize) -> Option<&Buffer> { Some(&self.scratch[idx]) } - - pub fn enable_offloading(&self) { - let mut ctx0 = self.ctx0.borrow_mut(); - ctx0.enable_offloading(); - } - - pub fn disable_offloading(&self) { - let mut ctx0 = self.ctx0.borrow_mut(); - ctx0.disable_offloading(); - } } unsafe impl Send for InferenceSession {} diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 214ce96d..654f490b 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -52,9 +52,16 @@ impl KnownModel for Llama { // model-global weights let wte = tl.load("tok_embeddings.weight")?; - let norm = tl.offload("norm.weight", Backend::Gpu)?; - let output = tl.offload("output.weight", Backend::Gpu)?; + let backend = if params.should_offload(0) { + Backend::Gpu + } else { + Backend::Cpu + }; + + let norm = tl.offload("norm.weight", backend)?; + + let output = tl.offload("output.weight", backend)?; let mut layers = Vec::new(); @@ -131,6 +138,7 @@ impl KnownModel for Llama { let outputs = session.compute(self.context.clone(), input_tokens, |builder| { let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); // for big prompts, if BLAS is enabled, it is better to use only one thread From 37d6e4fee4be8a33362183e0a37c929d2ada1d4d Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:40:12 +0200 Subject: [PATCH 12/28] Only link clblast dynamically on linux --- crates/ggml/sys/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 38960b21..73fe6f20 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -156,7 +156,7 @@ fn enable_clblast(build: &mut cc::Build) { println!("cargo:rustc-link-lib=clblast"); println!("cargo:rustc-link-lib=OpenCL"); - if cfg!(linux) { + if cfg!(target_os = "linux") { //enable dynamic linking against stdc++ println!(r"cargo:rustc-link-lib=dylib=stdc++"); } From 1cf7e5c3f912a73e592fe331a6bc8c1c83454ed8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel Date: Mon, 10 Jul 2023 14:37:23 +0200 Subject: [PATCH 13/28] Update inference.rs --- binaries/llm-test/src/inference.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs index ec803cdd..95ce02e7 100644 --- a/binaries/llm-test/src/inference.rs +++ b/binaries/llm-test/src/inference.rs @@ -71,7 +71,6 @@ fn run_inference( prompt: input.into(), parameters: &llm::InferenceParameters { n_threads: model_config.threads, - n_batch: 1, sampler: Arc::new(DeterministicSampler), }, play_back_previous_tokens: false, From 7f767b807fccd978bd785f449f66e48f1dc9275e Mon Sep 17 00:00:00 2001 From: Lukas Kreussel Date: Mon, 10 Jul 2023 14:57:53 +0200 Subject: [PATCH 14/28] Free tensor and scratch memory if they are dropped --- crates/ggml/src/lib.rs | 21 +++++++++++++++++++++ crates/ggml/src/tensor.rs | 9 +++++++++ crates/llm-base/src/inference_session.rs | 7 +++++++ 3 files changed, 37 insertions(+) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 0cb6aa3d..540209b9 
100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -560,3 +560,24 @@ pub fn accelerator_initialize(device: i32) { sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); } } + +/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op. +pub fn accelerator_free_scratch() { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_scratch(); + } +} + +/// Frees the memory of a tensor. If ggml-sys is compiled with CUDA or ClBlast support, this function will free the memory of a tensor. If not this is a no-op. +#[allow(unused_variables)] +pub fn accelerator_free_tensor(tensor: &Tensor) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_data(tensor.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::cuda::ggml_cl_free_data(tensor.ptr.as_ptr()); + } +} diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index c313eaf7..40bafe6e 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -155,3 +155,12 @@ impl Tensor { std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } } + +impl Drop for Tensor { + fn drop(&mut self) { + if self.get_backend() != crate::Backend::Cpu { + // if the tensor is not on the cpu, free it from the accelerator + crate::accelerator_free_tensor(self) + } + } +} diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 20625283..84922ab5 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -648,6 +648,13 @@ impl InferenceSession { } } +impl Drop for InferenceSession { + fn drop(&mut self) { + //if we are using an accelerator, we need to free the scratch memory + ggml::accelerator_free_scratch(); + } +} + fn get_newly_decoded_portion_huggingface( model: &dyn Model, tokens: Vec, From 83faca77d2b085daf7730b35282913dae56d54c6 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Mon, 10 Jul 2023 15:32:17 +0200 Subject: [PATCH 15/28] Explicitly free k/v memory --- crates/ggml/src/lib.rs | 2 +- crates/ggml/src/tensor.rs | 9 --------- crates/llm-base/src/inference_session.rs | 4 +++- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 540209b9..33f0cc03 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -578,6 +578,6 @@ pub fn accelerator_free_tensor(tensor: &Tensor) { } #[cfg(feature = "clblast")] unsafe { - sys::cuda::ggml_cl_free_data(tensor.ptr.as_ptr()); + sys::opencl::ggml_cl_free_data(tensor.ptr.as_ptr()); } } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 40bafe6e..c313eaf7 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -155,12 +155,3 @@ impl Tensor { std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } } - -impl Drop for Tensor { - fn drop(&mut self) { - if self.get_backend() != crate::Backend::Cpu { - // if the tensor is not on the cpu, free it from the accelerator - crate::accelerator_free_tensor(self) - } - } -} diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 84922ab5..ce9551f2 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -650,8 +650,10 @@ impl InferenceSession { impl Drop for InferenceSession { fn drop(&mut self) { - //if we are using an accelerator, we need to free the scratch 
memory + //if we are using an accelerator, we need to free the scratch memory and the k/v memory ggml::accelerator_free_scratch(); + ggml::accelerator_free_tensor(&self.memory_k); + ggml::accelerator_free_tensor(&self.memory_v); } } From dfaf7ae453cb858d05f6ca7cce82b25fed0212f8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:34:06 +0200 Subject: [PATCH 16/28] Explicitly drop model weights --- crates/llm/examples/drop_multiple_sessions.rs | 61 +++++++++++++++++++ crates/models/llama/src/lib.rs | 16 ++++- 2 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 crates/llm/examples/drop_multiple_sessions.rs diff --git a/crates/llm/examples/drop_multiple_sessions.rs b/crates/llm/examples/drop_multiple_sessions.rs new file mode 100644 index 00000000..44e18676 --- /dev/null +++ b/crates/llm/examples/drop_multiple_sessions.rs @@ -0,0 +1,61 @@ +use llm::ModelArchitecture; +use llm_base::{InferenceFeedback, InferenceParameters, ModelParameters}; +use std::{convert::Infallible, path::PathBuf}; + +fn main() { + let prompt = "What is the meaning of life?"; + let model_path = PathBuf::from(r"C:\Users\lkreu\Downloads\orca-mini-v2_7b.ggmlv3.q5_K_M.bin"); + let now = std::time::Instant::now(); + + let model = llm::load_dynamic( + Some(ModelArchitecture::Llama), + &model_path, + llm_base::TokenizerSource::Embedded, + ModelParameters { + use_gpu: true, + ..Default::default() + }, + llm::load_progress_callback_stdout, + ) + .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); + + println!( + "Model fully loaded! Elapsed: {}ms", + now.elapsed().as_millis() + ); + + for i in 0..10 { + println!("Starting session {i}"); + let mut session = model.start_session(Default::default()); + session + .feed_prompt( + model.as_ref(), + &InferenceParameters::default(), + prompt, + &mut Default::default(), + |_| Ok::(llm::InferenceFeedback::Continue), + ) + .unwrap(); + drop(session); + println!("Dropped session {i}"); + } + + drop(model); + + println!("Model dropped! Elapsed: {}ms", now.elapsed().as_millis()); + + for _ in 0..5 { + let model = llm::load_dynamic( + Some(ModelArchitecture::Llama), + &model_path, + llm_base::TokenizerSource::Embedded, + ModelParameters { + use_gpu: true, + ..Default::default() + }, + llm::load_progress_callback_stdout, + ) + .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); + drop(model); + } +} diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 8e5052bd..91822ec3 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -1,7 +1,7 @@ //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. 
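Taken together, the two patches above settle on explicit cleanup rather than a blanket Drop impl on Tensor: accelerator-side scratch memory and any offloaded tensors are freed when their owner goes away. A minimal sketch of that pattern, assuming the accelerator_free_scratch and accelerator_free_tensor helpers added in crates/ggml/src/lib.rs; GpuSession and its fields are placeholder names, not part of the patches.

struct GpuSession {
    memory_k: ggml::Tensor,
    memory_v: ggml::Tensor,
}

impl Drop for GpuSession {
    fn drop(&mut self) {
        // Free the CUDA scratch buffers first (a no-op unless built with the `cublas` feature)...
        ggml::accelerator_free_scratch();
        // ...then release the tensors that were offloaded to the accelerator.
        ggml::accelerator_free_tensor(&self.memory_k);
        ggml::accelerator_free_tensor(&self.memory_v);
    }
}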
#![deny(missing_docs)] -use std::{error::Error, sync::Arc}; +use std::{collections::HashMap, error::Error, sync::Arc}; use llm_base::{ ggml::{self, Backend}, @@ -34,11 +34,22 @@ pub struct Llama { // must be kept alive for the model context: Arc, + loaded_tensors: HashMap, } unsafe impl Send for Llama {} unsafe impl Sync for Llama {} +impl Drop for Llama { + fn drop(&mut self) { + for (_, tensor) in self.loaded_tensors.drain() { + if tensor.get_backend() != Backend::Cpu { + ggml::accelerator_free_tensor(&tensor); + } + } + } +} + impl KnownModel for Llama { type Hyperparameters = Hyperparameters; @@ -85,7 +96,7 @@ impl KnownModel for Llama { }; layers.push(layer); } - let (context, _tensors) = tl.finish(); + let (context, loaded_tensors) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -99,6 +110,7 @@ impl KnownModel for Llama { output, layers, context: Arc::new(context), + loaded_tensors, }) } From 53095b1c2b7068708c6852e44e5d42c9f05b6554 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Wed, 12 Jul 2023 12:25:52 +0200 Subject: [PATCH 17/28] Update `llama.cpp` and integrate graph "planning" --- binaries/llm-cli/src/cli_args.rs | 2 +- binaries/llm-cli/src/main.rs | 13 +- binaries/llm-test/src/delete.rs | 2 +- binaries/llm-test/src/inference.rs | 17 +- binaries/llm-test/src/tokens.rs | 4 +- crates/ggml/src/context.rs | 9 +- crates/ggml/src/lib.rs | 59 +++++- crates/ggml/sys/llama-cpp | 2 +- crates/ggml/sys/src/lib.rs | 196 +++++++++++------- crates/ggml/sys/src/metal.rs | 5 +- crates/llm-base/src/inference_session.rs | 29 ++- crates/llm-base/src/lib.rs | 16 -- crates/llm-base/src/lora.rs | 13 +- crates/llm-base/src/model/mod.rs | 13 +- crates/llm/examples/drop_multiple_sessions.rs | 10 +- crates/llm/examples/embeddings.rs | 7 +- crates/llm/examples/vicuna-chat.rs | 1 - crates/models/bloom/src/lib.rs | 8 +- crates/models/falcon/src/lib.rs | 8 +- crates/models/gpt2/src/lib.rs | 8 +- crates/models/gptj/src/lib.rs | 8 +- crates/models/gptneox/src/lib.rs | 8 +- crates/models/llama/src/lib.rs | 17 +- crates/models/mpt/src/lib.rs | 8 +- 24 files changed, 251 insertions(+), 212 deletions(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index bf8af97b..2c5ba985 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -336,6 +336,7 @@ impl Generate { memory_v_type: mem_typ, use_gpu: self.use_gpu, n_batch: self.batch_size, + n_threads: self.num_threads(), } } @@ -349,7 +350,6 @@ impl Generate { pub fn inference_parameters(&self, eot: llm::TokenId) -> InferenceParameters { InferenceParameters { - n_threads: self.num_threads(), sampler: Arc::new(llm::samplers::TopPTopK { top_k: self.top_k, top_p: self.top_p, diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 9cd8cb84..9815225a 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -111,16 +111,10 @@ fn perplexity(args: &cli_args::Perplexity) -> Result<()> { let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, _) = snapshot::read_or_create_session(model.as_ref(), None, None, inference_session_config); - let parameters = args.generate.inference_parameters(model.eot_token_id()); - session.perplexity( - model.as_ref(), - ¶meters, - prompt.as_str(), - |chunk, perplexity| { - println!("Perplexity[{chunk}]: {perplexity}"); - }, - )?; + session.perplexity(model.as_ref(), prompt.as_str(), |chunk, perplexity| { + println!("Perplexity[{chunk}]: 
{perplexity}"); + })?; Ok(()) } @@ -273,7 +267,6 @@ fn interactive( let sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None); if let Err(InferenceError::ContextFull) = session.feed_prompt( model.as_ref(), - ¶meters, &prompt, // OutputRequest &mut Default::default(), diff --git a/binaries/llm-test/src/delete.rs b/binaries/llm-test/src/delete.rs index 9ddbe7a8..7bcf81df 100644 --- a/binaries/llm-test/src/delete.rs +++ b/binaries/llm-test/src/delete.rs @@ -64,7 +64,7 @@ fn feed_prompt( model: &impl Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { - session.feed_prompt(model, &Default::default(), prompt, output, always_continue) + session.feed_prompt(model, prompt, output, always_continue) } fn always_continue(_: &[u8]) -> Result { diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs index 95ce02e7..5190bb9e 100644 --- a/binaries/llm-test/src/inference.rs +++ b/binaries/llm-test/src/inference.rs @@ -4,7 +4,7 @@ use std::{convert::Infallible, sync::Arc}; -use llm::InferenceStats; +use llm::{InferenceSessionConfig, InferenceStats}; use crate::{ModelConfig, TestCaseReport, TestCaseReportInner, TestCaseReportMeta}; @@ -15,14 +15,11 @@ pub(crate) fn can_infer( expected_output: Option<&str>, maximum_token_count: usize, ) -> anyhow::Result { - let mut session = model.start_session(Default::default()); - let (actual_output, res) = run_inference( - model, - model_config, - &mut session, - input, - maximum_token_count, - ); + let mut session = model.start_session(InferenceSessionConfig { + n_threads: model_config.threads, + ..Default::default() + }); + let (actual_output, res) = run_inference(model, &mut session, input, maximum_token_count); // Process the results Ok(TestCaseReport { @@ -58,7 +55,6 @@ pub(crate) fn can_infer( fn run_inference( model: &dyn llm::Model, - model_config: &ModelConfig, session: &mut llm::InferenceSession, input: &str, maximum_token_count: usize, @@ -70,7 +66,6 @@ fn run_inference( &llm::InferenceRequest { prompt: input.into(), parameters: &llm::InferenceParameters { - n_threads: model_config.threads, sampler: Arc::new(DeterministicSampler), }, play_back_previous_tokens: false, diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs index 260546b8..adddd678 100644 --- a/binaries/llm-test/src/tokens.rs +++ b/binaries/llm-test/src/tokens.rs @@ -65,9 +65,7 @@ fn feed_prompt( model: &impl Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { - session.feed_prompt(model, &Default::default(), prompt, output, |x| { - always_continue(x) - }) + session.feed_prompt(model, prompt, output, always_continue) } fn always_continue(_: &[u8]) -> Result { diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index d4e7c66c..cb7aa21b 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -2,7 +2,7 @@ use std::{os::raw::c_int, ptr::NonNull, sync::Arc}; use memmap2::Mmap; -use crate::{sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, Tensor, Type}; +use crate::{sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type}; /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via `ggml_free`. @@ -442,13 +442,6 @@ impl Context { self.new_tensor_raw(tensor) } - /// Computes the specified graph. Must be run in order to evaluate the graph. 
- pub fn graph_compute(&self, graph: &mut ComputationGraph) { - unsafe { - sys::ggml_graph_compute(self.ptr.as_ptr(), &mut graph.inner); - } - } - /// Retrieves the memory used by this [Context]. pub fn used_mem(&self) -> usize { unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 33f0cc03..3013254f 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -3,7 +3,7 @@ //! It exposes a subset of operations (currently used to implement the [llm](https://crates.io/crates/llm) library). //! Note that it does not expose a fully-idiomatic safe Rust interface; operations that could be potentially unsafe are marked as such. //! -//! `ggml` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. +//! `ggml` operates on a computational graph; no values will be computed until the [Context] is executed via an [GraphExecutionPlan]. //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. #![deny(missing_docs)] @@ -221,6 +221,8 @@ pub enum Type { F16, /// Float 32-bit. F32, + /// Integer 8-bit. + I8, } impl From for sys::ggml_type { fn from(t: Type) -> Self { @@ -239,6 +241,7 @@ impl From for sys::ggml_type { Type::I32 => sys::ggml_type_GGML_TYPE_I32, Type::F16 => sys::ggml_type_GGML_TYPE_F16, Type::F32 => sys::ggml_type_GGML_TYPE_F32, + Type::I8 => sys::ggml_type_GGML_TYPE_I8, } } } @@ -260,6 +263,7 @@ impl TryFrom for Type { sys::ggml_type_GGML_TYPE_I32 => Ok(Type::I32), sys::ggml_type_GGML_TYPE_F16 => Ok(Type::F16), sys::ggml_type_GGML_TYPE_F32 => Ok(Type::F32), + sys::ggml_type_GGML_TYPE_I8 => Ok(Type::I8), _ => Err(()), } @@ -282,6 +286,7 @@ impl std::fmt::Display for Type { Type::I32 => write!(f, "i32"), Type::F16 => write!(f, "f16"), Type::F32 => write!(f, "f32"), + Type::I8 => write!(f, "i8"), } } } @@ -303,6 +308,7 @@ impl Type { Type::I32 => false, Type::F16 => false, Type::F32 => false, + Type::I8 => false, } } } @@ -351,10 +357,9 @@ pub struct ComputationGraph { impl ComputationGraph { /// Create a new [ComputationGraph] with the specified `n_threads`. - pub fn new(n_threads: usize) -> Self { + pub fn new() -> Self { Self { inner: sys::ggml_cgraph { - n_threads: usize_to_i32(n_threads), // SAFETY: This should be safe to zero. The original C++ impl // just leaves it uninitialized ..unsafe { std::mem::zeroed::() } @@ -368,6 +373,54 @@ impl ComputationGraph { } } +impl Default for ComputationGraph { + fn default() -> Self { + Self::new() + } +} + +/// A `ggml` execution plan. Contains the information needed to execute a computation graph. +pub struct GraphExecutionPlan { + inner: sys::ggml_cplan, + inner_graph: sys::ggml_cgraph, +} + +impl GraphExecutionPlan { + /// Create a new [GraphExecutionPlan] from a [ComputationGraph] and the number of threads to use. + pub fn new(graph: &mut ComputationGraph, n_threads: usize) -> Self { + Self { + inner: unsafe { sys::ggml_graph_plan(&mut graph.inner, usize_to_i32(n_threads)) }, + inner_graph: graph.inner, + } + } + + /// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context]. + fn create_work_buffer(&mut self, context: &Context) -> Tensor { + context.new_tensor_1d(Type::I8, self.inner.work_size) + } + + /// Assign a work buffer to this [GraphExecutionPlan]. 
+ fn assign_work_buffer(&mut self, buffer: &mut Tensor) { + assert!( + buffer.get_type() == Type::I8, + "Work buffer must be of type i8" + ); + unsafe { + self.inner.work_data = buffer.data().cast(); + } + } + + /// Execute this [GraphExecutionPlan] in the given [Context]. + pub fn execute(&mut self, context: &Context) { + let mut work_buffer = self.create_work_buffer(context); + self.assign_work_buffer(&mut work_buffer); + + unsafe { + sys::ggml_graph_compute(&mut self.inner_graph, &mut self.inner); + } + } +} + /// The size of `t` as bytes. pub fn type_size(t: Type) -> usize { unsafe { sys::ggml_type_size(t.into()) } diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 481f793a..2b5eb72e 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc +Subproject commit 2b5eb72e109577ed84e44bb8fa47e4956f337300 diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index ce79bcc3..a9779fd1 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -21,9 +21,11 @@ pub const GGML_MAX_DIMS: u32 = 4; pub const GGML_MAX_NODES: u32 = 4096; pub const GGML_MAX_PARAMS: u32 = 256; pub const GGML_MAX_CONTEXTS: u32 = 64; -pub const GGML_MAX_OPT: u32 = 4; +pub const GGML_MAX_SRC: u32 = 6; pub const GGML_MAX_NAME: u32 = 48; pub const GGML_DEFAULT_N_THREADS: u32 = 4; +pub const GGML_EXIT_SUCCESS: u32 = 0; +pub const GGML_EXIT_ABORTED: u32 = 1; pub const QK_K: u32 = 256; pub const K_SCALE_SIZE: u32 = 12; pub type ggml_fp16_t = u16; @@ -225,17 +227,14 @@ pub struct ggml_tensor { pub op: ggml_op, pub is_param: bool, pub grad: *mut ggml_tensor, - pub src0: *mut ggml_tensor, - pub src1: *mut ggml_tensor, - pub opt: [*mut ggml_tensor; 4usize], - pub n_tasks: ::std::os::raw::c_int, + pub src: [*mut ggml_tensor; 6usize], pub perf_runs: ::std::os::raw::c_int, pub perf_cycles: i64, pub perf_time_us: i64, pub data: *mut ::std::os::raw::c_void, pub name: [::std::os::raw::c_char; 48usize], pub extra: *mut ::std::os::raw::c_void, - pub padding: [::std::os::raw::c_char; 4usize], + pub padding: [::std::os::raw::c_char; 8usize], } #[test] fn bindgen_test_layout_ggml_tensor() { @@ -332,48 +331,18 @@ fn bindgen_test_layout_ggml_tensor() { ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).src0) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).src) as usize - ptr as usize }, 96usize, concat!( "Offset of field: ", stringify!(ggml_tensor), "::", - stringify!(src0) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).src1) as usize - ptr as usize }, - 104usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(src1) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).opt) as usize - ptr as usize }, - 112usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(opt) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize }, - 144usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(n_tasks) + stringify!(src) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, - 148usize, + 144usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -445,86 +414,139 @@ fn bindgen_test_layout_ggml_tensor() { pub const GGML_TENSOR_SIZE: usize = 240; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct ggml_cgraph { - pub n_nodes: ::std::os::raw::c_int, - pub n_leafs: ::std::os::raw::c_int, - pub 
n_threads: ::std::os::raw::c_int, +pub struct ggml_cplan { pub work_size: usize, - pub work: *mut ggml_tensor, - pub nodes: [*mut ggml_tensor; 4096usize], - pub grads: [*mut ggml_tensor; 4096usize], - pub leafs: [*mut ggml_tensor; 4096usize], - pub perf_runs: ::std::os::raw::c_int, - pub perf_cycles: i64, - pub perf_time_us: i64, + pub work_data: *mut u8, + pub n_threads: ::std::os::raw::c_int, + pub n_tasks: [::std::os::raw::c_int; 4096usize], + pub abort_callback: + ::std::option::Option bool>, + pub abort_callback_data: *mut ::std::os::raw::c_void, } #[test] -fn bindgen_test_layout_ggml_cgraph() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); +fn bindgen_test_layout_ggml_cplan() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( - ::std::mem::size_of::(), - 98360usize, - concat!("Size of: ", stringify!(ggml_cgraph)) + ::std::mem::size_of::(), + 16424usize, + concat!("Size of: ", stringify!(ggml_cplan)) ); assert_eq!( - ::std::mem::align_of::(), + ::std::mem::align_of::(), 8usize, - concat!("Alignment of ", stringify!(ggml_cgraph)) + concat!("Alignment of ", stringify!(ggml_cplan)) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).work_size) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", - stringify!(ggml_cgraph), + stringify!(ggml_cplan), "::", - stringify!(n_nodes) + stringify!(work_size) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize }, - 4usize, + unsafe { ::std::ptr::addr_of!((*ptr).work_data) as usize - ptr as usize }, + 8usize, concat!( "Offset of field: ", - stringify!(ggml_cgraph), + stringify!(ggml_cplan), "::", - stringify!(n_leafs) + stringify!(work_data) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize }, - 8usize, + 16usize, concat!( "Offset of field: ", - stringify!(ggml_cgraph), + stringify!(ggml_cplan), "::", stringify!(n_threads) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).work_size) as usize - ptr as usize }, - 16usize, + unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize }, + 20usize, + concat!( + "Offset of field: ", + stringify!(ggml_cplan), + "::", + stringify!(n_tasks) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).abort_callback) as usize - ptr as usize }, + 16408usize, + concat!( + "Offset of field: ", + stringify!(ggml_cplan), + "::", + stringify!(abort_callback) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).abort_callback_data) as usize - ptr as usize }, + 16416usize, + concat!( + "Offset of field: ", + stringify!(ggml_cplan), + "::", + stringify!(abort_callback_data) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_cgraph { + pub n_nodes: ::std::os::raw::c_int, + pub n_leafs: ::std::os::raw::c_int, + pub nodes: [*mut ggml_tensor; 4096usize], + pub grads: [*mut ggml_tensor; 4096usize], + pub leafs: [*mut ggml_tensor; 4096usize], + pub perf_runs: ::std::os::raw::c_int, + pub perf_cycles: i64, + pub perf_time_us: i64, +} +#[test] +fn bindgen_test_layout_ggml_cgraph() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 98336usize, + concat!("Size of: ", stringify!(ggml_cgraph)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_cgraph)) + ); + assert_eq!( + 
unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize }, + 0usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), "::", - stringify!(work_size) + stringify!(n_nodes) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).work) as usize - ptr as usize }, - 24usize, + unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize }, + 4usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), "::", - stringify!(work) + stringify!(n_leafs) ) ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).nodes) as usize - ptr as usize }, - 32usize, + 8usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -534,7 +556,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).grads) as usize - ptr as usize }, - 32800usize, + 32776usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -544,7 +566,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).leafs) as usize - ptr as usize }, - 65568usize, + 65544usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -554,7 +576,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, - 98336usize, + 98312usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -564,7 +586,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize }, - 98344usize, + 98320usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -574,7 +596,7 @@ fn bindgen_test_layout_ggml_cgraph() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize }, - 98352usize, + 98328usize, concat!( "Offset of field: ", stringify!(ggml_cgraph), @@ -1715,11 +1737,27 @@ extern "C" { ) -> ggml_cgraph; } extern "C" { - pub fn ggml_graph_compute(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph); + pub fn ggml_graph_plan( + cgraph: *mut ggml_cgraph, + n_threads: ::std::os::raw::c_int, + ) -> ggml_cplan; +} +extern "C" { + pub fn ggml_graph_compute( + cgraph: *mut ggml_cgraph, + cplan: *mut ggml_cplan, + ) -> ::std::os::raw::c_int; } extern "C" { pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph); } +extern "C" { + pub fn ggml_graph_compute_with_ctx( + ctx: *mut ggml_context, + cgraph: *mut ggml_cgraph, + n_threads: ::std::os::raw::c_int, + ); +} extern "C" { pub fn ggml_graph_get_tensor( cgraph: *mut ggml_cgraph, diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs index 95b427c0..37d97e68 100644 --- a/crates/ggml/sys/src/metal.rs +++ b/crates/ggml/sys/src/metal.rs @@ -17,11 +17,14 @@ pub struct ggml_metal_context { _unused: [u8; 0], } extern "C" { - pub fn ggml_metal_init() -> *mut ggml_metal_context; + pub fn ggml_metal_init(n_cb: ::std::os::raw::c_int) -> *mut ggml_metal_context; } extern "C" { pub fn ggml_metal_free(ctx: *mut ggml_metal_context); } +extern "C" { + pub fn ggml_metal_set_n_cb(ctx: *mut ggml_metal_context, n_cb: ::std::os::raw::c_int); +} extern "C" { pub fn ggml_metal_add_buffer( ctx: *mut ggml_metal_context, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index ce9551f2..6e558d06 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,4 +1,4 @@ -use ggml::{Buffer, ComputationGraph, Context, Tensor}; +use ggml::{Buffer, ComputationGraph, Context, GraphExecutionPlan, Tensor}; use serde::Serialize; use std::{cell::RefCell, fmt::Display, 
sync::Arc}; use thiserror::Error; @@ -280,7 +280,8 @@ impl InferenceSession { } #[cfg(not(feature = "metal"))] { - ctx0.graph_compute(&mut built_gf); + let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads); + plan.execute(ctx0); } // Adjust the required memory per token if we didn't know that already @@ -302,7 +303,6 @@ impl InferenceSession { pub fn feed_prompt<'a, E: std::error::Error + 'static, P: Into>>( &mut self, model: &dyn Model, - params: &InferenceParameters, prompt: P, output_request: &mut OutputRequest, mut callback: impl FnMut(&[u8]) -> Result, @@ -317,7 +317,7 @@ impl InferenceSession { } for batch in prompt_tokens.chunks(self.config.n_batch) { - model.evaluate(self, params, batch, output_request); + model.evaluate(self, batch, output_request); for &tk in batch { let should_call_callback = Some(tk) != model.bot_token_id(); @@ -397,7 +397,7 @@ impl InferenceSession { self.tokens.push(next_token); // Then, evaluate the network again to compute the new last_logits - model.evaluate(self, params, &[next_token], output_request); + model.evaluate(self, &[next_token], output_request); // Return the next token if next_token as TokenId == model.eot_token_id() { @@ -460,7 +460,6 @@ impl InferenceSession { // context window with new data. self.feed_prompt( model, - parameters, request.prompt, output_request, feed_prompt_callback(&mut callback), @@ -509,7 +508,6 @@ impl InferenceSession { pub fn perplexity<'a, P: Into>>( &mut self, model: &dyn Model, - parameters: &InferenceParameters, prompt: P, mut perplexity_callback: impl FnMut(usize, f32), ) -> Result<(), TokenizationError> { @@ -554,7 +552,6 @@ impl InferenceSession { model.evaluate( self, - parameters, &tokens[batch_start..batch_start + batch_size], &mut output_request, ); @@ -804,6 +801,21 @@ pub struct InferenceSessionConfig { /// /// A reasonable default value is 8. pub n_batch: usize, + /// The number of threads to use. This is dependent on your user's system, + /// and should be selected accordingly. + /// + /// Note that you should aim for a value close to the number of physical cores + /// on the system, as this will give the best performance. This means that, for + /// example, on a 16-core system with hyperthreading, you should set this to 16. + /// + /// Also note that not all cores on a system are equal, and that you may need to + /// experiment with this value to find the optimal value for your use case. For example, + /// Apple Silicon and modern Intel processors have "performance" and "efficiency" cores, + /// and you may want to only use the performance cores. + /// + /// A reasonable default value is 8, as most modern high-performance computers have + /// 8 physical cores. Adjust to your needs. + pub n_threads: usize, } impl Default for InferenceSessionConfig { @@ -813,6 +825,7 @@ impl Default for InferenceSessionConfig { memory_v_type: ModelKVMemoryType::Float16, use_gpu: false, n_batch: 8, + n_threads: 8, } } } diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index b08e62cd..e7607888 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -49,21 +49,6 @@ pub use util::TokenUtf8Buffer; /// This needs to be provided during all inference calls, /// but can be changed between calls. pub struct InferenceParameters { - /// The number of threads to use. This is dependent on your user's system, - /// and should be selected accordingly. 
- /// - /// Note that you should aim for a value close to the number of physical cores - /// on the system, as this will give the best performance. This means that, for - /// example, on a 16-core system with hyperthreading, you should set this to 16. - /// - /// Also note that not all cores on a system are equal, and that you may need to - /// experiment with this value to find the optimal value for your use case. For example, - /// Apple Silicon and modern Intel processors have "performance" and "efficiency" cores, - /// and you may want to only use the performance cores. - /// - /// A reasonable default value is 8, as most modern high-performance computers have - /// 8 physical cores. Adjust to your needs. - pub n_threads: usize, /// The sampler to use for sampling tokens from the model's probabilities. /// /// Each time the model runs, it generates a distribution of probabilities; each token @@ -83,7 +68,6 @@ unsafe impl Sync for InferenceParameters {} impl Default for InferenceParameters { fn default() -> Self { Self { - n_threads: 8, sampler: Arc::new(samplers::TopPTopK::default()), } } diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index 6c378b25..8cdc2c88 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -3,7 +3,7 @@ use crate::{ LoadError, }; -use ggml::format::TensorLoadInfo; +use ggml::{format::TensorLoadInfo, GraphExecutionPlan}; use std::{ collections::{HashMap, HashSet}, fs::File, @@ -112,9 +112,9 @@ impl LoraAdapter { let a = patch_file.get_tensor(&a_info)?; let b = patch_file.get_tensor(&b_info)?; - // Build a ggml context and apply the patch - // TODO: maybe pass the model's thread count to this context - let mut gf = ggml::ComputationGraph::new(8); + //Build a ggml context and apply the patch + + let mut gf = ggml::ComputationGraph::new(); // LoRA formula: w = w + ba*s let mut ba = patch_context.op_mul_mat(&a, &b); @@ -126,7 +126,10 @@ impl LoraAdapter { // Compute the graph gf.build_forward_expand(&output); - patch_context.graph_compute(&mut gf); + + //TODO: maybe pass the model's thread count to this context + let mut plan = GraphExecutionPlan::new(&mut gf, 8); + plan.execute(&patch_context); // Overwrite the original tensor. // The `output` and the `target_tensor` are not from the same context, diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index fd5d01bc..c8799cdc 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -11,8 +11,8 @@ use regex::Regex; use thiserror::Error; use crate::{ - loader::TensorLoader, tokenizer::TokenId, FileType, InferenceParameters, InferenceSession, - InferenceSessionConfig, LoadError, LoadProgress, Tokenizer, TokenizerSource, + loader::TensorLoader, tokenizer::TokenId, FileType, InferenceSession, InferenceSessionConfig, + LoadError, LoadProgress, Tokenizer, TokenizerSource, }; /// Common functions for model evaluation @@ -54,13 +54,12 @@ pub trait KnownModel: Send + Sync { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession; /// This function is called by the provided [InferenceSession]; it will use this model - /// and the [InferenceParameters] to generate output by evaluating the `input_tokens`. + /// to generate output by evaluating the `input_tokens`. /// The [OutputRequest] is used to specify additional data to fetch from the /// model. 
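The removal of Context::graph_compute follows from upstream ggml splitting graph construction from execution: a ggml_cplan now carries the thread count and a work buffer, wrapped here as GraphExecutionPlan. A minimal sketch of the new calling convention, as used in lora.rs above; ctx and result are placeholder names for an existing ggml Context and the tensor whose value should be computed.

let mut graph = ggml::ComputationGraph::new();
graph.build_forward_expand(&result);

// The plan captures the thread count; execute() allocates an i8 work buffer
// in the context and then runs ggml_graph_compute with it.
let mut plan = ggml::GraphExecutionPlan::new(&mut graph, 8);
plan.execute(&ctx);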
fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ); @@ -101,13 +100,12 @@ pub trait Model: Send + Sync { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession; /// This function is called by the provided [InferenceSession]; it will use this model - /// and the [InferenceParameters] to generate output by evaluating the `input_tokens`. + /// to generate output by evaluating the `input_tokens`. /// The [OutputRequest] is used to specify additional data to fetch from the /// model. fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ); @@ -136,11 +134,10 @@ impl> Model for M { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - KnownModel::evaluate(self, session, params, input_tokens, output_request) + KnownModel::evaluate(self, session, input_tokens, output_request) } fn tokenizer(&self) -> &Tokenizer { diff --git a/crates/llm/examples/drop_multiple_sessions.rs b/crates/llm/examples/drop_multiple_sessions.rs index 44e18676..219e2a6d 100644 --- a/crates/llm/examples/drop_multiple_sessions.rs +++ b/crates/llm/examples/drop_multiple_sessions.rs @@ -28,13 +28,9 @@ fn main() { println!("Starting session {i}"); let mut session = model.start_session(Default::default()); session - .feed_prompt( - model.as_ref(), - &InferenceParameters::default(), - prompt, - &mut Default::default(), - |_| Ok::(llm::InferenceFeedback::Continue), - ) + .feed_prompt(model.as_ref(), prompt, &mut Default::default(), |_| { + Ok::(llm::InferenceFeedback::Continue) + }) .unwrap(); drop(session); println!("Dropped session {i}"); diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index 0a6a999a..427c9e48 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -125,12 +125,7 @@ fn get_embeddings( .iter() .map(|(_, tok)| *tok) .collect::>(); - model.evaluate( - &mut session, - inference_parameters, - &query_token_ids, - &mut output_request, - ); + model.evaluate(&mut session, &query_token_ids, &mut output_request); output_request.embeddings.unwrap() } diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index 7cdeb1d1..6b96d756 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -57,7 +57,6 @@ fn main() { session .feed_prompt( model.as_ref(), - &inference_parameters, format!("{persona}\n{history}").as_str(), &mut Default::default(), llm::feed_prompt_callback(|resp| match resp { diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index ce781fc8..3f29c1ed 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -7,8 +7,8 @@ use std::sync::Arc; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The BLOOM model. 
Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) @@ -121,13 +121,11 @@ impl KnownModel for Bloom { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -153,7 +151,7 @@ impl KnownModel for Bloom { input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_bias, &input_layer), &input_layer); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 9f44b055..15a77527 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -13,8 +13,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) @@ -111,13 +111,11 @@ impl KnownModel for Falcon { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -150,7 +148,7 @@ impl KnownModel for Falcon { let memory_v = builder.memory_v; let memory_v_size = memory_v.element_size(); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); let mut current: Tensor; let mut layernorm_output: Tensor; diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index 0387df1a..e19264b1 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -7,8 +7,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The GPT-2 model. 
Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) @@ -117,13 +117,11 @@ impl KnownModel for Gpt2 { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -152,7 +150,7 @@ impl KnownModel for Gpt2 { &ctx0.op_get_rows(&self.wpe, &position), ); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { ctx0.use_scratch(builder.get_scratch(0)); diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 5c7a0897..85864fce 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -7,8 +7,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-J model. Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) @@ -112,13 +112,11 @@ impl KnownModel for GptJ { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -140,7 +138,7 @@ impl KnownModel for GptJ { let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { // norm let mut current = ctx0.op_norm(&input_layer); diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 80d0b7b8..12fb42a0 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -8,8 +8,8 @@ use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-NeoX model. 
Ref: [GitHub](https://github.com/EleutherAI/gpt-neox) @@ -127,13 +127,11 @@ impl KnownModel for GptNeoX { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let n = input_tokens.len(); let n_past = session.n_past; - let n_threads = params.n_threads; let n_ctx = self.context_size; let Hyperparameters { @@ -155,7 +153,7 @@ impl KnownModel for GptNeoX { builder.memory_v.element_size(), ); - let mut gf = ggml::ComputationGraph::new(n_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { // attention uses first scratch buffer diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 91822ec3..f612d84f 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -6,8 +6,8 @@ use std::{collections::HashMap, error::Error, sync::Arc}; use llm_base::{ ggml::{self, Backend}, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The LLaMA model. Ref: [Introducing LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) @@ -128,13 +128,11 @@ impl KnownModel for Llama { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let input_len = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -153,15 +151,8 @@ impl KnownModel for Llama { let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - let mut gf = ggml::ComputationGraph::new( - if input_len >= 32 && ggml::cpu_has_blas() && !ggml::cpu_has_gpublas() { - 1 - } else { - num_threads - }, - ); + let mut gf = ggml::ComputationGraph::new(); + for il in 0..n_layer { //TODO: find a better way to do this if self.model_params.should_offload(il) { diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 69484bcd..fa66eac2 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -7,8 +7,8 @@ use ggml::Tensor; use llm_base::{ ggml::{self}, model::{common, HyperparametersWriteError}, - util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, + util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, + ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The MosaicML Pretrained Transformer (MPT) model. 
Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) @@ -98,13 +98,11 @@ impl KnownModel for Mpt { fn evaluate( &self, session: &mut InferenceSession, - params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { let n = input_tokens.len(); let session_len = session.n_past; - let num_threads = params.n_threads; let ctx_size = self.context_size; let Hyperparameters { @@ -128,7 +126,7 @@ impl KnownModel for Mpt { let f32_size = std::mem::size_of::(); - let mut gf = ggml::ComputationGraph::new(num_threads); + let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { // attention uses first scratch buffer ctx0.use_scratch(builder.get_scratch(0)); From 351a0f53f085653225c4c8721f431aec4fcbcc08 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Wed, 12 Jul 2023 14:17:31 +0200 Subject: [PATCH 18/28] potentially fixed metal --- crates/ggml/src/metal.rs | 10 ++-------- crates/llm-base/src/inference_session.rs | 8 +++++--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/ggml/src/metal.rs b/crates/ggml/src/metal.rs index a86e9273..cbfdbcc6 100644 --- a/crates/ggml/src/metal.rs +++ b/crates/ggml/src/metal.rs @@ -14,8 +14,8 @@ pub struct MetalContext { impl MetalContext { /// Create a new Metal context - pub fn new() -> Self { - let raw = unsafe { metal::ggml_metal_init() }; + pub fn new(n_threads: usize) -> Self { + let raw = unsafe { metal::ggml_metal_init(n_threads.try_into().unwrap()) }; MetalContext { contexts: vec![], @@ -76,12 +76,6 @@ impl MetalContext { } } -impl Default for MetalContext { - fn default() -> Self { - Self::new() - } -} - impl MetalContext { /// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before. 
     fn ref_context(&mut self, context: Arc) -> bool {
diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 6e558d06..2998f0e3 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -192,7 +192,7 @@ impl InferenceSession {
         #[cfg(feature = "metal")]
         let metal_context = {
             if config.use_gpu {
-                let mut metal_context = MetalContext::new();
+                let mut metal_context = MetalContext::new(config.n_threads);
                 metal_context.add_scratch_buffer(ctx0.buffer.as_ref().unwrap());
 
                 for buf in scratch.iter() {
@@ -272,10 +272,12 @@ impl InferenceSession {
                     metal_context.graph_compute(&mut built_gf);
                     metal_context.get_tensor(&built_result.result);
                 } else {
-                    ctx0.graph_compute(&mut built_gf);
+                    let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
+                    plan.execute(ctx0);
                 }
             } else {
-                ctx0.graph_compute(&mut built_gf);
+                let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
+                plan.execute(ctx0);
             }
         }
         #[cfg(not(feature = "metal"))]

From a8986b89b7a34126d2d4201a9f1931b09c8314dc Mon Sep 17 00:00:00 2001
From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com>
Date: Thu, 13 Jul 2023 18:06:03 +0200
Subject: [PATCH 19/28] Sync latest `llama.cpp`

---
 crates/ggml/sys/llama-cpp  |  2 +-
 crates/ggml/sys/src/lib.rs | 55 +++++++++++++++++++++++++++---------
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index 2b5eb72e..32c54116 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit 2b5eb72e109577ed84e44bb8fa47e4956f337300
+Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36
diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs
index a9779fd1..f3d9192d 100644
--- a/crates/ggml/sys/src/lib.rs
+++ b/crates/ggml/sys/src/lib.rs
@@ -138,19 +138,21 @@ pub const ggml_op_GGML_OP_ALIBI: ggml_op = 50;
 pub const ggml_op_GGML_OP_CLAMP: ggml_op = 51;
 pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 52;
 pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 53;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 54;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 55;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 56;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 57;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 58;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 59;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 60;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 61;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 62;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 63;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 64;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 65;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 66;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 54;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 55;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 56;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 57;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 58;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 59;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 60;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 61;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 62;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
+pub const
ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 68; pub type ggml_op = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -1554,6 +1556,33 @@ extern "C" { d: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } +pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0; +pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1; +pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2; +pub type ggml_op_pool = ::std::os::raw::c_int; +extern "C" { + pub fn ggml_pool_1d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + op: ggml_op_pool, + k0: ::std::os::raw::c_int, + s0: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_pool_2d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + op: ggml_op_pool, + k0: ::std::os::raw::c_int, + k1: ::std::os::raw::c_int, + s0: ::std::os::raw::c_int, + s1: ::std::os::raw::c_int, + p0: ::std::os::raw::c_int, + p1: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_flash_attn( ctx: *mut ggml_context, From cea02f2a3feb84887003cdedd50f558d843ea041 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 15 Jul 2023 16:10:02 +0200 Subject: [PATCH 20/28] Review fixes --- crates/ggml/src/context.rs | 29 +++-- crates/ggml/src/lib.rs | 21 ++-- crates/ggml/src/tensor.rs | 64 +++++++++-- crates/llm-base/src/inference_session.rs | 41 +++---- crates/llm-base/src/loader.rs | 22 +--- crates/llm-base/src/model/mod.rs | 11 +- crates/llm/examples/drop_multiple_sessions.rs | 57 ---------- crates/models/llama/src/lib.rs | 105 ++++++++---------- 8 files changed, 174 insertions(+), 176 deletions(-) delete mode 100644 crates/llm/examples/drop_multiple_sessions.rs diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index cb7aa21b..4a37f5d6 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -1,4 +1,9 @@ -use std::{os::raw::c_int, ptr::NonNull, sync::Arc}; +use std::{ + collections::HashMap, + os::raw::c_int, + ptr::NonNull, + sync::{Arc, Mutex}, +}; use memmap2::Mmap; @@ -21,6 +26,9 @@ pub struct Context { /// Whether the context can offload tensors to the GPU pub can_offload: bool, + + /// Offloaded tensors + offloaded_tensors: Arc>>, } impl Context { @@ -39,6 +47,7 @@ impl Context { mmap: None, buffer: Some(buffer), can_offload: false, + offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -57,6 +66,7 @@ impl Context { mmap: Some(mmap), buffer: None, can_offload: false, + offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -76,17 +86,13 @@ impl Context { mmap: None, buffer: None, can_offload: false, + offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } /// If offloading is enabled, all tensors created by this context will be offloaded to the GPU - pub fn enable_offloading(&mut self) { - self.can_offload = true; - } - - /// Disables the offloading of tensors to the GPU - pub fn disable_offloading(&mut self) { - self.can_offload = false; + pub fn set_offloading(&mut self, can_offload: bool) { + self.can_offload = can_offload; } /// Wraps a raw tensor with a weak pointer to the context. 
@@ -94,6 +100,7 @@ impl Context { let tensor = Tensor { ptr: NonNull::new(raw).expect("Should not be null"), ctx: Arc::downgrade(&self.ptr), + offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), }; if self.can_offload { @@ -495,6 +502,12 @@ impl Drop for Context { fn drop(&mut self) { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. unsafe { + // if we moved tensors to an acceleratoor we need to free them + for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { + if tensor.backend() != crate::Backend::Cpu { + crate::accelerator_free_tensor(&tensor); + } + } sys::ggml_free(self.ptr.as_ptr()); } } diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 3013254f..77cccb06 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -30,19 +30,19 @@ mod tests; pub mod metal; #[derive(Debug, Copy, Clone, PartialEq, Eq)] -///Accelerators supported by `ggml`. +/// Accelerators supported by `ggml`. pub enum Accelerator { - ///CuBLAS accelerated + /// CuBLAS accelerated CuBLAS, - ///CLBlast accelerated + /// CLBlast accelerated CLBlast, - ///Metal accelerated + /// Metal accelerated Metal, - ///Cpu accelerated + /// Cpu accelerated None, } -///Returns the accelerator `ggml` was compiled with. +/// Returns the accelerator `ggml` was compiled with. pub fn get_accelerator() -> Accelerator { #[cfg(feature = "cublas")] return Accelerator::CLBlast; @@ -55,14 +55,14 @@ pub fn get_accelerator() -> Accelerator { } #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] -///Backend to use for a tensor. +/// Backend to use for a tensor. pub enum Backend { /// CPU backend #[default] Cpu, /// GPU backend Gpu, - ///Multi-GPU backend + /// Multi-GPU backend GpuSplit, } @@ -184,6 +184,9 @@ pub const QNT_VERSION_FACTOR: u32 = sys::GGML_QNT_VERSION_FACTOR; /// The size of a `ggml` object. pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE; +/// The maximum length of a `ggml` tensor-name. +pub const MAX_NAME_LENGTH: u32 = sys::GGML_MAX_NAME; + #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] /// The type of a value in `ggml`. pub enum Type { @@ -537,7 +540,7 @@ pub fn cpu_has_gpublas() -> bool { } /// Sets the name of a tensor. -pub fn set_name(tensor: &Tensor, name: &str) { +pub fn set_tensor_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index c313eaf7..3ade3c91 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -1,4 +1,9 @@ -use std::{os::raw::c_void, ptr::NonNull, sync::Weak}; +use std::{ + collections::HashMap, + os::raw::c_void, + ptr::NonNull, + sync::{Mutex, Weak}, +}; use crate::{i64_to_usize, sys, Type}; @@ -7,6 +12,7 @@ use crate::{i64_to_usize, sys, Type}; pub struct Tensor { pub(crate) ptr: NonNull, pub(crate) ctx: Weak>, + pub(crate) offloaded_tensors: Weak>>, } impl Tensor { @@ -15,9 +21,18 @@ impl Tensor { /// Exposed for purposes of determining context size. pub const C_TYPE_SIZE: usize = std::mem::size_of::(); - ///Sets the name of the tensor - pub fn set_name(&mut self, name: &str) -> &Tensor { - assert!(name.len() <= 48, "Name is too long!"); + /// Sets the name of the tensor. + /// + /// # Safety + /// + /// The name must be a valid UTF-8 string and must not be longer than [crate::MAX_NAME_LENGTH] characters. 
+ pub fn set_name(mut self, name: &str) -> Tensor { + assert!( + name.len() <= crate::MAX_NAME_LENGTH.try_into().unwrap(), + "Name '{}' is too long, max length is {} characters", + name, + crate::MAX_NAME_LENGTH + ); let bytes = name.as_bytes(); let mut array = [0i8; 48]; @@ -27,29 +42,60 @@ impl Tensor { self } - ///Gets the name of the tensor - pub fn get_name(&self) -> String { + /// Gets the name of the tensor + pub fn name(&self) -> String { let name = unsafe { self.ptr.as_ref().name }; let mut name = name.iter().map(|&x| x as u8).collect::>(); name.retain(|&x| x != 0); String::from_utf8(name).unwrap() } - ///Sets the acceleration backend of the tensor + /// Sets the acceleration backend of the tensor. + /// + /// # Caution + /// + /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend. pub fn set_backend(&mut self, backend: crate::Backend) { unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) } } - ///Gets the acceleration backend of the tensor - pub fn get_backend(&self) -> crate::Backend { + /// Gets the acceleration backend of the tensor + pub fn backend(&self) -> crate::Backend { unsafe { crate::get_tensor_backend(self.ptr.as_ref()) } } + /// Sets the tensors acceleration backend and moves the tensors data to the new backend. + pub fn transfer_to(mut self, backend: crate::Backend) -> Result { + let current_backend = self.backend(); + self.set_backend(backend); + + if backend != crate::Backend::Cpu { + crate::accelerator_transform_tensor(&mut self); + if current_backend == crate::Backend::Cpu { + // tensor was moved from cpu to accelerator => We need to keep track of the data to free it later from the accelerator + self.with_alive_ctx_mut(|| { + if let Some(offloaded_tensors) = self.offloaded_tensors.upgrade() { + //TODO: Do we need to check if the tensor is already in the map? + offloaded_tensors + .lock() + .unwrap() + .insert(self.name(), self.share()); + } else { + panic!("Using a context after it was dropped!") + } + }) + } + } + + Ok(self) + } + /// Creates a shared copy of this tensor pointer. 
pub fn share(&self) -> Self { Tensor { ptr: self.ptr, ctx: Weak::clone(&self.ctx), + offloaded_tensors: Weak::clone(&self.offloaded_tensors), } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 2998f0e3..7a76a9aa 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -26,24 +26,6 @@ fn scratch_buffers() -> ScratchBuffers { ] } -fn kv_memory( - context: &Context, - config: &InferenceSessionConfig, - n_elements: usize, -) -> (Tensor, Tensor) { - let memory_k = context.new_tensor_1d(config.memory_k_type.into(), n_elements); - let memory_v = context.new_tensor_1d(config.memory_v_type.into(), n_elements); - ggml::set_name(&memory_k, "memory_k"); - ggml::set_name(&memory_v, "memory_v"); - - if config.use_gpu { - ggml::accelerator_offload_tensor_no_scratch(&memory_k); - ggml::accelerator_offload_tensor_no_scratch(&memory_v); - } - - (memory_k, memory_v) -} - /// Result of graph building pub struct GraphOutputs { /// The output containing the model's result @@ -238,7 +220,7 @@ impl InferenceSession { self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); let ctx0 = &mut self.ctx0; let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); - ggml::set_name(&embd, "embd"); + ggml::set_tensor_name(&embd, "embd"); let bc = BuildContext { ctx0: RefCell::new(ctx0), @@ -946,3 +928,24 @@ pub fn feed_prompt_callback<'a, E: std::error::Error + 'static>( None => Ok(InferenceFeedback::Continue), } } + +/// Create the memory K/V tensors for the inference-session. +fn kv_memory( + context: &Context, + config: &InferenceSessionConfig, + n_elements: usize, +) -> (Tensor, Tensor) { + let memory_k = context + .new_tensor_1d(config.memory_k_type.into(), n_elements) + .set_name("memory_k"); + let memory_v = context + .new_tensor_1d(config.memory_v_type.into(), n_elements) + .set_name("memory_v"); + + if config.use_gpu { + ggml::accelerator_offload_tensor_no_scratch(&memory_k); + ggml::accelerator_offload_tensor_no_scratch(&memory_v); + } + + (memory_k, memory_v) +} diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 16d16e12..bfd5b26c 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -396,8 +396,6 @@ impl LoadError { pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; - /// Gets a tensor from the loader and tries to offload it to the specified backend. - fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result; /// Finish loading the model, and extract all of the state from the loader. fn finish(self) -> (Context, HashMap); } @@ -672,15 +670,6 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } - fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result { - let mut tensor = self.load(name)?; - if backend != ggml::Backend::Cpu { - tensor.set_backend(backend); - crate::ggml::accelerator_transform_tensor(&mut tensor); - } - Ok(tensor) - } - fn finish(self) -> (Context, HashMap) { (self.context, self.loaded_tensors) } @@ -752,16 +741,15 @@ impl<'a> FileContext<'a> { } } - // The tensor name is truncated to 32 characters. - - let tensor_name = if name.len() > 32 { - &name[name.len() - 32..] + // The tensor name is truncated to it's maximum length. + let max_name_length: usize = ggml::MAX_NAME_LENGTH.try_into().unwrap(); + let tensor_name = if name.len() >= max_name_length { + &name[name.len() - max_name_length..] 
} else { name }; - tensor.set_name(tensor_name); - Ok(tensor) + Ok(tensor.set_name(tensor_name)) } } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index c8799cdc..75a070f1 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -204,7 +204,7 @@ pub struct ModelParameters { pub lora_adapters: Option>, /// Whether to use GPU acceleration when available pub use_gpu: bool, - /// The number of layers to offload to the gpu. If `None`, all layers will be offloaded. + /// If `use_gpu` is active this defines the number of layers to offload to the gpu. If `None`, all layers will be offloaded. pub gpu_layers: Option, } @@ -231,6 +231,15 @@ impl ModelParameters { true } } + + /// Returns the backend to use for the given layer. + pub fn backend(&self, layer: usize) -> ggml::Backend { + if self.should_offload(layer) { + ggml::Backend::Gpu + } else { + ggml::Backend::Cpu + } + } } /// Used in a call to [Model::evaluate] or [InferenceSession::infer] to request diff --git a/crates/llm/examples/drop_multiple_sessions.rs b/crates/llm/examples/drop_multiple_sessions.rs deleted file mode 100644 index 219e2a6d..00000000 --- a/crates/llm/examples/drop_multiple_sessions.rs +++ /dev/null @@ -1,57 +0,0 @@ -use llm::ModelArchitecture; -use llm_base::{InferenceFeedback, InferenceParameters, ModelParameters}; -use std::{convert::Infallible, path::PathBuf}; - -fn main() { - let prompt = "What is the meaning of life?"; - let model_path = PathBuf::from(r"C:\Users\lkreu\Downloads\orca-mini-v2_7b.ggmlv3.q5_K_M.bin"); - let now = std::time::Instant::now(); - - let model = llm::load_dynamic( - Some(ModelArchitecture::Llama), - &model_path, - llm_base::TokenizerSource::Embedded, - ModelParameters { - use_gpu: true, - ..Default::default() - }, - llm::load_progress_callback_stdout, - ) - .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); - - println!( - "Model fully loaded! Elapsed: {}ms", - now.elapsed().as_millis() - ); - - for i in 0..10 { - println!("Starting session {i}"); - let mut session = model.start_session(Default::default()); - session - .feed_prompt(model.as_ref(), prompt, &mut Default::default(), |_| { - Ok::(llm::InferenceFeedback::Continue) - }) - .unwrap(); - drop(session); - println!("Dropped session {i}"); - } - - drop(model); - - println!("Model dropped! Elapsed: {}ms", now.elapsed().as_millis()); - - for _ in 0..5 { - let model = llm::load_dynamic( - Some(ModelArchitecture::Llama), - &model_path, - llm_base::TokenizerSource::Embedded, - ModelParameters { - use_gpu: true, - ..Default::default() - }, - llm::load_progress_callback_stdout, - ) - .unwrap_or_else(|err| panic!("Failed to load llama model from {model_path:?}: {err}")); - drop(model); - } -} diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index f612d84f..4011bea3 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -1,10 +1,10 @@ //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. 
#![deny(missing_docs)] -use std::{collections::HashMap, error::Error, sync::Arc}; +use std::{error::Error, sync::Arc}; use llm_base::{ - ggml::{self, Backend}, + ggml::{self}, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, @@ -34,22 +34,11 @@ pub struct Llama { // must be kept alive for the model context: Arc, - loaded_tensors: HashMap, } unsafe impl Send for Llama {} unsafe impl Sync for Llama {} -impl Drop for Llama { - fn drop(&mut self) { - for (_, tensor) in self.loaded_tensors.drain() { - if tensor.get_backend() != Backend::Cpu { - ggml::accelerator_free_tensor(&tensor); - } - } - } -} - impl KnownModel for Llama { type Hyperparameters = Hyperparameters; @@ -64,39 +53,49 @@ impl KnownModel for Llama { // model-global weights let wte = tl.load("tok_embeddings.weight")?; - let backend = if params.should_offload(0) { - Backend::Gpu - } else { - Backend::Cpu - }; + let backend = params.backend(0); - let norm = tl.offload("norm.weight", backend)?; + let norm = tl.load("norm.weight")?.transfer_to(backend)?; - let output = tl.offload("output.weight", backend)?; + let output = tl.load("output.weight")?.transfer_to(backend)?; let mut layers = Vec::new(); for i in 0..hyperparameters.n_layer { - let backend = if params.should_offload(i) { - Backend::Gpu - } else { - Backend::Cpu - }; + let backend = params.backend(i); + let layer = Layer { attention_norm: tl - .offload(&format!("layers.{i}.attention_norm.weight"), backend)?, - wq: tl.offload(&format!("layers.{i}.attention.wq.weight"), backend)?, - wk: tl.offload(&format!("layers.{i}.attention.wk.weight"), backend)?, - wv: tl.offload(&format!("layers.{i}.attention.wv.weight"), backend)?, - wo: tl.offload(&format!("layers.{i}.attention.wo.weight"), backend)?, - ffn_norm: tl.offload(&format!("layers.{i}.ffn_norm.weight"), backend)?, - w1: tl.offload(&format!("layers.{i}.feed_forward.w1.weight"), backend)?, - w2: tl.offload(&format!("layers.{i}.feed_forward.w2.weight"), backend)?, - w3: tl.offload(&format!("layers.{i}.feed_forward.w3.weight"), backend)?, + .load(&format!("layers.{i}.attention_norm.weight"))? + .transfer_to(backend)?, + wq: tl + .load(&format!("layers.{i}.attention.wq.weight"))? + .transfer_to(backend)?, + wk: tl + .load(&format!("layers.{i}.attention.wk.weight"))? + .transfer_to(backend)?, + wv: tl + .load(&format!("layers.{i}.attention.wv.weight"))? + .transfer_to(backend)?, + wo: tl + .load(&format!("layers.{i}.attention.wo.weight"))? + .transfer_to(backend)?, + ffn_norm: tl + .load(&format!("layers.{i}.ffn_norm.weight"))? + .transfer_to(backend)?, + w1: tl + .load(&format!("layers.{i}.feed_forward.w1.weight"))? + .transfer_to(backend)?, + w2: tl + .load(&format!("layers.{i}.feed_forward.w2.weight"))? + .transfer_to(backend)?, + w3: tl + .load(&format!("layers.{i}.feed_forward.w3.weight"))? + .transfer_to(backend)?, }; layers.push(layer); } - let (context, loaded_tensors) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. 
} = params; @@ -110,7 +109,6 @@ impl KnownModel for Llama { output, layers, context: Arc::new(context), - loaded_tensors, }) } @@ -154,12 +152,7 @@ impl KnownModel for Llama { let mut gf = ggml::ComputationGraph::new(); for il in 0..n_layer { - //TODO: find a better way to do this - if self.model_params.should_offload(il) { - ctx0.enable_offloading(); - } else { - ctx0.disable_offloading(); - } + ctx0.set_offloading(self.model_params.should_offload(il)); let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; @@ -185,7 +178,7 @@ impl KnownModel for Llama { n_rot, 0, ); - ggml::set_name(&q_current, "Qcur"); + ggml::set_tensor_name(&q_current, "Qcur"); let k_current = ctx0.op_rope_inplace( &ctx0.op_reshape_3d( &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), @@ -197,7 +190,7 @@ impl KnownModel for Llama { n_rot, 0, ); - ggml::set_name(&k_current, "Kcur"); + ggml::set_tensor_name(&k_current, "Kcur"); // store key and value to memory // compute the transposed [N, n_embd] V matrix @@ -226,7 +219,7 @@ impl KnownModel for Llama { gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); - ggml::set_name(&q, "Q"); + ggml::set_tensor_name(&q, "Q"); let k = ctx0.op_permute( &ctx0.op_reshape_3d( @@ -241,25 +234,25 @@ impl KnownModel for Llama { ), (0, 2, 1, 3), ); - ggml::set_name(&k, "K"); + ggml::set_tensor_name(&k, "K"); // K * Q let k_q = ctx0.op_mul_mat(&k, &q); - ggml::set_name(&k_q, "KQ"); + ggml::set_tensor_name(&k_q, "KQ"); // KQ_scaled = KQ / sqrt(n_embd/n_head) let kq_scale = ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())); - ggml::set_name(&kq_scale, "1/sqrt(n_embd/n_head)"); + ggml::set_tensor_name(&kq_scale, "1/sqrt(n_embd/n_head)"); let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale); - ggml::set_name(&k_q_scaled, "KQ_scaled"); + ggml::set_tensor_name(&k_q_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) let k_q_masked = ctx0.op_diag_mask_inf_inplace(&k_q_scaled, session_len); - ggml::set_name(&k_q_masked, "KQ_masked"); + ggml::set_tensor_name(&k_q_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) let k_q_soft_max = ctx0.op_soft_max_inplace(&k_q_masked); - ggml::set_name(&k_q_soft_max, "KQ_soft_max"); + ggml::set_tensor_name(&k_q_soft_max, "KQ_soft_max"); // split cached V into n_head heads let v = ctx0.op_view_3d( @@ -271,21 +264,21 @@ impl KnownModel for Llama { ), il * ctx_size * builder.memory_v.element_size() * n_embd, ); - ggml::set_name(&v, "V"); + ggml::set_tensor_name(&v, "V"); let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); - ggml::set_name(&k_q_v, "KQV"); + ggml::set_tensor_name(&k_q_v, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); - ggml::set_name(&k_q_v_merged, "KQV_merged"); + ggml::set_tensor_name(&k_q_v_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &k_q_v_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), ); - ggml::set_name(¤t, "KQV_merged_contiguous"); + ggml::set_tensor_name(¤t, "KQV_merged_contiguous"); // projection (no bias) current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); @@ -328,7 +321,7 @@ impl KnownModel for Llama { let embedding_result: ggml::Tensor = input_layer.share(); - ctx0.disable_offloading(); + ctx0.set_offloading(false); // lm_head input_layer = ctx0.op_mul_mat(&self.output, &input_layer); From e09b937bccfeaf77986469ec06334f8242346e01 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 15 
Jul 2023 20:36:28 +0200
Subject: [PATCH 21/28] Typos and small review changes

---
 crates/ggml/src/context.rs               |  2 +-
 crates/ggml/src/tensor.rs                | 18 ++++++++++++------
 crates/llm-base/src/inference_session.rs |  3 ++-
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 4a37f5d6..5bbea9a1 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -502,7 +502,7 @@ impl Drop for Context {
     fn drop(&mut self) {
         // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call.
         unsafe {
-            // if we moved tensors to an acceleratoor we need to free them
+            // if we moved tensors to an accelerator we need to free them
             for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() {
                 if tensor.backend() != crate::Backend::Cpu {
                     crate::accelerator_free_tensor(&tensor);
diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs
index 3ade3c91..cb8bb982 100644
--- a/crates/ggml/src/tensor.rs
+++ b/crates/ggml/src/tensor.rs
@@ -7,6 +7,8 @@ use std::{
 
 use crate::{i64_to_usize, sys, Type};
 
+const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize;
+
 /// Tensors are owned by the context. A tensor is alive as long as the
 /// underlying context it was created with is alive.
 pub struct Tensor {
@@ -25,17 +27,17 @@ impl Tensor {
     ///
     /// # Safety
     ///
-    /// The name must be a valid UTF-8 string and must not be longer than [crate::MAX_NAME_LENGTH] characters.
+    /// The name must be a valid UTF-8 string and must not be longer than `MAX_NAME_LENGTH` characters.
     pub fn set_name(mut self, name: &str) -> Tensor {
         assert!(
-            name.len() <= crate::MAX_NAME_LENGTH.try_into().unwrap(),
+            name.len() <= MAX_NAME_LENGTH,
             "Name '{}' is too long, max length is {} characters",
             name,
-            crate::MAX_NAME_LENGTH
+            MAX_NAME_LENGTH
         );
 
         let bytes = name.as_bytes();
-        let mut array = [0i8; 48];
+        let mut array = [0i8; MAX_NAME_LENGTH];
         array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::>());
 
         unsafe { self.ptr.as_mut().name = array }
@@ -55,7 +57,7 @@ impl Tensor {
     /// # Caution
     ///
    /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend.
-    pub fn set_backend(&mut self, backend: crate::Backend) {
+    pub(crate) fn set_backend(&mut self, backend: crate::Backend) {
         unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) }
     }
 
@@ -64,9 +66,13 @@ impl Tensor {
         unsafe { crate::get_tensor_backend(self.ptr.as_ref()) }
     }
 
-    /// Sets the tensors acceleration backend and moves the tensors data to the new backend.
+    /// Sets the tensor's acceleration backend and moves the tensors data to the new backend.
pub fn transfer_to(mut self, backend: crate::Backend) -> Result { let current_backend = self.backend(); + + if current_backend != crate::Backend::Cpu && backend == crate::Backend::Cpu { + panic!("Currently there is no way to move data from an accelerator to the cpu") + } self.set_backend(backend); if backend != crate::Backend::Cpu { diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 7a76a9aa..c8bb2ade 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -136,7 +136,6 @@ impl InferenceSession { ctx_size }; - //TODO: check if this is needed and the right place to put it if config.use_gpu { ggml::accelerator_initialize(0); ggml::accelerator_set_scratch_size(config.n_batch * 1024 * 1024); @@ -943,6 +942,8 @@ fn kv_memory( .set_name("memory_v"); if config.use_gpu { + // CUDA requires the K/V-Memory to be on the GPU but excluded from the scratch buffer. + // For OpenCL this is a no-op. ggml::accelerator_offload_tensor_no_scratch(&memory_k); ggml::accelerator_offload_tensor_no_scratch(&memory_v); } From 67bbaf9d578da2994b83704bf2e1d75719574eb9 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sat, 15 Jul 2023 22:13:39 +0200 Subject: [PATCH 22/28] refactor: remove unused tensorloader tensors --- crates/llm-base/src/loader.rs | 8 ++++---- crates/models/bloom/src/lib.rs | 2 +- crates/models/falcon/src/lib.rs | 2 +- crates/models/gpt2/src/lib.rs | 2 +- crates/models/gptj/src/lib.rs | 2 +- crates/models/gptneox/src/lib.rs | 2 +- crates/models/llama/src/lib.rs | 2 +- crates/models/mpt/src/lib.rs | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index bfd5b26c..8c5d2653 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -396,8 +396,8 @@ impl LoadError { pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; - /// Finish loading the model, and extract all of the state from the loader. - fn finish(self) -> (Context, HashMap); + /// Finish loading the model, returning the context. + fn finish(self) -> Context; } /// Load a GGML model from the `path` and configure it per the `params`. The status @@ -670,8 +670,8 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } - fn finish(self) -> (Context, HashMap) { - (self.context, self.loaded_tensors) + fn finish(self) -> Context { + self.context } } diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index 3f29c1ed..78c9cec6 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -89,7 +89,7 @@ impl KnownModel for Bloom { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 15a77527..02fcd933 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -81,7 +81,7 @@ impl KnownModel for Falcon { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. 
} = params; diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index e19264b1..534f46e7 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -86,7 +86,7 @@ impl KnownModel for Gpt2 { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 85864fce..487123e0 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -81,7 +81,7 @@ impl KnownModel for GptJ { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 12fb42a0..83187273 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -95,7 +95,7 @@ impl KnownModel for GptNeoX { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 4011bea3..b99de13f 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -95,7 +95,7 @@ impl KnownModel for Llama { }; layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index fa66eac2..107c47aa 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -70,7 +70,7 @@ impl KnownModel for Mpt { layers.push(layer); } - let (context, _) = tl.finish(); + let context = tl.finish(); let ModelParameters { context_size, .. } = params; From e7ac55b09a32272f37b42f1aaa0b43ff341d99aa Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 01:25:49 +0200 Subject: [PATCH 23/28] refactor(ggml): accelerator/tensor/context --- crates/ggml/src/{ => accelerator}/metal.rs | 0 crates/ggml/src/accelerator/mod.rs | 94 +++++++++++ crates/ggml/src/context.rs | 101 +++++++----- crates/ggml/src/lib.rs | 157 +----------------- crates/ggml/src/tensor.rs | 180 ++++++++++++++------- crates/llm-base/src/inference_session.rs | 22 ++- crates/llm-base/src/model/mod.rs | 17 +- crates/models/llama/src/lib.rs | 23 ++- 8 files changed, 308 insertions(+), 286 deletions(-) rename crates/ggml/src/{ => accelerator}/metal.rs (100%) create mode 100644 crates/ggml/src/accelerator/mod.rs diff --git a/crates/ggml/src/metal.rs b/crates/ggml/src/accelerator/metal.rs similarity index 100% rename from crates/ggml/src/metal.rs rename to crates/ggml/src/accelerator/metal.rs diff --git a/crates/ggml/src/accelerator/mod.rs b/crates/ggml/src/accelerator/mod.rs new file mode 100644 index 00000000..a4132583 --- /dev/null +++ b/crates/ggml/src/accelerator/mod.rs @@ -0,0 +1,94 @@ +//! Functionality related to hardware acceleration of GGML (GPU, etc.) +use crate::sys; + +#[cfg(feature = "metal")] +pub mod metal; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +/// Accelerators supported by `ggml`. +pub enum Accelerator { + /// CuBLAS accelerated + CuBLAS, + /// CLBlast accelerated + CLBlast, + /// Metal accelerated + Metal, + /// Cpu accelerated + None, +} + +/// Returns the accelerator `ggml` was compiled with. 
+pub fn get_accelerator() -> Accelerator { + #[cfg(feature = "cublas")] + return Accelerator::CLBlast; + #[cfg(feature = "clblast")] + return Accelerator::CuBLAS; + #[cfg(feature = "metal")] + return Accelerator::Metal; + #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))] + return Accelerator::None; +} + +#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] +/// Backend to use for a tensor. +pub enum Backend { + /// CPU backend + #[default] + Cpu, + /// GPU backend + Gpu, + /// Multi-GPU backend + GpuSplit, +} + +impl From for sys::ggml_backend { + fn from(b: Backend) -> Self { + match b { + Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU, + Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU, + Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT, + } + } +} + +impl TryFrom for Backend { + type Error = (); + fn try_from(b: sys::ggml_backend) -> Result { + match b { + sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu), + sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu), + sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit), + _ => Err(()), + } + } +} + +/// Initialize the accelerator. If ggml-sys is compiled with CUDA or CLBlast support, this function will initialize the accelerator. If not this is a no-op. +#[allow(unused_variables)] +pub fn initialize(device: i32) { + #[cfg(feature = "cublas")] + unsafe { + //TODO: Make this configurable + sys::cuda::ggml_init_cublas(); + sys::cuda::ggml_cuda_set_main_device(device); + let split = 1.0f32; + sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); + } +} + +/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op. +#[allow(unused_variables)] +pub fn set_scratch_size(size: usize) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_set_scratch_size(size); + } +} + +/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op. +pub fn free_scratch() { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_scratch(); + } +} diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 5bbea9a1..616edb32 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -7,7 +7,7 @@ use std::{ use memmap2::Mmap; -use crate::{sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type}; +use crate::{accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type}; /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via `ggml_free`. @@ -27,7 +27,18 @@ pub struct Context { /// Whether the context can offload tensors to the GPU pub can_offload: bool, - /// Offloaded tensors + /// Offloaded tensors. Used to free them when the context is dropped. + // TODO: revisit this. What it means for a tensor to be "offloaded", + // "transferred", etc. is not clear. This map is necessary because + // there is no obvious heuristic for whether a given `Tensor` + // should have the accelerator free method called on it. + // + // This is because tensors can be present on the accelerator without + // having data (i.e. compute nodes), or they can refer to the scratch buffers. + // Freeing these offloaded-but-not-allocated tensors will lead to crashes. + // + // Hopefully, this is resolved by GGML redesigning both its accelerator + // interface and its scratch buffer solution. 
offloaded_tensors: Arc>>, } @@ -95,18 +106,31 @@ impl Context { self.can_offload = can_offload; } - /// Wraps a raw tensor with a weak pointer to the context. - fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { - let tensor = Tensor { - ptr: NonNull::new(raw).expect("Should not be null"), - ctx: Arc::downgrade(&self.ptr), - offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), - }; + /// Retrieves the memory used by this [Context]. + pub fn used_mem(&self) -> usize { + unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } + } - if self.can_offload { - crate::accelerator_offload_tensor(&tensor); + /// Sets the scratch buffer to be used by this [Context]. + /// + /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. + pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) { + let (size, data) = if let Some(buffer) = scratch_buffer { + (buffer.size(), buffer.data) + } else { + (0, std::ptr::null_mut()) + }; + // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API + unsafe { + sys::ggml_set_scratch( + self.ptr.as_ptr(), + sys::ggml_scratch { + offs: 0, + size, + data, + }, + ); } - tensor } /// Creates a new 1D tensor. @@ -148,7 +172,9 @@ impl Context { let raw = unsafe { sys::ggml_new_f32(self.ptr.as_ptr(), x) }; self.new_tensor_raw(raw) } - +} +// Operations +impl Context { /// Unknown, aside from the obvious. It's transposing something! pub fn op_transpose(&self, a: &Tensor) -> Tensor { let tensor = unsafe { sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; @@ -449,33 +475,6 @@ impl Context { self.new_tensor_raw(tensor) } - /// Retrieves the memory used by this [Context]. - pub fn used_mem(&self) -> usize { - unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } - } - - /// Sets the scratch buffer to be used by this [Context]. - /// - /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. - pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) { - let (size, data) = if let Some(buffer) = scratch_buffer { - (buffer.size(), buffer.data) - } else { - (0, std::ptr::null_mut()) - }; - // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API - unsafe { - sys::ggml_set_scratch( - self.ptr.as_ptr(), - sys::ggml_scratch { - offs: 0, - size, - data, - }, - ); - } - } - /// Attention with LInear BIases (Ref: ) pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize, bias_max: f32) -> Tensor { let tensor = unsafe { @@ -497,15 +496,31 @@ impl Context { self.new_tensor_raw(tensor) } } +// Private methods +impl Context { + /// Wraps a raw tensor with a weak pointer to the context. + fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { + let tensor = Tensor { + ptr: NonNull::new(raw).expect("Should not be null"), + ctx: Arc::downgrade(&self.ptr), + offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), + }; + + if self.can_offload { + tensor.offload(); + } + tensor + } +} impl Drop for Context { fn drop(&mut self) { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. 
unsafe { // if we moved tensors to an accelerator we need to free them - for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { - if tensor.backend() != crate::Backend::Cpu { - crate::accelerator_free_tensor(&tensor); + for (_, mut tensor) in self.offloaded_tensors.lock().unwrap().drain() { + if tensor.backend() != Backend::Cpu { + tensor.free_accelerator(); } } sys::ggml_free(self.ptr.as_ptr()); diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 77cccb06..168a292a 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -18,6 +18,8 @@ mod tensor; pub mod format; pub mod util; +pub mod accelerator; + pub use context::Context; pub use tensor::Tensor; @@ -26,68 +28,6 @@ pub use ggml_sys as sys; #[cfg(test)] mod tests; -#[cfg(feature = "metal")] -pub mod metal; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -/// Accelerators supported by `ggml`. -pub enum Accelerator { - /// CuBLAS accelerated - CuBLAS, - /// CLBlast accelerated - CLBlast, - /// Metal accelerated - Metal, - /// Cpu accelerated - None, -} - -/// Returns the accelerator `ggml` was compiled with. -pub fn get_accelerator() -> Accelerator { - #[cfg(feature = "cublas")] - return Accelerator::CLBlast; - #[cfg(feature = "clblast")] - return Accelerator::CuBLAS; - #[cfg(feature = "metal")] - return Accelerator::Metal; - #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))] - return Accelerator::None; -} - -#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] -/// Backend to use for a tensor. -pub enum Backend { - /// CPU backend - #[default] - Cpu, - /// GPU backend - Gpu, - /// Multi-GPU backend - GpuSplit, -} - -impl From for sys::ggml_backend { - fn from(b: Backend) -> Self { - match b { - Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU, - Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU, - Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT, - } - } -} - -impl TryFrom for Backend { - type Error = (); - fn try_from(b: sys::ggml_backend) -> Result { - match b { - sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu), - sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu), - sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit), - _ => Err(()), - } - } -} - /// The type of a tensor element. pub type ElementType = Type; @@ -544,96 +484,3 @@ pub fn set_tensor_name(tensor: &Tensor, name: &str) { let c_name = std::ffi::CString::new(name).unwrap(); unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; } - -/// Gets the acceleration backend of a tensor. -pub fn get_tensor_backend(tensor: &sys::ggml_tensor) -> Backend { - (tensor.backend as sys::ggml_backend).try_into().unwrap() -} - -/// Sets the acceleration backend of a tensor. -/// # Safety -/// This function assumes that the tensor is valid. -pub unsafe fn set_tensor_backend(tensor: *mut sys::ggml_tensor, backend: Backend) { - unsafe { - (*tensor).backend = backend.try_into().unwrap(); - } -} - -/// If ggml-sys is compiled with CUDA or ClBlast support, this function will tranform and offload the tensor. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_transform_tensor(tensor: &mut Tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); - } - #[cfg(feature = "clblast")] - unsafe { - sys::opencl::ggml_cl_transform_tensor(tensor.data(), tensor.ptr.as_ptr()); - } -} - -/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. 
-pub fn accelerator_offload_tensor(tensor: &Tensor) { - accelerator_offload_raw_tensor(tensor.ptr.as_ptr()); -} - -/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_offload_raw_tensor(tensor: *mut sys::ggml_tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_assign_buffers(tensor); - } -} - -/// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_offload_tensor_no_scratch(tensor: &Tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_assign_buffers_no_scratch(tensor.ptr.as_ptr()); - } -} - -/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_set_scratch_size(size: usize) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_set_scratch_size(size); - } -} - -///Initialize the accelerator. If ggml-sys is compiled with CUDA or ClBlast support, this function will initialize the accelerator. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_initialize(device: i32) { - #[cfg(feature = "cublas")] - unsafe { - //TODO: Make this configurable - sys::cuda::ggml_init_cublas(); - sys::cuda::ggml_cuda_set_main_device(device); - let split = 1.0f32; - sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32); - } -} - -/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op. -pub fn accelerator_free_scratch() { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_free_scratch(); - } -} - -/// Frees the memory of a tensor. If ggml-sys is compiled with CUDA or ClBlast support, this function will free the memory of a tensor. If not this is a no-op. -#[allow(unused_variables)] -pub fn accelerator_free_tensor(tensor: &Tensor) { - #[cfg(feature = "cublas")] - unsafe { - sys::cuda::ggml_cuda_free_data(tensor.ptr.as_ptr()); - } - #[cfg(feature = "clblast")] - unsafe { - sys::opencl::ggml_cl_free_data(tensor.ptr.as_ptr()); - } -} diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index cb8bb982..ee5370b7 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -5,7 +5,7 @@ use std::{ sync::{Mutex, Weak}, }; -use crate::{i64_to_usize, sys, Type}; +use crate::{accelerator::Backend, i64_to_usize, sys, Type}; const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; @@ -40,60 +40,87 @@ impl Tensor { let mut array = [0i8; MAX_NAME_LENGTH]; array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::>()); - unsafe { self.ptr.as_mut().name = array } + self.with_alive_ctx_mut(|t| unsafe { t.ptr.as_mut().name = array }); self } /// Gets the name of the tensor pub fn name(&self) -> String { - let name = unsafe { self.ptr.as_ref().name }; - let mut name = name.iter().map(|&x| x as u8).collect::>(); - name.retain(|&x| x != 0); - String::from_utf8(name).unwrap() - } - - /// Sets the acceleration backend of the tensor. - /// - /// # Caution - /// - /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend. 
- pub(crate) fn set_backend(&mut self, backend: crate::Backend) { - unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) } + self.with_alive_ctx(|| { + let name = unsafe { self.ptr.as_ref().name }; + let mut name = name.iter().map(|&x| x as u8).collect::>(); + name.retain(|&x| x != 0); + String::from_utf8(name).unwrap() + }) } /// Gets the acceleration backend of the tensor - pub fn backend(&self) -> crate::Backend { - unsafe { crate::get_tensor_backend(self.ptr.as_ref()) } + pub fn backend(&self) -> Backend { + self.with_alive_ctx(|| unsafe { + (self.ptr.as_ref().backend as sys::ggml_backend) + .try_into() + .unwrap() + }) } - /// Sets the tensor's acceleration backend and moves the tensors data to the new backend. - pub fn transfer_to(mut self, backend: crate::Backend) -> Result { - let current_backend = self.backend(); + /// Sets the tensor's acceleration backend and moves the tensor's data to the new backend. + pub fn transfer_to(mut self, backend: Backend) -> Tensor { + self.with_alive_ctx_mut(|t| { + let current_backend = t.backend(); - if current_backend != crate::Backend::Cpu && backend == crate::Backend::Cpu { - panic!("Currently there is no way to move data from an accelerator to the cpu") - } - self.set_backend(backend); - - if backend != crate::Backend::Cpu { - crate::accelerator_transform_tensor(&mut self); - if current_backend == crate::Backend::Cpu { - // tensor was moved from cpu to accelerator => We need to keep track of the data to free it later from the accelerator - self.with_alive_ctx_mut(|| { - if let Some(offloaded_tensors) = self.offloaded_tensors.upgrade() { - //TODO: Do we need to check if the tensor is already in the map? - offloaded_tensors - .lock() - .unwrap() - .insert(self.name(), self.share()); - } else { - panic!("Using a context after it was dropped!") - } - }) + if current_backend != Backend::Cpu && backend == Backend::Cpu { + unimplemented!("Tensors cannot be moved from an accelerator to the CPU at present"); } - } + t.set_backend(backend); + if backend == Backend::Cpu { + return; + } + + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_transform_tensor(t.data(), t.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_transform_tensor(t.data(), t.ptr.as_ptr()); + } + + t.offloaded_tensors + .upgrade() + .expect("Attempted to update a dropped context's offloaded tensors") + .lock() + .unwrap() + .insert(t.name(), t.share()); + }); + self + } - Ok(self) + /// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU. + /// If not, this is a no-op. + /// + /// It will not transfer the data. Use `transfer_to` for that. + #[allow(unused_variables)] + pub fn offload(&self) { + self.with_alive_ctx(|| { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers(self.ptr.as_ptr()); + } + }) + } + + /// If ggml-sys is compiled with CUDA support, this function will offload the tensor to the GPU without using the scratch buffer. + /// If not, this is a no-op. + /// + /// It will not transfer the data. Use `transfer_to` for that. + #[allow(unused_variables)] + pub fn offload_no_scratch(&self) { + self.with_alive_ctx(|| { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_assign_buffers_no_scratch(self.ptr.as_ptr()); + } + }) } /// Creates a shared copy of this tensor pointer. 
@@ -105,22 +132,6 @@ impl Tensor { } } - fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } - } - - fn with_alive_ctx_mut(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } - } - /// Number of bytes used by this tensor. pub fn nbytes(&self) -> usize { self.with_alive_ctx(|| { @@ -147,8 +158,8 @@ impl Tensor { /// /// The memory region from `data_ptr` to `data_ptr.offset(tensor.nbytes())` will be read from. pub unsafe fn set_data(&mut self, data_ptr: *mut c_void) { - let tensor = self.ptr.as_mut(); - self.with_alive_ctx_mut(|| { + self.with_alive_ctx_mut(|t| { + let tensor = t.ptr.as_mut(); // SAFETY: The with_alive_call guarantees the context is alive tensor.data = data_ptr; }) @@ -206,4 +217,53 @@ impl Tensor { let data = unsafe { sys::ggml_get_data(self.ptr.as_ptr()).add(offset) }; std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } + + /// Frees the memory of a tensor on an accelerator if ggml-sys is compiled with CUDA or CLBlast support. + /// If not, this is a no-op. + /// + /// This is temporary while GGML improves their context memory management. This should only be called by + /// `Context` when it is dropped, as well as `llm`'s `InferenceSession`. + /// + /// # Safety + /// + /// This must be the last thing you do with this tensor. The only reason it's not `self` is because `Drop` + /// isn't `self`. + pub unsafe fn free_accelerator(&mut self) { + #[cfg(feature = "cublas")] + unsafe { + sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr()); + } + #[cfg(feature = "clblast")] + unsafe { + sys::opencl::ggml_cl_free_data(self.ptr.as_ptr()); + } + } +} +impl Tensor { + fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { + if let Some(_ctx) = self.ctx.upgrade() { + f() + } else { + panic!("Using a tensor after the context was dropped") + } + } + + fn with_alive_ctx_mut(&mut self, mut f: impl FnMut(&mut Tensor) -> U) -> U { + if let Some(_ctx) = self.ctx.upgrade() { + f(self) + } else { + panic!("Using a tensor after the context was dropped") + } + } + + /// Sets the acceleration backend of the tensor. + /// + /// # Caution + /// + /// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend. 
+ fn set_backend(&mut self, backend: Backend) { + unsafe { + self.ptr.as_mut().backend = backend.try_into().unwrap(); + } + } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index d1bae54f..d1c03ba2 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -4,7 +4,7 @@ use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; #[cfg(feature = "metal")] -use ggml::metal::MetalContext; +use ggml::accelerator::metal::MetalContext; use crate::{ mulf, util, InferenceParameters, Model, OutputRequest, Prompt, TokenId, TokenUtf8Buffer, @@ -137,8 +137,8 @@ impl InferenceSession { }; if config.use_gpu { - ggml::accelerator_initialize(0); - ggml::accelerator_set_scratch_size(config.n_batch * 1024 * 1024); + ggml::accelerator::initialize(0); + ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024); } let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); @@ -633,9 +633,11 @@ impl InferenceSession { impl Drop for InferenceSession { fn drop(&mut self) { //if we are using an accelerator, we need to free the scratch memory and the k/v memory - ggml::accelerator_free_scratch(); - ggml::accelerator_free_tensor(&self.memory_k); - ggml::accelerator_free_tensor(&self.memory_v); + ggml::accelerator::free_scratch(); + unsafe { + self.memory_k.free_accelerator(); + self.memory_v.free_accelerator(); + } } } @@ -983,8 +985,12 @@ fn kv_memory( if config.use_gpu { // CUDA requires the K/V-Memory to be on the GPU but excluded from the scratch buffer. // For OpenCL this is a no-op. - ggml::accelerator_offload_tensor_no_scratch(&memory_k); - ggml::accelerator_offload_tensor_no_scratch(&memory_v); + // + // Note that these must be manually freed from the accelerator in the `InferenceSession` + // destructor. This is because `offload_no_scratch` does not update the `offloaded_tensors` + // map, because reasons. + memory_k.offload_no_scratch(); + memory_v.offload_no_scratch(); } (memory_k, memory_v) diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 75a070f1..2fffbace 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -7,6 +7,7 @@ use std::{ path::{Path, PathBuf}, }; +use ggml::accelerator::Backend; use regex::Regex; use thiserror::Error; @@ -224,20 +225,20 @@ impl ModelParameters { /// Returns true if the model should offload the given layer to the accelerator. pub fn should_offload(&self, layer: usize) -> bool { if !self.use_gpu { - false - } else if let Some(offloadable_layers) = self.gpu_layers { - layer < offloadable_layers - } else { - true + return false; } + + self.gpu_layers + .map(|gpu_layers| layer < gpu_layers) + .unwrap_or(true) } /// Returns the backend to use for the given layer. 
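For illustration, the offload decision above can be restated as a stand-alone function; `layer_backend` is a hypothetical helper, the real logic lives on `ModelParameters`:

```rust
use ggml::accelerator::Backend;

// Stand-alone restatement of the selection logic, for illustration only.
fn layer_backend(use_gpu: bool, gpu_layers: Option<usize>, layer: usize) -> Backend {
    let offload = use_gpu && gpu_layers.map(|n| layer < n).unwrap_or(true);
    if offload {
        Backend::Gpu
    } else {
        Backend::Cpu
    }
}
// With `gpu_layers = Some(20)`, layers 0..20 are offloaded and the rest stay
// on the CPU; with `gpu_layers = None`, every layer is offloaded.
```
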
- pub fn backend(&self, layer: usize) -> ggml::Backend { + pub fn backend(&self, layer: usize) -> Backend { if self.should_offload(layer) { - ggml::Backend::Gpu + Backend::Gpu } else { - ggml::Backend::Cpu + Backend::Cpu } } } diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index b99de13f..0c3623c8 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -55,9 +55,8 @@ impl KnownModel for Llama { let backend = params.backend(0); - let norm = tl.load("norm.weight")?.transfer_to(backend)?; - - let output = tl.load("output.weight")?.transfer_to(backend)?; + let norm = tl.load("norm.weight")?.transfer_to(backend); + let output = tl.load("output.weight")?.transfer_to(backend); let mut layers = Vec::new(); @@ -67,31 +66,31 @@ impl KnownModel for Llama { let layer = Layer { attention_norm: tl .load(&format!("layers.{i}.attention_norm.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wq: tl .load(&format!("layers.{i}.attention.wq.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wk: tl .load(&format!("layers.{i}.attention.wk.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wv: tl .load(&format!("layers.{i}.attention.wv.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), wo: tl .load(&format!("layers.{i}.attention.wo.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), ffn_norm: tl .load(&format!("layers.{i}.ffn_norm.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), w1: tl .load(&format!("layers.{i}.feed_forward.w1.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), w2: tl .load(&format!("layers.{i}.feed_forward.w2.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), w3: tl .load(&format!("layers.{i}.feed_forward.w3.weight"))? - .transfer_to(backend)?, + .transfer_to(backend), }; layers.push(layer); } From c74e159316e226f79454f7cf2e694765cf953008 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 01:33:49 +0200 Subject: [PATCH 24/28] refactor(ggml): offload_no_scratch auto-free --- crates/ggml/src/context.rs | 2 +- crates/ggml/src/tensor.rs | 31 ++++++++++++++---------- crates/llm-base/src/inference_session.rs | 7 ++---- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 616edb32..04d80a4b 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -518,7 +518,7 @@ impl Drop for Context { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. unsafe { // if we moved tensors to an accelerator we need to free them - for (_, mut tensor) in self.offloaded_tensors.lock().unwrap().drain() { + for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { if tensor.backend() != Backend::Cpu { tensor.free_accelerator(); } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index ee5370b7..49ec741b 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -85,12 +85,7 @@ impl Tensor { sys::opencl::ggml_cl_transform_tensor(t.data(), t.ptr.as_ptr()); } - t.offloaded_tensors - .upgrade() - .expect("Attempted to update a dropped context's offloaded tensors") - .lock() - .unwrap() - .insert(t.name(), t.share()); + t.mark_as_offloaded(); }); self } @@ -113,6 +108,9 @@ impl Tensor { /// If not, this is a no-op. /// /// It will not transfer the data. Use `transfer_to` for that. + /// + /// Unlike `offload`, this function will add the tensor to the offloaded tensors map. 
This is because the non-use of a scratch buffer + /// allows us to safely assume that this tensor will actually point to data. #[allow(unused_variables)] pub fn offload_no_scratch(&self) { self.with_alive_ctx(|| { @@ -120,6 +118,7 @@ impl Tensor { unsafe { sys::cuda::ggml_cuda_assign_buffers_no_scratch(self.ptr.as_ptr()); } + self.mark_as_offloaded(); }) } @@ -222,13 +221,9 @@ impl Tensor { /// If not, this is a no-op. /// /// This is temporary while GGML improves their context memory management. This should only be called by - /// `Context` when it is dropped, as well as `llm`'s `InferenceSession`. - /// - /// # Safety - /// - /// This must be the last thing you do with this tensor. The only reason it's not `self` is because `Drop` - /// isn't `self`. - pub unsafe fn free_accelerator(&mut self) { + /// `Context` when it is dropped. + pub(crate) fn free_accelerator(self) { + println!("Freeing tensor {}", self.name()); #[cfg(feature = "cublas")] unsafe { sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr()); @@ -266,4 +261,14 @@ impl Tensor { self.ptr.as_mut().backend = backend.try_into().unwrap(); } } + + /// Adds this tensor to the context's list of offloaded tensors, so that it will be automatically freed. + fn mark_as_offloaded(&self) { + self.offloaded_tensors + .upgrade() + .expect("Attempted to update a dropped context's offloaded tensors") + .lock() + .unwrap() + .insert(self.name(), self.share()); + } } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index d1c03ba2..9b45687c 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -632,12 +632,9 @@ impl InferenceSession { impl Drop for InferenceSession { fn drop(&mut self) { - //if we are using an accelerator, we need to free the scratch memory and the k/v memory + // If we are using an accelerator, we need to free the scratch memory. + // The k/v memory is freed by the ctx0 destructor. ggml::accelerator::free_scratch(); - unsafe { - self.memory_k.free_accelerator(); - self.memory_v.free_accelerator(); - } } } From 55b2dc34276733a994bf495f507d5d5e54bfbf96 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 01:48:26 +0200 Subject: [PATCH 25/28] refactor(ggml): use ContextInner for shared state --- crates/ggml/src/context.rs | 121 ++++++++++++++++++++----------------- crates/ggml/src/tensor.rs | 39 +++++------- 2 files changed, 80 insertions(+), 80 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 04d80a4b..c367c83c 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -16,7 +16,7 @@ pub struct Context { /// allocated tensors. Tensors are owned by the object, so a [`Tensor`] /// contains a `Weak` reference underneath and doesn't let you do anything /// with it if the underlying context has been deallocated. - pub ptr: Arc>, + inner: Arc, /// Memory mapping information pub mmap: Option, @@ -26,6 +26,11 @@ pub struct Context { /// Whether the context can offload tensors to the GPU pub can_offload: bool, +} + +/// Contains state shared between a context and its tensors +pub(crate) struct ContextInner { + pub ptr: NonNull, /// Offloaded tensors. Used to free them when the context is dropped. // TODO: revisit this. What it means for a tensor to be "offloaded", @@ -39,7 +44,16 @@ pub struct Context { // // Hopefully, this is resolved by GGML redesigning both its accelerator // interface and its scratch buffer solution. 
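The ownership shape this refactor sets up is: `Context` holds an `Arc<ContextInner>`, each `Tensor` holds a `Weak` reference to the same inner state, and tensor methods upgrade that weak pointer before touching the raw ggml data. A simplified sketch with placeholder types (the real `ContextInner` also carries the raw context pointer and the offloaded-tensor map):

```rust
use std::sync::{Arc, Weak};

struct Inner; // stand-in for the shared context state

struct Ctx {
    inner: Arc<Inner>,
}

struct Tens {
    inner: Weak<Inner>,
}

impl Tens {
    fn with_alive_ctx<U>(&self, f: impl FnOnce() -> U) -> U {
        // Upgrading fails once the owning context has been dropped, which is
        // exactly the misuse the panic message guards against.
        let _ctx = self
            .inner
            .upgrade()
            .expect("Using a tensor after the context was dropped");
        f()
    }
}
```
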
- offloaded_tensors: Arc>>, + pub offloaded_tensors: Mutex>, +} + +impl ContextInner { + pub(crate) fn new(ptr: *mut ggml_sys::ggml_context) -> Arc { + Arc::new(Self { + ptr: NonNull::new(ptr).expect("Should not be null"), + offloaded_tensors: Default::default(), + }) + } } impl Context { @@ -54,11 +68,10 @@ impl Context { }; Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + inner: ContextInner::new(raw), mmap: None, buffer: Some(buffer), can_offload: false, - offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -73,11 +86,10 @@ impl Context { }; Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + inner: ContextInner::new(raw), mmap: Some(mmap), buffer: None, can_offload: false, - offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -93,11 +105,10 @@ impl Context { }; Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + inner: ContextInner::new(raw), mmap: None, buffer: None, can_offload: false, - offloaded_tensors: Arc::new(Mutex::new(HashMap::new())), } } @@ -108,7 +119,7 @@ impl Context { /// Retrieves the memory used by this [Context]. pub fn used_mem(&self) -> usize { - unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } + unsafe { sys::ggml_used_mem(self.as_ptr()) } } /// Sets the scratch buffer to be used by this [Context]. @@ -123,7 +134,7 @@ impl Context { // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API unsafe { sys::ggml_set_scratch( - self.ptr.as_ptr(), + self.as_ptr(), sys::ggml_scratch { offs: 0, size, @@ -135,8 +146,7 @@ impl Context { /// Creates a new 1D tensor. pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor { - let raw = - unsafe { sys::ggml_new_tensor_1d(self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0)) }; + let raw = unsafe { sys::ggml_new_tensor_1d(self.as_ptr(), typ.into(), usize_to_i64(ne0)) }; self.new_tensor_raw(raw) } @@ -144,7 +154,7 @@ impl Context { pub fn new_tensor_2d(&self, typ: Type, ne0: usize, ne1: usize) -> Tensor { let raw = unsafe { sys::ggml_new_tensor_2d( - self.ptr.as_ptr(), + self.as_ptr(), typ.into(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -157,7 +167,7 @@ impl Context { pub fn new_tensor_3d(&self, typ: Type, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let raw = unsafe { sys::ggml_new_tensor_3d( - self.ptr.as_ptr(), + self.as_ptr(), typ.into(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -169,7 +179,7 @@ impl Context { /// Creates a new 1D tensor with the specified value. pub fn new_f32(&self, x: f32) -> Tensor { - let raw = unsafe { sys::ggml_new_f32(self.ptr.as_ptr(), x) }; + let raw = unsafe { sys::ggml_new_f32(self.as_ptr(), x) }; self.new_tensor_raw(raw) } } @@ -177,38 +187,37 @@ impl Context { impl Context { /// Unknown, aside from the obvious. It's transposing something! pub fn op_transpose(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_transpose(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Unknown. pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { sys::ggml_get_rows(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_get_rows(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the values of `a`, but normalized. 
pub fn op_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_norm(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the values of `a`, but normalized using RMSNorm. pub fn op_rms_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_rms_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_rms_norm(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the multiplication of `a` and `b`. pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_mul(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_mul(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Unknown. pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_repeat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_repeat(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -220,61 +229,59 @@ impl Context { /// /// Result is m columns, p rows pub fn op_mul_mat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { sys::ggml_mul_mat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_mul_mat(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the addition of `a` and `b`. pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_add(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_add(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the [SiLU](https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html) activation function applied to `a`. pub fn op_silu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_silu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_silu(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Scales `a` by the 1D tensor `b`. pub fn op_scale(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_scale(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_scale(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place, scales `a` by the 1D tensor `b`. pub fn op_scale_inplace(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = - unsafe { sys::ggml_scale_inplace(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + unsafe { sys::ggml_scale_inplace(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Sets the elements above the diagonal to -INF. pub fn op_diag_mask_inf(&self, a: &Tensor, n_past: usize) -> Tensor { - let tensor = unsafe { - sys::ggml_diag_mask_inf(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) - }; + let tensor = + unsafe { sys::ggml_diag_mask_inf(self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) }; self.new_tensor_raw(tensor) } /// In-place, sets the elements above the diagonal to -INF. 
pub fn op_diag_mask_inf_inplace(&self, a: &Tensor, n_past: usize) -> Tensor { let tensor = unsafe { - sys::ggml_diag_mask_inf_inplace(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) + sys::ggml_diag_mask_inf_inplace(self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) }; self.new_tensor_raw(tensor) } /// Applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. pub fn op_soft_max(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_soft_max(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_soft_max(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place, applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. pub fn op_soft_max_inplace(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_soft_max_inplace(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_soft_max_inplace(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -298,8 +305,7 @@ impl Context { a: &Tensor, fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32), ) -> Tensor { - let tensor = - unsafe { sys::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; + let tensor = unsafe { sys::ggml_map_unary_f32(self.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; self.new_tensor_raw(tensor) } @@ -325,7 +331,7 @@ impl Context { fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32), ) -> Tensor { let tensor = unsafe { - sys::ggml_map_binary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) + sys::ggml_map_binary_f32(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) }; self.new_tensor_raw(tensor) } @@ -337,9 +343,8 @@ impl Context { offset < a.nbytes(), "Cannot create tensor view with offset larger than tensor" ); - let tensor = unsafe { - sys::ggml_view_1d(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) - }; + let tensor = + unsafe { sys::ggml_view_1d(self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) }; self.new_tensor_raw(tensor) } @@ -348,7 +353,7 @@ impl Context { let (ne0, ne1) = ne; let tensor = unsafe { sys::ggml_view_2d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -371,7 +376,7 @@ impl Context { let (nb1, nb2) = nb; let tensor = unsafe { sys::ggml_view_3d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -386,7 +391,7 @@ impl Context { /// Copies `a` to `b` and returns `b`. 
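The public surface of these operation builders is unchanged by the `as_ptr` refactor. A small illustrative graph; the sizes are arbitrary, `Type::F32` is assumed to be available as elsewhere in the crate, and nothing is actually executed here:

```rust
use ggml::{Context, Tensor, Type};

// Attention-style toy graph built from the operation helpers above.
fn tiny_graph(ctx: &Context) -> Tensor {
    let a = ctx.new_tensor_2d(Type::F32, 4, 4);
    let b = ctx.new_tensor_2d(Type::F32, 4, 4);
    let scores = ctx.op_mul_mat(&a, &b);       // 4x4 result
    let masked = ctx.op_diag_mask_inf(&scores, 0);
    ctx.op_soft_max(&masked)
}
```
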
pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_cpy(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_cpy(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -394,7 +399,7 @@ impl Context { pub fn op_permute(&self, a: &Tensor, axes: (usize, usize, usize, usize)) -> Tensor { let tensor = unsafe { sys::ggml_permute( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(axes.0), usize_to_i32(axes.1), @@ -407,8 +412,7 @@ impl Context { /// In-place; reshapes `a` in accordance with the dimensions of `b` pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { sys::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_reshape(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -416,7 +420,7 @@ impl Context { pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { let tensor = unsafe { sys::ggml_reshape_2d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -429,7 +433,7 @@ impl Context { pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let tensor = unsafe { sys::ggml_reshape_3d( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), usize_to_i64(ne1), @@ -441,7 +445,7 @@ impl Context { /// ggml_cont pub fn op_cont(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_cont(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_cont(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -449,7 +453,7 @@ impl Context { pub fn op_rope(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { let tensor = unsafe { sys::ggml_rope( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(npast), usize_to_i32(ndims), @@ -464,7 +468,7 @@ impl Context { pub fn op_rope_inplace(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { let tensor = unsafe { sys::ggml_rope_inplace( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(npast), usize_to_i32(ndims), @@ -479,7 +483,7 @@ impl Context { pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize, bias_max: f32) -> Tensor { let tensor = unsafe { sys::ggml_alibi( - self.ptr.as_ptr(), + self.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past), usize_to_i32(n_head), @@ -492,7 +496,7 @@ impl Context { /// Gaussian Error Linear Units pub fn op_gelu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_gelu(self.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } } @@ -502,8 +506,7 @@ impl Context { fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { let tensor = Tensor { ptr: NonNull::new(raw).expect("Should not be null"), - ctx: Arc::downgrade(&self.ptr), - offloaded_tensors: Arc::downgrade(&self.offloaded_tensors), + inner: Arc::downgrade(&self.inner), }; if self.can_offload { @@ -511,6 +514,10 @@ impl Context { } tensor } + + fn as_ptr(&self) -> *mut sys::ggml_context { + self.inner.ptr.as_ptr() + } } impl Drop for Context { @@ -518,12 +525,12 @@ impl Drop for Context { // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. 
unsafe { // if we moved tensors to an accelerator we need to free them - for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() { + for (_, tensor) in self.inner.offloaded_tensors.lock().unwrap().drain() { if tensor.backend() != Backend::Cpu { tensor.free_accelerator(); } } - sys::ggml_free(self.ptr.as_ptr()); + sys::ggml_free(self.as_ptr()); } } } diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 49ec741b..25b8bf44 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -1,11 +1,6 @@ -use std::{ - collections::HashMap, - os::raw::c_void, - ptr::NonNull, - sync::{Mutex, Weak}, -}; +use std::{os::raw::c_void, ptr::NonNull, sync::Weak}; -use crate::{accelerator::Backend, i64_to_usize, sys, Type}; +use crate::{accelerator::Backend, context::ContextInner, i64_to_usize, sys, Type}; const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; @@ -13,8 +8,7 @@ const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; /// underlying context it was created with is alive. pub struct Tensor { pub(crate) ptr: NonNull, - pub(crate) ctx: Weak>, - pub(crate) offloaded_tensors: Weak>>, + pub(crate) inner: Weak, } impl Tensor { @@ -126,8 +120,7 @@ impl Tensor { pub fn share(&self) -> Self { Tensor { ptr: self.ptr, - ctx: Weak::clone(&self.ctx), - offloaded_tensors: Weak::clone(&self.offloaded_tensors), + inner: Weak::clone(&self.inner), } } @@ -223,7 +216,6 @@ impl Tensor { /// This is temporary while GGML improves their context memory management. This should only be called by /// `Context` when it is dropped. pub(crate) fn free_accelerator(self) { - println!("Freeing tensor {}", self.name()); #[cfg(feature = "cublas")] unsafe { sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr()); @@ -236,19 +228,19 @@ impl Tensor { } impl Tensor { fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } + let _ctx = self + .inner + .upgrade() + .expect("Using a tensor after the context was dropped"); + f() } fn with_alive_ctx_mut(&mut self, mut f: impl FnMut(&mut Tensor) -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f(self) - } else { - panic!("Using a tensor after the context was dropped") - } + let _ctx = self + .inner + .upgrade() + .expect("Using a tensor after the context was dropped"); + f(self) } /// Sets the acceleration backend of the tensor. @@ -264,9 +256,10 @@ impl Tensor { /// Adds this tensor to the context's list of offloaded tensors, so that it will be automatically freed. fn mark_as_offloaded(&self) { - self.offloaded_tensors + self.inner .upgrade() .expect("Attempted to update a dropped context's offloaded tensors") + .offloaded_tensors .lock() .unwrap() .insert(self.name(), self.share()); From 70d57dc17ff5e98a00f8dc7ec3631b77763401f4 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 02:23:49 +0200 Subject: [PATCH 26/28] refactor(ggml): unify context creation --- crates/ggml/src/context.rs | 104 +++++++++++++---------- crates/ggml/src/lib.rs | 2 +- crates/llm-base/src/inference_session.rs | 6 +- crates/llm-base/src/loader.rs | 6 +- crates/llm-base/src/lora.rs | 2 +- 5 files changed, 68 insertions(+), 52 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index c367c83c..dea550cd 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -18,11 +18,8 @@ pub struct Context { /// with it if the underlying context has been deallocated. 
inner: Arc, - /// Memory mapping information - pub mmap: Option, - - /// Backing buffer (in case we own it) - pub buffer: Option, + /// The storage for this context. This is stored so that the buffer can be dropped when the context is dropped. + storage: Option, /// Whether the context can offload tensors to the GPU pub can_offload: bool, @@ -46,7 +43,6 @@ pub(crate) struct ContextInner { // interface and its scratch buffer solution. pub offloaded_tensors: Mutex>, } - impl ContextInner { pub(crate) fn new(ptr: *mut ggml_sys::ggml_context) -> Arc { Arc::new(Self { @@ -56,60 +52,72 @@ impl ContextInner { } } +/// Controls how the context uses memory. +pub enum ContextStorage { + /// Use the provided buffer as memory. + Buffer(Buffer), + /// Use the provided memory mapped file as memory. + Mmap(Mmap), + /// Allocate `mem_size` bytes of memory. + Allocate { + /// The size, in bytes, of the memory in to allocate. + mem_size: usize, + }, +} + impl Context { - /// Creates a new [Context] using the buffer provided as memory - pub fn init_buffer(buffer: Buffer) -> Self { - let raw = unsafe { - sys::ggml_init(sys::ggml_init_params { + /// Creates a new [Context] with the given storage.. + pub fn new(storage: ContextStorage) -> Self { + let init_params = match &storage { + ContextStorage::Buffer(buffer) => sys::ggml_init_params { mem_size: buffer.size(), mem_buffer: buffer.data, no_alloc: false, - }) + }, + ContextStorage::Mmap(mmap) => sys::ggml_init_params { + mem_size: mmap.len(), + mem_buffer: std::ptr::null_mut(), + // We are mmapping so ggml does not need to allocate any memory for us + no_alloc: true, + }, + ContextStorage::Allocate { mem_size } => sys::ggml_init_params { + mem_size: *mem_size, + // Null here means we want ggml to own this memory. + mem_buffer: std::ptr::null_mut(), + // It doesn't make sense to `no_alloc` when passing in a `mem_size` in this mode. + no_alloc: false, + }, }; + let raw = unsafe { sys::ggml_init(init_params) }; Self { inner: ContextInner::new(raw), - mmap: None, - buffer: Some(buffer), + storage: Some(storage), can_offload: false, } } - /// Creates a new [Context] with the memory mapped file provided - pub fn init_mmap(mmap: Mmap) -> Self { - let raw = unsafe { - sys::ggml_init(sys::ggml_init_params { - mem_size: mmap.len(), - mem_buffer: std::ptr::null_mut(), - no_alloc: true, // We are mmapping so ggml does not need to allocate any memory for us - }) - }; + /// Creates a new [Context] with the specified buffer. + /// The buffer will be used by GGML. + pub fn new_with_buffer(buffer: Buffer) -> Self { + Self::new(ContextStorage::Buffer(buffer)) + } - Self { - inner: ContextInner::new(raw), - mmap: Some(mmap), - buffer: None, - can_offload: false, - } + /// Creates a new [Context] with the specified memory mapped file. + pub fn new_with_mmap(mmap: Mmap) -> Self { + Self::new(ContextStorage::Mmap(mmap)) } - /// Creates a new [Context] with the specified `mem_size` as a working area. - pub fn init(mem_size: usize, alloc: bool) -> Self { - let raw = unsafe { - sys::ggml_init(sys::ggml_init_params { - mem_size, - // Null here means we want ggml to own this memory. - mem_buffer: std::ptr::null_mut(), - no_alloc: !alloc, - }) - }; + /// Creates a new [Context] with the specified memory size. + /// The memory will be allocated by GGML. 
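With the storage variants unified, the old `init_*` constructors become thin wrappers over `Context::new`. A hedged sketch of the two storage paths that do not need a file; the sizes are arbitrary:

```rust
use ggml::{Buffer, Context};

fn make_contexts() -> (Context, Context) {
    // Let ggml allocate and own 16 MiB of working memory.
    let allocated = Context::new_with_allocate(16 * 1024 * 1024);

    // Hand ggml a caller-owned buffer, as the inference session does for its
    // per-evaluation context. This is shorthand for
    // `Context::new(ContextStorage::Buffer(buffer))`.
    let buffer = Buffer::new(4 * 1024 * 1024);
    let buffered = Context::new_with_buffer(buffer);

    (allocated, buffered)
}
```

Between evaluations, callers that previously re-borrowed `ctx0.buffer` can instead call `recreate()` to rebuild the context over the same storage.
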
+ pub fn new_with_allocate(mem_size: usize) -> Self { + Self::new(ContextStorage::Allocate { mem_size }) + } - Self { - inner: ContextInner::new(raw), - mmap: None, - buffer: None, - can_offload: false, - } + /// Recreates this context using the same storage. + pub fn recreate(&mut self) { + // This is the only operation that can consume the `self.storage`, so we can unwrap here. + *self = Self::new(self.storage.take().unwrap()); } /// If offloading is enabled, all tensors created by this context will be offloaded to the GPU @@ -182,6 +190,14 @@ impl Context { let raw = unsafe { sys::ggml_new_f32(self.as_ptr(), x) }; self.new_tensor_raw(raw) } + + /// Returns the mmap used by this [Context], if any. + pub fn mmap(&self) -> Option<&Mmap> { + match &self.storage { + Some(ContextStorage::Mmap(mmap)) => Some(mmap), + _ => None, + } + } } // Operations impl Context { diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 168a292a..af833b00 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -20,7 +20,7 @@ pub mod util; pub mod accelerator; -pub use context::Context; +pub use context::{Context, ContextStorage}; pub use tensor::Tensor; pub use ggml_sys as sys; diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 9b45687c..ee988eae 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -141,7 +141,7 @@ impl InferenceSession { ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024); } - let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); + let session_ctx = Arc::new(ggml::Context::new_with_allocate(ctx_size)); // Initialize key + value memory tensors let n_mem = n_layer * n_ctx; @@ -167,7 +167,7 @@ impl InferenceSession { }; let eval = Buffer::new(buf_size); - let ctx0 = ggml::Context::init_buffer(eval); + let ctx0 = ggml::Context::new_with_buffer(eval); // Set up Metal support #[cfg(feature = "metal")] @@ -216,7 +216,7 @@ impl InferenceSession { F: FnOnce(BuildContext) -> (ComputationGraph, GraphOutputs), { // Build a graph - self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); + self.ctx0.recreate(); let ctx0 = &mut self.ctx0; let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); ggml::set_tensor_name(&embd, "embd"); diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 8c5d2653..c3ff904b 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -527,10 +527,10 @@ pub fn load( unsafe { let mmap = Mmap::map(&file)?; let file_size = mmap.len() as u64; - (Context::init_mmap(mmap), file_size) + (Context::new_with_mmap(mmap), file_size) } } else { - (Context::init(ctx_size, true), file.metadata()?.len()) + (Context::new_with_allocate(ctx_size), file.metadata()?.len()) }; let tensors_len = tensors.len(); @@ -646,7 +646,7 @@ impl TensorLoader for MmapCompatibleLoader<'_> { &self.context, &mut self.file, &self.path, - self.context.mmap.as_ref(), + self.context.mmap(), ); let mut tensor = main_context.get_tensor(info)?; diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index 8cdc2c88..9dcba74a 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -105,7 +105,7 @@ impl LoraAdapter { // Create a temporary context for the patching operations // TODO: test if GPU can be enabled (make it configurable) - let patch_context = ggml::Context::init(patch_context_size, true); + let patch_context = 
ggml::Context::new_with_allocate(patch_context_size); let mut patch_file = FileContext::new(&patch_context, &mut self.file, &self.path, None); // Load the A and B tensors From 090735a7f42c628cf682a93c7c0c49b32b6870ce Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 02:45:57 +0200 Subject: [PATCH 27/28] fix(ggml): make metal work again --- crates/ggml/src/accelerator/metal.rs | 47 +++++++--------- crates/ggml/src/context.rs | 68 ++++++++++++++++++++---- crates/ggml/src/lib.rs | 1 + crates/llm-base/src/inference_session.rs | 2 +- crates/llm-base/src/loader.rs | 2 +- 5 files changed, 80 insertions(+), 40 deletions(-) diff --git a/crates/ggml/src/accelerator/metal.rs b/crates/ggml/src/accelerator/metal.rs index cbfdbcc6..3a471480 100644 --- a/crates/ggml/src/accelerator/metal.rs +++ b/crates/ggml/src/accelerator/metal.rs @@ -1,6 +1,6 @@ //! Metal support. use crate::{sys::metal, Buffer, ComputationGraph, Context, Tensor}; -use std::{ffi::c_void, ptr::NonNull, sync::Arc}; +use std::{ptr::NonNull, sync::Arc}; /// Acts as a RAII-guard over a `sys::metal::ggml_metal_context`, allocating via /// `ggml_metal_init` and dropping via `ggml_metal_free`. @@ -45,33 +45,24 @@ impl MetalContext { /// Add a context's memory as buffer to this Metal context pub fn add_context(&mut self, from_context: Arc) { - if self.ref_context(from_context.clone()) { - unsafe { - let raw_context = from_context.ptr.as_ptr(); - - let (data_ptr, data_size): (*mut c_void, usize) = - if let Some(ref mmap) = from_context.mmap { - // This is a bit naughty... - (mmap.as_ptr().cast_mut().cast(), mmap.len()) - } else { - ( - ggml_sys::ggml_get_mem_buffer(raw_context), - ggml_sys::ggml_get_mem_size(raw_context), - ) - }; + if !self.ref_context(from_context.clone()) { + return; + } - let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context); - assert!( - metal::ggml_metal_add_buffer( - self.ptr.as_ptr(), - "wt\0".as_ptr().cast(), // FIXME provide an actual name - data_ptr, - data_size, - max_size - ), - "Could not add weight buffer to metal context" - ); - } + unsafe { + let raw_context = from_context.as_ptr(); + let (data_ptr, data_size) = from_context.storage().as_ptr_and_size(&from_context); + let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context); + assert!( + metal::ggml_metal_add_buffer( + self.ptr.as_ptr(), + "wt\0".as_ptr().cast(), // FIXME provide an actual name + data_ptr, + data_size, + max_size + ), + "Could not add weight buffer to metal context" + ); } } } @@ -79,7 +70,7 @@ impl MetalContext { impl MetalContext { /// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before. fn ref_context(&mut self, context: Arc) -> bool { - if self.contexts.iter().any(|c| c.ptr == context.ptr) { + if self.contexts.iter().any(|c| *c == context) { false } else { self.contexts.push(context); diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index dea550cd..472b58c1 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -1,5 +1,6 @@ use std::{ collections::HashMap, + ffi::c_void, os::raw::c_int, ptr::NonNull, sync::{Arc, Mutex}, @@ -11,6 +12,7 @@ use crate::{accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, Tenso /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via `ggml_free`. +#[derive(PartialEq, Eq)] pub struct Context { /// An `Arc` is used to model the relation between the context and the /// allocated tensors. 
Tensors are owned by the object, so a [`Tensor`] @@ -43,6 +45,12 @@ pub(crate) struct ContextInner { // interface and its scratch buffer solution. pub offloaded_tensors: Mutex>, } +impl PartialEq for ContextInner { + fn eq(&self, other: &Self) -> bool { + self.ptr == other.ptr + } +} +impl Eq for ContextInner {} impl ContextInner { pub(crate) fn new(ptr: *mut ggml_sys::ggml_context) -> Arc { Arc::new(Self { @@ -64,6 +72,47 @@ pub enum ContextStorage { mem_size: usize, }, } +impl ContextStorage { + /// Returns the `Mmap` if this is a `Mmap` variant. + pub fn as_mmap(&self) -> Option<&Mmap> { + match self { + Self::Mmap(v) => Some(v), + _ => None, + } + } + + /// Returns the `Buffer` if this is a `Buffer` variant. + pub fn as_buffer(&self) -> Option<&Buffer> { + match self { + Self::Buffer(v) => Some(v), + _ => None, + } + } + + #[allow(dead_code)] + pub(crate) unsafe fn as_ptr_and_size(&self, ctx: &Context) -> (*mut c_void, usize) { + match self { + // This is a bit naughty... + Self::Mmap(mmap) => (mmap.as_ptr().cast_mut() as *mut c_void, mmap.len()), + _ => ( + ggml_sys::ggml_get_mem_buffer(ctx.as_ptr()), + ggml_sys::ggml_get_mem_size(ctx.as_ptr()), + ), + } + } +} +impl PartialEq for ContextStorage { + fn eq(&self, other: &Self) -> bool { + use ContextStorage::*; + match (self, other) { + (Buffer(l0), Buffer(r0)) => l0 == r0, + (Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(), + (Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r, + _ => false, + } + } +} +impl Eq for ContextStorage {} impl Context { /// Creates a new [Context] with the given storage.. @@ -191,12 +240,9 @@ impl Context { self.new_tensor_raw(raw) } - /// Returns the mmap used by this [Context], if any. - pub fn mmap(&self) -> Option<&Mmap> { - match &self.storage { - Some(ContextStorage::Mmap(mmap)) => Some(mmap), - _ => None, - } + /// Returns a reference to the [ContextStorage] used by this [Context]. + pub fn storage(&self) -> &ContextStorage { + self.storage.as_ref().unwrap() } } // Operations @@ -516,6 +562,12 @@ impl Context { self.new_tensor_raw(tensor) } } +// Public to this crate methods +impl Context { + pub(crate) fn as_ptr(&self) -> *mut sys::ggml_context { + self.inner.ptr.as_ptr() + } +} // Private methods impl Context { /// Wraps a raw tensor with a weak pointer to the context. @@ -530,10 +582,6 @@ impl Context { } tensor } - - fn as_ptr(&self) -> *mut sys::ggml_context { - self.inner.ptr.as_ptr() - } } impl Drop for Context { diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index af833b00..277c02c0 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -259,6 +259,7 @@ impl Type { /// A buffer of memory that can be used as a scratch buffer for a [Context]. /// /// See [Context::use_scratch]. 
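Callers that previously reached into `ctx.mmap` or `ctx.buffer` now go through the `ContextStorage` accessors. A small sketch mirroring the updated call sites:

```rust
// Illustrative only: report which storage backs a context.
fn describe(ctx: &ggml::Context) -> &'static str {
    match (ctx.storage().as_buffer(), ctx.storage().as_mmap()) {
        (Some(_), _) => "backed by a caller-provided Buffer",
        (_, Some(_)) => "backed by a memory-mapped file",
        _ => "backed by ggml-allocated memory",
    }
}
```
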
+#[derive(PartialEq, Eq)] pub struct Buffer { data: *mut c_void, layout: Layout, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index ee988eae..33f80f02 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -174,7 +174,7 @@ impl InferenceSession { let metal_context = { if config.use_gpu { let mut metal_context = MetalContext::new(config.n_threads); - metal_context.add_scratch_buffer(ctx0.buffer.as_ref().unwrap()); + metal_context.add_scratch_buffer(ctx0.storage().as_buffer().unwrap()); for buf in scratch.iter() { metal_context.add_scratch_buffer(buf); diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index c3ff904b..74544309 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -646,7 +646,7 @@ impl TensorLoader for MmapCompatibleLoader<'_> { &self.context, &mut self.file, &self.path, - self.context.mmap(), + self.context.storage().as_mmap(), ); let mut tensor = main_context.get_tensor(info)?; From d815857412581b1bd41cc78a2e8303f47dfa56b8 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 16 Jul 2023 12:41:52 +0200 Subject: [PATCH 28/28] fix(ggml): only set backend after if guards --- crates/ggml/src/tensor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 25b8bf44..34c2ab2d 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -65,10 +65,10 @@ impl Tensor { if current_backend != Backend::Cpu && backend == Backend::Cpu { unimplemented!("Tensors cannot be moved from an accelerator to the CPU at present"); } - t.set_backend(backend); if backend == Backend::Cpu { return; } + t.set_backend(backend); #[cfg(feature = "cublas")] unsafe {