This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit

Try to set the cuda scratch offset
LLukas22 committed Sep 30, 2023
1 parent 8ad589b commit e506b0b
Showing 8 changed files with 132 additions and 28 deletions.
1 change: 1 addition & 0 deletions crates/ggml/src/accelerator/mod.rs
@@ -71,6 +71,7 @@ pub fn initialize(device: i32) {
//TODO: Make this configurable
sys::cuda::ggml_init_cublas();
sys::cuda::ggml_cuda_set_main_device(device);
sys::cuda::ggml_cuda_set_mul_mat_q(true);
let split = 1.0f32;
sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
}
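
The added ggml_cuda_set_mul_mat_q(true) call opts into the quantized matrix-multiplication kernels before the single-device tensor split is applied. A minimal sketch of how a caller might drive this initializer, assuming the crate's cublas feature and the ggml::accelerator module path (the wrapper function and the device index 0 are illustrative, not part of the commit):

    // Sketch: bring up the CUDA accelerator on device 0 before offloading tensors.
    #[cfg(feature = "cublas")]
    fn init_gpu() {
        // Enables cuBLAS, picks device 0 as the main device, turns on the
        // quantized mul-mat kernels and applies a single-device tensor split.
        ggml::accelerator::initialize(0);
    }
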
4 changes: 2 additions & 2 deletions crates/ggml/src/context.rs
@@ -21,7 +21,7 @@ pub struct Context {
/// allocated tensors. Tensors are owned by the object, so a [`Tensor`]
/// contains a `Weak` reference underneath and doesn't let you do anything
/// with it if the underlying context has been deallocated.
inner: Arc<ContextInner>,
pub inner: Arc<ContextInner>,

/// The storage for this context. This is stored so that the buffer can be dropped when the context is dropped.
storage: Option<ContextStorage>,
@@ -31,7 +31,7 @@ pub struct Context {
}

/// Contains state shared between a context and its tensors
pub(crate) struct ContextInner {
pub struct ContextInner {
pub ptr: NonNull<sys::ggml_context>,

/// Offloaded tensors. Used to free them when the context is dropped.
59 changes: 55 additions & 4 deletions crates/ggml/src/lib.rs
@@ -10,6 +10,8 @@
use std::{
alloc::Layout,
os::raw::{c_int, c_void},
ptr::NonNull,
sync::Arc,
};

mod context;
@@ -308,10 +310,27 @@ impl Buffer {
}
}

/// Creates a new buffer of the specified size, without aligning it.
pub fn new_unaligned(size: usize) -> Self {
let layout = Layout::from_size_align(size, 1).unwrap();

unsafe {
Buffer {
data: std::alloc::alloc(layout).cast(),
layout,
}
}
}

/// Returns the size of the buffer in bytes
pub fn size(&self) -> usize {
self.layout.size()
}

/// Returns a pointer to the data in this buffer.
pub fn data(&mut self) -> *mut c_void {
self.data
}
}

impl Drop for Buffer {
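
The new Buffer::new_unaligned constructor sizes scratch storage to an exact byte count with alignment 1, unlike the existing Buffer::new, which aligns its allocation. A hedged usage sketch (the 16 MiB size is an arbitrary illustration, not a value from the commit):

    // Sketch: allocate an unaligned scratch buffer and inspect it.
    let mut scratch = Buffer::new_unaligned(16 * 1024 * 1024);
    assert_eq!(scratch.size(), 16 * 1024 * 1024);
    let base = scratch.data(); // raw *mut c_void into the allocation
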
@@ -337,6 +356,37 @@ impl ComputationGraph {
pub fn build_forward_expand(&mut self, tensor: &Tensor) {
unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
}

/// Returns the leafs in this graph.
pub fn leafs(&self, context: &Context) -> Vec<Tensor> {
let mut wrapped_leafs: Vec<Tensor> = vec![];
unsafe {
for leaf in self.inner.as_ref().unwrap().leafs {
if !leaf.is_null() {
wrapped_leafs.push(Tensor {
ptr: NonNull::new(leaf).expect("Should not be null"),
inner: Arc::downgrade(&context.inner),
})
}
}
wrapped_leafs
}
}
/// Returns the nodes in this graph.
pub fn nodes(&self, context: &Context) -> Vec<Tensor> {
let mut wrapped_nodes: Vec<Tensor> = vec![];
unsafe {
for node in self.inner.as_ref().unwrap().nodes {
if !node.is_null() {
wrapped_nodes.push(Tensor {
ptr: NonNull::new(node).expect("Should not be null"),
inner: Arc::downgrade(&context.inner),
})
}
}
wrapped_nodes
}
}
}

/// A `ggml` execution plan. Contains the information needed to execute a computation graph.
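
The new leafs and nodes accessors above wrap the raw tensor-pointer arrays of the underlying ggml_cgraph as Tensor handles tied to the given Context. A hedged sketch of how a caller might walk them, for example to collect GPU-resident tensors (Tensor::backend and the accelerator Backend enum are assumed from the crate's existing API):

    // Sketch: gather every graph tensor that lives on the GPU backend.
    let mut on_gpu = Vec::new();
    for tensor in graph.nodes(&context).into_iter().chain(graph.leafs(&context)) {
        if tensor.backend() == ggml::accelerator::Backend::Gpu {
            on_gpu.push(tensor);
        }
    }
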
@@ -413,13 +463,14 @@ impl GraphAllocator {
}

/// Switches the buffer used by the allocator.
pub fn switch_buffer(&mut self, buffer: Buffer, tensor_alignment: usize) {
pub fn resize_buffer(&mut self, graph_size: usize, tensor_alignment: usize) {
// Free the old allocator
unsafe { sys::ggml_allocr_free(self.ptr) }
//Resize the buffer
self.buffer = Buffer::new_unaligned(graph_size);
// Create a new allocator with the new buffer
let ptr = unsafe { sys::ggml_allocr_new(buffer.data, buffer.size(), tensor_alignment) };
self.ptr = ptr;
self.buffer = buffer;
self.ptr =
unsafe { sys::ggml_allocr_new(self.buffer.data, self.buffer.size(), tensor_alignment) };
}
}
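
resize_buffer replaces the old switch_buffer: rather than taking ownership of an externally constructed Buffer, the allocator frees its ggml_allocr, allocates a fresh unaligned buffer of graph_size bytes, and rebuilds the allocator on top of it. A sketch of the call-site pattern, mirroring how the inference session later in this commit uses it (variable names are illustrative):

    // Sketch: measure the worst-case graph, then grow the scratch buffer to fit it.
    let graph_size = allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
    allocator.resize_buffer(graph_size, ggml::TENSOR_ALIGNMENT);
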

17 changes: 17 additions & 0 deletions crates/ggml/src/tensor.rs
@@ -52,6 +52,11 @@ impl Tensor {
})
}

/// Returns true if the tensor's 'extra' field is set (e.g. by ggml-cuda).
pub fn has_extras(&self) -> bool {
self.with_alive_ctx(|| unsafe { !self.ptr.as_ref().extra.is_null() })
}

/// Sets the tensor's acceleration backend and moves the tensor's data to the new backend.
pub fn transfer_to(mut self, backend: Backend) -> Tensor {
self.with_alive_ctx_mut(|t| {
@@ -111,6 +116,18 @@ impl Tensor {
})
}

/// If ggml-sys is compiled with CUDA support, this function will set the tensor's scratch offset.
/// If not, this is a no-op.
#[allow(unused_variables)]
pub fn assign_scratch_offset(&self, offset: usize) {
self.with_alive_ctx(|| {
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_assign_scratch_offset(self.ptr.as_ptr(), offset);
}
})
}

/// Creates a shared copy of this tensor pointer.
pub fn share(&self) -> Self {
Tensor {
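
Together, has_extras and assign_scratch_offset above let a session tell ggml-cuda where inside a shared scratch buffer a GPU tensor's data lives, instead of giving every tensor its own device allocation. A sketch of the intended per-tensor pattern, taken from the inference-session change later in this commit (scratch_base stands for the allocator buffer's base pointer):

    // Sketch: for a GPU tensor that ggml-cuda has not yet given an `extra`,
    // derive its byte offset inside the scratch buffer and hand it to CUDA.
    if tensor.backend() == ggml::accelerator::Backend::Gpu && !tensor.has_extras() {
        let offset = unsafe { tensor.data().offset_from(scratch_base) } as usize;
        tensor.assign_scratch_offset(offset);
    }
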
11 changes: 11 additions & 0 deletions crates/ggml/sys/src/lib.rs
@@ -176,6 +176,10 @@ pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
pub type ggml_object_type = ::std::os::raw::c_int;
pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2;
pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3;
pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4;
pub type ggml_log_level = ::std::os::raw::c_int;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_object {
@@ -2187,6 +2191,13 @@ pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result =
pub type ggml_opt_result = ::std::os::raw::c_int;
pub type ggml_opt_callback =
::std::option::Option<unsafe extern "C" fn(data: *mut ::std::os::raw::c_void, sched: *mut f32)>;
pub type ggml_log_callback = ::std::option::Option<
unsafe extern "C" fn(
level: ggml_log_level,
text: *const ::std::os::raw::c_char,
user_data: *mut ::std::os::raw::c_void,
),
>;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_opt_params {
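
The new ggml_log_callback type mirrors the C typedef: an optional extern "C" function that receives a log level, a NUL-terminated message and an opaque user pointer. A hedged sketch of a Rust callback matching that signature, which could for instance be registered through the ggml_metal_log_set_callback binding added in metal.rs below (the wrapper and its error handling are illustrative):

    // Sketch: forward ggml log messages to stderr.
    unsafe extern "C" fn forward_ggml_log(
        level: ggml_log_level,
        text: *const ::std::os::raw::c_char,
        _user_data: *mut ::std::os::raw::c_void,
    ) {
        let msg = std::ffi::CStr::from_ptr(text).to_string_lossy();
        eprintln!("ggml log (level {}): {}", level, msg);
    }
    // e.g.: ggml_metal_log_set_callback(Some(forward_ggml_log), std::ptr::null_mut());
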
14 changes: 5 additions & 9 deletions crates/ggml/sys/src/metal.rs
@@ -2,15 +2,11 @@

pub const GGML_METAL_MAX_BUFFERS: u32 = 16;
pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_tensor {
_unused: [u8; 0],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_cgraph {
_unused: [u8; 0],
extern "C" {
pub fn ggml_metal_log_set_callback(
log_callback: ggml_log_callback,
user_data: *mut ::std::os::raw::c_void,
);
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
52 changes: 40 additions & 12 deletions crates/llm-base/src/inference_session.rs
@@ -86,6 +86,9 @@ pub struct InferenceSession {

/// Work buffer for graph planning
work_buffer: Vec<u8>,

/// Whether the session can use the GPU
use_gpu: bool,
}

pub struct BuildContext<'session> {
@@ -120,7 +123,7 @@ impl InferenceSession {
..
} = *params;

let context_byte_size = {
let cache_byte_size = {
let mut size = 0;
size += mulf!(
context_size,
@@ -134,14 +137,14 @@
n_embd,
ggml::type_sizef(config.memory_v_type.into())
); // memory_v
size += (5 + 10 * n_layer) * 256; // object overhead
size += 2 * 1024 * 1024; // overhead

size
};
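// Worked example (not part of the commit, assuming a LLaMA-7B-like shape with
// n_layer = 32, n_embd = 4096, context_size = 2048 and an f16 KV cache):
// 2 * 2048 * 32 * 4096 * 2 bytes = 1 GiB for memory_k plus memory_v,
// on top of the 2 MiB overhead added above.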

log::info!(
"Allocating {:.2} MB for KV-memory",
context_byte_size / (1024 * 1024)
cache_byte_size / (1024 * 1024)
);

if use_gpu {
@@ -153,7 +156,7 @@
// context is only accessed from one thread at a time, but I've already spent enough
// time on this as-is.
#[allow(clippy::arc_with_non_send_sync)]
let session_ctx = Arc::new(ggml::Context::new_with_allocate(context_byte_size));
let session_ctx = Arc::new(ggml::Context::new_with_allocate(cache_byte_size));

// Initialize key + value memory tensors
let n_mem = n_layer * context_size;
@@ -190,7 +193,7 @@

InferenceSession {
_session_ctx: session_ctx,
_memory_size: context_byte_size,
_memory_size: cache_byte_size,
config,
memory_k,
memory_v,
Expand All @@ -206,6 +209,7 @@ impl InferenceSession {
allocator,
context_size,
work_buffer: vec![0],
use_gpu,
}
}

@@ -252,18 +256,26 @@ impl InferenceSession {
let graph_size =
self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024));
// Pre-allocate the buffer foor future use
let buffer = Buffer::new(graph_size);
self.allocator.switch_buffer(buffer, ggml::TENSOR_ALIGNMENT);
// Pre-allocate the buffer for future use
self.allocator
.resize_buffer(graph_size, ggml::TENSOR_ALIGNMENT);

if self.use_gpu {
ggml::accelerator::set_scratch_size(graph_size);
}
}

// Reset the context and allocator
self.ctx0.recreate();
self.allocator.reset();
let ctx0 = &mut self.ctx0;

let mut embd = ctx0
.new_tensor_1d(ggml::Type::I32, input_tokens.len())
.set_name("embd");

self.allocator.allocate(&embd);

let bc = BuildContext {
ctx0: RefCell::new(ctx0),
allocator: RefCell::new(&self.allocator),
@@ -273,10 +285,6 @@
n_past: self.n_past,
};

// Reset the allocator
self.allocator.reset();
self.allocator.allocate(&embd);

let (mut built_gf, built_result) = builder(bc);

// Build the graph
@@ -285,6 +293,26 @@
// Allocate the graph
self.allocator.allocate_graph(&built_gf);

#[cfg(feature = "cublas")]
{
for mut leaf in built_gf.leafs(&ctx0) {
if leaf.backend() == ggml::accelerator::Backend::Gpu && !leaf.has_extras() {
unsafe {
let offset = leaf.data().offset_from(self.allocator.buffer.data()) as usize;
leaf.assign_scratch_offset(offset);
}
}
}

for mut node in built_gf.nodes(&ctx0) {
if node.backend() == ggml::accelerator::Backend::Gpu && !node.has_extras() {
unsafe {
let offset = node.data().offset_from(self.allocator.buffer.data()) as usize;
node.assign_scratch_offset(offset);
}
}
}
}
// Do Metal'y stuff
#[cfg(feature = "metal")]
{
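
The leaf and node loops in the cublas block above are identical apart from the collection they walk; a hedged sketch of a helper that factors the logic out (hypothetical, not part of the commit). It relies on the fact that offset_from on *mut c_void counts in bytes, since c_void has size 1:

    // Sketch (hypothetical helper): assign scratch offsets to every GPU tensor
    // that ggml-cuda has not yet attached an `extra` to.
    fn assign_offsets(tensors: Vec<ggml::Tensor>, scratch_base: *mut std::os::raw::c_void) {
        for mut tensor in tensors {
            if tensor.backend() == ggml::accelerator::Backend::Gpu && !tensor.has_extras() {
                let offset = unsafe { tensor.data().offset_from(scratch_base) } as usize;
                tensor.assign_scratch_offset(offset);
            }
        }
    }
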
