This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit

Try to set the cuda scratch offset
LLukas22 committed Sep 30, 2023
1 parent 8ad589b commit e506b0b
Showing 8 changed files with 132 additions and 28 deletions.
1 change: 1 addition & 0 deletions crates/ggml/src/accelerator/mod.rs
@@ -71,6 +71,7 @@ pub fn initialize(device: i32) {
//TODO: Make this configurable
sys::cuda::ggml_init_cublas();
sys::cuda::ggml_cuda_set_main_device(device);
sys::cuda::ggml_cuda_set_mul_mat_q(true);
let split = 1.0f32;
sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
}
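
The added ggml_cuda_set_mul_mat_q(true) call opts into the quantized matrix-multiplication kernels before the single-device tensor split is applied. A minimal sketch of how a caller might drive this initializer, assuming the crate's cublas feature and the ggml::accelerator module path (the wrapper function and the device index 0 are illustrative, not part of the commit):

    // Sketch: bring up the CUDA accelerator on device 0 before offloading tensors.
    #[cfg(feature = "cublas")]
    fn init_gpu() {
        // Enables cuBLAS, picks device 0 as the main device, turns on the
        // quantized mul-mat kernels and applies a single-device tensor split.
        ggml::accelerator::initialize(0);
    }
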
4 changes: 2 additions & 2 deletions crates/ggml/src/context.rs
@@ -21,7 +21,7 @@ pub struct Context {
/// allocated tensors. Tensors are owned by the object, so a [`Tensor`]
/// contains a `Weak` reference underneath and doesn't let you do anything
/// with it if the underlying context has been deallocated.
inner: Arc<ContextInner>,
pub inner: Arc<ContextInner>,

/// The storage for this context. This is stored so that the buffer can be dropped when the context is dropped.
storage: Option<ContextStorage>,
@@ -31,7 +31,7 @@ pub struct Context {
}

/// Contains state shared between a context and its tensors
pub(crate) struct ContextInner {
pub struct ContextInner {
pub ptr: NonNull<sys::ggml_context>,

/// Offloaded tensors. Used to free them when the context is dropped.
59 changes: 55 additions & 4 deletions crates/ggml/src/lib.rs
@@ -10,6 +10,8 @@
use std::{
alloc::Layout,
os::raw::{c_int, c_void},
ptr::NonNull,
sync::Arc,
};

mod context;
@@ -308,10 +310,27 @@ impl Buffer {
}
}

/// Creates a new buffer of the specified size, without aligning it.
pub fn new_unaligned(size: usize) -> Self {
let layout = Layout::from_size_align(size, 1).unwrap();

unsafe {
Buffer {
data: std::alloc::alloc(layout).cast(),
layout,
}
}
}

/// Returns the size of the buffer in bytes
pub fn size(&self) -> usize {
self.layout.size()
}

/// Returns a pointer to the data in this buffer.
pub fn data(&mut self) -> *mut c_void {
self.data
}
}

impl Drop for Buffer {
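
The new Buffer::new_unaligned constructor sizes scratch storage to an exact byte count with alignment 1, unlike the existing Buffer::new, which aligns its allocation. A hedged usage sketch (the 16 MiB size is an arbitrary illustration, not a value from the commit):

    // Sketch: allocate an unaligned scratch buffer and inspect it.
    let mut scratch = Buffer::new_unaligned(16 * 1024 * 1024);
    assert_eq!(scratch.size(), 16 * 1024 * 1024);
    let base = scratch.data(); // raw *mut c_void into the allocation
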
@@ -337,6 +356,37 @@ impl ComputationGraph {
pub fn build_forward_expand(&mut self, tensor: &Tensor) {
unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
}

/// Returns the leafs in this graph.
pub fn leafs(&self, context: &Context) -> Vec<Tensor> {
let mut wrapped_leafs: Vec<Tensor> = vec![];
unsafe {
for leaf in self.inner.as_ref().unwrap().leafs {
if !leaf.is_null() {
wrapped_leafs.push(Tensor {
ptr: NonNull::new(leaf).expect("Should not be null"),
inner: Arc::downgrade(&context.inner),
})
}
}
wrapped_leafs
}
}
/// Returns the nodes in this graph.
pub fn nodes(&self, context: &Context) -> Vec<Tensor> {
let mut wrapped_nodes: Vec<Tensor> = vec![];
unsafe {
for node in self.inner.as_ref().unwrap().nodes {
if !node.is_null() {
wrapped_nodes.push(Tensor {
ptr: NonNull::new(node).expect("Should not be null"),
inner: Arc::downgrade(&context.inner),
})
}
}
wrapped_nodes
}
}
}

/// A `ggml` execution plan. Contains the information needed to execute a computation graph.
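
The new leafs and nodes accessors above wrap the raw tensor-pointer arrays of the underlying ggml_cgraph as Tensor handles tied to the given Context. A hedged sketch of how a caller might walk them, for example to collect GPU-resident tensors (Tensor::backend and the accelerator Backend enum are assumed from the crate's existing API):

    // Sketch: gather every graph tensor that lives on the GPU backend.
    let mut on_gpu = Vec::new();
    for tensor in graph.nodes(&context).into_iter().chain(graph.leafs(&context)) {
        if tensor.backend() == ggml::accelerator::Backend::Gpu {
            on_gpu.push(tensor);
        }
    }
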
@@ -413,13 +463,14 @@ impl GraphAllocator {
}

/// Switches the buffer used by the allocator.
pub fn switch_buffer(&mut self, buffer: Buffer, tensor_alignment: usize) {
pub fn resize_buffer(&mut self, graph_size: usize, tensor_alignment: usize) {
// Free the old allocator
unsafe { sys::ggml_allocr_free(self.ptr) }
//Resize the buffer
self.buffer = Buffer::new_unaligned(graph_size);
// Create a new allocator with the new buffer
let ptr = unsafe { sys::ggml_allocr_new(buffer.data, buffer.size(), tensor_alignment) };
self.ptr = ptr;
self.buffer = buffer;
self.ptr =
unsafe { sys::ggml_allocr_new(self.buffer.data, self.buffer.size(), tensor_alignment) };
}
}
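
resize_buffer replaces the old switch_buffer: rather than taking ownership of an externally constructed Buffer, the allocator frees its ggml_allocr, allocates a fresh unaligned buffer of graph_size bytes, and rebuilds the allocator on top of it. A sketch of the call-site pattern, mirroring how the inference session later in this commit uses it (variable names are illustrative):

    // Sketch: measure the worst-case graph, then grow the scratch buffer to fit it.
    let graph_size = allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
    allocator.resize_buffer(graph_size, ggml::TENSOR_ALIGNMENT);
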

17 changes: 17 additions & 0 deletions crates/ggml/src/tensor.rs
@@ -52,6 +52,11 @@ impl Tensor {
})
}

/// Returns true if the tensor's 'extra' field is set (e.g. by ggml-cuda).
pub fn has_extras(&self) -> bool {
self.with_alive_ctx(|| unsafe { !self.ptr.as_ref().extra.is_null() })
}

/// Sets the tensor's acceleration backend and moves the tensor's data to the new backend.
pub fn transfer_to(mut self, backend: Backend) -> Tensor {
self.with_alive_ctx_mut(|t| {
@@ -111,6 +116,18 @@ impl Tensor {
})
}

/// If ggml-sys is compiled with CUDA support, this function will set the tensor's scratch offset.
/// If not, this is a no-op.
#[allow(unused_variables)]
pub fn assign_scratch_offset(&self, offset: usize) {
self.with_alive_ctx(|| {
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_assign_scratch_offset(self.ptr.as_ptr(), offset);
}
})
}

/// Creates a shared copy of this tensor pointer.
pub fn share(&self) -> Self {
Tensor {
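
Together, has_extras and assign_scratch_offset above let a session tell ggml-cuda where inside a shared scratch buffer a GPU tensor's data lives, instead of giving every tensor its own device allocation. A sketch of the intended per-tensor pattern, taken from the inference-session change later in this commit (scratch_base stands for the allocator buffer's base pointer):

    // Sketch: for a GPU tensor that ggml-cuda has not yet given an `extra`,
    // derive its byte offset inside the scratch buffer and hand it to CUDA.
    if tensor.backend() == ggml::accelerator::Backend::Gpu && !tensor.has_extras() {
        let offset = unsafe { tensor.data().offset_from(scratch_base) } as usize;
        tensor.assign_scratch_offset(offset);
    }
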
11 changes: 11 additions & 0 deletions crates/ggml/sys/src/lib.rs
@@ -176,6 +176,10 @@ pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
pub type ggml_object_type = ::std::os::raw::c_int;
pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2;
pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3;
pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4;
pub type ggml_log_level = ::std::os::raw::c_int;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_object {
@@ -2187,6 +2191,13 @@ pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result =
pub type ggml_opt_result = ::std::os::raw::c_int;
pub type ggml_opt_callback =
::std::option::Option<unsafe extern "C" fn(data: *mut ::std::os::raw::c_void, sched: *mut f32)>;
pub type ggml_log_callback = ::std::option::Option<
unsafe extern "C" fn(
level: ggml_log_level,
text: *const ::std::os::raw::c_char,
user_data: *mut ::std::os::raw::c_void,
),
>;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_opt_params {
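
The new ggml_log_callback type mirrors the C typedef: an optional extern "C" function that receives a log level, a NUL-terminated message and an opaque user pointer. A hedged sketch of a Rust callback matching that signature, which could for instance be registered through the ggml_metal_log_set_callback binding added in metal.rs below (the wrapper and its error handling are illustrative):

    // Sketch: forward ggml log messages to stderr.
    unsafe extern "C" fn forward_ggml_log(
        level: ggml_log_level,
        text: *const ::std::os::raw::c_char,
        _user_data: *mut ::std::os::raw::c_void,
    ) {
        let msg = std::ffi::CStr::from_ptr(text).to_string_lossy();
        eprintln!("ggml log (level {}): {}", level, msg);
    }
    // e.g.: ggml_metal_log_set_callback(Some(forward_ggml_log), std::ptr::null_mut());
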
14 changes: 5 additions & 9 deletions crates/ggml/sys/src/metal.rs
@@ -2,15 +2,11 @@

pub const GGML_METAL_MAX_BUFFERS: u32 = 16;
pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_tensor {
_unused: [u8; 0],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ggml_cgraph {
_unused: [u8; 0],
extern "C" {
pub fn ggml_metal_log_set_callback(
log_callback: ggml_log_callback,
user_data: *mut ::std::os::raw::c_void,
);
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
52 changes: 40 additions & 12 deletions crates/llm-base/src/inference_session.rs
@@ -86,6 +86,9 @@ pub struct InferenceSession {

/// Work buffer for graph planning
work_buffer: Vec<u8>,

/// Whether the session can use the GPU
use_gpu: bool,
}

pub struct BuildContext<'session> {
@@ -120,7 +123,7 @@ impl InferenceSession {
..
} = *params;

let context_byte_size = {
let cache_byte_size = {
let mut size = 0;
size += mulf!(
context_size,
@@ -134,14 +137,14 @@
n_embd,
ggml::type_sizef(config.memory_v_type.into())
); // memory_v
size += (5 + 10 * n_layer) * 256; // object overhead
size += 2 * 1024 * 1024; // overhead

size
};
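// Worked example (not part of the commit, assuming a LLaMA-7B-like shape with
// n_layer = 32, n_embd = 4096, context_size = 2048 and an f16 KV cache):
// 2 * 2048 * 32 * 4096 * 2 bytes = 1 GiB for memory_k plus memory_v,
// on top of the 2 MiB overhead added above.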

log::info!(
"Allocating {:.2} MB for KV-memory",
context_byte_size / (1024 * 1024)
cache_byte_size / (1024 * 1024)
);

if use_gpu {
@@ -153,7 +156,7 @@
// context is only accessed from one thread at a time, but I've already spent enough
// time on this as-is.
#[allow(clippy::arc_with_non_send_sync)]
let session_ctx = Arc::new(ggml::Context::new_with_allocate(context_byte_size));
let session_ctx = Arc::new(ggml::Context::new_with_allocate(cache_byte_size));

// Initialize key + value memory tensors
let n_mem = n_layer * context_size;
@@ -190,7 +193,7 @@

InferenceSession {
_session_ctx: session_ctx,
_memory_size: context_byte_size,
_memory_size: cache_byte_size,
config,
memory_k,
memory_v,
Expand All @@ -206,6 +209,7 @@ impl InferenceSession {
allocator,
context_size,
work_buffer: vec![0],
use_gpu,
}
}

@@ -252,18 +256,26 @@ impl InferenceSession {
let graph_size =
self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024));
// Pre-allocate the buffer foor future use
let buffer = Buffer::new(graph_size);
self.allocator.switch_buffer(buffer, ggml::TENSOR_ALIGNMENT);
// Pre-allocate the buffer for future use
self.allocator
.resize_buffer(graph_size, ggml::TENSOR_ALIGNMENT);

if self.use_gpu {
ggml::accelerator::set_scratch_size(graph_size);
}
}

// Reset the context and allocator
self.ctx0.recreate();
self.allocator.reset();
let ctx0 = &mut self.ctx0;

let mut embd = ctx0
.new_tensor_1d(ggml::Type::I32, input_tokens.len())
.set_name("embd");

self.allocator.allocate(&embd);

let bc = BuildContext {
ctx0: RefCell::new(ctx0),
allocator: RefCell::new(&self.allocator),
@@ -273,10 +285,6 @@
n_past: self.n_past,
};

// Reset the allocator
self.allocator.reset();
self.allocator.allocate(&embd);

let (mut built_gf, built_result) = builder(bc);

// Build the graph
@@ -285,6 +293,26 @@
// Allocate the graph
self.allocator.allocate_graph(&built_gf);

#[cfg(feature = "cublas")]
{
for mut leaf in built_gf.leafs(&ctx0) {
if leaf.backend() == ggml::accelerator::Backend::Gpu && !leaf.has_extras() {
unsafe {
let offset = leaf.data().offset_from(self.allocator.buffer.data()) as usize;
leaf.assign_scratch_offset(offset);
}
}
}

for mut node in built_gf.nodes(&ctx0) {
if node.backend() == ggml::accelerator::Backend::Gpu && !node.has_extras() {
unsafe {
let offset = node.data().offset_from(self.allocator.buffer.data()) as usize;
node.assign_scratch_offset(offset);
}
}
}
}
// Do Metal'y stuff
#[cfg(feature = "metal")]
{
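
The leaf and node loops in the cublas block above are identical apart from the collection they walk; a hedged sketch of a helper that factors the logic out (hypothetical, not part of the commit). It relies on the fact that offset_from on *mut c_void counts in bytes, since c_void has size 1:

    // Sketch (hypothetical helper): assign scratch offsets to every GPU tensor
    // that ggml-cuda has not yet attached an `extra` to.
    fn assign_offsets(tensors: Vec<ggml::Tensor>, scratch_base: *mut std::os::raw::c_void) {
        for mut tensor in tensors {
            if tensor.backend() == ggml::accelerator::Backend::Gpu && !tensor.has_extras() {
                let offset = unsafe { tensor.data().offset_from(scratch_base) } as usize;
                tensor.assign_scratch_offset(offset);
            }
        }
    }
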
