Skip to content
This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit

Permalink
refactor(ggml): offload_no_scratch auto-free
Browse files Browse the repository at this point in the history
  • Loading branch information
philpax committed Jul 15, 2023
1 parent e7ac55b commit c74e159
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion crates/ggml/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ impl Drop for Context {
// SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call.
unsafe {
// if we moved tensors to an accelerator we need to free them
for (_, mut tensor) in self.offloaded_tensors.lock().unwrap().drain() {
for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() {
if tensor.backend() != Backend::Cpu {
tensor.free_accelerator();
}
Expand Down
31 changes: 18 additions & 13 deletions crates/ggml/src/tensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,7 @@ impl Tensor {
sys::opencl::ggml_cl_transform_tensor(t.data(), t.ptr.as_ptr());
}

t.offloaded_tensors
.upgrade()
.expect("Attempted to update a dropped context's offloaded tensors")
.lock()
.unwrap()
.insert(t.name(), t.share());
t.mark_as_offloaded();
});
self
}
Expand All @@ -113,13 +108,17 @@ impl Tensor {
/// If not, this is a no-op.
///
/// It will not transfer the data. Use `transfer_to` for that.
///
/// This function also adds the tensor to the offloaded tensors map, so it will be freed automatically.
/// Because no scratch buffer is used, we can safely assume that this tensor will actually point to data.
// NOTE(review): `unused_variables` is presumably needed because without the
// `cublas` feature the closure body is empty — confirm against a non-CUDA build.
#[allow(unused_variables)]
pub fn offload_no_scratch(&self) {
self.with_alive_ctx(|| {
// With CUDA enabled, hand this tensor a dedicated (non-scratch) device
// buffer; without `cublas`, this block compiles away and only the
// bookkeeping below runs.
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_assign_buffers_no_scratch(self.ptr.as_ptr());
}
// Register the tensor with its context so the accelerator copy is
// freed when the context is dropped (see `mark_as_offloaded`).
self.mark_as_offloaded();
})
}

Expand Down Expand Up @@ -222,13 +221,9 @@ impl Tensor {
/// If not, this is a no-op.
///
/// This is temporary while GGML improves their context memory management. This should only be called by
/// `Context` when it is dropped, as well as `llm`'s `InferenceSession`.
///
/// # Safety
///
/// This must be the last thing you do with this tensor. The only reason it's not `self` is because `Drop`
/// isn't `self`.
pub unsafe fn free_accelerator(&mut self) {
/// `Context` when it is dropped.
pub(crate) fn free_accelerator(self) {
println!("Freeing tensor {}", self.name());
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr());
Expand Down Expand Up @@ -266,4 +261,14 @@ impl Tensor {
self.ptr.as_mut().backend = backend.try_into().unwrap();
}
}

/// Registers this tensor in its owning context's offloaded-tensor map, so the
/// accelerator copy is freed automatically when the context is dropped.
fn mark_as_offloaded(&self) {
    // The context only holds a weak reference here; if it has already been
    // dropped there is nothing sensible to register against, so we panic.
    let tensors = self
        .offloaded_tensors
        .upgrade()
        .expect("Attempted to update a dropped context's offloaded tensors");
    let mut map = tensors.lock().unwrap();
    map.insert(self.name(), self.share());
}
}
7 changes: 2 additions & 5 deletions crates/llm-base/src/inference_session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,12 +632,9 @@ impl InferenceSession {

impl Drop for InferenceSession {
fn drop(&mut self) {
//if we are using an accelerator, we need to free the scratch memory and the k/v memory
// If we are using an accelerator, we need to free the scratch memory.
// The k/v memory is freed by the ctx0 destructor.
ggml::accelerator::free_scratch();
unsafe {
self.memory_k.free_accelerator();
self.memory_v.free_accelerator();
}
}
}

Expand Down

0 comments on commit c74e159

Please sign in to comment.