Skip to content
This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit

Permalink
refactor(ggml): offload_no_scratch auto-free
Browse files Browse the repository at this point in the history
  • Loading branch information
philpax committed Jul 15, 2023
1 parent e7ac55b commit c74e159
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion crates/ggml/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ impl Drop for Context {
// SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call.
unsafe {
// if we moved tensors to an accelerator we need to free them
for (_, mut tensor) in self.offloaded_tensors.lock().unwrap().drain() {
for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() {
if tensor.backend() != Backend::Cpu {
tensor.free_accelerator();
}
Expand Down
31 changes: 18 additions & 13 deletions crates/ggml/src/tensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,7 @@ impl Tensor {
sys::opencl::ggml_cl_transform_tensor(t.data(), t.ptr.as_ptr());
}

t.offloaded_tensors
.upgrade()
.expect("Attempted to update a dropped context's offloaded tensors")
.lock()
.unwrap()
.insert(t.name(), t.share());
t.mark_as_offloaded();
});
self
}
Expand All @@ -113,13 +108,17 @@ impl Tensor {
/// If not, this is a no-op.
///
/// It will not transfer the data. Use `transfer_to` for that.
///
/// This function also adds the tensor to the offloaded tensors map, so it will be freed automatically.
/// Because no scratch buffer is used, we can safely assume that this tensor will actually point to data.
// NOTE(review): `unused_variables` is presumably needed because without the
// `cublas` feature the closure body is empty — confirm against a non-CUDA build.
#[allow(unused_variables)]
pub fn offload_no_scratch(&self) {
self.with_alive_ctx(|| {
// With CUDA enabled, hand this tensor a dedicated (non-scratch) device
// buffer; without `cublas`, this block compiles away and only the
// bookkeeping below runs.
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_assign_buffers_no_scratch(self.ptr.as_ptr());
}
// Register the tensor with its context so the accelerator copy is
// freed when the context is dropped (see `mark_as_offloaded`).
self.mark_as_offloaded();
})
}

Expand Down Expand Up @@ -222,13 +221,9 @@ impl Tensor {
/// If not, this is a no-op.
///
/// This is temporary while GGML improves their context memory management. This should only be called by
/// `Context` when it is dropped, as well as `llm`'s `InferenceSession`.
///
/// # Safety
///
/// This must be the last thing you do with this tensor. The only reason it's not `self` is because `Drop`
/// isn't `self`.
pub unsafe fn free_accelerator(&mut self) {
/// `Context` when it is dropped.
pub(crate) fn free_accelerator(self) {
println!("Freeing tensor {}", self.name());
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_free_data(self.ptr.as_ptr());
Expand Down Expand Up @@ -266,4 +261,14 @@ impl Tensor {
self.ptr.as_mut().backend = backend.try_into().unwrap();
}
}

/// Registers this tensor in its owning context's offloaded-tensor map, so the
/// accelerator copy is freed automatically when the context is dropped.
fn mark_as_offloaded(&self) {
    // The context only holds a weak reference here; if it has already been
    // dropped there is nothing sensible to register against, so we panic.
    let tensors = self
        .offloaded_tensors
        .upgrade()
        .expect("Attempted to update a dropped context's offloaded tensors");
    let mut map = tensors.lock().unwrap();
    map.insert(self.name(), self.share());
}
}
7 changes: 2 additions & 5 deletions crates/llm-base/src/inference_session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,12 +632,9 @@ impl InferenceSession {

impl Drop for InferenceSession {
fn drop(&mut self) {
//if we are using an accelerator, we need to free the scratch memory and the k/v memory
// If we are using an accelerator, we need to free the scratch memory.
// The k/v memory is freed by the ctx0 destructor.
ggml::accelerator::free_scratch();
unsafe {
self.memory_k.free_accelerator();
self.memory_v.free_accelerator();
}
}
}

Expand Down

0 comments on commit c74e159

Please sign in to comment.