From ff83d3482daf092ee927680d1f6594165dd1afeb Mon Sep 17 00:00:00 2001
From: Orson Peters
Date: Fri, 19 Jan 2024 18:47:59 +0100
Subject: [PATCH] perf: apply string view GC more conservatively (#13850)

---
 crates/polars-arrow/src/array/binview/mod.rs | 24 ++++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs
index ecb8410660a5..769db31d736f 100644
--- a/crates/polars-arrow/src/array/binview/mod.rs
+++ b/crates/polars-arrow/src/array/binview/mod.rs
@@ -373,18 +373,28 @@ impl BinaryViewArrayGeneric {
     }
 
     pub fn maybe_gc(self) -> Self {
-        if self.total_buffer_len == 0 {
+        const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB.
+
+        if self.total_buffer_len <= GC_MINIMUM_SAVINGS {
             return self;
         }
+
+        // Subtract the maximum amount of inlined strings to get a lower bound
+        // on the number of buffer bytes needed (assuming no dedup).
         let total_bytes_len = self.total_bytes_len.load(Ordering::Relaxed) as usize;
-        // Subtract the maximum amount of inlined strings.
-        let min_in_buffer = total_bytes_len.saturating_sub(self.len() * 12);
-        let frac = (min_in_buffer as f64) / ((self.total_buffer_len() + 1) as f64);
+        let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12);
+
+        let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound;
+        let cur_mem_usage = self.len() * 16 + self.total_buffer_len();
+        let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc);
 
-        if frac < 0.25 {
-            return self.gc();
+        if savings_upper_bound >= GC_MINIMUM_SAVINGS
+            && cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc
+        {
+            self.gc()
+        } else {
+            self
         }
-        self
     }
 }
 