diff --git a/Cargo.lock b/Cargo.lock index 02f08064273b..ffd9bc55d0b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3382,7 +3382,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.9.0" +version = "1.10.0" dependencies = [ "jemallocator", "libc", diff --git a/crates/polars-arrow/src/array/fixed_size_list/mod.rs b/crates/polars-arrow/src/array/fixed_size_list/mod.rs index b8340825d0c7..32267cc5a4b7 100644 --- a/crates/polars-arrow/src/array/fixed_size_list/mod.rs +++ b/crates/polars-arrow/src/array/fixed_size_list/mod.rs @@ -1,4 +1,4 @@ -use super::{new_empty_array, new_null_array, Array, Splitable}; +use super::{new_empty_array, new_null_array, Array, ArrayRef, Splitable}; use crate::bitmap::Bitmap; use crate::datatypes::{ArrowDataType, Field}; @@ -9,8 +9,11 @@ mod iterator; mod mutable; pub use mutable::*; use polars_error::{polars_bail, polars_ensure, PolarsResult}; +use polars_utils::format_tuple; use polars_utils::pl_str::PlSmallStr; +use crate::datatypes::reshape::{Dimension, ReshapeDimension}; + /// The Arrow's equivalent to an immutable `Vec>` where `T` is an Arrow type. /// Cloning and slicing this struct is `O(1)`. #[derive(Clone)] @@ -120,6 +123,108 @@ impl FixedSizeListArray { let values = new_null_array(field.dtype().clone(), length * size); Self::new(dtype, length, values, Some(Bitmap::new_zeroed(length))) } + + pub fn from_shape( + leaf_array: ArrayRef, + dimensions: &[ReshapeDimension], + ) -> PolarsResult { + polars_ensure!( + !dimensions.is_empty(), + InvalidOperation: "at least one dimension must be specified" + ); + let size = leaf_array.len(); + + let mut total_dim_size = 1; + let mut num_infers = 0; + for &dim in dimensions { + match dim { + ReshapeDimension::Infer => num_infers += 1, + ReshapeDimension::Specified(dim) => total_dim_size *= dim.get() as usize, + } + } + + polars_ensure!(num_infers <= 1, InvalidOperation: "can only specify one inferred dimension"); + + if size == 0 { + polars_ensure!( + num_infers > 0 || total_dim_size == 0, + InvalidOperation: "cannot reshape empty array into shape without zero dimension: {}", + format_tuple!(dimensions), + ); + + let mut prev_arrow_dtype = leaf_array.dtype().clone(); + let mut prev_array = leaf_array; + + // @NOTE: We need to collect the iterator here because it is lazily processed. + let mut current_length = dimensions[0].get_or_infer(0); + let len_iter = dimensions[1..] + .iter() + .map(|d| { + let length = current_length as usize; + current_length *= d.get_or_infer(0); + length + }) + .collect::>(); + + // We pop the outer dimension as that is the height of the series. + for (dim, length) in dimensions[1..].iter().zip(len_iter).rev() { + // Infer dimension if needed + let dim = dim.get_or_infer(0); + prev_arrow_dtype = prev_arrow_dtype.to_fixed_size_list(dim as usize, true); + + prev_array = + FixedSizeListArray::new(prev_arrow_dtype.clone(), length, prev_array, None) + .boxed(); + } + + return Ok(prev_array); + } + + polars_ensure!( + total_dim_size > 0, + InvalidOperation: "cannot reshape non-empty array into shape containing a zero dimension: {}", + format_tuple!(dimensions) + ); + + polars_ensure!( + size % total_dim_size == 0, + InvalidOperation: "cannot reshape array of size {} into shape {}", size, format_tuple!(dimensions) + ); + + let mut prev_arrow_dtype = leaf_array.dtype().clone(); + let mut prev_array = leaf_array; + + // We pop the outer dimension as that is the height of the series. + for dim in dimensions[1..].iter().rev() { + // Infer dimension if needed + let dim = dim.get_or_infer((size / total_dim_size) as u64); + prev_arrow_dtype = prev_arrow_dtype.to_fixed_size_list(dim as usize, true); + + prev_array = FixedSizeListArray::new( + prev_arrow_dtype.clone(), + prev_array.len() / dim as usize, + prev_array, + None, + ) + .boxed(); + } + Ok(prev_array) + } + + pub fn get_dims(&self) -> Vec { + let mut dims = vec![ + Dimension::new(self.length as _), + Dimension::new(self.size as _), + ]; + + let mut prev_array = &self.values; + + while let Some(a) = prev_array.as_any().downcast_ref::() { + dims.push(Dimension::new(a.size as _)); + prev_array = &a.values; + } + dims + } } // must use @@ -144,6 +249,7 @@ impl FixedSizeListArray { /// # Safety /// The caller must ensure that `offset + length <= self.len()`. pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { + debug_assert!(offset + length <= self.len()); self.validity = self .validity .take() diff --git a/crates/polars-arrow/src/array/growable/list.rs b/crates/polars-arrow/src/array/growable/list.rs index 90e4f15020a6..095f39522da4 100644 --- a/crates/polars-arrow/src/array/growable/list.rs +++ b/crates/polars-arrow/src/array/growable/list.rs @@ -14,7 +14,7 @@ unsafe fn extend_offset_values( start: usize, len: usize, ) { - let array = growable.arrays[index]; + let array = growable.arrays.get_unchecked_release(index); let offsets = array.offsets(); growable diff --git a/crates/polars-arrow/src/compute/take/fixed_size_list.rs b/crates/polars-arrow/src/compute/take/fixed_size_list.rs index 84f15bd44791..2d1e2b082dc3 100644 --- a/crates/polars-arrow/src/compute/take/fixed_size_list.rs +++ b/crates/polars-arrow/src/compute/take/fixed_size_list.rs @@ -15,12 +15,18 @@ // specific language governing permissions and limitations // under the License. +use polars_utils::itertools::Itertools; + use super::Index; use crate::array::growable::{Growable, GrowableFixedSizeList}; -use crate::array::{FixedSizeListArray, PrimitiveArray}; +use crate::array::{Array, ArrayRef, FixedSizeListArray, PrimitiveArray}; +use crate::bitmap::MutableBitmap; +use crate::datatypes::reshape::{Dimension, ReshapeDimension}; +use crate::datatypes::{ArrowDataType, PhysicalType}; +use crate::legacy::prelude::FromData; +use crate::with_match_primitive_type; -/// `take` implementation for FixedSizeListArrays -pub(super) unsafe fn take_unchecked( +pub(super) unsafe fn take_unchecked_slow( values: &FixedSizeListArray, indices: &PrimitiveArray, ) -> FixedSizeListArray { @@ -31,7 +37,7 @@ pub(super) unsafe fn take_unchecked( .iter() .map(|index| { let index = index.to_usize(); - let slice = values.clone().sliced(index, take_len); + let slice = values.clone().sliced_unchecked(index, take_len); capacity += slice.len(); slice }) @@ -62,3 +68,168 @@ pub(super) unsafe fn take_unchecked( growable.into() } } + +fn get_stride_and_leaf_type(dtype: &ArrowDataType, size: usize) -> (usize, &ArrowDataType) { + if let ArrowDataType::FixedSizeList(inner, size_inner) = dtype { + get_stride_and_leaf_type(inner.dtype(), *size_inner * size) + } else { + (size, dtype) + } +} + +fn get_leaves(array: &FixedSizeListArray) -> &dyn Array { + if let Some(array) = array.values().as_any().downcast_ref::() { + get_leaves(array) + } else { + &**array.values() + } +} + +fn get_buffer_and_size(array: &dyn Array) -> (&[u8], usize) { + match array.dtype().to_physical_type() { + PhysicalType::Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + + let arr = array.as_any().downcast_ref::>().unwrap(); + let values = arr.values(); + (bytemuck::cast_slice(values), size_of::<$T>()) + + }), + _ => { + unimplemented!() + }, + } +} + +unsafe fn from_buffer(mut buf: Vec, dtype: &ArrowDataType) -> ArrayRef { + assert_eq!(buf.as_ptr().align_offset(256), 0); + + match dtype.to_physical_type() { + PhysicalType::Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + + let ptr = buf.as_mut_ptr(); + let len_units = buf.len(); + let cap_units = buf.capacity(); + + std::mem::forget(buf); + + let buf = Vec::from_raw_parts( + ptr as *mut $T, + len_units / size_of::<$T>(), + cap_units / size_of::<$T>(), + ); + + PrimitiveArray::from_data_default(buf.into(), None).boxed() + + }), + _ => { + unimplemented!() + }, + } +} + +// Use an alignedvec so the alignment always fits the actual type +// That way we can operate on bytes and reduce monomorphization. +#[repr(C, align(256))] +struct Align256([u8; 256]); + +unsafe fn aligned_vec(n_bytes: usize) -> Vec { + // Lazy math to ensure we always have enough. + let n_units = (n_bytes / size_of::()) + 1; + + let mut aligned: Vec = Vec::with_capacity(n_units); + + let ptr = aligned.as_mut_ptr(); + let len_units = aligned.len(); + let cap_units = aligned.capacity(); + + std::mem::forget(aligned); + + Vec::from_raw_parts( + ptr as *mut u8, + len_units * size_of::(), + cap_units * size_of::(), + ) +} + +fn no_inner_validities(values: &ArrayRef) -> bool { + if let Some(arr) = values.as_any().downcast_ref::() { + arr.validity().is_none() && no_inner_validities(arr.values()) + } else { + values.validity().is_none() + } +} + +/// `take` implementation for FixedSizeListArrays +pub(super) unsafe fn take_unchecked( + values: &FixedSizeListArray, + indices: &PrimitiveArray, +) -> ArrayRef { + let (stride, leaf_type) = get_stride_and_leaf_type(values.dtype(), 1); + if leaf_type.to_physical_type().is_primitive() && no_inner_validities(values.values()) { + let leaves = get_leaves(values); + + let (leaves_buf, leave_size) = get_buffer_and_size(leaves); + let bytes_per_element = leave_size * stride; + + let n_idx = indices.len(); + let total_bytes = bytes_per_element * n_idx; + + let mut buf = aligned_vec(total_bytes); + let dst = buf.spare_capacity_mut(); + + let mut count = 0; + let validity = if indices.null_count() == 0 { + for i in indices.values().iter() { + let i = i.to_usize(); + + std::ptr::copy_nonoverlapping( + leaves_buf.as_ptr().add(i * bytes_per_element), + dst.as_mut_ptr().add(count * bytes_per_element) as *mut _, + bytes_per_element, + ); + count += 1; + } + None + } else { + let mut new_validity = MutableBitmap::with_capacity(indices.len()); + new_validity.extend_constant(indices.len(), true); + for i in indices.iter() { + if let Some(i) = i { + let i = i.to_usize(); + std::ptr::copy_nonoverlapping( + leaves_buf.as_ptr().add(i * bytes_per_element), + dst.as_mut_ptr().add(count * bytes_per_element) as *mut _, + bytes_per_element, + ); + } else { + new_validity.set_unchecked(count, false); + std::ptr::write_bytes( + dst.as_mut_ptr().add(count * bytes_per_element) as *mut _, + 0, + bytes_per_element, + ); + } + + count += 1; + } + Some(new_validity.freeze()) + }; + assert_eq!(count * bytes_per_element, total_bytes); + + buf.set_len(total_bytes); + + let leaves = from_buffer(buf, leaves.dtype()); + let mut shape = values.get_dims(); + shape[0] = Dimension::new(indices.len() as _); + let shape = shape + .into_iter() + .map(ReshapeDimension::Specified) + .collect_vec(); + + FixedSizeListArray::from_shape(leaves.clone(), &shape) + .unwrap() + .with_validity(validity) + } else { + take_unchecked_slow(values, indices).boxed() + } +} diff --git a/crates/polars-arrow/src/compute/take/mod.rs b/crates/polars-arrow/src/compute/take/mod.rs index aed14823af1e..bdd782a1d609 100644 --- a/crates/polars-arrow/src/compute/take/mod.rs +++ b/crates/polars-arrow/src/compute/take/mod.rs @@ -68,7 +68,7 @@ pub unsafe fn take_unchecked(values: &dyn Array, indices: &IdxArr) -> Box { let array = values.as_any().downcast_ref().unwrap(); - Box::new(fixed_size_list::take_unchecked(array, indices)) + fixed_size_list::take_unchecked(array, indices) }, BinaryView => { take_binview_unchecked(values.as_any().downcast_ref().unwrap(), indices).boxed() diff --git a/crates/polars-arrow/src/datatypes/mod.rs b/crates/polars-arrow/src/datatypes/mod.rs index c609ffbe432f..0c0b7024bc71 100644 --- a/crates/polars-arrow/src/datatypes/mod.rs +++ b/crates/polars-arrow/src/datatypes/mod.rs @@ -2,6 +2,7 @@ mod field; mod physical_type; +pub mod reshape; mod schema; use std::collections::BTreeMap; @@ -365,6 +366,25 @@ impl ArrowDataType { matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView) } + pub fn is_numeric(&self) -> bool { + use ArrowDataType as D; + matches!( + self, + D::Int8 + | D::Int16 + | D::Int32 + | D::Int64 + | D::UInt8 + | D::UInt16 + | D::UInt32 + | D::UInt64 + | D::Float32 + | D::Float64 + | D::Decimal(_, _) + | D::Decimal256(_, _) + ) + } + pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType { ArrowDataType::FixedSizeList( Box::new(Field::new( diff --git a/crates/polars-arrow/src/datatypes/physical_type.rs b/crates/polars-arrow/src/datatypes/physical_type.rs index 174c0401ca3f..732a129055a6 100644 --- a/crates/polars-arrow/src/datatypes/physical_type.rs +++ b/crates/polars-arrow/src/datatypes/physical_type.rs @@ -57,6 +57,10 @@ impl PhysicalType { false } } + + pub fn is_primitive(&self) -> bool { + matches!(self, Self::Primitive(_)) + } } /// the set of valid indices types of a dictionary-encoded Array. diff --git a/crates/polars-core/src/datatypes/reshape.rs b/crates/polars-arrow/src/datatypes/reshape.rs similarity index 100% rename from crates/polars-core/src/datatypes/reshape.rs rename to crates/polars-arrow/src/datatypes/reshape.rs diff --git a/crates/polars-arrow/src/util/macros.rs b/crates/polars-arrow/src/util/macros.rs index b09a9d5d5473..fb5bd61ebba0 100644 --- a/crates/polars-arrow/src/util/macros.rs +++ b/crates/polars-arrow/src/util/macros.rs @@ -13,6 +13,7 @@ macro_rules! with_match_primitive_type {( UInt16 => __with_ty__! { u16 }, UInt32 => __with_ty__! { u32 }, UInt64 => __with_ty__! { u64 }, + Int128 => __with_ty__! { i128 }, Float32 => __with_ty__! { f32 }, Float64 => __with_ty__! { f64 }, _ => panic!("operator does not support primitive `{:?}`", diff --git a/crates/polars-core/src/chunked_array/ops/append.rs b/crates/polars-core/src/chunked_array/ops/append.rs index 383c76d63600..1cbf0da390e7 100644 --- a/crates/polars-core/src/chunked_array/ops/append.rs +++ b/crates/polars-core/src/chunked_array/ops/append.rs @@ -132,7 +132,7 @@ where impl ChunkedArray where - T: PolarsDataType, + T: PolarsDataType, for<'a> T::Physical<'a>: TotalOrd, { /// Append in place. This is done by adding the chunks of `other` to this [`ChunkedArray`]. diff --git a/crates/polars-core/src/chunked_array/ops/gather.rs b/crates/polars-core/src/chunked_array/ops/gather.rs index cb24305f75f6..fc162626bc27 100644 --- a/crates/polars-core/src/chunked_array/ops/gather.rs +++ b/crates/polars-core/src/chunked_array/ops/gather.rs @@ -143,7 +143,7 @@ unsafe fn gather_idx_array_unchecked( impl + ?Sized> ChunkTakeUnchecked for ChunkedArray where - T: PolarsDataType, + T: PolarsDataType, { /// Gather values from ChunkedArray by index. unsafe fn take_unchecked(&self, indices: &I) -> Self { @@ -178,7 +178,7 @@ pub fn _update_gather_sorted_flag(sorted_arr: IsSorted, sorted_idx: IsSorted) -> impl ChunkTakeUnchecked for ChunkedArray where - T: PolarsDataType, + T: PolarsDataType, { /// Gather values from ChunkedArray by index. unsafe fn take_unchecked(&self, indices: &IdxCa) -> Self { @@ -312,3 +312,47 @@ impl IdxCa { f(&ca) } } + +#[cfg(feature = "dtype-array")] +impl ChunkTakeUnchecked for ArrayChunked { + unsafe fn take_unchecked(&self, indices: &IdxCa) -> Self { + let a = self.rechunk(); + let index = indices.rechunk(); + + let chunks = a + .downcast_iter() + .zip(index.downcast_iter()) + .map(|(arr, idx)| take_unchecked(arr, idx)) + .collect::>(); + self.copy_with_chunks(chunks) + } +} + +#[cfg(feature = "dtype-array")] +impl + ?Sized> ChunkTakeUnchecked for ArrayChunked { + unsafe fn take_unchecked(&self, indices: &I) -> Self { + let idx = IdxCa::mmap_slice(PlSmallStr::EMPTY, indices.as_ref()); + self.take_unchecked(&idx) + } +} + +impl ChunkTakeUnchecked for ListChunked { + unsafe fn take_unchecked(&self, indices: &IdxCa) -> Self { + let a = self.rechunk(); + let index = indices.rechunk(); + + let chunks = a + .downcast_iter() + .zip(index.downcast_iter()) + .map(|(arr, idx)| take_unchecked(arr, idx)) + .collect::>(); + self.copy_with_chunks(chunks) + } +} + +impl + ?Sized> ChunkTakeUnchecked for ListChunked { + unsafe fn take_unchecked(&self, indices: &I) -> Self { + let idx = IdxCa::mmap_slice(PlSmallStr::EMPTY, indices.as_ref()); + self.take_unchecked(&idx) + } +} diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index 712466482ce2..8d84d47be978 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -13,7 +13,6 @@ mod any_value; mod dtype; mod field; mod into_scalar; -mod reshape; #[cfg(feature = "object")] mod static_array_collect; mod time_unit; @@ -26,6 +25,7 @@ use std::ops::{Add, AddAssign, Div, Mul, Rem, Sub, SubAssign}; pub use aliases::*; pub use any_value::*; pub use arrow::array::{ArrayCollectIterExt, ArrayFromIter, ArrayFromIterDtype, StaticArray}; +pub use arrow::datatypes::reshape::*; #[cfg(feature = "dtype-categorical")] use arrow::datatypes::IntegerType; pub use arrow::datatypes::{ArrowDataType, TimeUnit as ArrowTimeUnit}; @@ -42,7 +42,6 @@ use polars_utils::abs_diff::AbsDiff; use polars_utils::float::IsFloat; use polars_utils::min_max::MinMax; use polars_utils::nulls::IsNull; -pub use reshape::*; #[cfg(feature = "serde")] use serde::de::{EnumAccess, Error, Unexpected, VariantAccess, Visitor}; #[cfg(any(feature = "serde", feature = "serde-lazy"))] @@ -300,7 +299,7 @@ unsafe impl PolarsDataType for ObjectType { type OwnedPhysical = T; type ZeroablePhysical<'a> = Option<&'a T>; type Array = ObjectArray; - type IsNested = TrueT; + type IsNested = FalseT; type HasViews = FalseT; type IsStruct = FalseT; type IsObject = TrueT; diff --git a/crates/polars-core/src/series/ops/reshape.rs b/crates/polars-core/src/series/ops/reshape.rs index 89773abf794a..85998aa54de3 100644 --- a/crates/polars-core/src/series/ops/reshape.rs +++ b/crates/polars-core/src/series/ops/reshape.rs @@ -116,7 +116,7 @@ impl Series { InvalidOperation: "at least one dimension must be specified" ); - let leaf_array = self.get_leaf_array(); + let leaf_array = self.get_leaf_array().rechunk(); let size = leaf_array.len(); let mut total_dim_size = 1; diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index a8c304b8f65c..520f6bee729d 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -191,7 +191,7 @@ impl<'a> CoreReader<'a> { if let Some(b) = decompress(&reader_bytes, total_n_rows, separator, quote_char, eol_char) { - reader_bytes = ReaderBytes::Owned(b); + reader_bytes = ReaderBytes::Owned(b.into()); } } @@ -467,16 +467,10 @@ impl<'a> CoreReader<'a> { continue; } - let b = unsafe { - bytes.get_unchecked_release(total_offset..total_offset + position) - }; - // The parsers will not create a null row if we end on a new line. - if b.last() == Some(&self.eol_char) { - chunk_size *= 2; - continue; - } + let end = total_offset + position + 1; + let b = unsafe { bytes.get_unchecked_release(total_offset..end) }; - total_offset += position + 1; + total_offset = end; (b, count) }; let check_utf8 = matches!(self.encoding, CsvEncoding::Utf8) diff --git a/crates/polars-io/src/csv/read/schema_inference.rs b/crates/polars-io/src/csv/read/schema_inference.rs index 3684df1a2bac..a01ec5ddef3f 100644 --- a/crates/polars-io/src/csv/read/schema_inference.rs +++ b/crates/polars-io/src/csv/read/schema_inference.rs @@ -296,7 +296,7 @@ fn infer_file_schema_inner( buf.push(eol_char); return infer_file_schema_inner( - &ReaderBytes::Owned(buf), + &ReaderBytes::Owned(buf.into()), separator, max_read_rows, has_header, @@ -481,7 +481,7 @@ fn infer_file_schema_inner( rb.extend_from_slice(reader_bytes); rb.push(eol_char); return infer_file_schema_inner( - &ReaderBytes::Owned(rb), + &ReaderBytes::Owned(rb.into()), separator, max_read_rows, has_header, diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs index df91f32942f9..2373257469e7 100644 --- a/crates/polars-io/src/mmap.rs +++ b/crates/polars-io/src/mmap.rs @@ -1,9 +1,8 @@ use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek}; -use std::sync::Arc; use polars_core::config::verbose; -use polars_utils::mmap::{MMapSemaphore, MemSlice}; +use polars_utils::mmap::MemSlice; /// Trait used to get a hold to file handler or to the underlying bytes /// without performing a Read. @@ -67,8 +66,7 @@ impl MmapBytesReader for &mut T { // Handle various forms of input bytes pub enum ReaderBytes<'a> { Borrowed(&'a [u8]), - Owned(Vec), - Mapped(MMapSemaphore, &'a File), + Owned(MemSlice), } impl std::ops::Deref for ReaderBytes<'_> { @@ -77,19 +75,21 @@ impl std::ops::Deref for ReaderBytes<'_> { match self { Self::Borrowed(ref_bytes) => ref_bytes, Self::Owned(vec) => vec, - Self::Mapped(mmap, _) => mmap.as_ref(), } } } -/// Require 'static to force the caller to do any transmute as it's usually much -/// clearer to see there whether it's sound. +/// There are some places that perform manual lifetime management after transmuting `ReaderBytes` +/// to have a `'static` inner lifetime. The advantage to doing this is that it lets you construct a +/// `MemSlice` from the `ReaderBytes` in a zero-copy manner regardless of the underlying enum +/// variant. impl ReaderBytes<'static> { - pub fn into_mem_slice(self) -> MemSlice { + /// Construct a `MemSlice` in a zero-copy manner from the underlying bytes, with the assumption + /// that the underlying bytes have a `'static` lifetime. + pub fn to_memslice(&self) -> MemSlice { match self { ReaderBytes::Borrowed(v) => MemSlice::from_static(v), - ReaderBytes::Owned(v) => MemSlice::from_vec(v), - ReaderBytes::Mapped(v, _) => MemSlice::from_mmap(Arc::new(v)), + ReaderBytes::Owned(v) => v.clone(), } } } @@ -104,16 +104,14 @@ impl<'a, T: 'a + MmapBytesReader> From<&'a mut T> for ReaderBytes<'a> { }, None => { if let Some(f) = m.to_file() { - let f = unsafe { std::mem::transmute::<&File, &'a File>(f) }; - let mmap = MMapSemaphore::new_from_file(f).unwrap(); - ReaderBytes::Mapped(mmap, f) + ReaderBytes::Owned(MemSlice::from_file(f).unwrap()) } else { if verbose() { eprintln!("could not memory map file; read to buffer.") } let mut buf = vec![]; m.read_to_end(&mut buf).expect("could not read"); - ReaderBytes::Owned(buf) + ReaderBytes::Owned(MemSlice::from_vec(buf)) } }, } diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 7d926e434a72..aa86bfccce51 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -15,7 +15,6 @@ use polars_parquet::parquet::statistics::Statistics; use polars_parquet::read::{ self, ColumnChunkMetadata, FileMetadata, Filter, PhysicalType, RowGroupMetadata, }; -use polars_utils::mmap::MemSlice; use rayon::prelude::*; #[cfg(feature = "cloud")] @@ -908,10 +907,9 @@ pub fn read_parquet( } let reader = ReaderBytes::from(&mut reader); - let store = mmap::ColumnStore::Local( - unsafe { std::mem::transmute::, ReaderBytes<'static>>(reader) } - .into_mem_slice(), - ); + let store = mmap::ColumnStore::Local(unsafe { + std::mem::transmute::, ReaderBytes<'static>>(reader).to_memslice() + }); let dfs = rg_to_dfs( &store, @@ -959,9 +957,7 @@ impl FetchRowGroupsFromMmapReader { fn fetch_row_groups(&mut self, _row_groups: Range) -> PolarsResult { // @TODO: we can something smarter here with mmap - Ok(mmap::ColumnStore::Local(MemSlice::from_vec( - self.0.deref().to_vec(), - ))) + Ok(mmap::ColumnStore::Local(self.0.to_memslice())) } } diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 023d61fe525b..7033d55e1b0b 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -6,14 +6,14 @@ use once_cell::sync::Lazy; use polars_core::prelude::*; #[cfg(any(feature = "ipc_streaming", feature = "parquet"))] use polars_core::utils::{accumulate_dataframes_vertical_unchecked, split_df_as_ref}; -use polars_utils::mmap::MMapSemaphore; +use polars_utils::mmap::{MMapSemaphore, MemSlice}; use regex::{Regex, RegexBuilder}; use crate::mmap::{MmapBytesReader, ReaderBytes}; -pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>( - reader: &'a mut R, -) -> PolarsResult> { +pub fn get_reader_bytes( + reader: &mut R, +) -> PolarsResult> { // we have a file so we can mmap // only seekable files are mmap-able if let Some((file, offset)) = reader @@ -23,14 +23,8 @@ pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>( { let mut options = memmap::MmapOptions::new(); options.offset(offset); - - // somehow bck thinks borrows alias - // this is sound as file was already bound to 'a - use std::fs::File; - - let file = unsafe { std::mem::transmute::<&File, &'a File>(file) }; let mmap = MMapSemaphore::new_from_file_with_options(file, options)?; - Ok(ReaderBytes::Mapped(mmap, file)) + Ok(ReaderBytes::Owned(MemSlice::from_mmap(Arc::new(mmap)))) } else { // we can get the bytes for free if reader.to_bytes().is_some() { @@ -40,7 +34,7 @@ pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>( // we have to read to an owned buffer to get the bytes. let mut bytes = Vec::with_capacity(1024 * 128); reader.read_to_end(&mut bytes)?; - Ok(ReaderBytes::Owned(bytes)) + Ok(ReaderBytes::Owned(bytes.into())) } } } diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index dfe3ba1a3ddf..eb2cf3a228c1 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -46,7 +46,8 @@ impl ToDummies for Series { }) .collect::>(); - Ok(unsafe { DataFrame::new_no_checks_height_from_first(sort_columns(columns)) }) + // SAFETY: `dummies_helper` functions preserve `self.len()` length + unsafe { DataFrame::new_no_length_checks(sort_columns(columns)) } } } diff --git a/crates/polars-parquet/src/parquet/statistics/mod.rs b/crates/polars-parquet/src/parquet/statistics/mod.rs index b72e2a7c94b1..03335c27817b 100644 --- a/crates/polars-parquet/src/parquet/statistics/mod.rs +++ b/crates/polars-parquet/src/parquet/statistics/mod.rs @@ -11,7 +11,6 @@ pub use primitive::PrimitiveStatistics; use crate::parquet::error::ParquetResult; use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; pub use crate::parquet::thrift_format::Statistics as ParquetStatistics; -use crate::read::ParquetError; #[derive(Debug, PartialEq)] pub enum Statistics { @@ -42,6 +41,34 @@ impl Statistics { } } + pub fn clear_min(&mut self) { + use Statistics as S; + match self { + S::Binary(s) => _ = s.min_value.take(), + S::Boolean(s) => _ = s.min_value.take(), + S::FixedLen(s) => _ = s.min_value.take(), + S::Int32(s) => _ = s.min_value.take(), + S::Int64(s) => _ = s.min_value.take(), + S::Int96(s) => _ = s.min_value.take(), + S::Float(s) => _ = s.min_value.take(), + S::Double(s) => _ = s.min_value.take(), + }; + } + + pub fn clear_max(&mut self) { + use Statistics as S; + match self { + S::Binary(s) => _ = s.max_value.take(), + S::Boolean(s) => _ = s.max_value.take(), + S::FixedLen(s) => _ = s.max_value.take(), + S::Int32(s) => _ = s.max_value.take(), + S::Int64(s) => _ = s.max_value.take(), + S::Int96(s) => _ = s.max_value.take(), + S::Float(s) => _ = s.max_value.take(), + S::Double(s) => _ = s.max_value.take(), + }; + } + /// Deserializes a raw parquet statistics into [`Statistics`]. /// # Error /// This function errors if it is not possible to read the statistics to the @@ -51,19 +78,8 @@ impl Statistics { statistics: &ParquetStatistics, primitive_type: PrimitiveType, ) -> ParquetResult { - if statistics.is_min_value_exact.is_some() { - return Err(ParquetError::not_supported( - "is_min_value_exact in statistics", - )); - } - if statistics.is_max_value_exact.is_some() { - return Err(ParquetError::not_supported( - "is_max_value_exact in statistics", - )); - } - use {PhysicalType as T, PrimitiveStatistics as PrimStat}; - Ok(match primitive_type.physical_type { + let mut stats: Self = match primitive_type.physical_type { T::ByteArray => BinaryStatistics::deserialize(statistics, primitive_type)?.into(), T::Boolean => BooleanStatistics::deserialize(statistics)?.into(), T::Int32 => PrimStat::::deserialize(statistics, primitive_type)?.into(), @@ -74,7 +90,16 @@ impl Statistics { T::FixedLenByteArray(size) => { FixedLenStatistics::deserialize(statistics, size, primitive_type)?.into() }, - }) + }; + + if statistics.is_min_value_exact.is_some_and(|v| !v) { + stats.clear_min(); + } + if statistics.is_max_value_exact.is_some_and(|v| !v) { + stats.clear_max(); + } + + Ok(stats) } } diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs index 7a84c6990fda..55d5501dd44e 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs @@ -511,6 +511,21 @@ impl ProjectionPushDown { file_options.row_index = None; } }; + + if let Some(col_name) = &file_options.include_file_paths { + if output_schema + .as_ref() + .map_or(false, |schema| !schema.contains(col_name)) + { + // Need to remove it from the input schema so + // that projection indices are correct. + let mut file_schema = Arc::unwrap_or_clone(file_info.schema); + file_schema.shift_remove(col_name); + file_info.schema = Arc::new(file_schema); + file_options.include_file_paths = None; + } + }; + let lp = Scan { sources, file_info, diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml index 9db54d1c3333..b5b875403b6d 100644 --- a/crates/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -24,7 +24,6 @@ rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlparser = { workspace = true } -# sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs.git", rev = "ae3b5844c839072c235965fe0d1bddc473dced87" } [dev-dependencies] # to display dataframes in case of test failures @@ -34,6 +33,7 @@ polars-core = { workspace = true, features = ["fmt"] } default = [] nightly = [] binary_encoding = ["polars-lazy/binary_encoding"] +bitwise = ["polars-lazy/bitwise"] csv = ["polars-lazy/csv"] diagonal_concat = ["polars-lazy/diagonal_concat"] dtype-decimal = ["polars-lazy/dtype-decimal"] diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 342a5e0883d2..1a060545439c 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -709,6 +709,11 @@ impl SQLContext { }; lf = if group_by_keys.is_empty() { + // The 'having' clause is only valid inside 'group by' + if select_stmt.having.is_some() { + polars_bail!(SQLSyntax: "HAVING clause not valid outside of GROUP BY; found:\n{:?}", select_stmt.having); + }; + // Final/selected cols, accounting for 'SELECT *' modifiers let mut retained_cols = Vec::with_capacity(projections.len()); let have_order_by = query.order_by.is_some(); diff --git a/crates/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs index 5f6c311a5e15..ca4ae3504f71 100644 --- a/crates/polars-sql/src/functions.rs +++ b/crates/polars-sql/src/functions.rs @@ -30,11 +30,40 @@ pub(crate) struct SQLFunctionVisitor<'a> { /// SQL functions that are supported by Polars pub(crate) enum PolarsSQLFunctions { + // ---- + // Bitwise functions + // ---- + /// SQL 'bit_and' function. + /// Returns the bitwise AND of the input expressions. + /// ```sql + /// SELECT BIT_AND(column_1, column_2) FROM df; + /// ``` + BitAnd, + /// SQL 'bit_count' function. + /// Returns the number of set bits in the input expression. + /// ```sql + /// SELECT BIT_COUNT(column_1) FROM df; + /// ``` + #[cfg(feature = "bitwise")] + BitCount, + /// SQL 'bit_or' function. + /// Returns the bitwise OR of the input expressions. + /// ```sql + /// SELECT BIT_OR(column_1, column_2) FROM df; + /// ``` + BitOr, + /// SQL 'bit_xor' function. + /// Returns the bitwise XOR of the input expressions. + /// ```sql + /// SELECT BIT_XOR(column_1, column_2) FROM df; + /// ``` + BitXor, + // ---- // Math functions // ---- /// SQL 'abs' function - /// Returns the absolute value of the input column. + /// Returns the absolute value of the input expression. /// ```sql /// SELECT ABS(column_1) FROM df; /// ``` @@ -142,67 +171,67 @@ pub(crate) enum PolarsSQLFunctions { // Trig functions // ---- /// SQL 'cos' function - /// Compute the cosine sine of the input column (in radians). + /// Compute the cosine sine of the input expression (in radians). /// ```sql /// SELECT COS(column_1) FROM df; /// ``` Cos, /// SQL 'cot' function - /// Compute the cotangent of the input column (in radians). + /// Compute the cotangent of the input expression (in radians). /// ```sql /// SELECT COT(column_1) FROM df; /// ``` Cot, /// SQL 'sin' function - /// Compute the sine of the input column (in radians). + /// Compute the sine of the input expression (in radians). /// ```sql /// SELECT SIN(column_1) FROM df; /// ``` Sin, /// SQL 'tan' function - /// Compute the tangent of the input column (in radians). + /// Compute the tangent of the input expression (in radians). /// ```sql /// SELECT TAN(column_1) FROM df; /// ``` Tan, /// SQL 'cosd' function - /// Compute the cosine sine of the input column (in degrees). + /// Compute the cosine sine of the input expression (in degrees). /// ```sql /// SELECT COSD(column_1) FROM df; /// ``` CosD, /// SQL 'cotd' function - /// Compute cotangent of the input column (in degrees). + /// Compute cotangent of the input expression (in degrees). /// ```sql /// SELECT COTD(column_1) FROM df; /// ``` CotD, /// SQL 'sind' function - /// Compute the sine of the input column (in degrees). + /// Compute the sine of the input expression (in degrees). /// ```sql /// SELECT SIND(column_1) FROM df; /// ``` SinD, /// SQL 'tand' function - /// Compute the tangent of the input column (in degrees). + /// Compute the tangent of the input expression (in degrees). /// ```sql /// SELECT TAND(column_1) FROM df; /// ``` TanD, /// SQL 'acos' function - /// Compute inverse cosinus of the input column (in radians). + /// Compute inverse cosinus of the input expression (in radians). /// ```sql /// SELECT ACOS(column_1) FROM df; /// ``` Acos, /// SQL 'asin' function - /// Compute inverse sine of the input column (in radians). + /// Compute inverse sine of the input expression (in radians). /// ```sql /// SELECT ASIN(column_1) FROM df; /// ``` Asin, /// SQL 'atan' function - /// Compute inverse tangent of the input column (in radians). + /// Compute inverse tangent of the input expression (in radians). /// ```sql /// SELECT ATAN(column_1) FROM df; /// ``` @@ -214,19 +243,19 @@ pub(crate) enum PolarsSQLFunctions { /// ``` Atan2, /// SQL 'acosd' function - /// Compute inverse cosinus of the input column (in degrees). + /// Compute inverse cosinus of the input expression (in degrees). /// ```sql /// SELECT ACOSD(column_1) FROM df; /// ``` AcosD, /// SQL 'asind' function - /// Compute inverse sine of the input column (in degrees). + /// Compute inverse sine of the input expression (in degrees). /// ```sql /// SELECT ASIND(column_1) FROM df; /// ``` AsinD, /// SQL 'atand' function - /// Compute inverse tangent of the input column (in degrees). + /// Compute inverse tangent of the input expression (in degrees). /// ```sql /// SELECT ATAND(column_1) FROM df; /// ``` @@ -656,7 +685,11 @@ impl PolarsSQLFunctions { "atan2d", "atand", "avg", + "bit_and", + "bit_count", "bit_length", + "bit_or", + "bit_xor", "cbrt", "ceil", "ceiling", @@ -741,6 +774,15 @@ impl PolarsSQLFunctions { fn try_from_sql(function: &'_ SQLFunction, ctx: &'_ SQLContext) -> PolarsResult { let function_name = function.name.0[0].value.to_lowercase(); Ok(match function_name.as_str() { + // ---- + // Bitwise functions + // ---- + "bit_and" | "bitand" => Self::BitAnd, + #[cfg(feature = "bitwise")] + "bit_count" | "bitcount" => Self::BitCount, + "bit_or" | "bitor" => Self::BitOr, + "bit_xor" | "bitxor" | "xor" => Self::BitXor, + // ---- // Math functions // ---- @@ -894,6 +936,15 @@ impl SQLFunctionVisitor<'_> { } match function_name { + // ---- + // Bitwise functions + // ---- + BitAnd => self.visit_binary::(Expr::and), + #[cfg(feature = "bitwise")] + BitCount => self.visit_unary(Expr::bitwise_count_ones), + BitOr => self.visit_binary::(Expr::or), + BitXor => self.visit_binary::(Expr::xor), + // ---- // Math functions // ---- diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index f9caa288cb82..a6fe495d1ba5 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -469,48 +469,53 @@ impl SQLExprVisitor<'_> { rhs = self.convert_temporal_strings(&lhs, &rhs); Ok(match op { - SQLBinaryOperator::And => lhs.and(rhs), - SQLBinaryOperator::Divide => lhs / rhs, - SQLBinaryOperator::DuckIntegerDivide => lhs.floor_div(rhs).cast(DataType::Int64), - SQLBinaryOperator::Eq => lhs.eq(rhs), - SQLBinaryOperator::Gt => lhs.gt(rhs), - SQLBinaryOperator::GtEq => lhs.gt_eq(rhs), - SQLBinaryOperator::Lt => lhs.lt(rhs), - SQLBinaryOperator::LtEq => lhs.lt_eq(rhs), - SQLBinaryOperator::Minus => lhs - rhs, - SQLBinaryOperator::Modulo => lhs % rhs, - SQLBinaryOperator::Multiply => lhs * rhs, - SQLBinaryOperator::NotEq => lhs.eq(rhs).not(), - SQLBinaryOperator::Or => lhs.or(rhs), - SQLBinaryOperator::Plus => lhs + rhs, - SQLBinaryOperator::Spaceship => lhs.eq_missing(rhs), - SQLBinaryOperator::StringConcat => { + // ---- + // Bitwise operators + // ---- + SQLBinaryOperator::BitwiseAnd => lhs.and(rhs), // "x & y" + SQLBinaryOperator::BitwiseOr => lhs.or(rhs), // "x | y" + SQLBinaryOperator::Xor => lhs.xor(rhs), // "x XOR y" + + // ---- + // General operators + // ---- + SQLBinaryOperator::And => lhs.and(rhs), // "x AND y" + SQLBinaryOperator::Divide => lhs / rhs, // "x / y" + SQLBinaryOperator::DuckIntegerDivide => lhs.floor_div(rhs).cast(DataType::Int64), // "x // y" + SQLBinaryOperator::Eq => lhs.eq(rhs), // "x = y" + SQLBinaryOperator::Gt => lhs.gt(rhs), // "x > y" + SQLBinaryOperator::GtEq => lhs.gt_eq(rhs), // "x >= y" + SQLBinaryOperator::Lt => lhs.lt(rhs), // "x < y" + SQLBinaryOperator::LtEq => lhs.lt_eq(rhs), // "x <= y" + SQLBinaryOperator::Minus => lhs - rhs, // "x - y" + SQLBinaryOperator::Modulo => lhs % rhs, // "x % y" + SQLBinaryOperator::Multiply => lhs * rhs, // "x * y" + SQLBinaryOperator::NotEq => lhs.eq(rhs).not(), // "x != y" + SQLBinaryOperator::Or => lhs.or(rhs), // "x OR y" + SQLBinaryOperator::Plus => lhs + rhs, // "x + y" + SQLBinaryOperator::Spaceship => lhs.eq_missing(rhs), // "x <=> y" + SQLBinaryOperator::StringConcat => { // "x || y" lhs.cast(DataType::String) + rhs.cast(DataType::String) }, - SQLBinaryOperator::Xor => lhs.xor(rhs), - SQLBinaryOperator::PGStartsWith => lhs.str().starts_with(rhs), + SQLBinaryOperator::PGStartsWith => lhs.str().starts_with(rhs), // "x ^@ y" // ---- // Regular expression operators // ---- - // "a ~ b" - SQLBinaryOperator::PGRegexMatch => match rhs { + SQLBinaryOperator::PGRegexMatch => match rhs { // "x ~ y" Expr::Literal(LiteralValue::String(_)) => lhs.str().contains(rhs, true), _ => polars_bail!(SQLSyntax: "invalid pattern for '~' operator: {:?}", rhs), }, - // "a !~ b" - SQLBinaryOperator::PGRegexNotMatch => match rhs { + SQLBinaryOperator::PGRegexNotMatch => match rhs { // "x !~ y" Expr::Literal(LiteralValue::String(_)) => lhs.str().contains(rhs, true).not(), _ => polars_bail!(SQLSyntax: "invalid pattern for '!~' operator: {:?}", rhs), }, - // "a ~* b" - SQLBinaryOperator::PGRegexIMatch => match rhs { + SQLBinaryOperator::PGRegexIMatch => match rhs { // "x ~* y" Expr::Literal(LiteralValue::String(pat)) => { lhs.str().contains(lit(format!("(?i){}", pat)), true) }, _ => polars_bail!(SQLSyntax: "invalid pattern for '~*' operator: {:?}", rhs), }, - // "a !~* b" - SQLBinaryOperator::PGRegexNotIMatch => match rhs { + SQLBinaryOperator::PGRegexNotIMatch => match rhs { // "x !~* y" Expr::Literal(LiteralValue::String(pat)) => { lhs.str().contains(lit(format!("(?i){}", pat)), true).not() }, @@ -521,10 +526,10 @@ impl SQLExprVisitor<'_> { // ---- // LIKE/ILIKE operators // ---- - SQLBinaryOperator::PGLikeMatch - | SQLBinaryOperator::PGNotLikeMatch - | SQLBinaryOperator::PGILikeMatch - | SQLBinaryOperator::PGNotILikeMatch => { + SQLBinaryOperator::PGLikeMatch // "x ~~ y" + | SQLBinaryOperator::PGNotLikeMatch // "x !~~ y" + | SQLBinaryOperator::PGILikeMatch // "x ~~* y" + | SQLBinaryOperator::PGNotILikeMatch => { // "x !~~* y" let expr = if matches!( op, SQLBinaryOperator::PGLikeMatch | SQLBinaryOperator::PGNotLikeMatch @@ -548,7 +553,7 @@ impl SQLExprVisitor<'_> { // ---- // JSON/Struct field access operators // ---- - SQLBinaryOperator::Arrow | SQLBinaryOperator::LongArrow => match rhs { + SQLBinaryOperator::Arrow | SQLBinaryOperator::LongArrow => match rhs { // "x -> y", "x ->> y" Expr::Literal(LiteralValue::String(path)) => { let mut expr = self.struct_field_access_expr(&lhs, &path, false)?; if let SQLBinaryOperator::LongArrow = op { @@ -567,7 +572,7 @@ impl SQLExprVisitor<'_> { polars_bail!(SQLSyntax: "invalid json/struct path-extract definition: {:?}", right) }, }, - SQLBinaryOperator::HashArrow | SQLBinaryOperator::HashLongArrow => { + SQLBinaryOperator::HashArrow | SQLBinaryOperator::HashLongArrow => { // "x #> y", "x #>> y" if let Expr::Literal(LiteralValue::String(path)) = rhs { let mut expr = self.struct_field_access_expr(&lhs, &path, true)?; if let SQLBinaryOperator::HashLongArrow = op { diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index 78cdbc9115d0..4ed5ac3342c9 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -38,3 +38,4 @@ version_check = { workspace = true } [features] nightly = [] bitwise = ["polars-core/bitwise", "polars-plan/bitwise"] +merge_sorted = ["polars-plan/merge_sorted"] diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index f2b532ca1ca3..74321642f380 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -240,6 +240,7 @@ pub fn lower_ir( IR::MapFunction { input, function } => { // MergeSorted uses a rechunk hack incompatible with the // streaming engine. + #[cfg(feature = "merge_sorted")] if let FunctionIR::MergeSorted { .. } = function { todo!() } diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index 29651d5eb56a..0ac1a643d93d 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -61,6 +61,12 @@ mod private { } } + impl From> for MemSlice { + fn from(value: Vec) -> Self { + Self::from_vec(value) + } + } + impl MemSlice { pub const EMPTY: Self = Self::from_static(&[]); diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index c625b276f553..f66762f041e6 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -131,7 +131,13 @@ array_any_all = ["polars-lazy?/array_any_all", "dtype-array"] asof_join = ["polars-lazy?/asof_join", "polars-ops/asof_join"] iejoin = ["polars-lazy?/iejoin"] binary_encoding = ["polars-ops/binary_encoding", "polars-lazy?/binary_encoding", "polars-sql?/binary_encoding"] -bitwise = ["polars-core/bitwise", "polars-plan?/bitwise", "polars-ops/bitwise", "polars-lazy?/bitwise"] +bitwise = [ + "polars-core/bitwise", + "polars-plan?/bitwise", + "polars-ops/bitwise", + "polars-lazy?/bitwise", + "polars-sql?/bitwise", +] business = ["polars-lazy?/business", "polars-ops/business"] checked_arithmetic = ["polars-core/checked_arithmetic"] chunked_ids = ["polars-ops?/chunked_ids"] diff --git a/docs/source/user-guide/ecosystem.md b/docs/source/user-guide/ecosystem.md index d6fc8e0c9524..9a8f96c4f72f 100644 --- a/docs/source/user-guide/ecosystem.md +++ b/docs/source/user-guide/ecosystem.md @@ -20,25 +20,7 @@ On this page you can find a non-exhaustive list of libraries and tools that supp ### Data visualisation -#### hvPlot - -[hvPlot](https://hvplot.holoviz.org/) is available as the default plotting backend for Polars making it simple to create interactive and static visualisations. You can use hvPlot by using the feature flag `plot` during installing. - -```python -pip install 'polars[plot]' -``` - -#### Matplotlib - -[Matplotlib](https://matplotlib.org/) is a comprehensive library for creating static, animated, and interactive visualizations in Python. Matplotlib makes easy things easy and hard things possible. - -#### Plotly - -[Plotly](https://plotly.com/python/) is an interactive, open-source, and browser-based graphing library for Python. Built on top of plotly.js, it ships with over 30 chart types, including scientific charts, 3D graphs, statistical charts, SVG maps, financial charts, and more. - -#### [Seaborn](https://seaborn.pydata.org/) - -Seaborn is a Python data visualization library based on Matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics. +See the [dedicated visualization section](misc/visualization.md). ### IO diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 63863959b720..56324e1dac66 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.9.0" +version = "1.10.0" edition = "2021" [lib] diff --git a/py-polars/docs/source/reference/sql/functions/bitwise.rst b/py-polars/docs/source/reference/sql/functions/bitwise.rst new file mode 100644 index 000000000000..bd66c3810df1 --- /dev/null +++ b/py-polars/docs/source/reference/sql/functions/bitwise.rst @@ -0,0 +1,151 @@ +Temporal +======== + +.. list-table:: + :header-rows: 1 + :widths: 20 60 + + * - Function + - Description + + * - :ref:`BIT_AND ` + - Returns the bitwise AND of the given values. + * - :ref:`BIT_COUNT ` + - Returns the number of bits set to 1 in the binary representation of the given value. + * - :ref:`BIT_OR ` + - Returns the bitwise OR of the given values. + * - :ref:`BIT_XOR ` + - Returns the bitwise XOR of the given values. + + +.. _bit_and: + +BIT_AND +------- +Returns the bitwise AND of the given values. +Also available as the `&` binary operator. + +.. code-block:: python + + df = pl.DataFrame( + { + "i": [3, 10, 4, 8], + "j": [4, 7, 9, 10], + } + ) + df.sql(""" + SELECT + i, + j, + i & j AS i_bitand_op_j, + BIT_AND(i, j) AS i_bitand_j + FROM self + """) + # shape: (4, 4) + # ┌─────┬─────┬───────────────┬────────────┐ + # │ i ┆ j ┆ i_bitand_op_j ┆ i_bitand_j │ + # │ --- ┆ --- ┆ --- ┆ --- │ + # │ i64 ┆ i64 ┆ i64 ┆ i64 │ + # ╞═════╪═════╪═══════════════╪════════════╡ + # │ 3 ┆ 4 ┆ 0 ┆ 0 │ + # │ 10 ┆ 7 ┆ 2 ┆ 2 │ + # │ 4 ┆ 9 ┆ 0 ┆ 0 │ + # │ 8 ┆ 10 ┆ 8 ┆ 8 │ + # └─────┴─────┴───────────────┴────────────┘ + +.. _bit_count: + +BIT_COUNT +--------- +Returns the number of bits set to 1 in the binary representation of the given value. + +.. code-block:: python + + df = pl.DataFrame({"i": [16, 10, 55, 127]}) + df.sql(""" + SELECT + i, + BIT_COUNT(i) AS i_bitcount + FROM self + """) + # shape: (4, 2) + # ┌─────┬────────────┐ + # │ i ┆ i_bitcount │ + # │ --- ┆ --- │ + # │ i64 ┆ u32 │ + # ╞═════╪════════════╡ + # │ 16 ┆ 1 │ + # │ 10 ┆ 2 │ + # │ 55 ┆ 5 │ + # │ 127 ┆ 7 │ + # └─────┴────────────┘ + +.. _bit_or: + +BIT_OR +------ +Returns the bitwise OR of the given values. +Also available as the `|` binary operator. + +.. code-block:: python + + df = pl.DataFrame( + { + "i": [3, 10, 4, 8], + "j": [4, 7, 9, 10], + } + ) + df.sql(""" + SELECT + i, + j, + i | j AS i_bitor_op_j, + BIT_OR(i, j) AS i_bitor_j + FROM self + """) + # shape: (4, 4) + # ┌─────┬─────┬──────────────┬───────────┐ + # │ i ┆ j ┆ i_bitor_op_j ┆ i_bitor_j │ + # │ --- ┆ --- ┆ --- ┆ --- │ + # │ i64 ┆ i64 ┆ i64 ┆ i64 │ + # ╞═════╪═════╪══════════════╪═══════════╡ + # │ 3 ┆ 4 ┆ 7 ┆ 7 │ + # │ 10 ┆ 7 ┆ 15 ┆ 15 │ + # │ 4 ┆ 9 ┆ 13 ┆ 13 │ + # │ 8 ┆ 10 ┆ 10 ┆ 10 │ + # └─────┴─────┴──────────────┴───────────┘ + +.. _bit_xor: + +BIT_XOR +------- +Returns the bitwise XOR of the given values. +Also available as the `XOR` binary operator. + +.. code-block:: python + + df = pl.DataFrame( + { + "i": [3, 10, 4, 8], + "j": [4, 7, 9, 10], + } + ) + df.sql(""" + SELECT + i, + j, + i XOR j AS i_bitxor_op_j, + BIT_XOR(i, j) AS i_bitxor_j + FROM self + """) + # shape: (4, 4) + # ┌─────┬─────┬───────────────┬────────────┐ + # │ i ┆ j ┆ i_bitxor_op_j ┆ i_bitxor_j │ + # │ --- ┆ --- ┆ --- ┆ --- │ + # │ i64 ┆ i64 ┆ i64 ┆ i64 │ + # ╞═════╪═════╪═══════════════╪════════════╡ + # │ 3 ┆ 4 ┆ 7 ┆ 7 │ + # │ 10 ┆ 7 ┆ 13 ┆ 13 │ + # │ 4 ┆ 9 ┆ 13 ┆ 13 │ + # │ 8 ┆ 10 ┆ 2 ┆ 2 │ + # └─────┴─────┴───────────────┴────────────┘ diff --git a/py-polars/docs/source/reference/sql/functions/index.rst b/py-polars/docs/source/reference/sql/functions/index.rst index cf6247e41a10..3473a0741a91 100644 --- a/py-polars/docs/source/reference/sql/functions/index.rst +++ b/py-polars/docs/source/reference/sql/functions/index.rst @@ -32,6 +32,18 @@ SQL Functions array + .. grid-item-card:: + + **Bitwise** + ^^^^^^^^^^^ + + .. toctree:: + :maxdepth: 2 + + bitwise + +.. grid:: + .. grid-item-card:: **Conditional** @@ -52,8 +64,6 @@ SQL Functions math -.. grid:: - .. grid-item-card:: **String** @@ -64,6 +74,8 @@ SQL Functions string +.. grid:: + .. grid-item-card:: **Temporal** diff --git a/py-polars/polars/convert/general.py b/py-polars/polars/convert/general.py index cee4b925e9e9..13adff9c4cd3 100644 --- a/py-polars/polars/convert/general.py +++ b/py-polars/polars/convert/general.py @@ -98,7 +98,7 @@ def from_dict( def from_dicts( - data: Sequence[dict[str, Any]], + data: Iterable[dict[str, Any]], schema: SchemaDefinition | None = None, *, schema_overrides: SchemaDict | None = None, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 640e5418555c..22df34dad668 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -2299,3 +2299,11 @@ def test_read_csv_cast_unparsable_later( df.write_csv(f) f.seek(0) assert df.equals(pl.read_csv(f, schema={"x": dtype})) + + +def test_csv_double_new_line() -> None: + assert pl.read_csv(b"a,b,c\n\n", has_header=False).to_dict(as_series=False) == { + "column_1": ["a", None], + "column_2": ["b", None], + "column_3": ["c", None], + } diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 799c4953cbf6..4977fa115749 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -801,3 +801,11 @@ def test_scan_double_collect_row_index_invalidates_cached_ir_18892() -> None: schema={"index": pl.UInt32, "a": pl.Int64}, ), ) + + +def test_scan_include_file_paths_respects_projection_pushdown() -> None: + q = pl.scan_csv(b"a,b,c\na1,b1,c1", include_file_paths="path_name").select( + ["a", "b"] + ) + + assert_frame_equal(q.collect(), pl.DataFrame({"a": "a1", "b": "b1"})) diff --git a/py-polars/tests/unit/operations/arithmetic/test_list_arithmetic.py b/py-polars/tests/unit/operations/arithmetic/test_list_arithmetic.py index 64ad0e533d8d..0e7af02d8290 100644 --- a/py-polars/tests/unit/operations/arithmetic/test_list_arithmetic.py +++ b/py-polars/tests/unit/operations/arithmetic/test_list_arithmetic.py @@ -101,6 +101,7 @@ def func( BROADCAST_SERIES_COMBINATIONS, ) @pytest.mark.parametrize("exec_op", EXEC_OP_COMBINATIONS) +@pytest.mark.slow def test_list_arithmetic_values( list_side: str, broadcast_series: Callable[ @@ -380,6 +381,7 @@ def test_list_add_supertype( "broadcast_series", BROADCAST_SERIES_COMBINATIONS, ) +@pytest.mark.slow def test_list_numeric_op_validity_combination( broadcast_series: Callable[ [pl.Series, pl.Series, pl.Series], tuple[pl.Series, pl.Series, pl.Series] @@ -451,6 +453,7 @@ def test_list_add_alignment() -> None: @pytest.mark.parametrize("exec_op", EXEC_OP_COMBINATIONS) +@pytest.mark.slow def test_list_add_empty_lists( exec_op: Callable[[pl.Series, pl.Series, Any], pl.Series], ) -> None: @@ -516,6 +519,7 @@ def test_list_add_height_mismatch( ], ) @pytest.mark.parametrize("exec_op", EXEC_OP_COMBINATIONS) +@pytest.mark.slow def test_list_date_to_numeric_arithmetic_raises_error( op: Callable[[Any], Any], exec_op: Callable[[pl.Series, pl.Series, Any], pl.Series] ) -> None: diff --git a/py-polars/tests/unit/operations/test_gather.py b/py-polars/tests/unit/operations/test_gather.py index ddc891df04f1..595b2bfee246 100644 --- a/py-polars/tests/unit/operations/test_gather.py +++ b/py-polars/tests/unit/operations/test_gather.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import polars as pl @@ -176,3 +177,14 @@ def test_gather_array_list_null_19302() -> None: assert data.select(pl.col("data").list.get(0)).to_dict(as_series=False) == { "data": [None] } + + +def test_gather_array() -> None: + a = np.arange(16).reshape(-1, 2, 2) + s = pl.Series(a) + + for idx in [[1, 2], [0, 0], [1, 0], [1, 1, 1, 1, 1, 1, 1, 1]]: + assert (s.gather(idx).to_numpy() == a[idx]).all() + + v = s[[0, 1, None, 3]] # type: ignore[list-item] + assert v[2] is None diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index 57dbec1a13ee..b19d008556e1 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -413,6 +413,7 @@ def test_sort_by_in_over_5499() -> None: } +@pytest.mark.may_fail_auto_streaming def test_merge_sorted() -> None: df_a = ( pl.datetime_range( diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index f1a858f62add..0ed478b9aa83 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -23,6 +23,7 @@ Unknown, ) from polars.exceptions import ( + DuplicateError, InvalidOperationError, PolarsInefficientMapWarning, ShapeError, @@ -1356,6 +1357,13 @@ def test_to_dummies_drop_first() -> None: assert_frame_equal(result, expected) +def test_to_dummies_null_clash_19096() -> None: + with pytest.raises( + DuplicateError, match="column with name '_null' has more than one occurrence" + ): + pl.Series([None, "null"]).to_dummies() + + def test_chunk_lengths() -> None: s = pl.Series("a", [1, 2, 2, 3]) # this is a Series with one chunk, of length 4 diff --git a/py-polars/tests/unit/sql/test_bitwise.py b/py-polars/tests/unit/sql/test_bitwise.py new file mode 100644 index 000000000000..7bba8cfde5c5 --- /dev/null +++ b/py-polars/tests/unit/sql/test_bitwise.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import pytest + +import polars as pl + + +@pytest.fixture +def df() -> pl.DataFrame: + return pl.DataFrame( + { + "x": [20, 32, 50, 88, 128], + "y": [-128, 0, 10, -1, None], + } + ) + + +def test_bitwise_and(df: pl.DataFrame) -> None: + res = df.sql( + """ + SELECT + x & y AS x_bitand_op_y, + BITAND(y, x) AS y_bitand_x, + BIT_AND(x, y) AS x_bitand_y, + FROM self + """ + ) + assert res.to_dict(as_series=False) == { + "x_bitand_op_y": [0, 0, 2, 88, None], + "y_bitand_x": [0, 0, 2, 88, None], + "x_bitand_y": [0, 0, 2, 88, None], + } + + +def test_bitwise_count(df: pl.DataFrame) -> None: + res = df.sql( + """ + SELECT + BITCOUNT(x) AS x_bits_set, + BIT_COUNT(y) AS y_bits_set, + FROM self + """ + ) + assert res.to_dict(as_series=False) == { + "x_bits_set": [2, 1, 3, 3, 1], + "y_bits_set": [57, 0, 2, 64, None], + } + + +def test_bitwise_or(df: pl.DataFrame) -> None: + res = df.sql( + """ + SELECT + x | y AS x_bitor_op_y, + BITOR(y, x) AS y_bitor_x, + BIT_OR(x, y) AS x_bitor_y, + FROM self + """ + ) + assert res.to_dict(as_series=False) == { + "x_bitor_op_y": [-108, 32, 58, -1, None], + "y_bitor_x": [-108, 32, 58, -1, None], + "x_bitor_y": [-108, 32, 58, -1, None], + } + + +def test_bitwise_xor(df: pl.DataFrame) -> None: + res = df.sql( + """ + SELECT + x XOR y AS x_bitxor_op_y, + BITXOR(y, x) AS y_bitxor_x, + BIT_XOR(x, y) AS x_bitxor_y, + FROM self + """ + ) + assert res.to_dict(as_series=False) == { + "x_bitxor_op_y": [-108, 32, 56, -89, None], + "y_bitxor_x": [-108, 32, 56, -89, None], + "x_bitxor_y": [-108, 32, 56, -89, None], + } diff --git a/py-polars/tests/unit/sql/test_group_by.py b/py-polars/tests/unit/sql/test_group_by.py index 08e4b236c833..71fa1572831c 100644 --- a/py-polars/tests/unit/sql/test_group_by.py +++ b/py-polars/tests/unit/sql/test_group_by.py @@ -238,3 +238,9 @@ def test_group_by_errors() -> None: match=r"'a' should participate in the GROUP BY clause or an aggregate function", ): df.sql("SELECT a, SUM(b) FROM self GROUP BY b") + + with pytest.raises( + SQLSyntaxError, + match=r"HAVING clause not valid outside of GROUP BY", + ): + df.sql("SELECT a, COUNT(a) AS n FROM self HAVING n > 1")