Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove ScalarValue::Dictionary #12488

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 9 additions & 219 deletions datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,6 @@ pub enum ScalarValue {
/// `.1`: the list of fields, zero-to-one of which will by set in `.0`
/// `.2`: the physical storage of the source/destination UnionArray from which this Scalar came
Union(Option<(i8, Box<ScalarValue>)>, UnionFields, UnionMode),
/// Dictionary type: index type and value
Dictionary(Box<DataType>, Box<ScalarValue>),
}

impl Hash for Fl<f16> {
Expand Down Expand Up @@ -414,8 +412,6 @@ impl PartialEq for ScalarValue {
val1.eq(val2) && fields1.eq(fields2) && mode1.eq(mode2)
}
(Union(_, _, _), _) => false,
(Dictionary(k1, v1), Dictionary(k2, v2)) => k1.eq(k2) && v1.eq(v2),
(Dictionary(_, _), _) => false,
(Null, Null) => true,
(Null, _) => false,
}
Expand Down Expand Up @@ -558,15 +554,6 @@ impl PartialOrd for ScalarValue {
}
}
(Union(_, _, _), _) => None,
(Dictionary(k1, v1), Dictionary(k2, v2)) => {
// Don't compare if the key types don't match (it is effectively a different datatype)
if k1 == k2 {
v1.partial_cmp(v2)
} else {
None
}
}
(Dictionary(_, _), _) => None,
(Null, Null) => Some(Ordering::Equal),
(Null, _) => None,
}
Expand Down Expand Up @@ -757,10 +744,6 @@ impl std::hash::Hash for ScalarValue {
t.hash(state);
m.hash(state);
}
Dictionary(k, v) => {
k.hash(state);
v.hash(state);
}
// stable hash for Null value
Null => 1.hash(state),
}
Expand Down Expand Up @@ -791,68 +774,6 @@ pub fn get_dict_value<K: ArrowDictionaryKeyType>(
Ok((dict_array.values(), dict_array.key(index)))
}

/// Create a dictionary array representing `value` repeated `size`
/// times
fn dict_from_scalar<K: ArrowDictionaryKeyType>(
value: &ScalarValue,
size: usize,
) -> Result<ArrayRef> {
// values array is one element long (the value)
let values_array = value.to_array_of_size(1)?;

// Create a key array with `size` elements, each of 0
let key_array: PrimitiveArray<K> = std::iter::repeat(if value.is_null() {
None
} else {
Some(K::default_value())
})
.take(size)
.collect();

// create a new DictionaryArray
//
// Note: this path could be made faster by using the ArrayData
// APIs and skipping validation, if it every comes up in
// performance traces.
Ok(Arc::new(
DictionaryArray::<K>::try_new(key_array, values_array)?, // should always be valid by construction above
))
}

/// Create a dictionary array representing all the values in values
fn dict_from_values<K: ArrowDictionaryKeyType>(
values_array: ArrayRef,
) -> Result<ArrayRef> {
// Create a key array with `size` elements of 0..array_len for all
// non-null value elements
let key_array: PrimitiveArray<K> = (0..values_array.len())
.map(|index| {
if values_array.is_valid(index) {
let native_index = K::Native::from_usize(index).ok_or_else(|| {
DataFusionError::Internal(format!(
"Can not create index of type {} from value {}",
K::DATA_TYPE,
index
))
})?;
Ok(Some(native_index))
} else {
Ok(None)
}
})
.collect::<Result<Vec<_>>>()?
.into_iter()
.collect();

// create a new DictionaryArray
//
// Note: this path could be made faster by using the ArrayData
// APIs and skipping validation, if it every comes up in
// performance traces.
let dict_array = DictionaryArray::<K>::try_new(key_array, values_array)?;
Ok(Arc::new(dict_array))
}

macro_rules! typed_cast_tz {
($array:expr, $index:expr, $ARRAYTYPE:ident, $SCALAR:ident, $TZ:expr) => {{
use std::any::type_name;
Expand Down Expand Up @@ -1314,9 +1235,6 @@ impl ScalarValue {
DataType::Duration(TimeUnit::Nanosecond)
}
ScalarValue::Union(_, fields, mode) => DataType::Union(fields.clone(), *mode),
ScalarValue::Dictionary(k, v) => {
DataType::Dictionary(k.clone(), Box::new(v.data_type()))
}
ScalarValue::Null => DataType::Null,
}
}
Expand Down Expand Up @@ -1575,7 +1493,6 @@ impl ScalarValue {
Some((_, s)) => s.is_null(),
None => true,
},
ScalarValue::Dictionary(_, v) => v.is_null(),
}
}

Expand Down Expand Up @@ -1890,40 +1807,6 @@ impl ScalarValue {
let arrays = arrays.iter().map(|a| a.as_ref()).collect::<Vec<_>>();
arrow::compute::concat(arrays.as_slice())?
}
DataType::Dictionary(key_type, value_type) => {
// create the values array
let value_scalars = scalars
.map(|scalar| match scalar {
ScalarValue::Dictionary(inner_key_type, scalar) => {
if &inner_key_type == key_type {
Ok(*scalar)
} else {
_exec_err!("Expected inner key type of {key_type} but found: {inner_key_type}, value was ({scalar:?})")
}
}
_ => {
_exec_err!(
"Expected scalar of type {value_type} but found: {scalar} {scalar:?}"
)
}
})
.collect::<Result<Vec<_>>>()?;

let values = Self::iter_to_array(value_scalars)?;
assert_eq!(values.data_type(), value_type.as_ref());

match key_type.as_ref() {
DataType::Int8 => dict_from_values::<Int8Type>(values)?,
DataType::Int16 => dict_from_values::<Int16Type>(values)?,
DataType::Int32 => dict_from_values::<Int32Type>(values)?,
DataType::Int64 => dict_from_values::<Int64Type>(values)?,
DataType::UInt8 => dict_from_values::<UInt8Type>(values)?,
DataType::UInt16 => dict_from_values::<UInt16Type>(values)?,
DataType::UInt32 => dict_from_values::<UInt32Type>(values)?,
DataType::UInt64 => dict_from_values::<UInt64Type>(values)?,
_ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
}
}
DataType::FixedSizeBinary(size) => {
let array = scalars
.map(|sv| {
Expand Down Expand Up @@ -1953,6 +1836,7 @@ impl ScalarValue {
| DataType::Time64(TimeUnit::Second)
| DataType::Time64(TimeUnit::Millisecond)
| DataType::RunEndEncoded(_, _)
| DataType::Dictionary(..)
| DataType::ListView(_)
| DataType::LargeListView(_) => {
return _not_impl_err!(
Expand Down Expand Up @@ -2463,20 +2347,6 @@ impl ScalarValue {
new_null_array(&dt, size)
}
},
ScalarValue::Dictionary(key_type, v) => {
// values array is one element long (the value)
match key_type.as_ref() {
DataType::Int8 => dict_from_scalar::<Int8Type>(v, size)?,
DataType::Int16 => dict_from_scalar::<Int16Type>(v, size)?,
DataType::Int32 => dict_from_scalar::<Int32Type>(v, size)?,
DataType::Int64 => dict_from_scalar::<Int64Type>(v, size)?,
DataType::UInt8 => dict_from_scalar::<UInt8Type>(v, size)?,
DataType::UInt16 => dict_from_scalar::<UInt16Type>(v, size)?,
DataType::UInt32 => dict_from_scalar::<UInt32Type>(v, size)?,
DataType::UInt64 => dict_from_scalar::<UInt64Type>(v, size)?,
_ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
}
}
ScalarValue::Null => new_null_array(&DataType::Null, size),
})
}
Expand Down Expand Up @@ -2735,15 +2605,13 @@ impl ScalarValue {
_ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
};
// look up the index in the values dictionary
let value = match values_index {
match values_index {
Some(values_index) => {
ScalarValue::try_from_array(values_array, values_index)
}
// else entry was null, so return null
None => values_array.data_type().try_into(),
}?;

Self::Dictionary(key_type.clone(), Box::new(value))
}?
}
DataType::Struct(_) => {
let a = array.slice(index, 1);
Expand Down Expand Up @@ -3044,24 +2912,6 @@ impl ScalarValue {
array.child(ti).is_null(index)
}
}
ScalarValue::Dictionary(key_type, v) => {
let (values_array, values_index) = match key_type.as_ref() {
DataType::Int8 => get_dict_value::<Int8Type>(array, index)?,
DataType::Int16 => get_dict_value::<Int16Type>(array, index)?,
DataType::Int32 => get_dict_value::<Int32Type>(array, index)?,
DataType::Int64 => get_dict_value::<Int64Type>(array, index)?,
DataType::UInt8 => get_dict_value::<UInt8Type>(array, index)?,
DataType::UInt16 => get_dict_value::<UInt16Type>(array, index)?,
DataType::UInt32 => get_dict_value::<UInt32Type>(array, index)?,
DataType::UInt64 => get_dict_value::<UInt64Type>(array, index)?,
_ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
};
// was the value in the array non null?
match values_index {
Some(values_index) => v.eq_array(values_array, values_index)?,
None => v.is_null(),
}
}
ScalarValue::Null => array.is_null(index),
})
}
Expand Down Expand Up @@ -3135,10 +2985,6 @@ impl ScalarValue {
+ (std::mem::size_of::<Field>() * fields.len())
+ fields.iter().map(|(_idx, field)| field.size() - std::mem::size_of_val(field)).sum::<usize>()
}
ScalarValue::Dictionary(dt, sv) => {
// `dt` and `sv` are boxed, so they are NOT already included in `self`
dt.size() + sv.size()
}
}
}

Expand Down Expand Up @@ -3432,10 +3278,9 @@ impl TryFrom<&DataType> for ScalarValue {
DataType::Duration(TimeUnit::Nanosecond) => {
ScalarValue::DurationNanosecond(None)
}
DataType::Dictionary(index_type, value_type) => ScalarValue::Dictionary(
index_type.clone(),
Box::new(value_type.as_ref().try_into()?),
),
DataType::Dictionary(_index_type, value_type) => {
<Self as TryFrom<&DataType>>::try_from(value_type)?
}
// `ScalaValue::List` contains single element `ListArray`.
DataType::List(field_ref) => ScalarValue::List(Arc::new(
GenericListArray::new_null(Arc::clone(field_ref), 1),
Expand Down Expand Up @@ -3636,7 +3481,6 @@ impl fmt::Display for ScalarValue {
Some((id, val)) => write!(f, "{}:{}", id, val)?,
None => write!(f, "NULL")?,
},
ScalarValue::Dictionary(_k, v) => write!(f, "{v}")?,
ScalarValue::Null => write!(f, "NULL")?,
};
Ok(())
Expand Down Expand Up @@ -3813,7 +3657,6 @@ impl fmt::Debug for ScalarValue {
Some((id, val)) => write!(f, "Union {}:{}", id, val),
None => write!(f, "Union(NULL)"),
},
ScalarValue::Dictionary(k, v) => write!(f, "Dictionary({k:?}, {v:?})"),
ScalarValue::Null => write!(f, "NULL"),
}
}
Expand Down Expand Up @@ -3865,9 +3708,7 @@ impl ScalarType<i32> for Date32Type {
mod tests {

use super::*;
use crate::cast::{
as_map_array, as_string_array, as_struct_array, as_uint32_array, as_uint64_array,
};
use crate::cast::{as_map_array, as_struct_array, as_uint32_array, as_uint64_array};

use crate::assert_batches_eq;
use crate::utils::array_into_list_array_nullable;
Expand Down Expand Up @@ -4867,38 +4708,6 @@ mod tests {
);
}

#[test]
fn scalar_iter_to_dictionary() {
fn make_val(v: Option<String>) -> ScalarValue {
let key_type = DataType::Int32;
let value = ScalarValue::Utf8(v);
ScalarValue::Dictionary(Box::new(key_type), Box::new(value))
}

let scalars = [
make_val(Some("Foo".into())),
make_val(None),
make_val(Some("Bar".into())),
];

let array = ScalarValue::iter_to_array(scalars).unwrap();
let array = as_dictionary_array::<Int32Type>(&array).unwrap();
let values_array = as_string_array(array.values()).unwrap();

let values = array
.keys_iter()
.map(|k| {
k.map(|k| {
assert!(values_array.is_valid(k));
values_array.value(k)
})
})
.collect::<Vec<_>>();

let expected = vec![Some("Foo"), None, Some("Bar")];
assert_eq!(values, expected);
}

#[test]
fn scalar_iter_to_array_mismatched_types() {
use ScalarValue::*;
Expand Down Expand Up @@ -5033,10 +4842,7 @@ mod tests {
let data_type =
DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8));
let data_type = &data_type;
let expected = ScalarValue::Dictionary(
Box::new(DataType::Int8),
Box::new(ScalarValue::Utf8(None)),
);
let expected = ScalarValue::Utf8(None);
assert_eq!(expected, data_type.try_into().unwrap())
}

Expand Down Expand Up @@ -5192,12 +4998,7 @@ mod tests {
),
scalars: $INPUT
.iter()
.map(|v| {
ScalarValue::Dictionary(
Box::new($INDEX_TY::DATA_TYPE),
Box::new(ScalarValue::Utf8(v.map(|v| v.to_string()))),
)
})
.map(|v| ScalarValue::Utf8(v.map(|v| v.to_string())))
.collect(),
}
}};
Expand Down Expand Up @@ -6908,15 +6709,4 @@ mod tests {
);
assert!(dense_scalar.is_null());
}

#[test]
fn null_dictionary_scalar_produces_null_dictionary_array() {
let dictionary_scalar = ScalarValue::Dictionary(
Box::new(DataType::Int32),
Box::new(ScalarValue::Null),
);
assert!(dictionary_scalar.is_null());
let dictionary_array = dictionary_scalar.to_array().unwrap();
assert!(dictionary_array.is_null(0));
}
}
4 changes: 0 additions & 4 deletions datafusion/core/src/datasource/listing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,7 @@ pub struct PartitionedFile {
///
/// These MUST have the same count, order, and type than the [`table_partition_cols`].
///
/// You may use [`wrap_partition_value_in_dict`] to wrap them if you have used [`wrap_partition_type_in_dict`] to wrap the column type.
///
///
/// [`wrap_partition_type_in_dict`]: crate::datasource::physical_plan::wrap_partition_type_in_dict
/// [`wrap_partition_value_in_dict`]: crate::datasource::physical_plan::wrap_partition_value_in_dict
/// [`table_partition_cols`]: table::ListingOptions::table_partition_cols
pub partition_values: Vec<ScalarValue>,
/// An optional file range for a more fine-grained parallel execution
Expand Down
Loading
Loading