diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index aefc06fb2a6d..d40b831fa2c2 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -120,53 +120,53 @@ jobs: cargo run --example read_csv_infer_schema # test the --features "simd" of the arrow crate. This requires nightly. - linux-test-simd: - name: Test SIMD on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow-rs/arrow-rs/testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd arrow - cargo test --features "simd" - - name: Check new project build with simd features - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd arrow/test/dependency/simd - cargo check +# linux-test-simd: +# name: Test SIMD on AMD64 Rust ${{ matrix.rust }} +# runs-on: ubuntu-latest +# strategy: +# matrix: +# arch: [amd64] +# rust: [] +# container: +# image: ${{ matrix.arch }}/rust +# env: +# # Disable full debug symbol generation to speed up CI build and keep memory down +# # "1" means line tables only, which is useful for panic tracebacks. +# RUSTFLAGS: "-C debuginfo=1" +# ARROW_TEST_DATA: /__w/arrow-rs/arrow-rs/testing/data +# steps: +# - uses: actions/checkout@v2 +# with: +# submodules: true +# - name: Cache Cargo +# uses: actions/cache@v2 +# with: +# path: /github/home/.cargo +# # this key equals the ones on `linux-build-lib` for re-use +# key: cargo-cache- +# - name: Cache Rust dependencies +# uses: actions/cache@v2 +# with: +# path: /github/home/target +# # this key equals the ones on `linux-build-lib` for re-use +# key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} +# - name: Setup Rust toolchain +# run: | +# rustup toolchain install ${{ matrix.rust }} +# rustup default ${{ matrix.rust }} +# rustup component add rustfmt +# - name: Run tests +# run: | +# export CARGO_HOME="/github/home/.cargo" +# export CARGO_TARGET_DIR="/github/home/target" +# cd arrow +# cargo test --features "simd" +# - name: Check new project build with simd features +# run: | +# export CARGO_HOME="/github/home/.cargo" +# export CARGO_TARGET_DIR="/github/home/target" +# cd arrow/test/dependency/simd +# cargo check windows-and-macos: name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }} diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 7474ae41c526..c32ae087315f 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -68,6 +68,46 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } match (from_type, to_type) { + // TODO now just support signed numeric to decimal, support decimal to numeric later + (Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal(_, _)) + | ( + Null, + Boolean + | Int8 + | UInt8 + | Int16 + | UInt16 + | Int32 + | UInt32 + | Float32 + | Date32 + | Time32(_) + | Int64 + | UInt64 + | Float64 + | Date64 + | List(_) + | Dictionary(_, _), + ) + | ( + Boolean + | Int8 + | UInt8 + | Int16 + | UInt16 + | Int32 + | UInt32 + | Float32 + | Date32 + | Time32(_) + | Int64 + | UInt64 + | Float64 + | Date64 + | List(_) + | Dictionary(_, _), + Null, + ) => true, (Struct(_), _) => false, (_, Struct(_)) => false, (LargeList(list_from), LargeList(list_to)) => { @@ -306,7 +346,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Timestamp(_, _), Date64) => true, // date64 to timestamp might not make sense, (Int64, Duration(_)) => true, - (Null, Int32) => true, (_, _) => false, } } @@ -833,6 +872,45 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) } +// cast the integer array to defined decimal data type array +macro_rules! cast_integer_to_decimal { + ($ARRAY: expr, $ARRAY_TYPE: ident, $PRECISION : ident, $SCALE : ident) => {{ + let mut decimal_builder = DecimalBuilder::new($ARRAY.len(), *$PRECISION, *$SCALE); + let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); + let mul: i128 = 10_i128.pow(*$SCALE as u32); + for i in 0..array.len() { + if array.is_null(i) { + decimal_builder.append_null()?; + } else { + // convert i128 first + let v = array.value(i) as i128; + // if the input value is overflow, it will throw an error. + decimal_builder.append_value(mul * v)?; + } + } + Ok(Arc::new(decimal_builder.finish())) + }}; +} + +// cast the floating-point array to defined decimal data type array +macro_rules! cast_floating_point_to_decimal { + ($ARRAY: expr, $ARRAY_TYPE: ident, $PRECISION : ident, $SCALE : ident) => {{ + let mut decimal_builder = DecimalBuilder::new($ARRAY.len(), *$PRECISION, *$SCALE); + let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); + let mul = 10_f64.powi(*$SCALE as i32); + for i in 0..array.len() { + if array.is_null(i) { + decimal_builder.append_null()?; + } else { + let v = ((array.value(i) as f64) * mul) as i128; + // if the input value is overflow, it will throw an error. + decimal_builder.append_value(v)?; + } + } + Ok(Arc::new(decimal_builder.finish())) + }}; +} + /// Cast `array` to the provided data type and return a new Array with /// type `to_type`, if possible. It accepts `CastOptions` to allow consumers /// to configure cast behavior. @@ -867,6 +945,72 @@ pub fn cast_with_options( return Ok(array.clone()); } match (from_type, to_type) { + (_, Decimal(precision, scale)) => { + // cast data to decimal + match from_type { + // TODO now just support signed numeric to decimal, support decimal to numeric later + Int8 => { + cast_integer_to_decimal!(array, Int8Array, precision, scale) + } + Int16 => { + cast_integer_to_decimal!(array, Int16Array, precision, scale) + } + Int32 => { + cast_integer_to_decimal!(array, Int32Array, precision, scale) + } + Int64 => { + cast_integer_to_decimal!(array, Int64Array, precision, scale) + } + Float32 => { + cast_floating_point_to_decimal!(array, Float32Array, precision, scale) + } + Float64 => { + cast_floating_point_to_decimal!(array, Float64Array, precision, scale) + } + _ => Err(ArrowError::CastError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type + ))), + } + } + ( + Null, + Boolean + | Int8 + | UInt8 + | Int16 + | UInt16 + | Int32 + | UInt32 + | Float32 + | Date32 + | Time32(_) + | Int64 + | UInt64 + | Float64 + | Date64 + | List(_) + | Dictionary(_, _), + ) + | ( + Boolean + | Int8 + | UInt8 + | Int16 + | UInt16 + | Int32 + | UInt32 + | Float32 + | Date32 + | Time32(_) + | Int64 + | UInt64 + | Float64 + | Date64 + | List(_) + | Dictionary(_, _), + Null, + ) => Ok(new_null_array(to_type, array.len())), (Struct(_), _) => Err(ArrowError::CastError( "Cannot cast from struct to other types".to_string(), )), @@ -1706,10 +1850,6 @@ pub fn cast_with_options( } } } - - // null to primitive/flat types - (Null, Int32) => Ok(Arc::new(Int32Array::from(vec![None; array.len()]))), - (_, _) => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -2003,7 +2143,7 @@ fn cast_string_to_date64( if string_array.is_null(i) { Ok(None) } else { - let string = string_array + let string = string_array .value(i); let result = string @@ -2220,7 +2360,7 @@ fn dictionary_cast( return Err(ArrowError::CastError(format!( "Unsupported type {:?} for dictionary index", to_index_type - ))) + ))); } }; @@ -2584,6 +2724,115 @@ where mod tests { use super::*; use crate::{buffer::Buffer, util::display::array_value_to_string}; + use num::traits::Pow; + + #[test] + fn test_cast_numeric_to_decimal() { + // test cast type + let data_types = vec![ + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::Float32, + DataType::Float64, + ]; + let decimal_type = DataType::Decimal(38, 6); + for data_type in data_types { + assert!(can_cast_types(&data_type, &decimal_type)) + } + assert!(!can_cast_types(&DataType::UInt64, &decimal_type)); + + // test cast data + let input_datas = vec![ + Arc::new(Int8Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i8 + Arc::new(Int16Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i16 + Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i32 + Arc::new(Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i64 + ]; + + // i8, i16, i32, i64 + for array in input_datas { + let casted_array = cast(&array, &decimal_type).unwrap(); + let decimal_array = casted_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&decimal_type, decimal_array.data_type()); + for i in 0..array.len() { + if i == 3 { + assert!(decimal_array.is_null(i as usize)); + } else { + assert_eq!( + 10_i128.pow(6) * (i as i128 + 1), + decimal_array.value(i as usize) + ); + } + } + } + + // test i8 to decimal type with overflow the result type + // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. + let array = Int8Array::from(vec![1, 2, 3, 4, 100]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast(&array, &DataType::Decimal(3, 1)); + assert!(casted_array.is_err()); + assert_eq!("Invalid argument error: The value of 1000 i128 is not compatible with Decimal(3,1)", casted_array.unwrap_err().to_string()); + + // test f32 to decimal type + let f_data: Vec = vec![1.1, 2.2, 4.4, 1.123_456_8]; + let array = Float32Array::from(f_data.clone()); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast(&array, &decimal_type).unwrap(); + let decimal_array = casted_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&decimal_type, decimal_array.data_type()); + for (i, item) in f_data.iter().enumerate().take(array.len()) { + let left = (*item as f64) * 10_f64.pow(6); + assert_eq!(left as i128, decimal_array.value(i as usize)); + } + + // test f64 to decimal type + let f_data: Vec = vec![1.1, 2.2, 4.4, 1.123_456_789_123_4]; + let array = Float64Array::from(f_data.clone()); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast(&array, &decimal_type).unwrap(); + let decimal_array = casted_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&decimal_type, decimal_array.data_type()); + for (i, item) in f_data.iter().enumerate().take(array.len()) { + let left = (*item as f64) * 10_f64.pow(6); + assert_eq!(left as i128, decimal_array.value(i as usize)); + } + } #[test] fn test_cast_i32_to_f64() { @@ -4268,17 +4517,39 @@ mod tests { } #[test] - fn test_cast_null_array_to_int32() { - let array = Arc::new(NullArray::new(6)) as ArrayRef; + fn test_cast_null_array_from_and_to_others() { + macro_rules! typed_test { + ($ARR_TYPE:ident, $DATATYPE:ident, $TYPE:tt) => {{ + { + let array = Arc::new(NullArray::new(6)) as ArrayRef; + let expected = $ARR_TYPE::from(vec![None; 6]); + let cast_type = DataType::$DATATYPE; + let cast_array = cast(&array, &cast_type).expect("cast failed"); + let cast_array = as_primitive_array::<$TYPE>(&cast_array); + assert_eq!(cast_array.data_type(), &cast_type); + assert_eq!(cast_array, &expected); + } + { + let array = Arc::new($ARR_TYPE::from(vec![None; 4])) as ArrayRef; + let expected = NullArray::new(4); + let cast_array = cast(&array, &DataType::Null).expect("cast failed"); + let cast_array = as_null_array(&cast_array); + assert_eq!(cast_array.data_type(), &DataType::Null); + assert_eq!(cast_array, &expected); + } + }}; + } - let expected = Int32Array::from(vec![None; 6]); + typed_test!(Int16Array, Int16, Int16Type); + typed_test!(Int32Array, Int32, Int32Type); + typed_test!(Int64Array, Int64, Int64Type); - // Cast to a dictionary (same value type, Utf8) - let cast_type = DataType::Int32; - let cast_array = cast(&array, &cast_type).expect("cast failed"); - let cast_array = as_primitive_array::(&cast_array); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(cast_array, &expected); + typed_test!(UInt16Array, UInt16, UInt16Type); + typed_test!(UInt32Array, UInt32, UInt32Type); + typed_test!(UInt64Array, UInt64, UInt64Type); + + typed_test!(Float32Array, Float32, Float32Type); + typed_test!(Float64Array, Float64, Float64Type); } /// Print the `DictionaryArray` `array` as a vector of strings