Skip to content

Commit

Permalink
Implement native stringview support for BTRIM (#11920)
Browse files Browse the repository at this point in the history
* add utf8view support for generic_trim

* add utf8view support for BTRIM

* stop LTRIM and RTRIM from complaining generic_trim missing args

* add tests to cover utf8view support of BTRIM

* fix typo and tiny err

* remove useless imports
  • Loading branch information
Kev1n8 authored Aug 12, 2024
1 parent b60cdc7 commit 34ec9d4
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 12 deletions.
24 changes: 18 additions & 6 deletions datafusion/functions/src/string/btrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
// under the License.

use arrow::array::{ArrayRef, OffsetSizeTrait};
use std::any::Any;

use arrow::datatypes::DataType;
use std::any::Any;

use datafusion_common::{exec_err, Result};
use datafusion_expr::function::Hint;
Expand All @@ -32,7 +31,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed.
/// btrim('xyxtrimyyx', 'xyz') = 'trim'
fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Both)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Both, use_string_view)
}

#[derive(Debug)]
Expand All @@ -52,7 +52,16 @@ impl BTrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
// Exact(vec![Utf8, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
aliases: vec![String::from("trim")],
Expand All @@ -79,15 +88,18 @@ impl ScalarUDFImpl for BTrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
btrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
btrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function btrim"),
other => exec_err!(
"Unsupported data type {other:?} for function btrim,\
expected for Utf8, LargeUtf8 or Utf8View."
),
}
}

Expand Down
78 changes: 75 additions & 3 deletions datafusion/functions/src/string/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use arrow::array::{
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
use arrow::datatypes::DataType;

use datafusion_common::cast::as_generic_string_array;
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::Result;
use datafusion_common::{exec_err, ScalarValue};
use datafusion_expr::ColumnarValue;
Expand All @@ -49,6 +49,7 @@ impl Display for TrimType {
pub(crate) fn general_trim<T: OffsetSizeTrait>(
args: &[ArrayRef],
trim_type: TrimType,
use_string_view: bool,
) -> Result<ArrayRef> {
let func = match trim_type {
TrimType::Left => |input, pattern: &str| {
Expand All @@ -68,6 +69,74 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
},
};

if use_string_view {
string_view_trim::<T>(trim_type, func, args)
} else {
string_trim::<T>(trim_type, func, args)
}
}

// removing 'a will cause compiler complaining lifetime of `func`
fn string_view_trim<'a, T: OffsetSizeTrait>(
trim_type: TrimType,
func: fn(&'a str, &'a str) -> &'a str,
args: &'a [ArrayRef],
) -> Result<ArrayRef> {
let string_array = as_string_view_array(&args[0])?;

match args.len() {
1 => {
let result = string_array
.iter()
.map(|string| string.map(|string: &str| func(string, " ")))
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}
2 => {
let characters_array = as_string_view_array(&args[1])?;

if characters_array.len() == 1 {
if characters_array.is_null(0) {
return Ok(new_null_array(
// The schema is expecting utf8 as null
&DataType::Utf8,
string_array.len(),
));
}

let characters = characters_array.value(0);
let result = string_array
.iter()
.map(|item| item.map(|string| func(string, characters)))
.collect::<GenericStringArray<T>>();
return Ok(Arc::new(result) as ArrayRef);
}

let result = string_array
.iter()
.zip(characters_array.iter())
.map(|(string, characters)| match (string, characters) {
(Some(string), Some(characters)) => Some(func(string, characters)),
_ => None,
})
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}
other => {
exec_err!(
"{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
)
}
}
}

fn string_trim<'a, T: OffsetSizeTrait>(
trim_type: TrimType,
func: fn(&'a str, &'a str) -> &'a str,
args: &'a [ArrayRef],
) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;

match args.len() {
Expand All @@ -84,7 +153,10 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(

if characters_array.len() == 1 {
if characters_array.is_null(0) {
return Ok(new_null_array(args[0].data_type(), args[0].len()));
return Ok(new_null_array(
string_array.data_type(),
string_array.len(),
));
}

let characters = characters_array.value(0);
Expand All @@ -109,7 +181,7 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
other => {
exec_err!(
"{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
)
)
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/functions/src/string/ltrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
/// ltrim('zzzytest', 'xyz') = 'test'
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Left)
general_trim::<T>(args, TrimType::Left, false)
}

#[derive(Debug)]
Expand Down
2 changes: 1 addition & 1 deletion datafusion/functions/src/string/rtrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
/// rtrim('testxxzx', 'xyz') = 'test'
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Right)
general_trim::<T>(args, TrimType::Right, false)
}

#[derive(Debug)]
Expand Down
37 changes: 36 additions & 1 deletion datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -563,15 +563,50 @@ SELECT
228 0 NULL

## Ensure no casts for BTRIM
# Test BTRIM with Utf8View input
query TT
EXPLAIN SELECT
BTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: btrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test BTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
BTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l
01)Projection: btrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test BTRIM with Utf8View bytes longer than 12
query TT
EXPLAIN SELECT
BTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: btrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test BTRIM outputs
query TTTT
SELECT
BTRIM(column1_utf8view, 'foo') AS l1,
BTRIM(column1_utf8view, 'A') AS l2,
BTRIM(column1_utf8view) AS l3,
BTRIM(column1_utf8view, NULL) AS l4
FROM test;
----
Andrew ndrew Andrew NULL
Xiangpeng Xiangpeng Xiangpeng NULL
Raphael Raphael Raphael NULL
NULL NULL NULL NULL

## Ensure no casts for CHARACTER_LENGTH
query TT
EXPLAIN SELECT
Expand Down

0 comments on commit 34ec9d4

Please sign in to comment.