Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement native stringview support for BTRIM #11920

Merged
merged 6 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions datafusion/functions/src/string/btrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
// under the License.

use arrow::array::{ArrayRef, OffsetSizeTrait};
use std::any::Any;

use arrow::datatypes::DataType;
use std::any::Any;

use datafusion_common::{exec_err, Result};
use datafusion_expr::function::Hint;
Expand All @@ -32,7 +31,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed.
/// btrim('xyxtrimyyx', 'xyz') = 'trim'
fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Both)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Both, use_string_view)
}

#[derive(Debug)]
Expand All @@ -52,7 +52,16 @@ impl BTrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
// Exact(vec![Utf8, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
aliases: vec![String::from("trim")],
Expand All @@ -79,15 +88,18 @@ impl ScalarUDFImpl for BTrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
btrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
btrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function btrim"),
other => exec_err!(
"Unsupported data type {other:?} for function btrim,\
expected for Utf8, LargeUtf8 or Utf8View."
),
}
}

Expand Down
78 changes: 75 additions & 3 deletions datafusion/functions/src/string/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use arrow::array::{
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
use arrow::datatypes::DataType;

use datafusion_common::cast::as_generic_string_array;
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::Result;
use datafusion_common::{exec_err, ScalarValue};
use datafusion_expr::ColumnarValue;
Expand All @@ -49,6 +49,7 @@ impl Display for TrimType {
pub(crate) fn general_trim<T: OffsetSizeTrait>(
args: &[ArrayRef],
trim_type: TrimType,
use_string_view: bool,
) -> Result<ArrayRef> {
let func = match trim_type {
TrimType::Left => |input, pattern: &str| {
Expand All @@ -68,6 +69,74 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
},
};

if use_string_view {
string_view_trim::<T>(trim_type, func, args)
} else {
string_trim::<T>(trim_type, func, args)
}
}

// removing 'a will cause compiler complaining lifetime of `func`
fn string_view_trim<'a, T: OffsetSizeTrait>(
Copy link
Contributor

@alamb alamb Aug 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation is an improvement -- however, it will now copy the string values to a new StringArray

I think if the function generated StringView output directly it would be possible to avoid all string copies (and only adjust the views).

I can file this as a follow on optimization idea.

I noticed this on #11790

trim_type: TrimType,
func: fn(&'a str, &'a str) -> &'a str,
args: &'a [ArrayRef],
) -> Result<ArrayRef> {
let string_array = as_string_view_array(&args[0])?;

match args.len() {
1 => {
let result = string_array
.iter()
.map(|string| string.map(|string: &str| func(string, " ")))
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}
2 => {
let characters_array = as_string_view_array(&args[1])?;

if characters_array.len() == 1 {
if characters_array.is_null(0) {
return Ok(new_null_array(
// The schema is expecting utf8 as null
&DataType::Utf8,
string_array.len(),
));
}

let characters = characters_array.value(0);
let result = string_array
.iter()
.map(|item| item.map(|string| func(string, characters)))
.collect::<GenericStringArray<T>>();
return Ok(Arc::new(result) as ArrayRef);
}

let result = string_array
.iter()
.zip(characters_array.iter())
.map(|(string, characters)| match (string, characters) {
(Some(string), Some(characters)) => Some(func(string, characters)),
_ => None,
})
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}
other => {
exec_err!(
"{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
)
}
}
}

fn string_trim<'a, T: OffsetSizeTrait>(
trim_type: TrimType,
func: fn(&'a str, &'a str) -> &'a str,
args: &'a [ArrayRef],
) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;

match args.len() {
Expand All @@ -84,7 +153,10 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(

if characters_array.len() == 1 {
if characters_array.is_null(0) {
return Ok(new_null_array(args[0].data_type(), args[0].len()));
return Ok(new_null_array(
string_array.data_type(),
string_array.len(),
));
}

let characters = characters_array.value(0);
Expand All @@ -109,7 +181,7 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
other => {
exec_err!(
"{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
)
)
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/functions/src/string/ltrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
/// ltrim('zzzytest', 'xyz') = 'test'
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Left)
general_trim::<T>(args, TrimType::Left, false)
}

#[derive(Debug)]
Expand Down
2 changes: 1 addition & 1 deletion datafusion/functions/src/string/rtrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
/// rtrim('testxxzx', 'xyz') = 'test'
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Right)
general_trim::<T>(args, TrimType::Right, false)
}

#[derive(Debug)]
Expand Down
37 changes: 36 additions & 1 deletion datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -519,15 +519,50 @@ SELECT
228 0 NULL

## Ensure no casts for BTRIM
# Test BTRIM with Utf8View input
query TT
EXPLAIN SELECT
BTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: btrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test BTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
BTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l
01)Projection: btrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test BTRIM with Utf8View bytes longer than 12
query TT
EXPLAIN SELECT
BTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: btrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test BTRIM outputs
query TTTT
SELECT
BTRIM(column1_utf8view, 'foo') AS l1,
BTRIM(column1_utf8view, 'A') AS l2,
BTRIM(column1_utf8view) AS l3,
BTRIM(column1_utf8view, NULL) AS l4
FROM test;
----
Andrew ndrew Andrew NULL
Xiangpeng Xiangpeng Xiangpeng NULL
Raphael Raphael Raphael NULL
NULL NULL NULL NULL

## Ensure no casts for CHARACTER_LENGTH
query TT
EXPLAIN SELECT
Expand Down