Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add native stringview support for LTRIM & RTRIM #11948

Merged
merged 5 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions datafusion/functions/src/string/btrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ impl BTrimFunc {
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
// Exact(vec![Utf8, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
Expand Down Expand Up @@ -98,7 +97,7 @@ impl ScalarUDFImpl for BTrimFunc {
)(args),
other => exec_err!(
"Unsupported data type {other:?} for function btrim,\
expected for Utf8, LargeUtf8 or Utf8View."
expected Utf8, LargeUtf8 or Utf8View."
),
}
}
Expand Down
20 changes: 16 additions & 4 deletions datafusion/functions/src/string/ltrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
/// ltrim('zzzytest', 'xyz') = 'test'
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Left, false)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Left, use_string_view)
}

#[derive(Debug)]
Expand All @@ -51,7 +52,15 @@ impl LtrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
}
Expand All @@ -77,15 +86,18 @@ impl ScalarUDFImpl for LtrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
ltrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
ltrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function ltrim"),
// A trailing `\` inside a string literal swallows the newline AND all leading
// whitespace of the following line, so the separating space has to sit before
// the backslash; otherwise the message renders as "...ltrim,expected Utf8...".
other => exec_err!(
"Unsupported data type {other:?} for function ltrim, \
expected Utf8, LargeUtf8 or Utf8View."
),
}
}
}
20 changes: 16 additions & 4 deletions datafusion/functions/src/string/rtrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
/// rtrim('testxxzx', 'xyz') = 'test'
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Right, false)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Right, use_string_view)
}

#[derive(Debug)]
Expand All @@ -51,7 +52,15 @@ impl RtrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
}
Expand All @@ -77,15 +86,18 @@ impl ScalarUDFImpl for RtrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
rtrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
rtrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function rtrim"),
// A trailing `\` inside a string literal swallows the newline AND all leading
// whitespace of the following line, so the separating space has to sit before
// the backslash; otherwise the message renders as "...rtrim,expected Utf8...".
other => exec_err!(
"Unsupported data type {other:?} for function rtrim, \
expected Utf8, LargeUtf8 or Utf8View."
),
}
}
}
128 changes: 93 additions & 35 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,99 @@ Xiangpeng Xiangpeng Xiangpeng NULL
Raphael Raphael Raphael NULL
NULL NULL NULL NULL

## Ensure no casts for LTRIM
# Test LTRIM with Utf8View input
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM with Utf8View bytes longer than 12
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM outputs
query TTTTT
SELECT
LTRIM(column1_utf8view, 'foo') AS l1,
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
LTRIM(column1_utf8view, column2_utf8view) AS l2,
LTRIM(column1_utf8view) AS l3,
LTRIM(column1_utf8view, NULL) AS l4,
LTRIM(column1_utf8view, 'Xiang') AS l5
FROM test;
----
Andrew Andrew Andrew NULL Andrew
Xiangpeng (empty) Xiangpeng NULL peng
Raphael aphael Raphael NULL Raphael
NULL NULL NULL NULL NULL

## Ensure no casts for RTRIM
# Test RTRIM with Utf8View input
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM with Utf8View bytes longer than 12
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM outputs
query TTTTT
SELECT
RTRIM(column1_utf8view, 'foo') AS l1,
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
RTRIM(column1_utf8view, column2_utf8view) AS l2,
RTRIM(column1_utf8view) AS l3,
RTRIM(column1_utf8view, NULL) AS l4,
RTRIM(column1_utf8view, 'peng') As l5
FROM test;
----
Andrew Andrew Andrew NULL Andrew
Xiangpeng (empty) Xiangpeng NULL Xia
Raphael Raphael Raphael NULL Raphael
NULL NULL NULL NULL NULL


## Ensure no casts for CHARACTER_LENGTH
query TT
EXPLAIN SELECT
Expand Down Expand Up @@ -685,16 +778,6 @@ logical_plan
01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LTRIM
## TODO https://github.com/apache/datafusion/issues/11856
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view) as c1
FROM test;
----
logical_plan
01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LPAD
## TODO https://github.com/apache/datafusion/issues/11857
Expand Down Expand Up @@ -795,18 +878,6 @@ logical_plan
01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for RTRIM
## TODO file ticket
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) as c1,
RTRIM(column1_utf8view, 'foo') as c2
FROM test;
----
logical_plan
01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1
03)----TableScan: test projection=[column1_utf8view]

## Ensure no casts for RIGHT
## TODO file ticket
Expand All @@ -833,19 +904,6 @@ logical_plan
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]


## Ensure no casts for RTRIM
## TODO file ticket
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) as c,
RTRIM(column1_utf8view, column2_utf8view) as c1
FROM test;
----
logical_plan
01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SPLIT_PART
## TODO file ticket
query TT
Expand Down