From 60ff4f6af30281feea48e1967a9f828ab18b33f8 Mon Sep 17 00:00:00 2001 From: Thinh Bui Date: Thu, 15 Aug 2024 00:38:20 -0700 Subject: [PATCH 1/7] draft implement utf8_view for replace --- datafusion/functions/src/string/replace.rs | 94 ++++++++++++++++++- .../sqllogictest/test_files/string_view.slt | 5 +- 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 4cebbba839fa..4f9928a836fc 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -18,16 +18,16 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use crate::utils::make_scalar_function; #[derive(Debug)] pub struct ReplaceFunc { @@ -66,13 +66,24 @@ impl ScalarUDFImpl for ReplaceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "replace") + match arg_types[0].clone() { + DataType::Utf8 => return Ok(DataType::Utf8), + DataType::LargeUtf8 => return Ok(DataType::LargeUtf8), + DataType::Utf8View => { + println!("utf8view"); + return Ok(DataType::Utf8View); + } + other => { + exec_err!("Unsupported data type {other:?} for function replace") + } + } } fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { DataType::Utf8 => make_scalar_function(replace::, vec![])(args), DataType::LargeUtf8 => make_scalar_function(replace::, vec![])(args), + DataType::Utf8View => make_scalar_function(replace_view, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function replace") } @@ -80,6 +91,24 @@ impl ScalarUDFImpl for ReplaceFunc { } } +fn replace_view(args: &[ArrayRef]) -> Result { + let string_array = as_string_view_array(&args[0])?; + let from_array = as_string_view_array(&args[1])?; + let to_array = as_string_view_array(&args[2])?; + + let result = string_array + .iter() + .zip(from_array.iter()) + .zip(to_array.iter()) + .map(|((string, from), to)| match (string, from, to) { + (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), + _ => None, + }) + .collect::(); + + Ok(Arc::new(result) as ArrayRef) + +} /// Replaces all occurrences in string of substring from with substring to. /// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' fn replace(args: &[ArrayRef]) -> Result { @@ -100,4 +129,59 @@ fn replace(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } -mod test {} +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::test::test_function; + use arrow::array::Array; + use arrow::array::LargeStringArray; + use arrow::array::StringArray; + use arrow::array::StringViewArray; + use arrow::datatypes::DataType::{Utf8, LargeUtf8, Utf8View}; + use datafusion_common::ScalarValue; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + ReplaceFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("aabbdqcbb")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("bb")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ccc")))), + ], + Ok(Some("aacccdqcccc")), + &str, + Utf8, + StringArray + ); + + test_function!( + ReplaceFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("aabbb")))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("bbb")))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("cc")))), + ], + Ok(Some("aacc")), + &str, + LargeUtf8, + LargeStringArray + ); + + test_function!( + ReplaceFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("aabbbcw")))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("bb")))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("cc")))), + ], + Ok(Some("aaccbcw")), + &str, + Utf8View, + StringViewArray + ); + + Ok(()) + } + +} diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index e7166690580f..4b515683578c 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -718,9 +718,8 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: replace(__common_expr_1, Utf8("foo"), Utf8("bar")) AS c1, replace(__common_expr_1, CAST(test.column2_utf8view AS Utf8), Utf8("bar")) AS c2 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] +01)Projection: replace(test.column1_utf8view, Utf8("foo"), Utf8("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8("bar")) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for REVERSE ## TODO file ticket From 7a3569b7096c111bf2e7f9e12467334d13de253f Mon Sep 17 00:00:00 2001 From: Thinh Bui Date: Fri, 16 Aug 2024 00:17:32 -0700 Subject: [PATCH 2/7] add function signature --- datafusion/functions/src/string/replace.rs | 11 ++++++----- datafusion/sqllogictest/test_files/string_view.slt | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 4f9928a836fc..e1338a918977 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -45,7 +45,11 @@ impl ReplaceFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Utf8, Utf8])], + vec![ + Exact(vec![Utf8View, Utf8View, Utf8View]), + Exact(vec![Utf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]) + ], Volatility::Immutable, ), } @@ -69,10 +73,7 @@ impl ScalarUDFImpl for ReplaceFunc { match arg_types[0].clone() { DataType::Utf8 => return Ok(DataType::Utf8), DataType::LargeUtf8 => return Ok(DataType::LargeUtf8), - DataType::Utf8View => { - println!("utf8view"); - return Ok(DataType::Utf8View); - } + DataType::Utf8View => return Ok(DataType::Utf8View), other => { exec_err!("Unsupported data type {other:?} for function replace") } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 4b515683578c..c356612da3b6 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -718,7 +718,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: replace(test.column1_utf8view, Utf8("foo"), Utf8("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8("bar")) AS c2 +01)Projection: replace(test.column1_utf8view, Utf8View("foo"), Utf8View("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8View("bar")) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for REVERSE From b08f96eb593016461fe3f28119481e5ec5aade95 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Aug 2024 06:16:45 -0400 Subject: [PATCH 3/7] Add sql test --- datafusion/sqllogictest/test_files/string_view.slt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index c356612da3b6..a97ed38349a5 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -697,7 +697,6 @@ logical_plan 01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k 02)--TableScan: test projection=[column1_utf8view] - ## Ensure no casts for REPEAT ## TODO file ticket query TT @@ -710,7 +709,6 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REPLACE -## TODO file ticket query TT EXPLAIN SELECT REPLACE(column1_utf8view, 'foo', 'bar') as c1, @@ -721,6 +719,18 @@ logical_plan 01)Projection: replace(test.column1_utf8view, Utf8View("foo"), Utf8View("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8View("bar")) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] +query ?? +SELECT + REPLACE(column1_utf8view, 'foo', 'bar') as c1, + REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2 +FROM test; +---- +Andrew Andrew +Xiangpeng bar +Raphael baraphael +NULL NULL + + ## Ensure no casts for REVERSE ## TODO file ticket query TT From ab85d92907b8aa43033a239f62879c94e673ee70 Mon Sep 17 00:00:00 2001 From: Thinh Bui Date: Fri, 16 Aug 2024 12:15:21 -0700 Subject: [PATCH 4/7] move macro util to replace function --- datafusion/functions/src/string/replace.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index e1338a918977..e14add2de815 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray}; use arrow::datatypes::DataType; +use arrow::ipc::LargeUtf8; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::*; @@ -74,6 +75,19 @@ impl ScalarUDFImpl for ReplaceFunc { DataType::Utf8 => return Ok(DataType::Utf8), DataType::LargeUtf8 => return Ok(DataType::LargeUtf8), DataType::Utf8View => return Ok(DataType::Utf8View), + DataType::Dictionary(_, value_type) => match *value_type { + DataType::LargeUtf8 | DataType::LargeBinary => return Ok(DataType::LargeUtf8), + DataType::Utf8 | DataType::Binary => return Ok(DataType::Utf8), + DataType::Utf8View | DataType::BinaryView => return Ok(DataType::Utf8View), + DataType::Null => return Ok(DataType::Null), + _ => { + return exec_err!( + "The replace function can only accept strings, but got {:?}.", + *value_type + ); + } + }, + DataType::Null => return Ok(DataType::Null), other => { exec_err!("Unsupported data type {other:?} for function replace") } From 8ce402885efa15479c005d4dc4040d62bb0f04c4 Mon Sep 17 00:00:00 2001 From: Thinh Bui Date: Fri, 16 Aug 2024 12:16:34 -0700 Subject: [PATCH 5/7] remove unused import --- datafusion/functions/src/string/replace.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index e14add2de815..5b12fabbf24b 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -21,7 +21,6 @@ use std::sync::Arc; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray}; use arrow::datatypes::DataType; -use arrow::ipc::LargeUtf8; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::*; From b948b0fa2a223a60425c43436fa6e3325358b27a Mon Sep 17 00:00:00 2001 From: Thinh Bui Date: Fri, 16 Aug 2024 15:08:37 -0700 Subject: [PATCH 6/7] rust format --- datafusion/functions/src/string/replace.rs | 29 +++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 5b12fabbf24b..c91afaf84c10 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -46,9 +46,9 @@ impl ReplaceFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]) + Exact(vec![Utf8View, Utf8View, Utf8View]), + Exact(vec![Utf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), @@ -75,9 +75,13 @@ impl ScalarUDFImpl for ReplaceFunc { DataType::LargeUtf8 => return Ok(DataType::LargeUtf8), DataType::Utf8View => return Ok(DataType::Utf8View), DataType::Dictionary(_, value_type) => match *value_type { - DataType::LargeUtf8 | DataType::LargeBinary => return Ok(DataType::LargeUtf8), + DataType::LargeUtf8 | DataType::LargeBinary => { + return Ok(DataType::LargeUtf8) + } DataType::Utf8 | DataType::Binary => return Ok(DataType::Utf8), - DataType::Utf8View | DataType::BinaryView => return Ok(DataType::Utf8View), + DataType::Utf8View | DataType::BinaryView => { + return Ok(DataType::Utf8View) + } DataType::Null => return Ok(DataType::Null), _ => { return exec_err!( @@ -106,7 +110,7 @@ impl ScalarUDFImpl for ReplaceFunc { } fn replace_view(args: &[ArrayRef]) -> Result { - let string_array = as_string_view_array(&args[0])?; + let string_array = as_string_view_array(&args[0])?; let from_array = as_string_view_array(&args[1])?; let to_array = as_string_view_array(&args[2])?; @@ -121,7 +125,6 @@ fn replace_view(args: &[ArrayRef]) -> Result { .collect::(); Ok(Arc::new(result) as ArrayRef) - } /// Replaces all occurrences in string of substring from with substring to. /// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' @@ -151,9 +154,8 @@ mod tests { use arrow::array::LargeStringArray; use arrow::array::StringArray; use arrow::array::StringViewArray; - use arrow::datatypes::DataType::{Utf8, LargeUtf8, Utf8View}; + use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; use datafusion_common::ScalarValue; - #[test] fn test_functions() -> Result<()> { test_function!( @@ -172,7 +174,9 @@ mod tests { test_function!( ReplaceFunc::new(), &[ - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("aabbb")))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from( + "aabbb" + )))), ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("bbb")))), ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("cc")))), ], @@ -185,7 +189,9 @@ mod tests { test_function!( ReplaceFunc::new(), &[ - ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("aabbbcw")))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "aabbbcw" + )))), ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("bb")))), ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("cc")))), ], @@ -197,5 +203,4 @@ mod tests { Ok(()) } - } From d26f21ae00c04fbdb818a2ac152fb59fb1423a67 Mon Sep 17 00:00:00 2001 From: Thinh Bui Date: Sun, 18 Aug 2024 14:42:47 -0700 Subject: [PATCH 7/7] change return type from utf8view to utf8 --- datafusion/functions/src/string/replace.rs | 39 ++++--------------- .../sqllogictest/test_files/functions.slt | 10 +++++ .../sqllogictest/test_files/string_view.slt | 2 +- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index c91afaf84c10..13fa3d55672d 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray}; +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; use arrow::datatypes::DataType; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; @@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::make_scalar_function; +use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] pub struct ReplaceFunc { @@ -70,31 +70,7 @@ impl ScalarUDFImpl for ReplaceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - match arg_types[0].clone() { - DataType::Utf8 => return Ok(DataType::Utf8), - DataType::LargeUtf8 => return Ok(DataType::LargeUtf8), - DataType::Utf8View => return Ok(DataType::Utf8View), - DataType::Dictionary(_, value_type) => match *value_type { - DataType::LargeUtf8 | DataType::LargeBinary => { - return Ok(DataType::LargeUtf8) - } - DataType::Utf8 | DataType::Binary => return Ok(DataType::Utf8), - DataType::Utf8View | DataType::BinaryView => { - return Ok(DataType::Utf8View) - } - DataType::Null => return Ok(DataType::Null), - _ => { - return exec_err!( - "The replace function can only accept strings, but got {:?}.", - *value_type - ); - } - }, - DataType::Null => return Ok(DataType::Null), - other => { - exec_err!("Unsupported data type {other:?} for function replace") - } - } + utf8_to_str_type(&arg_types[0], "replace") } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -122,7 +98,7 @@ fn replace_view(args: &[ArrayRef]) -> Result { (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), _ => None, }) - .collect::(); + .collect::(); Ok(Arc::new(result) as ArrayRef) } @@ -153,8 +129,7 @@ mod tests { use arrow::array::Array; use arrow::array::LargeStringArray; use arrow::array::StringArray; - use arrow::array::StringViewArray; - use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; + use arrow::datatypes::DataType::{LargeUtf8, Utf8}; use datafusion_common::ScalarValue; #[test] fn test_functions() -> Result<()> { @@ -197,8 +172,8 @@ mod tests { ], Ok(Some("aaccbcw")), &str, - Utf8View, - StringViewArray + Utf8, + StringArray ); Ok(()) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index cb592fdda0c8..426123dd9be1 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -826,6 +826,16 @@ SELECT replace(arrow_cast('foobar', 'Dictionary(Int32, Utf8)'), 'bar', 'hello') ---- foohello +query T +SELECT replace(arrow_cast('foobar', 'Utf8View'), arrow_cast('bar', 'Utf8View'), arrow_cast('hello', 'Utf8View')) +---- +foohello + +query T +SELECT replace(arrow_cast('foobar', 'LargeUtf8'), arrow_cast('bar', 'LargeUtf8'), arrow_cast('hello', 'LargeUtf8')) +---- +foohello + query T SELECT rtrim(' foo ') ---- diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index f84a15896702..15e8fce20296 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -923,7 +923,7 @@ logical_plan 01)Projection: replace(test.column1_utf8view, Utf8View("foo"), Utf8View("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8View("bar")) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] -query ?? +query TT SELECT REPLACE(column1_utf8view, 'foo', 'bar') as c1, REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2