Skip to content

Commit

Permalink
Backport: Fix incorrect ... LIKE '%' simplification with NULLs (apa…
Browse files Browse the repository at this point in the history
…che#13259)

* Fix incorrect `... LIKE '%'` simplification

`expr LIKE '%'` was previously simplified to `true`, but the expression
returns `NULL` when `expr` is null.
The conversion was conditional on `!is_null(expr)` which means "is not
always true, i.e. is not a null literal".

This commit adds correct simplification logic. It additionally expands
the rule coverage to include string view (Utf8View) and large string
(LargeUtf8). This allows writing shared test cases even despite
`utf8_view LIKE '%'` returning incorrect results at execution time
(tracked by apache#12637). I.e. the
simplification masks the bug for cases where pattern is statically
known.

* fixup! Fix incorrect `... LIKE '%'` simplification

* fix tests (re review comments)
  • Loading branch information
findepi committed Nov 6, 2024
1 parent 7a13a9b commit f97e57d
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 16 deletions.
67 changes: 51 additions & 16 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1458,13 +1458,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
negated,
escape_char: _,
case_insensitive: _,
}) if !is_null(&expr)
&& matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
) =>
}) if matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
) || matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%"
) || matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%"
) =>
{
Transformed::yes(lit(!negated))
// exp LIKE '%' is
// - when exp is not NULL, it's true
// - when exp is NULL, it's NULL
// exp NOT LIKE '%' is
// - when exp is not NULL, it's false
// - when exp is NULL, it's NULL
Transformed::yes(Expr::Case(Case {
expr: Some(Box::new(Expr::IsNotNull(expr))),
when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))],
else_expr: None,
}))
}

// a is not null/unknown --> true (if a is not nullable)
Expand Down Expand Up @@ -2759,10 +2774,22 @@ mod tests {
assert_no_change(regex_match(col("c1"), lit("f_o")));

// empty cases
assert_change(regex_match(col("c1"), lit("")), lit(true));
assert_change(regex_not_match(col("c1"), lit("")), lit(false));
assert_change(regex_imatch(col("c1"), lit("")), lit(true));
assert_change(regex_not_imatch(col("c1"), lit("")), lit(false));
assert_change(
regex_match(col("c1"), lit("")),
if_not_null(col("c1"), true),
);
assert_change(
regex_not_match(col("c1"), lit("")),
if_not_null(col("c1"), false),
);
assert_change(
regex_imatch(col("c1"), lit("")),
if_not_null(col("c1"), true),
);
assert_change(
regex_not_imatch(col("c1"), lit("")),
if_not_null(col("c1"), false),
);

// single character
assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
Expand Down Expand Up @@ -3588,20 +3615,20 @@ mod tests {

#[test]
fn test_like_and_ilke() {
// test non-null values
// LIKE '%'
let expr = like(col("c1"), "%");
assert_eq!(simplify(expr), lit(true));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_like(col("c1"), "%");
assert_eq!(simplify(expr), lit(false));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

let expr = ilike(col("c1"), "%");
assert_eq!(simplify(expr), lit(true));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_ilike(col("c1"), "%");
assert_eq!(simplify(expr), lit(false));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

// test null values
// null_constant LIKE '%'
let null = lit(ScalarValue::Utf8(None));
let expr = like(null.clone(), "%");
assert_eq!(simplify(expr), lit_bool_null());
Expand Down Expand Up @@ -3918,4 +3945,12 @@ mod tests {
unimplemented!("not needed for tests")
}
}

fn if_not_null(expr: Expr, then: bool) -> Expr {
Expr::Case(Case {
expr: Some(expr.is_not_null().into()),
when_then_expr: vec![(lit(true).into(), lit(then).into())],
else_expr: None,
})
}
}
85 changes: 85 additions & 0 deletions datafusion/sqllogictest/test_files/strings.slt
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,90 @@ SELECT s::VARCHAR(2) FROM vals
abc
def


query BBB
SELECT
NULL LIKE '%',
'' LIKE '%',
'a' LIKE '%'
----
NULL true true

statement ok
create table test_source as values
('Andrew', 'X', 'datafusion📊🔥', '🔥'),
('Xiangpeng', 'Xiangpeng', 'datafusion数据融合', 'datafusion数据融合'),
('Raphael', 'R', 'datafusionДатаФусион', 'аФус'),
('under_score', 'un_____core', 'un iść core', 'chrząszcz na łące w 東京都'),
('percent', 'p%t', 'pan Tadeusz ma iść w kąt', 'Pan Tadeusz ma frunąć stąd w kąt'),
('', '%', '', ''),
(NULL, '%', NULL, NULL),
(NULL, 'R', NULL, '🔥');

statement ok
create table test_basic_operator as
select
arrow_cast(column1, 'Utf8') as ascii_1,
arrow_cast(column2, 'Utf8') as ascii_2,
arrow_cast(column3, 'Utf8') as unicode_1,
arrow_cast(column4, 'Utf8') as unicode_2
from test_source;

# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections
query TTBB
SELECT ascii_1, unicode_1,
ascii_1 LIKE '%' AS ascii_1_like_percent,
unicode_1 LIKE '%' AS unicode_1_like_percent
-- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
-- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
FROM test_basic_operator
----
Andrew datafusion📊🔥 true true
Xiangpeng datafusion数据融合 true true
Raphael datafusionДатаФусион true true
under_score un iść core true true
percent pan Tadeusz ma iść w kąt true true
(empty) (empty) true true
NULL NULL NULL NULL
NULL NULL NULL NULL

# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections
query TTBB
SELECT ascii_1, unicode_1,
ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent,
unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent
-- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
-- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
FROM test_basic_operator
----
Andrew datafusion📊🔥 false false
Xiangpeng datafusion数据融合 false false
Raphael datafusionДатаФусион false false
under_score un iść core false false
percent pan Tadeusz ma iść w kąt false false
(empty) (empty) false false
NULL NULL NULL NULL
NULL NULL NULL NULL

query T
SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%'
----
Andrew
Xiangpeng
Raphael
under_score
percent
(empty)

query T
SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%'
----

statement ok
drop table test_basic_operator;

statement ok
drop table test_source;

statement ok
drop table vals;

0 comments on commit f97e57d

Please sign in to comment.