Skip to content

Commit

Permalink
Expand LIKE simplification
Browse files Browse the repository at this point in the history
- cover expression known not to be null
- cover NULL pattern
- cover repeated '%%' in pattern
  • Loading branch information
findepi committed Nov 5, 2024
1 parent f4798a1 commit be26107
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 91 deletions.
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/optimizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ indexmap = { workspace = true }
itertools = { workspace = true }
log = { workspace = true }
paste = "1.0.14"
regex = "1.11.0"
regex-syntax = "0.8.0"

[dev-dependencies]
Expand Down
260 changes: 196 additions & 64 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use crate::analyzer::type_coercion::TypeCoercionRewriter;
use crate::simplify_expressions::guarantees::GuaranteeRewriter;
use crate::simplify_expressions::regex::simplify_regex_expr;
use crate::simplify_expressions::SimplifyInfo;
use regex::Regex;

use super::inlist_simplifier::ShortenInListSimplifier;
use super::utils::*;
Expand Down Expand Up @@ -1470,34 +1471,54 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
}) => Transformed::yes(simplify_regex_expr(left, op, right)?),

// Rules for Like
Expr::Like(Like {
expr,
pattern,
negated,
escape_char: _,
case_insensitive: _,
}) if matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
) || matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%"
) || matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%"
) =>
{
// exp LIKE '%' is
// - when exp is not NULL, it's true
// - when exp is NULL, it's NULL
// exp NOT LIKE '%' is
// - when exp is not NULL, it's false
// - when exp is NULL, it's NULL
Transformed::yes(Expr::Case(Case {
expr: Some(Box::new(Expr::IsNotNull(expr))),
when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))],
else_expr: None,
}))
Expr::Like(like) => {
    // Simplifications for `expr [NOT] [I]LIKE <literal string pattern>`.
    // Patterns that are not string literals are returned unchanged.
    match as_string_scalar(&like.pattern) {
        Some((data_type, pattern_str)) => {
            match pattern_str {
                // A NULL pattern makes the whole predicate NULL,
                // regardless of `expr` and of negation.
                None => return Ok(Transformed::yes(lit_bool_null())),
                Some(pattern_str) if pattern_str == "%" => {
                    // exp LIKE '%' is
                    //   - when exp is not NULL, it's true
                    //   - when exp is NULL, it's NULL
                    // exp NOT LIKE '%' is
                    //   - when exp is not NULL, it's false
                    //   - when exp is NULL, it's NULL
                    let result_for_non_null = lit(!like.negated);
                    Transformed::yes(if !info.nullable(&like.expr)? {
                        // The input is known to be non-nullable, so the
                        // NULL branch can never be taken.
                        result_for_non_null
                    } else {
                        Expr::Case(Case {
                            expr: Some(Box::new(Expr::IsNotNull(like.expr))),
                            when_then_expr: vec![(
                                Box::new(lit(true)),
                                Box::new(result_for_non_null),
                            )],
                            else_expr: None,
                        })
                    })
                }
                // Consecutive `%` wildcards are redundant: '%%' --> '%'.
                //
                // This is only applied when the pattern does not contain the
                // escape character. With an escape present, a `%` may be a
                // literal (e.g. `\%%` is "literal % followed by a wildcard")
                // and blindly collapsing the run would change the meaning.
                //
                // NOTE(review): when `escape_char` is `None`, no character is
                // treated as an escape here. If the LIKE implementation applies
                // an implicit default escape, this guard needs to account for
                // it as well -- confirm.
                Some(pattern_str)
                    if pattern_str.contains("%%")
                        // TODO support more complete unescaping
                        && !matches!(
                            like.escape_char,
                            Some(escape) if pattern_str.contains(escape)
                        ) =>
                {
                    let simplified_pattern = Regex::new("%%+")
                        .expect("`%%+` is a valid regular expression")
                        .replace_all(pattern_str, "%")
                        .to_string();
                    Transformed::yes(Expr::Like(Like {
                        pattern: Box::new(to_string_scalar(
                            data_type,
                            Some(simplified_pattern),
                        )),
                        ..like
                    }))
                }
                // Any other literal pattern: nothing to simplify.
                Some(_pattern_str) => Transformed::no(Expr::Like(like)),
            }
        }
        None => Transformed::no(Expr::Like(like)),
    }
}

// a is not null/unknown --> true (if a is not nullable)
Expand Down Expand Up @@ -1696,6 +1717,24 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
}
}

/// Extracts the string type and value from a string literal expression.
///
/// Returns `Some((data_type, value))` when `expr` is a `Utf8`, `LargeUtf8` or
/// `Utf8View` literal, where `value` is a reference to the literal's
/// (possibly `None`, i.e. NULL) string. Returns `None` for every other
/// expression, including literals of other types.
fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
    // Anything that is not a literal cannot be a string scalar.
    let Expr::Literal(scalar) = expr else {
        return None;
    };

    match scalar {
        ScalarValue::Utf8(value) => Some((DataType::Utf8, value)),
        ScalarValue::LargeUtf8(value) => Some((DataType::LargeUtf8, value)),
        ScalarValue::Utf8View(value) => Some((DataType::Utf8View, value)),
        _ => None,
    }
}

/// Builds a string literal expression of the given string type.
///
/// `data_type` selects the literal variant (`Utf8`, `LargeUtf8` or
/// `Utf8View`) and `value` becomes its payload; `None` produces a NULL
/// literal of that type.
///
/// # Panics
///
/// Panics if `data_type` is not one of the three string types above. Callers
/// are expected to pass a type previously obtained from `as_string_scalar`.
fn to_string_scalar(data_type: DataType, value: Option<String>) -> Expr {
    let scalar = match data_type {
        DataType::Utf8 => ScalarValue::Utf8(value),
        DataType::LargeUtf8 => ScalarValue::LargeUtf8(value),
        DataType::Utf8View => ScalarValue::Utf8View(value),
        // Name the offending type so an invariant violation is diagnosable.
        other => unreachable!(
            "to_string_scalar called with non-string data type {:?}",
            other
        ),
    };
    Expr::Literal(scalar)
}

fn has_common_conjunction(lhs: &Expr, rhs: &Expr) -> bool {
let lhs_set: HashSet<&Expr> = iter_conjunction(lhs).collect();
iter_conjunction(rhs).any(|e| lhs_set.contains(&e) && !e.is_volatile())
Expand Down Expand Up @@ -2810,10 +2849,16 @@ mod tests {
);

// single character
assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
assert_change(
regex_match(col("c1"), lit("x")),
like(col("c1"), lit("%x%")),
);

// single word
assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));
assert_change(
regex_match(col("c1"), lit("foo")),
like(col("c1"), lit("%foo%")),
);

// regular expressions that match an exact literal
assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit("")));
Expand Down Expand Up @@ -2900,44 +2945,50 @@ mod tests {
assert_no_change(regex_match(col("c1"), lit("$foo^")));

// regular expressions that match a partial literal
assert_change(regex_match(col("c1"), lit("^foo")), like(col("c1"), "foo%"));
assert_change(regex_match(col("c1"), lit("foo$")), like(col("c1"), "%foo"));
assert_change(
regex_match(col("c1"), lit("^foo")),
like(col("c1"), lit("foo%")),
);
assert_change(
regex_match(col("c1"), lit("foo$")),
like(col("c1"), lit("%foo")),
);
assert_change(
regex_match(col("c1"), lit("^foo|bar$")),
like(col("c1"), "foo%").or(like(col("c1"), "%bar")),
like(col("c1"), lit("foo%")).or(like(col("c1"), lit("%bar"))),
);

// OR-chain
assert_change(
regex_match(col("c1"), lit("foo|bar|baz")),
like(col("c1"), "%foo%")
.or(like(col("c1"), "%bar%"))
.or(like(col("c1"), "%baz%")),
like(col("c1"), lit("%foo%"))
.or(like(col("c1"), lit("%bar%")))
.or(like(col("c1"), lit("%baz%"))),
);
assert_change(
regex_match(col("c1"), lit("foo|x|baz")),
like(col("c1"), "%foo%")
.or(like(col("c1"), "%x%"))
.or(like(col("c1"), "%baz%")),
like(col("c1"), lit("%foo%"))
.or(like(col("c1"), lit("%x%")))
.or(like(col("c1"), lit("%baz%"))),
);
assert_change(
regex_not_match(col("c1"), lit("foo|bar|baz")),
not_like(col("c1"), "%foo%")
.and(not_like(col("c1"), "%bar%"))
.and(not_like(col("c1"), "%baz%")),
not_like(col("c1"), lit("%foo%"))
.and(not_like(col("c1"), lit("%bar%")))
.and(not_like(col("c1"), lit("%baz%"))),
);
// both anchored expressions (translated to equality) and unanchored
assert_change(
regex_match(col("c1"), lit("foo|^x$|baz")),
like(col("c1"), "%foo%")
like(col("c1"), lit("%foo%"))
.or(col("c1").eq(lit("x")))
.or(like(col("c1"), "%baz%")),
.or(like(col("c1"), lit("%baz%"))),
);
assert_change(
regex_not_match(col("c1"), lit("foo|^bar$|baz")),
not_like(col("c1"), "%foo%")
not_like(col("c1"), lit("%foo%"))
.and(col("c1").not_eq(lit("bar")))
.and(not_like(col("c1"), "%baz%")),
.and(not_like(col("c1"), lit("%baz%"))),
);
// Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
Expand Down Expand Up @@ -2987,41 +3038,41 @@ mod tests {
})
}

fn like(expr: Expr, pattern: &str) -> Expr {
fn like(expr: Expr, pattern: impl Into<Expr>) -> Expr {
Expr::Like(Like {
negated: false,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
pattern: Box::new(pattern.into()),
escape_char: None,
case_insensitive: false,
})
}

fn not_like(expr: Expr, pattern: &str) -> Expr {
fn not_like(expr: Expr, pattern: impl Into<Expr>) -> Expr {
Expr::Like(Like {
negated: true,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
pattern: Box::new(pattern.into()),
escape_char: None,
case_insensitive: false,
})
}

fn ilike(expr: Expr, pattern: &str) -> Expr {
fn ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr {
Expr::Like(Like {
negated: false,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
pattern: Box::new(pattern.into()),
escape_char: None,
case_insensitive: true,
})
}

fn not_ilike(expr: Expr, pattern: &str) -> Expr {
fn not_ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr {
Expr::Like(Like {
negated: true,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
pattern: Box::new(pattern.into()),
escape_char: None,
case_insensitive: true,
})
Expand Down Expand Up @@ -3633,31 +3684,112 @@ mod tests {

#[test]
fn test_like_and_ilke() {
// LIKE '%'
let expr = like(col("c1"), "%");
let null = lit(ScalarValue::Utf8(None));

// expr [NOT] [I]LIKE NULL
let expr = like(col("c1"), null.clone());
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_like(col("c1"), null.clone());
assert_eq!(simplify(expr), lit_bool_null());

let expr = ilike(col("c1"), null.clone());
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_ilike(col("c1"), null.clone());
assert_eq!(simplify(expr), lit_bool_null());

// expr [NOT] [I]LIKE '%'
let expr = like(col("c1"), lit("%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_like(col("c1"), lit("%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

let expr = ilike(col("c1"), lit("%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_ilike(col("c1"), lit("%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

// expr [NOT] [I]LIKE '%%'
let expr = like(col("c1"), lit("%%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_like(col("c1"), "%");
let expr = not_like(col("c1"), lit("%%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

let expr = ilike(col("c1"), "%");
let expr = ilike(col("c1"), lit("%%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_ilike(col("c1"), "%");
let expr = not_ilike(col("c1"), lit("%%"));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

// null_constant LIKE '%'
// not_null_expr [NOT] [I]LIKE '%'
let expr = like(col("c1_non_null"), lit("%"));
assert_eq!(simplify(expr), lit(true));

let expr = not_like(col("c1_non_null"), lit("%"));
assert_eq!(simplify(expr), lit(false));

let expr = ilike(col("c1_non_null"), lit("%"));
assert_eq!(simplify(expr), lit(true));

let expr = not_ilike(col("c1_non_null"), lit("%"));
assert_eq!(simplify(expr), lit(false));

// not_null_expr [NOT] [I]LIKE '%%'
let expr = like(col("c1_non_null"), lit("%%"));
assert_eq!(simplify(expr), lit(true));

let expr = not_like(col("c1_non_null"), lit("%%"));
assert_eq!(simplify(expr), lit(false));

let expr = ilike(col("c1_non_null"), lit("%%"));
assert_eq!(simplify(expr), lit(true));

let expr = not_ilike(col("c1_non_null"), lit("%%"));
assert_eq!(simplify(expr), lit(false));

// null_constant [NOT] [I]LIKE '%'
let expr = like(null.clone(), lit("%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_like(null.clone(), lit("%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = ilike(null.clone(), lit("%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_ilike(null, lit("%"));
assert_eq!(simplify(expr), lit_bool_null());

// null_constant [NOT] [I]LIKE '%%'
let null = lit(ScalarValue::Utf8(None));
let expr = like(null.clone(), lit("%%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_like(null.clone(), lit("%%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = ilike(null.clone(), lit("%%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_ilike(null, lit("%%"));
assert_eq!(simplify(expr), lit_bool_null());

// null_constant [NOT] [I]LIKE 'a%'
let null = lit(ScalarValue::Utf8(None));
let expr = like(null.clone(), "%");
let expr = like(null.clone(), lit("a%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_like(null.clone(), "%");
let expr = not_like(null.clone(), lit("a%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = ilike(null.clone(), "%");
let expr = ilike(null.clone(), lit("a%"));
assert_eq!(simplify(expr), lit_bool_null());

let expr = not_ilike(null, "%");
let expr = not_ilike(null, lit("a%"));
assert_eq!(simplify(expr), lit_bool_null());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

# TODO (https://github.com/apache/datafusion/issues/12637): add a row with '%%' pattern
statement ok
create table test_source as values
('Andrew', 'X', 'datafusion📊🔥', '🔥'),
Expand Down
Loading

0 comments on commit be26107

Please sign in to comment.