This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8b412deaca fix: Optimize `!~ '.*'` case to `col IS NULL AND
Boolean(NULL)` instead of `Eq ""` (#20702)
8b412deaca is described below
commit 8b412deaca33ec79f4982019379f3c8e538ab650
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Mar 12 05:53:34 2026 -0700
fix: Optimize `!~ '.*'` case to `col IS NULL AND Boolean(NULL)` instead of
`Eq ""` (#20702)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- Closes #20701
## Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
## What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
A pre-existing optimization rule for the `!~ '.*'` (regex not-match) case
rewrote the plan to `Eq ""`, which would return rows whose value is the
empty string as part of the result. This is incorrect (`.*` matches every
non-NULL string, including the empty one) and doesn't match the output
without the optimization rule.
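Instead, this PR rewrites the plan to simply `col IS NULL AND
Boolean(NULL)` or, in other words, "NULL if col is NULL else false":
for a non-NULL value the expression is `false AND NULL`, which is false,
and for a NULL value it is `true AND NULL`, which is NULL.
I've confirmed this behavior matches the result of running queries
manually with the optimization rule turned off.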
## Are these changes tested?
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
Fixed the expected output in existing tests and added new tests for NULLs.
## Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
Yes, a minor bug fix. When querying `s !~ '.*'`, empty strings will no
longer be included in the result, which is consistent with the behavior
without the optimization rule.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
.../optimizer/src/simplify_expressions/regex.rs | 13 +++--
.../src/simplify_expressions/simplify_exprs.rs | 55 ++++++++++++++++++----
.../sqllogictest/test_files/simplify_expr.slt | 19 +++++++-
3 files changed, 68 insertions(+), 19 deletions(-)
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs
b/datafusion/optimizer/src/simplify_expressions/regex.rs
index 6c2492d054..b341c328e9 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -16,7 +16,7 @@
// under the License.
use datafusion_common::tree_node::Transformed;
-use datafusion_common::{DataFusionError, Result};
+use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
@@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
/// - combinations (alternatives) of the above, will be concatenated with `OR`
or `AND`
/// - `EQ .*` to NotNull
-/// - `NE .*` means IS EMPTY
+/// - `NE .*` to col IS NULL AND Boolean(NULL) (false for any string, or NULL
if col is NULL)
///
/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case
`test_simplify_regex`.
pub fn simplify_regex_expr(
@@ -68,12 +68,11 @@ pub fn simplify_regex_expr(
// Handle the special case for ".*" pattern
if pattern == ANY_CHAR_REGEX_PATTERN {
let new_expr = if mode.not {
- // not empty
- let empty_lit = Box::new(string_scalar.to_expr(""));
+ let null_bool = lit(ScalarValue::Boolean(None));
Expr::BinaryExpr(BinaryExpr {
- left,
- op: Operator::Eq,
- right: empty_lit,
+ left: Box::new(left.is_null()),
+ op: Operator::And,
+ right: Box::new(null_bool),
})
} else {
// not null
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
index f7f1000150..2114c5ef3d 100644
--- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
@@ -155,6 +155,7 @@ mod tests {
use arrow::datatypes::{DataType, Field, Schema};
use chrono::{DateTime, Utc};
+ use datafusion_common::ScalarValue;
use datafusion_expr::logical_plan::builder::table_scan_with_filters;
use datafusion_expr::logical_plan::table_scan;
use datafusion_expr::*;
@@ -870,7 +871,7 @@ mod tests {
]);
let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
- // Test `= ".*"` transforms to true (except for empty strings)
+ // Test `~ ".*"` transforms to true for any non-NULL string
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))?
.build()?;
@@ -883,22 +884,22 @@ mod tests {
"
)?;
- // Test `!= ".*"` transforms to checking if the column is empty
+ // Test `!~ ".*"` preserves NULL semantics while remaining false for
non-NULL strings
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
.build()?;
assert_optimized_plan_equal!(
plan,
- @ r#"
- Filter: test.a = Utf8("")
+ @ r"
+ Filter: test.a IS NULL AND Boolean(NULL)
TableScan: test
- "#
+ "
)?;
// Test case-insensitive versions
- // Test `=~ ".*"` (case-insensitive) transforms to true (except for
empty strings)
+ // Test `~* ".*"` transforms to true for any non-NULL string
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("b"), Operator::RegexIMatch, lit(".*")))?
.build()?;
@@ -911,17 +912,51 @@ mod tests {
"
)?;
- // Test `!~ ".*"` (case-insensitive) transforms to checking if the
column is empty
+ // Test NULL `!~ ".*"` transforms to Boolean(NULL)
+ let plan = LogicalPlanBuilder::from(table_scan.clone())
+ .filter(binary_expr(
+ lit(ScalarValue::Utf8(None)),
+ Operator::RegexNotMatch,
+ lit(".*"),
+ ))?
+ .build()?;
+
+ assert_optimized_plan_equal!(
+ plan,
+ @ r"
+ Filter: Boolean(NULL)
+ TableScan: test
+ "
+ )?;
+
+ // Test `!~* ".*"` preserves NULL semantics while remaining false for
non-NULL strings
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexNotIMatch,
lit(".*")))?
.build()?;
assert_optimized_plan_equal!(
plan,
- @ r#"
- Filter: test.a = Utf8("")
+ @ r"
+ Filter: test.a IS NULL AND Boolean(NULL)
TableScan: test
- "#
+ "
+ )?;
+
+ // Test NULL `!~* ".*"` transforms to Boolean(NULL)
+ let plan = LogicalPlanBuilder::from(table_scan.clone())
+ .filter(binary_expr(
+ lit(ScalarValue::Utf8(None)),
+ Operator::RegexNotIMatch,
+ lit(".*"),
+ ))?
+ .build()?;
+
+ assert_optimized_plan_equal!(
+ plan,
+ @ r"
+ Filter: Boolean(NULL)
+ TableScan: test
+ "
)
}
diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt
b/datafusion/sqllogictest/test_files/simplify_expr.slt
index f8c219e052..58ec7a1b26 100644
--- a/datafusion/sqllogictest/test_files/simplify_expr.slt
+++ b/datafusion/sqllogictest/test_files/simplify_expr.slt
@@ -44,12 +44,27 @@ query TT
explain select b from t where b !~ '.*'
----
logical_plan
-01)Filter: t.b = Utf8View("")
+01)Filter: t.b IS NULL AND Boolean(NULL)
02)--TableScan: t projection=[b]
physical_plan
-01)FilterExec: b@0 =
+01)FilterExec: b@0 IS NULL AND NULL
02)--DataSourceExec: partitions=1, partition_sizes=[1]
+query TB
+WITH vals(id, col) AS (
+ VALUES
+ (1, 'foo'::text),
+ (2, ''::text),
+ (3, NULL::text)
+)
+SELECT col, col !~ '.*'
+FROM vals
+ORDER BY id
+----
+foo false
+(empty) false
+NULL NULL
+
query T
select b from t where b ~ '.*'
----
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]