This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 782e769f6c Add BETWEEN support to NumericalFilterOptimizer (#14163)
782e769f6c is described below

commit 782e769f6c683e181903d5b6e1fc469b12193fc0
Author: Yash Mayya <yash.ma...@gmail.com>
AuthorDate: Fri Oct 11 11:12:48 2024 +0530

    Add BETWEEN support to NumericalFilterOptimizer (#14163)
---
 .../optimizer/filter/NumericalFilterOptimizer.java | 277 ++++++++++++++++-----
 .../filter/NumericalFilterOptimizerTest.java       |  68 +++++
 2 files changed, 282 insertions(+), 63 deletions(-)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
index 7d2336a1a0..961dfa3b71 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
@@ -24,7 +24,6 @@ import javax.annotation.Nullable;
 import org.apache.pinot.common.request.Expression;
 import org.apache.pinot.common.request.ExpressionType;
 import org.apache.pinot.common.request.Function;
-import org.apache.pinot.common.request.Literal;
 import org.apache.pinot.spi.data.FieldSpec;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.data.Schema;
@@ -32,34 +31,44 @@ import org.apache.pinot.sql.FilterKind;
 
 
 /**
- * Numerical expressions of form "column <operator> literal", where operator 
can be '=', '!=', '>', '>=', '<', or '<=',
- * can compare a column of one datatype (say INT) with a literal of different 
datatype (say DOUBLE). These expressions
- * can not be evaluated on the Server. Hence, we rewrite such expressions into 
an equivalent expression whose LHS and
- * RHS are of the same datatype.
- *
+ * Numerical expressions of the form "column [operator] literal", where 
operator can be '=', '!=', '>', '>=', '<', '<=',
+ * or 'BETWEEN' can compare a column of one datatype (say INT) with a literal 
of different datatype (say DOUBLE). These
+ * expressions can not be evaluated on the Server. Hence, we rewrite such 
expressions into an equivalent expression
+ * whose LHS and RHS are of the same datatype.
+ * <p>
  * Simple predicate examples:
- *  1) WHERE "intColumn = 5.0"  gets rewritten to "WHERE intColumn = 5"
- *  2) WHERE "intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
- *  3) WHERE "intColumn = 5.5"  gets rewritten to "WHERE false" because INT 
values can not match 5.5.
- *  4) WHERE "intColumn = 3000000000" gets rewritten to "WHERE false" because 
INT values can not match 3000000000.
- *  5) WHERE "intColumn != 3000000000" gets rewritten to "WHERE true" because 
INT values always not equal to 3000000000.
- *  6) WHERE "intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
- *  7) WHERE "intColumn > -3E9" gets rewritten to "WHERE true" because int 
values are always greater than -3E9.
- *
+ * <ol>
+ *  <li> "WHERE intColumn = 5.0"  gets rewritten to "WHERE intColumn = 5"
+ *  <li> "WHERE intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
+ *  <li> "WHERE intColumn = 5.5"  gets rewritten to "WHERE false" because INT 
values can not match 5.5.
+ *  <li> "WHERE intColumn = 3000000000" gets rewritten to "WHERE false" 
because INT values can not match 3000000000.
+ *  <li> "WHERE intColumn != 3000000000" gets rewritten to "WHERE true" 
because INT values always not equal to
+ *  3000000000.
+ *  <li> "WHERE intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
+ *  <li> "WHERE intColumn > -3E9" gets rewritten to "WHERE true" because int 
values are always greater than -3E9.
+ *  <li> "WHERE intColumn BETWEEN 2.5 AND 7.5" gets rewritten to "WHERE 
intColumn BETWEEN 3 AND 7"
+ *  <li> "WHERE intColumn BETWEEN 5.5 AND 3000000000" gets rewritten to "WHERE 
intColumn BETWEEN 6 AND 2147483647" since
+ *  3000000000 is greater than Integer.MAX_VALUE.
+ *  <li> "WHERE intColumn BETWEEN 10 AND 0" gets rewritten to "WHERE false" 
because lower bound is greater than upper
+ *  bound.
+ * </ol>
+ * <p>
  * Compound predicate examples:
- *  8) WHERE "intColumn1 = 5.5 AND intColumn2 = intColumn3"
+ * <ol>
+ *  <li> "WHERE intColumn1 = 5.5 AND intColumn2 = intColumn3"
  *       rewrite to "WHERE false AND intColumn2 = intColumn3"
  *       rewrite to "WHERE intColumn2 = intColumn3"
- *  9) WHERE "intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out 
of bounds for integer column)
+ *  <li> "WHERE intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is 
out of bounds for integer column)
  *       rewrite to "WHERE true OR false"
  *       rewrite to "WHERE true"
  *       rewrite to query without any WHERE clause.
- *
+ * </ol>
+ * <p>
  * When entire predicate gets rewritten to false (Example 3 above), the query 
will not return any data. Hence, it is
  * better for the Broker itself to return an empty response rather than 
sending the query to servers for further
  * evaluation.
- *
- * TODO: Add support for BETWEEN, IN, and NOT IN operators.
+ * <p>
+ * TODO: Add support for IN, and NOT IN operators.
  */
 public class NumericalFilterOptimizer extends BaseAndOrBooleanFilterOptimizer {
 
@@ -74,33 +83,39 @@ public class NumericalFilterOptimizer extends 
BaseAndOrBooleanFilterOptimizer {
   Expression optimizeChild(Expression filterExpression, @Nullable Schema 
schema) {
     Function function = filterExpression.getFunctionCall();
     FilterKind kind = FilterKind.valueOf(function.getOperator());
+
+    if (!kind.isRange() && kind != FilterKind.EQUALS && kind != 
FilterKind.NOT_EQUALS) {
+      return filterExpression;
+    }
+
+    List<Expression> operands = function.getOperands();
+    // Verify that LHS is a numeric column and RHS is a literal before 
rewriting.
+    Expression lhs = operands.get(0);
+    Expression rhs = operands.get(1);
+
+    DataType dataType = getDataType(lhs, schema);
+    if (dataType == null || !dataType.isNumeric() || !rhs.isSetLiteral()) {
+      // No rewrite here
+      return filterExpression;
+    }
+
     switch (kind) {
-      case IS_NULL:
-      case IS_NOT_NULL:
-        // No need to try to optimize IS_NULL and IS_NOT_NULL operations on 
numerical columns.
-        break;
-      default:
-        List<Expression> operands = function.getOperands();
-        // Verify that LHS is a numeric column and RHS is a numeric literal 
before rewriting.
-        Expression lhs = operands.get(0);
-        Expression rhs = operands.get(1);
-        if (isNumericLiteral(rhs)) {
-          DataType dataType = getDataType(lhs, schema);
-          if (dataType != null && dataType.isNumeric()) {
-            switch (kind) {
-              case EQUALS:
-              case NOT_EQUALS:
-                return rewriteEqualsExpression(filterExpression, kind, 
dataType, rhs);
-              case GREATER_THAN:
-              case GREATER_THAN_OR_EQUAL:
-              case LESS_THAN:
-              case LESS_THAN_OR_EQUAL:
-                return rewriteRangeExpression(filterExpression, kind, 
dataType, rhs);
-              default:
-                break;
-            }
-          }
+      case BETWEEN: {
+        return rewriteBetweenExpression(filterExpression, dataType);
+      }
+      case EQUALS:
+      case NOT_EQUALS:
+      case GREATER_THAN:
+      case GREATER_THAN_OR_EQUAL:
+      case LESS_THAN:
+      case LESS_THAN_OR_EQUAL: {
+        if (kind.isRange()) {
+          return rewriteRangeExpression(filterExpression, kind, dataType, rhs);
+        } else {
+          return rewriteEqualsExpression(filterExpression, kind, dataType, 
rhs);
         }
+      }
+      default:
         break;
     }
     return filterExpression;
@@ -346,6 +361,159 @@ public class NumericalFilterOptimizer extends 
BaseAndOrBooleanFilterOptimizer {
     return range;
   }
 
+  /**
+   * Rewrite expressions of the form "column BETWEEN lower AND upper" to 
ensure that lower and upper bounds are the same
+   * datatype as the column (or can be cast to the same datatype in the 
server).
+   */
+  private static Expression rewriteBetweenExpression(Expression between, 
DataType dataType) {
+    // TODO: Consider unifying logic with rewriteRangeExpression
+    List<Expression> operands = between.getFunctionCall().getOperands();
+    Expression lower = operands.get(1);
+    Expression upper = operands.get(2);
+
+    // The BETWEEN filter predicate currently only supports literals as lower 
and upper bounds, but we're still checking
+    // here just in case.
+    if (lower.isSetLiteral()) {
+      switch (lower.getLiteral().getSetField()) {
+        case LONG_VALUE: {
+          long actual = lower.getLiteral().getLongValue();
+          // Other data types can be converted on the server side.
+          if (dataType == DataType.INT) {
+            if (actual > Integer.MAX_VALUE) {
+              // Lower bound literal value is greater than the bounds of INT.
+              return getExpressionFromBoolean(false);
+            }
+            if (actual < Integer.MIN_VALUE) {
+              lower.getLiteral().setIntValue(Integer.MIN_VALUE);
+            }
+          }
+          break;
+        }
+        case DOUBLE_VALUE: {
+          double actual = lower.getLiteral().getDoubleValue();
+
+          switch (dataType) {
+            case INT: {
+              if (actual > Integer.MAX_VALUE) {
+                // Lower bound literal value is greater than the bounds of INT.
+                return getExpressionFromBoolean(false);
+              }
+              if (actual < Integer.MIN_VALUE) {
+                lower.getLiteral().setIntValue(Integer.MIN_VALUE);
+              } else {
+                // Double value is in int range
+                int converted = (int) actual;
+                int comparison = 
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+                if (comparison >= 0) {
+                  lower.getLiteral().setIntValue(converted);
+                } else {
+                  lower.getLiteral().setIntValue(converted + 1);
+                }
+              }
+              break;
+            }
+            case LONG: {
+              if (actual > Long.MAX_VALUE) {
+                // Lower bound literal value is greater than the bounds of 
LONG.
+                return getExpressionFromBoolean(false);
+              }
+              if (actual < Long.MIN_VALUE) {
+                lower.getLiteral().setLongValue(Long.MIN_VALUE);
+              } else {
+                // Double value is in long range
+                long converted = (long) actual;
+                int comparison = 
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+                if (comparison >= 0) {
+                  lower.getLiteral().setLongValue(converted);
+                } else {
+                  lower.getLiteral().setLongValue(converted + 1);
+                }
+              }
+              break;
+            }
+            default:
+              // For other numeric data types, the double literal can be 
converted on the server side.
+              break;
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+
+    if (upper.isSetLiteral()) {
+      switch (upper.getLiteral().getSetField()) {
+        case LONG_VALUE: {
+          long actual = upper.getLiteral().getLongValue();
+          // Other data types can be converted on the server side.
+          if (dataType == DataType.INT) {
+            if (actual < Integer.MIN_VALUE) {
+              // Upper bound literal value is lesser than the bounds of INT.
+              return getExpressionFromBoolean(false);
+            }
+            if (actual > Integer.MAX_VALUE) {
+              upper.getLiteral().setIntValue(Integer.MAX_VALUE);
+            }
+          }
+          break;
+        }
+        case DOUBLE_VALUE: {
+          double actual = upper.getLiteral().getDoubleValue();
+
+          switch (dataType) {
+            case INT: {
+              if (actual < Integer.MIN_VALUE) {
+                // Upper bound literal value is lesser than the bounds of INT.
+                return getExpressionFromBoolean(false);
+              }
+              if (actual > Integer.MAX_VALUE) {
+                upper.getLiteral().setIntValue(Integer.MAX_VALUE);
+              } else {
+                // Double value is in int range
+                int converted = (int) actual;
+                int comparison = 
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+                if (comparison <= 0) {
+                  upper.getLiteral().setIntValue(converted);
+                } else {
+                  upper.getLiteral().setIntValue(converted - 1);
+                }
+              }
+              break;
+            }
+            case LONG: {
+              if (actual < Long.MIN_VALUE) {
+                // Upper bound literal value is lesser than the bounds of LONG.
+                return getExpressionFromBoolean(false);
+              }
+              if (actual > Long.MAX_VALUE) {
+                upper.getLiteral().setLongValue(Long.MAX_VALUE);
+              } else {
+                // Double value is in long range
+                long converted = (long) actual;
+                int comparison = 
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+                if (comparison <= 0) {
+                  upper.getLiteral().setLongValue(converted);
+                } else {
+                  upper.getLiteral().setLongValue(converted - 1);
+                }
+              }
+              break;
+            }
+            default:
+              // For other numeric data types, the double literal can be 
converted on the server side.
+              break;
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+
+    return between;
+  }
+
   /**
    * Helper function to rewrite range operator of a range expression.
    * @param range Range expression.
@@ -356,9 +524,9 @@ public class NumericalFilterOptimizer extends 
BaseAndOrBooleanFilterOptimizer {
     if (comparison > 0) {
       // Literal value is greater than the converted value, so rewrite:
       //   "column >  literal" to "column >  converted"
-      //   "column >= literal" to "column >= converted"
+      //   "column >= literal" to "column >  converted"
       //   "column <  literal" to "column <= converted"
-      //   "column <= literal" to "column <  converted"
+      //   "column <= literal" to "column <= converted"
       if (kind == FilterKind.GREATER_THAN || kind == 
FilterKind.GREATER_THAN_OR_EQUAL) {
         range.getFunctionCall().setOperator(FilterKind.GREATER_THAN.name());
       } else if (kind == FilterKind.LESS_THAN || kind == 
FilterKind.LESS_THAN_OR_EQUAL) {
@@ -413,21 +581,4 @@ public class NumericalFilterOptimizer extends 
BaseAndOrBooleanFilterOptimizer {
     }
     return null;
   }
-
-  /** @return true if expression is a numeric literal; otherwise, false. */
-  private static boolean isNumericLiteral(Expression expression) {
-    if (expression.getType() == ExpressionType.LITERAL) {
-      Literal._Fields type = expression.getLiteral().getSetField();
-      switch (type) {
-        case INT_VALUE:
-        case LONG_VALUE:
-        case FLOAT_VALUE:
-        case DOUBLE_VALUE:
-          return true;
-        default:
-          break;
-      }
-    }
-    return false;
-  }
 }
diff --git 
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
 
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
index e947740e24..363ef8887d 100644
--- 
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
+++ 
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
@@ -295,6 +295,74 @@ public class NumericalFilterOptimizerTest {
             + ".223372036854776E18>)]))");
   }
 
+  @Test
+  public void testBetweenRewrites() {
+    // Test LONG literal with INT column.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN 3000000000 AND 4000000000"),
+        "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN -4000000000 AND -3000000000"),
+        "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+
+    // No rewrite
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN -2000000000 AND 2000000000"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:intColumn)), 
Expression(type:LITERAL, literal:<Literal "
+            + "intValue:-2000000000>), Expression(type:LITERAL, 
literal:<Literal intValue:2000000000>)]))");
+
+    // Test INT column with DOUBLE lower bound greater than Integer.MAX_VALUE.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN 3000000000.0 AND 4000000000.0"),
+        "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+    // Test INT column with DOUBLE upper bound lesser than Integer.MIN_VALUE.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN -4000000000.0 AND -3000000000.0"),
+        "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+    // Test INT column with LONG lower bound lesser than Integer.MIN_VALUE.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN -4000000000 AND 0"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:intColumn)), 
Expression(type:LITERAL, literal:<Literal "
+            + "intValue:-2147483648>), Expression(type:LITERAL, 
literal:<Literal intValue:0>)]))");
+    // Test INT column with LONG upper bound greater than Integer.MAX_VALUE.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN 0 AND 4000000000"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:intColumn)), 
Expression(type:LITERAL, literal:<Literal "
+            + "intValue:0>), Expression(type:LITERAL, literal:<Literal 
intValue:2147483647>)]))");
+
+    // Test LONG column with DOUBLE lower bound greater than Long.MAX_VALUE.
+    Assert.assertEquals(
+        rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN 
9323372036854775808.0 AND 9323372036854775809.0"),
+        "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+    // Test LONG column with DOUBLE upper bound lesser than Long.MIN_VALUE.
+    Assert.assertEquals(
+        rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN 
-9323372036854775809.0 AND -9323372036854775808.0"),
+        "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+    // Test LONG column with DOUBLE lower bound lesser than Long.MIN_VALUE.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn 
BETWEEN -9323372036854775809.0 AND 0"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:longColumn)), 
Expression(type:LITERAL, literal:<Literal "
+            + "longValue:-9223372036854775808>), Expression(type:LITERAL, 
literal:<Literal intValue:0>)]))");
+    // Test LONG column with DOUBLE upper bound greater than Long.MAX_VALUE.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn 
BETWEEN 0 AND 9323372036854775808.0"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:longColumn)), 
Expression(type:LITERAL, literal:<Literal intValue:0>), "
+            + "Expression(type:LITERAL, literal:<Literal 
longValue:9223372036854775807>)]))");
+
+    // Test DOUBLE literal rewrite for INT and LONG columns.
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN 2.5 AND 7.5"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:intColumn)), 
Expression(type:LITERAL, literal:<Literal intValue:3>), "
+            + "Expression(type:LITERAL, literal:<Literal intValue:7>)]))");
+
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn 
BETWEEN 2.1 AND 7.9"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:longColumn)), 
Expression(type:LITERAL, literal:<Literal longValue:3>), "
+            + "Expression(type:LITERAL, literal:<Literal longValue:7>)]))");
+
+    Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn 
BETWEEN -5.0 AND -3.5"),
+        "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, 
operands:[Expression(type:IDENTIFIER, "
+            + "identifier:Identifier(name:intColumn)), 
Expression(type:LITERAL, literal:<Literal intValue:-5>), "
+            + "Expression(type:LITERAL, literal:<Literal intValue:-4>)]))");
+  }
+
   @Test
   public void testNull() {
     // Test column IS NOT NULL.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to