This is an automated email from the ASF dual-hosted git repository.

zhangzc pushed a commit to branch kylin-on-parquet-v2
in repository https://gitbox.apache.org/repos/asf/kylin.git


The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this 
push:
     new d16e7f0  KYLIN-4980 Support prunning segments from complex filter 
conditions
d16e7f0 is described below

commit d16e7f053116cf659a3998affba233320d3d1dca
Author: zhengshengjun <shengjun_zh...@sina.com>
AuthorDate: Tue Apr 20 10:05:56 2021 +0800

    KYLIN-4980 Support prunning segments from complex filter conditions
---
 .../resources/query/sql_prune_segment/query02.sql  |  28 +++++++++++++
 .../query02.sql.expected/._SUCCESS.crc             | Bin 0 -> 8 bytes
 ...343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc | Bin 0 -> 12 bytes
 .../query02.sql.expected/_SUCCESS                  |   0
 ...0-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv |   3 ++
 .../sql/execution/datasource/FilePruner.scala      |  46 +++++++++++++++++++--
 6 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql
new file mode 100644
index 0000000..7eab250
--- /dev/null
+++ b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql
@@ -0,0 +1,28 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+select  lo_orderdate, lo_quantity, sum(lo_revenue) from ssb.p_lineorder
+where
+(lo_orderdate = 19920906 and lo_quantity = 4) or
+(lo_orderdate = 19920905 and lo_quantity = 9) and
+(lo_orderdate = 19920904 and lo_quantity = 7) or
+(
+    lo_orderdate > 19920906 and lo_orderdate <= 19920907 and (lo_quantity = 6 or lo_quantity = 49)
+)
+group by lo_orderdate, lo_quantity
+;{"scanRowCount":9,"scanBytes":0,"scanFiles":2,"cuboidId":[7],"exactlyMatched":[false]}
\ No newline at end of file
diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc differ
diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc
new file mode 100644
index 0000000..71996cc
Binary files /dev/null and b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc differ
diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/_SUCCESS b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv
new file mode 100644
index 0000000..9770131
--- /dev/null
+++ b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv
@@ -0,0 +1,3 @@
+19920907,6,1192201
+19920907,49,8718385
+19920906,4,396435
diff --git a/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala b/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
index b6008c2..91faab4 100644
--- a/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
+++ b/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
@@ -237,7 +237,7 @@ class FilePruner(cubeInstance: CubeInstance,
 
     require(isResolved)
     val startTime = System.nanoTime
-    val timePartitionFilters = getSpecFilter(dataFilters, timePartitionColumn)
+    val timePartitionFilters = getSegmentFilter(dataFilters, timePartitionColumn)
     logInfo(s"Applying time partition filters: ${timePartitionFilters.mkString(",")}")
 
     val fsc = ShardFileStatusCache.getFileStatusCache(session)
@@ -295,8 +295,48 @@ class FilePruner(cubeInstance: CubeInstance,
     }
   }
 
-  private def getSpecFilter(dataFilters: Seq[Expression], col: Attribute): Seq[Expression] = {
-    dataFilters.filter(_.references.subsetOf(AttributeSet(col)))
+  private def getSegmentFilter(dataFilters: Seq[Expression], col: Attribute): Seq[Expression] = {
+    dataFilters.map(extractSegmentFilter(_, col)).filter(!_.equals(None)).map(_.get)
+  }
+
+  private def extractSegmentFilter(filter: Expression, col: Attribute): Option[Expression] = {
+    filter match {
+      case expressions.Or(left, right) =>
+        val leftChild = extractSegmentFilter(left, col)
+        val rightChild = extractSegmentFilter(right, col)
+
+        //if either side of the OR yields no filter on the partition column,
+        //the whole OR filter is useless for segment pruning.
+        //e.g. "where a = xxx or partition = xxx", we can't filter any segment
+        if (leftChild.eq(None) || rightChild.eq(None)) {
+          None
+        } else {
+          Some(expressions.Or(leftChild.get, rightChild.get))
+        }
+      case expressions.And(left, right) =>
+        val leftChild = extractSegmentFilter(left, col)
+        val rightChild = extractSegmentFilter(right, col)
+
+        //if there is only one leaf-node that contains partition column
+        //e.g. "where a = xxx and partition = xxx",
+        //then we can filter segment using "where partition = xxx"
+        if (!leftChild.eq(None) && !rightChild.eq(None)) {
+          Some(expressions.And(leftChild.get, rightChild.get))
+        } else if (!rightChild.eq(None)) {
+          rightChild
+        } else if (!leftChild.eq(None)) {
+          leftChild
+        } else {
+          None
+        }
+      case _ =>
+        //other non-And/Or filters like EqualTo, GreaterThan, GreaterThanOrEqual, etc.
+        if (filter.references.subsetOf(AttributeSet(col))) {
+          Some(filter)
+        } else {
+          None
+        }
+    }
   }
 
   private def pruneSegments(filters: Seq[Expression],

Reply via email to