This is an automated email from the ASF dual-hosted git repository. zhangzc pushed a commit to branch kylin-on-parquet-v2 in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this push: new d16e7f0 KYLIN-4980 Support prunning segments from complex filter conditions d16e7f0 is described below commit d16e7f053116cf659a3998affba233320d3d1dca Author: zhengshengjun <shengjun_zh...@sina.com> AuthorDate: Tue Apr 20 10:05:56 2021 +0800 KYLIN-4980 Support prunning segments from complex filter conditions --- .../resources/query/sql_prune_segment/query02.sql | 28 +++++++++++++ .../query02.sql.expected/._SUCCESS.crc | Bin 0 -> 8 bytes ...343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc | Bin 0 -> 12 bytes .../query02.sql.expected/_SUCCESS | 0 ...0-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv | 3 ++ .../sql/execution/datasource/FilePruner.scala | 46 +++++++++++++++++++-- 6 files changed, 74 insertions(+), 3 deletions(-) diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql new file mode 100644 index 0000000..7eab250 --- /dev/null +++ b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql @@ -0,0 +1,28 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +select lo_orderdate, lo_quantity, sum(lo_revenue) from ssb.p_lineorder +where +(lo_orderdate = 19920906 and lo_quantity = 4) or +(lo_orderdate = 19920905 and lo_quantity = 9) and +(lo_orderdate = 19920904 and lo_quantity = 7) or +( + lo_orderdate > 19920906 and lo_orderdate <= 19920907 and (lo_quantity = 6 or lo_quantity = 49) +) +group by lo_orderdate, lo_quantity +;{"scanRowCount":9,"scanBytes":0,"scanFiles":2,"cuboidId":[7],"exactlyMatched":[false]} \ No newline at end of file diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc differ diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc new file mode 100644 index 0000000..71996cc Binary files /dev/null and b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc differ diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/_SUCCESS b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv new file mode 100644 index 0000000..9770131 --- /dev/null +++ 
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv @@ -0,0 +1,3 @@ +19920907,6,1192201 +19920907,49,8718385 +19920906,4,396435 diff --git a/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala b/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala index b6008c2..91faab4 100644 --- a/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala +++ b/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala @@ -237,7 +237,7 @@ class FilePruner(cubeInstance: CubeInstance, require(isResolved) val startTime = System.nanoTime - val timePartitionFilters = getSpecFilter(dataFilters, timePartitionColumn) + val timePartitionFilters = getSegmentFilter(dataFilters, timePartitionColumn) logInfo(s"Applying time partition filters: ${timePartitionFilters.mkString(",")}") val fsc = ShardFileStatusCache.getFileStatusCache(session) @@ -295,8 +295,48 @@ class FilePruner(cubeInstance: CubeInstance, } } - private def getSpecFilter(dataFilters: Seq[Expression], col: Attribute): Seq[Expression] = { - dataFilters.filter(_.references.subsetOf(AttributeSet(col))) + private def getSegmentFilter(dataFilters: Seq[Expression], col: Attribute): Seq[Expression] = { + dataFilters.map(extractSegmentFilter(_, col)).filter(!_.equals(None)).map(_.get) + } + + private def extractSegmentFilter(filter: Expression, col: Attribute): Option[Expression] = { + filter match { + case expressions.Or(left, right) => + val leftChild = extractSegmentFilter(left, col) + val rightChild = extractSegmentFilter(right, col) + + //if there exists leaf-node that doesn't contain partition column, the parent filter is + //unnecessary for segment pruning. + //e.g. 
"where a = xxx or partition = xxx", we can't filter any segment + if (leftChild.eq(None) || rightChild.eq(None)) { + None + } else { + Some(expressions.Or(leftChild.get, rightChild.get)) + } + case expressions.And(left, right) => + val leftChild = extractSegmentFilter(left, col) + val rightChild = extractSegmentFilter(right, col) + + //if there is only one leaf-node that contains partition column + //e.g. "where a = xxx and partition = xxx", + //then we can filter segment using "where partition = xxx" + if (!leftChild.eq(None) && !rightChild.eq(None)) { + Some(expressions.And(leftChild.get, rightChild.get)) + } else if (!rightChild.eq(None)) { + rightChild + } else if (!leftChild.eq(None)) { + leftChild + } else { + None + } + case _ => + //other unary filter like EqualTo, GreaterThan, GreaterThanOrEqual, etc. + if (filter.references.subsetOf(AttributeSet(col))) { + Some(filter) + } else { + None + } + } } private def pruneSegments(filters: Seq[Expression],