spark git commit: [SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING

wenchen Thu, 26 Apr 2018 04:08:27 -0700

Repository: spark
Updated Branches:
  refs/heads/master d1eb8d3dd -> ce2f919f8



[SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong 
stats for STRING

## What changes were proposed in this pull request?
`colStat.min` AND `colStat.max` are empty for string type. Thus, 
`evaluateInSet` should not return zero when either `colStat.min` or 
`colStat.max`.

## How was this patch tested?
Added a test case.

Author: gatorsmile <gatorsm...@gmail.com>

Closes #21147 from gatorsmile/cached.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce2f919f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce2f919f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce2f919f

Branch: refs/heads/master
Commit: ce2f919f8df1b794ceaa23e1a59d5d541ed47bf5
Parents: d1eb8d3
Author: gatorsmile <gatorsm...@gmail.com>
Authored: Thu Apr 26 19:07:13 2018 +0800
Committer: Wenchen Fan <wenc...@databricks.com>
Committed: Thu Apr 26 19:07:13 2018 +0800

----------------------------------------------------------------------
 .../logical/statsEstimation/FilterEstimation.scala      | 12 ++++++++----
 .../statsEstimation/FilterEstimationSuite.scala         | 12 ++++++++++++
 2 files changed, 20 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ce2f919f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index 263c9ba..5a3eeef 100755
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -392,13 +392,13 @@ case class FilterEstimation(plan: Filter) extends Logging 
{
     val dataType = attr.dataType
     var newNdv = ndv
 
-    if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty)  {
-      return Some(0.0)
-    }
-
     // use [min, max] to filter the original hSet
     dataType match {
       case _: NumericType | BooleanType | DateType | TimestampType =>
+        if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty)  {
+          return Some(0.0)
+        }
+
         val statsInterval =
           ValueInterval(colStat.min, colStat.max, 
dataType).asInstanceOf[NumericValueInterval]
         val validQuerySet = hSet.filter { v =>
@@ -422,6 +422,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
 
       // We assume the whole set since there is no min/max information for 
String/Binary type
       case StringType | BinaryType =>
+        if (ndv.toDouble == 0)  {
+          return Some(0.0)
+        }
+
         newNdv = ndv.min(BigInt(hSet.size))
         if (update) {
           val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount 
= Some(0))

http://git-wip-us.apache.org/repos/asf/spark/blob/ce2f919f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
index 16cb5d0..47bfa62 100755
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
@@ -368,6 +368,18 @@ class FilterEstimationSuite extends 
StatsEstimationTestBase {
       expectedRowCount = 0)
   }
 
+  test("evaluateInSet with string") {
+    validateEstimatedStats(
+      Filter(InSet(attrString, Set("A0")),
+        StatsTestPlan(Seq(attrString), 10,
+          AttributeMap(Seq(attrString ->
+            ColumnStat(distinctCount = Some(10), min = None, max = None,
+              nullCount = Some(0), avgLen = Some(2), maxLen = Some(2)))))),
+      Seq(attrString -> ColumnStat(distinctCount = Some(1), min = None, max = 
None,
+        nullCount = Some(0), avgLen = Some(2), maxLen = Some(2))),
+      expectedRowCount = 1)
+  }
+
   test("cint NOT IN (3, 4, 5)") {
     validateEstimatedStats(
       Filter(Not(InSet(attrInt, Set(3, 4, 5))), 
childStatsTestPlan(Seq(attrInt), 10L)),


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING

Reply via email to