spark git commit: [SPARK-6040][SQL] Fix the percent bug in tablesample

marmbrus Mon, 02 Mar 2015 13:16:53 -0800

Repository: spark
Updated Branches:
  refs/heads/master 3f9def811 -> 582e5a24c



[SPARK-6040][SQL] Fix the percent bug in tablesample

A HiveQL expression like `select count(1) from src tablesample(1 percent);` 
should select from a 1% sample of the table. However, in the current version 
of Spark it is treated as a 100% sample.

Author: q00251598 <[email protected]>

Closes #4789 from watermen/SPARK-6040 and squashes the following commits:

2453ebe [q00251598] check and adjust the fraction.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/582e5a24
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/582e5a24
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/582e5a24

Branch: refs/heads/master
Commit: 582e5a24c55e8c876733537c9910001affc8b29b
Parents: 3f9def8
Author: q00251598 <[email protected]>
Authored: Mon Mar 2 13:16:29 2015 -0800
Committer: Michael Armbrust <[email protected]>
Committed: Mon Mar 2 13:16:29 2015 -0800

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/sql/hive/HiveQl.scala    | 11 ++++++++++-
 .../apache/spark/sql/hive/execution/HiveQuerySuite.scala |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/582e5a24/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 98263f6..ced99cd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.ExplainCommand
 import org.apache.spark.sql.sources.DescribeCommand
 import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, 
AnalyzeTable, HiveScriptIOSchema}
 import org.apache.spark.sql.types._
+import org.apache.spark.util.random.RandomSampler
 
 /* Implicit conversions */
 import scala.collection.JavaConversions._
@@ -850,7 +851,15 @@ 
https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
         case Token("TOK_TABLESPLITSAMPLE",
                Token("TOK_PERCENT", Nil) ::
                Token(fraction, Nil) :: Nil) =>
-          Sample(fraction.toDouble, withReplacement = false, (math.random * 
1000).toInt, relation)
+          // The range of fraction accepted by Sample is [0, 1]. Because 
Hive's block sampling
+          // function takes X PERCENT as the input and the range of X is [0, 
100], we need to
+          // adjust the fraction.
+          require(
+            fraction.toDouble >= (0.0 - RandomSampler.roundingEpsilon)
+              && fraction.toDouble <= (100.0 + RandomSampler.roundingEpsilon),
+            s"Sampling fraction ($fraction) must be on interval [0, 100]")
+          Sample(fraction.toDouble / 100, withReplacement = false, 
(math.random * 1000).toInt,
+            relation)
         case Token("TOK_TABLEBUCKETSAMPLE",
                Token(numerator, Nil) ::
                Token(denominator, Nil) :: Nil) =>

http://git-wip-us.apache.org/repos/asf/spark/blob/582e5a24/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index bb0a67d..c0d21bc 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -467,6 +467,7 @@ class HiveQuerySuite extends HiveComparisonTest with 
BeforeAndAfter {
 
   test("sampling") {
     sql("SELECT * FROM src TABLESAMPLE(0.1 PERCENT) s")
+    sql("SELECT * FROM src TABLESAMPLE(100 PERCENT) s")
   }
 
   test("DataFrame toString") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-6040][SQL] Fix the percent bug in tablesample

Reply via email to