git commit: [SPARK-2919] [SQL] Basic support for analyze command in HiveQl

marmbrus Fri, 08 Aug 2014 11:24:33 -0700

Repository: spark
Updated Branches:
  refs/heads/master c874723fa -> 45d8f4dea



[SPARK-2919] [SQL] Basic support for analyze command in HiveQl

The command we will support is
```
ANALYZE TABLE tablename COMPUTE STATISTICS noscan
```
Other cases shown in 
https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables
 will still be treated as Hive native commands.

JIRA: https://issues.apache.org/jira/browse/SPARK-2919

Author: Yin Huai <[email protected]>

Closes #1848 from yhuai/sqlAnalyze and squashes the following commits:

0b79d36 [Yin Huai] Typo and format.
c59d94b [Yin Huai] Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan".


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/45d8f4de
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/45d8f4de
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/45d8f4de

Branch: refs/heads/master
Commit: 45d8f4deab50ae069ecde2201bd486d464a4501e
Parents: c874723
Author: Yin Huai <[email protected]>
Authored: Fri Aug 8 11:23:58 2014 -0700
Committer: Michael Armbrust <[email protected]>
Committed: Fri Aug 8 11:23:58 2014 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/hive/HiveQl.scala      | 21 +++++-
 .../apache/spark/sql/hive/HiveStrategies.scala  |  2 +
 .../spark/sql/hive/execution/DropTable.scala    | 48 -------------
 .../spark/sql/hive/execution/commands.scala     | 74 ++++++++++++++++++++
 .../apache/spark/sql/hive/StatisticsSuite.scala | 45 +++++++++++-
 5 files changed, 137 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/45d8f4de/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index bc2fefa..05b2f5f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -46,6 +46,8 @@ private[hive] case class AddFile(filePath: String) extends 
Command
 
 private[hive] case class DropTable(tableName: String, ifExists: Boolean) 
extends Command
 
+private[hive] case class AnalyzeTable(tableName: String) extends Command
+
 /** Provides a mapping from HiveQL statements to catalyst logical plans and 
expression trees. */
 private[hive] object HiveQl {
   protected val nativeCommands = Seq(
@@ -74,7 +76,6 @@ private[hive] object HiveQl {
     "TOK_CREATEFUNCTION",
     "TOK_DROPFUNCTION",
 
-    "TOK_ANALYZE",
     "TOK_ALTERDATABASE_PROPERTIES",
     "TOK_ALTERINDEX_PROPERTIES",
     "TOK_ALTERINDEX_REBUILD",
@@ -92,7 +93,6 @@ private[hive] object HiveQl {
     "TOK_ALTERTABLE_SKEWED",
     "TOK_ALTERTABLE_TOUCH",
     "TOK_ALTERTABLE_UNARCHIVE",
-    "TOK_ANALYZE",
     "TOK_CREATEDATABASE",
     "TOK_CREATEFUNCTION",
     "TOK_CREATEINDEX",
@@ -239,7 +239,6 @@ private[hive] object HiveQl {
         ShellCommand(sql.drop(1))
       } else {
         val tree = getAst(sql)
-
         if (nativeCommands contains tree.getText) {
           NativeCommand(sql)
         } else {
@@ -387,6 +386,22 @@ private[hive] object HiveQl {
            ifExists) =>
       val tableName = tableNameParts.map { case Token(p, Nil) => p 
}.mkString(".")
       DropTable(tableName, ifExists.nonEmpty)
+    // Support "ANALYZE TABLE tableNmae COMPUTE STATISTICS noscan"
+    case Token("TOK_ANALYZE",
+            Token("TOK_TAB", Token("TOK_TABNAME", tableNameParts) :: 
partitionSpec) ::
+            isNoscan) =>
+      // Reference:
+      // 
https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables
+      if (partitionSpec.nonEmpty) {
+        // Analyze partitions will be treated as a Hive native command.
+        NativePlaceholder
+      } else if (isNoscan.isEmpty) {
+        // If users do not specify "noscan", it will be treated as a Hive 
native command.
+        NativePlaceholder
+      } else {
+        val tableName = tableNameParts.map { case Token(p, Nil) => p 
}.mkString(".")
+        AnalyzeTable(tableName)
+      }
     // Just fake explain for any of the native commands.
     case Token("TOK_EXPLAIN", explainArgs)
       if noExplainCommands.contains(explainArgs.head.getText) =>

http://git-wip-us.apache.org/repos/asf/spark/blob/45d8f4de/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 2175c5f..85d2496 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -83,6 +83,8 @@ private[hive] trait HiveStrategies {
 
       case DropTable(tableName, ifExists) => execution.DropTable(tableName, 
ifExists) :: Nil
 
+      case AnalyzeTable(tableName) => execution.AnalyzeTable(tableName) :: Nil
+
       case describe: logical.DescribeCommand =>
         val resolvedTable = context.executePlan(describe.table).analyzed
         resolvedTable match {

http://git-wip-us.apache.org/repos/asf/spark/blob/45d8f4de/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala
deleted file mode 100644
index 9cd0c86..0000000
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.execution
-
-import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.Row
-import org.apache.spark.sql.execution.{Command, LeafNode}
-import org.apache.spark.sql.hive.HiveContext
-
-/**
- * :: DeveloperApi ::
- * Drops a table from the metastore and removes it if it is cached.
- */
-@DeveloperApi
-case class DropTable(tableName: String, ifExists: Boolean) extends LeafNode 
with Command {
-
-  def hiveContext = sqlContext.asInstanceOf[HiveContext]
-
-  def output = Seq.empty
-
-  override protected[sql] lazy val sideEffectResult: Seq[Any] = {
-    val ifExistsClause = if (ifExists) "IF EXISTS " else ""
-    hiveContext.runSqlHive(s"DROP TABLE $ifExistsClause$tableName")
-    hiveContext.catalog.unregisterTable(None, tableName)
-    Seq.empty
-  }
-
-  override def execute(): RDD[Row] = {
-    sideEffectResult
-    sparkContext.emptyRDD[Row]
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/45d8f4de/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
new file mode 100644
index 0000000..2985169
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions.Row
+import org.apache.spark.sql.execution.{Command, LeafNode}
+import org.apache.spark.sql.hive.HiveContext
+
+/**
+ * :: DeveloperApi ::
+ * Analyzes the given table in the current database to generate statistics, 
which will be
+ * used in query optimizations.
+ *
+ * Right now, it only supports Hive tables and it only updates the size of a 
Hive table
+ * in the Hive metastore.
+ */
+@DeveloperApi
+case class AnalyzeTable(tableName: String) extends LeafNode with Command {
+
+  def hiveContext = sqlContext.asInstanceOf[HiveContext]
+
+  def output = Seq.empty
+
+  override protected[sql] lazy val sideEffectResult = {
+    hiveContext.analyze(tableName)
+    Seq.empty[Any]
+  }
+
+  override def execute(): RDD[Row] = {
+    sideEffectResult
+    sparkContext.emptyRDD[Row]
+  }
+}
+
+/**
+ * :: DeveloperApi ::
+ * Drops a table from the metastore and removes it if it is cached.
+ */
+@DeveloperApi
+case class DropTable(tableName: String, ifExists: Boolean) extends LeafNode 
with Command {
+
+  def hiveContext = sqlContext.asInstanceOf[HiveContext]
+
+  def output = Seq.empty
+
+  override protected[sql] lazy val sideEffectResult: Seq[Any] = {
+    val ifExistsClause = if (ifExists) "IF EXISTS " else ""
+    hiveContext.runSqlHive(s"DROP TABLE $ifExistsClause$tableName")
+    hiveContext.catalog.unregisterTable(None, tableName)
+    Seq.empty
+  }
+
+  override def execute(): RDD[Row] = {
+    sideEffectResult
+    sparkContext.emptyRDD[Row]
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/45d8f4de/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index bf5931b..7c82964 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -19,13 +19,54 @@ package org.apache.spark.sql.hive
 
 import scala.reflect.ClassTag
 
+
 import org.apache.spark.sql.{SQLConf, QueryTest}
+import org.apache.spark.sql.catalyst.plans.logical.NativeCommand
 import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin}
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 
 class StatisticsSuite extends QueryTest {
 
+  test("parse analyze commands") {
+    def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) {
+      val parsed = HiveQl.parseSql(analyzeCommand)
+      val operators = parsed.collect {
+        case a: AnalyzeTable => a
+        case o => o
+      }
+
+      assert(operators.size === 1)
+      if (operators(0).getClass() != c) {
+        fail(
+          s"""$analyzeCommand expected command: $c, but got ${operators(0)}
+             |parsed command:
+             |$parsed
+           """.stripMargin)
+      }
+    }
+
+    assertAnalyzeCommand(
+      "ANALYZE TABLE Table1 COMPUTE STATISTICS",
+      classOf[NativeCommand])
+    assertAnalyzeCommand(
+      "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE 
STATISTICS",
+      classOf[NativeCommand])
+    assertAnalyzeCommand(
+      "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE 
STATISTICS noscan",
+      classOf[NativeCommand])
+    assertAnalyzeCommand(
+      "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS",
+      classOf[NativeCommand])
+    assertAnalyzeCommand(
+      "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS noscan",
+      classOf[NativeCommand])
+
+    assertAnalyzeCommand(
+      "ANALYZE TABLE Table1 COMPUTE STATISTICS nOscAn",
+      classOf[AnalyzeTable])
+  }
+
   test("analyze MetastoreRelations") {
     def queryTotalSize(tableName: String): BigInt =
       catalog.lookupRelation(None, tableName).statistics.sizeInBytes
@@ -37,7 +78,7 @@ class StatisticsSuite extends QueryTest {
 
     assert(queryTotalSize("analyzeTable") === defaultSizeInBytes)
 
-    analyze("analyzeTable")
+    sql("ANALYZE TABLE analyzeTable COMPUTE STATISTICS noscan")
 
     assert(queryTotalSize("analyzeTable") === BigInt(11624))
 
@@ -66,7 +107,7 @@ class StatisticsSuite extends QueryTest {
 
     assert(queryTotalSize("analyzeTable_part") === defaultSizeInBytes)
 
-    analyze("analyzeTable_part")
+    sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan")
 
     assert(queryTotalSize("analyzeTable_part") === BigInt(17436))
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

git commit: [SPARK-2919] [SQL] Basic support for analyze command in HiveQl

Reply via email to