spark git commit: [SPARK-15655][SQL] Fix Wrong Partition Column Order when Fetching Partitioned Tables

wenchen Tue, 14 Jun 2016 09:59:07 -0700

Repository: spark
Updated Branches:
  refs/heads/master 6151d2641 -> bc02d0112



[SPARK-15655][SQL] Fix Wrong Partition Column Order when Fetching Partitioned 
Tables

#### What changes were proposed in this pull request?
When fetching the partitioned table, the output contains wrong results. The 
order of partition key values do not match the order of partition key columns 
in output schema. For example,

```SQL
CREATE TABLE table_with_partition(c1 string) PARTITIONED BY (p1 string,p2 
string,p3 string,p4 string,p5 string)

INSERT OVERWRITE TABLE table_with_partition PARTITION 
(p1='a',p2='b',p3='c',p4='d',p5='e') SELECT 'blarr'

SELECT p1, p2, p3, p4, p5, c1 FROM table_with_partition
```
```
+---+---+---+---+---+-----+
| p1| p2| p3| p4| p5|   c1|
+---+---+---+---+---+-----+
|  d|  e|  c|  b|  a|blarr|
+---+---+---+---+---+-----+
```

The expected result should be
```
+---+---+---+---+---+-----+
| p1| p2| p3| p4| p5|   c1|
+---+---+---+---+---+-----+
|  a|  b|  c|  d|  e|blarr|
+---+---+---+---+---+-----+
```
This PR is to fix this by enforcing the order matches the table partition 
definition.

#### How was this patch tested?
Added a test case into `SQLQuerySuite`

Author: gatorsmile <[email protected]>

Closes #13400 from gatorsmile/partitionedTableFetch.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc02d011
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc02d011
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc02d011

Branch: refs/heads/master
Commit: bc02d011294fcd1ab07b9baf1011c3f2bdf749d9
Parents: 6151d26
Author: gatorsmile <[email protected]>
Authored: Tue Jun 14 09:58:06 2016 -0700
Committer: Wenchen Fan <[email protected]>
Committed: Tue Jun 14 09:58:06 2016 -0700

----------------------------------------------------------------------
 .../spark/sql/hive/MetastoreRelation.scala      |  2 +-
 .../sql/hive/execution/SQLQuerySuite.scala      | 32 ++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bc02d011/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
index 9c82014..5596a44 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
@@ -160,7 +160,7 @@ private[hive] case class MetastoreRelation(
       val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
       tPartition.setDbName(databaseName)
       tPartition.setTableName(tableName)
-      tPartition.setValues(p.spec.values.toList.asJava)
+      tPartition.setValues(partitionKeys.map(a => p.spec(a.name)).asJava)
 
       val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
       tPartition.setSd(sd)

http://git-wip-us.apache.org/repos/asf/spark/blob/bc02d011/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 1a0eaa6..9c1f218 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -1610,6 +1610,38 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils 
with TestHiveSingleton {
     assert(fs.exists(path), "This is an external table, so the data should not 
have been dropped")
   }
 
+  test("select partitioned table") {
+    val table = "table_with_partition"
+    withTable(table) {
+      sql(
+        s"""
+           |CREATE TABLE $table(c1 string)
+           |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string)
+         """.stripMargin)
+      sql(
+        s"""
+           |INSERT OVERWRITE TABLE $table
+           |PARTITION (p1='a',p2='b',p3='c',p4='d',p5='e')
+           |SELECT 'blarr'
+         """.stripMargin)
+
+      // project list is the same order of paritioning columns in table 
definition
+      checkAnswer(
+        sql(s"SELECT p1, p2, p3, p4, p5, c1 FROM $table"),
+        Row("a", "b", "c", "d", "e", "blarr") :: Nil)
+
+      // project list does not have the same order of paritioning columns in 
table definition
+      checkAnswer(
+        sql(s"SELECT p2, p3, p4, p1, p5, c1 FROM $table"),
+        Row("b", "c", "d", "a", "e", "blarr") :: Nil)
+
+      // project list contains partial partition columns in table definition
+      checkAnswer(
+        sql(s"SELECT p2, p1, p5, c1 FROM $table"),
+        Row("b", "a", "e", "blarr") :: Nil)
+    }
+  }
+
   test("SPARK-14981: DESC not supported for sorting columns") {
     withTable("t") {
       val cause = intercept[ParseException] {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-15655][SQL] Fix Wrong Partition Column Order when Fetching Partitioned Tables

Reply via email to