Repository: spark
Updated Branches:
  refs/heads/branch-1.5 829c33a4b -> afaed7ef4


[SPARK-10073] [SQL] Python withColumn should replace the old column

DataFrame.withColumn in Python should be consistent with the Scala one
(replacing the existing column that has the same name).

cc marmbrus

Author: Davies Liu <[email protected]>

Closes #8300 from davies/with_column.

(cherry picked from commit 08887369c890e0dd87eb8b34e8c32bb03307bf24)
Signed-off-by: Michael Armbrust <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/afaed7ef
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/afaed7ef
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/afaed7ef

Branch: refs/heads/branch-1.5
Commit: afaed7ef49751e2002b84da25abe08fb8987372c
Parents: 829c33a
Author: Davies Liu <[email protected]>
Authored: Wed Aug 19 13:56:40 2015 -0700
Committer: Michael Armbrust <[email protected]>
Committed: Wed Aug 19 13:56:54 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/dataframe.py                         | 12 ++++++------
 python/pyspark/sql/tests.py                             |  4 ++++
 .../src/main/scala/org/apache/spark/sql/DataFrame.scala |  3 ++-
 3 files changed, 12 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/afaed7ef/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index da742d7..025811f 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1202,7 +1202,9 @@ class DataFrame(object):
     @ignore_unicode_prefix
     @since(1.3)
     def withColumn(self, colName, col):
-        """Returns a new :class:`DataFrame` by adding a column.
+        """
+        Returns a new :class:`DataFrame` by adding a column or replacing the
+        existing column that has the same name.
 
         :param colName: string, name of the new column.
         :param col: a :class:`Column` expression for the new column.
@@ -1210,7 +1212,8 @@ class DataFrame(object):
         >>> df.withColumn('age2', df.age + 2).collect()
         [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)]
         """
-        return self.select('*', col.alias(colName))
+        assert isinstance(col, Column), "col should be Column"
+        return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)
 
     @ignore_unicode_prefix
     @since(1.3)
@@ -1223,10 +1226,7 @@ class DataFrame(object):
         >>> df.withColumnRenamed('age', 'age2').collect()
         [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')]
         """
-        cols = [Column(_to_java_column(c)).alias(new)
-                if c == existing else c
-                for c in self.columns]
-        return self.select(*cols)
+        return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx)
 
     @since(1.4)
     @ignore_unicode_prefix

http://git-wip-us.apache.org/repos/asf/spark/blob/afaed7ef/python/pyspark/sql/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 13cf647..aacfb34 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1035,6 +1035,10 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values",
                                 lambda: df.select(sha2(df.a, 1024)).collect())
 
+    def test_with_column_with_existing_name(self):
+        keys = self.df.withColumn("key", self.df.key).select("key").collect()
+        self.assertEqual([r.key for r in keys], list(range(100)))
+
 
 class HiveContextSQLTests(ReusedPySparkTestCase):
 

http://git-wip-us.apache.org/repos/asf/spark/blob/afaed7ef/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index ec5084a..5bed299 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1133,7 +1133,8 @@ class DataFrame private[sql](
   /////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Returns a new [[DataFrame]] by adding a column.
+   * Returns a new [[DataFrame]] by adding a column or replacing the existing column that has
+   * the same name.
    * @group dfops
    * @since 1.3.0
    */


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to