spark git commit: [SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter

rxin Fri, 08 Jul 2016 11:46:29 -0700

Repository: spark
Updated Branches:
  refs/heads/master 255d74fe4 -> 38cf8f2a5



[SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter

## What changes were proposed in this pull request?

Adds a `quoteAll` option for writing CSV, which quotes all fields.
See https://issues.apache.org/jira/browse/SPARK-13638

## How was this patch tested?

Added a test to verify that every field of the DataFrame is quoted in the
written CSV output.

Author: Jurriaan Pruis <[email protected]>

Closes #13374 from jurriaan/csv-quote-all.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/38cf8f2a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/38cf8f2a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/38cf8f2a

Branch: refs/heads/master
Commit: 38cf8f2a50068f80350740ac28e31c8accd20634
Parents: 255d74f
Author: Jurriaan Pruis <[email protected]>
Authored: Fri Jul 8 11:45:41 2016 -0700
Committer: Reynold Xin <[email protected]>
Committed: Fri Jul 8 11:45:41 2016 -0700

----------------------------------------------------------------------
 python/pyspark/sql/readwriter.py                |  7 ++++--
 .../org/apache/spark/sql/DataFrameWriter.scala  |  2 ++
 .../execution/datasources/csv/CSVOptions.scala  |  2 ++
 .../execution/datasources/csv/CSVParser.scala   |  2 +-
 .../execution/datasources/csv/CSVSuite.scala    | 26 ++++++++++++++++++++
 5 files changed, 36 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/38cf8f2a/python/pyspark/sql/readwriter.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 78d992e..f7c354f 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -633,7 +633,7 @@ class DataFrameWriter(OptionUtils):
 
     @since(2.0)
     def csv(self, path, mode=None, compression=None, sep=None, quote=None, 
escape=None,
-            header=None, nullValue=None, escapeQuotes=None):
+            header=None, nullValue=None, escapeQuotes=None, quoteAll=None):
         """Saves the content of the :class:`DataFrame` in CSV format at the 
specified path.
 
         :param path: the path in any Hadoop supported file system
@@ -658,6 +658,9 @@ class DataFrameWriter(OptionUtils):
         :param escapeQuotes: A flag indicating whether values containing 
quotes should always
                              be enclosed in quotes. If None is set, it uses 
the default value
                              ``true``, escaping all values containing a quote 
character.
+        :param quoteAll: A flag indicating whether all values should always be 
enclosed in
+                          quotes. If None is set, it uses the default value 
``false``,
+                          only escaping values containing a quote character.
         :param header: writes the names of columns as the first line. If None 
is set, it uses
                        the default value, ``false``.
         :param nullValue: sets the string representation of a null value. If 
None is set, it uses
@@ -667,7 +670,7 @@ class DataFrameWriter(OptionUtils):
         """
         self.mode(mode)
         self._set_opts(compression=compression, sep=sep, quote=quote, 
escape=escape, header=header,
-                       nullValue=nullValue, escapeQuotes=escapeQuotes)
+                       nullValue=nullValue, escapeQuotes=escapeQuotes, 
quoteAll=quoteAll)
         self._jwrite.csv(path)
 
     @since(1.5)

http://git-wip-us.apache.org/repos/asf/spark/blob/38cf8f2a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index f77af76..12b3046 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -537,6 +537,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) 
{
    * <li>`escapeQuotes` (default `true`): a flag indicating whether values 
containing
    * quotes should always be enclosed in quotes. Default is to escape all 
values containing
    * a quote character.</li>
+   * <li>`quoteAll` (default `false`): A flag indicating whether all values 
should always be
+   * enclosed in quotes. Default is to only escape values containing a quote 
character.</li>
    * <li>`header` (default `false`): writes the names of columns as the first 
line.</li>
    * <li>`nullValue` (default empty string): sets the string representation of 
a null value.</li>
    * <li>`compression` (default `null`): compression codec to use when saving 
to file. This can be

http://git-wip-us.apache.org/repos/asf/spark/blob/38cf8f2a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 581eda7..22fb816 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -115,6 +115,8 @@ private[sql] class CSVOptions(@transient private val 
parameters: Map[String, Str
 
   val maxMalformedLogPerPartition = getInt("maxMalformedLogPerPartition", 10)
 
+  val quoteAll = getBool("quoteAll", false)
+
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'

http://git-wip-us.apache.org/repos/asf/spark/blob/38cf8f2a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index bf62732..13ae76d 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -78,7 +78,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: 
Seq[String]) exten
   writerSettings.setNullValue(params.nullValue)
   writerSettings.setEmptyValue(params.nullValue)
   writerSettings.setSkipEmptyLines(true)
-  writerSettings.setQuoteAllFields(false)
+  writerSettings.setQuoteAllFields(params.quoteAll)
   writerSettings.setHeaders(headers: _*)
   writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/38cf8f2a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index f170065..311f1fa 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -366,6 +366,32 @@ class CSVSuite extends QueryTest with SharedSQLContext 
with SQLTestUtils {
     }
   }
 
+  test("save csv with quoteAll enabled") {
+    withTempDir { dir =>
+      val csvDir = new File(dir, "csv").getCanonicalPath
+
+      val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+      val df = spark.createDataFrame(data)
+
+      // escapeQuotes should be true by default
+      df.coalesce(1).write
+        .format("csv")
+        .option("quote", "\"")
+        .option("escape", "\"")
+        .option("quoteAll", "true")
+        .save(csvDir)
+
+      val results = spark.read
+        .format("text")
+        .load(csvDir)
+        .collect()
+
+      val expected = "\"test \"\"quote\"\"\",\"123\",\"it 
\"\"works\"\"!\",\"\"\"very\"\" well\""
+
+      assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+    }
+  }
+
   test("save csv with quote escaping enabled") {
     withTempDir { dir =>
       val csvDir = new File(dir, "csv").getCanonicalPath


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter

Reply via email to