Repository: spark Updated Branches: refs/heads/master 4b8806741 -> c875d81a3
[SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV ## What changes were proposed in this pull request? Default QuoteEscapingEnabled flag to true when writing CSV and add an escapeQuotes option to be able to change this. See https://github.com/uniVocity/univocity-parsers/blob/f3eb2af26374940e60d91d1703bde54619f50c51/src/main/java/com/univocity/parsers/csv/CsvWriterSettings.java#L231-L247 This change is needed to be able to write RFC 4180 compatible CSV files (https://tools.ietf.org/html/rfc4180#section-2) https://issues.apache.org/jira/browse/SPARK-15493 ## How was this patch tested? Added a test that verifies the output is quoted correctly. Author: Jurriaan Pruis <[email protected]> Closes #13267 from jurriaan/quote-escaping. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c875d81a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c875d81a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c875d81a Branch: refs/heads/master Commit: c875d81a3de3f209b9eb03adf96b7c740b2c7b52 Parents: 4b88067 Author: Jurriaan Pruis <[email protected]> Authored: Wed May 25 12:40:16 2016 -0700 Committer: Reynold Xin <[email protected]> Committed: Wed May 25 12:40:16 2016 -0700 ---------------------------------------------------------------------- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- python/pyspark/sql/readwriter.py | 7 ++- sql/core/pom.xml | 2 +- .../org/apache/spark/sql/DataFrameWriter.scala | 3 ++ .../execution/datasources/csv/CSVOptions.scala | 2 + .../execution/datasources/csv/CSVParser.scala | 1 + .../execution/datasources/csv/CSVSuite.scala | 51 ++++++++++++++++++++ 11 files changed, 69 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.2 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index a9068da..0ac1c00 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -159,7 +159,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.0.jar +univocity-parsers-2.1.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.3 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 7e60a31..fa35fa7 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -167,7 +167,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.0.jar +univocity-parsers-2.1.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.4 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 70d33b4..99dffa9 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -167,7 +167,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.0.jar +univocity-parsers-2.1.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.6 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index a80f6bc..a3bee36 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -175,7 +175,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.0.jar +univocity-parsers-2.1.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/dev/deps/spark-deps-hadoop-2.7 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index c0b53f7..dbd7a8e 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -176,7 +176,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.1.0.jar +univocity-parsers-2.1.1.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/python/pyspark/sql/readwriter.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 6f788cf..73d2b81 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -769,7 +769,7 @@ class DataFrameWriter(object): @since(2.0) def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None, - header=None, nullValue=None): + header=None, nullValue=None, escapeQuotes=None): """Saves the content of the [[DataFrame]] in CSV format at the specified path. :param path: the path in any Hadoop supported file system @@ -790,6 +790,9 @@ class DataFrameWriter(object): value, ``"``. :param escape: sets the single character used for escaping quotes inside an already quoted value. If None is set, it uses the default value, ``\`` + :param escapeQuotes: A flag indicating whether values containing quotes should always + be enclosed in quotes. If None is set, it uses the default value + ``true``, escaping all values containing a quote character. :param header: writes the names of columns as the first line. If None is set, it uses the default value, ``false``. :param nullValue: sets the string representation of a null value. If None is set, it uses @@ -810,6 +813,8 @@ class DataFrameWriter(object): self.option("header", header) if nullValue is not None: self.option("nullValue", nullValue) + if escapeQuotes is not None: + self.option("escapeQuotes", nullValue) self._jwrite.csv(path) @since(1.5) http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/sql/core/pom.xml ---------------------------------------------------------------------- diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 2ea980b..b833b93 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -39,7 +39,7 @@ <dependency> <groupId>com.univocity</groupId> <artifactId>univocity-parsers</artifactId> - <version>2.1.0</version> + <version>2.1.1</version> <type>jar</type> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 6f5fb69..3aacce7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -684,6 +684,9 @@ final class DataFrameWriter private[sql](df: DataFrame) { * the separator can be part of the value.</li> * <li>`escape` (default `\`): sets the single character used for escaping quotes inside * an already quoted value.</li> + * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing + * quotes should always be enclosed in quotes. Default is to escape all values containing + * a quote character.</li> * <li>`header` (default `false`): writes the names of columns as the first line.</li> * <li>`nullValue` (default empty string): sets the string representation of a null value.</li> * <li>`compression` (default `null`): compression codec to use when saving to file. This can be http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index e4fd094..9f4ce83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -111,6 +111,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000) + val escapeQuotes = getBool("escapeQuotes", true) + val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala index 111995d..b06f123 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala @@ -75,6 +75,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten writerSettings.setSkipEmptyLines(true) writerSettings.setQuoteAllFields(false) writerSettings.setHeaders(headers: _*) + writerSettings.setQuoteEscapingEnabled(params.escapeQuotes) private var buffer = new ByteArrayOutputStream() private var writer = new CsvWriter( http://git-wip-us.apache.org/repos/asf/spark/blob/c875d81a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index bae2907..ad7c05c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -366,6 +366,57 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } + test("save csv with quote escaping enabled") { + withTempDir { dir => + val csvDir = new File(dir, "csv").getCanonicalPath + + val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well")) + val df = spark.createDataFrame(data) + + // escapeQuotes should be true by default + df.coalesce(1).write + .format("csv") + .option("quote", "\"") + .option("escape", "\"") + .save(csvDir) + + val results = spark.read + .format("text") + .load(csvDir) + .collect() + + val expected = "\"test \"\"quote\"\"\",123,\"it \"\"works\"\"!\",\"\"\"very\"\" well\"" + + assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected))) + } + } + + test("save csv with quote escaping disabled") { + withTempDir { dir => + val csvDir = new File(dir, "csv").getCanonicalPath + + val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well")) + val df = spark.createDataFrame(data) + + // escapeQuotes should be true by default + df.coalesce(1).write + .format("csv") + .option("quote", "\"") + .option("escapeQuotes", "false") + .option("escape", "\"") + .save(csvDir) + + val results = spark.read + .format("text") + .load(csvDir) + .collect() + + val expected = "test \"quote\",123,it \"works\"!,\"\"\"very\"\" well\"" + + assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected))) + } + } + test("commented lines in CSV data") { val results = spark.read .format("csv") --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
