Repository: spark Updated Branches: refs/heads/master 1dbe9896b -> 44c8bfda7
[SQL][DOC] updating doc for JSON source to link to jsonlines.org ## What changes were proposed in this pull request? API and programming guide doc changes for Scala, Python and R. ## How was this patch tested? manual test Author: Felix Cheung <[email protected]> Closes #15629 from felixcheung/jsondoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/44c8bfda Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/44c8bfda Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/44c8bfda Branch: refs/heads/master Commit: 44c8bfda793b7655e2bd1da5e9915a09ed9d42ce Parents: 1dbe989 Author: Felix Cheung <[email protected]> Authored: Wed Oct 26 23:06:11 2016 -0700 Committer: Felix Cheung <[email protected]> Committed: Wed Oct 26 23:06:11 2016 -0700 ---------------------------------------------------------------------- R/pkg/R/DataFrame.R | 3 ++- R/pkg/R/SQLContext.R | 3 ++- docs/sparkr.md | 2 +- docs/sql-programming-guide.md | 22 ++++++++++++-------- python/pyspark/sql/readwriter.py | 5 +++-- python/pyspark/sql/streaming.py | 3 ++- .../org/apache/spark/sql/DataFrameReader.scala | 14 +++++++------ .../org/apache/spark/sql/DataFrameWriter.scala | 3 ++- .../spark/sql/streaming/DataStreamReader.scala | 3 ++- 9 files changed, 35 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/R/pkg/R/DataFrame.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index be34e4b..1df8bbf 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -761,7 +761,8 @@ setMethod("toJSON", #' Save the contents of SparkDataFrame as a JSON file #' -#' Save the contents of a SparkDataFrame as a JSON file (one object per line). 
Files written out +#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{ +#' JSON Lines text format or newline-delimited JSON}). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). #' #' @param x A SparkDataFrame http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/R/pkg/R/SQLContext.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 0d6a229..216ca51 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -324,7 +324,8 @@ setMethod("toDF", signature(x = "RDD"), #' Create a SparkDataFrame from a JSON file. #' -#' Loads a JSON file (one object per line), returning the result as a SparkDataFrame +#' Loads a JSON file (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON} +#' ), returning the result as a SparkDataFrame #' It goes through the entire dataset once to determine the schema. #' #' @param path Path of file to read. A vector of multiple paths is allowed. http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/docs/sparkr.md ---------------------------------------------------------------------- diff --git a/docs/sparkr.md b/docs/sparkr.md index c1829ef..f30bd40 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -135,7 +135,7 @@ sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") {% endhighlight %} </div> -We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. +We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. 
For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a consequence, a regular multi-line JSON file will most often fail. <div data-lang="r" markdown="1"> {% highlight r %} http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/docs/sql-programming-guide.md ---------------------------------------------------------------------- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 064af41..b9be7a7 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -316,7 +316,7 @@ Serializable and has getters and setters for all of its fields. Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. Rows are constructed by passing a list of key/value pairs as kwargs to the Row class. The keys of this list define the column names of the table, -and the types are inferred by sampling the whole datase, similar to the inference that is performed on JSON files. +and the types are inferred by sampling the whole dataset, similar to the inference that is performed on JSON files. {% include_example schema_inferring python/sql/basic.py %} </div> @@ -832,8 +832,9 @@ This conversion can be done using `SparkSession.read.json()` on either an RDD of or a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. 
{% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} </div> @@ -844,8 +845,9 @@ This conversion can be done using `SparkSession.read().json()` on either an RDD or a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} </div> @@ -855,8 +857,9 @@ Spark SQL can automatically infer the schema of a JSON dataset and load it as a This conversion can be done using `SparkSession.read.json` on a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset python/sql/datasource.py %} </div> @@ -867,8 +870,9 @@ the `read.json()` function, which loads data from a directory of JSON files wher files is a JSON object. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. 
For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset r/RSparkSQLExample.R %} http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/python/pyspark/sql/readwriter.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 91c2b17..bc786ef 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -160,8 +160,9 @@ class DataFrameReader(OptionUtils): allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None): """ - Loads a JSON file (one object per line) or an RDD of Strings storing JSON objects - (one object per record) and returns the result as a :class`DataFrame`. + Loads a JSON file (`JSON Lines text format or newline-delimited JSON + <http://jsonlines.org/>`_) or an RDD of Strings storing JSON objects (one object per + record) and returns the result as a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/python/pyspark/sql/streaming.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 35fc469..559647b 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -640,7 +640,8 @@ class DataStreamReader(OptionUtils): mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None): """ - Loads a JSON file stream (one object per line) and returns a :class`DataFrame`. 
+ Loads a JSON file stream (`JSON Lines text format or newline-delimited JSON + <http://jsonlines.org/>`_) and returns a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b7b2203..a77937e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -239,7 +239,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) + * and returns the result as a [[DataFrame]]. * See the documentation on the overloaded `json()` method with varargs for more details. * * @since 1.4.0 @@ -250,7 +251,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) + * and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. 
@@ -295,8 +297,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(paths: String*): DataFrame = format("json").load(paths : _*) /** - * Loads a `JavaRDD[String]` storing JSON objects (one object per record) and - * returns the result as a [[DataFrame]]. + * Loads a `JavaRDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format + * or newline-delimited JSON]]) and returns the result as a [[DataFrame]]. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. @@ -307,8 +309,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(jsonRDD: JavaRDD[String]): DataFrame = json(jsonRDD.rdd) /** - * Loads an `RDD[String]` storing JSON objects (one object per record) and - * returns the result as a [[DataFrame]]. + * Loads an `RDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format or + * newline-delimited JSON]]) and returns the result as a [[DataFrame]]. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 5be3277..4b5f024 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -434,7 +434,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in JSON format at the specified path. 
+ * Saves the content of the [[DataFrame]] in JSON format ([[http://jsonlines.org/ JSON Lines text + * format or newline-delimited JSON]]) at the specified path. * This is equivalent to: * {{{ * format("json").save(path) http://git-wip-us.apache.org/repos/asf/spark/blob/44c8bfda/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 87b7306..40b482e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -134,7 +134,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Loads a JSON file stream (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file stream ([[http://jsonlines.org/ JSON Lines text format or newline-delimited + * JSON]]) and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
