This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5217f7b [SPARK-26248][SQL] Infer date type from CSV
5217f7b is described below
commit 5217f7b2263c7aaeadf60ef602776bb3777269cd
Author: Maxim Gekk <[email protected]>
AuthorDate: Mon Dec 17 08:24:51 2018 +0800
[SPARK-26248][SQL] Infer date type from CSV
## What changes were proposed in this pull request?
The `CSVInferSchema` class is extended to support inference of `DateType`
from CSV input. The attempt to infer `DateType` is performed after inferring
`TimestampType`.
## How was this patch tested?
Added a new test for inferring date types from CSV. It was also tested by
existing suites like `CSVInferSchemaSuite`, `CsvExpressionsSuite`,
`CsvFunctionsSuite` and `CsvSuite`.
Closes #23202 from MaxGekk/csv-date-inferring.
Lead-authored-by: Maxim Gekk <[email protected]>
Co-authored-by: Maxim Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/csv/CSVInferSchema.scala | 20 ++++++++++++++++----
.../spark/sql/catalyst/csv/CSVInferSchemaSuite.scala | 18 ++++++++++++++++++
2 files changed, 34 insertions(+), 4 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
index 35ade13..11f3740 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
@@ -22,16 +22,20 @@ import scala.util.control.Exception.allCatch
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.analysis.TypeCoercion
import org.apache.spark.sql.catalyst.expressions.ExprUtils
-import org.apache.spark.sql.catalyst.util.TimestampFormatter
+import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
import org.apache.spark.sql.types._
class CSVInferSchema(val options: CSVOptions) extends Serializable {
@transient
- private lazy val timestampParser = TimestampFormatter(
+ private lazy val timestampFormatter = TimestampFormatter(
options.timestampFormat,
options.timeZone,
options.locale)
+ @transient
+ private lazy val dateFormatter = DateFormatter(
+ options.dateFormat,
+ options.locale)
private val decimalParser = {
ExprUtils.getDecimalParser(options.locale)
@@ -104,6 +108,7 @@ class CSVInferSchema(val options: CSVOptions) extends
Serializable {
compatibleType(typeSoFar,
tryParseDecimal(field)).getOrElse(StringType)
case DoubleType => tryParseDouble(field)
case TimestampType => tryParseTimestamp(field)
+ case DateType => tryParseDate(field)
case BooleanType => tryParseBoolean(field)
case StringType => StringType
case other: DataType =>
@@ -159,10 +164,17 @@ class CSVInferSchema(val options: CSVOptions) extends
Serializable {
}
private def tryParseTimestamp(field: String): DataType = {
- // This case infers a custom `dataFormat` is set.
- if ((allCatch opt timestampParser.parse(field)).isDefined) {
+ if ((allCatch opt timestampFormatter.parse(field)).isDefined) {
TimestampType
} else {
+ tryParseDate(field)
+ }
+ }
+
+ private def tryParseDate(field: String): DataType = {
+ if ((allCatch opt dateFormatter.parse(field)).isDefined) {
+ DateType
+ } else {
tryParseBoolean(field)
}
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
index c2b525a..84b2e61 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
@@ -187,4 +187,22 @@ class CSVInferSchemaSuite extends SparkFunSuite with
SQLHelper {
Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_,
DecimalType(7, 0)))
}
+
+ test("inferring date type") {
+ var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd"), false,
"GMT")
+ var inferSchema = new CSVInferSchema(options)
+ assert(inferSchema.inferField(NullType, "2018/12/02") == DateType)
+
+ options = new CSVOptions(Map("dateFormat" -> "MMM yyyy"), false, "GMT")
+ inferSchema = new CSVInferSchema(options)
+ assert(inferSchema.inferField(NullType, "Dec 2018") == DateType)
+
+ options = new CSVOptions(
+ Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" ->
"yyyy-MM-dd'T'HH:mm:ss"),
+ columnPruning = false,
+ defaultTimeZoneId = "GMT")
+ inferSchema = new CSVInferSchema(options)
+ assert(inferSchema.inferField(NullType, "2018-12-03T11:00:00") ==
TimestampType)
+ assert(inferSchema.inferField(NullType, "2018-12-03") == DateType)
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]