This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5217f7b  [SPARK-26248][SQL] Infer date type from CSV
5217f7b is described below

commit 5217f7b2263c7aaeadf60ef602776bb3777269cd
Author: Maxim Gekk <[email protected]>
AuthorDate: Mon Dec 17 08:24:51 2018 +0800

    [SPARK-26248][SQL] Infer date type from CSV
    
    ## What changes were proposed in this pull request?
    
    The `CSVInferSchema` class is extended to support inferring `DateType` 
from CSV input. Inferring `DateType` is attempted only after the attempt to 
infer `TimestampType` has failed.
    
    ## How was this patch tested?
    
    Added a new test for inferring date types from CSV. It was also tested by 
existing suites like `CSVInferSchemaSuite`, `CsvExpressionsSuite`, 
`CsvFunctionsSuite` and `CsvSuite`.
    
    Closes #23202 from MaxGekk/csv-date-inferring.
    
    Lead-authored-by: Maxim Gekk <[email protected]>
    Co-authored-by: Maxim Gekk <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/csv/CSVInferSchema.scala      | 20 ++++++++++++++++----
 .../spark/sql/catalyst/csv/CSVInferSchemaSuite.scala | 18 ++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
index 35ade13..11f3740 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
@@ -22,16 +22,20 @@ import scala.util.control.Exception.allCatch
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.analysis.TypeCoercion
 import org.apache.spark.sql.catalyst.expressions.ExprUtils
-import org.apache.spark.sql.catalyst.util.TimestampFormatter
+import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
 import org.apache.spark.sql.types._
 
 class CSVInferSchema(val options: CSVOptions) extends Serializable {
 
   @transient
-  private lazy val timestampParser = TimestampFormatter(
+  private lazy val timestampFormatter = TimestampFormatter(
     options.timestampFormat,
     options.timeZone,
     options.locale)
+  @transient
+  private lazy val dateFormatter = DateFormatter(
+    options.dateFormat,
+    options.locale)
 
   private val decimalParser = {
     ExprUtils.getDecimalParser(options.locale)
@@ -104,6 +108,7 @@ class CSVInferSchema(val options: CSVOptions) extends 
Serializable {
           compatibleType(typeSoFar, 
tryParseDecimal(field)).getOrElse(StringType)
         case DoubleType => tryParseDouble(field)
         case TimestampType => tryParseTimestamp(field)
+        case DateType => tryParseDate(field)
         case BooleanType => tryParseBoolean(field)
         case StringType => StringType
         case other: DataType =>
@@ -159,10 +164,17 @@ class CSVInferSchema(val options: CSVOptions) extends 
Serializable {
   }
 
   private def tryParseTimestamp(field: String): DataType = {
-    // This case infers a custom `dataFormat` is set.
-    if ((allCatch opt timestampParser.parse(field)).isDefined) {
+    if ((allCatch opt timestampFormatter.parse(field)).isDefined) {
       TimestampType
     } else {
+      tryParseDate(field)
+    }
+  }
+
+  private def tryParseDate(field: String): DataType = {
+    if ((allCatch opt dateFormatter.parse(field)).isDefined) {
+      DateType
+    } else {
       tryParseBoolean(field)
     }
   }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
index c2b525a..84b2e61 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
@@ -187,4 +187,22 @@ class CSVInferSchemaSuite extends SparkFunSuite with 
SQLHelper {
 
     Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, 
DecimalType(7, 0)))
   }
+
+  test("inferring date type") {
+    var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd"), false, 
"GMT")
+    var inferSchema = new CSVInferSchema(options)
+    assert(inferSchema.inferField(NullType, "2018/12/02") == DateType)
+
+    options = new CSVOptions(Map("dateFormat" -> "MMM yyyy"), false, "GMT")
+    inferSchema = new CSVInferSchema(options)
+    assert(inferSchema.inferField(NullType, "Dec 2018") == DateType)
+
+    options = new CSVOptions(
+      Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> 
"yyyy-MM-dd'T'HH:mm:ss"),
+      columnPruning = false,
+      defaultTimeZoneId = "GMT")
+    inferSchema = new CSVInferSchema(options)
+    assert(inferSchema.inferField(NullType, "2018-12-03T11:00:00") == 
TimestampType)
+    assert(inferSchema.inferField(NullType, "2018-12-03") == DateType)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to