This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 5229d1c  [SPARK-31211][SQL] Fix rebasing of 29 February of Julian leap 
years
5229d1c is described below

commit 5229d1c66c8817d8fc37b9bc992bf1e411e302b4
Author: Maxim Gekk <[email protected]>
AuthorDate: Mon Mar 23 14:21:24 2020 +0800

    [SPARK-31211][SQL] Fix rebasing of 29 February of Julian leap years
    
    In the PR, I propose to fix the issue of rebasing leap years in Julian 
calendar to Proleptic Gregorian calendar in which the years are not leap years. 
In the Julian calendar, every four years is a leap year, with a leap day added 
to the month of February. In Proleptic Gregorian calendar, every year that is 
exactly divisible by four is a leap year, except for years that are exactly 
divisible by 100, but these centurial years are leap years, if they are exactly 
divisible by 400. In this [...]
    
    I modified the `rebaseJulianToGregorianMicros()` and 
`rebaseJulianToGregorianDays()` in `DateTimeUtils` by passing 1 as a day number 
of month while forming `LocalDate` or `LocalDateTime`, and adding the number of 
days using the `plusDays()` method. For example, **1000-02-29** doesn't exist 
in Proleptic Gregorian calendar, and `LocalDate.of(1000, 2, 29)` throws an 
exception. To avoid the issue, I build the `LocalDate.of(1000, 2, 1)` date and 
add 28 days. The `plusDays(28)` method produ [...]
    
    Before the changes, the `java.time.DateTimeException` exception is raised 
while loading the date `1000-02-29` from parquet files saved by Spark 2.4.5:
    ```scala
    scala> spark.conf.set("spark.sql.legacy.parquet.rebaseDateTime.enabled", 
true)
    scala> 
spark.read.parquet("/Users/maxim/tmp/before_1582/2_4_5_date_leap").show
    20/03/21 03:03:59 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 3)
    java.time.DateTimeException: Invalid date 'February 29' as '1000' is not a 
leap year
    ```
    The parquet files were saved via the commands:
    ```shell
    $ export TZ="America/Los_Angeles"
    ```
    ```scala
    scala> scala> spark.conf.set("spark.sql.session.timeZone", 
"America/Los_Angeles")
    scala> val df = 
Seq(java.sql.Date.valueOf("1000-02-29")).toDF("dateS").select($"dateS".as("date"))
    df: org.apache.spark.sql.DataFrame = [date: date]
    scala> 
df.write.mode("overwrite").parquet("/Users/maxim/tmp/before_1582/2_4_5_date_leap")
    scala> 
spark.read.parquet("/Users/maxim/tmp/before_1582/2_4_5_date_leap").show
    +----------+
    |      date|
    +----------+
    |1000-02-29|
    +----------+
    ```
    
    Yes, after the fix:
    ```scala
    scala> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    scala> spark.conf.set("spark.sql.legacy.parquet.rebaseDateTime.enabled", 
true)
    scala> 
spark.read.parquet("/Users/maxim/tmp/before_1582/2_4_5_date_leap").show
    +----------+
    |      date|
    +----------+
    |1000-03-01|
    +----------+
    ```
    
    Added tests to `DateTimeUtilsSuite`.
    
    Closes #27974 from MaxGekk/julian-date-29-feb.
    
    Authored-by: Maxim Gekk <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit db6247faa8780bca8f8d3ba71b568ea63b162973)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/util/DateTimeUtils.scala    | 14 +++-
 .../sql/catalyst/util/DateTimeUtilsSuite.scala     | 82 +++++++++++++++++-----
 2 files changed, 77 insertions(+), 19 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 7f5babf..ba1c509 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -983,11 +983,16 @@ object DateTimeUtils {
     val localDateTime = LocalDateTime.of(
       cal.get(Calendar.YEAR),
       cal.get(Calendar.MONTH) + 1,
-      cal.get(Calendar.DAY_OF_MONTH),
+      // The number of days will be added later to handle non-existing
+      // Julian dates in Proleptic Gregorian calendar.
+      // For example, 1000-02-29 exists in Julian calendar because 1000
+      // is a leap year but it is not a leap year in Gregorian calendar.
+      1,
       cal.get(Calendar.HOUR_OF_DAY),
       cal.get(Calendar.MINUTE),
       cal.get(Calendar.SECOND),
       (Math.floorMod(micros, MICROS_PER_SECOND) * NANOS_PER_MICROS).toInt)
+      .plusDays(cal.get(Calendar.DAY_OF_MONTH) - 1)
     instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant)
   }
 
@@ -1011,7 +1016,12 @@ object DateTimeUtils {
     val localDate = LocalDate.of(
       utcCal.get(Calendar.YEAR),
       utcCal.get(Calendar.MONTH) + 1,
-      utcCal.get(Calendar.DAY_OF_MONTH))
+      // The number of days will be added later to handle non-existing
+      // Julian dates in Proleptic Gregorian calendar.
+      // For example, 1000-02-29 exists in Julian calendar because 1000
+      // is a leap year but it is not a leap year in Gregorian calendar.
+      1)
+      .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
     Math.toIntExact(localDate.toEpochDay)
   }
 
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index f2ad9e6..96da4be 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -670,6 +670,17 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
     }
   }
 
+  private def parseToJulianMicros(s: String): Long = {
+    val ts = Timestamp.valueOf(s)
+    val julianMicros = fromMillis(ts.getTime) +
+      ((ts.getNanos / NANOS_PER_MICROS) % MICROS_PER_MILLIS)
+    julianMicros
+  }
+
+  private def parseToGregMicros(s: String, zoneId: ZoneId): Long = {
+    instantToMicros(LocalDateTime.parse(s).atZone(zoneId).toInstant)
+  }
+
   test("rebase julian to/from gregorian micros") {
     outstandingTimezones.foreach { timeZone =>
       withDefaultTimeZone(timeZone) {
@@ -684,30 +695,27 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
           "1970-01-01 00:00:00.000001", // The epoch day
           "2020-03-14 09:33:01.500000").foreach { ts =>
           withClue(s"time zone = ${timeZone.getID} ts = $ts") {
-            val julianTs = Timestamp.valueOf(ts)
-            val julianMicros = fromMillis(julianTs.getTime) +
-              ((julianTs.getNanos / NANOS_PER_MICROS) % MICROS_PER_MILLIS)
-            val gregorianMicros = 
instantToMicros(LocalDateTime.parse(ts.replace(' ', 'T'))
-              .atZone(timeZone.toZoneId)
-              .toInstant)
+            val julianMicros = parseToJulianMicros(ts)
+            val gregMicros = parseToGregMicros(ts.replace(' ', 'T'), 
timeZone.toZoneId)
 
-            assert(rebaseJulianToGregorianMicros(julianMicros) === 
gregorianMicros)
-            assert(rebaseGregorianToJulianMicros(gregorianMicros) === 
julianMicros)
+            assert(rebaseJulianToGregorianMicros(julianMicros) === gregMicros)
+            assert(rebaseGregorianToJulianMicros(gregMicros) === julianMicros)
           }
         }
       }
     }
   }
 
+  // millisToDays() and fromJavaDate() are taken from Spark 2.4
+  private def millisToDaysLegacy(millisUtc: Long, timeZone: TimeZone): Int = {
+    val millisLocal = millisUtc + timeZone.getOffset(millisUtc)
+    Math.floor(millisLocal.toDouble / MILLIS_PER_DAY).toInt
+  }
+  private def fromJavaDateLegacy(date: Date): Int = {
+    millisToDaysLegacy(date.getTime, defaultTimeZone())
+  }
+
   test("rebase gregorian to/from julian days") {
-    // millisToDays() and fromJavaDate() are taken from Spark 2.4
-    def millisToDays(millisUtc: Long, timeZone: TimeZone): Int = {
-      val millisLocal = millisUtc + timeZone.getOffset(millisUtc)
-      Math.floor(millisLocal.toDouble / MILLIS_PER_DAY).toInt
-    }
-    def fromJavaDate(date: Date): Int = {
-      millisToDays(date.getTime, defaultTimeZone())
-    }
     outstandingTimezones.foreach { timeZone =>
       withDefaultTimeZone(timeZone) {
         Seq(
@@ -720,7 +728,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
           "1969-12-31",
           "1970-01-01", // The epoch day
           "2020-03-14").foreach { date =>
-          val julianDays = fromJavaDate(Date.valueOf(date))
+          val julianDays = fromJavaDateLegacy(Date.valueOf(date))
           val gregorianDays = localDateToDays(LocalDate.parse(date))
 
           assert(rebaseGregorianToJulianDays(gregorianDays) === julianDays)
@@ -729,4 +737,44 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
       }
     }
   }
+
+  test("rebase julian to gregorian date for leap years") {
+    outstandingTimezones.foreach { timeZone =>
+      withDefaultTimeZone(timeZone) {
+        Seq(
+          "1000-02-29" -> "1000-03-01",
+          "1600-02-29" -> "1600-02-29",
+          "1700-02-29" -> "1700-03-01",
+          "2000-02-29" -> "2000-02-29").foreach { case (julianDate, gregDate) 
=>
+          withClue(s"tz = ${timeZone.getID} julian date = $julianDate greg 
date = $gregDate") {
+            val date = Date.valueOf(julianDate)
+            val julianDays = fromJavaDateLegacy(date)
+            val gregorianDays = localDateToDays(LocalDate.parse(gregDate))
+
+            assert(rebaseJulianToGregorianDays(julianDays) === gregorianDays)
+          }
+        }
+      }
+    }
+  }
+
+  test("rebase julian to gregorian timestamp for leap years") {
+    outstandingTimezones.foreach { timeZone =>
+      withDefaultTimeZone(timeZone) {
+        Seq(
+          "1000-02-29 01:02:03.123456" -> "1000-03-01T01:02:03.123456",
+          "1600-02-29 11:12:13.654321" -> "1600-02-29T11:12:13.654321",
+          "1700-02-29 21:22:23.000001" -> "1700-03-01T21:22:23.000001",
+          "2000-02-29 00:00:00.999999" -> "2000-02-29T00:00:00.999999"
+        ).foreach { case (julianTs, gregTs) =>
+          withClue(s"tz = ${timeZone.getID} julian ts = $julianTs greg ts = 
$gregTs") {
+            val julianMicros = parseToJulianMicros(julianTs)
+            val gregorianMicros = parseToGregMicros(gregTs, timeZone.toZoneId)
+
+            assert(rebaseJulianToGregorianMicros(julianMicros) === 
gregorianMicros)
+          }
+        }
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to