This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 1fd4607 [SPARK-31221][SQL] Rebase any date-times in conversions
to/from Java types
1fd4607 is described below
commit 1fd4607d844905f10fe1edbfd6282c25b95b233d
Author: Maxim Gekk <[email protected]>
AuthorDate: Tue Mar 24 21:14:25 2020 +0800
[SPARK-31221][SQL] Rebase any date-times in conversions to/from Java types
### What changes were proposed in this pull request?
In the PR, I propose to apply rebasing for all dates/timestamps in
conversion functions `fromJavaDate()`, `toJavaDate()`, `toJavaTimestamp()` and
`fromJavaTimestamp()`. The rebasing is performed via building a local date-time
in an original calendar, extracting date-time fields from the result, and
creating new local date-time in the target calendar.
### Why are the changes needed?
The changes are needed to stay compatible with previous Spark versions (2.4.5
and earlier) not only before the Gregorian cutover date `1582-10-15`
but also for dates after that date. For instance, Gregorian calendar
implementation in Java 7 `java.util.GregorianCalendar` is not accurate in
resolving time zone offsets as the Gregorian calendar introduced in Java 8.
### Does this PR introduce any user-facing change?
Yes, this PR can introduce behavior changes for dates after `1582-10-15`,
in particular conversions of zone ids to zone offsets will be much more
accurate.
### How was this patch tested?
By existing test suites `DateTimeUtilsSuite`, `DateFunctionsSuite`,
`DateExpressionsSuite`, `CollectionExpressionsSuite`,
`HiveOrcHadoopFsRelationSuite`, `ParquetIOSuite`.
Closes #27980 from MaxGekk/reuse-rebase-funcs-in-java-funcs.
Authored-by: Maxim Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/util/DateTimeUtils.scala | 117 ++++++++++++++-------
.../expressions/CollectionExpressionsSuite.scala | 28 ++---
.../sql/catalyst/util/DateTimeUtilsSuite.scala | 25 ++---
.../org/apache/spark/sql/hive/DaysWritable.scala | 39 +------
4 files changed, 109 insertions(+), 100 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 3a5f95b..268cd19 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -47,13 +47,6 @@ object DateTimeUtils {
// it's 2440587.5, rounding up to compatible with Hive
final val JULIAN_DAY_OF_EPOCH = 2440588
- final val GREGORIAN_CUTOVER_DAY = LocalDate.of(1582, 10, 15).toEpochDay
- final val GREGORIAN_CUTOVER_MICROS = instantToMicros(
- LocalDateTime.of(1582, 10, 15, 0, 0, 0)
- .atOffset(ZoneOffset.UTC)
- .toInstant)
- final val GREGORIAN_CUTOVER_MILLIS = microsToMillis(GREGORIAN_CUTOVER_MICROS)
-
final val julianCommonEraStart = Timestamp.valueOf("0001-01-01 00:00:00")
final val TimeZoneGMT = TimeZone.getTimeZone("GMT")
@@ -92,53 +85,103 @@ object DateTimeUtils {
}
/**
- * Returns the number of days since epoch from java.sql.Date.
+ * Converts an instance of `java.sql.Date` to a number of days since the
epoch
+ * 1970-01-01 via extracting date fields `year`, `month`, `days` from the
input,
+ * creating a local date in Proleptic Gregorian calendar from the fields, and
+ * getting the number of days from the resulted local date.
+ *
+ * This approach was taken to have the same local date as the triple of
`year`,
+ * `month`, `day` in the original hybrid calendar used by `java.sql.Date` and
+ * Proleptic Gregorian calendar used by Spark since version 3.0.0, see
SPARK-26651.
+ *
+ * @param date It represents a specific instant in time based on
+ * the hybrid calendar which combines Julian and
+ * Gregorian calendars.
+ * @return The number of days since epoch from java.sql.Date.
*/
def fromJavaDate(date: Date): SQLDate = {
- if (date.getTime < GREGORIAN_CUTOVER_MILLIS) {
- val era = if (date.before(julianCommonEraStart)) 0 else 1
- val localDate = date.toLocalDate.`with`(ChronoField.ERA, era)
- localDateToDays(localDate)
- } else {
- microsToDays(millisToMicros(date.getTime))
- }
+ val era = if (date.before(julianCommonEraStart)) 0 else 1
+ val localDate = LocalDate
+ .of(date.getYear + 1900, date.getMonth + 1, 1)
+ .`with`(ChronoField.ERA, era)
+ // Add days separately to convert dates existed in Julian calendar but
not
+ // in Proleptic Gregorian calendar. For example, 1000-02-29 is valid date
+ // in Julian calendar because 1000 is a leap year but 1000 is not a leap
+ // year in Proleptic Gregorian calendar. And 1000-02-29 doesn't exist in
it.
+ .plusDays(date.getDate - 1) // Returns the next valid date after
`date.getDate - 1` days
+ localDateToDays(localDate)
}
/**
- * Returns a java.sql.Date from number of days since epoch.
+ * The opposite to `fromJavaDate` method which converts a number of days to
an
+ * instance of `java.sql.Date`. It builds a local date in Proleptic Gregorian
+ * calendar, extracts date fields `year`, `month`, `day`, and creates a local
+ * date in the hybrid calendar (Julian + Gregorian calendars) from the
fields.
+ *
+ * The purpose of the conversion is to have the same local date as the triple
+ * of `year`, `month`, `day` in the original Proleptic Gregorian calendar and
+ * in the target calendar.
+ *
+ * @param daysSinceEpoch The number of days since 1970-01-01.
+ * @return A `java.sql.Date` from number of days since epoch.
*/
def toJavaDate(daysSinceEpoch: SQLDate): Date = {
- if (daysSinceEpoch < GREGORIAN_CUTOVER_DAY) {
- Date.valueOf(LocalDate.ofEpochDay(daysSinceEpoch))
- } else {
- new Date(microsToMillis(daysToMicros(daysSinceEpoch)))
- }
+ val localDate = LocalDate.ofEpochDay(daysSinceEpoch)
+ new Date(localDate.getYear - 1900, localDate.getMonthValue - 1,
localDate.getDayOfMonth)
}
/**
- * Returns a java.sql.Timestamp from number of micros since epoch.
+ * Converts microseconds since the epoch to an instance of
`java.sql.Timestamp`
+ * via creating a local timestamp at the system time zone in Proleptic
Gregorian
+ * calendar, extracting date and time fields like `year` and `hours`, and
forming
+ * new timestamp in the hybrid calendar from the extracted fields.
+ *
+ * The conversion is based on the JVM system time zone because the
`java.sql.Timestamp`
+ * uses the time zone internally.
+ *
+ * The method performs the conversion via local timestamp fields to have the
same date-time
+ * representation as `year`, `month`, `day`, ..., `seconds` in the original
calendar
+ * and in the target calendar.
+ *
+ * @param us The number of microseconds since 1970-01-01T00:00:00.000000Z.
+ * @return A `java.sql.Timestamp` from number of micros since epoch.
*/
def toJavaTimestamp(us: SQLTimestamp): Timestamp = {
- if (us < GREGORIAN_CUTOVER_MICROS) {
- val ldt =
microsToInstant(us).atZone(ZoneId.systemDefault()).toLocalDateTime
- Timestamp.valueOf(ldt)
- } else {
- Timestamp.from(microsToInstant(us))
- }
+ val ldt =
microsToInstant(us).atZone(ZoneId.systemDefault()).toLocalDateTime
+ Timestamp.valueOf(ldt)
}
/**
- * Returns the number of micros since epoch from java.sql.Timestamp.
+ * Converts an instance of `java.sql.Timestamp` to the number of
microseconds since
+ * 1970-01-01T00:00:00.000000Z. It extracts date-time fields from the input,
builds
+ * a local timestamp in Proleptic Gregorian calendar from the fields, and
binds
+ * the timestamp to the system time zone. The resulted instant is converted
to
+ * microseconds since the epoch.
+ *
+ * The conversion is performed via the system time zone because it is used
internally
+ * in `java.sql.Timestamp` while extracting date-time fields.
+ *
+ * The goal of the function is to have the same local date-time in the
original calendar
+ * - the hybrid calendar (Julian + Gregorian) and in the target calendar
which is
+ * Proleptic Gregorian calendar, see SPARK-26651.
+ *
+ * @param t It represents a specific instant in time based on
+ * the hybrid calendar which combines Julian and
+ * Gregorian calendars.
+ * @return The number of micros since epoch from `java.sql.Timestamp`.
*/
def fromJavaTimestamp(t: Timestamp): SQLTimestamp = {
- if (t.getTime < GREGORIAN_CUTOVER_MILLIS) {
- val era = if (t.before(julianCommonEraStart)) 0 else 1
- val localDateTime = t.toLocalDateTime.`with`(ChronoField.ERA, era)
- val instant = ZonedDateTime.of(localDateTime,
ZoneId.systemDefault()).toInstant
- instantToMicros(instant)
- } else {
- instantToMicros(t.toInstant)
- }
+ val era = if (t.before(julianCommonEraStart)) 0 else 1
+ val localDateTime = LocalDateTime.of(
+ t.getYear + 1900, t.getMonth + 1, 1,
+ t.getHours, t.getMinutes, t.getSeconds, t.getNanos)
+ .`with`(ChronoField.ERA, era)
+ // Add days separately to convert dates existed in Julian calendar but
not
+ // in Proleptic Gregorian calendar. For example, 1000-02-29 is valid date
+ // in Julian calendar because 1000 is a leap year but 1000 is not a leap
+ // year in Proleptic Gregorian calendar. And 1000-02-29 doesn't exist in
it.
+ .plusDays(t.getDate - 1) // Returns the next valid date after
`t.getDate - 1` days
+ instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant)
}
/**
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
index 173f248..e9d1da6 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -27,8 +27,8 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils}
import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_DAY
-import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
import org.apache.spark.sql.catalyst.util.IntervalUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
@@ -845,9 +845,11 @@ class CollectionExpressionsSuite extends SparkFunSuite
with ExpressionEvalHelper
test("Sequence on DST boundaries") {
val timeZone = TimeZone.getTimeZone("Europe/Prague")
- val dstOffset = timeZone.getDSTSavings
- def noDST(t: Timestamp): Timestamp = new Timestamp(t.getTime - dstOffset)
+ def ts(s: String, noDST: Boolean = false): Long = {
+ val offset = if (noDST) timeZone.getDSTSavings else 0
+ DateTimeUtils.millisToMicros(Timestamp.valueOf(s).getTime - offset)
+ }
DateTimeTestUtils.withDefaultTimeZone(timeZone) {
// Spring time change
@@ -856,9 +858,9 @@ class CollectionExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper
Literal(Timestamp.valueOf("2018-03-25 03:30:00")),
Literal(stringToInterval("interval 30 minutes"))),
Seq(
- Timestamp.valueOf("2018-03-25 01:30:00"),
- Timestamp.valueOf("2018-03-25 03:00:00"),
- Timestamp.valueOf("2018-03-25 03:30:00")))
+ ts("2018-03-25 01:30:00"),
+ ts("2018-03-25 03:00:00"),
+ ts("2018-03-25 03:30:00")))
// Autumn time change
checkEvaluation(new Sequence(
@@ -866,13 +868,13 @@ class CollectionExpressionsSuite extends SparkFunSuite
with ExpressionEvalHelper
Literal(Timestamp.valueOf("2018-10-28 03:30:00")),
Literal(stringToInterval("interval 30 minutes"))),
Seq(
- Timestamp.valueOf("2018-10-28 01:30:00"),
- noDST(Timestamp.valueOf("2018-10-28 02:00:00")),
- noDST(Timestamp.valueOf("2018-10-28 02:30:00")),
- Timestamp.valueOf("2018-10-28 02:00:00"),
- Timestamp.valueOf("2018-10-28 02:30:00"),
- Timestamp.valueOf("2018-10-28 03:00:00"),
- Timestamp.valueOf("2018-10-28 03:30:00")))
+ ts("2018-10-28 01:30:00"),
+ ts("2018-10-28 02:00:00", noDST = true),
+ ts("2018-10-28 02:30:00", noDST = true),
+ ts("2018-10-28 02:00:00"),
+ ts("2018-10-28 02:30:00"),
+ ts("2018-10-28 03:00:00"),
+ ts("2018-10-28 03:30:00")))
}
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 87bc2e1..4756e2f 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -88,13 +88,9 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers
with SQLHelper {
}
test("SPARK-6785: java date conversion before and after epoch") {
- def format(d: Date): String = {
- TimestampFormatter("yyyy-MM-dd", defaultTimeZone().toZoneId)
- .format(millisToMicros(d.getTime))
- }
def checkFromToJavaDate(d1: Date): Unit = {
val d2 = toJavaDate(fromJavaDate(d1))
- assert(format(d2) === format(d1))
+ assert(d2.toString === d1.toString)
}
val df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US)
@@ -500,17 +496,16 @@ class DateTimeUtilsSuite extends SparkFunSuite with
Matchers with SQLHelper {
}
}
- withDefaultTimeZone(TimeZone.getTimeZone("PST")) {
+ val tz = "America/Los_Angeles"
+ withDefaultTimeZone(TimeZone.getTimeZone(tz)) {
// Daylight Saving Time
- test("2016-03-13 01:59:59", "PST", "2016-03-13 09:59:59.0")
- // 2016-03-13 02:00:00 PST does not exists
- test("2016-03-13 02:00:00", "PST", "2016-03-13 10:00:00.0")
- test("2016-03-13 03:00:00", "PST", "2016-03-13 10:00:00.0")
- test("2016-11-06 00:59:59", "PST", "2016-11-06 07:59:59.0")
- // 2016-11-06 01:00:00 PST could be 2016-11-06 08:00:00 UTC or
2016-11-06 09:00:00 UTC
- test("2016-11-06 01:00:00", "PST", "2016-11-06 09:00:00.0")
- test("2016-11-06 01:59:59", "PST", "2016-11-06 09:59:59.0")
- test("2016-11-06 02:00:00", "PST", "2016-11-06 10:00:00.0")
+ test("2016-03-13 01:59:59", tz, "2016-03-13 09:59:59.0")
+ test("2016-03-13 02:00:00", tz, "2016-03-13 10:00:00.0")
+ test("2016-03-13 03:00:00", tz, "2016-03-13 10:00:00.0")
+ test("2016-11-06 00:59:59", tz, "2016-11-06 07:59:59.0")
+ test("2016-11-06 01:00:00", tz, "2016-11-06 08:00:00.0")
+ test("2016-11-06 01:59:59", tz, "2016-11-06 08:59:59.0")
+ test("2016-11-06 02:00:00", tz, "2016-11-06 10:00:00.0")
}
}
diff --git
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
index 862be33..1eec8d7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
@@ -23,7 +23,7 @@ import java.sql.Date
import org.apache.hadoop.hive.serde2.io.DateWritable
import org.apache.hadoop.io.WritableUtils
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import
org.apache.spark.sql.catalyst.util.DateTimeUtils.{rebaseGregorianToJulianDays,
rebaseJulianToGregorianDays}
/**
* The class accepts/returns days in Gregorian calendar and rebase them
@@ -41,13 +41,13 @@ private[hive] class DaysWritable(
extends DateWritable {
def this(gregorianDays: Int) =
- this(gregorianDays,
DaysWritable.rebaseGregorianToJulianDays(gregorianDays))
+ this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays))
def this(dateWritable: DateWritable) = {
this(
gregorianDays = dateWritable match {
case daysWritable: DaysWritable => daysWritable.gregorianDays
case dateWritable: DateWritable =>
- DaysWritable.rebaseJulianToGregorianDays(dateWritable.getDays)
+ rebaseJulianToGregorianDays(dateWritable.getDays)
},
julianDays = dateWritable.getDays)
}
@@ -63,37 +63,6 @@ private[hive] class DaysWritable(
@throws[IOException]
override def readFields(in: DataInput): Unit = {
julianDays = WritableUtils.readVInt(in)
- gregorianDays = DaysWritable.rebaseJulianToGregorianDays(julianDays)
+ gregorianDays = rebaseJulianToGregorianDays(julianDays)
}
}
-
-private[hive] object DaysWritable {
- // Rebasing days since the epoch to store the same number of days
- // as by Spark 2.4 and earlier versions. Spark 3.0 switched to
- // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of
that,
- // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use
- // Julian calendar for dates before 1582-10-15. So, the same local date may
- // be mapped to different number of days since the epoch in different
calendars.
- // For example:
- // Proleptic Gregorian calendar: 1582-01-01 -> -141714
- // Julian calendar: 1582-01-01 -> -141704
- // The code below converts -141714 to -141704.
- def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = {
- if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) {
- DateTimeUtils.rebaseGregorianToJulianDays(daysSinceEpoch)
- } else {
- daysSinceEpoch
- }
- }
-
- def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = {
- if (daysSinceEpoch < JULIAN_CUTOVER_DAY) {
- DateTimeUtils.rebaseJulianToGregorianDays(daysSinceEpoch)
- } else {
- daysSinceEpoch
- }
- }
-
- final val JULIAN_CUTOVER_DAY =
- rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt)
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]