This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 41b22c4 [SPARK-34424][SQL][TESTS][3.1][3.0] Fix failures of
HiveOrcHadoopFsRelationSuite
41b22c4 is described below
commit 41b22c4d8b5f5082a602d3adf6d3e405484a442b
Author: Max Gekk <[email protected]>
AuthorDate: Fri Feb 19 13:26:37 2021 +0000
[SPARK-34424][SQL][TESTS][3.1][3.0] Fix failures of
HiveOrcHadoopFsRelationSuite
### What changes were proposed in this pull request?
Modify `RandomDataGenerator.forType()` to allow generation of
dates/timestamps that are valid in both Julian and Proleptic Gregorian
calendars. Currently, the function can produce a date (for example
`1582-10-06`) which is valid in the Proleptic Gregorian calendar. Though it
cannot be saved to ORC files AS IS since ORC format (ORC libs in fact) assumes
Julian calendar. So, Spark shifts `1582-10-06` to the next valid date
`1582-10-15` while saving it to ORC files. And as a consequence [...]
In this PR, I propose to generate valid dates/timestamps in both calendars
for ORC datasource till SPARK-34440 is resolved.
### Why are the changes needed?
The changes fix failures of `HiveOrcHadoopFsRelationSuite`. For instance,
the test "test all data types" fails with the seed **610710213676**:
```
== Results ==
!== Correct Answer - 20 == == Spark Answer - 20 ==
struct<index:int,col:date> struct<index:int,col:date>
...
![9,1582-10-06] [9,1582-10-15]
```
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running the modified test suite:
```
$ build/sbt -Phive -Phive-thriftserver "test:testOnly
*HiveOrcHadoopFsRelationSuite"
```
Authored-by: Max Gekk <max.gekkgmail.com>
Signed-off-by: HyukjinKwon <gurwls223apache.org>
(cherry picked from commit 03161055de0c132070354407160553363175c4d7)
Signed-off-by: Max Gekk <max.gekkgmail.com>
Closes #31589 from MaxGekk/fix-HiveOrcHadoopFsRelationSuite-3.1.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../org/apache/spark/sql/RandomDataGenerator.scala | 73 ++++++++++++++--------
.../spark/sql/sources/HadoopFsRelationTest.scala | 13 +++-
2 files changed, 58 insertions(+), 28 deletions(-)
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 9fa27c7..0fc040f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -149,12 +149,15 @@ object RandomDataGenerator {
* @param dataType the type to generate values for
* @param nullable whether null values should be generated
* @param rand an optional random number generator
+ * @param validJulianDatetime whether to generate dates and timestamps that
are valid
+ * in the Julian calendar.
* @return a function which can be called to generate random values.
*/
def forType(
dataType: DataType,
nullable: Boolean = true,
- rand: Random = new Random): Option[() => Any] = {
+ rand: Random = new Random,
+ validJulianDatetime: Boolean = false): Option[() => Any] = {
val valueGenerator: Option[() => Any] = dataType match {
case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN)))
case BinaryType => Some(() => {
@@ -182,29 +185,37 @@ object RandomDataGenerator {
"1970-01-01", // the epoch date
"9999-12-31" // the last supported date according to SQL standard
)
+ def getRandomDate(rand: Random): java.sql.Date = {
+ val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand))
+ // The generated `date` is based on the hybrid calendar Julian +
Gregorian since
+ // 1582-10-15 but it should be valid in Proleptic Gregorian calendar
too which is used
+ // by Spark SQL since version 3.0 (see SPARK-26651). We try to
convert `date` to
+ // a local date in Proleptic Gregorian calendar to satisfy this
requirement. Some
+ // years are leap years in Julian calendar but not in Proleptic
Gregorian calendar.
+ // As the consequence of that, 29 February of such years might not
exist in Proleptic
+ // Gregorian calendar. When this happens, we shift the date by one
day.
+ Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime +
MILLIS_PER_DAY))
+ }
if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) {
randomNumeric[LocalDate](
rand,
- (rand: Random) => LocalDate.ofEpochDay(uniformDaysRand(rand)),
+ (rand: Random) => {
+ val days = if (validJulianDatetime) {
+ DateTimeUtils.fromJavaDate(getRandomDate(rand))
+ } else {
+ uniformDaysRand(rand)
+ }
+ LocalDate.ofEpochDay(days)
+ },
specialDates.map(LocalDate.parse))
} else {
randomNumeric[java.sql.Date](
rand,
- (rand: Random) => {
- val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand))
- // The generated `date` is based on the hybrid calendar Julian +
Gregorian since
- // 1582-10-15 but it should be valid in Proleptic Gregorian
calendar too which is used
- // by Spark SQL since version 3.0 (see SPARK-26651). We try to
convert `date` to
- // a local date in Proleptic Gregorian calendar to satisfy this
requirement. Some
- // years are leap years in Julian calendar but not in Proleptic
Gregorian calendar.
- // As the consequence of that, 29 February of such years might
not exist in Proleptic
- // Gregorian calendar. When this happens, we shift the date by
one day.
- Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime +
MILLIS_PER_DAY))
- },
+ getRandomDate,
specialDates.map(java.sql.Date.valueOf))
}
case TimestampType =>
- def uniformMicorsRand(rand: Random): Long = {
+ def uniformMicrosRand(rand: Random): Long = {
var milliseconds = rand.nextLong() % 253402329599999L
// -62135740800000L is the number of milliseconds before January 1,
1970, 00:00:00 GMT
// for "0001-01-01 00:00:00.000000". We need to find a
@@ -222,10 +233,29 @@ object RandomDataGenerator {
"1970-01-01 00:00:00", // the epoch timestamp
"9999-12-31 23:59:59" // the last supported timestamp according to
SQL standard
)
+ def getRandomTimestamp(rand: Random): java.sql.Timestamp = {
+ // DateTimeUtils.toJavaTimestamp takes microsecond.
+ val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand))
+ // The generated `ts` is based on the hybrid calendar Julian +
Gregorian since
+ // 1582-10-15 but it should be valid in Proleptic Gregorian calendar
too which is used
+ // by Spark SQL since version 3.0 (see SPARK-26651). We try to
convert `ts` to
+ // a local timestamp in Proleptic Gregorian calendar to satisfy this
requirement. Some
+ // years are leap years in Julian calendar but not in Proleptic
Gregorian calendar.
+ // As the consequence of that, 29 February of such years might not
exist in Proleptic
+ // Gregorian calendar. When this happens, we shift the timestamp
`ts` by one day.
+ Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime +
MILLIS_PER_DAY))
+ }
if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) {
randomNumeric[Instant](
rand,
- (rand: Random) =>
DateTimeUtils.microsToInstant(uniformMicorsRand(rand)),
+ (rand: Random) => {
+ val micros = if (validJulianDatetime) {
+ DateTimeUtils.fromJavaTimestamp(getRandomTimestamp(rand))
+ } else {
+ uniformMicrosRand(rand)
+ }
+ DateTimeUtils.microsToInstant(micros)
+ },
specialTs.map { s =>
val ldt = LocalDateTime.parse(s.replace(" ", "T"))
ldt.atZone(ZoneId.systemDefault()).toInstant
@@ -233,18 +263,7 @@ object RandomDataGenerator {
} else {
randomNumeric[java.sql.Timestamp](
rand,
- (rand: Random) => {
- // DateTimeUtils.toJavaTimestamp takes microsecond.
- val ts = DateTimeUtils.toJavaTimestamp(uniformMicorsRand(rand))
- // The generated `ts` is based on the hybrid calendar Julian +
Gregorian since
- // 1582-10-15 but it should be valid in Proleptic Gregorian
calendar too which is used
- // by Spark SQL since version 3.0 (see SPARK-26651). We try to
convert `ts` to
- // a local timestamp in Proleptic Gregorian calendar to satisfy
this requirement. Some
- // years are leap years in Julian calendar but not in Proleptic
Gregorian calendar.
- // As the consequence of that, 29 February of such years might
not exist in Proleptic
- // Gregorian calendar. When this happens, we shift the timestamp
`ts` by one day.
- Try { ts.toLocalDateTime; ts }.getOrElse(new
Timestamp(ts.getTime + MILLIS_PER_DAY))
- },
+ getRandomTimestamp,
specialTs.map(java.sql.Timestamp.valueOf))
}
case CalendarIntervalType => Some(() => {
diff --git
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
index b65a004..9befd91 100644
---
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
+++
b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
@@ -18,6 +18,7 @@
package org.apache.spark.sql.sources
import java.io.File
+import java.util.Locale
import scala.util.Random
@@ -160,7 +161,17 @@ abstract class HadoopFsRelationTest extends QueryTest with
SQLTestUtils with Tes
val dataGenerator = RandomDataGenerator.forType(
dataType = dataType,
nullable = true,
- new Random(seed)
+ new Random(seed),
+ // TODO(SPARK-34440): Allow saving/loading datetime in ORC
w/o rebasing
+ // The ORC datasource always performs datetime rebasing that
can lead to
+ // shifting of the original dates/timestamps. For instance,
1582-10-06 is valid
+ // date in the Proleptic Gregorian calendar but it does not
exist in the Julian
+ // calendar. The ORC datasource shifts the date to the next
valid date 1582-10-15
+ // during rebasing of this date to the Julian calendar.
Since the test compares
+ // the original date before saving and the date loaded back
from the ORC files,
+ // we set `validJulianDatetime` to `true` to generate only
Proleptic Gregorian
+ // dates that exist in the Julian calendar and will be not
changed during rebase.
+ validJulianDatetime =
dataSourceName.toLowerCase(Locale.ROOT).contains("orc")
).getOrElse {
fail(s"Failed to create data generator for schema $dataType")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]