This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 59d1603a48b [SPARK-18011] Fix SparkR NA date serialization
59d1603a48b is described below
commit 59d1603a48b014a3bf3153c8ffdce8e281fda451
Author: Jove Yuan <[email protected]>
AuthorDate: Tue Jan 24 22:06:32 2023 +0900
[SPARK-18011] Fix SparkR NA date serialization
### What changes were proposed in this pull request?
This PR ensures that SparkR serializes `NA` dates as `"NA"` (string) to
avoid an undefined length when deserializing in the JVM.
### Why are the changes needed?
Currently, the JVM side treats a `NegativeArraySizeException` thrown while
deserializing dates and timestamps as an `NA`. This handling can be made more
robust by ensuring that serialization on the R side always writes a valid
string length (note that `nchar(as.character(as.Date(NA)))` is `NA`, so the
length of a naively serialized `NA` date is undefined).
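
The failure mode can be seen in plain R, without Spark (a minimal snippet;
`d` is just a local example value):

    d <- as.Date(NA)
    as.character(d)         # NA_character_, not the string "NA"
    nchar(as.character(d))  # NA, so any length prefix computed from it is undefined

With this patch, `writeDate` checks `is.na(date)` first and writes the literal
string `"NA"`, which the JVM side already recognizes as null.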
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
The existing unit test in `test_sparkSQL.R` covering `NA` date and time values
was updated to exercise this change.
Closes #39681 from joveyuan-db/SPARK-18011.
Authored-by: Jove Yuan <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
R/pkg/R/serialize.R | 6 +++-
R/pkg/tests/fulltests/test_sparkSQL.R | 2 +-
.../main/scala/org/apache/spark/api/r/SerDe.scala | 36 ++++++++--------------
3 files changed, 19 insertions(+), 25 deletions(-)
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index 8a582b4c808..61e174de9ac 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -208,7 +208,11 @@ writeEnv <- function(con, env) {
 }

 writeDate <- function(con, date) {
-  writeString(con, as.character(date))
+  if (is.na(date)) {
+    writeString(con, "NA")
+  } else {
+    writeString(con, as.character(date))
+  }
 }

 writeTime <- function(con, time) {
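
To see where the undefined length bites, consider a hypothetical sketch of a
length-prefixed string write in the style of the SerDe protocol (the actual
`writeString` internals are not shown in this diff and may differ):

    # Hypothetical sketch only; not the real SparkR writeString.
    writeStringSketch <- function(con, value) {
      utf8Val <- enc2utf8(value)
      writeBin(nchar(utf8Val, type = "bytes") + 1L, con, endian = "big")  # length prefix
      writeBin(utf8Val, con)  # writeBin() adds a NUL terminator for character vectors
    }

In some R versions the length prefix for a missing string evaluates to
`NA_integer_`, which is stored with the bit pattern of INT_MIN, so the JVM
reads a large negative length.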
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index a7a1d4fe1f4..e5408840e72 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -495,7 +495,7 @@ test_that("SPARK-17902: collect() with stringsAsFactors enabled", {
   expect_equal(iris$Species, df$Species)
 })

-test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
+test_that("SPARK-17811, SPARK-18011: can create DataFrame containing NA as date and time", {
   df <- data.frame(
     id = 1:2,
     time = c(as.POSIXlt("2016-01-10"), NA),
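
For reference, a standalone round trip along the same lines as the updated
test, assuming a running SparkR session (hypothetical snippet, not part of
the patch):

    df <- createDataFrame(data.frame(id = 1:2, date = c(as.Date("2016-10-01"), NA)))
    head(collect(df))  # the NA date should round-trip as NA instead of failing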
diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
index f06d3d00126..57fc8a997cc 100644
--- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
@@ -133,33 +133,23 @@ private[spark] object SerDe {
   }

   def readDate(in: DataInputStream): Date = {
-    try {
-      val inStr = readString(in)
-      if (inStr == "NA") {
-        null
-      } else {
-        Date.valueOf(inStr)
-      }
-    } catch {
-      // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE
-      case _: NegativeArraySizeException => null
+    val inStr = readString(in)
+    if (inStr == "NA") {
+      null
+    } else {
+      Date.valueOf(inStr)
     }
   }

   def readTime(in: DataInputStream): Timestamp = {
-    try {
-      val seconds = in.readDouble()
-      if (java.lang.Double.isNaN(seconds)) {
-        null
-      } else {
-        val sec = Math.floor(seconds).toLong
-        val t = new Timestamp(sec * 1000L)
-        t.setNanos(((seconds - sec) * 1e9).toInt)
-        t
-      }
-    } catch {
-      // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE
-      case _: NegativeArraySizeException => null
+    val seconds = in.readDouble()
+    if (java.lang.Double.isNaN(seconds)) {
+      null
+    } else {
+      val sec = Math.floor(seconds).toLong
+      val t = new Timestamp(sec * 1000L)
+      t.setNanos(((seconds - sec) * 1e9).toInt)
+      t
     }
   }

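For context, `readString` (not part of this diff) allocates a byte array
sized by the length prefix, which is how a bad length from the R side
surfaced as a `NegativeArraySizeException`. A simplified sketch, assuming
the usual length-prefixed wire format with a trailing NUL (the real
`SerDe.readString` may differ):

    // Simplified sketch only; not the actual SerDe.readString code.
    def readStringSketch(in: java.io.DataInputStream): String = {
      val len = in.readInt()            // an NA length from R arrives as a negative int
      val bytes = new Array[Byte](len)  // negative len => NegativeArraySizeException
      in.readFully(bytes)
      new String(bytes, 0, len - 1, java.nio.charset.StandardCharsets.UTF_8)  // drop NUL
    }

`readTime` is unaffected by string lengths, since `NA` timestamps travel as a
NaN double; its `try`/`catch` is removed in the same cleanup, as `readDouble`
cannot throw `NegativeArraySizeException`.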
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]