This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7083ec0 [SPARK-28215][SQL][R] as_tibble was removed from Arrow R API
7083ec0 is described below
commit 7083ec051ed47b9c6500f2107650814f0ff9206f
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Mon Jul 1 13:21:06 2019 +0900
[SPARK-28215][SQL][R] as_tibble was removed from Arrow R API
## What changes were proposed in this pull request?
The new Arrow R API removed `as_tibble` as of
https://github.com/apache/arrow/commit/2ef96c8623cbad1770f82e97df733bd881ab967b.
Arrow optimization for DataFrame in R doesn't work due to the change.
This can be tested as below, after installing the latest Arrow:
```
./bin/sparkR --conf spark.sql.execution.arrow.sparkr.enabled=true
```
```
> collect(createDataFrame(mtcars))
```
Before this PR:
```
> collect(createDataFrame(mtcars))
Error in get("as_tibble", envir = asNamespace("arrow")) :
object 'as_tibble' not found
```
After:
```
> collect(createDataFrame(mtcars))
mpg cyl disp hp drat wt qsec vs am gear carb
1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
...
```
## How was this patch tested?
Manual test.
Closes #25012 from viirya/SPARK-28215.
Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
---
R/pkg/R/DataFrame.R | 10 ++++++++--
R/pkg/R/deserialize.R | 13 ++++++++++---
2 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 439cad0..6f3c7c1 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1203,7 +1203,8 @@ setMethod("collect",
requireNamespace1 <- requireNamespace
if (requireNamespace1("arrow", quietly = TRUE)) {
read_arrow <- get("read_arrow", envir = asNamespace("arrow"),
inherits = FALSE)
- as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
+ # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
+ useAsTibble <- exists("as_tibble", envir =
asNamespace("arrow"))
portAuth <- callJMethod(x@sdf, "collectAsArrowToR")
port <- portAuth[[1]]
@@ -1213,7 +1214,12 @@ setMethod("collect",
output <- tryCatch({
doServerAuth(conn, authSecret)
arrowTable <- read_arrow(readRaw(conn))
- as.data.frame(as_tibble(arrowTable), stringsAsFactors =
stringsAsFactors)
+ if (useAsTibble) {
+ as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
+ as.data.frame(as_tibble(arrowTable), stringsAsFactors =
stringsAsFactors)
+ } else {
+ as.data.frame(arrowTable, stringsAsFactors =
stringsAsFactors)
+ }
}, finally = {
close(conn)
})
diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index 191c51e..b38d245 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -237,7 +237,9 @@ readDeserializeInArrow <- function(inputCon) {
if (requireNamespace1("arrow", quietly = TRUE)) {
RecordBatchStreamReader <- get(
"RecordBatchStreamReader", envir = asNamespace("arrow"), inherits =
FALSE)
- as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
+ # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
+ useAsTibble <- exists("as_tibble", envir = asNamespace("arrow"))
+
# Currently, there looks no way to read batch by batch by socket
connection in R side,
# See ARROW-4512. Therefore, it reads the whole Arrow streaming-formatted
binary at once
@@ -246,8 +248,13 @@ readDeserializeInArrow <- function(inputCon) {
arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big")
batches <- RecordBatchStreamReader(arrowData)$batches()
- # Read all groupped batches. Tibble -> data.frame is cheap.
- lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
+ if (useAsTibble) {
+ as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
+ # Read all groupped batches. Tibble -> data.frame is cheap.
+ lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
+ } else {
+ lapply(batches, function(batch) as.data.frame(batch))
+ }
} else {
stop("'arrow' package should be installed.")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]