paleolimbot opened a new issue, #40217:
URL: https://github.com/apache/arrow/issues/40217
### Describe the bug, including details regarding any error messages, version, and platform.
From the performance report for #40197, apparently we get:
```
 *** caught segfault ***
address 0x3d, cause 'memory not mapped'

Traceback:
 1: RecordBatchReader__UnsafeDelete(self)
 2: reader$.unsafe_delete()
 3: as_arrow_table.arrow_dplyr_query(x)
```
for the job:
```
engine=arrow, format=parquet, language=R, memory_map=False, query_id=TPCH-21, scale_factor=10
```
`RecordBatchReader__UnsafeDelete()` is something I added, but I would have to look into its usage again to make sure it is not being called twice... I believe it was introduced so that open files are closed promptly, since leaving them open caused problems on Windows.
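If the double-call theory is right, one direction would be to make `.unsafe_delete()` idempotent, so a second call never reaches C++ with an already-freed reader. A minimal sketch with a mock class and a hypothetical `deleted` flag (this is not the actual arrow R source):

```r
library(R6)

# Sketch of an idempotent unsafe-delete: remember that the underlying C++
# reader was already released so a repeated call becomes a no-op instead of
# passing a dangling pointer back to RecordBatchReader__UnsafeDelete().
MockReader <- R6Class("MockReader",
  private = list(deleted = FALSE),
  public = list(
    .unsafe_delete = function() {
      if (private$deleted) {
        return(invisible(self))  # already released; do nothing
      }
      # In arrow this is where RecordBatchReader__UnsafeDelete(self) would
      # free the C++ reader (and close its open files) immediately.
      private$deleted <- TRUE
      invisible(self)
    }
  )
)

r <- MockReader$new()
r$.unsafe_delete()
r$.unsafe_delete()  # safe: the second call is a no-op
```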
There was also a huge regression last July:
<img width="443" alt="Screenshot 2024-02-23 at 4 32 28 PM"
src="https://github.com/apache/arrow/assets/10995762/b6170627-1c00-48ad-bb1c-38a5a37914ca">
https://conbench.ursa.dev/benchmark-results/065d8d9e6ab17d1e8000cb6422edfa64/
Full dump:
<details>
```
 *** caught segfault ***
address 0x3d, cause 'memory not mapped'

Traceback:
 1: RecordBatchReader__UnsafeDelete(self)
 2: reader$.unsafe_delete()
 3: as_arrow_table.arrow_dplyr_query(x)
 4: as_arrow_table(x)
 5: doTryCatch(return(expr), name, parentenv, handler)
 6: tryCatchOne(expr, names, parentenv, handlers[[1L]])
 7: tryCatchList(expr, classes, parentenv, handlers)
 8: tryCatch(as_arrow_table(x), error = function(e, call = caller_env(n = 4)) { augment_io_error_msg(e, call, schema = schema())})
 9: compute.arrow_dplyr_query(x)
10: collect.arrow_dplyr_query(.)
11: collect_func(.)
12: input_func("supplier") %>% inner_join(line_items, by = c(s_suppkey = "l_suppkey")) %>% filter(l_receiptdate > l_commitdate) %>% inner_join(input_func("nation"), by = c(s_nationkey = "n_nationkey")) %>% filter(n_name == "SAUDI ARABIA") %>% group_by(s_name) %>% summarise(numwait = n()) %>% ungroup() %>% arrange(desc(numwait), s_name) %>% head(100) %>% collect_func()
13: query(input_func, collect_func, con)
14: eval(bm$run, envir = ctx)
15: eval(bm$run, envir = ctx)
16: eval(expr, p)
17: eval.parent(...)
18: as_bench_time(.Call(system_time_, substitute(expr), parent.frame()))
19: stats::setNames(as_bench_time(.Call(system_time_, substitute(expr), parent.frame())), c("process", "real"))
20: bench::bench_time(eval.parent(...))
21: eval(expr, p)
22: eval.parent(expr)
23: with_profiling(profiling, { timings <- bench::bench_time(eval.parent(...))})
24: force(expr)
25: with_gc_info({ prof_file <- with_profiling(profiling, { timings <- bench::bench_time(eval.parent(...)) })})
26: measure(eval(bm$run, envir = ctx), profiling = profiling, drop_caches = drop_caches)
27: run_iteration(bm = bm, ctx = ctx, profiling = profiling, drop_caches = global_params[["drop_caches"]])
28: withCallingHandlers({ results[[i]] <- run_iteration(bm = bm, ctx = ctx, profiling = profiling, drop_caches = global_params[["drop_caches"]])}, warning = function(w) { warnings <<- c(warnings, list(list(warning = as.character(w), stack_trace = vapply(traceback(3), function(x) paste(x, collapse = "\n"), character(1)))))})
29: doTryCatch(return(expr), name, parentenv, handler)
30: tryCatchOne(expr, names, parentenv, handlers[[1L]])
31: tryCatchList(expr, classes, parentenv, handlers)
32: tryCatch(withCallingHandlers({ results[[i]] <- run_iteration(bm = bm, ctx = ctx, profiling = profiling, drop_caches = global_params[["drop_caches"]])}, warning = function(w) { warnings <<- c(warnings, list(list(warning = as.character(w), stack_trace = vapply(traceback(3), function(x) paste(x, collapse = "\n"), character(1)))))}), error = function(e) { error <<- list(error = as.character(e), stack_trace = vapply(traceback(3), function(x) paste(x, collapse = "\n"), character(1)))})
33: run_bm(format = "parquet", scale_factor = 10, engine = "arrow", memory_map = FALSE, query_id = 21,
    bm = structure(list(name = "tpch",
      setup = function(engine = "arrow", query_id = 1:22, format = c("native", "parquet"),
                       scale_factor = c(1, 10), memory_map = FALSE, output = "data_frame",
                       chunk_size = NULL) {
        engine <- match.arg(engine, c("arrow", "duckdb", "duckdb_sql", "dplyr"))
        format <- match.arg(format, c("parquet", "feather", "native"))
        stopifnot(`query_id must be an int` = query_id%%1 == 0,
                  `query_id must 1-22` = query_id >= 1 & query_id <= 22)
        output <- match.arg(output, c("arrow_table", "data_frame"))
        library("dplyr", warn.conflicts = FALSE)
        collect_func <- collect
        if (output == "data_frame") {
          collect_func <- collect
        } else if (output == "arrow_table") {
          collect_func <- compute
        }
        con <- NULL
        if (engine %in% c("duckdb", "duckdb_sql")) {
          con <- DBI::dbConnect(duckdb::duckdb())
          DBI::dbExecute(con, paste0("PRAGMA threads=", getOption("Ncpus")))
        }
        BenchEnvironment(input_func = get_input_func(engine = engine, scale_factor = scale_factor,
            query_id = query_id, format = format, con = con, memory_map = memory_map,
            chunk_size = chunk_size),
          query = get_query_func(query_id, engine), engine = engine, con = con,
          scale_factor = scale_factor, query_id = query_id, collect_func = collect_func)
      },
      before_each = quote({ result <- NULL }),
      run = quote({ result <- query(input_func, collect_func, con) }),
      after_each = quote({
        if (scale_factor %in% c(0.01, 0.1, 1, 10)) {
          answer <- tpch_answer(scale_factor, query_id)
          result <- dplyr::as_tibble(result)
          all_equal_out <- waldo::compare(result, answer, tolerance = 0.01)
          if (length(all_equal_out) != 0) {
            warning(paste0("\n", all_equal_out, "\n"))
            stop("The answer does not match")
          }
        } else {
          warning("There is no validation for scale_factors other than 0.01, 0.1, 1, and 10. Be careful with these results!")
        }
        result <- NULL
      }),
      teardown = quote({
        if (!is.null(con)) {
          DBI::dbDisconnect(con, shutdown = TRUE)
        }
      }),
      valid_params = function(params) {
        drop <- (params$engine != "arrow" & params$format == "feather") |
          (params$engine != "arrow" & params$output == "arrow_table") |
          (params$engine != "arrow" & params$memory_map == TRUE) |
          (params$engine == "dplyr" & params$format == "native")
        params[!drop, ]
      },
      case_version = function(params) NULL,
      batch_id_fun = function(params) {
        batch_id <- uuid()
        paste0(batch_id, "-", params$scale_factor, substr(params$format, 1, 1))
      },
      tags_fun = function(params) {
        params$query_id <- sprintf("TPCH-%02d", params$query_id)
        if (!is.null(params$output) && params$output == "data_frame") {
          params$output <- NULL
        }
        params
      },
      packages_used = function(params) { c(params$engine, "dplyr", "lubridate") }), class = "Benchmark"),
    n_iter = 1, batch_id = NULL, profiling = FALSE,
    global_params = list(cpu_count = NULL, lib_path = "latest"),
    run_id = NULL, run_name = NULL, run_reason = NULL)

An irrecoverable exception occurred. R is aborting now ...
Segmentation fault (core dumped)
```
</details>
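For reference, this is the pipeline at frame 12 of the traceback (TPC-H query 21 as the benchmark writes it; `input_func`, `line_items`, and `collect_func` come from the benchmark environment), reformatted for readability:

```r
input_func("supplier") %>%
  inner_join(line_items, by = c(s_suppkey = "l_suppkey")) %>%
  filter(l_receiptdate > l_commitdate) %>%
  inner_join(input_func("nation"), by = c(s_nationkey = "n_nationkey")) %>%
  filter(n_name == "SAUDI ARABIA") %>%
  group_by(s_name) %>%
  summarise(numwait = n()) %>%
  ungroup() %>%
  arrange(desc(numwait), s_name) %>%
  head(100) %>%
  collect_func()
```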
### Component(s)
R