paleolimbot opened a new issue, #40217:
URL: https://github.com/apache/arrow/issues/40217
### Describe the bug, including details regarding any error messages, version, and platform.

From the performance report for #40197, apparently we get:

```
 *** caught segfault ***
address 0x3d, cause 'memory not mapped'

Traceback:
 1: RecordBatchReader__UnsafeDelete(self)
 2: reader$.unsafe_delete()
 3: as_arrow_table.arrow_dplyr_query(x)
```

for the job:

```
engine=arrow, format=parquet, language=R, memory_map=False, query_id=TPCH-21, scale_factor=10
```

`RecordBatchReader__UnsafeDelete()` is something I added, but I would have to look into its use again to ensure it is not getting called twice. I believe it was introduced to ensure that open files were closed promptly, since leaving them open caused problems on Windows. (A sketch of the kind of guard that would make a second call harmless is at the end of this issue.)

Also, there was a huge regression last July:

<img width="443" alt="Screenshot 2024-02-23 at 4 32 28 PM" src="https://github.com/apache/arrow/assets/10995762/b6170627-1c00-48ad-bb1c-38a5a37914ca">

https://conbench.ursa.dev/benchmark-results/065d8d9e6ab17d1e8000cb6422edfa64/

Full dump:

<details>

```
 *** caught segfault ***
address 0x3d, cause 'memory not mapped'

Traceback:
 1: RecordBatchReader__UnsafeDelete(self)
 2: reader$.unsafe_delete()
 3: as_arrow_table.arrow_dplyr_query(x)
 4: as_arrow_table(x)
 5: doTryCatch(return(expr), name, parentenv, handler)
 6: tryCatchOne(expr, names, parentenv, handlers[[1L]])
 7: tryCatchList(expr, classes, parentenv, handlers)
 8: tryCatch(as_arrow_table(x), error = function(e, call = caller_env(n = 4)) { augment_io_error_msg(e, call, schema = schema())})
 9: compute.arrow_dplyr_query(x)
10: collect.arrow_dplyr_query(.)
11: collect_func(.)
12: input_func("supplier") %>% inner_join(line_items, by = c(s_suppkey = "l_suppkey")) %>% filter(l_receiptdate > l_commitdate) %>% inner_join(input_func("nation"), by = c(s_nationkey = "n_nationkey")) %>% filter(n_name == "SAUDI ARABIA") %>% group_by(s_name) %>% summarise(numwait = n()) %>% ungroup() %>% arrange(desc(numwait), s_name) %>% head(100) %>% collect_func()
13: query(input_func, collect_func, con)
14: eval(bm$run, envir = ctx)
15: eval(bm$run, envir = ctx)
16: eval(expr, p)
17: eval.parent(...)
18: as_bench_time(.Call(system_time_, substitute(expr), parent.frame()))
19: stats::setNames(as_bench_time(.Call(system_time_, substitute(expr), parent.frame())), c("process", "real"))
20: bench::bench_time(eval.parent(...))
21: eval(expr, p)
22: eval.parent(expr)
23: with_profiling(profiling, { timings <- bench::bench_time(eval.parent(...))})
24: force(expr)
25: with_gc_info({ prof_file <- with_profiling(profiling, { timings <- bench::bench_time(eval.parent(...)) })})
26: measure(eval(bm$run, envir = ctx), profiling = profiling, drop_caches = drop_caches)
27: run_iteration(bm = bm, ctx = ctx, profiling = profiling, drop_caches = global_params[["drop_caches"]])
28: withCallingHandlers({ results[[i]] <- run_iteration(bm = bm, ctx = ctx, profiling = profiling, drop_caches = global_params[["drop_caches"]])}, warning = function(w) { warnings <<- c(warnings, list(list(warning = as.character(w), stack_trace = vapply(traceback(3), function(x) paste(x, collapse = "\n"), character(1)))))})
29: doTryCatch(return(expr), name, parentenv, handler)
30: tryCatchOne(expr, names, parentenv, handlers[[1L]])
31: tryCatchList(expr, classes, parentenv, handlers)
32: tryCatch(withCallingHandlers({ results[[i]] <- run_iteration(bm = bm, ctx = ctx, profiling = profiling, drop_caches = global_params[["drop_caches"]])}, warning = function(w) { warnings <<- c(warnings, list(list(warning = as.character(w), stack_trace = vapply(traceback(3), function(x) paste(x, collapse = "\n"), character(1)))))}), error = function(e) { error <<- list(error = as.character(e), stack_trace = vapply(traceback(3), function(x) paste(x, collapse = "\n"), character(1)))})
33: run_bm(format = "parquet", scale_factor = 10, engine = "arrow", memory_map = FALSE, query_id = 21, bm = structure(list(name = "tpch", setup = function(engine = "arrow", query_id = 1:22, format = c("native", "parquet"), scale_factor = c(1, 10), memory_map = FALSE, output = "data_frame", chunk_size = NULL) { engine <- match.arg(engine, c("arrow", "duckdb", "duckdb_sql", "dplyr")) format <- match.arg(format, c("parquet", "feather", "native")) stopifnot(`query_id must be an int` = query_id%%1 == 0, `query_id must 1-22` = query_id >= 1 & query_id <= 22) output <- match.arg(output, c("arrow_table", "data_frame")) library("dplyr", warn.conflicts = FALSE) collect_func <- collect if (output == "data_frame") { collect_func <- collect } else if (output == "arrow_table") { collect_func <- compute } con <- NULL if (engine %in% c("duckdb", "duckdb_sql")) { con <- DBI::dbConnect(duckdb::duckdb()) DBI::dbExecute(con, paste0("PRAGMA threads=", getOption("Ncpus"))) } BenchEnvironment(input_func = get_input_func(engine = engine, scale_factor = scale_factor, query_id = query_id, format = format, con = con, memory_map = memory_map, chunk_size = chunk_size), query = get_query_func(query_id, engine), engine = engine, con = con, scale_factor = scale_factor, query_id = query_id, collect_func = collect_func) }, before_each = quote({ result <- NULL }), run = quote({ result <- query(input_func, collect_func, con) }), after_each = quote({ if (scale_factor %in% c(0.01, 0.1, 1, 10)) { answer <- tpch_answer(scale_factor, query_id) result <- dplyr::as_tibble(result) all_equal_out <- waldo::compare(result, answer, tolerance = 0.01) if (length(all_equal_out) != 0) { warning(paste0("\n", all_equal_out, "\n")) stop("The answer does not match") } } else { warning("There is no validation for scale_factors other than 0.01, 0.1, 1, and 10. Be careful with these results!") } result <- NULL }), teardown = quote({ if (!is.null(con)) { DBI::dbDisconnect(con, shutdown = TRUE) } }), valid_params = function(params) { drop <- (params$engine != "arrow" & params$format == "feather") | (params$engine != "arrow" & params$output == "arrow_table") | (params$engine != "arrow" & params$memory_map == TRUE) | (params$engine == "dplyr" & params$format == "native") params[!drop, ] }, case_version = function(params) NULL, batch_id_fun = function(params) { batch_id <- uuid() paste0(batch_id, "-", params$scale_factor, substr(params$format, 1, 1)) }, tags_fun = function(params) { params$query_id <- sprintf("TPCH-%02d", params$query_id) if (!is.null(params$output) && params$output == "data_frame") { params$output <- NULL } params }, packages_used = function(params) { c(params$engine, "dplyr", "lubridate") }), class = "Benchmark"), n_iter = 1, batch_id = NULL, profiling = FALSE, global_params = list(cpu_count = NULL, lib_path = "latest"), run_id = NULL, run_name = NULL, run_reason = NULL)
An irrecoverable exception occurred. R is aborting now ...
Segmentation fault (core dumped)
```

</details>

### Component(s)

R
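If the crash really is a second call into the deleter, one way to make it harmless is to clear the stored pointer on the first call. A minimal sketch, assuming the R-level reader holds a heap-allocated `std::shared_ptr` behind an external pointer; the names and the double-pointer plumbing here are illustrative, not the actual arrow/r binding code:

```cpp
#include <memory>

class RecordBatchReader;  // stands in for arrow::RecordBatchReader

// Hypothetical idempotent unsafe delete: release the reader on the first
// call, then clear the slot so a repeated call (e.g. a second
// reader$.unsafe_delete()) becomes a no-op instead of touching freed memory.
void UnsafeDeleteReader(std::shared_ptr<RecordBatchReader>** slot) {
  if (slot == nullptr || *slot == nullptr) {
    return;  // already deleted: nothing left to release
  }
  delete *slot;     // drops the reference, closing any open files promptly
  *slot = nullptr;  // mark the slot empty for any later call
}
```

An equivalent guard at the R level (checking for an already-cleared pointer before calling into C++) would work too; either way, the second call has to become a no-op.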