zhuqi-lucas commented on code in PR #21674:
URL: https://github.com/apache/datafusion/pull/21674#discussion_r3093910770
##########
benchmarks/bench.sh:
##########
@@ -1137,6 +1144,59 @@ run_sort_pushdown_sorted() {
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted
--iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path
"${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG}
${LATENCY_ARG}
}
+# Generates data for sort pushdown Inexact benchmark.
+#
+# Produces a single large lineitem parquet file where row groups have
+# NON-OVERLAPPING but OUT-OF-ORDER l_orderkey ranges (each RG internally
+# sorted, RGs shuffled). This simulates append-heavy workloads where data
+# is written in batches at different times.
+data_sort_pushdown_inexact() {
+ INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact/lineitem"
+ if [ -d "${INEXACT_DIR}" ] && [ "$(ls -A ${INEXACT_DIR}/*.parquet
2>/dev/null)" ]; then
+ echo "Sort pushdown Inexact data already exists at ${INEXACT_DIR}"
+ return
+ fi
+
+ echo "Generating sort pushdown Inexact benchmark data (single file,
shuffled RGs)..."
+
+ # Re-use the sort_pushdown data as the source (generate if missing)
+ data_sort_pushdown
+
+ mkdir -p "${INEXACT_DIR}"
+ SRC_DIR="${DATA_DIR}/sort_pushdown/lineitem"
+
+ # Use datafusion-cli to bucket rows into 64 groups by a deterministic
+ # scrambler, then sort within each bucket by orderkey. This produces
+ # ~64 RG-sized segments where each has a tight orderkey range but the
+ # segments appear in scrambled (non-sorted) order in the file.
+ (cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
+ CREATE EXTERNAL TABLE src
+ STORED AS PARQUET
+ LOCATION '${SRC_DIR}';
+
+ COPY (
+ SELECT * FROM src
+ ORDER BY
+ (l_orderkey * 1664525 + 1013904223) % 64,
+ l_orderkey
+ )
+ TO '${INEXACT_DIR}/shuffled.parquet'
+ STORED AS PARQUET
+ OPTIONS ('format.max_row_group_size' '100000');
+ ")
+
+ echo "Sort pushdown Inexact data generated at ${INEXACT_DIR}"
+ ls -la "${INEXACT_DIR}"
+}
+
+# Runs the sort pushdown Inexact benchmark (tests RG reorder by statistics).
+run_sort_pushdown_inexact() {
+ INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact"
+ RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact.json"
+ echo "Running sort pushdown Inexact benchmark (row group reorder by
statistics)..."
+ debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted
--iterations 5 --path "${INEXACT_DIR}" --queries-path
"${SCRIPT_DIR}/queries/sort_pushdown_inexact" -o "${RESULTS_FILE}" ${QUERY_ARG}
${LATENCY_ARG}
Review Comment:
`--sorted` adds `WITH ORDER (l_orderkey ASC)` which is needed to trigger the
reverse scan path in `try_pushdown_sort` — currently the only path where
`reorder_by_statistics` is called.
After reviewing this more carefully, I realized we need two benchmark suites
to cover the full optimization path:
1. `sort_pushdown_inexact` (with `--sorted`, DESC queries) — tests the
reverse scan path where RG reorder is already supported (#21580)
2. `sort_pushdown_inexact_unsorted` (without `--sorted`, ASC+DESC queries) —
tests the Unsupported path where RG reorder will be supported in a follow-up PR
Updated in the latest push. This way each follow-up PR can run its
corresponding benchmark to show the improvement.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]