zhuqi-lucas commented on code in PR #21711:
URL: https://github.com/apache/datafusion/pull/21711#discussion_r3121505367
##########
benchmarks/bench.sh:
##########
@@ -1154,84 +1154,143 @@ run_sort_pushdown_sorted() {
# Generates data for sort pushdown Inexact benchmark.
#
-# Produces a single large lineitem parquet file where row groups have
-# NON-OVERLAPPING but OUT-OF-ORDER l_orderkey ranges (each RG internally
-# sorted, RGs shuffled). This simulates append-heavy workloads where data
-# is written in batches at different times.
+# Produces multiple parquet files where each file has MULTIPLE row groups
+# with scrambled RG order. This tests both:
+# - Row-group-level reorder within each file (reorder_by_statistics)
+# - TopK threshold initialization from RG statistics
+#
+# Strategy:
+# 1. Write a single sorted file with small (100K-row) RGs (~61 RGs total).
+# 2. Use pyarrow to redistribute RGs into N_FILES files, scrambling the
+# RG order within each file using a deterministic permutation.
+# Each file gets ~61/N_FILES RGs with narrow, non-overlapping ranges
+# but in scrambled order.
+#
+# Writing a single file with ORDER BY scramble does NOT work: the parquet
+# writer merges rows from adjacent chunks at RG boundaries, widening
+# ranges and defeating reorder_by_statistics.
+#
+# Requires pyarrow (pip install pyarrow).
data_sort_pushdown_inexact() {
INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact/lineitem"
if [ -d "${INEXACT_DIR}" ] && [ "$(ls -A ${INEXACT_DIR}/*.parquet 2>/dev/null)" ]; then
echo "Sort pushdown Inexact data already exists at ${INEXACT_DIR}"
return
fi
- echo "Generating sort pushdown Inexact benchmark data (single file, shuffled RGs)..."
+ echo "Generating sort pushdown Inexact benchmark data (multi-file, scrambled RGs)..."
# Re-use the sort_pushdown data as the source (generate if missing)
data_sort_pushdown
mkdir -p "${INEXACT_DIR}"
SRC_DIR="${DATA_DIR}/sort_pushdown/lineitem"
- # Use datafusion-cli to bucket rows into 64 groups by a deterministic
- # scrambler, then sort within each bucket by orderkey. This produces
- # ~64 RG-sized segments where each has a tight orderkey range but the
- # segments appear in scrambled (non-sorted) order in the file.
+ # Step 1: Write a single sorted file with small (100K-row) RGs
+ TMPFILE="${INEXACT_DIR}/_sorted_small_rgs.parquet"
(cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
CREATE EXTERNAL TABLE src
STORED AS PARQUET
LOCATION '${SRC_DIR}';
- COPY (
- SELECT * FROM src
- ORDER BY
- (l_orderkey * 1664525 + 1013904223) % 64,
- l_orderkey
- )
- TO '${INEXACT_DIR}/shuffled.parquet'
+ COPY (SELECT * FROM src ORDER BY l_orderkey)
+ TO '${TMPFILE}'
STORED AS PARQUET
OPTIONS ('format.max_row_group_size' '100000');
")
- echo "Sort pushdown Inexact shuffled data generated at ${INEXACT_DIR}"
+ # Step 2: Redistribute RGs into 3 files with scrambled RG order.
+ # Each file gets ~20 RGs. RG assignment: rg_idx % 3 determines file,
+ # permutation (rg_idx * 41 + 7) % n scrambles the order within file.
+ python3 -c "
Review Comment:
Addressed in latest PR!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]