This is an automated email from the ASF dual-hosted git repository.
yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 1e14c5c9edb0 [SPARK-54693][CORE][TESTS] Add LZ4TPCDSDataBenchmark
1e14c5c9edb0 is described below
commit 1e14c5c9edb06cf8894101c726d8d62b9ce498af
Author: Cheng Pan <[email protected]>
AuthorDate: Mon Dec 15 16:19:02 2025 +0800
[SPARK-54693][CORE][TESTS] Add LZ4TPCDSDataBenchmark
### What changes were proposed in this pull request?
Add `LZ4TPCDSDataBenchmark`, which tests `LZ4CompressionCodec` against the TPC-DS
`catalog_sales.dat` file (SF1), whose size is about 283M.
### Why are the changes needed?
Add a benchmark to measure the performance impact of the lz4 security upgrade.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Benchmark results were added after the initial change. Since the change includes a refactor
that touches `ZStandardTPCDSDataBenchmark`, its benchmark results were updated as well.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #53453 from pan3793/SPARK-54693.
Lead-authored-by: Cheng Pan <[email protected]>
Co-authored-by: pan3793 <[email protected]>
Signed-off-by: yangjie01 <[email protected]>
---
.github/workflows/benchmark.yml | 4 +-
.../LZ4TPCDSDataBenchmark-jdk21-results.txt | 17 +++++
core/benchmarks/LZ4TPCDSDataBenchmark-results.txt | 17 +++++
.../ZStandardTPCDSDataBenchmark-jdk21-results.txt | 60 ++++++++---------
.../ZStandardTPCDSDataBenchmark-results.txt | 68 +++++++++----------
.../apache/spark/io/LZ4TPCDSDataBenchmark.scala | 78 ++++++++++++++++++++++
.../org/apache/spark/io/TPCDSDataBenchmark.scala | 64 ++++++++++++++++++
.../spark/io/ZStandardTPCDSDataBenchmark.scala | 31 ++++-----
8 files changed, 254 insertions(+), 85 deletions(-)
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 3e90bb329be5..b0454c016640 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -72,7 +72,7 @@ jobs:
# Any TPC-DS related updates on this job need to be applied to tpcds-1g job
of build_and_test.yml as well
tpcds-1g-gen:
name: "Generate an TPC-DS dataset with SF=1"
- if: contains(inputs.class, 'TPCDSQueryBenchmark') ||
contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || contains(inputs.class,
'*')
+ if: contains(inputs.class, 'TPCDSQueryBenchmark') ||
contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class,
'ZStandardTPCDSDataBenchmark') || contains(inputs.class, '*')
runs-on: ubuntu-latest
env:
SPARK_LOCAL_IP: localhost
@@ -177,7 +177,7 @@ jobs:
distribution: zulu
java-version: ${{ inputs.jdk }}
- name: Cache TPC-DS generated data
- if: contains(inputs.class, 'TPCDSQueryBenchmark') ||
contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || contains(inputs.class,
'*')
+ if: contains(inputs.class, 'TPCDSQueryBenchmark') ||
contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class,
'ZStandardTPCDSDataBenchmark') || contains(inputs.class, '*')
id: cache-tpcds-sf-1
uses: actions/cache@v4
with:
diff --git a/core/benchmarks/LZ4TPCDSDataBenchmark-jdk21-results.txt
b/core/benchmarks/LZ4TPCDSDataBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..64fb5dc7875a
--- /dev/null
+++ b/core/benchmarks/LZ4TPCDSDataBenchmark-jdk21-results.txt
@@ -0,0 +1,17 @@
+================================================================================================
+Benchmark LZ4CompressionCodec
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Compression: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Compression 4 times 2621 2633
16 0.0 655315176.3 1.0X
+
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Decompression: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Decompression 4 times 2270 2286
23 0.0 567457203.5 1.0X
+
+
diff --git a/core/benchmarks/LZ4TPCDSDataBenchmark-results.txt
b/core/benchmarks/LZ4TPCDSDataBenchmark-results.txt
new file mode 100644
index 000000000000..ff05d27454ec
--- /dev/null
+++ b/core/benchmarks/LZ4TPCDSDataBenchmark-results.txt
@@ -0,0 +1,17 @@
+================================================================================================
+Benchmark LZ4CompressionCodec
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Compression: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Compression 4 times 2614 2621
10 0.0 653597000.8 1.0X
+
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Decompression: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Decompression 4 times 2346 2361
22 0.0 586455894.5 1.0X
+
+
diff --git a/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
b/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
index 7630433a2f7a..bdf21e4ad764 100644
--- a/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
+++ b/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
@@ -2,48 +2,48 @@
Benchmark ZStandardCompressionCodec
================================================================================================
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
AMD EPYC 7763 64-Core Processor
-Benchmark ZStandardCompressionCodec: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+Compression: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
-Compression 4 times at level 1 without buffer pool 2567
2573 9 0.0 641834272.8 1.0X
-Compression 4 times at level 2 without buffer pool 4171
4173 2 0.0 1042717610.7 0.6X
-Compression 4 times at level 3 without buffer pool 6202
6205 4 0.0 1550416592.3 0.4X
-Compression 4 times at level 1 with buffer pool 2562
2562 0 0.0 640541340.8 1.0X
-Compression 4 times at level 2 with buffer pool 4172
4173 1 0.0 1043037897.5 0.6X
-Compression 4 times at level 3 with buffer pool 6199
6203 4 0.0 1549874630.3 0.4X
+Compression 4 times at level 1 without buffer pool 2585
2586 1 0.0 646239352.5 1.0X
+Compression 4 times at level 2 without buffer pool 4211
4216 7 0.0 1052689958.8 0.6X
+Compression 4 times at level 3 without buffer pool 6328
6328 1 0.0 1581904435.8 0.4X
+Compression 4 times at level 1 with buffer pool 2585
2587 3 0.0 646306514.0 1.0X
+Compression 4 times at level 2 with buffer pool 4193
4195 3 0.0 1048293261.0 0.6X
+Compression 4 times at level 3 with buffer pool 6264
6284 28 0.0 1565962696.5 0.4X
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
AMD EPYC 7763 64-Core Processor
-Benchmark ZStandardCompressionCodec: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+Decompression: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
-Decompression 4 times from level 1 without buffer pool 856
875 17 0.0 214044635.0 1.0X
-Decompression 4 times from level 2 without buffer pool 1141
1147 10 0.0 285134371.0 0.8X
-Decompression 4 times from level 3 without buffer pool 1345
1345 0 0.0 336239892.8 0.6X
-Decompression 4 times from level 1 with buffer pool 844
851 12 0.0 210917530.8 1.0X
-Decompression 4 times from level 2 with buffer pool 1111
1126 22 0.0 277630951.0 0.8X
-Decompression 4 times from level 3 with buffer pool 1338
1348 13 0.0 334616619.0 0.6X
+Decompression 4 times from level 1 without buffer pool 845
877 53 0.0 211244491.3 1.0X
+Decompression 4 times from level 2 without buffer pool 1141
1145 6 0.0 285267829.3 0.7X
+Decompression 4 times from level 3 without buffer pool 1358
1366 11 0.0 339611143.3 0.6X
+Decompression 4 times from level 1 with buffer pool 856
862 10 0.0 213948489.5 1.0X
+Decompression 4 times from level 2 with buffer pool 1153
1156 5 0.0 288195943.3 0.7X
+Decompression 4 times from level 3 with buffer pool 1368
1375 9 0.0 342032342.0 0.6X
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
AMD EPYC 7763 64-Core Processor
Parallel Compression at level 3: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers 2099 2099
0 0.0 524768920.0 1.0X
-Parallel Compression with 1 workers 2119 2119
0 0.0 529758581.0 1.0X
-Parallel Compression with 2 workers 1087 1090
4 0.0 271687124.0 1.9X
-Parallel Compression with 4 workers 761 764
2 0.0 190289606.5 2.8X
-Parallel Compression with 8 workers 782 789
7 0.0 195588720.8 2.7X
-Parallel Compression with 16 workers 897 908
10 0.0 224272497.0 2.3X
+Parallel Compression with 0 workers 2128 2130
3 0.0 531987145.5 1.0X
+Parallel Compression with 1 workers 2117 2131
20 0.0 529324483.0 1.0X
+Parallel Compression with 2 workers 1106 1107
0 0.0 276552237.3 1.9X
+Parallel Compression with 4 workers 771 774
3 0.0 192777338.5 2.8X
+Parallel Compression with 8 workers 815 826
13 0.0 203648175.7 2.6X
+Parallel Compression with 16 workers 950 963
11 0.0 237496152.5 2.2X
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
AMD EPYC 7763 64-Core Processor
Parallel Compression at level 9: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers 8240 8242
2 0.0 2060062709.3 1.0X
-Parallel Compression with 1 workers 6915 6924
13 0.0 1728757390.0 1.2X
-Parallel Compression with 2 workers 3663 3664
1 0.0 915832528.8 2.2X
-Parallel Compression with 4 workers 3239 3247
12 0.0 809829698.3 2.5X
-Parallel Compression with 8 workers 3594 3610
23 0.0 898542204.8 2.3X
-Parallel Compression with 16 workers 3888 3898
14 0.0 972017914.3 2.1X
+Parallel Compression with 0 workers 8976 9140
231 0.0 2244064460.3 1.0X
+Parallel Compression with 1 workers 7198 7214
24 0.0 1799376873.8 1.2X
+Parallel Compression with 2 workers 3836 3850
20 0.0 959020774.8 2.3X
+Parallel Compression with 4 workers 3271 3284
18 0.0 817803690.0 2.7X
+Parallel Compression with 8 workers 3806 3807
2 0.0 951490679.3 2.4X
+Parallel Compression with 16 workers 4004 4026
31 0.0 1000957120.3 2.2X
diff --git a/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
b/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
index 01784a686f81..b69321de8861 100644
--- a/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
+++ b/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
@@ -2,48 +2,48 @@
Benchmark ZStandardCompressionCodec
================================================================================================
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
-Benchmark ZStandardCompressionCodec: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Compression: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
-Compression 4 times at level 1 without buffer pool 2513
2519 10 0.0 628127696.5 1.0X
-Compression 4 times at level 2 without buffer pool 3914
3916 3 0.0 978555442.0 0.6X
-Compression 4 times at level 3 without buffer pool 5417
5424 9 0.0 1354342306.3 0.5X
-Compression 4 times at level 1 with buffer pool 2524
2526 4 0.0 630890054.5 1.0X
-Compression 4 times at level 2 with buffer pool 3911
3911 0 0.0 977705511.5 0.6X
-Compression 4 times at level 3 with buffer pool 5447
5451 5 0.0 1361830044.0 0.5X
+Compression 4 times at level 1 without buffer pool 2572
2573 0 0.0 643092504.0 1.0X
+Compression 4 times at level 2 without buffer pool 4223
4224 2 0.0 1055729472.5 0.6X
+Compression 4 times at level 3 without buffer pool 6306
6307 1 0.0 1576606903.0 0.4X
+Compression 4 times at level 1 with buffer pool 2566
2567 2 0.0 641403366.5 1.0X
+Compression 4 times at level 2 with buffer pool 4229
4229 0 0.0 1057252850.2 0.6X
+Compression 4 times at level 3 with buffer pool 6369
6370 1 0.0 1592238794.8 0.4X
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
-Benchmark ZStandardCompressionCodec: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Decompression: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
-Decompression 4 times from level 1 without buffer pool 1053
1056 4 0.0 263342668.0 1.0X
-Decompression 4 times from level 2 without buffer pool 1401
1403 3 0.0 350193821.5 0.8X
-Decompression 4 times from level 3 without buffer pool 1683
1683 1 0.0 420625735.0 0.6X
-Decompression 4 times from level 1 with buffer pool 1078
1079 2 0.0 269507650.5 1.0X
-Decompression 4 times from level 2 with buffer pool 1397
1403 8 0.0 349213546.5 0.8X
-Decompression 4 times from level 3 with buffer pool 1679
1680 2 0.0 419721856.3 0.6X
+Decompression 4 times from level 1 without buffer pool 914
915 1 0.0 228498988.8 1.0X
+Decompression 4 times from level 2 without buffer pool 1186
1188 2 0.0 296544159.0 0.8X
+Decompression 4 times from level 3 without buffer pool 1424
1424 0 0.0 355996464.8 0.6X
+Decompression 4 times from level 1 with buffer pool 915
916 2 0.0 228733774.3 1.0X
+Decompression 4 times from level 2 with buffer pool 1189
1192 4 0.0 297227461.0 0.8X
+Decompression 4 times from level 3 with buffer pool 1421
1425 5 0.0 355242896.0 0.6X
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
Parallel Compression at level 3: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers 1815 1825
14 0.0 453734902.0 1.0X
-Parallel Compression with 1 workers 1654 1663
12 0.0 413617141.5 1.1X
-Parallel Compression with 2 workers 857 859
2 0.0 214195314.3 2.1X
-Parallel Compression with 4 workers 777 781
3 0.0 194366718.5 2.3X
-Parallel Compression with 8 workers 784 786
3 0.0 195965343.5 2.3X
-Parallel Compression with 16 workers 836 858
19 0.0 208885356.8 2.2X
+Parallel Compression with 0 workers 2142 2143
2 0.0 535512459.0 1.0X
+Parallel Compression with 1 workers 2145 2146
2 0.0 536274942.0 1.0X
+Parallel Compression with 2 workers 1094 1100
9 0.0 273416030.8 2.0X
+Parallel Compression with 4 workers 768 769
2 0.0 192023783.8 2.8X
+Parallel Compression with 8 workers 831 839
13 0.0 207743301.0 2.6X
+Parallel Compression with 16 workers 960 972
16 0.0 240059822.0 2.2X
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
Parallel Compression at level 9: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers 8302 8310
12 0.0 2075384302.0 1.0X
-Parallel Compression with 1 workers 6869 6871
3 0.0 1717196107.5 1.2X
-Parallel Compression with 2 workers 3570 3587
25 0.0 892437927.8 2.3X
-Parallel Compression with 4 workers 3152 3152
1 0.0 787940909.5 2.6X
-Parallel Compression with 8 workers 3463 3470
10 0.0 865739615.8 2.4X
-Parallel Compression with 16 workers 4061 4083
31 0.0 1015251097.3 2.0X
+Parallel Compression with 0 workers 9016 9031
21 0.0 2254017491.8 1.0X
+Parallel Compression with 1 workers 7423 7467
62 0.0 1855653850.8 1.2X
+Parallel Compression with 2 workers 4203 4242
54 0.0 1050838310.7 2.1X
+Parallel Compression with 4 workers 3472 3481
13 0.0 867935936.0 2.6X
+Parallel Compression with 8 workers 3952 3996
62 0.0 988086500.5 2.3X
+Parallel Compression with 16 workers 4081 4115
47 0.0 1020289087.7 2.2X
diff --git
a/core/src/test/scala/org/apache/spark/io/LZ4TPCDSDataBenchmark.scala
b/core/src/test/scala/org/apache/spark/io/LZ4TPCDSDataBenchmark.scala
new file mode 100644
index 000000000000..f3f2bd1057b7
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/io/LZ4TPCDSDataBenchmark.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.io
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, OutputStream}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark for LZ4 codec performance.
+ * {{{
+ * To run this benchmark:
+ * 1. without sbt: bin/spark-submit --class <this class> <spark core test
jar>
+ * 2. build/sbt "core/Test/runMain <this class>"
+ * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt
"core/Test/runMain <this class>"
+ * Results will be written to
"benchmarks/LZ4TPCDSDataBenchmark-results.txt".
+ * }}}
+ */
+object LZ4TPCDSDataBenchmark extends TPCDSDataBenchmark {
+
+ override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+ prepareData()
+ runBenchmark("Benchmark LZ4CompressionCodec") {
+ compressionBenchmark()
+ decompressionBenchmark()
+ }
+ }
+
+ private def compressionBenchmark(): Unit = {
+ val benchmark = new Benchmark("Compression", N, output = output)
+ val conf = new SparkConf(false)
+ benchmark.addCase(s"Compression $N times") { _ =>
+ (1 until N).foreach { _ =>
+ val os = new LZ4CompressionCodec(conf)
+ .compressedOutputStream(OutputStream.nullOutputStream())
+ os.write(data)
+ os.close()
+ }
+ }
+ benchmark.run()
+ }
+
+ private def decompressionBenchmark(): Unit = {
+ val benchmark = new Benchmark("Decompression", N, output = output)
+ val conf = new SparkConf(false)
+ val outputStream = new ByteArrayOutputStream()
+ val out = new
LZ4CompressionCodec(conf).compressedOutputStream(outputStream)
+ out.write(data)
+ out.close()
+ val bytes = outputStream.toByteArray
+
+ benchmark.addCase(s"Decompression $N times") { _ =>
+ (1 until N).foreach { _ =>
+ val bais = new ByteArrayInputStream(bytes)
+ val is = new LZ4CompressionCodec(conf).compressedInputStream(bais)
+ is.readAllBytes()
+ is.close()
+ }
+ }
+ benchmark.run()
+ }
+}
diff --git a/core/src/test/scala/org/apache/spark/io/TPCDSDataBenchmark.scala
b/core/src/test/scala/org/apache/spark/io/TPCDSDataBenchmark.scala
new file mode 100644
index 000000000000..b6b419e0201a
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/io/TPCDSDataBenchmark.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.io
+
+import java.nio.file.{Files, Paths}
+
+import org.apache.spark.benchmark.BenchmarkBase
+
+/**
+ * TPC-DS data preparation:
+ * <p>
+ * 1. Follow https://github.com/gregrahn/tpcds-kit.git to set up tpcds-kit
+ * <p>
+ * 2. Create a folder and export environment variable SPARK_TPCDS_DATA_TEXT
+ * {{{
+ * mkdir -p /path/of/tpcds-sf1-text
+ * export SPARK_TPCDS_DATA_TEXT=/path/of/tpcds-sf1-text
+ * }}}
+ * <p>
+ * 3. Generate TPC-DS (SF1) text data under SPARK_TPCDS_DATA_TEXT
+ * {{{
+ * tpcds-kit/tools/dsdgen \
+ * -DISTRIBUTIONS tpcds-kit/tools/tpcds.idx \
+ * -SCALE 1 \
+ * -DIR $SPARK_TPCDS_DATA_TEXT
+ * }}}
+ */
+abstract class TPCDSDataBenchmark extends BenchmarkBase {
+
+ val N = 4
+
+ var data: Array[Byte] = _
+
+ protected def prepareData(): Unit = {
+ val tpcDsDataDir = sys.env.get("SPARK_TPCDS_DATA_TEXT")
+ assert(tpcDsDataDir.nonEmpty, "Can not find env var SPARK_TPCDS_DATA_TEXT")
+
+ val catalogSalesDatPath = Paths.get(tpcDsDataDir.get, "catalog_sales.dat")
+ assert(Files.exists(catalogSalesDatPath), s"File $catalogSalesDatPath does
not exists, " +
+ s"please follow instruction to generate the TPC-DS (SF1) text data
first.")
+
+ // the size of TPCDS catalog_sales.dat (SF1) is about 283M
+ data = Files.readAllBytes(catalogSalesDatPath)
+ }
+
+ override def afterAll(): Unit = {
+ data = null
+ }
+}
diff --git
a/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
b/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
index 69820c15be30..a546d03c8eb3 100644
--- a/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
+++ b/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
@@ -18,10 +18,9 @@
package org.apache.spark.io
import java.io.{ByteArrayInputStream, ByteArrayOutputStream,
ObjectOutputStream, OutputStream}
-import java.nio.file.{Files, Paths}
import org.apache.spark.SparkConf
-import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
+import org.apache.spark.benchmark.Benchmark
import
org.apache.spark.internal.config.{IO_COMPRESSION_ZSTD_BUFFERPOOL_ENABLED,
IO_COMPRESSION_ZSTD_LEVEL, IO_COMPRESSION_ZSTD_WORKERS}
/**
@@ -34,28 +33,19 @@ import
org.apache.spark.internal.config.{IO_COMPRESSION_ZSTD_BUFFERPOOL_ENABLED,
* Results will be written to
"benchmarks/ZStandardTPCDSDataBenchmark-results.txt".
* }}}
*/
-object ZStandardTPCDSDataBenchmark extends BenchmarkBase {
-
- val N = 4
-
- // the size of TPCDS catalog_sales.dat (SF1) is about 283M
- val data = Files.readAllBytes(Paths.get(sys.env("SPARK_TPCDS_DATA_TEXT"),
"catalog_sales.dat"))
+object ZStandardTPCDSDataBenchmark extends TPCDSDataBenchmark {
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
- val name = "Benchmark ZStandardCompressionCodec"
- runBenchmark(name) {
- val benchmark1 = new Benchmark(name, N, output = output)
- compressionBenchmark(benchmark1, N)
- benchmark1.run()
-
- val benchmark2 = new Benchmark(name, N, output = output)
- decompressionBenchmark(benchmark2, N)
- benchmark2.run()
+ prepareData()
+ runBenchmark("Benchmark ZStandardCompressionCodec") {
+ compressionBenchmark()
+ decompressionBenchmark()
parallelCompressionBenchmark()
}
}
- private def compressionBenchmark(benchmark: Benchmark, N: Int): Unit = {
+ private def compressionBenchmark(): Unit = {
+ val benchmark = new Benchmark("Compression", N, output = output)
Seq(false, true).foreach { enablePool =>
Seq(1, 2, 3).foreach { level =>
val conf = new SparkConf(false)
@@ -72,9 +62,11 @@ object ZStandardTPCDSDataBenchmark extends BenchmarkBase {
}
}
}
+ benchmark.run()
}
- private def decompressionBenchmark(benchmark: Benchmark, N: Int): Unit = {
+ private def decompressionBenchmark(): Unit = {
+ val benchmark = new Benchmark("Decompression", N, output = output)
Seq(false, true).foreach { enablePool =>
Seq(1, 2, 3).foreach { level =>
val conf = new SparkConf(false)
@@ -97,6 +89,7 @@ object ZStandardTPCDSDataBenchmark extends BenchmarkBase {
}
}
}
+ benchmark.run()
}
private def parallelCompressionBenchmark(): Unit = {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]