This is an automated email from the ASF dual-hosted git repository.
gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 488c36247168 [SPARK-50788][TESTS] Add Benchmark for Large-Row Dataframe
488c36247168 is described below
commit 488c362471687eba9a0f6ed4280b007dba8a0050
Author: Yuchuan Huang <[email protected]>
AuthorDate: Tue Jan 14 17:11:16 2025 -0800
[SPARK-50788][TESTS] Add Benchmark for Large-Row Dataframe
### What changes were proposed in this pull request?
This PR introduces LargeRowBenchmark, a micro benchmark to the suite of
spark.sql.execution.benchmark. A corresponding function is also added to create
large-row dataframes during the benchmark running time.
### Why are the changes needed?
Large-row dataframes, especially dataframes with large string cells, are
becoming common in businesses such as online customer chat. However, it is
unknown how well or how poorly Spark is able to support them.
This benchmark aims to provide a baseline that indicates Spark's performance
and limitations on large-row dataframes. It will also be included in future
performance regression checks.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
It was tested in GitHub Actions and manually reviewed.
https://github.com/yhuang-db/spark/actions/runs/12716337093 (Java 17)
https://github.com/yhuang-db/spark/actions/runs/12716339158 (Java 21)
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #49447 from yhuang-db/large-row-benchmark.
Authored-by: Yuchuan Huang <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
---
.../benchmarks/LargeRowBenchmark-jdk21-results.txt | 26 +++++++
sql/core/benchmarks/LargeRowBenchmark-results.txt | 26 +++++++
.../execution/benchmark/LargeRowBenchmark.scala | 85 ++++++++++++++++++++++
3 files changed, 137 insertions(+)
diff --git a/sql/core/benchmarks/LargeRowBenchmark-jdk21-results.txt
b/sql/core/benchmarks/LargeRowBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..dbcf544b492d
--- /dev/null
+++ b/sql/core/benchmarks/LargeRowBenchmark-jdk21-results.txt
@@ -0,0 +1,26 @@
+================================================================================================
+Large Row Benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+#rows: 100, #cols: 10, cell: 1.3 MB: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+built-in UPPER 5909 6154
347 0.0 59088236.5 1.0X
+udf UPPER 4106 4364
364 0.0 41062501.9 1.4X
+
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+#rows: 1, #cols: 1, cell: 300.0 MB: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+built-in UPPER 1317 1319
3 0.0 1317449498.0 1.0X
+udf UPPER 954 975
25 0.0 953744994.0 1.4X
+
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.8.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+#rows: 1, #cols: 200, cell: 1.0 MB: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+built-in UPPER 1118 1138
28 0.0 1117901962.0 1.0X
+udf UPPER 1145 1210
91 0.0 1145234313.0 1.0X
+
+
diff --git a/sql/core/benchmarks/LargeRowBenchmark-results.txt
b/sql/core/benchmarks/LargeRowBenchmark-results.txt
new file mode 100644
index 000000000000..9fafe282238b
--- /dev/null
+++ b/sql/core/benchmarks/LargeRowBenchmark-results.txt
@@ -0,0 +1,26 @@
+================================================================================================
+Large Row Benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+#rows: 100, #cols: 10, cell: 1.3 MB: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+built-in UPPER 6610 6651
58 0.0 66101681.9 1.0X
+udf UPPER 4289 4291
3 0.0 42892607.0 1.5X
+
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+#rows: 1, #cols: 1, cell: 300.0 MB: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+built-in UPPER 1492 1510
26 0.0 1492292577.0 1.0X
+udf UPPER 1033 1034
1 0.0 1032584220.0 1.4X
+
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.8.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+#rows: 1, #cols: 200, cell: 1.0 MB: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+built-in UPPER 1271 1290
28 0.0 1270654457.0 1.0X
+udf UPPER 1397 1558
228 0.0 1396607518.0 0.9X
+
+
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/LargeRowBenchmark.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/LargeRowBenchmark.scala
new file mode 100644
index 000000000000..8b4f78e79913
--- /dev/null
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/LargeRowBenchmark.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.functions.lit
+
+/**
+ * Benchmark to measure performance for large row table.
+ * {{{
+ * To run this benchmark:
+ *   1. without sbt: bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/Test/runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
+ * Results will be written to "benchmarks/LargeRowBenchmark-results.txt".
+ * }}}
+ */
object LargeRowBenchmark extends SqlBasedBenchmark {

  /**
   * Prepares a table with large rows for benchmarking. The table is written
   * as Parquet files into the given path.
   *
   * @param path       output directory for the Parquet files
   * @param rowsNum    number of rows to generate
   * @param numCols    number of string columns per row
   * @param cellSizeMb size of each string cell, in megabytes
   */
  private def writeLargeRow(path: String, rowsNum: Int, numCols: Int,
      cellSizeMb: Double): Unit = {
    val stringLength = (cellSizeMb * 1024 * 1024).toInt
    spark.range(rowsNum)
      .select(Seq.tabulate(numCols)(i => lit("a" * stringLength).as(s"col$i")): _*)
      .write.parquet(path)
  }

  /**
   * Generates a large-row table of the given shape under a temporary path and
   * benchmarks the built-in UPPER expression against a registered UDF UPPER
   * over every column of that table.
   */
  private def runLargeRowBenchmark(rowsNum: Int, numCols: Int, cellSizeMb: Double): Unit = {
    withTempPath { path =>
      val benchmark = new Benchmark(
        s"#rows: $rowsNum, #cols: $numCols, cell: $cellSizeMb MB", rowsNum, output = output)
      writeLargeRow(path.getAbsolutePath, rowsNum, numCols, cellSizeMb)
      val df = spark.read.parquet(path.getAbsolutePath)
      df.createOrReplaceTempView("T")
      benchmark.addCase("built-in UPPER") { _ =>
        val sqlSelect = df.columns.map(c => s"UPPER($c) as $c").mkString(", ")
        spark.sql(s"SELECT $sqlSelect FROM T").noop()
      }
      benchmark.addCase("udf UPPER") { _ =>
        val sqlSelect = df.columns.map(c => s"udfUpper($c) as $c").mkString(", ")
        spark.sql(s"SELECT $sqlSelect FROM T").noop()
      }
      benchmark.run()
    }
  }

  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    runBenchmark("Large Row Benchmark") {
      val udfUpper = (s: String) => s.toUpperCase()
      spark.udf.register("udfUpper", udfUpper(_: String): String)

      // Benchmark shapes as (rows, cols, cellSizeMb). Typed tuples replace the
      // original Map[String, Any] + asInstanceOf pattern: the values are now
      // checked at compile time instead of cast (and possibly failing) at runtime.
      val benchmarks = Seq(
        (100, 10, 1.3), // OutOfMemory @ 100, 10, 1.4
        (1, 1, 300.0),  // OutOfMemory @ 1, 1, 400
        (1, 200, 1.0)   // OutOfMemory @ 1, 300, 1
      )

      benchmarks.foreach { case (rows, cols, cellSizeMb) =>
        runLargeRowBenchmark(rows, cols, cellSizeMb)
      }
    }
  }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]