This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new a8b694f4d3dc [SPARK-50767][SQL] Remove codegen of `from_json`
a8b694f4d3dc is described below
commit a8b694f4d3dcac684290282c40f52f947b50942b
Author: Wenchen Fan <[email protected]>
AuthorDate: Tue Feb 18 21:55:23 2025 +0100
[SPARK-50767][SQL] Remove codegen of `from_json`
### What changes were proposed in this pull request?
This reopens https://github.com/apache/spark/pull/49411, which removes the
codegen implementation of `from_json`, to fix the performance regression in 4.0.
### Why are the changes needed?
Supporting common subexpression elimination (CSE) for Filter in whole-stage
codegen is non-trivial. Rather than rush it, we should revert the codegen
support in 4.0 so that we have more time to get it right in 4.1.
Note: 4.0 also adds codegen support for a few other expressions, but
`from_json` is special as it's quite expensive and the performance regression
is very significant with it.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
N/A
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49992 from cloud-fan/json.
Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../sql/catalyst/expressions/jsonExpressions.scala | 22 +++-------------------
.../SubExprEliminationBenchmark-jdk21-results.txt | 20 ++++++++++----------
.../SubExprEliminationBenchmark-results.txt | 20 ++++++++++----------
3 files changed, 23 insertions(+), 39 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 84b8374599d3..195d481a31ee 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext,
CodeGenerator, ExprCode}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext,
CodeGenerator, CodegenFallback, ExprCode}
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
import org.apache.spark.sql.catalyst.expressions.json.{GetJsonObjectEvaluator,
JsonExpressionUtils, JsonToStructsEvaluator, JsonTupleEvaluator,
SchemaOfJsonEvaluator, StructsToJsonEvaluator}
import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke}
@@ -261,6 +261,7 @@ case class JsonToStructs(
variantAllowDuplicateKeys: Boolean =
SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS))
extends UnaryExpression
with TimeZoneAwareExpression
+ with CodegenFallback
with ExpectsInputTypes
with QueryErrorsBase {
@@ -308,7 +309,7 @@ case class JsonToStructs(
copy(timeZoneId = Option(timeZoneId))
@transient
- private val nameOfCorruptRecord =
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
+ private lazy val nameOfCorruptRecord =
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
@transient
private lazy val evaluator = new JsonToStructsEvaluator(
@@ -316,23 +317,6 @@ case class JsonToStructs(
override def nullSafeEval(json: Any): Any =
evaluator.evaluate(json.asInstanceOf[UTF8String])
- override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- val refEvaluator = ctx.addReferenceObj("evaluator", evaluator)
- val eval = child.genCode(ctx)
- val resultType = CodeGenerator.boxedType(dataType)
- val resultTerm = ctx.freshName("result")
- ev.copy(code =
- code"""
- |${eval.code}
- |$resultType $resultTerm = ($resultType)
$refEvaluator.evaluate(${eval.value});
- |boolean ${ev.isNull} = $resultTerm == null;
- |${CodeGenerator.javaType(dataType)} ${ev.value} =
${CodeGenerator.defaultValue(dataType)};
- |if (!${ev.isNull}) {
- | ${ev.value} = $resultTerm;
- |}
- |""".stripMargin)
- }
-
override def inputTypes: Seq[AbstractDataType] =
StringTypeWithCollation(supportsTrimCollation = true) :: Nil
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
index be430feb9780..8cf0ba7224ba 100644
--- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
@@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination
================================================================================================
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.6+7-LTS on Linux 6.8.0-1021-azure
AMD EPYC 7763 64-Core Processor
from_json as subExpr in Project: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true 6313 6431
120 0.0 63134831.3 1.0X
-subExprElimination false, codegen: false 6093 6348
288 0.0 60930747.6 1.0X
-subExprElimination true, codegen: true 1387 1425
33 0.0 13872525.5 4.6X
-subExprElimination true, codegen: false 1218 1332
99 0.0 12182992.7 5.2X
+subExprElimination false, codegen: true 6700 7047
301 0.0 67001649.1 1.0X
+subExprElimination false, codegen: false 6719 6837
118 0.0 67191470.6 1.0X
+subExprElimination true, codegen: true 1350 1489
122 0.0 13503842.8 5.0X
+subExprElimination true, codegen: false 1366 1444
96 0.0 13658823.9 4.9X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.6+7-LTS on Linux 6.8.0-1021-azure
AMD EPYC 7763 64-Core Processor
from_json as subExpr in Filter: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true 6610 6705
85 0.0 66104698.4 1.0X
-subExprElimination false, codegen: false 6647 6730
76 0.0 66469463.5 1.0X
-subExprElimination true, codegen: true 2077 2126
43 0.0 20769220.1 3.2X
-subExprElimination true, codegen: false 1949 2000
64 0.0 19489004.0 3.4X
+subExprElimination false, codegen: true 7250 7520
384 0.0 72501549.6 1.0X
+subExprElimination false, codegen: false 7255 7366
114 0.0 72554716.3 1.0X
+subExprElimination true, codegen: true 1934 2024
79 0.0 19344228.2 3.7X
+subExprElimination true, codegen: false 1981 2015
29 0.0 19814306.4 3.7X
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
index 12d602bec17c..b689fef720f7 100644
--- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
+++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
@@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination
================================================================================================
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.14+7-LTS on Linux 6.8.0-1021-azure
AMD EPYC 7763 64-Core Processor
from_json as subExpr in Project: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true 6438 6551
98 0.0 64378783.5 1.0X
-subExprElimination false, codegen: false 6216 6320
175 0.0 62161826.1 1.0X
-subExprElimination true, codegen: true 1480 1518
39 0.0 14799890.8 4.3X
-subExprElimination true, codegen: false 1321 1429
94 0.0 13212919.6 4.9X
+subExprElimination false, codegen: true 6389 6498
173 0.0 63887225.6 1.0X
+subExprElimination false, codegen: false 6235 6292
81 0.0 62351284.2 1.0X
+subExprElimination true, codegen: true 1328 1368
47 0.0 13284825.9 4.8X
+subExprElimination true, codegen: false 1323 1368
73 0.0 13227629.0 4.8X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.14+7-LTS on Linux 6.8.0-1021-azure
AMD EPYC 7763 64-Core Processor
from_json as subExpr in Filter: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true 7107 7310
207 0.0 71066752.8 1.0X
-subExprElimination false, codegen: false 6738 6781
41 0.0 67375897.0 1.1X
-subExprElimination true, codegen: true 2052 2110
51 0.0 20519152.3 3.5X
-subExprElimination true, codegen: false 2053 2079
33 0.0 20526629.8 3.5X
+subExprElimination false, codegen: true 7081 7177
86 0.0 70813603.9 1.0X
+subExprElimination false, codegen: false 6586 6720
139 0.0 65859888.8 1.1X
+subExprElimination true, codegen: true 1729 1827
117 0.0 17291697.7 4.1X
+subExprElimination true, codegen: false 1726 1789
57 0.0 17255779.7 4.1X
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]