(spark) branch master updated: [SPARK-50767][SQL] Remove codegen of `from_json`

maxgekk Tue, 18 Feb 2025 12:56:07 -0800

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new a8b694f4d3dc [SPARK-50767][SQL] Remove codegen of `from_json`
a8b694f4d3dc is described below

commit a8b694f4d3dcac684290282c40f52f947b50942b
Author: Wenchen Fan <[email protected]>
AuthorDate: Tue Feb 18 21:55:23 2025 +0100

    [SPARK-50767][SQL] Remove codegen of `from_json`
    
    ### What changes were proposed in this pull request?
    
    This reopens https://github.com/apache/spark/pull/49411 to fix the
    performance regression in 4.0.
    
    ### Why are the changes needed?
    
    It's non-trivial to support common subexpression elimination (CSE) for
    Filter in whole-stage codegen. Rather than rush a fix, we should revert the
    codegen support in 4.0 so that we have more time to get it right in 4.1.
    
    Note: 4.0 also adds codegen support for a few other expressions, but
    `from_json` is a special case: it is quite expensive, so the performance
    regression is especially significant for it.
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    N/A
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #49992 from cloud-fan/json.
    
    Authored-by: Wenchen Fan <[email protected]>
    Signed-off-by: Max Gekk <[email protected]>
---
 .../sql/catalyst/expressions/jsonExpressions.scala | 22 +++-------------------
 .../SubExprEliminationBenchmark-jdk21-results.txt  | 20 ++++++++++----------
 .../SubExprEliminationBenchmark-results.txt        | 20 ++++++++++----------
 3 files changed, 23 insertions(+), 39 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 84b8374599d3..195d481a31ee 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
CodeGenerator, ExprCode}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
CodeGenerator, CodegenFallback, ExprCode}
 import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
 import org.apache.spark.sql.catalyst.expressions.json.{GetJsonObjectEvaluator, 
JsonExpressionUtils, JsonToStructsEvaluator, JsonTupleEvaluator, 
SchemaOfJsonEvaluator, StructsToJsonEvaluator}
 import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke}
@@ -261,6 +261,7 @@ case class JsonToStructs(
     variantAllowDuplicateKeys: Boolean = 
SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS))
   extends UnaryExpression
   with TimeZoneAwareExpression
+  with CodegenFallback
   with ExpectsInputTypes
   with QueryErrorsBase {
 
@@ -308,7 +309,7 @@ case class JsonToStructs(
     copy(timeZoneId = Option(timeZoneId))
 
   @transient
-  private val nameOfCorruptRecord = 
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
+  private lazy val nameOfCorruptRecord = 
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
 
   @transient
   private lazy val evaluator = new JsonToStructsEvaluator(
@@ -316,23 +317,6 @@ case class JsonToStructs(
 
   override def nullSafeEval(json: Any): Any = 
evaluator.evaluate(json.asInstanceOf[UTF8String])
 
-  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val refEvaluator = ctx.addReferenceObj("evaluator", evaluator)
-    val eval = child.genCode(ctx)
-    val resultType = CodeGenerator.boxedType(dataType)
-    val resultTerm = ctx.freshName("result")
-    ev.copy(code =
-      code"""
-         |${eval.code}
-         |$resultType $resultTerm = ($resultType) 
$refEvaluator.evaluate(${eval.value});
-         |boolean ${ev.isNull} = $resultTerm == null;
-         |${CodeGenerator.javaType(dataType)} ${ev.value} = 
${CodeGenerator.defaultValue(dataType)};
-         |if (!${ev.isNull}) {
-         |  ${ev.value} = $resultTerm;
-         |}
-         |""".stripMargin)
-  }
-
   override def inputTypes: Seq[AbstractDataType] =
     StringTypeWithCollation(supportsTrimCollation = true) :: Nil
 
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
index be430feb9780..8cf0ba7224ba 100644
--- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt
@@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination
 
================================================================================================
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.6+7-LTS on Linux 6.8.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 from_json as subExpr in Project:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true            6313           6431         
120          0.0    63134831.3       1.0X
-subExprElimination false, codegen: false           6093           6348         
288          0.0    60930747.6       1.0X
-subExprElimination true, codegen: true             1387           1425         
 33          0.0    13872525.5       4.6X
-subExprElimination true, codegen: false            1218           1332         
 99          0.0    12182992.7       5.2X
+subExprElimination false, codegen: true            6700           7047         
301          0.0    67001649.1       1.0X
+subExprElimination false, codegen: false           6719           6837         
118          0.0    67191470.6       1.0X
+subExprElimination true, codegen: true             1350           1489         
122          0.0    13503842.8       5.0X
+subExprElimination true, codegen: false            1366           1444         
 96          0.0    13658823.9       4.9X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.6+7-LTS on Linux 6.8.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 from_json as subExpr in Filter:           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true            6610           6705         
 85          0.0    66104698.4       1.0X
-subExprElimination false, codegen: false           6647           6730         
 76          0.0    66469463.5       1.0X
-subExprElimination true, codegen: true             2077           2126         
 43          0.0    20769220.1       3.2X
-subExprElimination true, codegen: false            1949           2000         
 64          0.0    19489004.0       3.4X
+subExprElimination false, codegen: true            7250           7520         
384          0.0    72501549.6       1.0X
+subExprElimination false, codegen: false           7255           7366         
114          0.0    72554716.3       1.0X
+subExprElimination true, codegen: true             1934           2024         
 79          0.0    19344228.2       3.7X
+subExprElimination true, codegen: false            1981           2015         
 29          0.0    19814306.4       3.7X
 
 
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt 
b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
index 12d602bec17c..b689fef720f7 100644
--- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
+++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
@@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination
 
================================================================================================
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.14+7-LTS on Linux 6.8.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 from_json as subExpr in Project:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true            6438           6551         
 98          0.0    64378783.5       1.0X
-subExprElimination false, codegen: false           6216           6320         
175          0.0    62161826.1       1.0X
-subExprElimination true, codegen: true             1480           1518         
 39          0.0    14799890.8       4.3X
-subExprElimination true, codegen: false            1321           1429         
 94          0.0    13212919.6       4.9X
+subExprElimination false, codegen: true            6389           6498         
173          0.0    63887225.6       1.0X
+subExprElimination false, codegen: false           6235           6292         
 81          0.0    62351284.2       1.0X
+subExprElimination true, codegen: true             1328           1368         
 47          0.0    13284825.9       4.8X
+subExprElimination true, codegen: false            1323           1368         
 73          0.0    13227629.0       4.8X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.14+7-LTS on Linux 6.8.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 from_json as subExpr in Filter:           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-subExprElimination false, codegen: true            7107           7310         
207          0.0    71066752.8       1.0X
-subExprElimination false, codegen: false           6738           6781         
 41          0.0    67375897.0       1.1X
-subExprElimination true, codegen: true             2052           2110         
 51          0.0    20519152.3       3.5X
-subExprElimination true, codegen: false            2053           2079         
 33          0.0    20526629.8       3.5X
+subExprElimination false, codegen: true            7081           7177         
 86          0.0    70813603.9       1.0X
+subExprElimination false, codegen: false           6586           6720         
139          0.0    65859888.8       1.1X
+subExprElimination true, codegen: true             1729           1827         
117          0.0    17291697.7       4.1X
+subExprElimination true, codegen: false            1726           1789         
 57          0.0    17255779.7       4.1X
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-50767][SQL] Remove codegen of `from_json`

Reply via email to