This is an automated email from the ASF dual-hosted git repository.

LuciferYang pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.x by this push:
     new 0a4db2064a53 [SPARK-57208][SQL] Simplify Ascii codegen by extracting a static Java helper
0a4db2064a53 is described below

commit 0a4db2064a537fc3bd636196ac6d75936238759a
Author: YangJie <[email protected]>
AuthorDate: Wed Jun 3 17:04:29 2026 +0800

    [SPARK-57208][SQL] Simplify Ascii codegen by extracting a static Java helper
    
    ### What changes were proposed in this pull request?
    
    `Ascii` implemented the same first-character logic twice — once in `nullSafeEval` and once inlined in `doGenCode` (a ~6-line `substring(0, 1)` / `numChars() > 0 ? codePointAt(0) : 0` block). This moves that logic into a single `ExpressionImplUtils.ascii(UTF8String): int` helper; `nullSafeEval` delegates to it and `doGenCode` becomes a one-line `defineCodeGen` call, so eval and codegen share one implementation.
    
    This follows the other SPARK-56908 helpers in `ExpressionImplUtils` and lands alongside Crc32 (#56222), regexp (#56223), Chr (#56224), Acosh (#56228), and Asinh (#56229), which all append to the same file.
    
    ### Why are the changes needed?
    
    It collapses the inlined block into a single `invokestatic` per `ascii` call site — fewer constant-pool entries and a smaller generated method, which helps with the JVM 64KB method / constant-pool limits, Janino compile time, and JIT work — and removes the duplicated logic between eval and codegen. Part of SPARK-56908.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing `StringExpressionsSuite` `ascii` tests, plus a new assertion for a supplementary-plane code point (`ascii('😀') = 128512`) that pins the `codePointAt` (rather than `charAt`) behavior. `checkEvaluation` runs both the interpreted and codegen paths.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Claude Opus 4.8)
    
    Closes #56267 from LuciferYang/ascii-codegen-helper.
    
    Authored-by: YangJie <[email protected]>
    Signed-off-by: yangjie01 <[email protected]>
    (cherry picked from commit 1fe8493f999e8fe33a7e89a12d077ace6de79cd7)
    Signed-off-by: yangjie01 <[email protected]>
---
 .../catalyst/expressions/ExpressionImplUtils.java  | 15 +++++++++++++++
 .../catalyst/expressions/stringExpressions.scala   | 22 +++-------------------
 .../expressions/StringExpressionsSuite.scala       |  5 +++++
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index fa1741cb08f7..6db52b7af46f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -388,4 +388,19 @@ public class ExpressionImplUtils {
     checksum.update(bytes, 0, bytes.length);
     return checksum.getValue();
   }
+
+  /**
+   * Returns the numeric value of the first character of the input string, or 0 if it is empty.
+   * Shared by the Ascii expression's eval and codegen paths so the generated Java is a single
+   * call rather than an inline substring/if-else block.
+   */
+  public static int ascii(UTF8String str) {
+    // only pick the first character to reduce the `toString` cost
+    UTF8String firstCharStr = str.substring(0, 1);
+    if (firstCharStr.numChars() > 0) {
+      return firstCharStr.toString().codePointAt(0);
+    } else {
+      return 0;
+    }
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 5c6e457421bf..bbc9341d8e6d 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2774,27 +2774,11 @@ case class Ascii(child: Expression)
   override def inputTypes: Seq[AbstractDataType] =
     Seq(StringTypeWithCollation(supportsTrimCollation = true))
 
-  protected override def nullSafeEval(string: Any): Any = {
-    // only pick the first character to reduce the `toString` cost
-    val firstCharStr = string.asInstanceOf[UTF8String].substring(0, 1)
-    if (firstCharStr.numChars > 0) {
-      firstCharStr.toString.codePointAt(0)
-    } else {
-      0
-    }
-  }
+  protected override def nullSafeEval(string: Any): Any =
+    ExpressionImplUtils.ascii(string.asInstanceOf[UTF8String])
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    nullSafeCodeGen(ctx, ev, (child) => {
-      val firstCharStr = ctx.freshName("firstCharStr")
-      s"""
-        UTF8String $firstCharStr = $child.substring(0, 1);
-        if ($firstCharStr.numChars() > 0) {
-          ${ev.value} = $firstCharStr.toString().codePointAt(0);
-        } else {
-          ${ev.value} = 0;
-        }
-       """})
+    defineCodeGen(ctx, ev, c => s"${classOf[ExpressionImplUtils].getName}.ascii($c)")
   }
 
   override protected def withNewChildInternal(newChild: Expression): Ascii = copy(child = newChild)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index bca4984cfac9..aac4fafb7802 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -378,6 +378,11 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val a = $"a".string.at(0)
     checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef"))
     checkEvaluation(Ascii(a), 97, create_row("abdef"))
+    // U+1F600 is a supplementary-plane code point; ascii must return the full code point
+    // (128512 via codePointAt), not the leading UTF-16 surrogate (55357 via charAt).
+    // scalastyle:off
+    checkEvaluation(Ascii(Literal("😀")), 128512, create_row("😀"))
+    // scalastyle:on
     checkEvaluation(Ascii(a), 0, create_row(""))
     checkEvaluation(Ascii(a), null, create_row(null))
     checkEvaluation(Ascii(Literal.create(null, StringType)), null, create_row("abdef"))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to