This is an automated email from the ASF dual-hosted git repository.
LuciferYang pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.x by this push:
new 0a4db2064a53 [SPARK-57208][SQL] Simplify Ascii codegen by extracting a
static Java helper
0a4db2064a53 is described below
commit 0a4db2064a537fc3bd636196ac6d75936238759a
Author: YangJie <[email protected]>
AuthorDate: Wed Jun 3 17:04:29 2026 +0800
[SPARK-57208][SQL] Simplify Ascii codegen by extracting a static Java helper
### What changes were proposed in this pull request?
`Ascii` implemented the same first-character logic twice — once in
`nullSafeEval` and once inlined in `doGenCode` (a ~6-line `substring(0, 1)` /
`numChars() > 0 ? codePointAt(0) : 0` block). This moves that logic into a
single `ExpressionImplUtils.ascii(UTF8String): int` helper; `nullSafeEval`
delegates to it and `doGenCode` becomes a one-line `defineCodeGen` call, so
eval and codegen share one implementation.
This follows the other SPARK-56908 helpers in `ExpressionImplUtils` and
lands alongside Crc32 (#56222), regexp (#56223), Chr (#56224), Acosh (#56228),
and Asinh (#56229), which all append to the same file.
### Why are the changes needed?
It collapses the inlined block into a single `invokestatic` per `ascii`
call site. That means fewer constant-pool entries and a smaller generated
method, which helps with the JVM's 64KB-per-method bytecode limit and its
constant-pool size limit, Janino compile time, and JIT work. It also removes
the duplicated logic between eval and codegen. Part of SPARK-56908.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing `StringExpressionsSuite` `ascii` tests, plus a new assertion for a
supplementary-plane code point (`ascii('😀') = 128512`) that pins the
`codePointAt` (rather than `charAt`) behavior. `checkEvaluation` runs both the
interpreted and codegen paths.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Claude Opus 4.8)
Closes #56267 from LuciferYang/ascii-codegen-helper.
Authored-by: YangJie <[email protected]>
Signed-off-by: yangjie01 <[email protected]>
(cherry picked from commit 1fe8493f999e8fe33a7e89a12d077ace6de79cd7)
Signed-off-by: yangjie01 <[email protected]>
---
.../catalyst/expressions/ExpressionImplUtils.java | 15 +++++++++++++++
.../catalyst/expressions/stringExpressions.scala | 22 +++-------------------
.../expressions/StringExpressionsSuite.scala | 5 +++++
3 files changed, 23 insertions(+), 19 deletions(-)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index fa1741cb08f7..6db52b7af46f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -388,4 +388,19 @@ public class ExpressionImplUtils {
checksum.update(bytes, 0, bytes.length);
return checksum.getValue();
}
+
+ /**
+ * Returns the numeric value of the first character of the input string, or 0 if it is empty.
+ * Shared by the Ascii expression's eval and codegen paths so the generated Java is a single
+ * call rather than an inline substring/if-else block.
+ */
+ public static int ascii(UTF8String str) {
+ // only pick the first character to reduce the `toString` cost
+ UTF8String firstCharStr = str.substring(0, 1);
+ if (firstCharStr.numChars() > 0) {
+ return firstCharStr.toString().codePointAt(0);
+ } else {
+ return 0;
+ }
+ }
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 5c6e457421bf..bbc9341d8e6d 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2774,27 +2774,11 @@ case class Ascii(child: Expression)
override def inputTypes: Seq[AbstractDataType] =
Seq(StringTypeWithCollation(supportsTrimCollation = true))
- protected override def nullSafeEval(string: Any): Any = {
- // only pick the first character to reduce the `toString` cost
- val firstCharStr = string.asInstanceOf[UTF8String].substring(0, 1)
- if (firstCharStr.numChars > 0) {
- firstCharStr.toString.codePointAt(0)
- } else {
- 0
- }
- }
+ protected override def nullSafeEval(string: Any): Any =
+ ExpressionImplUtils.ascii(string.asInstanceOf[UTF8String])
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- nullSafeCodeGen(ctx, ev, (child) => {
- val firstCharStr = ctx.freshName("firstCharStr")
- s"""
- UTF8String $firstCharStr = $child.substring(0, 1);
- if ($firstCharStr.numChars() > 0) {
- ${ev.value} = $firstCharStr.toString().codePointAt(0);
- } else {
- ${ev.value} = 0;
- }
- """})
+ defineCodeGen(ctx, ev, c => s"${classOf[ExpressionImplUtils].getName}.ascii($c)")
}
override protected def withNewChildInternal(newChild: Expression): Ascii =
copy(child = newChild)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index bca4984cfac9..aac4fafb7802 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -378,6 +378,11 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val a = $"a".string.at(0)
checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef"))
checkEvaluation(Ascii(a), 97, create_row("abdef"))
+ // U+1F600 is a supplementary-plane code point; ascii must return the full code point
+ // (128512 via codePointAt), not the leading UTF-16 surrogate (55357 via charAt).
+ // scalastyle:off
+ checkEvaluation(Ascii(Literal("😀")), 128512, create_row("😀"))
+ // scalastyle:on
checkEvaluation(Ascii(a), 0, create_row(""))
checkEvaluation(Ascii(a), null, create_row(null))
checkEvaluation(Ascii(Literal.create(null, StringType)), null, create_row("abdef"))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]