This is an automated email from the ASF dual-hosted git repository.
gengliangwang pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.x by this push:
new 85639de1a3e7 [SPARK-57173][SQL] Simplify regexp pattern-compile
codegen by extracting a static Java helper
85639de1a3e7 is described below
commit 85639de1a3e7a2af2f9f13082d10452b8c7c57e9
Author: Gengliang Wang <[email protected]>
AuthorDate: Tue Jun 2 20:52:07 2026 -0700
[SPARK-57173][SQL] Simplify regexp pattern-compile codegen by extracting a
static Java helper
### What changes were proposed in this pull request?
Add `ExpressionImplUtils.compileRegexPattern(String regex, int flags,
String funcName)`, which wraps `Pattern.compile` and maps a
`PatternSyntaxException` to the user-facing INVALID_PARAMETER_VALUE.PATTERN
error. Route both the shared codegen (`RegExpUtils.initLastMatcherCode`, used
by the whole regexp expression family -- `RLike`, `RegExpReplace`,
`RegExpExtract`, `RegExpExtractAll`, `RegExpInStr`, etc.) and the eval helper
(`RegExpUtils.getPatternAndLastRegex`) through it.
`initLastMatcherCode` previously emitted a 5-line inline `try {
Pattern.compile(...) } catch (PatternSyntaxException)` block; it now emits a
single helper call. The per-stage mutable-state caching (`lastRegex` /
`pattern`) is preserved in the generated code.
### Why are the changes needed?
Part of SPARK-56908 (umbrella). This block is emitted by every regexp
expression in every stage that uses one; collapsing it to a single call shrinks
the generated Java across the whole family, helping with the JVM 64KB method /
constant-pool limits, Janino compile time, and JIT work.
### Does this PR introduce _any_ user-facing change?
No. The compiled behavior is identical; only the emitted Java source text
changes.
### How was this patch tested?
```
build/sbt "catalyst/testOnly *RegexpExpressionsSuite"
```
21/21 pass (exercised both with and without whole-stage codegen).
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Opus 4.8)
Closes #56223 from gengliangwang/spark-regexp-compile-codegen.
Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
(cherry picked from commit dffbe1adb2aecfee868fefde1b3f4c673ecf5a32)
Signed-off-by: Gengliang Wang <[email protected]>
---
.../catalyst/expressions/ExpressionImplUtils.java | 17 +++++++++++++++++
.../sql/catalyst/expressions/regexpExpressions.scala | 20 +++++++-------------
2 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index 7bad7c430b86..1053650a3709 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -25,6 +25,8 @@ import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
import java.util.zip.CRC32;
import javax.crypto.Cipher;
import javax.crypto.spec.GCMParameterSpec;
@@ -344,6 +346,21 @@ public class ExpressionImplUtils {
return UTF8String.fromString(qtChar + sp + qtChar);
}
+ /**
+ * Compiles {@code regex} with the given {@code flags} for the regexp expression
+ * family, translating a {@link PatternSyntaxException} into the user-facing
+ * INVALID_PARAMETER_VALUE.PATTERN error. Shared by the regexp eval and codegen
+ * paths so the generated Java is a single call instead of an inline try/catch
+ * around {@code Pattern.compile}.
+ */
+ public static Pattern compileRegexPattern(String regex, int flags, String funcName) {
+ try {
+ return Pattern.compile(regex, flags);
+ } catch (PatternSyntaxException e) {
+ throw QueryExecutionErrors.invalidPatternError(funcName, e.getPattern(), e);
+ }
+ }
+
/**
* Computes the CRC32 checksum of {@code bytes} for the {@code crc32} expression.
* Shared by the eval and codegen paths so the per-stage generated Java is a
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 5ad360a54e8d..c2c01d2c7815 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -1253,17 +1253,15 @@ object RegExpUtils {
val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
val termPattern = ctx.addMutableState(classNamePattern, "pattern")
val collationRegexFlags = CollationSupport.collationAwareRegexFlags(collationId)
+ val utils = classOf[ExpressionImplUtils].getName
s"""
|if (!$regexp.equals($termLastRegex)) {
| // regex value changed
- | try {
- | UTF8String r = $regexp.clone();
- | $termPattern = $classNamePattern.compile(r.toString(), $collationRegexFlags);
- | $termLastRegex = r;
- | } catch (java.util.regex.PatternSyntaxException e) {
- | throw QueryExecutionErrors.invalidPatternError("$prettyName", e.getPattern(), e);
- | }
+ | UTF8String r = $regexp.clone();
+ | $termPattern =
+ | $utils.compileRegexPattern(r.toString(), $collationRegexFlags, "$prettyName");
+ | $termLastRegex = r;
|}
|java.util.regex.Matcher $matcher = $termPattern.matcher($subject.toString());
|""".stripMargin
@@ -1272,12 +1270,8 @@ object RegExpUtils {
def getPatternAndLastRegex(p: Any, prettyName: String, collationId: Int): (Pattern, UTF8String) =
{
val r = p.asInstanceOf[UTF8String].clone()
- val pattern = try {
- Pattern.compile(r.toString, CollationSupport.collationAwareRegexFlags(collationId))
- } catch {
- case e: PatternSyntaxException =>
- throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e)
- }
+ val pattern = ExpressionImplUtils.compileRegexPattern(
+ r.toString, CollationSupport.collationAwareRegexFlags(collationId), prettyName)
(pattern, r)
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]