This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 1893f85501b3 [SPARK-50875][SQL] Add RTRIM collations to TVF
1893f85501b3 is described below
commit 1893f85501b3b8753f366c34328339eeb2aee726
Author: Stevo Mitric <[email protected]>
AuthorDate: Tue Jan 21 12:04:39 2025 +0800
[SPARK-50875][SQL] Add RTRIM collations to TVF
### What changes were proposed in this pull request?
Added all RTRIM collations to the TVF output when using the `Collations` generator.
### Why are the changes needed?
Since RTRIM collations are enabled by default, they need to be included in the list of collations returned by the TVF.
### Does this PR introduce _any_ user-facing change?
Yes, the output of the `collations` function will now include the `RTRIM` collations.
### How was this patch tested?
Modified TVF collation tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #49554 from stevomitric/stevomitric/add-rtrim-to-list.
Authored-by: Stevo Mitric <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit 5534b91dee6ba54ffcd53b5ff324c83f0f9db7e5)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/util/CollationFactory.java | 42 +++++++++++----
python/pyspark/sql/tvf.py | 8 +--
.../org/apache/spark/sql/CollationSuite.scala | 61 ++++++++++++++++------
3 files changed, 83 insertions(+), 28 deletions(-)
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 9aad7d390977..4bcd75a73105 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -440,6 +440,15 @@ public final class CollationFactory {
}
}
+ protected String getPadding() {
+ if (spaceTrimming == SpaceTrimming.RTRIM) {
+ return PAD_ATTRIBUTE_RTRIM;
+ } else {
+ assert spaceTrimming == SpaceTrimming.NONE;
+ return PAD_ATTRIBUTE_EMPTY;
+ }
+ }
+
protected abstract Collation buildCollation();
protected abstract CollationMeta buildCollationMeta();
@@ -460,6 +469,8 @@ public final class CollationFactory {
}
return collationSpecUTF8;
}
+
+ protected SpaceTrimming spaceTrimming;
}
private static class CollationSpecUTF8 extends CollationSpec {
@@ -492,7 +503,6 @@ public final class CollationFactory {
new CollationSpecUTF8(CaseSensitivity.LCASE,
SpaceTrimming.NONE).buildCollation();
private final CaseSensitivity caseSensitivity;
- private final SpaceTrimming spaceTrimming;
private final int collationId;
private CollationSpecUTF8(
@@ -638,7 +648,7 @@ public final class CollationFactory {
/* language = */ null,
/* country = */ null,
/* icuVersion = */ null,
- COLLATION_PAD_ATTRIBUTE,
+ getPadding(),
/* accentSensitivity = */ true,
/* caseSensitivity = */ true,
spaceTrimming.toString());
@@ -650,7 +660,7 @@ public final class CollationFactory {
/* language = */ null,
/* country = */ null,
/* icuVersion = */ null,
- COLLATION_PAD_ATTRIBUTE,
+ getPadding(),
/* accentSensitivity = */ true,
/* caseSensitivity = */ false,
spaceTrimming.toString());
@@ -689,7 +699,18 @@ public final class CollationFactory {
CollationNames.UTF8_LCASE,
CollationSpecICU.ICU_VERSION
);
- return Arrays.asList(UTF8_BINARY_COLLATION_IDENT,
UTF8_LCASE_COLLATION_IDENT);
+ CollationIdentifier UTF8_BINARY_RTRIM_COLLATION_IDENT = new
CollationIdentifier(
+ PROVIDER_SPARK,
+ CollationNames.UTF8_BINARY + "_RTRIM",
+ CollationSpecICU.ICU_VERSION
+ );
+ CollationIdentifier UTF8_LCASE_RTRIM_COLLATION_IDENT = new
CollationIdentifier(
+ PROVIDER_SPARK,
+ CollationNames.UTF8_LCASE + "_RTRIM",
+ CollationSpecICU.ICU_VERSION
+ );
+ return Arrays.asList(UTF8_BINARY_COLLATION_IDENT,
UTF8_LCASE_COLLATION_IDENT,
+ UTF8_BINARY_RTRIM_COLLATION_IDENT, UTF8_LCASE_RTRIM_COLLATION_IDENT);
}
static CollationMeta loadCollationMeta(CollationIdentifier
collationIdentifier) {
@@ -831,7 +852,6 @@ public final class CollationFactory {
private final CaseSensitivity caseSensitivity;
private final AccentSensitivity accentSensitivity;
- private final SpaceTrimming spaceTrimming;
private final String locale;
private final int collationId;
@@ -1032,7 +1052,7 @@ public final class CollationFactory {
language.isEmpty() ? null : language,
country.isEmpty() ? null : country,
VersionInfo.ICU_VERSION.toString(),
- COLLATION_PAD_ATTRIBUTE,
+ getPadding(),
accentSensitivity == AccentSensitivity.AS,
caseSensitivity == CaseSensitivity.CS,
spaceTrimming.toString());
@@ -1068,10 +1088,13 @@ public final class CollationFactory {
private static List<String> allCollationNames() {
List<String> collationNames = new ArrayList<>();
List<String> caseAccentSpecifiers = Arrays.asList("", "_AI", "_CI",
"_CI_AI");
+ List<String> trimmingSpecifiers = Arrays.asList("", "_RTRIM");
for (String locale : ICULocaleToId.keySet()) {
for (String caseAccent : caseAccentSpecifiers) {
- String collationName = locale + caseAccent;
- collationNames.add(collationName);
+ for (String trimming : trimmingSpecifiers) {
+ String collationName = locale + caseAccent + trimming;
+ collationNames.add(collationName);
+ }
}
}
return collationNames.stream().sorted().toList();
@@ -1161,7 +1184,8 @@ public final class CollationFactory {
public static final String PROVIDER_ICU = "icu";
public static final String PROVIDER_NULL = "null";
public static final List<String> SUPPORTED_PROVIDERS =
List.of(PROVIDER_SPARK, PROVIDER_ICU);
- public static final String COLLATION_PAD_ATTRIBUTE = "NO_PAD";
+ public static final String PAD_ATTRIBUTE_EMPTY = "NO_PAD";
+ public static final String PAD_ATTRIBUTE_RTRIM = "RTRIM";
public static final int UTF8_BINARY_COLLATION_ID =
Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION_ID;
diff --git a/python/pyspark/sql/tvf.py b/python/pyspark/sql/tvf.py
index 1d0febf9ba3a..b34877b03311 100644
--- a/python/pyspark/sql/tvf.py
+++ b/python/pyspark/sql/tvf.py
@@ -530,11 +530,11 @@ class TableValuedFunction:
Examples
--------
>>> spark.tvf.collations().show()
- +-------+-------+-------------+...
- |CATALOG| SCHEMA| NAME|...
- +-------+-------+-------------+...
+ +-------+-------+-------------------+...
+ |CATALOG| SCHEMA| NAME|...
+ +-------+-------+-------------------+...
...
- +-------+-------+-------------+...
+ +-------+-------+-------------------+...
"""
return self._fn("collations")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
index 89c205d4f13f..a075198fb149 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
@@ -1974,48 +1974,66 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
checkAnswer(df,
Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null),
+ Row("SYSTEM", "BUILTIN", "UTF8_BINARY_RTRIM", null, null,
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "RTRIM", null),
Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
+ Row("SYSTEM", "BUILTIN", "UTF8_LCASE_RTRIM", null, null,
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "RTRIM", null),
Row("SYSTEM", "BUILTIN", "UNICODE", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "UNICODE_AI", null, null,
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "UNICODE_AI_RTRIM", null, null,
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion),
Row("SYSTEM", "BUILTIN", "UNICODE_CI", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", null, null,
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
- Row("SYSTEM", "BUILTIN", "af", "Afrikaans", null,
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
- Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", null,
- "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
- Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", null,
- "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
- Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", null,
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
+ Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI_RTRIM", null, null,
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion)))
checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE
'%UTF8_BINARY%'"),
- Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null))
+ Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null),
+ Row("SYSTEM", "BUILTIN", "UTF8_BINARY_RTRIM", null, null,
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "RTRIM", null)))
checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE
'%zh_Hant_HKG%'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hant_HKG", "Chinese", "Hong Kong SAR
China",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI", "Chinese", "Hong Kong SAR
China",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI_RTRIM", "Chinese", "Hong Kong
SAR China",
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI", "Chinese", "Hong Kong SAR
China",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong
SAR China",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI_RTRIM", "Chinese", "Hong
Kong SAR China",
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_RTRIM", "Chinese", "Hong Kong
SAR China",
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_RTRIM", "Chinese", "Hong Kong
SAR China",
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion)))
checkAnswer(sql("SELECT * FROM collations() WHERE COUNTRY = 'Singapore'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hans_SGP", "Chinese", "Singapore",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_AI", "Chinese", "Singapore",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_AI_RTRIM", "Chinese",
"Singapore",
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI", "Chinese", "Singapore",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_AI", "Chinese", "Singapore",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_AI_RTRIM", "Chinese",
"Singapore",
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_RTRIM", "Chinese",
"Singapore",
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion),
+ Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_RTRIM", "Chinese", "Singapore",
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion)))
checkAnswer(sql("SELECT * FROM collations() WHERE LANGUAGE = 'English' " +
"and COUNTRY = 'United States'"),
@@ -2023,20 +2041,33 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "en_USA_AI", "English", "United States",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "en_USA_AI_RTRIM", "English", "United States",
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion),
Row("SYSTEM", "BUILTIN", "en_USA_CI", "English", "United States",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "en_USA_CI_AI", "English", "United States",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
+ Row("SYSTEM", "BUILTIN", "en_USA_CI_AI_RTRIM", "English", "United
States",
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion),
+ Row("SYSTEM", "BUILTIN", "en_USA_CI_RTRIM", "English", "United States",
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "RTRIM", icvVersion),
+ Row("SYSTEM", "BUILTIN", "en_USA_RTRIM", "English", "United States",
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "RTRIM", icvVersion)))
checkAnswer(sql("SELECT NAME, LANGUAGE, ACCENT_SENSITIVITY,
CASE_SENSITIVITY " +
"FROM collations() WHERE COUNTRY = 'United States'"),
Seq(Row("en_USA", "English", "ACCENT_SENSITIVE", "CASE_SENSITIVE"),
Row("en_USA_AI", "English", "ACCENT_INSENSITIVE", "CASE_SENSITIVE"),
+ Row("en_USA_AI_RTRIM", "English", "ACCENT_INSENSITIVE",
"CASE_SENSITIVE"),
Row("en_USA_CI", "English", "ACCENT_SENSITIVE", "CASE_INSENSITIVE"),
- Row("en_USA_CI_AI", "English", "ACCENT_INSENSITIVE",
"CASE_INSENSITIVE")))
+ Row("en_USA_CI_AI", "English", "ACCENT_INSENSITIVE",
"CASE_INSENSITIVE"),
+ Row("en_USA_CI_AI_RTRIM", "English", "ACCENT_INSENSITIVE",
"CASE_INSENSITIVE"),
+ Row("en_USA_CI_RTRIM", "English", "ACCENT_SENSITIVE",
"CASE_INSENSITIVE"),
+ Row("en_USA_RTRIM", "English", "ACCENT_SENSITIVE", "CASE_SENSITIVE")))
checkAnswer(sql("SELECT NAME FROM collations() WHERE ICU_VERSION is null"),
- Seq(Row("UTF8_BINARY"), Row("UTF8_LCASE")))
+ Seq(Row("UTF8_BINARY"), Row("UTF8_LCASE"), Row("UTF8_BINARY_RTRIM"),
+ Row("UTF8_LCASE_RTRIM")))
}
test("fully qualified name") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]