This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new de00ac8a05ae [SPARK-47765][SQL] Add SET COLLATION to parser rules
de00ac8a05ae is described below
commit de00ac8a05aedb3a150c8c10f76d1fe5496b1df3
Author: Mihailo Milosevic <[email protected]>
AuthorDate: Fri Apr 12 22:25:06 2024 +0800
[SPARK-47765][SQL] Add SET COLLATION to parser rules
### What changes were proposed in this pull request?
Addition of a new statement SET COLLATION collationName.
### Why are the changes needed?
Requested by srielau in order to follow the same conventions used for other
session-level defaults (e.g. SET TIME ZONE).
### Does this PR introduce _any_ user-facing change?
Users can now use the SET COLLATION statement to change the session-level
default collation.
### How was this patch tested?
Test added to `CollationSuite`.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45946 from mihailom-db/SPARK-47765.
Authored-by: Mihailo Milosevic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/util/CollationFactory.java | 17 +++++++++++++++++
.../src/main/resources/error/error-classes.json | 5 +++++
.../apache/spark/internal/config/ConfigBuilder.scala | 4 ++--
...rror-conditions-invalid-conf-value-error-class.md | 4 ++++
docs/sql-ref-ansi-compliance.md | 1 +
.../apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 | 1 +
.../spark/sql/catalyst/parser/SqlBaseParser.g4 | 2 ++
.../org/apache/spark/sql/internal/SQLConf.scala | 8 +++++++-
.../resources/ansi-sql-2016-reserved-keywords.txt | 1 +
.../apache/spark/sql/execution/SparkSqlParser.scala | 12 ++++++++++++
.../sql-tests/results/ansi/keywords.sql.out | 2 ++
.../resources/sql-tests/results/keywords.sql.out | 1 +
.../org/apache/spark/sql/internal/SQLConfSuite.scala | 20 +++++++++++++++++++-
.../ThriftServerWithSparkContextSuite.scala | 2 +-
14 files changed, 75 insertions(+), 5 deletions(-)
diff --git
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index ff7bc450f851..9786c559da44 100644
---
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -202,6 +202,23 @@ public final class CollationFactory {
return new StringSearch(pattern, target, (RuleBasedCollator) collator);
}
+ /**
+ * Returns whether the given collationName is a valid collation name.
+ */
+ public static boolean isValidCollation(String collationName) {
+ return collationNameToIdMap.containsKey(collationName.toUpperCase());
+ }
+
+ /**
+ * Returns the closest valid collation name to the given collationName.
+ */
+ public static String getClosestCollation(String collationName) {
+ Collation suggestion = Collections.min(List.of(collationTable),
Comparator.comparingInt(
+ c -> UTF8String.fromString(c.collationName).levenshteinDistance(
+ UTF8String.fromString(collationName.toUpperCase()))));
+ return suggestion.collationName;
+ }
+
/**
* Returns a collation-unaware StringSearch object for the given pattern and
target strings.
* While this object does not respect collation, it can be used to find
occurrences of the pattern
diff --git a/common/utils/src/main/resources/error/error-classes.json
b/common/utils/src/main/resources/error/error-classes.json
index 7b13fa4278e4..2a00edb9a4df 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -1881,6 +1881,11 @@
"The value '<confValue>' in the config \"<confName>\" is invalid."
],
"subClass" : {
+ "DEFAULT_COLLATION" : {
+ "message" : [
+ "Cannot resolve the given default collation. Did you mean
'<proposal>'?"
+ ]
+ },
"TIME_ZONE" : {
"message" : [
"Cannot resolve the given timezone."
diff --git
a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala
b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala
index 303d856ca2c5..1f19e9444d38 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala
@@ -117,12 +117,12 @@ private[spark] class TypedConfigBuilder[T](
def checkValue(
validator: T => Boolean,
errorClass: String,
- parameters: Map[String, String]): TypedConfigBuilder[T] = {
+ parameters: T => Map[String, String]): TypedConfigBuilder[T] = {
transform { v =>
if (!validator(v)) {
throw new SparkIllegalArgumentException(
errorClass = "INVALID_CONF_VALUE." + errorClass,
- messageParameters = parameters ++ Map(
+ messageParameters = parameters(v) ++ Map(
"confValue" -> v.toString,
"confName" -> parent.key))
}
diff --git a/docs/sql-error-conditions-invalid-conf-value-error-class.md
b/docs/sql-error-conditions-invalid-conf-value-error-class.md
index ae0975e16116..ac430956340f 100644
--- a/docs/sql-error-conditions-invalid-conf-value-error-class.md
+++ b/docs/sql-error-conditions-invalid-conf-value-error-class.md
@@ -30,6 +30,10 @@ The value '`<confValue>`' in the config "`<confName>`" is
invalid.
This error class has the following derived error classes:
+## DEFAULT_COLLATION
+
+Cannot resolve the given default collation. Did you mean '`<proposal>`'?
+
## TIME_ZONE
Cannot resolve the given timezone.
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index 9b933ec1f65c..bf1819b9767b 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -439,6 +439,7 @@ Below is a list of all the keywords in Spark SQL.
|CLUSTERED|non-reserved|non-reserved|non-reserved|
|CODEGEN|non-reserved|non-reserved|non-reserved|
|COLLATE|reserved|non-reserved|reserved|
+|COLLATION|reserved|non-reserved|reserved|
|COLLECTION|non-reserved|non-reserved|non-reserved|
|COLUMN|reserved|non-reserved|reserved|
|COLUMNS|non-reserved|non-reserved|non-reserved|
diff --git
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
index f5565f0a63fb..e2b178d34b56 100644
---
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
+++
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
@@ -129,6 +129,7 @@ CLUSTER: 'CLUSTER';
CLUSTERED: 'CLUSTERED';
CODEGEN: 'CODEGEN';
COLLATE: 'COLLATE';
+COLLATION: 'COLLATION';
COLLECTION: 'COLLECTION';
COLUMN: 'COLUMN';
COLUMNS: 'COLUMNS';
diff --git
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 6e79d4af2f5e..3d008516589b 100644
---
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -210,6 +210,7 @@ statement
| (MSCK)? REPAIR TABLE identifierReference
(option=(ADD|DROP|SYNC) PARTITIONS)?
#repairTable
| op=(ADD | LIST) identifier .*?
#manageResource
+ | SET COLLATION collationName=identifier
#setCollation
| SET ROLE .*?
#failNativeCommand
| SET TIME ZONE interval
#setTimeZone
| SET TIME ZONE timezone
#setTimeZone
@@ -1662,6 +1663,7 @@ nonReserved
| CLUSTERED
| CODEGEN
| COLLATE
+ | COLLATION
| COLLECTION
| COLUMN
| COLUMNS
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 55d8b61f8b94..c8a5d997da7d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -772,6 +772,12 @@ object SQLConf {
" produced by a builtin function such as to_char or CAST")
.version("4.0.0")
.stringConf
+ .checkValue(CollationFactory.isValidCollation,
+ "DEFAULT_COLLATION",
+ name =>
+ Map(
+ "proposal" -> CollationFactory.getClosestCollation(name)
+ ))
.createWithDefault("UTF8_BINARY")
val FETCH_SHUFFLE_BLOCKS_IN_BATCH =
@@ -2804,7 +2810,7 @@ object SQLConf {
"short names are not recommended to use because they can be ambiguous.")
.version("2.2.0")
.stringConf
- .checkValue(isValidTimezone, errorClass = "TIME_ZONE", parameters =
Map.empty)
+ .checkValue(isValidTimezone, errorClass = "TIME_ZONE", parameters = tz =>
Map.empty)
.createWithDefaultFunction(() => TimeZone.getDefault.getID)
val WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD =
diff --git
a/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt
b/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt
index 47a3f02ac165..46da60b7897b 100644
--- a/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt
+++ b/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt
@@ -47,6 +47,7 @@ CLOB
CLOSE
COALESCE
COLLATE
+COLLATION
COLLECT
COLUMN
COMMIT
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index 28bcc33b1cdc..8192be269993 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -142,6 +142,18 @@ class SparkSqlAstBuilder extends AstBuilder {
ResetCommand(Some(ctx.configKey().getText))
}
+ /**
+ * Create a [[SetCommand]] logical plan to set [[SQLConf.DEFAULT_COLLATION]]
+ * Example SQL :
+ * {{{
+ * SET COLLATION UNICODE;
+ * }}}
+ */
+ override def visitSetCollation(ctx: SetCollationContext): LogicalPlan =
withOrigin(ctx) {
+ val key = SQLConf.DEFAULT_COLLATION.key
+ SetCommand(Some(key ->
Some(ctx.identifier.getText.toUpperCase(Locale.ROOT))))
+ }
+
/**
* Create a [[SetCommand]] logical plan to set
[[SQLConf.SESSION_LOCAL_TIMEZONE]]
* Example SQL :
diff --git
a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
index c0b3a1e8cc55..8b4acd12911b 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out
@@ -44,6 +44,7 @@ CLUSTER false
CLUSTERED false
CODEGEN false
COLLATE true
+COLLATION true
COLLECTION false
COLUMN true
COLUMNS false
@@ -356,6 +357,7 @@ CASE
CAST
CHECK
COLLATE
+COLLATION
COLUMN
CONSTRAINT
CREATE
diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
index 70df01e786ce..884f17c23eb0 100644
--- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out
@@ -44,6 +44,7 @@ CLUSTER false
CLUSTERED false
CODEGEN false
COLLATE false
+COLLATION false
COLLECTION false
COLUMN false
COLUMNS false
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index 03f6b9719b9c..18a06e83c076 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -17,7 +17,7 @@
package org.apache.spark.sql.internal
-import java.util.TimeZone
+import java.util.{Locale, TimeZone}
import org.apache.hadoop.fs.Path
import org.apache.logging.log4j.Level
@@ -505,6 +505,24 @@ class SQLConfSuite extends QueryTest with
SharedSparkSession {
|""".stripMargin)
}
+ test("SPARK-47765: set collation") {
+ Seq("UNICODE", "UNICODE_CI", "utf8_binary_lcase", "utf8_binary").foreach {
collation =>
+ sql(s"set collation $collation")
+ assert(spark.conf.get(SQLConf.DEFAULT_COLLATION) ===
collation.toUpperCase(Locale.ROOT))
+ }
+
+ checkError(
+ exception = intercept[SparkIllegalArgumentException] {
+ sql(s"SET COLLATION unicode_c").collect()
+ },
+ errorClass = "INVALID_CONF_VALUE.DEFAULT_COLLATION",
+ parameters = Map(
+ "confValue" -> "UNICODE_C",
+ "confName" -> "spark.sql.session.collation.default",
+ "proposal" -> "UNICODE_CI"
+ ))
+ }
+
test("SPARK-43028: config not found error") {
checkError(
exception =
intercept[SparkNoSuchElementException](spark.conf.get("some.conf")),
diff --git
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
index 26cf62d2323c..51123b17eeec 100644
---
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
+++
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala
@@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends
SharedThriftServer {
val sessionHandle = client.openSession(user, "")
val infoValue = client.getInfo(sessionHandle,
GetInfoType.CLI_ODBC_KEYWORDS)
// scalastyle:off line.size.limit
- assert(infoValue.getStringValue ==
"ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DA
[...]
+ assert(infoValue.getStringValue ==
"ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_US
[...]
// scalastyle:on line.size.limit
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]