This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new d5da49d56d7d Revert [SPARK-50230][SQL] Added logic to support reading 
unknown collation name as utf8_binary
d5da49d56d7d is described below

commit d5da49d56d7dec5f8a96c5252384d865f7efd4d9
Author: Vladan Vasić <[email protected]>
AuthorDate: Fri Nov 22 12:22:33 2024 +0100

    Revert [SPARK-50230][SQL] Added logic to support reading unknown collation 
name as utf8_binary
    
    ### What changes were proposed in this pull request?
    
    I propose reverting the changes for the new `SQLConf` entry which enables Spark to 
read an invalid collation name as `UTF8_BINARY`.
    
    ### Why are the changes needed?
    
    Since the original changes may cause unwanted data corruption when a user 
writes to a table that has an unknown collation and modifies its properties, the 
original PR must be reverted.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Not applicable.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #48876 from 
vladanvasi-db/vladanvasi-db/allow-reading-unknown-collations-as-utf8-binary-revert.
    
    Authored-by: Vladan Vasić <[email protected]>
    Signed-off-by: Max Gekk <[email protected]>
---
 .../org/apache/spark/sql/internal/SqlApiConf.scala |   3 -
 .../spark/sql/internal/SqlApiConfHelper.scala      |   2 -
 .../org/apache/spark/sql/types/DataType.scala      |  26 +---
 .../org/apache/spark/sql/internal/SQLConf.scala    |  11 --
 .../org/apache/spark/sql/types/DataTypeSuite.scala | 162 +--------------------
 5 files changed, 5 insertions(+), 199 deletions(-)

diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala
index 773494f41865..d5668cc72175 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala
@@ -47,7 +47,6 @@ private[sql] trait SqlApiConf {
   def stackTracesInDataFrameContext: Int
   def dataFrameQueryContextEnabled: Boolean
   def legacyAllowUntypedScalaUDFs: Boolean
-  def allowReadingUnknownCollations: Boolean
 }
 
 private[sql] object SqlApiConf {
@@ -60,7 +59,6 @@ private[sql] object SqlApiConf {
     SqlApiConfHelper.LOCAL_RELATION_CACHE_THRESHOLD_KEY
   }
   val DEFAULT_COLLATION: String = SqlApiConfHelper.DEFAULT_COLLATION
-  val ALLOW_READING_UNKNOWN_COLLATIONS: String = 
SqlApiConfHelper.ALLOW_READING_UNKNOWN_COLLATIONS
 
   def get: SqlApiConf = SqlApiConfHelper.getConfGetter.get()()
 
@@ -89,5 +87,4 @@ private[sql] object DefaultSqlApiConf extends SqlApiConf {
   override def stackTracesInDataFrameContext: Int = 1
   override def dataFrameQueryContextEnabled: Boolean = true
   override def legacyAllowUntypedScalaUDFs: Boolean = false
-  override def allowReadingUnknownCollations: Boolean = false
 }
diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala
index c8d6f395d450..13ef13e5894e 100644
--- 
a/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala
+++ 
b/sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala
@@ -33,8 +33,6 @@ private[sql] object SqlApiConfHelper {
   val SESSION_LOCAL_TIMEZONE_KEY: String = "spark.sql.session.timeZone"
   val LOCAL_RELATION_CACHE_THRESHOLD_KEY: String = 
"spark.sql.session.localRelationCacheThreshold"
   val DEFAULT_COLLATION: String = "spark.sql.session.collation.default"
-  val ALLOW_READING_UNKNOWN_COLLATIONS: String =
-    "spark.sql.collation.allowReadingUnknownCollations"
 
   val confGetter: AtomicReference[() => SqlApiConf] = {
     new AtomicReference[() => SqlApiConf](() => DefaultSqlApiConf)
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 4cf7d8efb96a..036de22b4189 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -27,7 +27,7 @@ import org.json4s.JsonAST.JValue
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
-import org.apache.spark.{SparkException, SparkIllegalArgumentException, 
SparkThrowable}
+import org.apache.spark.{SparkIllegalArgumentException, SparkThrowable}
 import org.apache.spark.annotation.Stable
 import org.apache.spark.sql.catalyst.analysis.SqlApiAnalysis
 import org.apache.spark.sql.catalyst.parser.DataTypeParser
@@ -340,17 +340,8 @@ object DataType {
         fields.collect { case (fieldPath, JString(collation)) =>
           collation.split("\\.", 2) match {
             case Array(provider: String, collationName: String) =>
-              try {
-                CollationFactory.assertValidProvider(provider)
-                fieldPath -> collationName
-              } catch {
-                case e: SparkException
-                    if e.getCondition == "COLLATION_INVALID_PROVIDER" &&
-                      SqlApiConf.get.allowReadingUnknownCollations =>
-                  // If the collation provider is unknown and the config for 
reading such
-                  // collations is enabled, return the UTF8_BINARY collation.
-                  fieldPath -> "UTF8_BINARY"
-              }
+              CollationFactory.assertValidProvider(provider)
+              fieldPath -> collationName
           }
         }.toMap
 
@@ -359,16 +350,7 @@ object DataType {
   }
 
   private def stringTypeWithCollation(collationName: String): StringType = {
-    try {
-      StringType(CollationFactory.collationNameToId(collationName))
-    } catch {
-      case e: SparkException
-          if e.getCondition == "COLLATION_INVALID_NAME" &&
-            SqlApiConf.get.allowReadingUnknownCollations =>
-        // If the collation name is unknown and the config for reading such 
collations is enabled,
-        // return the UTF8_BINARY collation.
-        StringType(CollationFactory.UTF8_BINARY_COLLATION_ID)
-    }
+    StringType(CollationFactory.collationNameToId(collationName))
   }
 
   protected[types] def buildFormattedString(
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 123759c6c8b8..ba0a37541e49 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -778,15 +778,6 @@ object SQLConf {
       .booleanConf
       .createWithDefault(Utils.isTesting)
 
-  val ALLOW_READING_UNKNOWN_COLLATIONS =
-    buildConf(SqlApiConfHelper.ALLOW_READING_UNKNOWN_COLLATIONS)
-      .internal()
-      .doc("Enables spark to read unknown collation name as UTF8_BINARY. If 
the config is " +
-        "not enabled, when spark encounters an unknown collation name, it will 
throw an error.")
-      .version("4.0.0")
-      .booleanConf
-      .createWithDefault(false)
-
   val DEFAULT_COLLATION =
     buildConf(SqlApiConfHelper.DEFAULT_COLLATION)
       .doc("Sets default collation to use for string literals, parameter 
markers or the string" +
@@ -5582,8 +5573,6 @@ class SQLConf extends Serializable with Logging with 
SqlApiConf {
     }
   }
 
-  override def allowReadingUnknownCollations: Boolean = 
getConf(ALLOW_READING_UNKNOWN_COLLATIONS)
-
   def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED)
 
   def adaptiveExecutionLogLevel: String = getConf(ADAPTIVE_EXECUTION_LOG_LEVEL)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index d5fc4d87bb6a..f6d8f2a66e20 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -23,13 +23,11 @@ import org.json4s.jackson.JsonMethods
 import org.apache.spark.{SparkException, SparkFunSuite, 
SparkIllegalArgumentException}
 import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, 
caseSensitiveResolution}
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
-import org.apache.spark.sql.catalyst.plans.SQLHelper
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.catalyst.util.{CollationFactory, StringConcat}
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.DataTypeTestUtils.{dayTimeIntervalTypes, 
yearMonthIntervalTypes}
 
-class DataTypeSuite extends SparkFunSuite with SQLHelper {
+class DataTypeSuite extends SparkFunSuite {
 
   private val UNICODE_COLLATION_ID = 
CollationFactory.collationNameToId("UNICODE")
 
@@ -878,90 +876,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper {
       }
   }
 
-  test("string field with invalid collation name") {
-    val collationProviders = Seq("spark", "icu")
-    collationProviders.foreach { provider =>
-      val json =
-        s"""
-           |{
-           |  "type": "struct",
-           |  "fields": [
-           |    {
-           |      "name": "c1",
-           |      "type": "string",
-           |      "nullable": false,
-           |      "metadata": {
-           |        "${DataType.COLLATIONS_METADATA_KEY}": {
-           |          "c1": "$provider.INVALID"
-           |        }
-           |      }
-           |    }
-           |  ]
-           |}
-           |""".stripMargin
-
-      // Check that the exception will be thrown in case of invalid collation 
name and
-      // UNKNOWN_COLLATION_NAME config not enabled.
-      checkError(
-        exception = intercept[SparkException] {
-          DataType.fromJson(json)
-        },
-        condition = "COLLATION_INVALID_NAME",
-        parameters = Map(
-          "proposals" -> "id",
-          "collationName" -> "INVALID"))
-
-      // Check that the exception will not be thrown in case of invalid 
collation name and
-      // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be 
returned.
-      withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
-        val dataType = DataType.fromJson(json)
-        assert(dataType === StructType(
-          StructField("c1", 
StringType(CollationFactory.UTF8_BINARY_COLLATION_ID), false) :: Nil))
-      }
-    }
-  }
-
-  test("string field with invalid collation provider") {
-    val json =
-      s"""
-         |{
-         |  "type": "struct",
-         |  "fields": [
-         |    {
-         |      "name": "c1",
-         |      "type": "string",
-         |      "nullable": false,
-         |      "metadata": {
-         |        "${DataType.COLLATIONS_METADATA_KEY}": {
-         |          "c1": "INVALID.INVALID"
-         |        }
-         |      }
-         |    }
-         |  ]
-         |}
-         |""".stripMargin
-
-
-    // Check that the exception will be thrown in case of invalid collation 
name and
-    // UNKNOWN_COLLATION_NAME config not enabled.
-    checkError(
-      exception = intercept[SparkException] {
-        DataType.fromJson(json)
-      },
-      condition = "COLLATION_INVALID_PROVIDER",
-      parameters = Map(
-        "supportedProviders" -> "spark, icu",
-        "provider" -> "INVALID"))
-
-    // Check that the exception will not be thrown in case of invalid 
collation name and
-    // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be 
returned.
-    withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
-      val dataType = DataType.fromJson(json)
-      assert(dataType === StructType(
-        StructField("c1", 
StringType(CollationFactory.UTF8_BINARY_COLLATION_ID), false) :: Nil))
-    }
-  }
-
   test("non string field has collation metadata") {
     val json =
       s"""
@@ -1109,42 +1023,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper 
{
     assert(parsedWithCollations === ArrayType(StringType(unicodeCollationId)))
   }
 
-  test("parse array type with invalid collation metadata") {
-    val utf8BinaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID
-    val arrayJson =
-      s"""
-         |{
-         |  "type": "array",
-         |  "elementType": "string",
-         |  "containsNull": true
-         |}
-         |""".stripMargin
-
-    val collationsMap = Map("element" -> "INVALID")
-
-    // Parse without collations map
-    assert(DataType.parseDataType(JsonMethods.parse(arrayJson)) === 
ArrayType(StringType))
-
-    // Check that the exception will be thrown in case of invalid collation 
name and
-    // UNKNOWN_COLLATION_NAME config not enabled.
-    checkError(
-      exception = intercept[SparkException] {
-        DataType.parseDataType(JsonMethods.parse(arrayJson), collationsMap = 
collationsMap)
-      },
-      condition = "COLLATION_INVALID_NAME",
-      parameters = Map(
-        "proposals" -> "id",
-        "collationName" -> "INVALID"))
-
-    // Check that the exception will not be thrown in case of invalid 
collation name and
-    // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be 
returned.
-    withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
-      val dataType = DataType.parseDataType(
-        JsonMethods.parse(arrayJson), collationsMap = collationsMap)
-      assert(dataType === ArrayType(StringType(utf8BinaryCollationId)))
-    }
-  }
-
   test("parse map type with collation metadata") {
     val unicodeCollationId = CollationFactory.collationNameToId("UNICODE")
     val mapJson =
@@ -1168,44 +1046,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper 
{
       MapType(StringType(unicodeCollationId), StringType(unicodeCollationId)))
   }
 
-  test("parse map type with invalid collation metadata") {
-    val utf8BinaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID
-    val mapJson =
-      s"""
-         |{
-         |  "type": "map",
-         |  "keyType": "string",
-         |  "valueType": "string",
-         |  "valueContainsNull": true
-         |}
-         |""".stripMargin
-
-    val collationsMap = Map("key" -> "INVALID", "value" -> "INVALID")
-
-    // Parse without collations map
-    assert(DataType.parseDataType(JsonMethods.parse(mapJson)) === 
MapType(StringType, StringType))
-
-    // Check that the exception will be thrown in case of invalid collation 
name and
-    // UNKNOWN_COLLATION_NAME config not enabled.
-    checkError(
-      exception = intercept[SparkException] {
-        DataType.parseDataType(JsonMethods.parse(mapJson), collationsMap = 
collationsMap)
-      },
-      condition = "COLLATION_INVALID_NAME",
-      parameters = Map(
-        "proposals" -> "id",
-        "collationName" -> "INVALID"))
-
-    // Check that the exception will not be thrown in case of invalid 
collation name and
-    // UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be 
returned.
-    withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
-      val dataType = DataType.parseDataType(
-        JsonMethods.parse(mapJson), collationsMap = collationsMap)
-      assert(dataType === MapType(
-        StringType(utf8BinaryCollationId), StringType(utf8BinaryCollationId)))
-    }
-  }
-
   test("SPARK-48680: Add CharType and VarcharType to DataTypes JAVA API") {
     assert(DataTypes.createCharType(1) === CharType(1))
     assert(DataTypes.createVarcharType(100) === VarcharType(100))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to