This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 3a989dbe8d93 [SPARK-51962][SQL][TESTS] Improve ORDER BY testing coverage for non-orderable sort expressions 3a989dbe8d93 is described below commit 3a989dbe8d93be3b7a86e498fd85a90afd02cf31 Author: mihailoale-db <mihailo.alek...@databricks.com> AuthorDate: Wed Apr 30 22:03:22 2025 +0800 [SPARK-51962][SQL][TESTS] Improve ORDER BY testing coverage for non-orderable sort expressions ### What changes were proposed in this pull request? Spark lacks tests for non-orderable sort expressions. In this PR I propose that we fix that and also refactor the code so we can support it in single-pass analyzer as well. ### Why are the changes needed? To improve testing coverage ease single-pass development process. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #50768 from mihailoale-db/refactororderablethrow. Authored-by: mihailoale-db <mihailo.alek...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../sql/catalyst/analysis/CheckAnalysis.scala | 14 +- .../apache/spark/sql/catalyst/util/TypeUtils.scala | 12 +- .../sql-tests/analyzer-results/order-by.sql.out | 176 +++++++++++++++++++ .../test/resources/sql-tests/inputs/order-by.sql | 10 ++ .../resources/sql-tests/results/order-by.sql.out | 192 +++++++++++++++++++++ 5 files changed, 391 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 8a17849f9396..c60e99654075 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -645,24 +645,14 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString case Sort(orders, _, _, _) => orders.foreach { order => - if (!RowOrdering.isOrderable(order.dataType)) { - order.failAnalysis( - errorClass = "EXPRESSION_TYPE_IS_NOT_ORDERABLE", - messageParameters = Map("exprType" -> toSQLType(order.dataType))) - } + TypeUtils.tryThrowNotOrderableExpression(order) } case Window(_, partitionSpec, _, _, _) => // Both `partitionSpec` and `orderSpec` must be orderable. We only need an extra check // for `partitionSpec` here because `orderSpec` has the type check itself. partitionSpec.foreach { p => - if (!RowOrdering.isOrderable(p.dataType)) { - p.failAnalysis( - errorClass = "EXPRESSION_TYPE_IS_NOT_ORDERABLE", - messageParameters = Map( - "expr" -> toSQLExpr(p), - "exprType" -> toSQLType(p.dataType))) - } + TypeUtils.tryThrowNotOrderableExpression(p) } case GlobalLimit(limitExpr, _) => checkLimitLikeClause("limit", limitExpr) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index a0d578c66e73..9f89f068b756 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.util -import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} +import org.apache.spark.sql.catalyst.analysis.{AnalysisErrorAt, TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions.{Expression, RowOrdering} import org.apache.spark.sql.catalyst.types.{PhysicalDataType, PhysicalNumericType} @@ -43,6 +43,16 @@ object TypeUtils extends QueryErrorsBase { } } + def tryThrowNotOrderableExpression(expression: Expression): Unit = { + if (!RowOrdering.isOrderable(expression.dataType)) { + expression.failAnalysis( + errorClass = "EXPRESSION_TYPE_IS_NOT_ORDERABLE", + messageParameters = + Map("expr" -> toSQLExpr(expression), "exprType" -> toSQLType(expression.dataType)) + ) + } + } + def checkForSameTypeInputExpr(types: Seq[DataType], caller: String): TypeCheckResult = { if (TypeCoercion.haveSameType(types)) { TypeCheckResult.TypeCheckSuccess diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by.sql.out index 9f0afdfd8859..903ffdfa62dc 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by.sql.out @@ -272,6 +272,182 @@ Project [count(DISTINCT col1)#xL] +- OneRowRelation +-- !query +SELECT 'a' ORDER BY CAST('a' AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(a AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 21, + "stopIndex" : 40, + "fragment" : "CAST('a' AS VARIANT)" + } ] +} + + +-- !query +SELECT a, b FROM testData ORDER BY CAST(a + b AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST((a + b) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 36, + "stopIndex" : 57, + "fragment" : "CAST(a + b AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY CAST(CAST(a AS VARIANT) AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(CAST(a AS VARIANT) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 67, + "fragment" : "CAST(CAST(a AS VARIANT) AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData GROUP BY a ORDER BY CAST(CAST(a AS STRING) AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(CAST(a AS STRING) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 44, + "stopIndex" : 77, + "fragment" : "CAST(CAST(a AS STRING) AS VARIANT)" + } ] +} + + +-- !query +SELECT * FROM testData ORDER BY CAST(null AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(NULL AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 53, + "fragment" : "CAST(null AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY ARRAY(CAST(true AS VARIANT)) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"ARRAY<VARIANT>\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"array(CAST(true AS VARIANT)) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 60, + "fragment" : "ARRAY(CAST(true AS VARIANT))" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY CAST(ARRAY('a') AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(array(a) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 59, + "fragment" : "CAST(ARRAY('a') AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY CAST(ARRAY(CAST(true AS VARIANT)) AS VARIANT) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(array(CAST(true AS VARIANT)) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 77, + "fragment" : "CAST(ARRAY(CAST(true AS VARIANT)) AS VARIANT)" + } ] +} + + -- !query DROP VIEW IF EXISTS testData -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/order-by.sql b/sql/core/src/test/resources/sql-tests/inputs/order-by.sql index c5ce610e4c88..04d3fda84c7e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/order-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/order-by.sql @@ -63,6 +63,16 @@ ORDER BY col1 ; +-- Order by non-orderable expressions should fail +SELECT 'a' ORDER BY CAST('a' AS VARIANT); +SELECT a, b FROM testData ORDER BY CAST(a + b AS VARIANT); +SELECT a FROM testData ORDER BY CAST(CAST(a AS VARIANT) AS VARIANT); +SELECT a FROM testData GROUP BY a ORDER BY CAST(CAST(a AS STRING) AS VARIANT); +SELECT * FROM testData ORDER BY CAST(null AS VARIANT); +SELECT a FROM testData ORDER BY ARRAY(CAST(true AS VARIANT)); +SELECT a FROM testData ORDER BY CAST(ARRAY('a') AS VARIANT); +SELECT a FROM testData ORDER BY CAST(ARRAY(CAST(true AS VARIANT)) AS VARIANT); + -- Clean up DROP VIEW IF EXISTS testData; DROP VIEW IF EXISTS v1; diff --git a/sql/core/src/test/resources/sql-tests/results/order-by.sql.out b/sql/core/src/test/resources/sql-tests/results/order-by.sql.out index 2b9862964a64..26bb94c98fe2 100644 --- a/sql/core/src/test/resources/sql-tests/results/order-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/order-by.sql.out @@ -252,6 +252,198 @@ struct<count(DISTINCT col1):bigint> 1 +-- !query +SELECT 'a' ORDER BY CAST('a' AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(a AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 21, + "stopIndex" : 40, + "fragment" : "CAST('a' AS VARIANT)" + } ] +} + + +-- !query +SELECT a, b FROM testData ORDER BY CAST(a + b AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST((a + b) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 36, + "stopIndex" : 57, + "fragment" : "CAST(a + b AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY CAST(CAST(a AS VARIANT) AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(CAST(a AS VARIANT) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 67, + "fragment" : "CAST(CAST(a AS VARIANT) AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData GROUP BY a ORDER BY CAST(CAST(a AS STRING) AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(CAST(a AS STRING) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 44, + "stopIndex" : 77, + "fragment" : "CAST(CAST(a AS STRING) AS VARIANT)" + } ] +} + + +-- !query +SELECT * FROM testData ORDER BY CAST(null AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(NULL AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 53, + "fragment" : "CAST(null AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY ARRAY(CAST(true AS VARIANT)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"ARRAY<VARIANT>\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"array(CAST(true AS VARIANT)) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 60, + "fragment" : "ARRAY(CAST(true AS VARIANT))" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY CAST(ARRAY('a') AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(array(a) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 59, + "fragment" : "CAST(ARRAY('a') AS VARIANT)" + } ] +} + + +-- !query +SELECT a FROM testData ORDER BY CAST(ARRAY(CAST(true AS VARIANT)) AS VARIANT) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "dataType" : "\"VARIANT\"", + "functionName" : "`sortorder`", + "sqlExpr" : "\"CAST(array(CAST(true AS VARIANT)) AS VARIANT) ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 33, + "stopIndex" : 77, + "fragment" : "CAST(ARRAY(CAST(true AS VARIANT)) AS VARIANT)" + } ] +} + + -- !query DROP VIEW IF EXISTS testData -- !query schema --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org