This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1043d53b62bd [SPARK-53500][PYTHON][TESTS] Make `test_unpivot` deterministic 1043d53b62bd is described below commit 1043d53b62bd5d152aa4d685fa2c6880ce015b77 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Fri Sep 5 16:40:51 2025 +0800 [SPARK-53500][PYTHON][TESTS] Make `test_unpivot` deterministic ### What changes were proposed in this pull request? Make `test_unpivot` deterministic by sorting each unpivoted DataFrame before its collected rows are compared with the expected list. ### Why are the changes needed? The test occasionally fails because the rows returned by `unpivot` come back in a different order than the expected list, for example: ``` ====================================================================== FAIL [0.000s]: test_unpivot (pyspark.sql.tests.connect.test_parity_stat.DataFrameStatParityTests.test_unpivot) (ids=[], desc='with no identifier') ---------------------------------------------------------------------- Traceback (most recent call last): File "/home/jenkins/python/pyspark/sql/tests/test_stat.py", line 435, in test_unpivot self.assertEqual( AssertionError: Lists differ: [Row([26 chars]var='int', val=20.0), Row(var='int', val=30.0)[80 chars]3.0)] != [Row([26 chars]var='double', val=1.0), Row(var='int', val=20.[80 chars]3.0)] First differing element 1: Row(var='int', val=20.0) Row(var='double', val=1.0) [Row(var='int', val=10.0), + Row(var='double', val=1.0), Row(var='int', val=20.0), + Row(var='double', val=2.0), Row(var='int', val=30.0), - Row(var='double', val=1.0), - Row(var='double', val=2.0), Row(var='double', val=3.0)] ====================================================================== FAIL [0.000s]: test_unpivot (pyspark.sql.tests.connect.test_parity_stat.DataFrameStatParityTests.test_unpivot) (ids=(), desc='with no identifier') ---------------------------------------------------------------------- Traceback (most recent call last): File "/home/jenkins/python/pyspark/sql/tests/test_stat.py", line 435, in test_unpivot self.assertEqual( AssertionError: Lists differ: [Row([26 chars]var='int', val=20.0), Row(var='int', val=30.0)[80 chars]3.0)] != [Row([26 
chars]var='double', val=1.0), Row(var='int', val=20.[80 chars]3.0)] First differing element 1: Row(var='int', val=20.0) Row(var='double', val=1.0) [Row(var='int', val=10.0), + Row(var='double', val=1.0), Row(var='int', val=20.0), + Row(var='double', val=2.0), Row(var='int', val=30.0), - Row(var='double', val=1.0), - Row(var='double', val=2.0), Row(var='double', val=3.0)] ``` ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? CI ### Was this patch authored or co-authored using generative AI tooling? no Closes #52245 from zhengruifeng/test_unpivot_fix. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/sql/tests/test_stat.py | 44 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/python/pyspark/sql/tests/test_stat.py b/python/pyspark/sql/tests/test_stat.py index fe1746dbd894..437cb04020a5 100644 --- a/python/pyspark/sql/tests/test_stat.py +++ b/python/pyspark/sql/tests/test_stat.py @@ -17,7 +17,7 @@ import unittest -from pyspark.sql import Row +from pyspark.sql import Row, functions as sf from pyspark.sql.types import ( StringType, IntegerType, @@ -430,24 +430,26 @@ class DataFrameStatTestsMixin: with self.subTest(desc="with no identifier"): for id in [[], ()]: with self.subTest(ids=id): - actual = df.unpivot(id, ["int", "double"], "var", "val") + actual = df.unpivot(id, ["int", "double"], "var", "val").sort("var", "val") self.assertEqual(actual.schema.simpleString(), "struct<var:string,val:double>") self.assertEqual( actual.collect(), [ - Row(var="int", val=10.0), Row(var="double", val=1.0), - Row(var="int", val=20.0), Row(var="double", val=2.0), - Row(var="int", val=30.0), Row(var="double", val=3.0), + Row(var="int", val=10.0), + Row(var="int", val=20.0), + Row(var="int", val=30.0), ], ) with self.subTest(desc="with single identifier column"): for id in ["id", ["id"], ("id",)]: with self.subTest(ids=id): 
- actual = df.unpivot(id, ["int", "double"], "var", "val") + actual = df.unpivot(id, ["int", "double"], "var", "val").sort( + "id", sf.desc("val") + ) self.assertEqual( actual.schema.simpleString(), "struct<id:bigint,var:string,val:double>", @@ -467,7 +469,9 @@ class DataFrameStatTestsMixin: with self.subTest(desc="with multiple identifier columns"): for ids in [["id", "double"], ("id", "double")]: with self.subTest(ids=ids): - actual = df.unpivot(ids, ["int", "double"], "var", "val") + actual = df.unpivot(ids, ["int", "double"], "var", "val").sort( + "id", sf.desc("val") + ) self.assertEqual( actual.schema.simpleString(), "struct<id:bigint,double:double,var:string,val:double>", @@ -486,20 +490,22 @@ class DataFrameStatTestsMixin: with self.subTest(desc="with no identifier columns but none value columns"): # select only columns that have common data type (double) - actual = df.select("id", "int", "double").unpivot([], None, "var", "val") + actual = ( + df.select("id", "int", "double").unpivot([], None, "var", "val").sort("var", "val") + ) self.assertEqual(actual.schema.simpleString(), "struct<var:string,val:double>") self.assertEqual( actual.collect(), [ - Row(var="id", val=1.0), - Row(var="int", val=10.0), Row(var="double", val=1.0), - Row(var="id", val=2.0), - Row(var="int", val=20.0), Row(var="double", val=2.0), + Row(var="double", val=3.0), + Row(var="id", val=1.0), + Row(var="id", val=2.0), Row(var="id", val=3.0), + Row(var="int", val=10.0), + Row(var="int", val=20.0), Row(var="int", val=30.0), - Row(var="double", val=3.0), ], ) @@ -507,7 +513,11 @@ class DataFrameStatTestsMixin: for ids in ["id", ["id"], ("id",)]: with self.subTest(ids=ids): # select only columns that have common data type (double) - actual = df.select("id", "int", "double").unpivot(ids, None, "var", "val") + actual = ( + df.select("id", "int", "double") + .unpivot(ids, None, "var", "val") + .sort("id", sf.desc("val")) + ) self.assertEqual( actual.schema.simpleString(), 
"struct<id:bigint,var:string,val:double>" ) @@ -526,7 +536,7 @@ class DataFrameStatTestsMixin: with self.subTest(desc="with multiple identifier columns but none given value columns"): for ids in [["id", "str"], ("id", "str")]: with self.subTest(ids=ids): - actual = df.unpivot(ids, None, "var", "val") + actual = df.unpivot(ids, None, "var", "val").sort("id", sf.desc("val")) self.assertEqual( actual.schema.simpleString(), "struct<id:bigint,str:string,var:string,val:double>", @@ -546,7 +556,7 @@ class DataFrameStatTestsMixin: with self.subTest(desc="with single value column"): for values in ["int", ["int"], ("int",)]: with self.subTest(values=values): - actual = df.unpivot("id", values, "var", "val") + actual = df.unpivot("id", values, "var", "val").sort("id") self.assertEqual( actual.schema.simpleString(), "struct<id:bigint,var:string,val:bigint>" ) @@ -562,7 +572,7 @@ class DataFrameStatTestsMixin: with self.subTest(desc="with multiple value columns"): for values in [["int", "double"], ("int", "double")]: with self.subTest(values=values): - actual = df.unpivot("id", values, "var", "val") + actual = df.unpivot("id", values, "var", "val").sort("id", sf.desc("val")) self.assertEqual( actual.schema.simpleString(), "struct<id:bigint,var:string,val:double>" ) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org