(spark) branch master updated: [SPARK-53500][PYTHON][TESTS] Make `test_unpivot` deterministic

ruifengz Sun, 07 Sep 2025 07:01:44 -0700

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 1043d53b62bd [SPARK-53500][PYTHON][TESTS] Make `test_unpivot` deterministic
1043d53b62bd is described below

commit 1043d53b62bd5d152aa4d685fa2c6880ce015b77
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Fri Sep 5 16:40:51 2025 +0800

    [SPARK-53500][PYTHON][TESTS] Make `test_unpivot` deterministic
    
    ### What changes were proposed in this pull request?
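    Make `test_unpivot` deterministic by sorting the result of each `unpivot` call before collecting it, and by listing the expected rows in that sorted order.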
    
    ### Why are the changes needed?
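    The test occasionally fails because the rows returned by `unpivot` come back in a different order than the expected list, even though the same rows are present: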
    ```
    ======================================================================
    FAIL [0.000s]: test_unpivot 
(pyspark.sql.tests.connect.test_parity_stat.DataFrameStatParityTests.test_unpivot)
 (ids=[], desc='with no identifier')
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "/home/jenkins/python/pyspark/sql/tests/test_stat.py", line 435, in 
test_unpivot
        self.assertEqual(
    AssertionError: Lists differ: [Row([26 chars]var='int', val=20.0), 
Row(var='int', val=30.0)[80 chars]3.0)] != [Row([26 chars]var='double', 
val=1.0), Row(var='int', val=20.[80 chars]3.0)]
    First differing element 1:
    Row(var='int', val=20.0)
    Row(var='double', val=1.0)
      [Row(var='int', val=10.0),
    +  Row(var='double', val=1.0),
       Row(var='int', val=20.0),
    +  Row(var='double', val=2.0),
       Row(var='int', val=30.0),
    -  Row(var='double', val=1.0),
    -  Row(var='double', val=2.0),
       Row(var='double', val=3.0)]
    ======================================================================
    FAIL [0.000s]: test_unpivot 
(pyspark.sql.tests.connect.test_parity_stat.DataFrameStatParityTests.test_unpivot)
 (ids=(), desc='with no identifier')
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "/home/jenkins/python/pyspark/sql/tests/test_stat.py", line 435, in 
test_unpivot
        self.assertEqual(
    AssertionError: Lists differ: [Row([26 chars]var='int', val=20.0), 
Row(var='int', val=30.0)[80 chars]3.0)] != [Row([26 chars]var='double', 
val=1.0), Row(var='int', val=20.[80 chars]3.0)]
    First differing element 1:
    Row(var='int', val=20.0)
    Row(var='double', val=1.0)
      [Row(var='int', val=10.0),
    +  Row(var='double', val=1.0),
       Row(var='int', val=20.0),
    +  Row(var='double', val=2.0),
       Row(var='int', val=30.0),
    -  Row(var='double', val=1.0),
    -  Row(var='double', val=2.0),
       Row(var='double', val=3.0)]
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    No, test-only.
    
    ### How was this patch tested?
    CI
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #52245 from zhengruifeng/test_unpivot_fix.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/sql/tests/test_stat.py | 44 +++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/python/pyspark/sql/tests/test_stat.py 
b/python/pyspark/sql/tests/test_stat.py
index fe1746dbd894..437cb04020a5 100644
--- a/python/pyspark/sql/tests/test_stat.py
+++ b/python/pyspark/sql/tests/test_stat.py
@@ -17,7 +17,7 @@
 
 import unittest
 
-from pyspark.sql import Row
+from pyspark.sql import Row, functions as sf
 from pyspark.sql.types import (
     StringType,
     IntegerType,
@@ -430,24 +430,26 @@ class DataFrameStatTestsMixin:
         with self.subTest(desc="with no identifier"):
             for id in [[], ()]:
                 with self.subTest(ids=id):
-                    actual = df.unpivot(id, ["int", "double"], "var", "val")
+                    actual = df.unpivot(id, ["int", "double"], "var", 
"val").sort("var", "val")
                     self.assertEqual(actual.schema.simpleString(), 
"struct<var:string,val:double>")
                     self.assertEqual(
                         actual.collect(),
                         [
-                            Row(var="int", val=10.0),
                             Row(var="double", val=1.0),
-                            Row(var="int", val=20.0),
                             Row(var="double", val=2.0),
-                            Row(var="int", val=30.0),
                             Row(var="double", val=3.0),
+                            Row(var="int", val=10.0),
+                            Row(var="int", val=20.0),
+                            Row(var="int", val=30.0),
                         ],
                     )
 
         with self.subTest(desc="with single identifier column"):
             for id in ["id", ["id"], ("id",)]:
                 with self.subTest(ids=id):
-                    actual = df.unpivot(id, ["int", "double"], "var", "val")
+                    actual = df.unpivot(id, ["int", "double"], "var", 
"val").sort(
+                        "id", sf.desc("val")
+                    )
                     self.assertEqual(
                         actual.schema.simpleString(),
                         "struct<id:bigint,var:string,val:double>",
@@ -467,7 +469,9 @@ class DataFrameStatTestsMixin:
         with self.subTest(desc="with multiple identifier columns"):
             for ids in [["id", "double"], ("id", "double")]:
                 with self.subTest(ids=ids):
-                    actual = df.unpivot(ids, ["int", "double"], "var", "val")
+                    actual = df.unpivot(ids, ["int", "double"], "var", 
"val").sort(
+                        "id", sf.desc("val")
+                    )
                     self.assertEqual(
                         actual.schema.simpleString(),
                         
"struct<id:bigint,double:double,var:string,val:double>",
@@ -486,20 +490,22 @@ class DataFrameStatTestsMixin:
 
         with self.subTest(desc="with no identifier columns but none value 
columns"):
             # select only columns that have common data type (double)
-            actual = df.select("id", "int", "double").unpivot([], None, "var", 
"val")
+            actual = (
+                df.select("id", "int", "double").unpivot([], None, "var", 
"val").sort("var", "val")
+            )
             self.assertEqual(actual.schema.simpleString(), 
"struct<var:string,val:double>")
             self.assertEqual(
                 actual.collect(),
                 [
-                    Row(var="id", val=1.0),
-                    Row(var="int", val=10.0),
                     Row(var="double", val=1.0),
-                    Row(var="id", val=2.0),
-                    Row(var="int", val=20.0),
                     Row(var="double", val=2.0),
+                    Row(var="double", val=3.0),
+                    Row(var="id", val=1.0),
+                    Row(var="id", val=2.0),
                     Row(var="id", val=3.0),
+                    Row(var="int", val=10.0),
+                    Row(var="int", val=20.0),
                     Row(var="int", val=30.0),
-                    Row(var="double", val=3.0),
                 ],
             )
 
@@ -507,7 +513,11 @@ class DataFrameStatTestsMixin:
             for ids in ["id", ["id"], ("id",)]:
                 with self.subTest(ids=ids):
                     # select only columns that have common data type (double)
-                    actual = df.select("id", "int", "double").unpivot(ids, 
None, "var", "val")
+                    actual = (
+                        df.select("id", "int", "double")
+                        .unpivot(ids, None, "var", "val")
+                        .sort("id", sf.desc("val"))
+                    )
                     self.assertEqual(
                         actual.schema.simpleString(), 
"struct<id:bigint,var:string,val:double>"
                     )
@@ -526,7 +536,7 @@ class DataFrameStatTestsMixin:
         with self.subTest(desc="with multiple identifier columns but none 
given value columns"):
             for ids in [["id", "str"], ("id", "str")]:
                 with self.subTest(ids=ids):
-                    actual = df.unpivot(ids, None, "var", "val")
+                    actual = df.unpivot(ids, None, "var", "val").sort("id", 
sf.desc("val"))
                     self.assertEqual(
                         actual.schema.simpleString(),
                         "struct<id:bigint,str:string,var:string,val:double>",
@@ -546,7 +556,7 @@ class DataFrameStatTestsMixin:
         with self.subTest(desc="with single value column"):
             for values in ["int", ["int"], ("int",)]:
                 with self.subTest(values=values):
-                    actual = df.unpivot("id", values, "var", "val")
+                    actual = df.unpivot("id", values, "var", "val").sort("id")
                     self.assertEqual(
                         actual.schema.simpleString(), 
"struct<id:bigint,var:string,val:bigint>"
                     )
@@ -562,7 +572,7 @@ class DataFrameStatTestsMixin:
         with self.subTest(desc="with multiple value columns"):
             for values in [["int", "double"], ("int", "double")]:
                 with self.subTest(values=values):
-                    actual = df.unpivot("id", values, "var", "val")
+                    actual = df.unpivot("id", values, "var", "val").sort("id", 
sf.desc("val"))
                     self.assertEqual(
                         actual.schema.simpleString(), 
"struct<id:bigint,var:string,val:double>"
                     )


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-53500][PYTHON][TESTS] Make `test_unpivot` deterministic

Reply via email to