This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 1a9051ad5b06 [SPARK-53691][PS][INFRA][TESTS][4.0] Further reorganize tests for Pandas API
1a9051ad5b06 is described below
commit 1a9051ad5b06b60ea5e069a33fe49394580442c8
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Oct 7 10:52:09 2025 +0800
[SPARK-53691][PS][INFRA][TESTS][4.0] Further reorganize tests for Pandas API
cherry-pick https://github.com/apache/spark/pull/52433 to branch-4.0
Closes #52525 from zhengruifeng/futher_combin_pandas_on_connect_40.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.github/workflows/build_and_test.yml | 14 +-
.github/workflows/build_python_connect.yml | 2 +-
.github/workflows/python_macos_test.yml | 8 +-
dev/sparktestsupport/modules.py | 322 +++++++++++++----------------
dev/sparktestsupport/utils.py | 20 +-
5 files changed, 162 insertions(+), 204 deletions(-)
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d20ad58105d5..a562bc2e55da 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -512,13 +512,9 @@ jobs:
- >-
pyspark-pandas-slow
- >-
- pyspark-pandas-connect-part0
+ pyspark-pandas-connect
- >-
- pyspark-pandas-connect-part1
- - >-
- pyspark-pandas-connect-part2
- - >-
- pyspark-pandas-connect-part3
+ pyspark-pandas-slow-connect
exclude:
# Always run if pyspark == 'true', even infra-image is skip (such as
non-master job)
# In practice, the build will run in individual PR, but not against
the individual commit
@@ -532,10 +528,8 @@ jobs:
# in Apache Spark repository.
- modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas' }}
- modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-slow' }}
- - modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-connect-part0' }}
- - modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-connect-part1' }}
- - modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-connect-part2' }}
- - modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-connect-part3' }}
+ - modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-connect' }}
+ - modules: ${{
fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' &&
'pyspark-pandas-slow-connect' }}
env:
MODULES_TO_TEST: ${{ matrix.modules }}
HADOOP_PROFILE: ${{ inputs.hadoop }}
diff --git a/.github/workflows/build_python_connect.yml b/.github/workflows/build_python_connect.yml
index 67428d6af0de..8bf8c56c2531 100644
--- a/.github/workflows/build_python_connect.yml
+++ b/.github/workflows/build_python_connect.yml
@@ -96,7 +96,7 @@ jobs:
# Several tests related to catalog requires to run them
sequencially, e.g., writing a table in a listener.
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
# None of tests are dependent on each other in Pandas API on Spark
so run them in parallel
- ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+ ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
# Stop Spark Connect server.
./sbin/stop-connect-server.sh
diff --git a/.github/workflows/python_macos_test.yml b/.github/workflows/python_macos_test.yml
index 2cffb68419e8..49b953122386 100644
--- a/.github/workflows/python_macos_test.yml
+++ b/.github/workflows/python_macos_test.yml
@@ -70,13 +70,9 @@ jobs:
- >-
pyspark-pandas-slow
- >-
- pyspark-pandas-connect-part0
+ pyspark-pandas-connect
- >-
- pyspark-pandas-connect-part1
- - >-
- pyspark-pandas-connect-part2
- - >-
- pyspark-pandas-connect-part3
+ pyspark-pandas-slow-connect
env:
MODULES_TO_TEST: ${{ matrix.modules }}
PYTHON_TO_TEST: python${{inputs.python}}
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 54988ac5b788..dc4e95a14b69 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -811,6 +811,21 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.frame.test_time_series",
"pyspark.pandas.tests.frame.test_truncate",
"pyspark.pandas.tests.series.test_interpolate",
+ "pyspark.pandas.tests.series.test_datetime",
+ "pyspark.pandas.tests.series.test_string_ops_adv",
+ "pyspark.pandas.tests.series.test_string_ops_basic",
+ "pyspark.pandas.tests.series.test_all_any",
+ "pyspark.pandas.tests.series.test_arg_ops",
+ "pyspark.pandas.tests.series.test_as_of",
+ "pyspark.pandas.tests.series.test_as_type",
+ "pyspark.pandas.tests.series.test_compute",
+ "pyspark.pandas.tests.series.test_conversion",
+ "pyspark.pandas.tests.series.test_cumulative",
+ "pyspark.pandas.tests.series.test_index",
+ "pyspark.pandas.tests.series.test_missing_data",
+ "pyspark.pandas.tests.series.test_series",
+ "pyspark.pandas.tests.series.test_sort",
+ "pyspark.pandas.tests.series.test_stat",
"pyspark.pandas.tests.resample.test_on",
"pyspark.pandas.tests.resample.test_error",
"pyspark.pandas.tests.resample.test_frame",
@@ -839,21 +854,6 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.window.test_groupby_rolling",
"pyspark.pandas.tests.window.test_groupby_rolling_adv",
"pyspark.pandas.tests.window.test_groupby_rolling_count",
- "pyspark.pandas.tests.series.test_datetime",
- "pyspark.pandas.tests.series.test_string_ops_adv",
- "pyspark.pandas.tests.series.test_string_ops_basic",
- "pyspark.pandas.tests.series.test_all_any",
- "pyspark.pandas.tests.series.test_arg_ops",
- "pyspark.pandas.tests.series.test_as_of",
- "pyspark.pandas.tests.series.test_as_type",
- "pyspark.pandas.tests.series.test_compute",
- "pyspark.pandas.tests.series.test_conversion",
- "pyspark.pandas.tests.series.test_cumulative",
- "pyspark.pandas.tests.series.test_index",
- "pyspark.pandas.tests.series.test_missing_data",
- "pyspark.pandas.tests.series.test_series",
- "pyspark.pandas.tests.series.test_sort",
- "pyspark.pandas.tests.series.test_stat",
"pyspark.pandas.tests.io.test_io",
"pyspark.pandas.tests.io.test_csv",
"pyspark.pandas.tests.io.test_feather",
@@ -1141,8 +1141,8 @@ pyspark_ml_connect = Module(
)
-pyspark_pandas_connect_part0 = Module(
- name="pyspark-pandas-connect-part0",
+pyspark_pandas_connect = Module(
+ name="pyspark-pandas-connect",
dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
source_file_regexes=[
"python/pyspark/pandas",
@@ -1166,6 +1166,25 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.test_parity_sql",
"pyspark.pandas.tests.connect.test_parity_typedef",
"pyspark.pandas.tests.connect.test_parity_utils",
+ "pyspark.pandas.tests.connect.computation.test_parity_any_all",
+ "pyspark.pandas.tests.connect.computation.test_parity_apply_func",
+ "pyspark.pandas.tests.connect.computation.test_parity_binary_ops",
+ "pyspark.pandas.tests.connect.computation.test_parity_combine",
+ "pyspark.pandas.tests.connect.computation.test_parity_compute",
+ "pyspark.pandas.tests.connect.computation.test_parity_cov",
+ "pyspark.pandas.tests.connect.computation.test_parity_corr",
+ "pyspark.pandas.tests.connect.computation.test_parity_corrwith",
+ "pyspark.pandas.tests.connect.computation.test_parity_cumulative",
+ "pyspark.pandas.tests.connect.computation.test_parity_describe",
+ "pyspark.pandas.tests.connect.computation.test_parity_eval",
+ "pyspark.pandas.tests.connect.computation.test_parity_melt",
+ "pyspark.pandas.tests.connect.computation.test_parity_missing_data",
+ "pyspark.pandas.tests.connect.computation.test_parity_pivot",
+ "pyspark.pandas.tests.connect.computation.test_parity_pivot_table",
+ "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_adv",
+
"pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx",
+
"pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv",
+ "pyspark.pandas.tests.connect.computation.test_parity_stats",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",
@@ -1180,12 +1199,98 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.data_type_ops.test_parity_string_ops",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_udt_ops",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_timedelta_ops",
+
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
+ "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod",
+ "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div",
+ "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow",
"pyspark.pandas.tests.connect.plot.test_parity_frame_plot",
"pyspark.pandas.tests.connect.plot.test_parity_frame_plot_matplotlib",
"pyspark.pandas.tests.connect.plot.test_parity_frame_plot_plotly",
"pyspark.pandas.tests.connect.plot.test_parity_series_plot",
"pyspark.pandas.tests.connect.plot.test_parity_series_plot_matplotlib",
"pyspark.pandas.tests.connect.plot.test_parity_series_plot_plotly",
+ "pyspark.pandas.tests.connect.frame.test_parity_attrs",
+ "pyspark.pandas.tests.connect.frame.test_parity_axis",
+ "pyspark.pandas.tests.connect.frame.test_parity_constructor",
+ "pyspark.pandas.tests.connect.frame.test_parity_conversion",
+ "pyspark.pandas.tests.connect.frame.test_parity_reindexing",
+ "pyspark.pandas.tests.connect.frame.test_parity_reshaping",
+ "pyspark.pandas.tests.connect.frame.test_parity_spark",
+ "pyspark.pandas.tests.connect.frame.test_parity_take",
+ "pyspark.pandas.tests.connect.frame.test_parity_take_adv",
+ "pyspark.pandas.tests.connect.frame.test_parity_time_series",
+ "pyspark.pandas.tests.connect.frame.test_parity_truncate",
+ "pyspark.pandas.tests.connect.frame.test_parity_interpolate",
+ "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error",
+ "pyspark.pandas.tests.connect.series.test_parity_datetime",
+ "pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
+ "pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",
+ "pyspark.pandas.tests.connect.series.test_parity_all_any",
+ "pyspark.pandas.tests.connect.series.test_parity_arg_ops",
+ "pyspark.pandas.tests.connect.series.test_parity_as_of",
+ "pyspark.pandas.tests.connect.series.test_parity_as_type",
+ "pyspark.pandas.tests.connect.series.test_parity_compute",
+ "pyspark.pandas.tests.connect.series.test_parity_conversion",
+ "pyspark.pandas.tests.connect.series.test_parity_cumulative",
+ "pyspark.pandas.tests.connect.series.test_parity_index",
+ "pyspark.pandas.tests.connect.series.test_parity_missing_data",
+ "pyspark.pandas.tests.connect.series.test_parity_series",
+ "pyspark.pandas.tests.connect.series.test_parity_sort",
+ "pyspark.pandas.tests.connect.series.test_parity_stat",
+ "pyspark.pandas.tests.connect.series.test_parity_interpolate",
+ "pyspark.pandas.tests.connect.resample.test_parity_frame",
+ "pyspark.pandas.tests.connect.resample.test_parity_series",
+ "pyspark.pandas.tests.connect.resample.test_parity_error",
+ "pyspark.pandas.tests.connect.resample.test_parity_missing",
+ "pyspark.pandas.tests.connect.resample.test_parity_on",
+ "pyspark.pandas.tests.connect.resample.test_parity_timezone",
+ "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies",
+ "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_kwargs",
+
"pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_multiindex",
+ "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_object",
+ "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_prefix",
+ "pyspark.pandas.tests.connect.reshape.test_parity_merge_asof",
+ "pyspark.pandas.tests.connect.window.test_parity_ewm_error",
+ "pyspark.pandas.tests.connect.window.test_parity_ewm_mean",
+ "pyspark.pandas.tests.connect.window.test_parity_groupby_ewm_mean",
+ "pyspark.pandas.tests.connect.window.test_parity_missing",
+ "pyspark.pandas.tests.connect.window.test_parity_rolling",
+ "pyspark.pandas.tests.connect.window.test_parity_rolling_adv",
+ "pyspark.pandas.tests.connect.window.test_parity_rolling_count",
+ "pyspark.pandas.tests.connect.window.test_parity_rolling_error",
+ "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling",
+ "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_adv",
+
"pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_count",
+ "pyspark.pandas.tests.connect.window.test_parity_expanding",
+ "pyspark.pandas.tests.connect.window.test_parity_expanding_adv",
+ "pyspark.pandas.tests.connect.window.test_parity_expanding_error",
+ "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding",
+
"pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv",
+ "pyspark.pandas.tests.connect.io.test_parity_io",
+ "pyspark.pandas.tests.connect.io.test_parity_csv",
+ "pyspark.pandas.tests.connect.io.test_parity_feather",
+ "pyspark.pandas.tests.connect.io.test_parity_stata",
+ "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
+ "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
+ "pyspark.pandas.tests.connect.io.test_parity_series_conversion",
+ # fallback
+ "pyspark.pandas.tests.connect.frame.test_parity_asfreq",
+ "pyspark.pandas.tests.connect.frame.test_parity_asof",
+ ],
+ excluded_python_implementations=[
+ "PyPy" # Skip these tests under PyPy since they require numpy,
pandas, and pyarrow and
+ # they aren't available there
+ ],
+)
+
+pyspark_pandas_slow_connect = Module(
+ name="pyspark-pandas-slow-connect",
+ dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
+ source_file_regexes=[
+ "python/pyspark/pandas",
+ ],
+ python_test_goals=[
+ # pandas-on-Spark unittests
"pyspark.pandas.tests.connect.indexes.test_parity_default",
"pyspark.pandas.tests.connect.indexes.test_parity_category",
"pyspark.pandas.tests.connect.indexes.test_parity_timedelta",
@@ -1222,50 +1327,21 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.indexes.test_parity_datetime",
"pyspark.pandas.tests.connect.indexes.test_parity_datetime_at",
"pyspark.pandas.tests.connect.indexes.test_parity_datetime_between",
- "pyspark.pandas.tests.connect.computation.test_parity_any_all",
- "pyspark.pandas.tests.connect.computation.test_parity_apply_func",
- "pyspark.pandas.tests.connect.computation.test_parity_binary_ops",
- "pyspark.pandas.tests.connect.computation.test_parity_combine",
- "pyspark.pandas.tests.connect.computation.test_parity_compute",
- "pyspark.pandas.tests.connect.computation.test_parity_cov",
- "pyspark.pandas.tests.connect.computation.test_parity_corr",
- "pyspark.pandas.tests.connect.computation.test_parity_corrwith",
- "pyspark.pandas.tests.connect.computation.test_parity_cumulative",
- "pyspark.pandas.tests.connect.computation.test_parity_describe",
- "pyspark.pandas.tests.connect.computation.test_parity_eval",
- "pyspark.pandas.tests.connect.computation.test_parity_melt",
- "pyspark.pandas.tests.connect.computation.test_parity_missing_data",
+ "pyspark.pandas.tests.connect.indexes.test_parity_append",
+ "pyspark.pandas.tests.connect.indexes.test_parity_intersection",
+ "pyspark.pandas.tests.connect.indexes.test_parity_monotonic",
+ "pyspark.pandas.tests.connect.indexes.test_parity_union",
+ "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil",
+ "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor",
+ "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso",
+ "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map",
+ "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
+ "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round",
"pyspark.pandas.tests.connect.groupby.test_parity_stat",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_adv",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_func",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_prod",
- ],
- excluded_python_implementations=[
- "PyPy" # Skip these tests under PyPy since they require numpy,
pandas, and pyarrow and
- # they aren't available there
- ],
-)
-
-pyspark_pandas_connect_part1 = Module(
- name="pyspark-pandas-connect-part1",
- dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
- source_file_regexes=[
- "python/pyspark/pandas",
- ],
- python_test_goals=[
- # pandas-on-Spark unittests
- "pyspark.pandas.tests.connect.frame.test_parity_attrs",
- "pyspark.pandas.tests.connect.frame.test_parity_axis",
- "pyspark.pandas.tests.connect.frame.test_parity_constructor",
- "pyspark.pandas.tests.connect.frame.test_parity_conversion",
- "pyspark.pandas.tests.connect.frame.test_parity_reindexing",
- "pyspark.pandas.tests.connect.frame.test_parity_reshaping",
- "pyspark.pandas.tests.connect.frame.test_parity_spark",
- "pyspark.pandas.tests.connect.frame.test_parity_take",
- "pyspark.pandas.tests.connect.frame.test_parity_take_adv",
- "pyspark.pandas.tests.connect.frame.test_parity_time_series",
- "pyspark.pandas.tests.connect.frame.test_parity_truncate",
"pyspark.pandas.tests.connect.groupby.test_parity_aggregate",
"pyspark.pandas.tests.connect.groupby.test_parity_apply_func",
"pyspark.pandas.tests.connect.groupby.test_parity_corr",
@@ -1279,93 +1355,17 @@ pyspark_pandas_connect_part1 = Module(
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var",
- "pyspark.pandas.tests.connect.series.test_parity_datetime",
- "pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
- "pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",
- "pyspark.pandas.tests.connect.series.test_parity_all_any",
- "pyspark.pandas.tests.connect.series.test_parity_arg_ops",
- "pyspark.pandas.tests.connect.series.test_parity_as_of",
- "pyspark.pandas.tests.connect.series.test_parity_as_type",
- "pyspark.pandas.tests.connect.series.test_parity_compute",
- "pyspark.pandas.tests.connect.series.test_parity_conversion",
- "pyspark.pandas.tests.connect.series.test_parity_cumulative",
- "pyspark.pandas.tests.connect.series.test_parity_index",
- "pyspark.pandas.tests.connect.series.test_parity_missing_data",
- "pyspark.pandas.tests.connect.series.test_parity_series",
- "pyspark.pandas.tests.connect.series.test_parity_sort",
- "pyspark.pandas.tests.connect.series.test_parity_stat",
- "pyspark.pandas.tests.connect.series.test_parity_interpolate",
-
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
- "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod",
- "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div",
- "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow",
- "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies",
- "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_kwargs",
-
"pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_multiindex",
- "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_object",
- "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_prefix",
- "pyspark.pandas.tests.connect.reshape.test_parity_merge_asof",
- "pyspark.pandas.tests.connect.indexes.test_parity_append",
- "pyspark.pandas.tests.connect.indexes.test_parity_intersection",
- "pyspark.pandas.tests.connect.indexes.test_parity_monotonic",
- "pyspark.pandas.tests.connect.indexes.test_parity_union",
- "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil",
- "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor",
- "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso",
- "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map",
- "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
- "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round",
-
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift",
-
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform",
- # fallback
- "pyspark.pandas.tests.connect.frame.test_parity_asfreq",
- "pyspark.pandas.tests.connect.frame.test_parity_asof",
- ],
- excluded_python_implementations=[
- "PyPy" # Skip these tests under PyPy since they require numpy,
pandas, and pyarrow and
- # they aren't available there
- ],
-)
-
-
-pyspark_pandas_connect_part2 = Module(
- name="pyspark-pandas-connect-part2",
- dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
- source_file_regexes=[
- "python/pyspark/pandas",
- ],
- python_test_goals=[
- # pandas-on-Spark unittests
- "pyspark.pandas.tests.connect.computation.test_parity_pivot",
- "pyspark.pandas.tests.connect.computation.test_parity_pivot_table",
- "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_adv",
-
"pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx",
-
"pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv",
- "pyspark.pandas.tests.connect.computation.test_parity_stats",
- "pyspark.pandas.tests.connect.frame.test_parity_interpolate",
- "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error",
- "pyspark.pandas.tests.connect.resample.test_parity_frame",
- "pyspark.pandas.tests.connect.resample.test_parity_series",
- "pyspark.pandas.tests.connect.resample.test_parity_error",
- "pyspark.pandas.tests.connect.resample.test_parity_missing",
- "pyspark.pandas.tests.connect.resample.test_parity_on",
- "pyspark.pandas.tests.connect.resample.test_parity_timezone",
- "pyspark.pandas.tests.connect.window.test_parity_ewm_error",
- "pyspark.pandas.tests.connect.window.test_parity_ewm_mean",
- "pyspark.pandas.tests.connect.window.test_parity_groupby_ewm_mean",
- "pyspark.pandas.tests.connect.window.test_parity_missing",
- "pyspark.pandas.tests.connect.window.test_parity_rolling",
- "pyspark.pandas.tests.connect.window.test_parity_rolling_adv",
- "pyspark.pandas.tests.connect.window.test_parity_rolling_count",
- "pyspark.pandas.tests.connect.window.test_parity_rolling_error",
- "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling",
- "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_adv",
-
"pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_count",
- "pyspark.pandas.tests.connect.window.test_parity_expanding",
- "pyspark.pandas.tests.connect.window.test_parity_expanding_adv",
- "pyspark.pandas.tests.connect.window.test_parity_expanding_error",
- "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding",
-
"pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv",
+ "pyspark.pandas.tests.connect.groupby.test_parity_index",
+ "pyspark.pandas.tests.connect.groupby.test_parity_describe",
+ "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
+ "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
+ "pyspark.pandas.tests.connect.groupby.test_parity_grouping",
+ "pyspark.pandas.tests.connect.groupby.test_parity_missing",
+ "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest",
+ "pyspark.pandas.tests.connect.groupby.test_parity_raises",
+ "pyspark.pandas.tests.connect.groupby.test_parity_rank",
+ "pyspark.pandas.tests.connect.groupby.test_parity_size",
+ "pyspark.pandas.tests.connect.groupby.test_parity_value_counts",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count",
@@ -1380,40 +1380,6 @@ pyspark_pandas_connect_part2 = Module(
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_series",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_frame",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_series",
- "pyspark.pandas.tests.connect.groupby.test_parity_index",
- "pyspark.pandas.tests.connect.groupby.test_parity_describe",
- "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
- "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
- "pyspark.pandas.tests.connect.groupby.test_parity_grouping",
- "pyspark.pandas.tests.connect.groupby.test_parity_missing",
- "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest",
- "pyspark.pandas.tests.connect.groupby.test_parity_raises",
- "pyspark.pandas.tests.connect.groupby.test_parity_rank",
- "pyspark.pandas.tests.connect.groupby.test_parity_size",
- "pyspark.pandas.tests.connect.groupby.test_parity_value_counts",
- ],
- excluded_python_implementations=[
- "PyPy" # Skip these tests under PyPy since they require numpy,
pandas, and pyarrow and
- # they aren't available there
- ],
-)
-
-
-pyspark_pandas_connect_part3 = Module(
- name="pyspark-pandas-connect-part3",
- dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
- source_file_regexes=[
- "python/pyspark/pandas",
- ],
- python_test_goals=[
- # pandas-on-Spark unittests
- "pyspark.pandas.tests.connect.io.test_parity_io",
- "pyspark.pandas.tests.connect.io.test_parity_csv",
- "pyspark.pandas.tests.connect.io.test_parity_feather",
- "pyspark.pandas.tests.connect.io.test_parity_stata",
- "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
- "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
- "pyspark.pandas.tests.connect.io.test_parity_series_conversion",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext_float",
@@ -1440,6 +1406,8 @@ pyspark_pandas_connect_part3 = Module(
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_adv",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_count",
+
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift",
+
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform",
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy,
pandas, and pyarrow and
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index 8215628c1942..0dcd72ca7e47 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -110,27 +110,27 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive',
'hive-thriftserver',
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml',
'pyspark-ml-connect', 'pyspark-mllib',
- 'pyspark-pandas', 'pyspark-pandas-connect-part0',
'pyspark-pandas-connect-part1',
- 'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3',
'pyspark-pandas-slow',
- 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql',
'sql-kafka-0-10']
+ 'pyspark-pandas', 'pyspark-pandas-connect', 'pyspark-pandas-slow',
+ 'pyspark-pandas-slow-connect', 'pyspark-sql', 'pyspark-testing', 'repl',
'sparkr', 'sql',
+ 'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sparkr, modules.sql], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive',
'hive-thriftserver',
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml',
'pyspark-ml-connect', 'pyspark-mllib',
- 'pyspark-pandas', 'pyspark-pandas-connect-part0',
'pyspark-pandas-connect-part1',
- 'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3',
'pyspark-pandas-slow',
- 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql',
'sql-kafka-0-10']
+ 'pyspark-pandas', 'pyspark-pandas-connect', 'pyspark-pandas-slow',
+ 'pyspark-pandas-slow-connect', 'pyspark-sql', 'pyspark-testing', 'repl',
'sparkr', 'sql',
+ 'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sql, modules.core], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests',
'examples', 'graphx',
'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf',
'pyspark-connect',
'pyspark-core', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
'pyspark-pandas',
- 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
'pyspark-pandas-connect-part2',
- 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
'pyspark-resource', 'pyspark-sql',
- 'pyspark-streaming', 'pyspark-testing', 'repl', 'root', 'sparkr', 'sql',
'sql-kafka-0-10',
- 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
+ 'pyspark-pandas-connect', 'pyspark-pandas-slow',
'pyspark-pandas-slow-connect',
+ 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming',
'pyspark-testing', 'repl',
+ 'root', 'sparkr', 'sql', 'sql-kafka-0-10', 'streaming',
'streaming-kafka-0-10',
+ 'streaming-kinesis-asl']
"""
modules_to_test = set()
for module in changed_modules:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]