This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b4c84c917aee [SPARK-54544][PYTHON] Enable flake8 F811 check
b4c84c917aee is described below
commit b4c84c917aee673d26162a92bd95afc743ea7684
Author: Tian Gao <[email protected]>
AuthorDate: Tue Dec 2 08:53:25 2025 +0900
[SPARK-54544][PYTHON] Enable flake8 F811 check
### What changes were proposed in this pull request?
Enabled flake8 [F811](https://www.flake8rules.com/rules/F811.html) check on
our repo and fixed reported issues.
### Why are the changes needed?
I know upgrading lint system is a pain, but we should not just put it aside
forever. Our pinned `flake8` version is not even usable on Python3.12+.
During this "lint fix", I actually discovered a few real bugs - most of
them are silently disabled unittests because there is a test method that has
the same name (probably due to copy/paste). I think this result supports the
idea that we should take lint more seriously.
About `functions.log`, we got it wrong. It's not because `overload` does
not work properly - it's because we have two `log` functions in that gigantic
file. The former one is
[dead](https://app.codecov.io/gh/apache/spark/blob/master/python%2Fpyspark%2Fsql%2Ffunctions%2Fbuiltin.py#L3111).
I just removed that one.
Again, I really think we should upgrade our lint system. I'm trying to do
it slowly - piece by piece, so that people's daily workflow is not impacted too
much.
I hope we can eventually move to a place where all linters are updated and
people can be more confident about their changes.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
`flake8` test on major directories. CI should give a more comprehensive
result.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #53253 from gaogaotiantian/flake8-f811.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/tox.ini | 3 --
python/pyspark/logger/tests/test_logger.py | 1 -
python/pyspark/ml/util.py | 1 -
python/pyspark/mllib/classification.py | 1 -
python/pyspark/mllib/feature.py | 1 -
python/pyspark/mllib/regression.py | 2 +-
python/pyspark/mllib/tree.py | 2 +-
python/pyspark/mllib/util.py | 3 +-
.../pandas/tests/diff_frames_ops/test_corrwith.py | 1 -
.../pandas/tests/diff_frames_ops/test_dot_frame.py | 1 -
.../pyspark/pandas/tests/indexes/test_category.py | 1 -
.../pyspark/pandas/tests/indexes/test_timedelta.py | 1 -
.../pandas/tests/series/test_string_ops_adv.py | 1 -
python/pyspark/pandas/tests/test_namespace.py | 1 -
python/pyspark/pandas/tests/test_numpy_compat.py | 1 -
python/pyspark/pandas/tests/test_utils.py | 2 +-
.../pipelines/tests/test_block_connect_access.py | 1 -
python/pyspark/sql/connect/dataframe.py | 1 -
python/pyspark/sql/connect/group.py | 1 -
python/pyspark/sql/connect/udf.py | 1 -
python/pyspark/sql/functions/builtin.py | 52 +---------------------
python/pyspark/sql/tests/arrow/test_arrow.py | 6 ---
python/pyspark/sql/tests/arrow/test_arrow_udf.py | 14 +++---
.../sql/tests/arrow/test_arrow_udf_scalar.py | 2 +-
.../connect/streaming/test_parity_listener.py | 1 -
.../connect/test_parity_dataframe_query_context.py | 1 -
.../sql/tests/connect/test_parity_geographytype.py | 1 -
.../sql/tests/connect/test_parity_geometrytype.py | 1 -
.../sql/tests/connect/test_parity_observation.py | 1 -
.../sql/tests/connect/test_parity_readwriter.py | 1 -
.../pyspark/sql/tests/connect/test_parity_udf.py | 4 --
.../test_transform_with_state_state_variable.py | 1 -
python/pyspark/sql/tests/pandas/test_pandas_udf.py | 16 +++----
.../sql/tests/pandas/test_pandas_udf_scalar.py | 1 -
python/pyspark/sql/tests/test_python_datasource.py | 9 ++--
python/pyspark/sql/tests/test_udtf.py | 1 -
python/pyspark/sql/tests/test_utils.py | 49 +-------------------
.../tests/udf_type_tests/test_udf_return_types.py | 1 -
python/pyspark/worker.py | 2 -
39 files changed, 26 insertions(+), 165 deletions(-)
diff --git a/dev/tox.ini b/dev/tox.ini
index 8a8e03ec9be3..721266ec2bfa 100644
--- a/dev/tox.ini
+++ b/dev/tox.ini
@@ -18,9 +18,6 @@
ignore =
E203, # Skip as black formatter adds a whitespace around ':'.
E402, # Module top level import is disabled for optional import check, etc.
- # 1. Type hints with def are treated as redefinition (e.g., functions.log).
- # 2. Some are used for testing.
- F811,
# There are too many instances to fix. Ignored for now.
W503,
W504,
diff --git a/python/pyspark/logger/tests/test_logger.py
b/python/pyspark/logger/tests/test_logger.py
index 96d707eb34a2..69b28d2308a7 100644
--- a/python/pyspark/logger/tests/test_logger.py
+++ b/python/pyspark/logger/tests/test_logger.py
@@ -215,7 +215,6 @@ class LoggerTests(LoggerTestsMixin, ReusedSQLTestCase):
if __name__ == "__main__":
- import unittest
from pyspark.logger.tests.test_logger import * # noqa: F401
try:
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 9072e88ca29f..06759bc25269 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -50,7 +50,6 @@ if TYPE_CHECKING:
from py4j.java_gateway import JavaGateway, JavaObject
from pyspark.ml._typing import PipelineStage
from pyspark.ml.base import Params
- from pyspark.ml.wrapper import JavaWrapper
from pyspark.core.context import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
diff --git a/python/pyspark/mllib/classification.py
b/python/pyspark/mllib/classification.py
index bf8fd04dc283..84b59eecbe9d 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -34,7 +34,6 @@ from pyspark.mllib.regression import (
)
from pyspark.mllib.util import Saveable, Loader, inherit_doc
from pyspark.mllib.linalg import Vector
-from pyspark.mllib.regression import LabeledPoint
if TYPE_CHECKING:
from pyspark.mllib._typing import VectorLike
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 915a55595cb5..0d988dcb065a 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -36,7 +36,6 @@ from py4j.java_collections import JavaMap
if TYPE_CHECKING:
from pyspark.mllib._typing import VectorLike
- from py4j.java_collections import JavaMap
__all__ = [
"Normalizer",
diff --git a/python/pyspark/mllib/regression.py
b/python/pyspark/mllib/regression.py
index 87f05bc0979b..b4bd41492706 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -32,7 +32,7 @@ from typing import (
import numpy as np
-from pyspark import RDD, since
+from pyspark import since
from pyspark.streaming.dstream import DStream
from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
from pyspark.mllib.linalg import _convert_to_vector
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index b24bced3ced6..9e945d8a1b94 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -18,7 +18,7 @@
import sys
import random
-from pyspark import RDD, since
+from pyspark import since
from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 5572d9ca8555..caa2c9338a95 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -20,7 +20,7 @@ from functools import reduce
import numpy as np
-from pyspark import SparkContext, since
+from pyspark import since
from pyspark.mllib.common import callMLlibFunc, inherit_doc
from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
from pyspark.sql import DataFrame
@@ -28,7 +28,6 @@ from typing import Generic, Iterable, List, Optional, Tuple,
Type, TypeVar, cast
from pyspark.core.context import SparkContext
from pyspark.mllib.linalg import Vector
from pyspark.core.rdd import RDD
-from pyspark.sql.dataframe import DataFrame
T = TypeVar("T")
L = TypeVar("L", bound="Loader")
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
index 1bd274b45a74..5421de9f3371 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py
@@ -124,7 +124,6 @@ class DiffFramesCorrWithTests(
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.diff_frames_ops.test_corrwith import * # noqa:
F401
try:
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
index 7a94e1858f09..14292b6c3280 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_dot_frame.py
@@ -91,7 +91,6 @@ class DiffFramesDotFrameTests(DiffFramesDotFrameMixin,
PandasOnSparkTestCase, SQ
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.diff_frames_ops.test_dot_frame import * # noqa:
F401
try:
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py
b/python/pyspark/pandas/tests/indexes/test_category.py
index acd80378333e..7d44bd426da7 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -402,7 +402,6 @@ class CategoricalIndexTests(CategoricalIndexTestsMixin,
PandasOnSparkTestCase, T
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.indexes.test_category import * # noqa: F401
try:
diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py
b/python/pyspark/pandas/tests/indexes/test_timedelta.py
index 400bb0e2a365..037d784840ae 100644
--- a/python/pyspark/pandas/tests/indexes/test_timedelta.py
+++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py
@@ -115,7 +115,6 @@ class TimedeltaIndexTests(
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.indexes.test_timedelta import * # noqa: F401
try:
diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index b0e4c69a35ea..f03167d2ad72 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -226,7 +226,6 @@ class SeriesStringOpsAdvTests(
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.series.test_string_ops_adv import * # noqa: F401
try:
diff --git a/python/pyspark/pandas/tests/test_namespace.py
b/python/pyspark/pandas/tests/test_namespace.py
index 141c6873d7f5..145ac2dcb9f3 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -683,7 +683,6 @@ class NamespaceTests(NamespaceTestsMixin,
PandasOnSparkTestCase, SQLTestUtils):
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.test_namespace import * # noqa: F401
try:
diff --git a/python/pyspark/pandas/tests/test_numpy_compat.py
b/python/pyspark/pandas/tests/test_numpy_compat.py
index f754ee08a783..d961a433e181 100644
--- a/python/pyspark/pandas/tests/test_numpy_compat.py
+++ b/python/pyspark/pandas/tests/test_numpy_compat.py
@@ -198,7 +198,6 @@ class NumPyCompatTests(
if __name__ == "__main__":
- import unittest
from pyspark.pandas.tests.test_numpy_compat import * # noqa: F401
try:
diff --git a/python/pyspark/pandas/tests/test_utils.py
b/python/pyspark/pandas/tests/test_utils.py
index 6286df8e5469..89efb85dd61a 100644
--- a/python/pyspark/pandas/tests/test_utils.py
+++ b/python/pyspark/pandas/tests/test_utils.py
@@ -181,7 +181,7 @@ class UtilsTestsMixin:
},
)
- def test_series_error_assert_pandas_equal(self):
+ def test_series_error_assert_pandas_almost_equal_2(self):
series1 = pd.Series([1, 2, 3])
series2 = pd.Series([4, 5, 6])
diff --git a/python/pyspark/pipelines/tests/test_block_connect_access.py
b/python/pyspark/pipelines/tests/test_block_connect_access.py
index 1ad1881b7127..60688f30bfb9 100644
--- a/python/pyspark/pipelines/tests/test_block_connect_access.py
+++ b/python/pyspark/pipelines/tests/test_block_connect_access.py
@@ -17,7 +17,6 @@
import unittest
from pyspark.errors import PySparkException
-from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.connectutils import (
ReusedConnectTestCase,
should_test_connect,
diff --git a/python/pyspark/sql/connect/dataframe.py
b/python/pyspark/sql/connect/dataframe.py
index 65dea5cf1a57..8de8663693f7 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -19,7 +19,6 @@
from pyspark.errors.exceptions.base import (
SessionNotSameException,
PySparkIndexError,
- PySparkAttributeError,
)
from pyspark.resource import ResourceProfile
from pyspark.sql.connect.logging import logger
diff --git a/python/pyspark/sql/connect/group.py
b/python/pyspark/sql/connect/group.py
index d540e721f149..afaf286a7e9c 100644
--- a/python/pyspark/sql/connect/group.py
+++ b/python/pyspark/sql/connect/group.py
@@ -55,7 +55,6 @@ if TYPE_CHECKING:
PandasGroupedMapFunctionWithState,
)
from pyspark.sql.connect.dataframe import DataFrame
- from pyspark.sql.types import StructType
class GroupedData:
diff --git a/python/pyspark/sql/connect/udf.py
b/python/pyspark/sql/connect/udf.py
index 3bc2f87e4bd3..64506731badb 100644
--- a/python/pyspark/sql/connect/udf.py
+++ b/python/pyspark/sql/connect/udf.py
@@ -50,7 +50,6 @@ if TYPE_CHECKING:
UserDefinedFunctionLike,
)
from pyspark.sql.connect.session import SparkSession
- from pyspark.sql.types import StringType
def _create_py_udf(
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 17eda26f76e1..04b800be2372 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -3061,56 +3061,6 @@ def floor(col: "ColumnOrName", scale:
Optional[Union[Column, int]] = None) -> Co
return _invoke_function_over_columns("floor", col, scale) # type:
ignore[arg-type]
-@_try_remote_functions
-def log(col: "ColumnOrName") -> Column:
- """
- Computes the natural logarithm of the given value.
-
- .. versionadded:: 1.4.0
-
- .. versionchanged:: 3.4.0
- Supports Spark Connect.
-
- Parameters
- ----------
- col : :class:`~pyspark.sql.Column` or column name
- column to calculate natural logarithm for.
-
- Returns
- -------
- :class:`~pyspark.sql.Column`
- natural logarithm of the given value.
-
- Examples
- --------
- Example 1: Compute the natural logarithm of E
-
- >>> from pyspark.sql import functions as sf
- >>> spark.range(1).select(sf.log(sf.e())).show()
- +-------+
- |ln(E())|
- +-------+
- | 1.0|
- +-------+
-
- Example 2: Compute the natural logarithm of invalid values
-
- >>> from pyspark.sql import functions as sf
- >>> spark.sql(
- ... "SELECT * FROM VALUES (-1), (0), (FLOAT('NAN')), (NULL) AS
TAB(value)"
- ... ).select("*", sf.log("value")).show()
- +-----+---------+
- |value|ln(value)|
- +-----+---------+
- | -1.0| NULL|
- | 0.0| NULL|
- | NaN| NaN|
- | NULL| NULL|
- +-----+---------+
- """
- return _invoke_function_over_columns("log", col)
-
-
@_try_remote_functions
def log10(col: "ColumnOrName") -> Column:
"""
@@ -8342,7 +8292,7 @@ def when(condition: Column, value: Any) -> Column:
return _invoke_function("when", condition._jc, v)
-@overload # type: ignore[no-redef]
+@overload
def log(arg1: "ColumnOrName") -> Column:
...
diff --git a/python/pyspark/sql/tests/arrow/test_arrow.py
b/python/pyspark/sql/tests/arrow/test_arrow.py
index 79d8bf77d9d5..19e579cb6778 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow.py
@@ -61,7 +61,6 @@ from pyspark.testing.sqlutils import (
from pyspark.errors import ArithmeticException, PySparkTypeError,
UnsupportedOperationException
from pyspark.loose_version import LooseVersion
from pyspark.util import is_remote_only
-from pyspark.loose_version import LooseVersion
if have_pandas:
import pandas as pd
@@ -1009,11 +1008,6 @@ class ArrowTestsMixin:
expected[r][e] == result[r][e], f"{expected[r][e]}
== {result[r][e]}"
)
- def test_createDataFrame_pandas_with_struct_type(self):
- for arrow_enabled in [True, False]:
- with self.subTest(arrow_enabled=arrow_enabled):
-
self.check_createDataFrame_pandas_with_struct_type(arrow_enabled)
-
def test_createDataFrame_arrow_with_struct_type_nulls(self):
t = pa.table(
{
diff --git a/python/pyspark/sql/tests/arrow/test_arrow_udf.py
b/python/pyspark/sql/tests/arrow/test_arrow_udf.py
index b56ed75acad9..51190f6d2a3d 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow_udf.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow_udf.py
@@ -164,13 +164,13 @@ class ArrowUDFTestsMixin:
with self.assertRaises(ParseException):
@arrow_udf("blah")
- def foo(x):
+ def _(x):
return x
with self.assertRaises(PySparkTypeError) as pe:
@arrow_udf(returnType="double",
functionType=PandasUDFType.SCALAR)
- def foo(df):
+ def _(df):
return df
self.check_error(
@@ -185,7 +185,7 @@ class ArrowUDFTestsMixin:
with self.assertRaises(PySparkTypeError) as pe:
@arrow_udf(functionType=ArrowUDFType.SCALAR)
- def foo(x):
+ def _(x):
return x
self.check_error(
@@ -197,7 +197,7 @@ class ArrowUDFTestsMixin:
with self.assertRaises(PySparkTypeError) as pe:
@arrow_udf("double", 100)
- def foo(x):
+ def _(x):
return x
self.check_error(
@@ -209,7 +209,7 @@ class ArrowUDFTestsMixin:
with self.assertRaises(PySparkTypeError) as pe:
@arrow_udf(returnType=PandasUDFType.GROUPED_MAP)
- def foo(df):
+ def _(df):
return df
self.check_error(
@@ -224,13 +224,13 @@ class ArrowUDFTestsMixin:
with self.assertRaisesRegex(ValueError, "0-arg
arrow_udfs.*not.*supported"):
@arrow_udf(LongType(), ArrowUDFType.SCALAR)
- def zero_with_type():
+ def _():
return 1
with self.assertRaisesRegex(ValueError, "0-arg
arrow_udfs.*not.*supported"):
@arrow_udf(LongType(), ArrowUDFType.SCALAR_ITER)
- def zero_with_type():
+ def _():
yield 1
yield 2
diff --git a/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
b/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
index 3bd00d2cc921..a0d805a3ab27 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow_udf_scalar.py
@@ -27,7 +27,7 @@ from typing import Iterator, Tuple
from pyspark.util import PythonEvalType
from pyspark.sql.functions import arrow_udf, ArrowUDFType
-from pyspark.sql import Row, functions as F
+from pyspark.sql import functions as F
from pyspark.sql.types import (
IntegerType,
ByteType,
diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
b/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
index f05f982d2d14..cc750f2b3439 100644
--- a/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
+++ b/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
@@ -313,7 +313,6 @@ class
StreamingListenerParityTests(StreamingListenerTestsMixin, ReusedConnectTes
if __name__ == "__main__":
- import unittest
from pyspark.sql.tests.connect.streaming.test_parity_listener import * #
noqa: F401
try:
diff --git
a/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
b/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
index 59107363571e..b83a2d5f198b 100644
--- a/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
+++ b/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py
@@ -26,7 +26,6 @@ class
DataFrameQueryContextParityTests(DataFrameQueryContextTestsMixin, ReusedCo
if __name__ == "__main__":
- import unittest
from pyspark.sql.tests.connect.test_parity_dataframe_query_context import
* # noqa: F401
try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_geographytype.py
b/python/pyspark/sql/tests/connect/test_parity_geographytype.py
index 501bbed20ff1..a85edbc69c2a 100644
--- a/python/pyspark/sql/tests/connect/test_parity_geographytype.py
+++ b/python/pyspark/sql/tests/connect/test_parity_geographytype.py
@@ -26,7 +26,6 @@ class GeographyTypeParityTest(GeographyTypeTestMixin,
ReusedConnectTestCase):
if __name__ == "__main__":
- import unittest
from pyspark.sql.tests.connect.test_parity_geographytype import * # noqa:
F401
try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
b/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
index b95321b3c61b..640ead4c3244 100644
--- a/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
+++ b/python/pyspark/sql/tests/connect/test_parity_geometrytype.py
@@ -26,7 +26,6 @@ class GeometryTypeParityTest(GeometryTypeTestMixin,
ReusedConnectTestCase):
if __name__ == "__main__":
- import unittest
from pyspark.sql.tests.connect.test_parity_geometrytype import * # noqa:
F401
try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_observation.py
b/python/pyspark/sql/tests/connect/test_parity_observation.py
index e16053d5a082..ebe0ed78e5c4 100644
--- a/python/pyspark/sql/tests/connect/test_parity_observation.py
+++ b/python/pyspark/sql/tests/connect/test_parity_observation.py
@@ -29,7 +29,6 @@ class DataFrameObservationParityTests(
if __name__ == "__main__":
- import unittest
from pyspark.sql.tests.connect.test_parity_observation import * # noqa:
F401
try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_readwriter.py
b/python/pyspark/sql/tests/connect/test_parity_readwriter.py
index f83f3edbfa78..a95cea3e7345 100644
--- a/python/pyspark/sql/tests/connect/test_parity_readwriter.py
+++ b/python/pyspark/sql/tests/connect/test_parity_readwriter.py
@@ -37,7 +37,6 @@ class ReadwriterV2ParityTests(ReadwriterV2TestsMixin,
ReusedConnectTestCase):
if __name__ == "__main__":
- import unittest
from pyspark.sql.tests.connect.test_parity_readwriter import * # noqa:
F401
try:
diff --git a/python/pyspark/sql/tests/connect/test_parity_udf.py
b/python/pyspark/sql/tests/connect/test_parity_udf.py
index 5507f8e9f289..4b027f6fca37 100644
--- a/python/pyspark/sql/tests/connect/test_parity_udf.py
+++ b/python/pyspark/sql/tests/connect/test_parity_udf.py
@@ -56,10 +56,6 @@ class UDFParityTests(BaseUDFTestsMixin,
ReusedConnectTestCase):
def test_nondeterministic_udf3(self):
super().test_nondeterministic_udf3()
- @unittest.skip("Spark Connect doesn't support RDD but the test depends on
it.")
- def test_worker_original_stdin_closed(self):
- super().test_worker_original_stdin_closed()
-
@unittest.skip("Spark Connect does not support SQLContext but the test
depends on it.")
def test_udf_on_sql_context(self):
super().test_udf_on_sql_context()
diff --git
a/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
b/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
index 7f9eb8979c08..c5172ff6af3f 100644
---
a/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
+++
b/python/pyspark/sql/tests/pandas/streaming/test_transform_with_state_state_variable.py
@@ -26,7 +26,6 @@ from pyspark.testing.sqlutils import (
ReusedSQLTestCase,
)
-from pyspark.testing.sqlutils import ReusedSQLTestCase
from
pyspark.sql.tests.pandas.streaming.test_pandas_transform_with_state_state_variable
import (
TransformWithStateStateVariableTestsMixin,
)
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf.py
b/python/pyspark/sql/tests/pandas/test_pandas_udf.py
index 017698f318d5..6b397bc63626 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_udf.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_udf.py
@@ -170,13 +170,13 @@ class PandasUDFTestsMixin:
with self.assertRaises(ParseException):
@pandas_udf("blah")
- def foo(x):
+ def _(x):
return x
with self.assertRaises(PySparkTypeError) as pe:
@pandas_udf(returnType="double",
functionType=PandasUDFType.GROUPED_MAP)
- def foo(df):
+ def _(df):
return df
self.check_error(
@@ -193,14 +193,14 @@ class PandasUDFTestsMixin:
with self.assertRaisesRegex(ValueError, "Invalid function"):
@pandas_udf(returnType="k int, v double",
functionType=PandasUDFType.GROUPED_MAP)
- def foo(k, v, w):
+ def _(k, v, w):
return k
def check_udf_wrong_arg(self):
with self.assertRaises(PySparkTypeError) as pe:
@pandas_udf(functionType=PandasUDFType.SCALAR)
- def foo(x):
+ def _(x):
return x
self.check_error(
@@ -212,7 +212,7 @@ class PandasUDFTestsMixin:
with self.assertRaises(PySparkTypeError) as pe:
@pandas_udf("double", 100)
- def foo(x):
+ def _(x):
return x
self.check_error(
@@ -227,20 +227,20 @@ class PandasUDFTestsMixin:
with self.assertRaisesRegex(ValueError, "0-arg
pandas_udfs.*not.*supported"):
@pandas_udf(LongType(), PandasUDFType.SCALAR)
- def zero_with_type():
+ def _():
return 1
with self.assertRaisesRegex(ValueError, "0-arg
pandas_udfs.*not.*supported"):
@pandas_udf(LongType(), PandasUDFType.SCALAR_ITER)
- def zero_with_type():
+ def _():
yield 1
yield 2
with self.assertRaises(PySparkTypeError) as pe:
@pandas_udf(returnType=PandasUDFType.GROUPED_MAP)
- def foo(df):
+ def _(df):
return df
self.check_error(
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
b/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
index b936a9240e52..88f2168f131b 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
@@ -53,7 +53,6 @@ from pyspark.sql.types import (
StringType,
ArrayType,
StructField,
- Row,
TimestampType,
MapType,
DateType,
diff --git a/python/pyspark/sql/tests/test_python_datasource.py
b/python/pyspark/sql/tests/test_python_datasource.py
index e5171876656c..9a4782e7322f 100644
--- a/python/pyspark/sql/tests/test_python_datasource.py
+++ b/python/pyspark/sql/tests/test_python_datasource.py
@@ -20,7 +20,6 @@ import tempfile
import unittest
import logging
import json
-import os
from dataclasses import dataclass
from datetime import datetime
from decimal import Decimal
@@ -852,9 +851,9 @@ class BasePythonDataSourceTestsMixin:
return "x string"
def reader(self, schema):
- return TestReader()
+ return TestReader2()
- class TestReader(DataSourceReader):
+ class TestReader2(DataSourceReader):
def read(self, partition):
ctypes.string_at(0)
yield "x",
@@ -894,9 +893,9 @@ class BasePythonDataSourceTestsMixin:
return "test"
def writer(self, schema, overwrite):
- return TestWriter()
+ return TestWriter2()
- class TestWriter(DataSourceWriter):
+ class TestWriter2(DataSourceWriter):
def write(self, iterator):
return WriterCommitMessage()
diff --git a/python/pyspark/sql/tests/test_udtf.py
b/python/pyspark/sql/tests/test_udtf.py
index 5ded5aa67b4e..389df5b5a6cf 100644
--- a/python/pyspark/sql/tests/test_udtf.py
+++ b/python/pyspark/sql/tests/test_udtf.py
@@ -40,7 +40,6 @@ from pyspark.sql.functions import (
array,
col,
create_map,
- array,
lit,
named_struct,
udf,
diff --git a/python/pyspark/sql/tests/test_utils.py
b/python/pyspark/sql/tests/test_utils.py
index 08d360a19e81..5f85c1886b4b 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -923,7 +923,7 @@ class UtilsTestsMixin:
assertDataFrameEqual(df1, df2, checkRowOrder=True)
@unittest.skipIf(not have_pandas or not have_pyarrow, "no pandas or
pyarrow dependency")
- def test_assert_equal_exact_pandas_on_spark_df(self):
+ def test_assert_equal_exact_pandas_on_spark_df_no_order(self):
import pyspark.pandas as ps
df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
@@ -1543,53 +1543,6 @@ class UtilsTestsMixin:
messageParameters={"error_msg": error_msg},
)
- def test_list_row_unequal_schema(self):
- df1 = self.spark.createDataFrame(
- data=[
- (1, 3000),
- (2, 1000),
- (3, 10),
- ],
- schema=["id", "amount"],
- )
-
- list_of_rows = [Row(id=1, amount=300), Row(id=2, amount=100),
Row(id=3, amount=10)]
-
- rows_str1 = ""
- rows_str2 = ""
-
- # count different rows
- for r1, r2 in list(zip_longest(df1, list_of_rows)):
- rows_str1 += str(r1) + "\n"
- rows_str2 += str(r2) + "\n"
-
- generated_diff = _context_diff(
- actual=rows_str1.splitlines(), expected=rows_str2.splitlines(), n=3
- )
-
- error_msg = "Results do not match: "
- percent_diff = (2 / 3) * 100
- error_msg += "( %.5f %% )" % percent_diff
- error_msg += "\n" + "\n".join(generated_diff)
-
- with self.assertRaises(PySparkAssertionError) as pe:
- assertDataFrameEqual(df1, list_of_rows)
-
- self.check_error(
- exception=pe.exception,
- errorClass="DIFFERENT_ROWS",
- messageParameters={"error_msg": error_msg},
- )
-
- with self.assertRaises(PySparkAssertionError) as pe:
- assertDataFrameEqual(df1, list_of_rows, checkRowOrder=True)
-
- self.check_error(
- exception=pe.exception,
- errorClass="DIFFERENT_ROWS",
- messageParameters={"error_msg": error_msg},
- )
-
def test_list_row_unequal_schema(self):
from pyspark.sql import Row
diff --git a/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
b/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
index 0ad17e919754..5e307110559b 100644
--- a/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
+++ b/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
@@ -21,7 +21,6 @@ import os
import platform
import unittest
from decimal import Decimal
-import numpy as np
import pandas as pd
from pyspark.sql import Row
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 96aac6083bf2..ee3d71ee5ffb 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -19,7 +19,6 @@
Worker that receives input from Piped RDD.
"""
import datetime
-import itertools
import os
import sys
import dataclasses
@@ -54,7 +53,6 @@ from pyspark.sql.functions import
SkipRestOfInputTableException
from pyspark.sql.pandas.serializers import (
ArrowStreamPandasUDFSerializer,
ArrowStreamPandasUDTFSerializer,
- GroupPandasUDFSerializer,
GroupArrowUDFSerializer,
CogroupArrowUDFSerializer,
CogroupPandasUDFSerializer,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]