This is an automated email from the ASF dual-hosted git repository.
HyukjinKwon pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.x by this push:
new cc4eed582a37 [SPARK-56790][PYTHON][TESTS] Remove duplicated test cases
for test_data_source_segfault
cc4eed582a37 is described below
commit cc4eed582a37e69a522f1b31bac549f175b049e8
Author: Tian Gao <[email protected]>
AuthorDate: Tue May 12 07:01:56 2026 +0900
[SPARK-56790][PYTHON][TESTS] Remove duplicated test cases for
test_data_source_segfault
### What changes were proposed in this pull request?
Remove some unnecessary test cases in `test_data_source_segfault`.
### Why are the changes needed?
After we reorganized the planner worker code, all planner workers that data
sources use share the exact same code path for `faulthandler`. The test is
pretty expensive because it needs to spawn a new process (the process
segfaults, so it is not reusable). We don't need to test it on different
planner workers. I kept one of the planner worker cases and the case that
segfaults on an actual Python UDF worker. This should save us about 30s.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
CI.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #55749 from gaogaotiantian/speed-up-datasource-segfault.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit b71c4aef1bb0accc309f402d005e8bfc94bfb61c)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/tests/test_python_datasource.py | 70 ----------------------
1 file changed, 70 deletions(-)
diff --git a/python/pyspark/sql/tests/test_python_datasource.py b/python/pyspark/sql/tests/test_python_datasource.py
index 9aa1b643c04a..675c4b19fad9 100644
--- a/python/pyspark/sql/tests/test_python_datasource.py
+++ b/python/pyspark/sql/tests/test_python_datasource.py
@@ -849,32 +849,6 @@ class BasePythonDataSourceTestsMixin:
with self.assertRaisesRegex(Exception, expected):
self.spark.read.format("test").load().show()
- with self.subTest(worker="pyspark.sql.worker.plan_data_source_read"):
-
- class TestDataSource(DataSource):
- @classmethod
- def name(cls):
- return "test"
-
- def schema(self):
- return "x string"
-
- def reader(self, schema):
- return TestReader()
-
- class TestReader(DataSourceReader):
- def partitions(self):
- ctypes.string_at(0)
- return []
-
- def read(self, partition):
- return []
-
- self.spark.dataSource.register(TestDataSource)
-
- with self.assertRaisesRegex(Exception, expected):
- self.spark.read.format("test").load().show()
-
with self.subTest(worker="pyspark.worker"):
class TestDataSource(DataSource):
@@ -898,50 +872,6 @@ class BasePythonDataSourceTestsMixin:
with self.assertRaisesRegex(Exception, expected):
self.spark.read.format("test").load().show()
- with self.subTest(worker="pyspark.sql.worker.write_into_data_source"):
-
- class TestDataSource(DataSource):
- @classmethod
- def name(cls):
- return "test"
-
- def writer(self, schema, overwrite):
- return TestWriter()
-
- class TestWriter(DataSourceWriter):
- def write(self, iterator):
- ctypes.string_at(0)
- return WriterCommitMessage()
-
- self.spark.dataSource.register(TestDataSource)
-
- with tempfile.TemporaryDirectory(prefix="test_segfault_") as d:
- with self.assertRaisesRegex(Exception, expected):
- self.spark.range(10).write.format("test").mode("append").save(d)
-
- with self.subTest(worker="pyspark.sql.worker.commit_data_source_write"):
-
- class TestDataSource(DataSource):
- @classmethod
- def name(cls):
- return "test"
-
- def writer(self, schema, overwrite):
- return TestWriter2()
-
- class TestWriter2(DataSourceWriter):
- def write(self, iterator):
- return WriterCommitMessage()
-
- def commit(self, messages):
- ctypes.string_at(0)
-
- self.spark.dataSource.register(TestDataSource)
-
- with tempfile.TemporaryDirectory(prefix="test_segfault_") as d:
- with self.assertRaisesRegex(Exception, expected):
- self.spark.range(10).write.format("test").mode("append").save(d)
-
@unittest.skipIf(is_remote_only(), "Requires JVM access")
def test_data_source_reader_with_logging(self):
logger = logging.getLogger("test_data_source_reader")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]