This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 12b823c61c6 [SPARK-42008][CONNECT][TESTS] Reuse pyspark.sql.tests.test_datasources test cases
12b823c61c6 is described below
commit 12b823c61c6265395372285acb1bc26fa8431e09
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Jan 12 17:36:41 2023 +0900
[SPARK-42008][CONNECT][TESTS] Reuse pyspark.sql.tests.test_datasources test cases
### What changes were proposed in this pull request?
This PR reuses PySpark `pyspark.sql.tests.test_datasources` tests in Spark
Connect that pass for now.
### Why are the changes needed?
To make sure on the test coverage.
### Does this PR introduce _any_ user-facing change?
No, test-only.
### How was this patch tested?
Manually ran it in my local.
Closes #39526 from HyukjinKwon/SPARK-42008.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 1 +
.../sql/tests/connect/test_parity_datasources.py | 65 ++++++++++++++++++++++
python/pyspark/sql/tests/test_datasources.py | 6 +-
3 files changed, 71 insertions(+), 1 deletion(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 0bd82c7122d..0ffffe39323 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -518,6 +518,7 @@ pyspark_connect = Module(
"pyspark.sql.tests.connect.test_connect_basic",
"pyspark.sql.tests.connect.test_connect_function",
"pyspark.sql.tests.connect.test_connect_column",
+ "pyspark.sql.tests.connect.test_parity_datasources",
"pyspark.sql.tests.connect.test_parity_catalog",
"pyspark.sql.tests.connect.test_parity_functions",
"pyspark.sql.tests.connect.test_parity_group",
diff --git a/python/pyspark/sql/tests/connect/test_parity_datasources.py b/python/pyspark/sql/tests/connect/test_parity_datasources.py
new file mode 100644
index 00000000000..83a9c4414e9
--- /dev/null
+++ b/python/pyspark/sql/tests/connect/test_parity_datasources.py
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+from pyspark.sql.tests.test_datasources import DataSourcesTestsMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+
+
+class DataSourcesParityTests(DataSourcesTestsMixin, ReusedConnectTestCase):
+
+ # TODO(SPARK-42011): Implement DataFrameReader.csv
+ @unittest.skip("Fails in Spark Connect, should enable.")
+ def test_checking_csv_header(self):
+ super().test_checking_csv_header()
+
+ @unittest.skip("Spark Connect does not support RDD but the tests depend on them.")
+ def test_csv_sampling_ratio(self):
+ super().test_csv_sampling_ratio()
+
+ @unittest.skip("Spark Connect does not support RDD but the tests depend on them.")
+ def test_json_sampling_ratio(self):
+ super().test_json_sampling_ratio()
+
+ # TODO(SPARK-42011): Implement DataFrameReader.csv
+ @unittest.skip("Fails in Spark Connect, should enable.")
+ def test_multiline_csv(self):
+ super().test_multiline_csv()
+
+ # TODO(SPARK-42012): Implement DataFrameReader.orc
+ @unittest.skip("Fails in Spark Connect, should enable.")
+ def test_read_multiple_orc_file(self):
+ super().test_read_multiple_orc_file()
+
+ # TODO(SPARK-42013): Implement DataFrameReader.text to take multiple paths
+ @unittest.skip("Fails in Spark Connect, should enable.")
+ def test_read_text_file_list(self):
+ super().test_read_text_file_list()
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.sql.tests.connect.test_parity_datasources import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/sql/tests/test_datasources.py b/python/pyspark/sql/tests/test_datasources.py
index 80ab8a33162..52880ae4631 100644
--- a/python/pyspark/sql/tests/test_datasources.py
+++ b/python/pyspark/sql/tests/test_datasources.py
@@ -23,7 +23,7 @@ from pyspark.sql.types import IntegerType, StructField, StructType, LongType, St
from pyspark.testing.sqlutils import ReusedSQLTestCase
-class DataSourcesTests(ReusedSQLTestCase):
+class DataSourcesTestsMixin:
def test_linesep_text(self):
df = self.spark.read.text("python/test_support/sql/ages_newlines.csv", lineSep=",")
expected = [
@@ -193,6 +193,10 @@ class DataSourcesTests(ReusedSQLTestCase):
shutil.rmtree(path)
+class DataSourcesTests(DataSourcesTestsMixin, ReusedSQLTestCase):
+ pass
+
+
if __name__ == "__main__":
import unittest
from pyspark.sql.tests.test_datasources import * # noqa: F401
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]