This is an automated email from the ASF dual-hosted git repository.

HyukjinKwon pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.x by this push:
     new 26c7e98a8b7a [SPARK-57192][PYTHON] Fix classic addArtifacts with multiple paths
26c7e98a8b7a is described below

commit 26c7e98a8b7a25b5a5d3f5640cf925649c1e4bbe
Author: Bobby Wang <[email protected]>
AuthorDate: Tue Jun 2 07:39:23 2026 +0900

    [SPARK-57192][PYTHON] Fix classic addArtifacts with multiple paths
    
    ### What changes were proposed in this pull request?
    
      This PR fixes classic PySpark `SparkSession.addArtifacts` to handle multiple paths in
    one call.
    
      The classic implementation previously forwarded all paths as positional arguments to
    `SparkContext.addPyFile`, `SparkContext.addArchive`, or `SparkContext.addFile`. Those
    APIs accept one path at a time, so this PR updates classic `addArtifacts` to call the
    underlying SparkContext API once per path.
    
      This PR also adds regression coverage for adding multiple Python files in a single
    `addArtifacts(..., pyfile=True)` call.
    
      ### Why are the changes needed?
    
      Classic PySpark currently fails for useful multi-path calls such as:
    
    ```python
    spark.addArtifacts("a.py", "b.py", "c.py", pyfile=True)
    ```
      with:
    
    ```
    TypeError: SparkContext.addPyFile() takes 2 positional arguments but 4 were given
    ```
      The public API accepts `*path`, so classic Spark should support multiple artifacts
      consistently.
    
      ### Does this PR introduce any user-facing change?
    
      Yes. Classic PySpark users can now add multiple artifacts in one
      `SparkSession.addArtifacts` call when using `pyfile=True`, `archive=True`, or `file=True`.
    
      ### How was this patch tested?
    
      Added a regression test for adding multiple Python files in one call.
    
      Also manually ran:
    
    ```
    env -u SPARK_CONF_DIR PYTHONPATH="python:python/lib/py4j-0.10.9.9-src.zip" \
    SPARK_HOME="$PWD" \
        python3 -m unittest pyspark.sql.tests.test_artifact.ArtifactTests.test_add_multiple_pyfiles
    ```
    
      and a standalone local Spark repro covering `pyfile=True`, `file=True`, and `archive=True`.
    
      ### Was this patch authored or co-authored using generative AI tooling?
    
      Generated-by: OpenAI Codex (GPT-5.5)
    
    Closes #56245 from wbo4958/fix-addartifacts.
    
    Authored-by: Bobby Wang <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
    (cherry picked from commit 224231815105f6c0f333e866ef50c48810720f37)
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/sql/session.py             |  9 ++++++---
 python/pyspark/sql/tests/test_artifact.py | 32 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 3878af7bd5de..a1fd6ba72bef 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -2288,11 +2288,14 @@ class SparkSession(SparkConversionMixin):
                         messageParameters={"normalized_path": normalized_path},
                     )
         if archive:
-            self._sc.addArchive(*path)
+            for p in path:
+                self._sc.addArchive(p)
         elif pyfile:
-            self._sc.addPyFile(*path)
+            for p in path:
+                self._sc.addPyFile(p)
         elif file:
-            self._sc.addFile(*path)  # type: ignore[arg-type]
+            for p in path:
+                self._sc.addFile(p)
 
     addArtifact = addArtifacts
 
diff --git a/python/pyspark/sql/tests/test_artifact.py 
b/python/pyspark/sql/tests/test_artifact.py
index 777b7f35f536..7351762dadca 100644
--- a/python/pyspark/sql/tests/test_artifact.py
+++ b/python/pyspark/sql/tests/test_artifact.py
@@ -20,6 +20,7 @@ import tempfile
 from pyspark.sql.tests.connect.client.test_artifact import ArtifactTestsMixin
 from pyspark.testing.sqlutils import ReusedSQLTestCase
 from pyspark.errors import PySparkRuntimeError
+from pyspark.sql.functions import assert_true, lit, udf
 
 
 class ArtifactTests(ArtifactTestsMixin, ReusedSQLTestCase):
@@ -36,6 +37,37 @@ class ArtifactTests(ArtifactTestsMixin, ReusedSQLTestCase):
         # file from different session.
         self.check_add_pyfile(self.spark.newSession())
 
+    def test_add_multiple_pyfiles(self):
+        def check_add_multiple_pyfiles(spark_session):
+            with 
tempfile.TemporaryDirectory(prefix="check_add_multiple_pyfiles") as d:
+                pyfile_paths = []
+                for name, value in [
+                    ("my_pyfile_a.py", 1),
+                    ("my_pyfile_b.py", 2),
+                    ("my_pyfile_c.py", 3),
+                ]:
+                    pyfile_path = os.path.join(d, name)
+                    with open(pyfile_path, "w") as f:
+                        f.write(f"my_func = lambda: {value}")
+                    pyfile_paths.append(pyfile_path)
+
+                @udf("int")
+                def func(x):
+                    import my_pyfile_a
+                    import my_pyfile_b
+                    import my_pyfile_c
+
+                    return my_pyfile_a.my_func() + my_pyfile_b.my_func() + 
my_pyfile_c.my_func()
+
+                spark_session.addArtifacts(*pyfile_paths, pyfile=True)
+                spark_session.range(1).select(assert_true(func("id") == 
lit(6))).show()
+
+        check_add_multiple_pyfiles(self.spark)
+
+        # Test multi sessions. Should be able to add the same
+        # files from different session.
+        check_add_multiple_pyfiles(self.spark.newSession())
+
     def test_add_file(self):
         self.check_add_file(self.spark)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to