This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 46b033f0bd GH-47728: [Python] Check the source argument in parquet.read_table (#48008)
46b033f0bd is described below

commit 46b033f0bd18ce5c0fcc69cd6523cdbb7acd48cd
Author: Bogdan Romenskii <[email protected]>
AuthorDate: Fri Nov 14 09:52:04 2025 +0100

    GH-47728: [Python] Check the source argument in parquet.read_table (#48008)
    
    ### Rationale for this change
    See #47728. Check the `source` argument in `pyarrow.parquet.read_table` when `pyarrow.dataset` is not available.
    
    ### What changes are included in this PR?
    Check the `source` argument and raise `ValueError` if it is either a list of `.parquet` files or a directory.
    
    ### Are these changes tested?
    Yes
    
    ### Are there any user-facing changes?
    No
    
    If the `source` argument is a directory, I decided not to check for it
    directly, but instead to catch the exception raised by `fs.open_input_file`,
    which already performs that check, and to add an extra exception on top of
    the stack that explains the actual reason.
    * GitHub Issue: #47728
    
    Authored-by: Bogdan Romenskii <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/pyarrow/parquet/core.py             | 15 ++++++++++++++-
 python/pyarrow/tests/parquet/test_basic.py | 14 ++++++++++++--
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 24cb586c82..5f62a3fc4f 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1887,10 +1887,23 @@ def read_table(source, *, columns=None, use_threads=True,
                 "the 'schema' argument is not supported when the "
                 "pyarrow.dataset module is not available"
             )
+        if isinstance(source, list):
+            raise ValueError(
+                "the 'source' argument cannot be a list of files "
+                "when the pyarrow.dataset module is not available"
+            )
+
         filesystem, path = _resolve_filesystem_and_path(source, filesystem)
         if filesystem is not None:
+            if not filesystem.get_file_info(path).is_file:
+                raise ValueError(
+                    "the 'source' argument should be "
+                    "an existing parquet file and not a directory "
+                    "when the pyarrow.dataset module is not available"
+                )
+
             source = filesystem.open_input_file(path)
-        # TODO test that source is not a directory or a list
+
         dataset = ParquetFile(
             source, read_dictionary=read_dictionary,
             binary_type=binary_type,
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 591bcffc1a..3b991fdd57 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import os
+import sys
 from collections import OrderedDict
 import io
 import warnings
@@ -185,8 +186,7 @@ def test_read_table_without_dataset(tempdir):
             pq.read_table(path, partitioning=['week', 'color'])
         with pytest.raises(ValueError, match="the 'schema' argument"):
             pq.read_table(path, schema=table.schema)
-        # Error message varies depending on OS
-        with pytest.raises(OSError):
+        with pytest.raises(ValueError, match="the 'source' argument"):
             pq.read_table(tempdir)
         result = pq.read_table(path)
         assert result == table
@@ -993,3 +993,13 @@ def test_checksum_write_to_dataset(tempdir):
     # checksum verification enabled raises an exception
     with pytest.raises(OSError, match="CRC checksum verification"):
         _ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
+
+
+@pytest.mark.parametrize(
+    "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]])
+def test_read_table_raises_value_error_when_ds_is_unavailable(monkeypatch, source):
+    # GH-47728
+    monkeypatch.setitem(sys.modules, "pyarrow.dataset", None)
+
+    with pytest.raises(ValueError, match="the 'source' argument"):
+        pq.read_table(source=source)

Reply via email to