This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2cbf122793 GH-47172: [Python] Add a utility function to create Arrow
table instead of pandas df (#47199)
2cbf122793 is described below
commit 2cbf1227939d42cb08998eb8fa666120c44ff254
Author: egolearner <[email protected]>
AuthorDate: Mon Aug 4 15:46:15 2025 +0800
GH-47172: [Python] Add a utility function to create Arrow table instead of
pandas df (#47199)
### Rationale for this change
resolve #47172
### What changes are included in this PR?
Add a utility function to create an Arrow table instead of a pandas DataFrame
### Are these changes tested?
yes
### Are there any user-facing changes?
no
* GitHub Issue: #47172
Authored-by: egolearner <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
python/pyarrow/tests/parquet/common.py | 18 ++++++++++----
python/pyarrow/tests/parquet/test_basic.py | 12 ++++-----
python/pyarrow/tests/parquet/test_dataset.py | 29 ++++++++++------------
.../pyarrow/tests/parquet/test_parquet_writer.py | 7 +++---
4 files changed, 34 insertions(+), 32 deletions(-)
diff --git a/python/pyarrow/tests/parquet/common.py
b/python/pyarrow/tests/parquet/common.py
index 4f5946649b..5390a24b90 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -95,11 +95,9 @@ def _range_integers(size, dtype):
return pa.array(np.arange(size, dtype=dtype))
-def _test_dataframe(size=10000, seed=0):
- import pandas as pd
-
+def _test_dict(size=10000, seed=0):
np.random.seed(seed)
- df = pd.DataFrame({
+ return {
'uint8': _random_integers(size, np.uint8),
'uint16': _random_integers(size, np.uint16),
'uint32': _random_integers(size, np.uint32),
@@ -114,13 +112,23 @@ def _test_dataframe(size=10000, seed=0):
'strings': [util.rands(10) for i in range(size)],
'all_none': [None] * size,
'all_none_category': [None] * size
- })
+ }
+
+
+def _test_dataframe(size=10000, seed=0):
+ import pandas as pd
+
+ df = pd.DataFrame(_test_dict(size, seed))
# TODO(PARQUET-1015)
# df['all_none_category'] = df['all_none_category'].astype('category')
return df
+def _test_table(size=10000, seed=0):
+ return pa.Table.from_pydict(_test_dict(size, seed))
+
+
def make_sample_file(table_or_df):
import pyarrow.parquet as pq
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 67515c5e24..591bcffc1a 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -28,7 +28,7 @@ import pyarrow as pa
from pyarrow import fs
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
- _test_dataframe)
+ _test_table)
try:
import pyarrow.parquet as pq
@@ -76,20 +76,18 @@ def test_set_data_page_size():
_check_roundtrip(t, data_page_size=target_page_size)
[email protected]
[email protected]
def test_set_write_batch_size():
- df = _test_dataframe(100)
- table = pa.Table.from_pandas(df, preserve_index=False)
+ table = _test_table(100)
_check_roundtrip(
table, data_page_size=10, write_batch_size=1, version='2.4'
)
[email protected]
[email protected]
def test_set_dictionary_pagesize_limit():
- df = _test_dataframe(100)
- table = pa.Table.from_pandas(df, preserve_index=False)
+ table = _test_table(100)
_check_roundtrip(table, dictionary_pagesize_limit=1,
data_page_size=10, version='2.4')
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index b8939443c1..d3e9cda730 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -38,7 +38,7 @@ from pyarrow.util import guid
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (
- _read_table, _test_dataframe, _write_table)
+ _read_table, _test_dataframe, _test_table, _write_table)
except ImportError:
pq = None
@@ -742,15 +742,14 @@ def test_dataset_read_pandas(tempdir):
tm.assert_frame_equal(result.reindex(columns=expected.columns), expected)
[email protected]
[email protected]
def test_dataset_memory_map(tempdir):
# ARROW-2627: Check that we can use ParquetDataset with memory-mapping
dirpath = tempdir / guid()
dirpath.mkdir()
- df = _test_dataframe(10, seed=0)
+ table = _test_table(10, seed=0)
path = dirpath / '0.parquet'
- table = pa.Table.from_pandas(df)
_write_table(table, path, version='2.6')
dataset = pq.ParquetDataset(
@@ -758,14 +757,13 @@ def test_dataset_memory_map(tempdir):
assert dataset.read().equals(table)
[email protected]
[email protected]
def test_dataset_enable_buffered_stream(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
- df = _test_dataframe(10, seed=0)
+ table = _test_table(10, seed=0)
path = dirpath / '0.parquet'
- table = pa.Table.from_pandas(df)
_write_table(table, path, version='2.6')
with pytest.raises(ValueError):
@@ -778,14 +776,13 @@ def test_dataset_enable_buffered_stream(tempdir):
assert dataset.read().equals(table)
[email protected]
[email protected]
def test_dataset_enable_pre_buffer(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
- df = _test_dataframe(10, seed=0)
+ table = _test_table(10, seed=0)
path = dirpath / '0.parquet'
- table = pa.Table.from_pandas(df)
_write_table(table, path, version='2.6')
for pre_buffer in (True, False):
@@ -800,10 +797,10 @@ def _make_example_multifile_dataset(base_path, nfiles=10,
file_nrows=5):
test_data = []
paths = []
for i in range(nfiles):
- df = _test_dataframe(file_nrows, seed=i)
+ table = _test_table(file_nrows, seed=i)
path = base_path / f'{i}.parquet'
- test_data.append(_write_table(df, path))
+ test_data.append(_write_table(table, path))
paths.append(path)
return paths
@@ -813,7 +810,7 @@ def _assert_dataset_paths(dataset, paths):
assert set(paths) == set(dataset.files)
[email protected]
[email protected]
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
def test_ignore_private_directories(tempdir, dir_prefix):
dirpath = tempdir / guid()
@@ -830,7 +827,7 @@ def test_ignore_private_directories(tempdir, dir_prefix):
_assert_dataset_paths(dataset, paths)
[email protected]
[email protected]
def test_ignore_hidden_files_dot(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
@@ -849,7 +846,7 @@ def test_ignore_hidden_files_dot(tempdir):
_assert_dataset_paths(dataset, paths)
[email protected]
[email protected]
def test_ignore_hidden_files_underscore(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
@@ -868,7 +865,7 @@ def test_ignore_hidden_files_underscore(tempdir):
_assert_dataset_paths(dataset, paths)
[email protected]
[email protected]
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix):
# ARROW-8427 - don't ignore explicitly listed files if parent directory
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py
b/python/pyarrow/tests/parquet/test_parquet_writer.py
index d1e9e874ba..f9df90b135 100644
--- a/python/pyarrow/tests/parquet/test_parquet_writer.py
+++ b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -23,7 +23,7 @@ from pyarrow import fs
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
- _range_integers)
+ _test_table, _range_integers)
except ImportError:
pq = None
@@ -314,10 +314,9 @@ def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
tm.assert_frame_equal(result, df)
[email protected]
[email protected]
def test_parquet_writer_filesystem_buffer_raises():
- df = _test_dataframe(100)
- table = pa.Table.from_pandas(df, preserve_index=False)
+ table = _test_table(100)
filesystem = fs.LocalFileSystem()
# Should raise ValueError when filesystem is passed with file-like object