This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2cbf122793 GH-47172: [Python] Add a utility function to create Arrow
table instead of pandas df (#47199)
2cbf122793 is described below
commit 2cbf1227939d42cb08998eb8fa666120c44ff254
Author: egolearner <[email protected]>
AuthorDate: Mon Aug 4 15:46:15 2025 +0800
GH-47172: [Python] Add a utility function to create Arrow table instead of
pandas df (#47199)
### Rationale for this change
resolve #47172
### What changes are included in this PR?
Add a utility function to create an Arrow table instead of a pandas DataFrame
### Are these changes tested?
yes
### Are there any user-facing changes?
no
* GitHub Issue: #47172
Authored-by: egolearner <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
python/pyarrow/tests/parquet/common.py | 18 ++++++++++----
python/pyarrow/tests/parquet/test_basic.py | 12 ++++-----
python/pyarrow/tests/parquet/test_dataset.py | 29 ++++++++++------------
.../pyarrow/tests/parquet/test_parquet_writer.py | 7 +++---
4 files changed, 34 insertions(+), 32 deletions(-)
diff --git a/python/pyarrow/tests/parquet/common.py
b/python/pyarrow/tests/parquet/common.py
index 4f5946649b..5390a24b90 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -95,11 +95,9 @@ def _range_integers(size, dtype):
return pa.array(np.arange(size, dtype=dtype))
-def _test_dataframe(size=10000, seed=0):
- import pandas as pd
-
+def _test_dict(size=10000, seed=0):
np.random.seed(seed)
- df = pd.DataFrame({
+ return {
'uint8': _random_integers(size, np.uint8),
'uint16': _random_integers(size, np.uint16),
'uint32': _random_integers(size, np.uint32),
@@ -114,13 +112,23 @@ def _test_dataframe(size=10000, seed=0):
'strings': [util.rands(10) for i in range(size)],
'all_none': [None] * size,
'all_none_category': [None] * size
- })
+ }
+
+
+def _test_dataframe(size=10000, seed=0):
+ import pandas as pd
+
+ df = pd.DataFrame(_test_dict(size, seed))
# TODO(PARQUET-1015)
# df['all_none_category'] = df['all_none_category'].astype('category')
return df
+def _test_table(size=10000, seed=0):
+ return pa.Table.from_pydict(_test_dict(size, seed))
+
+
def make_sample_file(table_or_df):
import pyarrow.parquet as pq
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 67515c5e24..591bcffc1a 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -28,7 +28,7 @@ import pyarrow as pa
from pyarrow import fs
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
- _test_dataframe)
+ _test_table)
try:
import pyarrow.parquet as pq
@@ -76,20 +76,18 @@ def test_set_data_page_size():
_check_roundtrip(t, data_page_size=target_page_size)
[email protected]
[email protected]
def test_set_write_batch_size():
- df = _test_dataframe(100)
- table = pa.Table.from_pandas(df, preserve_index=False)
+ table = _test_table(100)
_check_roundtrip(
table, data_page_size=10, write_batch_size=1, version='2.4'
)
[email protected]
[email protected]
def test_set_dictionary_pagesize_limit():
- df = _test_dataframe(100)
- table = pa.Table.from_pandas(df, preserve_index=False)
+ table = _test_table(100)
_check_roundtrip(table, dictionary_pagesize_limit=1,
data_page_size=10, version='2.4')
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index b8939443c1..d3e9cda730 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -38,7 +38,7 @@ from pyarrow.util import guid
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (
- _read_table, _test_dataframe, _write_table)
+ _read_table, _test_dataframe, _test_table, _write_table)
except ImportError:
pq = None
@@ -742,15 +742,14 @@ def test_dataset_read_pandas(tempdir):
tm.assert_frame_equal(result.reindex(columns=expected.columns), expected)
[email protected]
[email protected]
def test_dataset_memory_map(tempdir):
# ARROW-2627: Check that we can use ParquetDataset with memory-mapping
dirpath = tempdir / guid()
dirpath.mkdir()
- df = _test_dataframe(10, seed=0)
+ table = _test_table(10, seed=0)
path = dirpath / '0.parquet'
- table = pa.Table.from_pandas(df)
_write_table(table, path, version='2.6')
dataset = pq.ParquetDataset(
@@ -758,14 +757,13 @@ def test_dataset_memory_map(tempdir):
assert dataset.read().equals(table)
[email protected]
[email protected]
def test_dataset_enable_buffered_stream(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
- df = _test_dataframe(10, seed=0)
+ table = _test_table(10, seed=0)
path = dirpath / '0.parquet'
- table = pa.Table.from_pandas(df)
_write_table(table, path, version='2.6')
with pytest.raises(ValueError):
@@ -778,14 +776,13 @@ def test_dataset_enable_buffered_stream(tempdir):
assert dataset.read().equals(table)
[email protected]
[email protected]
def test_dataset_enable_pre_buffer(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
- df = _test_dataframe(10, seed=0)
+ table = _test_table(10, seed=0)
path = dirpath / '0.parquet'
- table = pa.Table.from_pandas(df)
_write_table(table, path, version='2.6')
for pre_buffer in (True, False):
@@ -800,10 +797,10 @@ def _make_example_multifile_dataset(base_path, nfiles=10,
file_nrows=5):
test_data = []
paths = []
for i in range(nfiles):
- df = _test_dataframe(file_nrows, seed=i)
+ table = _test_table(file_nrows, seed=i)
path = base_path / f'{i}.parquet'
- test_data.append(_write_table(df, path))
+ test_data.append(_write_table(table, path))
paths.append(path)
return paths
@@ -813,7 +810,7 @@ def _assert_dataset_paths(dataset, paths):
assert set(paths) == set(dataset.files)
[email protected]
[email protected]
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
def test_ignore_private_directories(tempdir, dir_prefix):
dirpath = tempdir / guid()
@@ -830,7 +827,7 @@ def test_ignore_private_directories(tempdir, dir_prefix):
_assert_dataset_paths(dataset, paths)
[email protected]
[email protected]
def test_ignore_hidden_files_dot(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
@@ -849,7 +846,7 @@ def test_ignore_hidden_files_dot(tempdir):
_assert_dataset_paths(dataset, paths)
[email protected]
[email protected]
def test_ignore_hidden_files_underscore(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
@@ -868,7 +865,7 @@ def test_ignore_hidden_files_underscore(tempdir):
_assert_dataset_paths(dataset, paths)
[email protected]
[email protected]
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix):
# ARROW-8427 - don't ignore explicitly listed files if parent directory
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py
b/python/pyarrow/tests/parquet/test_parquet_writer.py
index d1e9e874ba..f9df90b135 100644
--- a/python/pyarrow/tests/parquet/test_parquet_writer.py
+++ b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -23,7 +23,7 @@ from pyarrow import fs
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
- _range_integers)
+ _test_table, _range_integers)
except ImportError:
pq = None
@@ -314,10 +314,9 @@ def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
tm.assert_frame_equal(result, df)
[email protected]
[email protected]
def test_parquet_writer_filesystem_buffer_raises():
- df = _test_dataframe(100)
- table = pa.Table.from_pandas(df, preserve_index=False)
+ table = _test_table(100)
filesystem = fs.LocalFileSystem()
# Should raise ValueError when filesystem is passed with file-like object