This is an automated email from the ASF dual-hosted git repository.
alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 811a273b9d GH-48961: [Docs][Python] Doctest fails on pandas 3.0 (#48969)
811a273b9d is described below
commit 811a273b9d6c1a6cea179637f05feca05c100ae8
Author: tadeja <[email protected]>
AuthorDate: Wed Jan 28 15:22:05 2026 +0100
GH-48961: [Docs][Python] Doctest fails on pandas 3.0 (#48969)
### Rationale for this change
See issue #48961
Pandas 3.0.0 changes the default string storage type (a dedicated string data type is now the default); see
https://github.com/pandas-dev/pandas/pull/62118/changes
and
https://pandas.pydata.org/docs/whatsnew/v3.0.0.html#dedicated-string-data-type-by-default
### What changes are included in this PR?
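Updating several doctest examples whose expected output changed from `string` to `large_string`: the expected type is now written as `...string`, and examples that do not need pandas build their data with `from_arrays` instead of `from_pandas`.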
### Are these changes tested?
Yes, locally.
### Are there any user-facing changes?
No.
Closes #48961
* GitHub Issue: #48961
Authored-by: Tadeja Kadunc <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
python/pyarrow/table.pxi | 218 ++++++++++++++++++++---------------------------
python/pyarrow/types.pxi | 6 +-
2 files changed, 97 insertions(+), 127 deletions(-)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 8e258e38af..de839a9a50 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1877,10 +1877,12 @@ cdef class _Tabular(_PandasConvertible):
>>> df = pd.DataFrame({'year': [None, 2022, 2019, 2021],
... 'n_legs': [2, 4, 5, 100],
... 'animals': ["Flamingo", "Horse", None,
"Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[None, 2022, 2019, 2021], [2, 4, 5, 100], ["Flamingo",
"Horse", None, "Centipede"]],
+ ... names=['year', 'n_legs', 'animals'])
>>> table.drop_null()
pyarrow.Table
- year: double
+ year: int64
n_legs: int64
animals: string
----
@@ -1909,10 +1911,9 @@ cdef class _Tabular(_PandasConvertible):
Table (works similarly for RecordBatch)
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> table.field(0)
pyarrow.Field<n_legs: int64>
>>> table.field(1)
@@ -2064,10 +2065,9 @@ cdef class _Tabular(_PandasConvertible):
Table (works similarly for RecordBatch)
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [None, 4, 5, None],
- ... 'animals': ["Flamingo", "Horse", None,
"Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[None, 4, 5, None], ["Flamingo", "Horse", None, "Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> for i in table.itercolumns():
... print(i.null_count)
...
@@ -2133,13 +2133,12 @@ cdef class _Tabular(_PandasConvertible):
--------
Table (works similarly for RecordBatch)
- >>> import pandas as pd
>>> import pyarrow as pa
- >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021],
- ... 'n_legs': [2, 2, 4, 4, 5, 100],
- ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse",
- ... "Brittle stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2020, 2022, 2021, 2022, 2019, 2021],
+ ... [2, 2, 4, 4, 5, 100],
+ ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['year', 'n_legs', 'animal'])
>>> table.sort_by('animal')
pyarrow.Table
year: int64
@@ -2181,11 +2180,10 @@ cdef class _Tabular(_PandasConvertible):
Table (works similarly for RecordBatch)
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
- ... 'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2020, 2022, 2019, 2021], [2, 4, 5, 100],
+ ... ["Flamingo", "Horse", "Brittle stars", "Centipede"]],
+ ... names=['year', 'n_legs', 'animals'])
>>> table.take([1,3])
pyarrow.Table
year: int64
@@ -2473,10 +2471,9 @@ cdef class _Tabular(_PandasConvertible):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
Append column at the end:
@@ -2545,7 +2542,7 @@ cdef class RecordBatch(_Tabular):
month: int64
day: int64
n_legs: int64
- animals: string
+ animals: ...string
----
year: [2020,2022,2021,2022]
month: [3,5,7,9]
@@ -2585,7 +2582,7 @@ cdef class RecordBatch(_Tabular):
month: int64
day: int64
n_legs: int64
- animals: string
+ animals: ...string
----
year: [2020,2022,2021,2022]
month: [3,5,7,9]
@@ -2858,10 +2855,9 @@ cdef class RecordBatch(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> batch = pa.RecordBatch.from_pandas(df)
+ >>> batch = pa.RecordBatch.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
Add column:
@@ -2931,10 +2927,9 @@ cdef class RecordBatch(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> batch = pa.RecordBatch.from_pandas(df)
+ >>> batch = pa.RecordBatch.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> batch.remove_column(1)
pyarrow.RecordBatch
n_legs: int64
@@ -2970,10 +2965,9 @@ cdef class RecordBatch(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> batch = pa.RecordBatch.from_pandas(df)
+ >>> batch = pa.RecordBatch.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
Replace a column:
@@ -3039,10 +3033,9 @@ cdef class RecordBatch(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> batch = pa.RecordBatch.from_pandas(df)
+ >>> batch = pa.RecordBatch.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> new_names = ["n", "name"]
>>> batch.rename_columns(new_names)
pyarrow.RecordBatch
@@ -3318,15 +3311,12 @@ cdef class RecordBatch(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> batch = pa.RecordBatch.from_pandas(df)
+ >>> batch = pa.RecordBatch.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> batch.schema
n_legs: int64
animals: string
- -- schema metadata --
- pandas: '{"index_columns": [{"kind": "range", "name": null, "start":
0, ...
Define new schema and cast batch values:
@@ -3416,7 +3406,7 @@ cdef class RecordBatch(_Tabular):
month: int64
day: int64
n_legs: int64
- animals: string
+ animals: ...string
----
year: [2020,2022,2021,2022]
month: [3,5,7,9]
@@ -3579,11 +3569,11 @@ cdef class RecordBatch(_Tabular):
--------
>>> import pyarrow as pa
>>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
- ... {'year': 2022, 'n_legs': 4}])
+ ... {'year': 2022, 'n_legs': 4, 'animals': 'Goat'}])
>>> pa.RecordBatch.from_struct_array(struct).to_pandas()
n_legs animals year
0 2 Parrot NaN
- 1 4 None 2022.0
+ 1 4 Goat 2022.0
"""
cdef:
shared_ptr[CRecordBatch] c_record_batch
@@ -4156,7 +4146,7 @@ cdef class Table(_Tabular):
pyarrow.Table
year: int64
n_legs: int64
- animals: string
+ animals: ...string
----
year: [[2020,2022,2019,2021]]
n_legs: [[2,4,5,100]]
@@ -4282,11 +4272,10 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
- ... 'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2020, 2022, 2019, 2021], [2, 4, 5, 100],
+ ... ["Flamingo", "Horse", "Brittle stars", "Centipede"]],
+ ... names=['year', 'n_legs', 'animals'])
>>> table.slice(length=3)
pyarrow.Table
year: int64
@@ -4347,11 +4336,10 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
- ... 'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2020, 2022, 2019, 2021], [2, 4, 5, 100],
+ ... ["Flamingo", "Horse", "Brittle stars", "Centipede"]],
+ ... names=['year', 'n_legs', 'animals'])
>>> table.select([0,1])
pyarrow.Table
year: int64
@@ -4687,15 +4675,12 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> table.schema
n_legs: int64
animals: string
- -- schema metadata --
- pandas: '{"index_columns": [{"kind": "range", "name": null, "start":
0, ...
Define new schema and cast table values:
@@ -4787,7 +4772,7 @@ cdef class Table(_Tabular):
>>> pa.Table.from_pandas(df)
pyarrow.Table
n_legs: int64
- animals: string
+ animals: ...string
----
n_legs: [[2,4,5,100]]
animals: [["Flamingo","Horse","Brittle stars","Centipede"]]
@@ -4934,11 +4919,11 @@ cdef class Table(_Tabular):
--------
>>> import pyarrow as pa
>>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
- ... {'year': 2022, 'n_legs': 4}])
+ ... {'year': 2022, 'n_legs': 4, 'animals': 'Goat'}])
>>> pa.Table.from_struct_array(struct).to_pandas()
n_legs animals year
0 2 Parrot NaN
- 1 4 None 2022.0
+ 1 4 Goat 2022.0
"""
if isinstance(struct_array, Array):
return
Table.from_batches([RecordBatch.from_struct_array(struct_array)])
@@ -5132,10 +5117,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
Convert a Table to a RecordBatchReader:
@@ -5146,8 +5130,6 @@ cdef class Table(_Tabular):
>>> reader.schema
n_legs: int64
animals: string
- -- schema metadata --
- pandas: '{"index_columns": [{"kind": "range", "name": null, "start":
0, ...
>>> reader.read_all()
pyarrow.Table
n_legs: int64
@@ -5193,15 +5175,12 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> table.schema
n_legs: int64
animals: string
- -- schema metadata --
- pandas: '{"index_columns": [{"kind": "range", "name": null, "start":
0, "' ...
"""
return pyarrow_wrap_schema(self.table.schema())
@@ -5288,10 +5267,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [None, 4, 5, None],
- ... 'animals': ["Flamingo", "Horse", None,
"Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[None, 4, 5, None], ["Flamingo", "Horse", None, "Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> table.nbytes
72
"""
@@ -5318,10 +5296,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [None, 4, 5, None],
- ... 'animals': ["Flamingo", "Horse", None,
"Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[None, 4, 5, None], ["Flamingo", "Horse", None, "Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> table.get_total_buffer_size()
76
"""
@@ -5360,10 +5337,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
Add column:
@@ -5426,10 +5402,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> table.remove_column(1)
pyarrow.Table
n_legs: int64
@@ -5465,10 +5440,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
Replace a column:
@@ -5527,10 +5501,9 @@ cdef class Table(_Tabular):
Examples
--------
>>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle
stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['n_legs', 'animals'])
>>> new_names = ["n", "name"]
>>> table.rename_columns(new_names)
pyarrow.Table
@@ -5619,13 +5592,12 @@ cdef class Table(_Tabular):
Examples
--------
- >>> import pandas as pd
>>> import pyarrow as pa
- >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021],
- ... 'n_legs': [2, 2, 4, 4, 5, 100],
- ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse",
- ... "Brittle stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
+ >>> table = pa.Table.from_arrays(
+ ... [[2020, 2022, 2021, 2022, 2019, 2021],
+ ... [2, 2, 4, 4, 5, 100],
+ ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars",
"Centipede"]],
+ ... names=['year', 'n_legs', 'animal'])
>>> table.group_by('year').aggregate([('n_legs', 'sum')])
pyarrow.Table
year: int64
@@ -5682,16 +5654,14 @@ cdef class Table(_Tabular):
Examples
--------
- >>> import pandas as pd
>>> import pyarrow as pa
>>> import pyarrow.compute as pc
- >>> df1 = pd.DataFrame({'id': [1, 2, 3],
- ... 'year': [2020, 2022, 2019]})
- >>> df2 = pd.DataFrame({'id': [3, 4],
- ... 'n_legs': [5, 100],
- ... 'animal': ["Brittle stars", "Centipede"]})
- >>> t1 = pa.Table.from_pandas(df1)
- >>> t2 = pa.Table.from_pandas(df2)
+ >>> t1 = pa.Table.from_arrays(
+ ... [[1, 2, 3], [2020, 2022, 2019]],
+ ... names=['id', 'year'])
+ >>> t2 = pa.Table.from_arrays(
+ ... [[3, 4], [5, 100], ["Brittle stars", "Centipede"]],
+ ... names=['id', 'n_legs', 'animal'])
Left outer join:
@@ -6003,7 +5973,7 @@ def record_batch(data, names=None, schema=None,
metadata=None):
month: int64
day: int64
n_legs: int64
- animals: string
+ animals: ...string
----
year: [2020,2022,2021,2022]
month: [3,5,7,9]
@@ -6164,7 +6134,7 @@ def table(data, names=None, schema=None, metadata=None,
nthreads=None):
pyarrow.Table
year: int64
n_legs: int64
- animals: string
+ animals: ...string
----
year: [[2020,2022,2019,2021]]
n_legs: [[2,4,5,100]]
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 792c0840f8..e84f1b073f 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -3111,7 +3111,7 @@ cdef class Schema(_Weakrefable):
@classmethod
def from_pandas(cls, df, preserve_index=None):
"""
- Returns implied schema from dataframe
+ Returns implied schema from DataFrame
Parameters
----------
@@ -3136,11 +3136,11 @@ cdef class Schema(_Weakrefable):
... 'str': ['a', 'b']
... })
- Create an Arrow Schema from the schema of a pandas dataframe:
+ Create an Arrow Schema from the schema of a pandas DataFrame:
>>> pa.Schema.from_pandas(df)
int: int64
- str: string
+ str: ...string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, ...
"""