raulcd opened a new issue, #49839: URL: https://github.com/apache/arrow/issues/49839
### Describe the bug, including details regarding any error messages, version, and platform. The fastparquet project isn't compatible with pandas 3 yet, see: - https://github.com/dask/fastparquet/pull/973 The `test_fastparquet_cross_compatibility` test fails when using pandas 3 and pyarrow. I validated this locally: ```python $ python -m pytest pyarrow/tests/parquet/test_basic.py::test_fastparquet_cross_compatibility ================================================================================================ test session starts ================================================================================================= platform linux -- Python 3.13.12, pytest-9.0.0, pluggy-1.6.0 rootdir: /home/raulcd/code/arrow/python configfile: setup.cfg plugins: hypothesis-6.147.0 collected 1 item pyarrow/tests/parquet/test_basic.py F [100%] ====================================================================================================== FAILURES ====================================================================================================== ________________________________________________________________________________________ test_fastparquet_cross_compatibility ________________________________________________________________________________________ data = 0 a 1 b dtype: str se = {'type': 6, 'type_length': None, 'repetition_type': 1, 'name': 'f', 'num_children': None, 'converted_type': 0, 'scale': None, 'precision': None, 'field_id': None, 'logicalType': None} def convert(data, se): """Convert data according to the schema encoding""" dtype = data.dtype type = se.type converted_type = se.converted_type if dtype.name in typemap: if type in revmap: out = data.values.astype(revmap[type], copy=False) elif type == parquet_thrift.Type.BOOLEAN: # TODO: with our own bitpack writer, no need to copy for # the padding padded = np.pad(data.values, (0, 8 - (len(data) % 8)), 'constant', constant_values=(0, 0)) out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel()) elif dtype.name 
in typemap: out = data.values elif "S" in str(dtype)[:2] or "U" in str(dtype)[:2]: out = data.values elif dtype == "O": # TODO: nullable types try: if converted_type == parquet_thrift.ConvertedType.UTF8: # getattr for new pandas StringArray # TODO: to bytes in one step out = array_encode_utf8(data) elif converted_type == parquet_thrift.ConvertedType.DECIMAL: out = data.values.astype(np.float64, copy=False) elif converted_type is None: if type in revmap: out = data.values.astype(revmap[type], copy=False) elif type == parquet_thrift.Type.BOOLEAN: # TODO: with our own bitpack writer, no need to copy for # the padding padded = np.pad(data.values, (0, 8 - (len(data) % 8)), 'constant', constant_values=(0, 0)) out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel()) else: out = data.values elif converted_type == parquet_thrift.ConvertedType.JSON: encoder = json_encoder() # TODO: avoid list. np.fromiter can be used with numpy >= 1.23.0, # but older versions don't support object arrays. out = np.array([encoder(x) for x in data], dtype="O") elif converted_type == parquet_thrift.ConvertedType.BSON: out = data.map(tobson).values if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY: out = out.astype('S%i' % se.type_length) except Exception as e: ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[ converted_type] if converted_type is not None else None raise ValueError('Error converting column "%s" to bytes using ' 'encoding %s. 
Original error: ' '%s' % (data.name, ct, e)) elif "str" in str(dtype): try: if converted_type == parquet_thrift.ConvertedType.UTF8: # TODO: into bytes in one step > out = array_encode_utf8(data) ^^^^^^^^^^^^^^^^^^^^^^^ ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:290: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ fastparquet/speedups.pyx:42: in fastparquet.speedups.array_encode_utf8 ??? ../../pyarrow-dev/lib/python3.13/site-packages/pandas/core/series.py:901: in __array__ arr = np.array(values, dtype=dtype, copy=copy) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = <ArrowStringArray> ['a', 'b'] Length: 2, dtype: str, dtype = None, copy = False def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" if copy is False: # TODO: By using `zero_copy_only` it may be possible to implement this > raise ValueError( "Unable to avoid copy while creating an array as requested." E ValueError: Unable to avoid copy while creating an array as requested. 
../../pyarrow-dev/lib/python3.13/site-packages/pandas/core/arrays/arrow/array.py:857: ValueError During handling of the above exception, another exception occurred: tempdir = PosixPath('/tmp/pytest-of-raulcd/pytest-2/test_fastparquet_cross_compati0') @pytest.mark.pandas @pytest.mark.fastparquet @pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning") @pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet") @pytest.mark.filterwarnings("ignore:unclosed file:ResourceWarning") def test_fastparquet_cross_compatibility(tempdir): fp = pytest.importorskip('fastparquet') df = pd.DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(4.0, 7.0, dtype="float64"), "d": [True, False, True], "e": pd.date_range("20130101", periods=3), "f": pd.Categorical(["a", "b", "a"]), # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip # "g": [[1, 2], None, [1, 2, 3]], } ) table = pa.table(df) # Arrow -> fastparquet file_arrow = str(tempdir / "cross_compat_arrow.parquet") pq.write_table(table, file_arrow, compression=None) fp_file = fp.ParquetFile(file_arrow) df_fp = fp_file.to_pandas() # pandas 3 defaults to StringDtype for strings, fastparquet still returns object # TODO: remove astype casts once fastparquet supports pandas 3 StringDtype tm.assert_frame_equal(df_fp, df.astype({"a": object})) # Fastparquet -> arrow file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet") # fastparquet doesn't support writing pandas 3 StringDtype yet > fp.write(file_fastparquet, df.astype({"a": object})) pyarrow/tests/parquet/test_basic.py:855: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:1340: in write write_simple(filename, data, fmd, ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:1001: in 
write_simple write_to_file(f) ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:985: in write_to_file rg = make_row_group(f, row_group, fmd.schema, ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:802: in make_row_group chunk = write_column(f, coldata, column, ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:607: in write_column bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:388: in encode_plain out = convert(data, se) ^^^^^^^^^^^^^^^^^ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ data = 0 a 1 b dtype: str se = {'type': 6, 'type_length': None, 'repetition_type': 1, 'name': 'f', 'num_children': None, 'converted_type': 0, 'scale': None, 'precision': None, 'field_id': None, 'logicalType': None} def convert(data, se): """Convert data according to the schema encoding""" dtype = data.dtype type = se.type converted_type = se.converted_type if dtype.name in typemap: if type in revmap: out = data.values.astype(revmap[type], copy=False) elif type == parquet_thrift.Type.BOOLEAN: # TODO: with our own bitpack writer, no need to copy for # the padding padded = np.pad(data.values, (0, 8 - (len(data) % 8)), 'constant', constant_values=(0, 0)) out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel()) elif dtype.name in typemap: out = data.values elif "S" in str(dtype)[:2] or "U" in str(dtype)[:2]: out = data.values elif dtype == "O": # TODO: nullable types try: if converted_type == parquet_thrift.ConvertedType.UTF8: # getattr for new pandas StringArray # TODO: to bytes in one step out = array_encode_utf8(data) elif converted_type == parquet_thrift.ConvertedType.DECIMAL: out = data.values.astype(np.float64, 
copy=False) elif converted_type is None: if type in revmap: out = data.values.astype(revmap[type], copy=False) elif type == parquet_thrift.Type.BOOLEAN: # TODO: with our own bitpack writer, no need to copy for # the padding padded = np.pad(data.values, (0, 8 - (len(data) % 8)), 'constant', constant_values=(0, 0)) out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel()) else: out = data.values elif converted_type == parquet_thrift.ConvertedType.JSON: encoder = json_encoder() # TODO: avoid list. np.fromiter can be used with numpy >= 1.23.0, # but older versions don't support object arrays. out = np.array([encoder(x) for x in data], dtype="O") elif converted_type == parquet_thrift.ConvertedType.BSON: out = data.map(tobson).values if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY: out = out.astype('S%i' % se.type_length) except Exception as e: ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[ converted_type] if converted_type is not None else None raise ValueError('Error converting column "%s" to bytes using ' 'encoding %s. Original error: ' '%s' % (data.name, ct, e)) elif "str" in str(dtype): try: if converted_type == parquet_thrift.ConvertedType.UTF8: # TODO: into bytes in one step out = array_encode_utf8(data) elif converted_type is None: out = data.values if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY: out = out.astype('S%i' % se.type_length) except Exception as e: # pragma: no cover ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[ converted_type] if converted_type is not None else None > raise ValueError('Error converting column "%s" to bytes using ' 'encoding %s. Original error: ' '%s' % (data.name, ct, e)) E ValueError: Error converting column "None" to bytes using encoding UTF8. Original error: Unable to avoid copy while creating an array as requested. 
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:298: ValueError ============================================================================================== short test summary info =============================================================================================== FAILED pyarrow/tests/parquet/test_basic.py::test_fastparquet_cross_compatibility - ValueError: Error converting column "None" to bytes using encoding UTF8. Original error: Unable to avoid copy while creating an array as requested. = ``` This was originally found when adding the conda feedstocks for PyArrow: - https://github.com/conda-forge/pyarrow-feedstock/pull/177 We hadn't noticed this failure because fastparquet doesn't seem to be installed in any of our CI jobs. At this point I am wondering whether we should just remove the test or install the dependency (on the pandas 2 jobs). ### Component(s) Python -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
