This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7c18001f0d GH-45848: [C++][Python][R] Remove deprecated PARQUET_2_0
(#45849)
7c18001f0d is described below
commit 7c18001f0d7bd97471237719702c33165858bba7
Author: Alenka Frim <[email protected]>
AuthorDate: Thu Mar 27 12:19:08 2025 +0100
GH-45848: [C++][Python][R] Remove deprecated PARQUET_2_0 (#45849)
### Rationale for this change
`PARQUET_2_0` has been deprecated for a while now and can be removed from
the codebase.
### What changes are included in this PR?
Removal of deprecated enum value `PARQUET_2_0`, and the corresponding
Python and R definitions.
### Are these changes tested?
By existing unit tests, some of which have been updated.
### Are there any user-facing changes?
* `PARQUET_2_0` is removed in Parquet C++
* `"2.0"` is not allowed anymore as Parquet version in the PyArrow and R
Parquet APIs
* GitHub Issue: #45848
Lead-authored-by: AlenkaF <[email protected]>
Co-authored-by: Bryce Mecum <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 19 ++++++-------------
cpp/src/parquet/metadata.cc | 3 ---
cpp/src/parquet/type_fwd.h | 6 ------
python/pyarrow/_parquet.pxd | 1 -
python/pyarrow/_parquet.pyx | 8 --------
python/pyarrow/tests/parquet/test_data_types.py | 2 +-
python/pyarrow/tests/parquet/test_metadata.py | 2 +-
python/pyarrow/tests/parquet/test_pandas.py | 6 ++----
python/pyarrow/tests/test_pandas.py | 3 +--
r/R/enums.R | 2 +-
r/R/parquet.R | 15 +++------------
r/man/enums.Rd | 2 +-
r/man/write_parquet.Rd | 4 ++--
r/tests/testthat/test-parquet.R | 9 ---------
14 files changed, 18 insertions(+), 64 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index f80ab83c86..0cc5f948c7 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1264,7 +1264,7 @@ TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) {
using TestUInt32ParquetIO = TestParquetIO<::arrow::UInt32Type>;
-TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) {
+TEST_F(TestUInt32ParquetIO, Parquet_2_6_Compatibility) {
// This also tests max_definition_level = 1
std::shared_ptr<Array> values;
@@ -2055,10 +2055,6 @@ TEST(TestArrowReadWrite,
ParquetVersionTimestampDifferences) {
.version(ParquetVersion::PARQUET_1_0)
->build();
ARROW_SUPPRESS_DEPRECATION_WARNING
- auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder()
- .version(ParquetVersion::PARQUET_2_0)
- ->build();
- ARROW_UNSUPPRESS_DEPRECATION_WARNING
auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_4)
->build();
@@ -2066,8 +2062,8 @@ TEST(TestArrowReadWrite,
ParquetVersionTimestampDifferences) {
.version(ParquetVersion::PARQUET_2_6)
->build();
const std::vector<std::shared_ptr<WriterProperties>> all_properties = {
- parquet_version_1_properties, parquet_version_2_0_properties,
- parquet_version_2_4_properties, parquet_version_2_6_properties};
+ parquet_version_1_properties, parquet_version_2_4_properties,
+ parquet_version_2_6_properties};
{
// Using Parquet version 1.0 and 2.4 defaults, seconds should be coerced to
@@ -2081,13 +2077,11 @@ TEST(TestArrowReadWrite,
ParquetVersionTimestampDifferences) {
parquet_version_2_4_properties));
}
{
- // Using Parquet version 2.0 and 2.6 defaults, seconds should be coerced to
+ // Using Parquet version 2.6 defaults, seconds should be coerced to
// milliseconds and nanoseconds should be retained
auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms),
field("ts:us", t_us), field("ts:ns",
t_ns)});
auto expected_table = Table::Make(expected_schema, {a_ms, a_ms, a_us,
a_ns});
- ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table,
- parquet_version_2_0_properties));
ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table,
expected_table,
parquet_version_2_6_properties));
}
@@ -2133,9 +2127,8 @@ TEST(TestArrowReadWrite,
ParquetVersionTimestampDifferences) {
CreateOutputStream(), input_table->num_rows(),
properties,
arrow_coerce_to_nanos_properties));
}
- // Using Parquet versions "2.0" and 2.6, coercing to (int64) nanoseconds is allowed
- for (const auto& properties :
- {parquet_version_2_0_properties, parquet_version_2_6_properties}) {
+ // Using Parquet version 2.6, coercing to (int64) nanoseconds is allowed
+ for (const auto& properties : {parquet_version_2_6_properties}) {
ARROW_SCOPED_TRACE("format = ",
ParquetVersionToString(properties->version()));
auto expected_schema = schema({field("ts:s", t_ns), field("ts:ms", t_ns),
field("ts:us", t_ns), field("ts:ns",
t_ns)});
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 9b53da021f..398ff761bd 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -76,9 +76,6 @@ std::string ParquetVersionToString(ParquetVersion::type ver) {
case ParquetVersion::PARQUET_1_0:
return "1.0";
ARROW_SUPPRESS_DEPRECATION_WARNING
- case ParquetVersion::PARQUET_2_0:
- return "pseudo-2.0";
- ARROW_UNSUPPRESS_DEPRECATION_WARNING
case ParquetVersion::PARQUET_2_4:
return "2.4";
case ParquetVersion::PARQUET_2_6:
diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h
index cda0dc5a77..c2e902c41f 100644
--- a/cpp/src/parquet/type_fwd.h
+++ b/cpp/src/parquet/type_fwd.h
@@ -38,12 +38,6 @@ struct ParquetVersion {
/// corresponding converted type.
PARQUET_1_0,
- /// DEPRECATED: Enable Parquet format 2.6 features
- ///
- /// This misleadingly named enum value is roughly similar to PARQUET_2_6.
- PARQUET_2_0 ARROW_DEPRECATED_ENUM_VALUE("use PARQUET_2_4 or PARQUET_2_6 "
- "for fine-grained feature selection"),
-
/// Enable Parquet format 2.4 and earlier features when writing
///
/// This enables UINT32 as well as logical types which don't have
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 1e3c89e4e7..e6de9712f8 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -134,7 +134,6 @@ cdef extern from "parquet/api/schema.h" namespace "parquet"
nogil:
enum ParquetVersion" parquet::ParquetVersion::type":
ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
- ParquetVersion_V2_0" parquet::ParquetVersion::PARQUET_2_0"
ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4"
ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6"
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 6bc77ed795..55c2866243 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -991,8 +991,6 @@ cdef class FileMetaData(_Weakrefable):
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
return '1.0'
- elif version == ParquetVersion_V2_0:
- return 'pseudo-2.0'
elif version == ParquetVersion_V2_4:
return '2.4'
elif version == ParquetVersion_V2_6:
@@ -1888,12 +1886,6 @@ cdef shared_ptr[WriterProperties]
_create_writer_properties(
if version is not None:
if version == "1.0":
props.version(ParquetVersion_V1)
- elif version in ("2.0", "pseudo-2.0"):
- warnings.warn(
- "Parquet format '2.0' pseudo version is deprecated, use "
- "'2.4' or '2.6' for fine-grained feature selection",
- FutureWarning, stacklevel=2)
- props.version(ParquetVersion_V2_0)
elif version == "2.4":
props.version(ParquetVersion_V2_4)
elif version == "2.6":
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 1428f80239..855d5952b9 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -58,7 +58,7 @@ pytestmark = pytest.mark.parquet
@pytest.mark.pandas
@pytest.mark.parametrize('chunk_size', [None, 1000])
-def test_parquet_2_0_roundtrip(tempdir, chunk_size):
+def test_parquet_2_6_roundtrip(tempdir, chunk_size):
df = alltypes_sample(size=10000, categorical=True)
filename = tempdir / 'pandas_roundtrip.parquet'
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index cf17f830f2..b3340d93e4 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -554,7 +554,7 @@ def test_write_metadata(tempdir):
assert b'ARROW:schema' not in schema_as_arrow.metadata
# pass through writer keyword arguments
- for version in ["1.0", "2.0", "2.4", "2.6"]:
+ for version in ["1.0", "2.4", "2.6"]:
pq.write_metadata(schema, path, version=version)
parquet_meta = pq.read_metadata(path)
# The version is stored as a single integer in the Parquet metadata,
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
index 2ea2f46873..58a39c1967 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -121,7 +121,7 @@ def test_pandas_parquet_column_multiindex(tempdir):
@pytest.mark.pandas
-def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir):
+def test_pandas_parquet_2_roundtrip_read_pandas_no_index_written(tempdir):
df = alltypes_sample(size=10000)
filename = tempdir / 'pandas_roundtrip.parquet'
@@ -270,14 +270,12 @@ def test_pandas_parquet_configuration_options(tempdir):
@pytest.mark.pandas
[email protected]("ignore:Parquet format '2.0':FutureWarning")
def test_spark_flavor_preserves_pandas_metadata():
df = _test_dataframe(size=100)
df.index = np.arange(0, 10 * len(df), 10)
df.index.name = 'foo'
- result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
- 'flavor': 'spark'})
+ result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'})
tm.assert_frame_equal(result, df)
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 4ad04c9ad1..39757c1f4c 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -4892,14 +4892,13 @@ def make_df_with_timestamps():
@pytest.mark.parquet
[email protected]("ignore:Parquet format '2.0':FutureWarning")
def test_timestamp_as_object_parquet(tempdir):
# Timestamps can be stored as Parquet and reloaded into Pandas with no loss
# of information if the timestamp_as_object option is True.
df = make_df_with_timestamps()
table = pa.Table.from_pandas(df)
filename = tempdir / "timestamps_from_pandas.parquet"
- pq.write_table(table, filename, version="2.0")
+ pq.write_table(table, filename)
result = pq.read_table(filename)
df2 = result.to_pandas(timestamp_as_object=True)
tm.assert_frame_equal(df, df2)
diff --git a/r/R/enums.R b/r/R/enums.R
index 7dd5f2f858..98995b2a2e 100644
--- a/r/R/enums.R
+++ b/r/R/enums.R
@@ -129,7 +129,7 @@ FileType <- enum("FileType",
#' @export
#' @rdname enums
ParquetVersionType <- enum("ParquetVersionType",
- PARQUET_1_0 = 0L, PARQUET_2_0 = 1L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
+ PARQUET_1_0 = 0L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
)
#' @export
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 88ce1c7712..91ddfc63a2 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -98,8 +98,8 @@ read_parquet <- function(file,
#' the number of columns and number of rows), though if the data has fewer
#' than 250 million cells (rows x cols), then the total number of rows is
#' used.
-#' @param version parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
-#' "2.6", or "latest" (currently equivalent to 2.6). Numeric values are
+#' @param version parquet version: "1.0", "2.4" (default), "2.6", or
+#' "latest" (currently equivalent to 2.6). Numeric values are
#' coerced to character.
#' @param compression compression algorithm. Default "snappy". See details.
#' @param compression_level compression level. Meaning depends on compression
@@ -232,7 +232,6 @@ ParquetArrowWriterProperties$create <-
function(use_deprecated_int96_timestamps
valid_parquet_version <- c(
"1.0" = ParquetVersionType$PARQUET_1_0,
- "2.0" = ParquetVersionType$PARQUET_2_0,
"2.4" = ParquetVersionType$PARQUET_2_4,
"2.6" = ParquetVersionType$PARQUET_2_6,
"latest" = ParquetVersionType$PARQUET_2_6
@@ -252,15 +251,7 @@ make_valid_parquet_version <- function(version,
valid_versions = valid_parquet_v
call. = FALSE
)
}
- out <- valid_versions[[arg_match(version, values = names(valid_versions))]]
-
- if (identical(out, ParquetVersionType$PARQUET_2_0)) {
- warning(
- 'Parquet format version "2.0" is deprecated. Use "2.4" or "2.6" to select format features.',
- call. = FALSE
- )
- }
- out
+ valid_versions[[arg_match(version, values = names(valid_versions))]]
}
#' @title ParquetWriterProperties class
diff --git a/r/man/enums.Rd b/r/man/enums.Rd
index dd0ca944b8..e2f50dfc86 100644
--- a/r/man/enums.Rd
+++ b/r/man/enums.Rd
@@ -36,7 +36,7 @@ An object of class \code{Compression::type} (inherits from
\code{arrow-enum}) of
An object of class \code{FileType} (inherits from \code{arrow-enum}) of length
4.
-An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 4.
+An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 3.
An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of
length 5.
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
index 954c692dad..859006571a 100644
--- a/r/man/write_parquet.Rd
+++ b/r/man/write_parquet.Rd
@@ -32,8 +32,8 @@ the number of columns and number of rows), though if the data
has fewer
than 250 million cells (rows x cols), then the total number of rows is
used.}
-\item{version}{parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
-"2.6", or "latest" (currently equivalent to 2.6). Numeric values are
+\item{version}{parquet version: "1.0", "2.4" (default), "2.6", or
+"latest" (currently equivalent to 2.6). Numeric values are
coerced to character.}
\item{compression}{compression algorithm. Default "snappy". See details.}
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index edca48e92d..e769aa9ea0 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -134,12 +134,6 @@ test_that("make_valid_parquet_version()", {
make_valid_parquet_version("1.0"),
ParquetVersionType$PARQUET_1_0
)
- expect_deprecated(
- expect_equal(
- make_valid_parquet_version("2.0"),
- ParquetVersionType$PARQUET_2_0
- )
- )
expect_equal(
make_valid_parquet_version("2.4"),
ParquetVersionType$PARQUET_2_4
@@ -154,9 +148,6 @@ test_that("make_valid_parquet_version()", {
)
expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0)
- expect_deprecated(
- expect_equal(make_valid_parquet_version(2), ParquetVersionType$PARQUET_2_0)
- )
expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0)
expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4)
})