This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new e06f0eec5e [Minor] Remove private APIs from Parquet metadata benchmark (#8478)
e06f0eec5e is described below

commit e06f0eec5e63f6f26ca8d8cda8dc86964b8151d0
Author: Ed Seidl <[email protected]>
AuthorDate: Fri Sep 26 15:30:11 2025 -0700

    [Minor] Remove private APIs from Parquet metadata benchmark (#8478)
    
    # Which issue does this PR close?
    
    # Rationale for this change
    
    While the Thrift decoding was being reworked, more fine-grained
    benchmarking was desirable. With that work mostly done, the benchmarks
    should revert to testing only public APIs.
    
    # What changes are included in this PR?
    
    Removes benchmarks of internal APIs. Also changes the "wide" test to
    be more representative of real-world files: the previous use of
    unbounded random numbers produced many negative values, which require
    many more bytes to encode.
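    
    As a minimal sketch of why bounded ranges shrink the encoding (not
    part of this patch; helper names are hypothetical): the Thrift
    compact protocol zigzag-maps each i64 and then varint-encodes it, so
    a full-range random value, which is negative half the time and huge
    in magnitude, costs far more bytes than a realistic offset:
    
        // Zigzag mapping used by the Thrift compact protocol: small
        // magnitudes (positive or negative) map to small unsigned values.
        fn zigzag(n: i64) -> u64 {
            ((n as u64) << 1) ^ ((n >> 63) as u64)
        }
        
        // Number of bytes a ULEB128 varint needs to encode `v`.
        fn varint_len(mut v: u64) -> usize {
            let mut len = 1;
            while v >= 0x80 {
                v >>= 7;
                len += 1;
            }
            len
        }
        
        fn main() {
            // A realistic page offset encodes in a few bytes...
            assert_eq!(varint_len(zigzag(1_000_000)), 3);
            // ...while a worst-case full-range i64 needs all 10.
            assert_eq!(varint_len(zigzag(i64::MIN)), 10);
        }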
    
    # Are these changes tested?
    
    No new tests are required; only benchmark code changes.
    
    # Are there any user-facing changes?
    
    No, only changes to benchmark output.
---
 parquet/benches/metadata.rs | 93 +++++++--------------------------------------
 parquet/src/thrift.rs       | 14 -------
 2 files changed, 14 insertions(+), 93 deletions(-)

diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs
index 8c886e4d5e..b05d6c33a9 100644
--- a/parquet/benches/metadata.rs
+++ b/parquet/benches/metadata.rs
@@ -86,13 +86,13 @@ fn encoded_meta() -> Vec<u8> {
                         encodings: vec![Encoding::PLAIN, Encoding::RLE_DICTIONARY],
                         path_in_schema: vec![],
                         codec: CompressionCodec::UNCOMPRESSED,
-                        num_values: rng.random(),
-                        total_uncompressed_size: rng.random(),
-                        total_compressed_size: rng.random(),
+                        num_values: rng.random_range(1..1000000),
+                        total_uncompressed_size: rng.random_range(100000..100000000),
+                        total_compressed_size: rng.random_range(50000..5000000),
                         key_value_metadata: None,
-                        data_page_offset: rng.random(),
-                        index_page_offset: Some(rng.random()),
-                        dictionary_page_offset: Some(rng.random()),
+                        data_page_offset: rng.random_range(4..2000000000),
+                        index_page_offset: None,
+                        dictionary_page_offset: Some(rng.random_range(4..2000000000)),
                         statistics: Some(stats.clone()),
                         encoding_stats: Some(vec![
                             PageEncodingStats {
@@ -111,10 +111,10 @@ fn encoded_meta() -> Vec<u8> {
                         size_statistics: None,
                         geospatial_statistics: None,
                     }),
-                    offset_index_offset: Some(rng.random()),
-                    offset_index_length: Some(rng.random()),
-                    column_index_offset: Some(rng.random()),
-                    column_index_length: Some(rng.random()),
+                    offset_index_offset: Some(rng.random_range(0..2000000000)),
+                    offset_index_length: Some(rng.random_range(1..100000)),
+                    column_index_offset: Some(rng.random_range(0..2000000000)),
+                    column_index_length: Some(rng.random_range(1..100000)),
                     crypto_metadata: None,
                     encrypted_column_metadata: None,
                 })
@@ -122,11 +122,11 @@ fn encoded_meta() -> Vec<u8> {
 
             RowGroup {
                 columns,
-                total_byte_size: rng.random(),
-                num_rows: rng.random(),
+                total_byte_size: rng.random_range(1..2000000000),
+                num_rows: rng.random_range(1..10000000000),
                 sorting_columns: None,
                 file_offset: None,
-                total_compressed_size: Some(rng.random()),
+                total_compressed_size: Some(rng.random_range(1..1000000000)),
                 ordinal: Some(i as _),
             }
         })
@@ -136,7 +136,7 @@ fn encoded_meta() -> Vec<u8> {
         schema,
         row_groups,
         version: 1,
-        num_rows: rng.random(),
+        num_rows: rng.random_range(1..2000000000),
         key_value_metadata: None,
         created_by: Some("parquet-rs".into()),
         column_orders: None,
@@ -163,36 +163,6 @@ fn get_footer_bytes(data: Bytes) -> Bytes {
     data.slice(meta_start..meta_end)
 }
 
-#[cfg(feature = "arrow")]
-fn rewrite_file(bytes: Bytes) -> (Bytes, FileMetaData) {
-    use arrow::array::RecordBatchReader;
-    use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};
-    use parquet::file::properties::{EnabledStatistics, WriterProperties};
-
-    let parquet_reader = ParquetRecordBatchReaderBuilder::try_new(bytes)
-        .expect("parquet open")
-        .build()
-        .expect("parquet open");
-    let writer_properties = WriterProperties::builder()
-        .set_statistics_enabled(EnabledStatistics::Page)
-        .set_write_page_header_statistics(true)
-        .build();
-    let mut output = Vec::new();
-    let mut parquet_writer = ArrowWriter::try_new(
-        &mut output,
-        parquet_reader.schema(),
-        Some(writer_properties),
-    )
-    .expect("create arrow writer");
-
-    for maybe_batch in parquet_reader {
-        let batch = maybe_batch.expect("reading batch");
-        parquet_writer.write(&batch).expect("writing data");
-    }
-    let file_meta = parquet_writer.close().expect("finalizing file");
-    (output.into(), file_meta)
-}
-
 fn criterion_benchmark(c: &mut Criterion) {
     // Read file into memory to isolate filesystem performance
     let file = "../parquet-testing/data/alltypes_tiny_pages.parquet";
@@ -217,47 +187,12 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
-    c.bench_function("decode thrift file metadata", |b| {
-        b.iter(|| {
-            parquet::thrift::bench_file_metadata(&meta_data);
-        })
-    });
-
     let buf: Bytes = black_box(encoded_meta()).into();
     c.bench_function("decode parquet metadata (wide)", |b| {
         b.iter(|| {
             ParquetMetaDataReader::decode_metadata(&buf).unwrap();
         })
     });
-
-    c.bench_function("decode thrift file metadata (wide)", |b| {
-        b.iter(|| {
-            parquet::thrift::bench_file_metadata(&buf);
-        })
-    });
-
-    // rewrite file with page statistics. then read page headers.
-    #[cfg(feature = "arrow")]
-    let (file_bytes, metadata) = rewrite_file(data.clone());
-    #[cfg(feature = "arrow")]
-    c.bench_function("page headers", |b| {
-        b.iter(|| {
-            metadata.row_groups.iter().for_each(|rg| {
-                rg.columns.iter().for_each(|col| {
-                    if let Some(col_meta) = &col.meta_data {
-                        if let Some(dict_offset) = col_meta.dictionary_page_offset {
-                            parquet::thrift::bench_page_header(
-                                &file_bytes.slice(dict_offset as usize..),
-                            );
-                        }
-                        parquet::thrift::bench_page_header(
-                            &file_bytes.slice(col_meta.data_page_offset as usize..),
-                        );
-                    }
-                });
-            });
-        })
-    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs
index e16e394be2..1cbd47a900 100644
--- a/parquet/src/thrift.rs
+++ b/parquet/src/thrift.rs
@@ -33,20 +33,6 @@ pub trait TSerializable: Sized {
     fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> thrift::Result<()>;
 }
 
-// Public function to aid benchmarking. Reads Parquet `FileMetaData` encoded in `bytes`.
-#[doc(hidden)]
-pub fn bench_file_metadata(bytes: &bytes::Bytes) {
-    let mut input = TCompactSliceInputProtocol::new(bytes);
-    crate::format::FileMetaData::read_from_in_protocol(&mut input).unwrap();
-}
-
-// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`.
-#[doc(hidden)]
-pub fn bench_page_header(bytes: &bytes::Bytes) {
-    let mut prot = TCompactSliceInputProtocol::new(bytes);
-    crate::format::PageHeader::read_from_in_protocol(&mut prot).unwrap();
-}
-
 /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice
 ///
 /// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol
