This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new e2b264f1b8 Optimize data page statistics conversion (up to 4x) (#9303)
e2b264f1b8 is described below

commit e2b264f1b8c8593454bdbfeb4845947c97d4c98a
Author: Daniël Heres <[email protected]>
AuthorDate: Sat Jan 31 13:40:33 2026 +0100

    Optimize data page statistics conversion (up to 4x) (#9303)
    
    # Which issue does this PR close?
    
    
    - Closes #9306
    
    # Rationale for this change
    
    Loading statistics is notably inefficient. This makes the conversion from
    the structure to arrow arrays a bit faster by avoiding allocations,
    until we get a more efficient encoding directly
    (https://github.com/apache/arrow-rs/issues/9296)
    
    <details>
    
    ```
    Extract data page statistics for Int64/extract_statistics/Int64
                            time:   [5.2223 µs 5.2589 µs 5.3230 µs]
                            change: [−39.253% −38.205% −37.016%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 7 outliers among 100 measurements (7.00%)
      2 (2.00%) low mild
      3 (3.00%) high mild
      2 (2.00%) high severe
    
    Extract data page statistics for UInt64/extract_statistics/UInt64
                            time:   [5.1035 µs 5.2173 µs 5.3576 µs]
                            change: [−32.745% −31.758% −30.535%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 14 outliers among 100 measurements (14.00%)
      8 (8.00%) high mild
      6 (6.00%) high severe
    
    Extract data page statistics for F64/extract_statistics/F64
                            time:   [6.1922 µs 6.2021 µs 6.2130 µs]
                            change: [−30.749% −29.405% −28.469%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 3 outliers among 100 measurements (3.00%)
      1 (1.00%) high mild
      2 (2.00%) high severe
    
    Extract data page statistics for String/extract_statistics/String
                            time:   [10.924 µs 10.965 µs 11.008 µs]
                            change: [−64.471% −64.330% −64.206%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 14 outliers among 100 measurements (14.00%)
      11 (11.00%) high mild
      3 (3.00%) high severe
    
    Extract data page statistics for Dictionary(Int32, String)/extract_statistics/Dictionary(Int32, Stri...
                            time:   [10.885 µs 10.905 µs 10.928 µs]
                            change: [−64.444% −64.362% −64.285%] (p = 0.00 < 0.05)
                            Performance has improved.
    ```
    
    </details>
    
    # What changes are included in this PR?
    
    Converts the inefficient iterator-based code (which doesn't really fit
    the iterator pattern well) to just traverse the values and use the
    builders. (I think it's just converting a bunch of ugly code to another
    bunch of ugly code).
    Additionally tries to preallocate where possible.
    
    # Are these changes tested?
    Existing tests
    
    
    # Are there any user-facing changes?
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/src/builder/primitive_builder.rs |   20 +
 parquet/src/arrow/arrow_reader/statistics.rs | 1104 +++++++++++++++-----------
 parquet/src/file/page_index/column_index.rs  |   19 +-
 3 files changed, 670 insertions(+), 473 deletions(-)

diff --git a/arrow-array/src/builder/primitive_builder.rs 
b/arrow-array/src/builder/primitive_builder.rs
index 049cef241c..dc96486bca 100644
--- a/arrow-array/src/builder/primitive_builder.rs
+++ b/arrow-array/src/builder/primitive_builder.rs
@@ -260,6 +260,26 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
         self.values_builder.extend_from_slice(values);
     }
 
+    /// Appends values from an iterator of `Option<T::Native>`.
+    ///
+    /// Each `Some(v)` is appended as a valid value. Each `None` is appended
+    /// as a null, with `T::Native::default()` stored in the values buffer
+    /// as a placeholder.
+    #[inline]
+    pub fn extend_from_iter_option<I: IntoIterator<Item = 
Option<T::Native>>>(&mut self, iter: I) {
+        let iter = iter.into_iter();
+        self.values_builder.extend(iter.map(|v| match v {
+            Some(v) => {
+                self.null_buffer_builder.append_non_null();
+                v
+            }
+            None => {
+                self.null_buffer_builder.append_null();
+                T::Native::default()
+            }
+        }));
+    }
+
     /// Appends array values and null to this builder as is
     /// (this means that underlying null values are copied as is).
     ///
diff --git a/parquet/src/arrow/arrow_reader/statistics.rs 
b/parquet/src/arrow/arrow_reader/statistics.rs
index 2f46c96be6..19d3e34f52 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -22,15 +22,19 @@
 use crate::arrow::buffer::bit_util::sign_extend_be;
 use crate::arrow::parquet_column;
 use crate::basic::Type as PhysicalType;
-use crate::data_type::{ByteArray, FixedLenByteArray};
 use crate::errors::{ParquetError, Result};
 use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, 
RowGroupMetaData};
-use crate::file::page_index::column_index::{ColumnIndexIterators, 
ColumnIndexMetaData};
+use crate::file::page_index::column_index::ColumnIndexMetaData;
 use crate::file::statistics::Statistics as ParquetStatistics;
 use crate::schema::types::SchemaDescriptor;
 use arrow_array::builder::{
-    BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, 
LargeStringBuilder, StringBuilder,
-    StringViewBuilder,
+    BinaryBuilder, BinaryViewBuilder, BooleanBuilder, Date32Builder, 
Date64Builder,
+    Decimal32Builder, Decimal64Builder, FixedSizeBinaryBuilder, 
Float16Builder, Float32Builder,
+    Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, 
LargeBinaryBuilder,
+    LargeStringBuilder, StringBuilder, StringViewBuilder, 
Time32MillisecondBuilder,
+    Time32SecondBuilder, Time64MicrosecondBuilder, Time64NanosecondBuilder,
+    TimestampMicrosecondBuilder, TimestampMillisecondBuilder, 
TimestampNanosecondBuilder,
+    TimestampSecondBuilder, UInt8Builder, UInt16Builder, UInt32Builder, 
UInt64Builder,
 };
 use arrow_array::{
     ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, 
Decimal32Array, Decimal64Array,
@@ -38,9 +42,9 @@ use arrow_array::{
     Int16Array, Int32Array, Int64Array, LargeBinaryArray, 
Time32MillisecondArray,
     Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, 
TimestampMicrosecondArray,
     TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, 
UInt8Array,
-    UInt16Array, UInt32Array, UInt64Array, new_empty_array, new_null_array,
+    UInt16Array, UInt32Array, UInt64Array, new_null_array,
 };
-use arrow_buffer::i256;
+use arrow_buffer::{NullBufferBuilder, i256};
 use arrow_schema::{DataType, Field, Schema, TimeUnit};
 use half::f16;
 use paste::paste;
@@ -596,473 +600,641 @@ macro_rules! get_statistics {
         }}}
 }
 
-macro_rules! make_data_page_stats_iterator {
-    ($iterator_type: ident, $func: ident, $stat_value_type: ty) => {
-        struct $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            iter: I,
-        }
-
-        impl<'a, I> $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            fn new(iter: I) -> Self {
-                Self { iter }
-            }
-        }
-
-        impl<'a, I> Iterator for $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            type Item = Vec<Option<$stat_value_type>>;
-
-            fn next(&mut self) -> Option<Self::Item> {
-                let next = self.iter.next();
-                match next {
-                    Some((len, index)) => match index {
-                        // No matching `Index` found;
-                        // thus no statistics that can be extracted.
-                        // We return vec![None; len] to effectively
-                        // create an arrow null-array with the length
-                        // corresponding to the number of entries in
-                        // `ParquetOffsetIndex` per row group per column.
-                        ColumnIndexMetaData::NONE => Some(vec![None; len]),
-                        _ => 
Some(<$stat_value_type>::$func(&index).collect::<Vec<_>>()),
-                    },
-                    _ => None,
-                }
-            }
-
-            fn size_hint(&self) -> (usize, Option<usize>) {
-                self.iter.size_hint()
-            }
-        }
-    };
-}
-
-make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator, 
min_values_iter, bool);
-make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator, 
max_values_iter, bool);
-make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter, 
i32);
-make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter, 
i32);
-make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter, 
i64);
-make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter, 
i64);
-make_data_page_stats_iterator!(
-    MinFloat16DataPageStatsIterator,
-    min_values_iter,
-    FixedLenByteArray
-);
-make_data_page_stats_iterator!(
-    MaxFloat16DataPageStatsIterator,
-    max_values_iter,
-    FixedLenByteArray
-);
-make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator, 
min_values_iter, f32);
-make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator, 
max_values_iter, f32);
-make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator, 
min_values_iter, f64);
-make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator, 
max_values_iter, f64);
-make_data_page_stats_iterator!(
-    MinByteArrayDataPageStatsIterator,
-    min_values_iter,
-    ByteArray
-);
-make_data_page_stats_iterator!(
-    MaxByteArrayDataPageStatsIterator,
-    max_values_iter,
-    ByteArray
-);
-make_data_page_stats_iterator!(
-    MaxFixedLenByteArrayDataPageStatsIterator,
-    max_values_iter,
-    FixedLenByteArray
-);
-
-make_data_page_stats_iterator!(
-    MinFixedLenByteArrayDataPageStatsIterator,
-    min_values_iter,
-    FixedLenByteArray
-);
-
-macro_rules! get_decimal_page_stats_iterator {
-    ($iterator_type: ident, $func: ident, $stat_value_type: ident, 
$convert_func: ident) => {
-        struct $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            iter: I,
-        }
-
-        impl<'a, I> $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            fn new(iter: I) -> Self {
-                Self { iter }
-            }
-        }
-
-        impl<'a, I> Iterator for $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            type Item = Vec<Option<$stat_value_type>>;
-
-            // Some(native_index.$func().map(|v| 
v.map($conv)).collect::<Vec<_>>())
-            fn next(&mut self) -> Option<Self::Item> {
-                let next = self.iter.next();
-                match next {
-                    Some((len, index)) => match index {
-                        ColumnIndexMetaData::INT32(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| $stat_value_type::from(*x)))
-                                .collect::<Vec<_>>(),
-                        ),
-                        ColumnIndexMetaData::INT64(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| 
$stat_value_type::try_from(*x).unwrap()))
-                                .collect::<Vec<_>>(),
-                        ),
-                        ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| $convert_func(x)))
-                                .collect::<Vec<_>>(),
-                        ),
-                        
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| $convert_func(x)))
-                                .collect::<Vec<_>>(),
-                        ),
-                        _ => Some(vec![None; len]),
-                    },
-                    _ => None,
-                }
-            }
-
-            fn size_hint(&self) -> (usize, Option<usize>) {
-                self.iter.size_hint()
-            }
-        }
-    };
-}
-
-get_decimal_page_stats_iterator!(
-    MinDecimal32DataPageStatsIterator,
-    min_values_iter,
-    i32,
-    from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal32DataPageStatsIterator,
-    max_values_iter,
-    i32,
-    from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
-    MinDecimal64DataPageStatsIterator,
-    min_values_iter,
-    i64,
-    from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal64DataPageStatsIterator,
-    max_values_iter,
-    i64,
-    from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
-    MinDecimal128DataPageStatsIterator,
-    min_values_iter,
-    i128,
-    from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal128DataPageStatsIterator,
-    max_values_iter,
-    i128,
-    from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
-    MinDecimal256DataPageStatsIterator,
-    min_values_iter,
-    i256,
-    from_bytes_to_i256
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal256DataPageStatsIterator,
-    max_values_iter,
-    i256,
-    from_bytes_to_i256
-);
-
 macro_rules! get_data_page_statistics {
     ($stat_type_prefix: ident, $data_type: ident, $iterator: ident, 
$physical_type: ident) => {
-        paste! {
-            match $data_type {
+        {
+            let chunks: Vec<(usize, &ColumnIndexMetaData)> = 
$iterator.collect();
+            let capacity: usize = chunks.iter().map(|c| c.0).sum();
+            paste! {
+                match $data_type {
                 DataType::Boolean => {
-                    let iterator = [<$stat_type_prefix 
BooleanDataPageStatsIterator>]::new($iterator);
-                    let mut builder = BooleanBuilder::new();
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-                            builder.append_value(x);
+                    let mut b = BooleanBuilder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BOOLEAN(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    b.append_option(val.copied());
+                                }
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::UInt8 => {
+                    let mut b = UInt8Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.and_then(|&x| 
u8::try_from(x).ok())),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::UInt16 => {
+                    let mut b = UInt16Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                     index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.and_then(|&x| 
u16::try_from(x).ok())),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::UInt32 => {
+                    let mut b = UInt32Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|&x| x as u32)),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::UInt64 => {
+                    let mut b = UInt64Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|&x| x as u64)),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Int8 => {
+                    let mut b = Int8Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.and_then(|&x| 
i8::try_from(x).ok())),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Int16 => {
+                    let mut b = Int16Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.and_then(|&x| 
i16::try_from(x).ok())),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Int32 => {
+                    let mut b = Int32Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Int64 => {
+                    let mut b = Int64Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Float16 => {
+                    let mut b = Float16Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) 
=> {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.and_then(|x| 
from_bytes_to_f16(x))),
+                                );
+                            }
+                            _ => b.append_nulls(len),
                         }
                     }
-                    Ok(Arc::new(builder.finish()))
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Float32 => {
+                    let mut b = Float32Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::FLOAT(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Float64 => {
+                    let mut b = Float64Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::DOUBLE(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Binary => {
+                    let mut b = BinaryBuilder::with_capacity(capacity, 
capacity * 10);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    b.append_option(val.map(|x| x.as_ref()));
+                                }
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::LargeBinary => {
+                    let mut b = LargeBinaryBuilder::with_capacity(capacity, 
capacity * 10);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    b.append_option(val.map(|x| x.as_ref()));
+                                }
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
                 },
-                DataType::UInt8 => Ok(Arc::new(
-                    UInt8Array::from_iter(
-                        [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| u8::try_from(x).ok())
-                                })
-                            })
-                            .flatten()
-                    )
-                )),
-                DataType::UInt16 => Ok(Arc::new(
-                    UInt16Array::from_iter(
-                        [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| u16::try_from(x).ok())
-                                })
-                            })
-                            .flatten()
-                    )
-                )),
-                DataType::UInt32 => Ok(Arc::new(
-                    UInt32Array::from_iter(
-                        [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| Some(x as u32))
-                                })
-                            })
-                            .flatten()
-                ))),
-                DataType::UInt64 => Ok(Arc::new(
-                    UInt64Array::from_iter(
-                        [<$stat_type_prefix 
Int64DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| Some(x as u64))
-                                })
-                            })
-                            .flatten()
-                ))),
-                DataType::Int8 => Ok(Arc::new(
-                    Int8Array::from_iter(
-                        [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| i8::try_from(x).ok())
-                                })
-                            })
-                            .flatten()
-                    )
-                )),
-                DataType::Int16 => Ok(Arc::new(
-                    Int16Array::from_iter(
-                        [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| i16::try_from(x).ok())
-                                })
-                            })
-                            .flatten()
-                    )
-                )),
-                DataType::Int32 => 
Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::Int64 => 
Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix 
Int64DataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::Float16 => Ok(Arc::new(
-                    Float16Array::from_iter(
-                        [<$stat_type_prefix 
Float16DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter().map(|x| {
-                                    x.and_then(|x| from_bytes_to_f16(x.data()))
-                                })
-                            })
-                            .flatten()
-                    )
-                )),
-                DataType::Float32 => 
Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix 
Float32DataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::Float64 => 
Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix 
Float64DataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::Binary => 
Ok(Arc::new(BinaryArray::from_iter([<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::LargeBinary => 
Ok(Arc::new(LargeBinaryArray::from_iter([<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
                 DataType::Utf8 => {
-                    let mut builder = StringBuilder::new();
-                    let iterator = [<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator);
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-
-                            let Ok(x) = std::str::from_utf8(x.data()) else {
-                                builder.append_null();
-                                continue;
-                            };
-
-                            builder.append_value(x);
+                    let mut b = StringBuilder::with_capacity(capacity, 
capacity * 10);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    match val {
+                                        Some(x) => match 
std::str::from_utf8(x.as_ref()) {
+                                            Ok(s) => b.append_value(s),
+                                            _ => b.append_null(),
+                                        }
+                                        None => b.append_null(),
+                                    }
+                                }
+                            }
+                            _ => b.append_nulls(len),
                         }
                     }
-                    Ok(Arc::new(builder.finish()))
+                    Ok(Arc::new(b.finish()))
                 },
                 DataType::LargeUtf8 => {
-                    let mut builder = LargeStringBuilder::new();
-                    let iterator = [<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator);
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-
-                            let Ok(x) = std::str::from_utf8(x.data()) else {
-                                builder.append_null();
-                                continue;
-                            };
-
-                            builder.append_value(x);
+                    let mut b = LargeStringBuilder::with_capacity(capacity, 
capacity * 10);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    match val {
+                                        Some(x) => match 
std::str::from_utf8(x.as_ref()) {
+                                            Ok(s) => b.append_value(s),
+                                            _ => b.append_null(),
+                                        }
+                                        None => b.append_null(),
+                                    }
+                                }
+                            }
+                            _ => b.append_nulls(len),
                         }
                     }
-                    Ok(Arc::new(builder.finish()))
+                    Ok(Arc::new(b.finish()))
                 },
                 DataType::Dictionary(_, value_type) => {
-                    [<$stat_type_prefix:lower _ page_statistics>](value_type, 
$iterator, $physical_type)
+                    [<$stat_type_prefix:lower _ page_statistics>](value_type, 
chunks.into_iter(), $physical_type)
                 },
                 DataType::Timestamp(unit, timezone) => {
-                    let iter = [<$stat_type_prefix 
Int64DataPageStatsIterator>]::new($iterator).flatten();
-                    Ok(match unit {
-                        TimeUnit::Second => 
Arc::new(TimestampSecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
-                        TimeUnit::Millisecond => 
Arc::new(TimestampMillisecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
-                        TimeUnit::Microsecond => 
Arc::new(TimestampMicrosecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
-                        TimeUnit::Nanosecond => 
Arc::new(TimestampNanosecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
-                    })
+                    match unit {
+                        TimeUnit::Second => {
+                            let mut b = 
TimestampSecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT64(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            
Ok(Arc::new(b.finish().with_timezone_opt(timezone.clone())))
+                        }
+                        TimeUnit::Millisecond => {
+                            let mut b = 
TimestampMillisecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT64(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            
Ok(Arc::new(b.finish().with_timezone_opt(timezone.clone())))
+                        }
+                        TimeUnit::Microsecond => {
+                            let mut b = 
TimestampMicrosecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT64(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            
Ok(Arc::new(b.finish().with_timezone_opt(timezone.clone())))
+                        }
+                        TimeUnit::Nanosecond => {
+                            let mut b = 
TimestampNanosecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT64(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            
Ok(Arc::new(b.finish().with_timezone_opt(timezone.clone())))
+                        }
+                    }
+                },
+                DataType::Date32 => {
+                    let mut b = Date32Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Date64 if $physical_type == 
Some(PhysicalType::INT32)=> {
+                    let mut b = Date64Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|&x| (x as i64) * 
24 * 60 * 60 * 1000)),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Date64 if $physical_type == 
Some(PhysicalType::INT64) => {
+                    let mut b = Date64Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::Decimal32(precision, scale) => {
+                    let mut b = Decimal32Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.and_then(|&x| 
i32::try_from(x).ok())),
+                                );
+                            }
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i32(x.as_ref()))),
+                                );
+                            }
+                            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) 
=> {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i32(x.as_ref()))),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.with_precision_and_scale(*precision, 
*scale)?.finish()))
+                },
+                DataType::Decimal64(precision, scale) => {
+                    let mut b = Decimal64Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| *x as i64)),
+                                );
+                            }
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.copied()),
+                                );
+                            }
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i64(x.as_ref()))),
+                                );
+                            }
+                            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) 
=> {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i64(x.as_ref()))),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.with_precision_and_scale(*precision, 
*scale)?.finish()))
+                },
+                DataType::Decimal128(precision, scale) => {
+                    let mut b = Decimal128Array::builder(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| *x as i128)),
+                                );
+                            }
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| *x as i128)),
+                                );
+                            }
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i128(x.as_ref()))),
+                                );
+                            }
+                            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) 
=> {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i128(x.as_ref()))),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.with_precision_and_scale(*precision, 
*scale)?.finish()))
+                },
+                DataType::Decimal256(precision, scale) => {
+                    let mut b = Decimal256Array::builder(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
i256::from_i128(*x as i128))),
+                                );
+                            }
+                            ColumnIndexMetaData::INT64(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
i256::from_i128(*x as i128))),
+                                );
+                            }
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i256(x.as_ref()))),
+                                );
+                            }
+                            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) 
=> {
+                                b.extend_from_iter_option(
+                                    index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                        .map(|val| val.map(|x| 
from_bytes_to_i256(x.as_ref()))),
+                                );
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.with_precision_and_scale(*precision, 
*scale)?.finish()))
                 },
-                DataType::Date32 => 
Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::Date64 if $physical_type == 
Some(PhysicalType::INT32)=> Ok(
-                    Arc::new(
-                        Date64Array::from_iter([<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
-                            .map(|x| {
-                                x.into_iter()
-                                .map(|x| {
-                                    x.and_then(|x| i64::try_from(x).ok())
-                                })
-                                .map(|x| x.map(|x| x * 24 * 60 * 60 * 1000))
-                            }).flatten()
-                        )
-                    )
-                ),
-                DataType::Date64 if $physical_type == 
Some(PhysicalType::INT64) => 
Ok(Arc::new(Date64Array::from_iter([<$stat_type_prefix 
Int64DataPageStatsIterator>]::new($iterator).flatten()))),
-                DataType::Decimal32(precision, scale) => Ok(Arc::new(
-                    Decimal32Array::from_iter([<$stat_type_prefix 
Decimal32DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision,
 *scale)?)),
-                DataType::Decimal64(precision, scale) => Ok(Arc::new(
-                    Decimal64Array::from_iter([<$stat_type_prefix 
Decimal64DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision,
 *scale)?)),
-                DataType::Decimal128(precision, scale) => Ok(Arc::new(
-                    Decimal128Array::from_iter([<$stat_type_prefix 
Decimal128DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision,
 *scale)?)),
-                DataType::Decimal256(precision, scale) => Ok(Arc::new(
-                    Decimal256Array::from_iter([<$stat_type_prefix 
Decimal256DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision,
 *scale)?)),
                 DataType::Time32(unit) => {
-                    Ok(match unit {
-                        TimeUnit::Second =>  
Arc::new(Time32SecondArray::from_iter(
-                            [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator).flatten(),
-                        )),
-                        TimeUnit::Millisecond => 
Arc::new(Time32MillisecondArray::from_iter(
-                            [<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator).flatten(),
-                        )),
+                    match unit {
+                        TimeUnit::Second => {
+                            let mut b = 
Time32SecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT32(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            Ok(Arc::new(b.finish()))
+                        }
+                        TimeUnit::Millisecond => {
+                            let mut b = 
Time32MillisecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT32(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            Ok(Arc::new(b.finish()))
+                        }
                         _ => {
-                            // don't know how to extract statistics, so return 
an empty array
-                            new_empty_array(&DataType::Time32(unit.clone()))
+                            Ok(new_null_array($data_type, capacity))
                         }
-                    })
+                    }
                 }
                 DataType::Time64(unit) => {
-                    Ok(match unit {
-                        TimeUnit::Microsecond =>  
Arc::new(Time64MicrosecondArray::from_iter(
-                            [<$stat_type_prefix 
Int64DataPageStatsIterator>]::new($iterator).flatten(),
-                        )),
-                        TimeUnit::Nanosecond => 
Arc::new(Time64NanosecondArray::from_iter(
-                            [<$stat_type_prefix 
Int64DataPageStatsIterator>]::new($iterator).flatten(),
-                        )),
+                    match unit {
+                        TimeUnit::Microsecond => {
+                            let mut b = 
Time64MicrosecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT64(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            Ok(Arc::new(b.finish()))
+                        }
+                        TimeUnit::Nanosecond => {
+                            let mut b = 
Time64NanosecondBuilder::with_capacity(capacity);
+                            for (len, index) in chunks {
+                                match index {
+                                    ColumnIndexMetaData::INT64(index) => {
+                                        b.extend_from_iter_option(
+                                            index.[<$stat_type_prefix:lower 
_values_iter>]()
+                                                .map(|val| val.copied()),
+                                        );
+                                    }
+                                    _ => b.append_nulls(len),
+                                }
+                            }
+                            Ok(Arc::new(b.finish()))
+                        }
                         _ => {
-                            // don't know how to extract statistics, so return 
an empty array
-                            new_empty_array(&DataType::Time64(unit.clone()))
+                            Ok(new_null_array($data_type, capacity))
                         }
-                    })
+                    }
                 },
                 DataType::FixedSizeBinary(size) => {
-                    let mut builder = FixedSizeBinaryBuilder::new(*size);
-                    let iterator = [<$stat_type_prefix 
FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-
-                            if x.len() == *size as usize {
-                                let _ = builder.append_value(x.data());
-                            } else {
-                                builder.append_null();
+                    let mut b = 
FixedSizeBinaryBuilder::with_capacity(capacity, *size);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) 
=> {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    match val {
+                                        Some(v) => {
+                                           if v.len() == *size as usize {
+                                               let _ = 
b.append_value(v.as_ref())?;
+                                           } else {
+                                               b.append_null();
+                                           }
+                                       }
+                                        None => b.append_null(),
+                                    }
+                                }
                             }
+                            _ => b.append_nulls(len),
                         }
                     }
-                    Ok(Arc::new(builder.finish()))
+                    Ok(Arc::new(b.finish()))
                 },
                 DataType::Utf8View => {
-                    let mut builder = StringViewBuilder::new();
-                    let iterator = [<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator);
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-
-                            let Ok(x) = std::str::from_utf8(x.data()) else {
-                                builder.append_null();
-                                continue;
-                            };
-
-                            builder.append_value(x);
+                    let mut b = StringViewBuilder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    match val {
+                                        Some(x) => match 
std::str::from_utf8(x.as_ref()) {
+                                            Ok(s) => b.append_value(s),
+                                            _ => b.append_null(),
+                                        }
+                                        None => b.append_null(),
+                                    }
+                                }
+                            }
+                            _ => {
+                                for _ in 0..len { b.append_null(); }
+                            }
                         }
                     }
-                    Ok(Arc::new(builder.finish()))
+                    Ok(Arc::new(b.finish()))
                 },
                 DataType::BinaryView => {
-                    let mut builder = BinaryViewBuilder::new();
-                    let iterator = [<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator);
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-
-                            builder.append_value(x);
+                    let mut b = BinaryViewBuilder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BYTE_ARRAY(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    match val {
+                                        Some(v) => b.append_value(v.as_ref()),
+                                        None => b.append_null(),
+                                    }
+                                }
+                            }
+                            _ => {
+                                for _ in 0..len { b.append_null(); }
+                            }
                         }
                     }
-                    Ok(Arc::new(builder.finish()))
+                    Ok(Arc::new(b.finish()))
                 },
                 DataType::Date64 |  // required to cover $physical_type match 
guard
                 DataType::Null |
@@ -1077,12 +1249,12 @@ macro_rules! get_data_page_statistics {
                 DataType::Union(_, _) |
                 DataType::Map(_, _) |
                 DataType::RunEndEncoded(_, _) => {
-                    let len = $iterator.count();
                     // don't know how to extract statistics, so return a null 
array
-                    Ok(new_null_array($data_type, len))
+                    Ok(new_null_array($data_type, capacity))
                 },
             }
         }
+        }
     }
 }
 /// Extracts the min statistics from an iterator of [`ParquetStatistics`] to an
@@ -1142,14 +1314,25 @@ pub(crate) fn null_counts_page_statistics<'a, 
I>(iterator: I) -> Result<UInt64Ar
 where
     I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
 {
-    let iter = iterator.flat_map(|(len, index)| match index {
-        ColumnIndexMetaData::NONE => vec![None; len],
-        column_index => column_index.null_counts().map_or(vec![None; len], |v| 
{
-            v.iter().map(|i| Some(*i as u64)).collect::<Vec<_>>()
-        }),
-    });
-
-    Ok(UInt64Array::from_iter(iter))
+    let chunks: Vec<_> = iterator.collect();
+    let total_capacity: usize = chunks.iter().map(|(len, _)| *len).sum();
+    let mut values = Vec::with_capacity(total_capacity);
+    let mut nulls = NullBufferBuilder::new(total_capacity);
+    for (len, index) in chunks {
+        match index.null_counts() {
+            Some(counts) => {
+                values.extend(counts.iter().map(|&x| x as u64));
+                nulls.append_n_non_nulls(len);
+            }
+            None => {
+                values.resize(values.len() + len, 0);
+                nulls.append_n_nulls(len);
+            }
+        }
+    }
+    let null_buffer = nulls.build();
+    let array = UInt64Array::new(values.into(), null_buffer);
+    Ok(array)
 }
 
 /// Extracts Parquet statistics as Arrow arrays
@@ -1588,10 +1771,7 @@ impl<'a> StatisticsConverter<'a> {
     {
         let Some(parquet_index) = self.parquet_column_index else {
             let num_row_groups = row_group_indices.into_iter().count();
-            return Ok(UInt64Array::from_iter(std::iter::repeat_n(
-                None,
-                num_row_groups,
-            )));
+            return Ok(UInt64Array::new_null(num_row_groups));
         };
 
         let iter = row_group_indices.into_iter().map(|rg_index| {
@@ -1639,7 +1819,8 @@ impl<'a> StatisticsConverter<'a> {
             return Ok(None);
         };
 
-        let mut row_count_total = Vec::new();
+        let mut row_counts = Vec::new();
+        let mut nulls = NullBufferBuilder::new(0);
         for rg_idx in row_group_indices {
             let page_locations = 
&column_offset_index[*rg_idx][parquet_index].page_locations();
 
@@ -1649,17 +1830,22 @@ impl<'a> StatisticsConverter<'a> {
 
             // append the last page row count
             let num_rows_in_row_group = &row_group_metadatas[*rg_idx].num_rows();
-            let row_count_per_page = row_count_per_page
-                .chain(std::iter::once(Some(
-                    *num_rows_in_row_group as u64
-                        - page_locations.last().unwrap().first_row_index as u64,
-                )))
-                .collect::<Vec<_>>();
-
-            row_count_total.extend(row_count_per_page);
+            let row_count_per_page = row_count_per_page.chain(std::iter::once(Some(
+                *num_rows_in_row_group as u64
+                    - page_locations.last().unwrap().first_row_index as u64,
+            )));
+
+            row_counts.extend(row_count_per_page.clone().map(|x| x.unwrap_or(0)));
+            for val in row_count_per_page {
+                if val.is_some() {
+                    nulls.append_non_null();
+                } else {
+                    nulls.append_null();
+                }
+            }
         }
 
-        Ok(Some(UInt64Array::from_iter(row_count_total)))
+        Ok(Some(UInt64Array::new(row_counts.into(), nulls.build())))
     }
 
     /// Returns a null array of data_type with one element per row group
diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs
index a41fefef26..d005044332 100644
--- a/parquet/src/file/page_index/column_index.rs
+++ b/parquet/src/file/page_index/column_index.rs
@@ -195,6 +195,7 @@ impl<T> PrimitiveColumnIndex<T> {
     /// Returns the min value for the page indexed by `idx`
     ///
     /// It is `None` when all values are null
+    #[inline]
     pub fn min_value(&self, idx: usize) -> Option<&T> {
         if self.null_pages[idx] {
             None
@@ -206,6 +207,7 @@ impl<T> PrimitiveColumnIndex<T> {
     /// Returns the max value for the page indexed by `idx`
     ///
     /// It is `None` when all values are null
+    #[inline]
     pub fn max_value(&self, idx: usize) -> Option<&T> {
         if self.null_pages[idx] {
             None
@@ -383,26 +385,14 @@ impl ByteArrayColumnIndex {
     ///
     /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
     pub fn min_values_iter(&self) -> impl Iterator<Item = Option<&[u8]>> {
-        (0..self.num_pages() as usize).map(|i| {
-            if self.is_null_page(i) {
-                None
-            } else {
-                self.min_value(i)
-            }
-        })
+        (0..self.num_pages() as usize).map(|i| self.min_value(i))
     }
 
     /// Returns an iterator over the max values.
     ///
     /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
     pub fn max_values_iter(&self) -> impl Iterator<Item = Option<&[u8]>> {
-        (0..self.num_pages() as usize).map(|i| {
-            if self.is_null_page(i) {
-                None
-            } else {
-                self.max_value(i)
-            }
-        })
+        (0..self.num_pages() as usize).map(|i| self.max_value(i))
     }
 }
 
@@ -596,6 +586,7 @@ impl ColumnIndexMetaData {
     }
 
     /// Returns whether the page indexed by `idx` consists of all null values
+    #[inline]
     pub fn is_null_page(&self, idx: usize) -> bool {
         colidx_enum_func!(self, is_null_page, idx)
     }

Reply via email to