This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch gh5854_thrift_remodel
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/gh5854_thrift_remodel by this
push:
new f777584af4 [thrift-remodel] PoC new form for column index (#8191)
f777584af4 is described below
commit f777584af4dab38c00e94ea9a045c3557f8069a4
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Aug 27 12:43:29 2025 -0700
[thrift-remodel] PoC new form for column index (#8191)
# Which issue does this PR close?
**Note: this targets a feature branch, not main**
- Part of #5854.
# Rationale for this change
Parsing the column index is _very_ slow. The largest part of the cost is
taking the thrift structure (which is a struct of arrays) and converting
it to an array of structs. This results in a large number of allocations
when dealing with binary columns.
This is an experiment in creating a new structure to hold the column
index info that is a little friendlier to parse. It may also be easier
to consume on the DataFusion side.
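For illustration, here is a rough sketch of the two layouts involved. The names
below are simplified stand-ins, not the actual definitions in the diff:

```rust
// Sketch only: simplified stand-ins for the real types touched by this PR.

// The Thrift ColumnIndex arrives as a struct of arrays, one entry per page:
struct ThriftLikeColumnIndex {
    null_pages: Vec<bool>,
    min_values: Vec<Vec<u8>>,
    max_values: Vec<Vec<u8>>,
}

// The current `Index` converts that into an array of per-page structs, so a
// binary column pays one allocation per page for every min and max value:
struct PerPageStats {
    min: Option<Vec<u8>>, // stand-in for Option<ByteArray>
    max: Option<Vec<u8>>,
    null_count: Option<i64>,
}
type ArrayOfStructs = Vec<PerPageStats>;

// The experiment keeps the struct-of-arrays shape and packs all byte-array
// min/max values into contiguous buffers addressed by offsets:
struct PackedByteArrayIndex {
    null_pages: Vec<bool>,
    min_bytes: Vec<u8>,
    min_offsets: Vec<usize>,
    max_bytes: Vec<u8>,
    max_offsets: Vec<usize>,
}
```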
# What changes are included in this PR?
A new `ColumnIndexMetaData` enum is added, along with a type-parameterized
`PrimitiveColumnIndex<T>` struct and a `ByteArrayColumnIndex` for binary columns.
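A rough, hedged sketch of how the new enum might be consumed; the accessors used
(`num_pages`, `null_count`, `min_values_iter`, `max_values_iter`) come from the
new `column_index.rs` below, but the signatures are experimental and may change:

```rust
use parquet::file::page_index::column_index::ColumnIndexMetaData;

// Sketch only: walk one column chunk's index. Typed variants expose per-page
// Option<&T> values (None for all-null pages) instead of per-page structs.
fn dump_int32_index(index: &ColumnIndexMetaData) {
    match index {
        ColumnIndexMetaData::NONE => println!("no column index"),
        ColumnIndexMetaData::INT32(idx) => {
            let mins = idx.min_values_iter();
            let maxes = idx.max_values_iter();
            for (page, (min, max)) in mins.zip(maxes).enumerate() {
                println!(
                    "page {page}: min={min:?} max={max:?} nulls={:?}",
                    idx.null_count(page)
                );
            }
        }
        other => println!("{} pages of another physical type", other.num_pages()),
    }
}
```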
# Are these changes tested?
No, this is an experiment only. If this work can be honed into an
acceptable `Index` replacement, then tests will be added at that time.
# Are there any user-facing changes?
Yes, this would be a radical change to the column indexes in
`ParquetMetaData`.
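For example, downstream code that currently matches on `Index` would migrate
roughly as follows (a sketch based on the test updates in the diff, not a
definitive guide):

```rust
use parquet::file::metadata::ParquetColumnIndex;
use parquet::file::page_index::column_index::ColumnIndexMetaData;

// Sketch only: read the min value of the first page of the first column in
// the first row group. `column_index` would come from
// `ParquetMetaData::column_index()`, whose type alias now holds
// `ColumnIndexMetaData` instead of `Index`.
fn first_byte_array_min(column_index: &ParquetColumnIndex) -> Option<Vec<u8>> {
    // Previously: `if let Index::BYTE_ARRAY(index) = ...` followed by
    // `index.indexes[0].min.as_ref()`.
    if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][0] {
        index.min_value(0).map(|v| v.to_vec())
    } else {
        None
    }
}
```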
---
parquet/src/arrow/arrow_reader/statistics.rs | 214 +++-------
parquet/src/arrow/arrow_writer/mod.rs | 13 +-
parquet/src/bin/parquet-index.rs | 89 ++++-
parquet/src/file/metadata/memory.rs | 50 +++
parquet/src/file/metadata/mod.rs | 24 +-
parquet/src/file/metadata/reader.rs | 8 +-
parquet/src/file/metadata/writer.rs | 35 +-
parquet/src/file/page_index/column_index.rs | 569 +++++++++++++++++++++++++++
parquet/src/file/page_index/index.rs | 5 +-
parquet/src/file/page_index/index_reader.rs | 40 +-
parquet/src/file/page_index/mod.rs | 1 +
parquet/src/file/serialized_reader.rs | 95 +++--
parquet/src/file/writer.rs | 28 +-
parquet/tests/arrow_reader/io/mod.rs | 10 +-
parquet/tests/encryption/encryption_util.rs | 10 +-
15 files changed, 908 insertions(+), 283 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/statistics.rs
b/parquet/src/arrow/arrow_reader/statistics.rs
index eba1f56120..1613656ab9 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType;
use crate::data_type::{ByteArray, FixedLenByteArray};
use crate::errors::{ParquetError, Result};
use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex,
RowGroupMetaData};
-use crate::file::page_index::index::{Index, PageIndex};
+use crate::file::page_index::column_index::{ColumnIndexIterators,
ColumnIndexMetaData};
use crate::file::statistics::Statistics as ParquetStatistics;
use crate::schema::types::SchemaDescriptor;
use arrow_array::builder::{
@@ -597,17 +597,17 @@ macro_rules! get_statistics {
}
macro_rules! make_data_page_stats_iterator {
- ($iterator_type: ident, $func: expr, $index_type: path, $stat_value_type:
ty) => {
+ ($iterator_type: ident, $func: ident, $stat_value_type: ty) => {
struct $iterator_type<'a, I>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
iter: I,
}
impl<'a, I> $iterator_type<'a, I>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
fn new(iter: I) -> Self {
Self { iter }
@@ -616,7 +616,7 @@ macro_rules! make_data_page_stats_iterator {
impl<'a, I> Iterator for $iterator_type<'a, I>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
type Item = Vec<Option<$stat_value_type>>;
@@ -624,16 +624,14 @@ macro_rules! make_data_page_stats_iterator {
let next = self.iter.next();
match next {
Some((len, index)) => match index {
- $index_type(native_index) => {
-
Some(native_index.indexes.iter().map($func).collect::<Vec<_>>())
- }
// No matching `Index` found;
// thus no statistics that can be extracted.
// We return vec![None; len] to effectively
// create an arrow null-array with the length
// corresponding to the number of entries in
// `ParquetOffsetIndex` per row group per column.
- _ => Some(vec![None; len]),
+ ColumnIndexMetaData::NONE => Some(vec![None; len]),
+ _ =>
Some(<$stat_value_type>::$func(&index).collect::<Vec<_>>()),
},
_ => None,
}
@@ -646,101 +644,45 @@ macro_rules! make_data_page_stats_iterator {
};
}
-make_data_page_stats_iterator!(
- MinBooleanDataPageStatsIterator,
- |x: &PageIndex<bool>| { x.min },
- Index::BOOLEAN,
- bool
-);
-make_data_page_stats_iterator!(
- MaxBooleanDataPageStatsIterator,
- |x: &PageIndex<bool>| { x.max },
- Index::BOOLEAN,
- bool
-);
-make_data_page_stats_iterator!(
- MinInt32DataPageStatsIterator,
- |x: &PageIndex<i32>| { x.min },
- Index::INT32,
- i32
-);
-make_data_page_stats_iterator!(
- MaxInt32DataPageStatsIterator,
- |x: &PageIndex<i32>| { x.max },
- Index::INT32,
- i32
-);
-make_data_page_stats_iterator!(
- MinInt64DataPageStatsIterator,
- |x: &PageIndex<i64>| { x.min },
- Index::INT64,
- i64
-);
-make_data_page_stats_iterator!(
- MaxInt64DataPageStatsIterator,
- |x: &PageIndex<i64>| { x.max },
- Index::INT64,
- i64
-);
+make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator,
min_values_iter, bool);
+make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator,
max_values_iter, bool);
+make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter,
i32);
+make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter,
i32);
+make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter,
i64);
+make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter,
i64);
make_data_page_stats_iterator!(
MinFloat16DataPageStatsIterator,
- |x: &PageIndex<FixedLenByteArray>| { x.min.clone() },
- Index::FIXED_LEN_BYTE_ARRAY,
+ min_values_iter,
FixedLenByteArray
);
make_data_page_stats_iterator!(
MaxFloat16DataPageStatsIterator,
- |x: &PageIndex<FixedLenByteArray>| { x.max.clone() },
- Index::FIXED_LEN_BYTE_ARRAY,
+ max_values_iter,
FixedLenByteArray
);
-make_data_page_stats_iterator!(
- MinFloat32DataPageStatsIterator,
- |x: &PageIndex<f32>| { x.min },
- Index::FLOAT,
- f32
-);
-make_data_page_stats_iterator!(
- MaxFloat32DataPageStatsIterator,
- |x: &PageIndex<f32>| { x.max },
- Index::FLOAT,
- f32
-);
-make_data_page_stats_iterator!(
- MinFloat64DataPageStatsIterator,
- |x: &PageIndex<f64>| { x.min },
- Index::DOUBLE,
- f64
-);
-make_data_page_stats_iterator!(
- MaxFloat64DataPageStatsIterator,
- |x: &PageIndex<f64>| { x.max },
- Index::DOUBLE,
- f64
-);
+make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator,
min_values_iter, f32);
+make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator,
max_values_iter, f32);
+make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator,
min_values_iter, f64);
+make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator,
max_values_iter, f64);
make_data_page_stats_iterator!(
MinByteArrayDataPageStatsIterator,
- |x: &PageIndex<ByteArray>| { x.min.clone() },
- Index::BYTE_ARRAY,
+ min_values_iter,
ByteArray
);
make_data_page_stats_iterator!(
MaxByteArrayDataPageStatsIterator,
- |x: &PageIndex<ByteArray>| { x.max.clone() },
- Index::BYTE_ARRAY,
+ max_values_iter,
ByteArray
);
make_data_page_stats_iterator!(
MaxFixedLenByteArrayDataPageStatsIterator,
- |x: &PageIndex<FixedLenByteArray>| { x.max.clone() },
- Index::FIXED_LEN_BYTE_ARRAY,
+ max_values_iter,
FixedLenByteArray
);
make_data_page_stats_iterator!(
MinFixedLenByteArrayDataPageStatsIterator,
- |x: &PageIndex<FixedLenByteArray>| { x.min.clone() },
- Index::FIXED_LEN_BYTE_ARRAY,
+ min_values_iter,
FixedLenByteArray
);
@@ -748,14 +690,14 @@ macro_rules! get_decimal_page_stats_iterator {
($iterator_type: ident, $func: ident, $stat_value_type: ident,
$convert_func: ident) => {
struct $iterator_type<'a, I>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
iter: I,
}
impl<'a, I> $iterator_type<'a, I>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
fn new(iter: I) -> Self {
Self { iter }
@@ -764,44 +706,37 @@ macro_rules! get_decimal_page_stats_iterator {
impl<'a, I> Iterator for $iterator_type<'a, I>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
type Item = Vec<Option<$stat_value_type>>;
+ // Some(native_index.$func().map(|v|
v.map($conv)).collect::<Vec<_>>())
fn next(&mut self) -> Option<Self::Item> {
let next = self.iter.next();
match next {
Some((len, index)) => match index {
- Index::INT32(native_index) => Some(
+ ColumnIndexMetaData::INT32(native_index) => Some(
native_index
- .indexes
- .iter()
- .map(|x| x.$func.and_then(|x|
Some($stat_value_type::from(x))))
+ .$func()
+ .map(|x| x.map(|x| $stat_value_type::from(*x)))
.collect::<Vec<_>>(),
),
- Index::INT64(native_index) => Some(
+ ColumnIndexMetaData::INT64(native_index) => Some(
native_index
- .indexes
- .iter()
- .map(|x| x.$func.and_then(|x|
$stat_value_type::try_from(x).ok()))
+ .$func()
+ .map(|x| x.map(|x|
$stat_value_type::try_from(*x).unwrap()))
.collect::<Vec<_>>(),
),
- Index::BYTE_ARRAY(native_index) => Some(
+ ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some(
native_index
- .indexes
- .iter()
- .map(|x| {
- x.clone().$func.and_then(|x|
Some($convert_func(x.data())))
- })
+ .$func()
+ .map(|x| x.map(|x| $convert_func(x)))
.collect::<Vec<_>>(),
),
- Index::FIXED_LEN_BYTE_ARRAY(native_index) => Some(
+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some(
native_index
- .indexes
- .iter()
- .map(|x| {
- x.clone().$func.and_then(|x|
Some($convert_func(x.data())))
- })
+ .$func()
+ .map(|x| x.map(|x| $convert_func(x)))
.collect::<Vec<_>>(),
),
_ => Some(vec![None; len]),
@@ -819,56 +754,56 @@ macro_rules! get_decimal_page_stats_iterator {
get_decimal_page_stats_iterator!(
MinDecimal32DataPageStatsIterator,
- min,
+ min_values_iter,
i32,
from_bytes_to_i32
);
get_decimal_page_stats_iterator!(
MaxDecimal32DataPageStatsIterator,
- max,
+ max_values_iter,
i32,
from_bytes_to_i32
);
get_decimal_page_stats_iterator!(
MinDecimal64DataPageStatsIterator,
- min,
+ min_values_iter,
i64,
from_bytes_to_i64
);
get_decimal_page_stats_iterator!(
MaxDecimal64DataPageStatsIterator,
- max,
+ max_values_iter,
i64,
from_bytes_to_i64
);
get_decimal_page_stats_iterator!(
MinDecimal128DataPageStatsIterator,
- min,
+ min_values_iter,
i128,
from_bytes_to_i128
);
get_decimal_page_stats_iterator!(
MaxDecimal128DataPageStatsIterator,
- max,
+ max_values_iter,
i128,
from_bytes_to_i128
);
get_decimal_page_stats_iterator!(
MinDecimal256DataPageStatsIterator,
- min,
+ min_values_iter,
i256,
from_bytes_to_i256
);
get_decimal_page_stats_iterator!(
MaxDecimal256DataPageStatsIterator,
- max,
+ max_values_iter,
i256,
from_bytes_to_i256
);
@@ -1174,77 +1109,44 @@ fn max_statistics<'a, I: Iterator<Item = Option<&'a
ParquetStatistics>>>(
}
/// Extracts the min statistics from an iterator
-/// of parquet page [`Index`]'es to an [`ArrayRef`]
+/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`]
pub(crate) fn min_page_statistics<'a, I>(
data_type: &DataType,
iterator: I,
physical_type: Option<PhysicalType>,
) -> Result<ArrayRef>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
get_data_page_statistics!(Min, data_type, iterator, physical_type)
}
/// Extracts the max statistics from an iterator
-/// of parquet page [`Index`]'es to an [`ArrayRef`]
+/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`]
pub(crate) fn max_page_statistics<'a, I>(
data_type: &DataType,
iterator: I,
physical_type: Option<PhysicalType>,
) -> Result<ArrayRef>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
get_data_page_statistics!(Max, data_type, iterator, physical_type)
}
/// Extracts the null count statistics from an iterator
-/// of parquet page [`Index`]'es to an [`ArrayRef`]
+/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`]
///
/// The returned Array is an [`UInt64Array`]
pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) ->
Result<UInt64Array>
where
- I: Iterator<Item = (usize, &'a Index)>,
+ I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
let iter = iterator.flat_map(|(len, index)| match index {
- Index::NONE => vec![None; len],
- Index::BOOLEAN(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- Index::INT32(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- Index::INT64(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- Index::FLOAT(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- Index::DOUBLE(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- Index::BYTE_ARRAY(native_index) => native_index
- .indexes
- .iter()
- .map(|x| x.null_count.map(|x| x as u64))
- .collect::<Vec<_>>(),
- _ => unimplemented!(),
+ ColumnIndexMetaData::NONE => vec![None; len],
+ column_index => column_index.null_counts().map_or(vec![None; len], |v|
{
+ v.iter().map(|i| Some(*i as u64)).collect::<Vec<_>>()
+ }),
});
Ok(UInt64Array::from_iter(iter))
@@ -1573,7 +1475,7 @@ impl<'a> StatisticsConverter<'a> {
/// page level statistics can prune at a finer granularity.
///
/// However since they are stored in a separate metadata
- /// structure ([`Index`]) there is different code to extract them as
+ /// structure ([`ColumnIndexMetaData`]) there is different code to extract
them as
/// compared to arrow statistics.
///
/// # Parameters:
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index c6b0b426f9..bd9f30c361 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1488,6 +1488,7 @@ mod tests {
use crate::arrow::ARROW_SCHEMA_META_KEY;
use crate::column::page::{Page, PageReader};
use crate::file::page_encoding_stats::PageEncodingStats;
+ use crate::file::page_index::column_index::ColumnIndexMetaData;
use crate::file::reader::SerializedPageReader;
use crate::format::PageHeader;
use crate::schema::types::ColumnPath;
@@ -1507,7 +1508,6 @@ mod tests {
use crate::basic::Encoding;
use crate::data_type::AsBytes;
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData,
ParquetMetaDataReader};
- use crate::file::page_index::index::Index;
use crate::file::properties::{
BloomFilterPosition, EnabledStatistics, ReaderProperties,
WriterVersion,
};
@@ -4002,9 +4002,12 @@ mod tests {
assert_eq!(column_index[0].len(), 2); // 2 columns
let a_idx = &column_index[0][0];
- assert!(matches!(a_idx, Index::BYTE_ARRAY(_)), "{a_idx:?}");
+ assert!(
+ matches!(a_idx, ColumnIndexMetaData::BYTE_ARRAY(_)),
+ "{a_idx:?}"
+ );
let b_idx = &column_index[0][1];
- assert!(matches!(b_idx, Index::NONE), "{b_idx:?}");
+ assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}");
}
#[test]
@@ -4070,9 +4073,9 @@ mod tests {
assert_eq!(column_index[0].len(), 2); // 2 columns
let a_idx = &column_index[0][0];
- assert!(matches!(a_idx, Index::NONE), "{a_idx:?}");
+ assert!(matches!(a_idx, ColumnIndexMetaData::NONE), "{a_idx:?}");
let b_idx = &column_index[0][1];
- assert!(matches!(b_idx, Index::NONE), "{b_idx:?}");
+ assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}");
}
#[test]
diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs
index e91f5e5a9f..397a75c76a 100644
--- a/parquet/src/bin/parquet-index.rs
+++ b/parquet/src/bin/parquet-index.rs
@@ -35,8 +35,11 @@
//! [page index]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
use clap::Parser;
+use parquet::data_type::ByteArray;
use parquet::errors::{ParquetError, Result};
-use parquet::file::page_index::index::{Index, PageIndex};
+use parquet::file::page_index::column_index::{
+ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
+};
use parquet::file::page_index::offset_index::{OffsetIndexMetaData,
PageLocation};
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::file::serialized_reader::ReadOptionsBuilder;
@@ -96,16 +99,20 @@ impl Args {
let row_counts =
compute_row_counts(offset_index.page_locations.as_slice(),
row_group.num_rows());
match &column_indices[column_idx] {
- Index::NONE => println!("NO INDEX"),
- Index::BOOLEAN(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::INT32(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::INT64(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::INT96(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::FLOAT(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::DOUBLE(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::BYTE_ARRAY(v) => print_index(&v.indexes, offset_index,
&row_counts)?,
- Index::FIXED_LEN_BYTE_ARRAY(v) => {
- print_index(&v.indexes, offset_index, &row_counts)?
+ ColumnIndexMetaData::NONE => println!("NO INDEX"),
+ ColumnIndexMetaData::BOOLEAN(v) => {
+ print_index::<bool>(v, offset_index, &row_counts)?
+ }
+ ColumnIndexMetaData::INT32(v) => print_index(v, offset_index,
&row_counts)?,
+ ColumnIndexMetaData::INT64(v) => print_index(v, offset_index,
&row_counts)?,
+ ColumnIndexMetaData::INT96(v) => print_index(v, offset_index,
&row_counts)?,
+ ColumnIndexMetaData::FLOAT(v) => print_index(v, offset_index,
&row_counts)?,
+ ColumnIndexMetaData::DOUBLE(v) => print_index(v, offset_index,
&row_counts)?,
+ ColumnIndexMetaData::BYTE_ARRAY(v) => {
+ print_bytes_index(v, offset_index, &row_counts)?
+ }
+ ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(v) => {
+ print_bytes_index(v, offset_index, &row_counts)?
}
}
}
@@ -131,20 +138,21 @@ fn compute_row_counts(offset_index: &[PageLocation],
rows: i64) -> Vec<i64> {
/// Prints index information for a single column chunk
fn print_index<T: std::fmt::Display>(
- column_index: &[PageIndex<T>],
+ column_index: &PrimitiveColumnIndex<T>,
offset_index: &OffsetIndexMetaData,
row_counts: &[i64],
) -> Result<()> {
- if column_index.len() != offset_index.page_locations.len() {
+ if column_index.num_pages() as usize != offset_index.page_locations.len() {
return Err(ParquetError::General(format!(
"Index length mismatch, got {} and {}",
- column_index.len(),
+ column_index.num_pages(),
offset_index.page_locations.len()
)));
}
- for (idx, ((c, o), row_count)) in column_index
- .iter()
+ for (idx, (((min, max), o), row_count)) in column_index
+ .min_values_iter()
+ .zip(column_index.max_values_iter())
.zip(offset_index.page_locations())
.zip(row_counts)
.enumerate()
@@ -153,12 +161,12 @@ fn print_index<T: std::fmt::Display>(
"Page {:>5} at offset {:#010x} with length {:>10} and row count
{:>10}",
idx, o.offset, o.compressed_page_size, row_count
);
- match &c.min {
+ match min {
Some(m) => print!(", min {m:>10}"),
None => print!(", min {:>10}", "NONE"),
}
- match &c.max {
+ match max {
Some(m) => print!(", max {m:>10}"),
None => print!(", max {:>10}", "NONE"),
}
@@ -168,6 +176,51 @@ fn print_index<T: std::fmt::Display>(
Ok(())
}
+fn print_bytes_index(
+ column_index: &ByteArrayColumnIndex,
+ offset_index: &OffsetIndexMetaData,
+ row_counts: &[i64],
+) -> Result<()> {
+ if column_index.num_pages() as usize != offset_index.page_locations.len() {
+ return Err(ParquetError::General(format!(
+ "Index length mismatch, got {} and {}",
+ column_index.num_pages(),
+ offset_index.page_locations.len()
+ )));
+ }
+
+ for (idx, (((min, max), o), row_count)) in column_index
+ .min_values_iter()
+ .zip(column_index.max_values_iter())
+ .zip(offset_index.page_locations())
+ .zip(row_counts)
+ .enumerate()
+ {
+ print!(
+ "Page {:>5} at offset {:#010x} with length {:>10} and row count
{:>10}",
+ idx, o.offset, o.compressed_page_size, row_count
+ );
+ match min {
+ Some(m) => match String::from_utf8(m.to_vec()) {
+ Ok(s) => print!(", min {s:>10}"),
+ Err(_) => print!(", min {:>10}", ByteArray::from(m)),
+ },
+ None => print!(", min {:>10}", "NONE"),
+ }
+
+ match max {
+ Some(m) => match String::from_utf8(m.to_vec()) {
+ Ok(s) => print!(", max {s:>10}"),
+ Err(_) => print!(", min {:>10}", ByteArray::from(m)),
+ },
+ None => print!(", max {:>10}", "NONE"),
+ }
+ println!()
+ }
+
+ Ok(())
+}
+
fn main() -> Result<()> {
Args::parse().run()
}
diff --git a/parquet/src/file/metadata/memory.rs
b/parquet/src/file/metadata/memory.rs
index 0b8d3b336f..69eee3c299 100644
--- a/parquet/src/file/metadata/memory.rs
+++ b/parquet/src/file/metadata/memory.rs
@@ -24,6 +24,9 @@ use crate::file::metadata::{
ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData,
SortingColumn,
};
use crate::file::page_encoding_stats::PageEncodingStats;
+use crate::file::page_index::column_index::{
+ ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData,
PrimitiveColumnIndex,
+};
use crate::file::page_index::index::{Index, NativeIndex, PageIndex};
use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
use crate::file::statistics::{Statistics, ValueStatistics};
@@ -154,6 +157,48 @@ impl HeapSize for OffsetIndexMetaData {
}
}
+impl HeapSize for ColumnIndexMetaData {
+ fn heap_size(&self) -> usize {
+ match self {
+ Self::NONE => 0,
+ Self::BOOLEAN(native_index) => native_index.heap_size(),
+ Self::INT32(native_index) => native_index.heap_size(),
+ Self::INT64(native_index) => native_index.heap_size(),
+ Self::INT96(native_index) => native_index.heap_size(),
+ Self::FLOAT(native_index) => native_index.heap_size(),
+ Self::DOUBLE(native_index) => native_index.heap_size(),
+ Self::BYTE_ARRAY(native_index) => native_index.heap_size(),
+ Self::FIXED_LEN_BYTE_ARRAY(native_index) =>
native_index.heap_size(),
+ }
+ }
+}
+
+impl HeapSize for ColumnIndex {
+ fn heap_size(&self) -> usize {
+ self.null_pages.heap_size()
+ + self.boundary_order.heap_size()
+ + self.null_counts.heap_size()
+ + self.definition_level_histograms.heap_size()
+ + self.repetition_level_histograms.heap_size()
+ }
+}
+
+impl<T: ParquetValueType> HeapSize for PrimitiveColumnIndex<T> {
+ fn heap_size(&self) -> usize {
+ self.column_index.heap_size() + self.min_values.heap_size() +
self.max_values.heap_size()
+ }
+}
+
+impl HeapSize for ByteArrayColumnIndex {
+ fn heap_size(&self) -> usize {
+ self.column_index.heap_size()
+ + self.min_bytes.heap_size()
+ + self.min_offsets.heap_size()
+ + self.max_bytes.heap_size()
+ + self.max_offsets.heap_size()
+ }
+}
+
impl HeapSize for Index {
fn heap_size(&self) -> usize {
match self {
@@ -193,6 +238,11 @@ impl HeapSize for bool {
0 // no heap allocations
}
}
+impl HeapSize for u8 {
+ fn heap_size(&self) -> usize {
+ 0 // no heap allocations
+ }
+}
impl HeapSize for i32 {
fn heap_size(&self) -> usize {
0 // no heap allocations
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index f2fe9de77e..69cdf8f107 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -106,7 +106,7 @@ use crate::file::column_crypto_metadata::{self,
ColumnCryptoMetaData};
pub(crate) use crate::file::metadata::memory::HeapSize;
use crate::file::{
page_encoding_stats::{self, PageEncodingStats},
- page_index::offset_index::PageLocation,
+ page_index::{column_index::ColumnIndexMetaData,
offset_index::PageLocation},
};
use crate::file::{
page_index::index::PageIndex,
@@ -156,7 +156,7 @@ pub(crate) use writer::ThriftMetadataWriter;
///
/// [PageIndex documentation]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::format::ColumnIndex
-pub type ParquetColumnIndex = Vec<Vec<Index>>;
+pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
@@ -1948,7 +1948,7 @@ impl OffsetIndexBuilder {
mod tests {
use super::*;
use crate::basic::{PageType, SortOrder};
- use crate::file::page_index::index::NativeIndex;
+ use crate::file::page_index::column_index::{ColumnIndex,
PrimitiveColumnIndex};
#[test]
fn test_row_group_metadata_thrift_conversion() {
@@ -2223,7 +2223,17 @@ mod tests {
let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
let column_index = column_index.build_to_thrift();
- let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();
+ let native_index = PrimitiveColumnIndex::<bool> {
+ column_index: ColumnIndex {
+ null_pages: column_index.null_pages,
+ boundary_order:
column_index.boundary_order.try_into().unwrap(),
+ null_counts: column_index.null_counts,
+ repetition_level_histograms:
column_index.repetition_level_histograms,
+ definition_level_histograms:
column_index.definition_level_histograms,
+ },
+ min_values: vec![],
+ max_values: vec![],
+ };
// Now, add in OffsetIndex
let mut offset_index = OffsetIndexBuilder::new();
@@ -2237,16 +2247,16 @@ mod tests {
let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
.set_row_groups(row_group_meta)
- .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
+
.set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
.set_offset_index(Some(vec![vec![
OffsetIndexMetaData::try_new(offset_index).unwrap()
]]))
.build();
#[cfg(not(feature = "encryption"))]
- let bigger_expected_size = 2784;
+ let bigger_expected_size = 2704;
#[cfg(feature = "encryption")]
- let bigger_expected_size = 3120;
+ let bigger_expected_size = 3040;
// more set fields means more memory usage
assert!(bigger_expected_size > base_expected_size);
diff --git a/parquet/src/file/metadata/reader.rs
b/parquet/src/file/metadata/reader.rs
index a403f4eee8..57cc7c57ac 100644
--- a/parquet/src/file/metadata/reader.rs
+++ b/parquet/src/file/metadata/reader.rs
@@ -34,7 +34,7 @@ use bytes::Bytes;
use crate::errors::{ParquetError, Result};
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData,
RowGroupMetaData};
-use crate::file::page_index::index::Index;
+use crate::file::page_index::column_index::ColumnIndexMetaData;
use crate::file::page_index::index_reader::{acc_range, decode_column_index,
decode_offset_index};
use crate::file::reader::ChunkReader;
use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER};
@@ -566,7 +566,7 @@ impl ParquetMetaDataReader {
col_idx,
)
}
- None => Ok(Index::NONE),
+ None => Ok(ColumnIndexMetaData::NONE),
})
.collect::<Result<Vec<_>>>()
})
@@ -584,7 +584,7 @@ impl ParquetMetaDataReader {
column: &ColumnChunkMetaData,
row_group_index: usize,
col_index: usize,
- ) -> Result<Index> {
+ ) -> Result<ColumnIndexMetaData> {
match &column.column_crypto_metadata {
Some(crypto_metadata) => {
let file_decryptor =
metadata.file_decryptor.as_ref().ok_or_else(|| {
@@ -612,7 +612,7 @@ impl ParquetMetaDataReader {
column: &ColumnChunkMetaData,
_row_group_index: usize,
_col_index: usize,
- ) -> Result<Index> {
+ ) -> Result<ColumnIndexMetaData> {
decode_column_index(bytes, column.column_type())
}
diff --git a/parquet/src/file/metadata/writer.rs
b/parquet/src/file/metadata/writer.rs
index acae20ec3c..404bcf5dba 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -24,9 +24,7 @@ use crate::encryption::{
};
#[cfg(feature = "encryption")]
use crate::errors::ParquetError;
-use crate::errors::Result;
use crate::file::metadata::{KeyValue, ParquetMetaData};
-use crate::file::page_index::index::Index;
use crate::file::writer::{get_file_magic, TrackedWrite};
use crate::format::EncryptionAlgorithm;
#[cfg(feature = "encryption")]
@@ -34,6 +32,7 @@ use crate::format::{AesGcmV1, ColumnCryptoMetaData};
use crate::schema::types;
use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr};
use crate::thrift::TSerializable;
+use crate::{errors::Result,
file::page_index::column_index::ColumnIndexMetaData};
use std::io::Write;
use std::sync::Arc;
use thrift::protocol::TCompactOutputProtocol;
@@ -391,17 +390,31 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
column_indexes
.iter()
.map(|column_index| match column_index {
- Index::NONE => None,
- Index::BOOLEAN(column_index) =>
Some(column_index.to_thrift()),
- Index::BYTE_ARRAY(column_index) =>
Some(column_index.to_thrift()),
- Index::DOUBLE(column_index) =>
Some(column_index.to_thrift()),
- Index::FIXED_LEN_BYTE_ARRAY(column_index) => {
+ ColumnIndexMetaData::NONE => None,
+ ColumnIndexMetaData::BOOLEAN(column_index) => {
+ Some(column_index.to_thrift())
+ }
+ ColumnIndexMetaData::BYTE_ARRAY(column_index) => {
+ Some(column_index.to_thrift())
+ }
+ ColumnIndexMetaData::DOUBLE(column_index) => {
+ Some(column_index.to_thrift())
+ }
+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => {
+ Some(column_index.to_thrift())
+ }
+ ColumnIndexMetaData::FLOAT(column_index) => {
+ Some(column_index.to_thrift())
+ }
+ ColumnIndexMetaData::INT32(column_index) => {
+ Some(column_index.to_thrift())
+ }
+ ColumnIndexMetaData::INT64(column_index) => {
+ Some(column_index.to_thrift())
+ }
+ ColumnIndexMetaData::INT96(column_index) => {
Some(column_index.to_thrift())
}
- Index::FLOAT(column_index) =>
Some(column_index.to_thrift()),
- Index::INT32(column_index) =>
Some(column_index.to_thrift()),
- Index::INT64(column_index) =>
Some(column_index.to_thrift()),
- Index::INT96(column_index) =>
Some(column_index.to_thrift()),
})
.collect()
})
diff --git a/parquet/src/file/page_index/column_index.rs
b/parquet/src/file/page_index/column_index.rs
new file mode 100644
index 0000000000..2d43c93b2e
--- /dev/null
+++ b/parquet/src/file/page_index/column_index.rs
@@ -0,0 +1,569 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ColumnIndexMetaData`] structures holding decoded [`ColumnIndex`]
information
+//!
+//! [`ColumnIndex`]: crate::format::ColumnIndex
+//!
+
+use crate::{
+ data_type::{ByteArray, FixedLenByteArray},
+ errors::Result,
+};
+use std::ops::Deref;
+
+use crate::{
+ basic::BoundaryOrder,
+ data_type::{private::ParquetValueType, Int96},
+ file::page_index::index_reader::ThriftColumnIndex,
+};
+
+/// Common bits of the column index
+#[derive(Debug, Clone, PartialEq)]
+pub struct ColumnIndex {
+ pub(crate) null_pages: Vec<bool>,
+ pub(crate) boundary_order: BoundaryOrder,
+ pub(crate) null_counts: Option<Vec<i64>>,
+ pub(crate) repetition_level_histograms: Option<Vec<i64>>,
+ pub(crate) definition_level_histograms: Option<Vec<i64>>,
+}
+
+impl ColumnIndex {
+ /// Returns the number of pages
+ pub fn num_pages(&self) -> u64 {
+ self.null_pages.len() as u64
+ }
+
+ /// Returns the number of null values in the page indexed by `idx`
+ ///
+ /// Returns `None` if no null counts have been set in the index
+ pub fn null_count(&self, idx: usize) -> Option<i64> {
+ self.null_counts.as_ref().map(|nc| nc[idx])
+ }
+
+ /// Returns the repetition level histogram for the page indexed by `idx`
+ pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+ if let Some(rep_hists) = self.repetition_level_histograms.as_ref() {
+ let num_lvls = rep_hists.len() / self.num_pages() as usize;
+ let start = num_lvls * idx;
+ Some(&rep_hists[start..start + num_lvls])
+ } else {
+ None
+ }
+ }
+
+ /// Returns the definition level histogram for the page indexed by `idx`
+ pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+ if let Some(def_hists) = self.definition_level_histograms.as_ref() {
+ let num_lvls = def_hists.len() / self.num_pages() as usize;
+ let start = num_lvls * idx;
+ Some(&def_hists[start..start + num_lvls])
+ } else {
+ None
+ }
+ }
+
+ /// Returns whether the page indexed by `idx` consists of all null values
+ pub fn is_null_page(&self, idx: usize) -> bool {
+ self.null_pages[idx]
+ }
+}
+
+/// Column index for primitive types
+#[derive(Debug, Clone, PartialEq)]
+pub struct PrimitiveColumnIndex<T> {
+ pub(crate) column_index: ColumnIndex,
+ pub(crate) min_values: Vec<T>,
+ pub(crate) max_values: Vec<T>,
+}
+
+impl<T: ParquetValueType> PrimitiveColumnIndex<T> {
+ pub(super) fn try_new(index: ThriftColumnIndex) -> Result<Self> {
+ let len = index.null_pages.len();
+
+ let mut min_values = Vec::with_capacity(len);
+ let mut max_values = Vec::with_capacity(len);
+
+ for (i, is_null) in index.null_pages.iter().enumerate().take(len) {
+ if !is_null {
+ let min = index.min_values[i];
+ min_values.push(T::try_from_le_slice(min)?);
+
+ let max = index.max_values[i];
+ max_values.push(T::try_from_le_slice(max)?);
+ } else {
+ // need placeholders
+ min_values.push(Default::default());
+ max_values.push(Default::default());
+ }
+ }
+
+ Ok(Self {
+ column_index: ColumnIndex {
+ null_pages: index.null_pages,
+ boundary_order: index.boundary_order,
+ null_counts: index.null_counts,
+ repetition_level_histograms: index.repetition_level_histograms,
+ definition_level_histograms: index.definition_level_histograms,
+ },
+ min_values,
+ max_values,
+ })
+ }
+
+ pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex {
+ let min_values = self
+ .min_values
+ .iter()
+ .map(|x| x.as_bytes().to_vec())
+ .collect::<Vec<_>>();
+
+ let max_values = self
+ .max_values
+ .iter()
+ .map(|x| x.as_bytes().to_vec())
+ .collect::<Vec<_>>();
+
+ let null_counts = self.null_counts.clone();
+ let repetition_level_histograms =
self.repetition_level_histograms.clone();
+ let definition_level_histograms =
self.definition_level_histograms.clone();
+ let null_pages = self.null_pages.clone();
+
+ crate::format::ColumnIndex::new(
+ null_pages,
+ min_values,
+ max_values,
+ self.boundary_order.into(),
+ null_counts,
+ repetition_level_histograms,
+ definition_level_histograms,
+ )
+ }
+}
+
+impl<T> PrimitiveColumnIndex<T> {
+ /// Returns an array containing the min values for each page.
+ ///
+ /// Values in the returned slice are only valid if
[`ColumnIndex::is_null_page()`]
+ /// is `false` for the same index.
+ pub fn min_values(&self) -> &[T] {
+ &self.min_values
+ }
+
+ /// Returns an array containing the max values for each page.
+ ///
+ /// Values in the returned slice are only valid if
[`ColumnIndex::is_null_page()`]
+ /// is `false` for the same index.
+ pub fn max_values(&self) -> &[T] {
+ &self.max_values
+ }
+
+ /// Returns an iterator over the min values.
+ ///
+ /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
+ pub fn min_values_iter(&self) -> impl Iterator<Item = Option<&T>> {
+ self.min_values.iter().enumerate().map(|(i, min)| {
+ if self.is_null_page(i) {
+ None
+ } else {
+ Some(min)
+ }
+ })
+ }
+
+ /// Returns an iterator over the max values.
+ ///
+ /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
+ pub fn max_values_iter(&self) -> impl Iterator<Item = Option<&T>> {
+        self.max_values.iter().enumerate().map(|(i, max)| {
+            if self.is_null_page(i) {
+                None
+            } else {
+                Some(max)
+ }
+ })
+ }
+
+ /// Returns the min value for the page indexed by `idx`
+ ///
+ /// It is `None` when all values are null
+ pub fn min_value(&self, idx: usize) -> Option<&T> {
+ if self.null_pages[idx] {
+ None
+ } else {
+ Some(&self.min_values[idx])
+ }
+ }
+
+ /// Returns the max value for the page indexed by `idx`
+ ///
+ /// It is `None` when all values are null
+ pub fn max_value(&self, idx: usize) -> Option<&T> {
+ if self.null_pages[idx] {
+ None
+ } else {
+ Some(&self.max_values[idx])
+ }
+ }
+}
+
+impl<T> Deref for PrimitiveColumnIndex<T> {
+ type Target = ColumnIndex;
+
+ fn deref(&self) -> &Self::Target {
+ &self.column_index
+ }
+}
+
+/// Column index for byte arrays (fixed length and variable)
+#[derive(Debug, Clone, PartialEq)]
+pub struct ByteArrayColumnIndex {
+ pub(crate) column_index: ColumnIndex,
+ // raw bytes for min and max values
+ pub(crate) min_bytes: Vec<u8>,
+ pub(crate) min_offsets: Vec<usize>,
+ pub(crate) max_bytes: Vec<u8>,
+ pub(crate) max_offsets: Vec<usize>,
+}
+
+impl ByteArrayColumnIndex {
+ pub(super) fn try_new(index: ThriftColumnIndex) -> Result<Self> {
+ let len = index.null_pages.len();
+
+ let min_len = index.min_values.iter().map(|&v| v.len()).sum();
+ let max_len = index.max_values.iter().map(|&v| v.len()).sum();
+ let mut min_bytes = vec![0u8; min_len];
+ let mut max_bytes = vec![0u8; max_len];
+
+ let mut min_offsets = vec![0usize; len + 1];
+ let mut max_offsets = vec![0usize; len + 1];
+
+ let mut min_pos = 0;
+ let mut max_pos = 0;
+
+ for (i, is_null) in index.null_pages.iter().enumerate().take(len) {
+ if !is_null {
+ let min = index.min_values[i];
+ let dst = &mut min_bytes[min_pos..min_pos + min.len()];
+ dst.copy_from_slice(min);
+ min_offsets[i] = min_pos;
+ min_pos += min.len();
+
+ let max = index.max_values[i];
+ let dst = &mut max_bytes[max_pos..max_pos + max.len()];
+ dst.copy_from_slice(max);
+ max_offsets[i] = max_pos;
+ max_pos += max.len();
+ } else {
+ min_offsets[i] = min_pos;
+ max_offsets[i] = max_pos;
+ }
+ }
+
+ min_offsets[len] = min_pos;
+ max_offsets[len] = max_pos;
+
+ Ok(Self {
+ column_index: ColumnIndex {
+ null_pages: index.null_pages,
+ boundary_order: index.boundary_order,
+ null_counts: index.null_counts,
+ repetition_level_histograms: index.repetition_level_histograms,
+ definition_level_histograms: index.definition_level_histograms,
+ },
+
+ min_bytes,
+ min_offsets,
+ max_bytes,
+ max_offsets,
+ })
+ }
+
+ /// Returns the min value for the page indexed by `idx`
+ ///
+ /// It is `None` when all values are null
+ pub fn min_value(&self, idx: usize) -> Option<&[u8]> {
+ if self.null_pages[idx] {
+ None
+ } else {
+ let start = self.min_offsets[idx];
+ let end = self.min_offsets[idx + 1];
+ Some(&self.min_bytes[start..end])
+ }
+ }
+
+ /// Returns the max value for the page indexed by `idx`
+ ///
+ /// It is `None` when all values are null
+ pub fn max_value(&self, idx: usize) -> Option<&[u8]> {
+ if self.null_pages[idx] {
+ None
+ } else {
+ let start = self.max_offsets[idx];
+ let end = self.max_offsets[idx + 1];
+ Some(&self.max_bytes[start..end])
+ }
+ }
+
+ /// Returns an iterator over the min values.
+ ///
+ /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
+ pub fn min_values_iter(&self) -> impl Iterator<Item = Option<&[u8]>> {
+ (0..self.num_pages() as usize).map(|i| {
+ if self.is_null_page(i) {
+ None
+ } else {
+ self.min_value(i)
+ }
+ })
+ }
+
+ /// Returns an iterator over the max values.
+ ///
+ /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
+ pub fn max_values_iter(&self) -> impl Iterator<Item = Option<&[u8]>> {
+ (0..self.num_pages() as usize).map(|i| {
+ if self.is_null_page(i) {
+ None
+ } else {
+ self.max_value(i)
+ }
+ })
+ }
+
+ pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex {
+ let mut min_values = Vec::with_capacity(self.num_pages() as usize);
+ for i in 0..self.num_pages() as usize {
+ min_values.push(self.min_value(i).unwrap_or(&[]).to_owned());
+ }
+
+ let mut max_values = Vec::with_capacity(self.num_pages() as usize);
+ for i in 0..self.num_pages() as usize {
+ max_values.push(self.max_value(i).unwrap_or(&[]).to_owned());
+ }
+
+ let null_counts = self.null_counts.clone();
+ let repetition_level_histograms =
self.repetition_level_histograms.clone();
+ let definition_level_histograms =
self.definition_level_histograms.clone();
+ let null_pages = self.null_pages.clone();
+
+ crate::format::ColumnIndex::new(
+ null_pages,
+ min_values,
+ max_values,
+ self.boundary_order.into(),
+ null_counts,
+ repetition_level_histograms,
+ definition_level_histograms,
+ )
+ }
+}
+
+impl Deref for ByteArrayColumnIndex {
+ type Target = ColumnIndex;
+
+ fn deref(&self) -> &Self::Target {
+ &self.column_index
+ }
+}
+
+// Macro to generate getter functions for ColumnIndexMetaData.
+macro_rules! colidx_enum_func {
+ ($self:ident, $func:ident, $arg:ident) => {{
+ match *$self {
+ Self::BOOLEAN(ref typed) => typed.$func($arg),
+ Self::INT32(ref typed) => typed.$func($arg),
+ Self::INT64(ref typed) => typed.$func($arg),
+ Self::INT96(ref typed) => typed.$func($arg),
+ Self::FLOAT(ref typed) => typed.$func($arg),
+ Self::DOUBLE(ref typed) => typed.$func($arg),
+ Self::BYTE_ARRAY(ref typed) => typed.$func($arg),
+ Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg),
+ _ => panic!(concat!(
+ "Cannot call ",
+ stringify!($func),
+ " on ColumnIndexMetaData::NONE"
+ )),
+ }
+ }};
+ ($self:ident, $func:ident) => {{
+ match *$self {
+ Self::BOOLEAN(ref typed) => typed.$func(),
+ Self::INT32(ref typed) => typed.$func(),
+ Self::INT64(ref typed) => typed.$func(),
+ Self::INT96(ref typed) => typed.$func(),
+ Self::FLOAT(ref typed) => typed.$func(),
+ Self::DOUBLE(ref typed) => typed.$func(),
+ Self::BYTE_ARRAY(ref typed) => typed.$func(),
+ Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(),
+ _ => panic!(concat!(
+ "Cannot call ",
+ stringify!($func),
+ " on ColumnIndexMetaData::NONE"
+ )),
+ }
+ }};
+}
+
+/// Decoded column index information for a column chunk, with one variant per physical type
+#[derive(Debug, Clone, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum ColumnIndexMetaData {
+    /// Sometimes reading the page index from a parquet file
+    /// only returns page locations without a min/max index;
+    /// `NONE` represents this lack of index information
+ NONE,
+ /// Boolean type index
+ BOOLEAN(PrimitiveColumnIndex<bool>),
+ /// 32-bit integer type index
+ INT32(PrimitiveColumnIndex<i32>),
+ /// 64-bit integer type index
+ INT64(PrimitiveColumnIndex<i64>),
+ /// 96-bit integer type (timestamp) index
+ INT96(PrimitiveColumnIndex<Int96>),
+ /// 32-bit floating point type index
+ FLOAT(PrimitiveColumnIndex<f32>),
+ /// 64-bit floating point type index
+ DOUBLE(PrimitiveColumnIndex<f64>),
+ /// Byte array type index
+ BYTE_ARRAY(ByteArrayColumnIndex),
+ /// Fixed length byte array type index
+ FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex),
+}
+
+impl ColumnIndexMetaData {
+    /// Returns whether the min/max elements inside the ColumnIndex are ordered.
+    pub fn is_sorted(&self) -> bool {
+        // 0: UNORDERED, 1: ASCENDING, 2: DESCENDING
+ if let Some(order) = self.get_boundary_order() {
+ order != BoundaryOrder::UNORDERED
+ } else {
+ false
+ }
+ }
+
+ /// Get boundary_order of this page index.
+ pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
+ match self {
+ Self::NONE => None,
+ Self::BOOLEAN(index) => Some(index.boundary_order),
+ Self::INT32(index) => Some(index.boundary_order),
+ Self::INT64(index) => Some(index.boundary_order),
+ Self::INT96(index) => Some(index.boundary_order),
+ Self::FLOAT(index) => Some(index.boundary_order),
+ Self::DOUBLE(index) => Some(index.boundary_order),
+ Self::BYTE_ARRAY(index) => Some(index.boundary_order),
+ Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
+ }
+ }
+
+    /// Returns an array of null counts, one per page.
+    ///
+    /// Returns `None` if no null counts have been set in the index
+ pub fn null_counts(&self) -> Option<&Vec<i64>> {
+ match self {
+ Self::NONE => None,
+ Self::BOOLEAN(index) => index.null_counts.as_ref(),
+ Self::INT32(index) => index.null_counts.as_ref(),
+ Self::INT64(index) => index.null_counts.as_ref(),
+ Self::INT96(index) => index.null_counts.as_ref(),
+ Self::FLOAT(index) => index.null_counts.as_ref(),
+ Self::DOUBLE(index) => index.null_counts.as_ref(),
+ Self::BYTE_ARRAY(index) => index.null_counts.as_ref(),
+ Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(),
+ }
+ }
+
+ /// Returns the number of pages
+ pub fn num_pages(&self) -> u64 {
+ colidx_enum_func!(self, num_pages)
+ }
+
+ /// Returns the number of null values in the page indexed by `idx`
+ ///
+ /// Returns `None` if no null counts have been set in the index
+ pub fn null_count(&self, idx: usize) -> Option<i64> {
+ colidx_enum_func!(self, null_count, idx)
+ }
+
+ /// Returns the repetition level histogram for the page indexed by `idx`
+ pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+ colidx_enum_func!(self, repetition_level_histogram, idx)
+ }
+
+ /// Returns the definition level histogram for the page indexed by `idx`
+ pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+ colidx_enum_func!(self, definition_level_histogram, idx)
+ }
+
+ /// Returns whether the page indexed by `idx` consists of all null values
+ pub fn is_null_page(&self, idx: usize) -> bool {
+ colidx_enum_func!(self, is_null_page, idx)
+ }
+}
+
+/// Provides iterators over min and max values of a [`ColumnIndexMetaData`]
+pub trait ColumnIndexIterators {
+ /// Can be one of `bool`, `i32`, `i64`, `Int96`, `f32`, `f64`,
[`ByteArray`],
+ /// or [`FixedLenByteArray`]
+ type Item;
+
+ /// Return iterator over the min values for the index
+ fn min_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator<Item =
Option<Self::Item>>;
+
+ /// Return iterator over the max values for the index
+ fn max_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator<Item =
Option<Self::Item>>;
+}
+
+macro_rules! column_index_iters {
+ ($item: ident, $variant: ident, $conv:expr) => {
+ impl ColumnIndexIterators for $item {
+ type Item = $item;
+
+ fn min_values_iter(
+ colidx: &ColumnIndexMetaData,
+ ) -> impl Iterator<Item = Option<Self::Item>> {
+ if let ColumnIndexMetaData::$variant(index) = colidx {
+ index.min_values_iter().map($conv)
+ } else {
+ panic!(concat!("Wrong type for ", stringify!($item), "
iterator"))
+ }
+ }
+
+ fn max_values_iter(
+ colidx: &ColumnIndexMetaData,
+ ) -> impl Iterator<Item = Option<Self::Item>> {
+ if let ColumnIndexMetaData::$variant(index) = colidx {
+ index.max_values_iter().map($conv)
+ } else {
+ panic!(concat!("Wrong type for ", stringify!($item), "
iterator"))
+ }
+ }
+ }
+ };
+}
+
+column_index_iters!(bool, BOOLEAN, |v| v.copied());
+column_index_iters!(i32, INT32, |v| v.copied());
+column_index_iters!(i64, INT64, |v| v.copied());
+column_index_iters!(Int96, INT96, |v| v.copied());
+column_index_iters!(f32, FLOAT, |v| v.copied());
+column_index_iters!(f64, DOUBLE, |v| v.copied());
+column_index_iters!(ByteArray, BYTE_ARRAY, |v| v
+ .map(|v| ByteArray::from(v.to_owned())));
+column_index_iters!(FixedLenByteArray, FIXED_LEN_BYTE_ARRAY, |v| v
+ .map(|v| FixedLenByteArray::from(v.to_owned())));
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index ed586bcd33..861dc0c3b0 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -24,7 +24,7 @@ use crate::data_type::private::ParquetValueType;
use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96};
use crate::errors::ParquetError;
use crate::file::metadata::LevelHistogram;
-use crate::file::page_index::index_reader::ColumnIndex;
+use crate::file::page_index::index_reader::ThriftColumnIndex;
use std::fmt::Debug;
/// Typed statistics for one data page
@@ -310,7 +310,8 @@ impl<T: ParquetValueType> NativeIndex<T> {
}
/// Creates a new [`NativeIndex`]
- pub(crate) fn try_new_local(index: ColumnIndex) -> Result<Self,
ParquetError> {
+ #[allow(dead_code)]
+ pub(super) fn try_new_local(index: ThriftColumnIndex) -> Result<Self,
ParquetError> {
let len = index.min_values.len();
// turn Option<Vec<i64>> into Vec<Option<i64>>
diff --git a/parquet/src/file/page_index/index_reader.rs
b/parquet/src/file/page_index/index_reader.rs
index fbe6d39845..f35241689e 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -15,13 +15,15 @@
// specific language governing permissions and limitations
// under the License.
-//! Support for reading [`Index`] and [`OffsetIndexMetaData`] from parquet
metadata.
+//! Support for reading [`ColumnIndexMetaData`] and [`OffsetIndexMetaData`]
from parquet metadata.
use crate::basic::{BoundaryOrder, Type};
use crate::data_type::Int96;
use crate::errors::{ParquetError, Result};
use crate::file::metadata::ColumnChunkMetaData;
-use crate::file::page_index::index::{Index, NativeIndex};
+use crate::file::page_index::column_index::{
+ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
+};
use crate::file::page_index::offset_index::OffsetIndexMetaData;
use crate::file::reader::ChunkReader;
use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
@@ -38,7 +40,7 @@ pub(crate) fn acc_range(a: Option<Range<u64>>, b:
Option<Range<u64>>) -> Option<
}
}
-/// Reads per-column [`Index`] for all columns of a row group by
+/// Reads per-column [`ColumnIndexMetaData`] for all columns of a row group by
/// decoding [`ColumnIndex`] .
///
/// Returns a vector of `index[column_number]`.
@@ -56,7 +58,7 @@ pub(crate) fn acc_range(a: Option<Range<u64>>, b:
Option<Range<u64>>) -> Option<
pub fn read_columns_indexes<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
-) -> Result<Option<Vec<Index>>, ParquetError> {
+) -> Result<Option<Vec<ColumnIndexMetaData>>, ParquetError> {
let fetch = chunks
.iter()
.fold(None, |range, c| acc_range(range, c.column_index_range()));
@@ -77,7 +79,7 @@ pub fn read_columns_indexes<R: ChunkReader>(
..usize::try_from(r.end - fetch.start)?],
c.column_type(),
),
- None => Ok(Index::NONE),
+ None => Ok(ColumnIndexMetaData::NONE),
})
.collect(),
)
@@ -134,8 +136,9 @@ pub(crate) fn decode_offset_index(data: &[u8]) ->
Result<OffsetIndexMetaData, Pa
OffsetIndexMetaData::try_from(&mut prot)
}
+// private struct only used for decoding then discarded
thrift_struct!(
-pub(crate) struct ColumnIndex<'a> {
+pub(super) struct ThriftColumnIndex<'a> {
1: required list<bool> null_pages
2: required list<'a><binary> min_values
3: required list<'a><binary> max_values
@@ -146,20 +149,25 @@ pub(crate) struct ColumnIndex<'a> {
}
);
-pub(crate) fn decode_column_index(data: &[u8], column_type: Type) ->
Result<Index, ParquetError> {
+pub(crate) fn decode_column_index(
+ data: &[u8],
+ column_type: Type,
+) -> Result<ColumnIndexMetaData, ParquetError> {
let mut prot = ThriftCompactInputProtocol::new(data);
- let index = ColumnIndex::try_from(&mut prot)?;
+ let index = ThriftColumnIndex::try_from(&mut prot)?;
let index = match column_type {
- Type::BOOLEAN =>
Index::BOOLEAN(NativeIndex::<bool>::try_new_local(index)?),
- Type::INT32 => Index::INT32(NativeIndex::<i32>::try_new_local(index)?),
- Type::INT64 => Index::INT64(NativeIndex::<i64>::try_new_local(index)?),
- Type::INT96 =>
Index::INT96(NativeIndex::<Int96>::try_new_local(index)?),
- Type::FLOAT => Index::FLOAT(NativeIndex::<f32>::try_new_local(index)?),
- Type::DOUBLE =>
Index::DOUBLE(NativeIndex::<f64>::try_new_local(index)?),
- Type::BYTE_ARRAY =>
Index::BYTE_ARRAY(NativeIndex::try_new_local(index)?),
+ Type::BOOLEAN => {
+
ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::<bool>::try_new(index)?)
+ }
+ Type::INT32 =>
ColumnIndexMetaData::INT32(PrimitiveColumnIndex::<i32>::try_new(index)?),
+ Type::INT64 =>
ColumnIndexMetaData::INT64(PrimitiveColumnIndex::<i64>::try_new(index)?),
+ Type::INT96 =>
ColumnIndexMetaData::INT96(PrimitiveColumnIndex::<Int96>::try_new(index)?),
+ Type::FLOAT =>
ColumnIndexMetaData::FLOAT(PrimitiveColumnIndex::<f32>::try_new(index)?),
+ Type::DOUBLE =>
ColumnIndexMetaData::DOUBLE(PrimitiveColumnIndex::<f64>::try_new(index)?),
+ Type::BYTE_ARRAY =>
ColumnIndexMetaData::BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?),
Type::FIXED_LEN_BYTE_ARRAY => {
- Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new_local(index)?)
+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?)
}
};
diff --git a/parquet/src/file/page_index/mod.rs
b/parquet/src/file/page_index/mod.rs
index a8077896db..ff70e2eca5 100644
--- a/parquet/src/file/page_index/mod.rs
+++ b/parquet/src/file/page_index/mod.rs
@@ -19,6 +19,7 @@
//!
//! [Column Index]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+pub mod column_index;
pub mod index;
pub mod index_reader;
pub mod offset_index;
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index bead048ee2..5308825b09 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1102,13 +1102,15 @@ mod tests {
use bytes::Buf;
+ use crate::file::page_index::column_index::{
+ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
+ };
use crate::file::properties::{EnabledStatistics, WriterProperties};
use crate::basic::{self, BoundaryOrder, ColumnOrder, SortOrder};
use crate::column::reader::ColumnReader;
use crate::data_type::private::ParquetValueType;
use crate::data_type::{AsBytes, FixedLenByteArrayType, Int32Type};
- use crate::file::page_index::index::{Index, NativeIndex};
#[allow(deprecated)]
use crate::file::page_index::index_reader::{read_columns_indexes,
read_offset_indexes};
use crate::file::writer::SerializedFileWriter;
@@ -1912,21 +1914,19 @@ mod tests {
// only one row group
assert_eq!(column_index.len(), 1);
- let index = if let Index::BYTE_ARRAY(index) = &column_index[0][0] {
+ let index = if let ColumnIndexMetaData::BYTE_ARRAY(index) =
&column_index[0][0] {
index
} else {
unreachable!()
};
assert_eq!(index.boundary_order, BoundaryOrder::ASCENDING);
- let index_in_pages = &index.indexes;
//only one page group
- assert_eq!(index_in_pages.len(), 1);
+ assert_eq!(index.num_pages(), 1);
- let page0 = &index_in_pages[0];
- let min = page0.min.as_ref().unwrap();
- let max = page0.max.as_ref().unwrap();
+ let min = index.min_value(0).unwrap();
+ let max = index.max_value(0).unwrap();
assert_eq!(b"Hello", min.as_bytes());
assert_eq!(b"today", max.as_bytes());
@@ -1991,7 +1991,7 @@ mod tests {
let boundary_order = &column_index[0][0].get_boundary_order();
assert!(boundary_order.is_some());
matches!(boundary_order.unwrap(), BoundaryOrder::UNORDERED);
- if let Index::INT32(index) = &column_index[0][0] {
+ if let ColumnIndexMetaData::INT32(index) = &column_index[0][0] {
check_native_page_index(
index,
325,
@@ -2004,15 +2004,15 @@ mod tests {
};
//col1->bool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0]
assert!(&column_index[0][1].is_sorted());
- if let Index::BOOLEAN(index) = &column_index[0][1] {
- assert_eq!(index.indexes.len(), 82);
+ if let ColumnIndexMetaData::BOOLEAN(index) = &column_index[0][1] {
+ assert_eq!(index.num_pages(), 82);
assert_eq!(row_group_offset_indexes[1].page_locations.len(), 82);
} else {
unreachable!()
};
//col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
assert!(&column_index[0][2].is_sorted());
- if let Index::INT32(index) = &column_index[0][2] {
+ if let ColumnIndexMetaData::INT32(index) = &column_index[0][2] {
check_native_page_index(
index,
325,
@@ -2025,7 +2025,7 @@ mod tests {
};
//col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
assert!(&column_index[0][3].is_sorted());
- if let Index::INT32(index) = &column_index[0][3] {
+ if let ColumnIndexMetaData::INT32(index) = &column_index[0][3] {
check_native_page_index(
index,
325,
@@ -2038,7 +2038,7 @@ mod tests {
};
//col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
assert!(&column_index[0][4].is_sorted());
- if let Index::INT32(index) = &column_index[0][4] {
+ if let ColumnIndexMetaData::INT32(index) = &column_index[0][4] {
check_native_page_index(
index,
325,
@@ -2051,7 +2051,7 @@ mod tests {
};
//col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326
SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90,
num_nulls: 0]
assert!(!&column_index[0][5].is_sorted());
- if let Index::INT64(index) = &column_index[0][5] {
+ if let ColumnIndexMetaData::INT64(index) = &column_index[0][5] {
check_native_page_index(
index,
528,
@@ -2064,7 +2064,7 @@ mod tests {
};
//col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9, num_nulls: 0]
assert!(&column_index[0][6].is_sorted());
- if let Index::FLOAT(index) = &column_index[0][6] {
+ if let ColumnIndexMetaData::FLOAT(index) = &column_index[0][6] {
check_native_page_index(
index,
325,
@@ -2077,7 +2077,7 @@ mod tests {
};
//col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 90.89999999999999, num_nulls: 0]
assert!(!&column_index[0][7].is_sorted());
- if let Index::DOUBLE(index) = &column_index[0][7] {
+ if let ColumnIndexMetaData::DOUBLE(index) = &column_index[0][7] {
check_native_page_index(
index,
528,
@@ -2090,8 +2090,8 @@ mod tests {
};
//col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max: 12/31/10, num_nulls: 0]
assert!(!&column_index[0][8].is_sorted());
- if let Index::BYTE_ARRAY(index) = &column_index[0][8] {
- check_native_page_index(
+ if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][8] {
+ check_byte_array_page_index(
index,
974,
get_row_group_min_max_bytes(row_group_metadata, 8),
@@ -2103,8 +2103,8 @@ mod tests {
};
//col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795 SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0]
assert!(&column_index[0][9].is_sorted());
- if let Index::BYTE_ARRAY(index) = &column_index[0][9] {
- check_native_page_index(
+ if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][9] {
+ check_byte_array_page_index(
index,
352,
get_row_group_min_max_bytes(row_group_metadata, 9),
@@ -2117,14 +2117,14 @@ mod tests {
//col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0, min/max not defined]
//Notice: min_max values for each page for this col not exits.
assert!(!&column_index[0][10].is_sorted());
- if let Index::NONE = &column_index[0][10] {
+ if let ColumnIndexMetaData::NONE = &column_index[0][10] {
assert_eq!(row_group_offset_indexes[10].page_locations.len(), 974);
} else {
unreachable!()
};
//col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0]
assert!(&column_index[0][11].is_sorted());
- if let Index::INT32(index) = &column_index[0][11] {
+ if let ColumnIndexMetaData::INT32(index) = &column_index[0][11] {
check_native_page_index(
index,
325,
@@ -2137,7 +2137,7 @@ mod tests {
};
//col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0]
assert!(!&column_index[0][12].is_sorted());
- if let Index::INT32(index) = &column_index[0][12] {
+ if let ColumnIndexMetaData::INT32(index) = &column_index[0][12] {
check_native_page_index(
index,
325,
@@ -2151,17 +2151,31 @@ mod tests {
}
fn check_native_page_index<T: ParquetValueType>(
- row_group_index: &NativeIndex<T>,
+ row_group_index: &PrimitiveColumnIndex<T>,
page_size: usize,
min_max: (&[u8], &[u8]),
boundary_order: BoundaryOrder,
) {
- assert_eq!(row_group_index.indexes.len(), page_size);
+ assert_eq!(row_group_index.num_pages() as usize, page_size);
assert_eq!(row_group_index.boundary_order, boundary_order);
- row_group_index.indexes.iter().all(|x| {
- x.min.as_ref().unwrap() >= &T::try_from_le_slice(min_max.0).unwrap()
- && x.max.as_ref().unwrap() <= &T::try_from_le_slice(min_max.1).unwrap()
- });
+ assert!(row_group_index.min_values().iter().all(|x| {
+ x >= &T::try_from_le_slice(min_max.0).unwrap()
+ && x <= &T::try_from_le_slice(min_max.1).unwrap()
+ }));
+ }
+
+ fn check_byte_array_page_index(
+ row_group_index: &ByteArrayColumnIndex,
+ page_size: usize,
+ min_max: (&[u8], &[u8]),
+ boundary_order: BoundaryOrder,
+ ) {
+ assert_eq!(row_group_index.num_pages() as usize, page_size);
+ assert_eq!(row_group_index.boundary_order, boundary_order);
+ for i in 0..row_group_index.num_pages() as usize {
+ let x = row_group_index.min_value(i).unwrap();
+ assert!(x >= min_max.0 && x <= min_max.1);
+ }
}
fn get_row_group_min_max_bytes(r: &RowGroupMetaData, col_num: usize) -> (&[u8], &[u8]) {
@@ -2402,12 +2416,11 @@ mod tests {
assert_eq!(c.len(), 1);
match &c[0] {
- Index::FIXED_LEN_BYTE_ARRAY(v) => {
- assert_eq!(v.indexes.len(), 1);
- let page_idx = &v.indexes[0];
- assert_eq!(page_idx.null_count.unwrap(), 1);
- assert_eq!(page_idx.min.as_ref().unwrap().as_ref(), &[0; 11]);
- assert_eq!(page_idx.max.as_ref().unwrap().as_ref(), &[5; 11]);
+ ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(v) => {
+ assert_eq!(v.num_pages(), 1);
+ assert_eq!(v.null_count(0).unwrap(), 1);
+ assert_eq!(v.min_value(0).unwrap(), &[0; 11]);
+ assert_eq!(v.max_value(0).unwrap(), &[5; 11]);
}
_ => unreachable!(),
}
@@ -2538,11 +2551,11 @@ mod tests {
// test that we got the index matching the row group
match pg_idx {
- Index::INT32(int_idx) => {
+ ColumnIndexMetaData::INT32(int_idx) => {
let min = col_stats.min_bytes_opt().unwrap().get_i32_le();
let max = col_stats.max_bytes_opt().unwrap().get_i32_le();
- assert_eq!(int_idx.indexes[0].min(), Some(min).as_ref());
- assert_eq!(int_idx.indexes[0].max(), Some(max).as_ref());
+ assert_eq!(int_idx.min_value(0), Some(min).as_ref());
+ assert_eq!(int_idx.max_value(0), Some(max).as_ref());
}
_ => panic!("wrong stats type"),
}
@@ -2583,11 +2596,11 @@ mod tests {
// test that we got the index matching the row group
match pg_idx {
- Index::INT32(int_idx) => {
+ ColumnIndexMetaData::INT32(int_idx) => {
let min = col_stats.min_bytes_opt().unwrap().get_i32_le();
let max = col_stats.max_bytes_opt().unwrap().get_i32_le();
- assert_eq!(int_idx.indexes[0].min(), Some(min).as_ref());
- assert_eq!(int_idx.indexes[0].max(), Some(max).as_ref());
+ assert_eq!(int_idx.min_value(0), Some(min).as_ref());
+ assert_eq!(int_idx.max_value(0), Some(max).as_ref());
}
_ => panic!("wrong stats type"),
}
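
For orientation, here is a minimal sketch of consuming the per-page accessors these tests now exercise (`num_pages`, `min_value`, `max_value`); it assumes only what is visible in this PoC diff, and the names may still change:

```rust
// Sketch only: based on the accessors exercised in the tests above;
// this PoC API is experimental and may change.
use parquet::file::page_index::column_index::ColumnIndexMetaData;

/// Print the per-page min/max of an INT32 column index, where present.
fn print_int32_page_bounds(index: &ColumnIndexMetaData) {
    if let ColumnIndexMetaData::INT32(idx) = index {
        for page in 0..idx.num_pages() as usize {
            // min_value/max_value return None for pages without statistics
            if let (Some(min), Some(max)) = (idx.min_value(page), idx.max_value(page)) {
                println!("page {page}: min={min} max={max}");
            }
        }
    }
}
```
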
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 7db517ced5..65b96246ea 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -1062,7 +1062,7 @@ mod tests {
use crate::column::reader::get_typed_column_reader;
use crate::compression::{create_codec, Codec, CodecOptionsBuilder};
use crate::data_type::{BoolType, ByteArrayType, Int32Type};
- use crate::file::page_index::index::Index;
+ use crate::file::page_index::column_index::ColumnIndexMetaData;
use crate::file::properties::EnabledStatistics;
use crate::file::serialized_reader::ReadOptionsBuilder;
use crate::file::{
@@ -2083,9 +2083,9 @@ mod tests {
assert_eq!(column_index[0].len(), 2); // 2 column
let a_idx = &column_index[0][0];
- assert!(matches!(a_idx, Index::INT32(_)), "{a_idx:?}");
+ assert!(matches!(a_idx, ColumnIndexMetaData::INT32(_)), "{a_idx:?}");
let b_idx = &column_index[0][1];
- assert!(matches!(b_idx, Index::NONE), "{b_idx:?}");
+ assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}");
}
#[test]
@@ -2169,16 +2169,16 @@ mod tests {
let column_index = reader.metadata().column_index().unwrap();
assert_eq!(column_index.len(), 1);
assert_eq!(column_index[0].len(), 1);
- let col_idx = if let Index::BYTE_ARRAY(index) = &column_index[0][0] {
- assert_eq!(index.indexes.len(), 1);
- &index.indexes[0]
+ let col_idx = if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][0] {
+ assert_eq!(index.num_pages(), 1);
+ index
} else {
unreachable!()
};
- assert!(col_idx.repetition_level_histogram().is_none());
- assert!(col_idx.definition_level_histogram().is_some());
- check_def_hist(col_idx.definition_level_histogram().unwrap().values());
+ assert!(col_idx.repetition_level_histogram(0).is_none());
+ assert!(col_idx.definition_level_histogram(0).is_some());
+ check_def_hist(col_idx.definition_level_histogram(0).unwrap());
assert!(reader.metadata().offset_index().is_some());
let offset_index = reader.metadata().offset_index().unwrap();
@@ -2324,15 +2324,15 @@ mod tests {
let column_index = reader.metadata().column_index().unwrap();
assert_eq!(column_index.len(), 1);
assert_eq!(column_index[0].len(), 1);
- let col_idx = if let Index::INT32(index) = &column_index[0][0] {
- assert_eq!(index.indexes.len(), 1);
- &index.indexes[0]
+ let col_idx = if let ColumnIndexMetaData::INT32(index) = &column_index[0][0] {
+ assert_eq!(index.num_pages(), 1);
+ index
} else {
unreachable!()
};
- check_def_hist(col_idx.definition_level_histogram().unwrap().values());
- check_rep_hist(col_idx.repetition_level_histogram().unwrap().values());
+ check_def_hist(col_idx.definition_level_histogram(0).unwrap());
+ check_rep_hist(col_idx.repetition_level_histogram(0).unwrap());
assert!(reader.metadata().offset_index().is_some());
let offset_index = reader.metadata().offset_index().unwrap();
diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs
index b31f295755..9cafcd714e 100644
--- a/parquet/tests/arrow_reader/io/mod.rs
+++ b/parquet/tests/arrow_reader/io/mod.rs
@@ -49,7 +49,6 @@ use parquet::data_type::AsBytes;
use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetOffsetIndex};
use parquet::file::properties::WriterProperties;
use parquet::file::FOOTER_SIZE;
-use parquet::format::PageLocation;
use parquet::schema::types::SchemaDescriptor;
use std::collections::BTreeMap;
use std::fmt::Display;
@@ -257,7 +256,7 @@ struct TestColumnChunk {
dictionary_page_location: Option<i64>,
/// The location of the data pages in the file
- page_locations: Vec<PageLocation>,
+ page_locations: Vec<parquet::format::PageLocation>,
}
/// Information about the pages in a single row group
@@ -287,8 +286,11 @@ impl TestRowGroups {
.enumerate()
.map(|(col_idx, col_meta)| {
let column_name =
col_meta.column_descr().name().to_string();
- let page_locations =
- offset_index[rg_index][col_idx].page_locations().to_vec();
+ let page_locations = offset_index[rg_index][col_idx]
+ .page_locations()
+ .iter()
+ .map(parquet::format::PageLocation::from)
+ .collect();
let dictionary_page_location =
col_meta.dictionary_page_offset();
// We can find the byte range of the entire column chunk
diff --git a/parquet/tests/encryption/encryption_util.rs b/parquet/tests/encryption/encryption_util.rs
index bf7fd08109..6817491b30 100644
--- a/parquet/tests/encryption/encryption_util.rs
+++ b/parquet/tests/encryption/encryption_util.rs
@@ -191,11 +191,11 @@ pub(crate) fn verify_column_indexes(metadata: &ParquetMetaData) {
let column_index = &column_index[0][float_col_idx];
match column_index {
- parquet::file::page_index::index::Index::FLOAT(float_index) => {
- assert_eq!(float_index.indexes.len(), 1);
- assert_eq!(float_index.indexes[0].min, Some(0.0f32));
- assert!(float_index.indexes[0]
- .max
+ parquet::file::page_index::column_index::ColumnIndexMetaData::FLOAT(float_index) => {
+ assert_eq!(float_index.num_pages(), 1);
+ assert_eq!(float_index.min_value(0), Some(&0.0f32));
+ assert!(float_index
+ .max_value(0)
.is_some_and(|max| (max - 53.9).abs() < 1e-6));
}
_ => {