This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7189472881 GH-46905: [C++][Parquet] Expose
Statistics.is_{min/max}_value_exact and default set to true if min/max are set
(#46992)
7189472881 is described below
commit 71894728810fa2c191a95315574fa5ea4546566f
Author: Raúl Cumplido <[email protected]>
AuthorDate: Wed Aug 27 16:14:31 2025 +0200
GH-46905: [C++][Parquet] Expose Statistics.is_{min/max}_value_exact and
default set to true if min/max are set (#46992)
### Rationale for this change
The `is_{min/max}_value_exact` fields exist in the Thrift definition, and
some implementations already use them and truncate min and max values.
This PR exposes those values and defaults them to true when writing files
from C++, since no truncation happens at the moment: if min/max statistics are
generated, `is_{min/max}_value_exact` can be set to true.
Truncation of string and binary min/max values is out of scope for this PR;
it can be done in a follow-up PR.
### What changes are included in this PR?
- The fields have been added to EncodedStatistics and Statistics along with
the Thrift integration.
- Tests and validation with a newly generated parquet-testing file in which
these fields are present (https://github.com/apache/parquet-testing/pull/88)
- Tests with existing files without the fields.
- Update existing tests to validate the new fields.
- Add new fields to `ParquetFilePrinter`
### Are these changes tested?
Yes, on CI.
### Are there any user-facing changes?
Yes, the new fields will be available to users through the API when reading
Parquet files.
* GitHub Issue: #46905
Authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/metadata.cc | 13 +++-
cpp/src/parquet/printer.cc | 28 ++++++-
cpp/src/parquet/reader_test.cc | 33 +++++++-
cpp/src/parquet/statistics.cc | 48 +++++++++++-
cpp/src/parquet/statistics.h | 53 ++++++++++++-
cpp/src/parquet/statistics_test.cc | 153 ++++++++++++++++++++++++++++++++++---
cpp/src/parquet/thrift_internal.h | 12 +++
cpp/submodules/parquet-testing | 2 +-
8 files changed, 322 insertions(+), 20 deletions(-)
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 30d69f4db5..c606561ca9 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -92,6 +92,14 @@ std::string ParquetVersionToString(ParquetVersion::type ver)
{
template <typename DType>
static std::shared_ptr<Statistics> MakeTypedColumnStats(
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
+ std::optional<bool> min_exact =
+ metadata.statistics.__isset.is_min_value_exact
+ ? std::optional<bool>(metadata.statistics.is_min_value_exact)
+ : std::nullopt;
+ std::optional<bool> max_exact =
+ metadata.statistics.__isset.is_max_value_exact
+ ? std::optional<bool>(metadata.statistics.is_max_value_exact)
+ : std::nullopt;
// If ColumnOrder is defined, return max_value and min_value
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
return MakeStatistics<DType>(
@@ -100,7 +108,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max_value &&
metadata.statistics.__isset.min_value,
metadata.statistics.__isset.null_count,
- metadata.statistics.__isset.distinct_count);
+ metadata.statistics.__isset.distinct_count, min_exact, max_exact);
}
// Default behavior
return MakeStatistics<DType>(
@@ -108,7 +116,8 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
- metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count);
+ metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count,
+ min_exact, max_exact);
}
namespace {
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index dfd1d85809..dfce57a00f 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -166,11 +166,19 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream,
std::list<int> selecte
stream << " Values: " << column_chunk->num_values();
if (column_chunk->is_stats_set()) {
std::string min = stats->min(), max = stats->max();
+ std::string max_exact =
+ stats->is_max_value_exact.has_value()
+ ? (stats->is_max_value_exact.value() ? "true" : "false")
+ : "unknown";
+ std::string min_exact =
+ stats->is_min_value_exact.has_value()
+ ? (stats->is_min_value_exact.value() ? "true" : "false")
+ : "unknown";
stream << ", Null Values: " << stats->null_count
<< ", Distinct Values: " << stats->distinct_count << std::endl
- << " Max: "
+ << " Max (exact: " << max_exact << "): "
<< FormatStatValue(descr->physical_type(), max,
descr->logical_type())
- << ", Min: "
+ << ", Min (exact: " << min_exact << "): "
<< FormatStatValue(descr->physical_type(), min,
descr->logical_type());
} else {
stream << " Statistics Not Set";
@@ -342,6 +350,22 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
<< R"("Min": ")"
<< FormatStatValue(descr->physical_type(), min,
descr->logical_type())
<< "\"";
+ if (stats->is_max_value_exact().has_value()) {
+ stream << ", "
+ << R"("IsMaxValueExact": ")"
+ << (stats->is_max_value_exact().value() ? "True" : "False")
<< "\"";
+ } else {
+ stream << ", "
+ << R"("IsMaxValueExact": "unknown")";
+ }
+ if (stats->is_min_value_exact().has_value()) {
+ stream << ", "
+ << R"("IsMinValueExact": ")"
+ << (stats->is_min_value_exact().value() ? "True" : "False")
<< "\"";
+ } else {
+ stream << ", "
+ << R"("IsMinValueExact": "unknown")";
+ }
}
stream << " },";
} else {
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 7093a5c116..7ae9021e35 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -1023,7 +1023,7 @@ Column 0
Uncompressed Size: 103, Compressed Size: 104
Column 1
Values: 3, Null Values: 0, Distinct Values: 0
- Max: 1, Min: 1
+ Max (exact: unknown): 1, Min (exact: unknown): 1
Compression: SNAPPY, Encodings: PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY
Uncompressed Size: 52, Compressed Size: 56
)###";
@@ -1108,6 +1108,37 @@ class TestJSONWithLocalFile : public ::testing::Test {
}
};
+TEST_F(TestJSONWithLocalFile, JSONOutputWithStatistics) {
+ std::string json_output = R"###({
+ "FileName": "nested_lists.snappy.parquet",
+ "Version": "1.0",
+ "CreatedBy": "parquet-mr version 1.8.2 (build
c6522788629e590a53eb79874b95f6c3ff11f16c)",
+ "TotalRows": "3",
+ "NumberOfRowGroups": "1",
+ "NumberOfRealColumns": "2",
+ "NumberOfColumns": "2",
+ "Columns": [
+ { "Id": "0", "Name": "a.list.element.list.element.list.element",
"PhysicalType": "BYTE_ARRAY", "ConvertedType": "UTF8", "LogicalType": {"Type":
"String"} },
+ { "Id": "1", "Name": "b", "PhysicalType": "INT32", "ConvertedType":
"NONE", "LogicalType": {"Type": "None"} }
+ ],
+ "RowGroups": [
+ {
+ "Id": "0", "TotalBytes": "155", "TotalCompressedBytes": "0", "Rows":
"3",
+ "ColumnChunks": [
+ {"Id": "0", "Values": "18", "StatsSet": "False",
+ "Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE)
PLAIN_DICTIONARY", "UncompressedSize": "103", "CompressedSize": "104" },
+ {"Id": "1", "Values": "3", "StatsSet": "True", "Stats": {"NumNulls":
"0", "Max": "1", "Min": "1", "IsMaxValueExact": "unknown", "IsMinValueExact":
"unknown" },
+ "Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE)
PLAIN_DICTIONARY", "UncompressedSize": "52", "CompressedSize": "56" }
+ ]
+ }
+ ]
+}
+)###";
+
+ std::string json_content = ReadFromLocalFile("nested_lists.snappy.parquet");
+ ASSERT_EQ(json_output, json_content);
+}
+
TEST_F(TestJSONWithLocalFile, JSONOutput) {
std::string json_output = R"###({
"FileName": "alltypes_plain.parquet",
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index c373d75cec..932fc66ef1 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -590,6 +590,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
Copy(min, &min_, min_buffer_.get());
Copy(max, &max_, max_buffer_.get());
has_min_max_ = true;
+ statistics_.is_min_value_exact = true;
+ statistics_.is_max_value_exact = true;
}
// Create stats from a thrift Statistics object.
@@ -597,6 +599,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
const std::string& encoded_max, int64_t num_values,
int64_t null_count, int64_t distinct_count, bool
has_min_max,
bool has_null_count, bool has_distinct_count,
MemoryPool* pool)
+ : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values,
null_count,
+ distinct_count, has_min_max, has_null_count,
+ has_distinct_count,
+ /*is_min_value_exact=*/std::nullopt,
+ /*is_max_value_exact=*/std::nullopt, pool) {}
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string&
encoded_min,
+ const std::string& encoded_max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count, bool
has_min_max,
+ bool has_null_count, bool has_distinct_count,
+ std::optional<bool> is_min_value_exact,
+ std::optional<bool> is_max_value_exact, MemoryPool* pool)
: TypedStatisticsImpl(descr, pool) {
TypedStatisticsImpl::IncrementNumValues(num_values);
if (has_null_count) {
@@ -613,6 +627,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
if (has_min_max) {
PlainDecode(encoded_min, &min_);
PlainDecode(encoded_max, &max_);
+ statistics_.is_min_value_exact = is_min_value_exact;
+ statistics_.is_max_value_exact = is_max_value_exact;
}
has_min_max_ = has_min_max;
@@ -659,7 +675,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
return null_count() == other.null_count() &&
distinct_count() == other.distinct_count() &&
- num_values() == other.num_values();
+ num_values() == other.num_values() &&
+ is_min_value_exact() == other.is_min_value_exact() &&
+ is_max_value_exact() == other.is_max_value_exact();
}
bool MinMaxEqual(const TypedStatisticsImpl& other) const;
@@ -742,6 +760,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
if (HasMinMax()) {
s.set_min(this->EncodeMin());
s.set_max(this->EncodeMax());
+ s.is_min_value_exact = this->is_min_value_exact();
+ s.is_max_value_exact = this->is_max_value_exact();
}
if (HasNullCount()) {
s.set_null_count(this->null_count());
@@ -757,6 +777,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
int64_t null_count() const override { return statistics_.null_count; }
int64_t distinct_count() const override { return statistics_.distinct_count;
}
int64_t num_values() const override { return num_values_; }
+ std::optional<bool> is_min_value_exact() const override {
+ return statistics_.is_min_value_exact;
+ }
+ std::optional<bool> is_max_value_exact() const override {
+ return statistics_.is_max_value_exact;
+ }
private:
const ColumnDescriptor* descr_;
@@ -821,6 +847,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
Copy(comparator_->Compare(min_, min) ? min_ : min, &min_,
min_buffer_.get());
Copy(comparator_->Compare(max_, max) ? max : max_, &max_,
max_buffer_.get());
}
+ statistics_.is_min_value_exact = true;
+ statistics_.is_max_value_exact = true;
}
};
@@ -1042,7 +1070,8 @@ std::shared_ptr<Statistics> Statistics::Make(const
ColumnDescriptor* descr,
return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values,
encoded_stats->null_count, encoded_stats->distinct_count,
encoded_stats->has_min && encoded_stats->has_max,
- encoded_stats->has_null_count,
encoded_stats->has_distinct_count, pool);
+ encoded_stats->has_null_count, encoded_stats->has_distinct_count,
+ encoded_stats->is_min_value_exact,
encoded_stats->is_max_value_exact, pool);
}
std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
@@ -1052,11 +1081,24 @@ std::shared_ptr<Statistics> Statistics::Make(const
ColumnDescriptor* descr,
int64_t distinct_count, bool
has_min_max,
bool has_null_count, bool
has_distinct_count,
::arrow::MemoryPool* pool) {
+ return Statistics::Make(descr, encoded_min, encoded_max, num_values,
null_count,
+ distinct_count, has_min_max, has_null_count,
has_distinct_count,
+ /*is_min_value_exact=*/std::nullopt,
+ /*is_max_value_exact=*/std::nullopt, pool);
+}
+
+std::shared_ptr<Statistics> Statistics::Make(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count, std::optional<bool> is_min_value_exact,
+ std::optional<bool> is_max_value_exact, ::arrow::MemoryPool* pool) {
#define MAKE_STATS(CAP_TYPE, KLASS)
\
case Type::CAP_TYPE:
\
return std::make_shared<TypedStatisticsImpl<KLASS>>(
\
descr, encoded_min, encoded_max, num_values, null_count,
distinct_count, \
- has_min_max, has_null_count, has_distinct_count, pool)
+ has_min_max, has_null_count, has_distinct_count, is_min_value_exact,
\
+ is_max_value_exact, pool)
switch (descr->physical_type()) {
MAKE_STATS(BOOLEAN, BooleanType);
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index c5da44a7b6..9b0dd8fab8 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -128,6 +128,9 @@ class PARQUET_EXPORT EncodedStatistics {
const std::string& max() const { return max_; }
const std::string& min() const { return min_; }
+ std::optional<bool> is_max_value_exact;
+ std::optional<bool> is_min_value_exact;
+
int64_t null_count = 0;
int64_t distinct_count = 0;
@@ -151,10 +154,12 @@ class PARQUET_EXPORT EncodedStatistics {
if (max_.length() > length) {
has_max = false;
max_.clear();
+ is_max_value_exact = std::nullopt;
}
if (min_.length() > length) {
has_min = false;
min_.clear();
+ is_min_value_exact = std::nullopt;
}
}
@@ -223,6 +228,28 @@ class PARQUET_EXPORT Statistics {
bool has_distinct_count,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+ /// \brief Create a new statistics instance given a column schema
+ /// definition and preexisting state
+ /// \param[in] descr the column schema
+ /// \param[in] encoded_min the encoded minimum value
+ /// \param[in] encoded_max the encoded maximum value
+ /// \param[in] num_values total number of values
+ /// \param[in] null_count number of null values
+ /// \param[in] distinct_count number of distinct values
+ /// \param[in] has_min_max whether the min/max statistics are set
+ /// \param[in] has_null_count whether the null_count statistics are set
+ /// \param[in] has_distinct_count whether the distinct_count statistics are
set
+ /// \param[in] is_min_value_exact whether the min value is exact
+ /// \param[in] is_max_value_exact whether the max value is exact
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count, std::optional<bool> is_min_value_exact,
+ std::optional<bool> is_max_value_exact,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
// Helper function to convert EncodedStatistics to Statistics.
// EncodedStatistics does not contain number of non-null values, and it can
be
// passed using the num_values parameter.
@@ -259,6 +286,14 @@ class PARQUET_EXPORT Statistics {
/// \brief Plain-encoded maximum value
virtual std::string EncodeMax() const = 0;
+ /// \brief Return the minimum value exact flag if set.
+ /// It will be true if there was no truncation.
+ virtual std::optional<bool> is_min_value_exact() const = 0;
+
+ /// \brief Return the maximum value exact flag if set.
+ /// It will be true if there was no truncation.
+ virtual std::optional<bool> is_max_value_exact() const = 0;
+
/// \brief The finalized encoded form of the statistics for transport
virtual EncodedStatistics Encode() = 0;
@@ -376,7 +411,23 @@ std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
bool has_distinct_count, ::arrow::MemoryPool* pool =
::arrow::default_memory_pool()) {
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
- has_min_max, has_null_count, has_distinct_count, pool));
+ has_min_max, has_null_count, has_distinct_count,
+ /*is_min_value_exact=*/std::nullopt,
/*is_max_value_exact=*/std::nullopt, pool));
+}
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count, std::optional<bool> is_min_value_exact,
+ std::optional<bool> is_max_value_exact,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(
+ Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
+ distinct_count, has_min_max, has_null_count,
has_distinct_count,
+ is_min_value_exact, is_max_value_exact, pool));
}
} // namespace parquet
diff --git a/cpp/src/parquet/statistics_test.cc
b/cpp/src/parquet/statistics_test.cc
index 360c69c5b8..cfb6f49281 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -320,9 +320,11 @@ class TestStatistics : public PrimitiveTypedTest<TestType>
{
std::string encoded_min = statistics1->EncodeMin();
std::string encoded_max = statistics1->EncodeMax();
- auto statistics2 =
- MakeStatistics<TestType>(this->schema_.Column(0), encoded_min,
encoded_max,
- this->values_.size(), 0, 0, true, true, true);
+ auto statistics2 = MakeStatistics<TestType>(
+ this->schema_.Column(0), encoded_min, encoded_max,
this->values_.size(),
+ /*null_count=*/0, /*distinct_count=*/0,
+ /*has_min_max=*/true, /*has_null_count=*/true,
/*has_distinct_count=*/true,
+ /*is_min_value_exact=*/true, /*is_max_value_exact=*/true);
auto statistics3 = MakeStatistics<TestType>(this->schema_.Column(0));
std::vector<uint8_t> valid_bits(
@@ -332,14 +334,29 @@ class TestStatistics : public
PrimitiveTypedTest<TestType> {
std::string encoded_min_spaced = statistics3->EncodeMin();
std::string encoded_max_spaced = statistics3->EncodeMax();
+ // Use old API without is_{min/max}_value_exact
+ auto statistics4 = MakeStatistics<TestType>(
+ this->schema_.Column(0), encoded_min, encoded_max,
this->values_.size(),
+ /*null_count=*/0, /*distinct_count=*/0,
+ /*has_min_max=*/true, /*has_null_count=*/true,
/*has_distinct_count=*/true);
ASSERT_EQ(encoded_min, statistics2->EncodeMin());
ASSERT_EQ(encoded_max, statistics2->EncodeMax());
ASSERT_EQ(statistics1->min(), statistics2->min());
ASSERT_EQ(statistics1->max(), statistics2->max());
+ ASSERT_EQ(statistics1->is_min_value_exact(), std::make_optional(true));
+ ASSERT_EQ(statistics1->is_max_value_exact(), std::make_optional(true));
+ ASSERT_EQ(statistics2->is_min_value_exact(), std::make_optional(true));
+ ASSERT_EQ(statistics2->is_max_value_exact(), std::make_optional(true));
ASSERT_EQ(encoded_min_spaced, statistics2->EncodeMin());
ASSERT_EQ(encoded_max_spaced, statistics2->EncodeMax());
ASSERT_EQ(statistics3->min(), statistics2->min());
ASSERT_EQ(statistics3->max(), statistics2->max());
+ ASSERT_EQ(statistics3->is_min_value_exact(), std::make_optional(true));
+ ASSERT_EQ(statistics3->is_max_value_exact(), std::make_optional(true));
+ ASSERT_EQ(statistics4->min(), statistics2->min());
+ ASSERT_EQ(statistics4->max(), statistics2->max());
+ ASSERT_EQ(statistics4->is_min_value_exact(), std::nullopt);
+ ASSERT_EQ(statistics4->is_max_value_exact(), std::nullopt);
}
void TestReset() {
@@ -459,6 +476,8 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
EXPECT_TRUE(enc_stats->has_max);
EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min());
EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max());
+ EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true));
+ EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true));
}
};
@@ -550,9 +569,12 @@ void TestStatistics<ByteArrayType>::TestMinMaxEncode() {
std::string(reinterpret_cast<const char*>(statistics1->max().ptr),
statistics1->max().len));
- auto statistics2 =
- MakeStatistics<ByteArrayType>(this->schema_.Column(0), encoded_min,
encoded_max,
- this->values_.size(), 0, 0, true, true,
true);
+ auto statistics2 = MakeStatistics<ByteArrayType>(
+ this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(),
+ /*null_count=*/0,
+ /*distinct_count=*/0, /*has_min_max=*/true, /*has_null_count=*/true,
+ /*has_distinct_count=*/true, /*is_min_value_exact=*/true,
+ /*is_max_value_exact=*/true);
ASSERT_EQ(encoded_min, statistics2->EncodeMin());
ASSERT_EQ(encoded_max, statistics2->EncodeMax());
@@ -693,6 +715,8 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
EXPECT_FALSE(statistics1->HasMinMax());
EXPECT_FALSE(encoded_stats1.has_min);
EXPECT_FALSE(encoded_stats1.has_max);
+ EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt);
+ EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt);
}
// Create a statistics object with min-max.
std::shared_ptr<TypedStatistics<TestType>> statistics2;
@@ -703,12 +727,18 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
EXPECT_TRUE(statistics2->HasMinMax());
EXPECT_TRUE(encoded_stats2.has_min);
EXPECT_TRUE(encoded_stats2.has_max);
+ EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true));
+ EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true));
}
VerifyMergedStatistics(*statistics1, *statistics2,
[](TypedStatistics<TestType>* merged_statistics) {
EXPECT_TRUE(merged_statistics->HasMinMax());
EXPECT_TRUE(merged_statistics->Encode().has_min);
EXPECT_TRUE(merged_statistics->Encode().has_max);
+
EXPECT_EQ(merged_statistics->Encode().is_min_value_exact,
+ std::make_optional(true));
+
EXPECT_EQ(merged_statistics->Encode().is_max_value_exact,
+ std::make_optional(true));
});
}
@@ -775,6 +805,8 @@ class TestStatisticsHasFlag : public
TestStatistics<TestType> {
EXPECT_FALSE(encoded.has_distinct_count);
EXPECT_FALSE(encoded.has_min);
EXPECT_FALSE(encoded.has_max);
+ EXPECT_FALSE(encoded.is_min_value_exact.has_value());
+ EXPECT_FALSE(encoded.is_max_value_exact.has_value());
}
};
@@ -971,6 +1003,8 @@ class TestStatisticsSortOrder : public ::testing::Test {
rg_metadata->ColumnChunk(i);
EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin());
EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax());
+ EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true));
+ EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true));
}
}
@@ -1007,11 +1041,15 @@ void TestStatisticsSortOrder<Int32Type>::SetValues() {
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[5]),
sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[4]),
sizeof(c_type)));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
// Write INT32 min/max values
stats_[1]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]),
sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]),
sizeof(c_type)));
+ stats_[1].is_max_value_exact = true;
+ stats_[1].is_min_value_exact = true;
}
// TYPE::INT64
@@ -1035,11 +1073,15 @@ void TestStatisticsSortOrder<Int64Type>::SetValues() {
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[5]),
sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[4]),
sizeof(c_type)));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
// Write INT64 min/max values
stats_[1]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]),
sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]),
sizeof(c_type)));
+ stats_[1].is_max_value_exact = true;
+ stats_[1].is_min_value_exact = true;
}
// TYPE::FLOAT
@@ -1054,6 +1096,8 @@ void TestStatisticsSortOrder<FloatType>::SetValues() {
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]),
sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]),
sizeof(c_type)));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
}
// TYPE::DOUBLE
@@ -1068,6 +1112,8 @@ void TestStatisticsSortOrder<DoubleType>::SetValues() {
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]),
sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]),
sizeof(c_type)));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
}
// TYPE::ByteArray
@@ -1100,6 +1146,8 @@ void TestStatisticsSortOrder<ByteArrayType>::SetValues() {
std::string(reinterpret_cast<const char*>(vals[2].c_str()),
vals[2].length()))
.set_max(
std::string(reinterpret_cast<const char*>(vals[9].c_str()),
vals[9].length()));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
}
// TYPE::FLBAArray
@@ -1129,6 +1177,8 @@ void TestStatisticsSortOrder<FLBAType>::SetValues() {
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&vals[1][0]),
FLBA_LENGTH))
.set_max(std::string(reinterpret_cast<const char*>(&vals[8][0]),
FLBA_LENGTH));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
}
template <>
@@ -1162,6 +1212,8 @@ void
TestStatisticsSortOrder<Float16LogicalType>::SetValues() {
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(values_[7].ptr),
kValueLen))
.set_max(std::string(reinterpret_cast<const char*>(values_[2].ptr),
kValueLen));
+ stats_[0].is_max_value_exact = true;
+ stats_[0].is_min_value_exact = true;
}
TYPED_TEST_SUITE(TestStatisticsSortOrder, CompareTestTypes);
@@ -1243,6 +1295,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, T
expected_min, T expecte
ASSERT_TRUE(stats->HasMinMax());
EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
+ EXPECT_EQ(stats->is_min_value_exact(), std::make_optional(true));
+ EXPECT_EQ(stats->is_max_value_exact(), std::make_optional(true));
}
template <typename Stats, typename Array, typename T = typename Stats::T>
@@ -1256,12 +1310,16 @@ void AssertMinMaxAre(Stats stats, const Array& values,
const uint8_t* valid_bitm
ASSERT_TRUE(stats->HasMinMax());
EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
+ EXPECT_EQ(stats->is_min_value_exact(), std::make_optional(true));
+ EXPECT_EQ(stats->is_max_value_exact(), std::make_optional(true));
}
template <typename Stats, typename Array>
void AssertUnsetMinMax(Stats stats, const Array& values) {
stats->Update(values.data(), values.size(), 0);
ASSERT_FALSE(stats->HasMinMax());
+ ASSERT_FALSE(stats->is_min_value_exact().has_value());
+ ASSERT_FALSE(stats->is_max_value_exact().has_value());
}
template <typename Stats, typename Array>
@@ -1272,6 +1330,8 @@ void AssertUnsetMinMax(Stats stats, const Array& values,
const uint8_t* valid_bi
stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count +
null_count,
non_null_count, null_count);
ASSERT_FALSE(stats->HasMinMax());
+ ASSERT_FALSE(stats->is_min_value_exact().has_value());
+ ASSERT_FALSE(stats->is_max_value_exact().has_value());
}
template <typename ParquetType, typename T = typename ParquetType::c_type>
@@ -1598,31 +1658,104 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) {
ASSERT_EQ(12, stats->num_values());
ASSERT_EQ(0x00, stats->EncodeMin()[0]);
ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
+ std::shared_ptr<EncodedStatistics> enc_stats =
column_chunk->encoded_statistics();
+ ASSERT_FALSE(enc_stats->is_max_value_exact.has_value());
+ ASSERT_FALSE(enc_stats->is_min_value_exact.has_value());
+}
+
+// Test statistics for binary column with truncated max and min values
+TEST(TestEncodedStatistics, TruncatedMinMax) {
+ std::string dir_string(test::get_data_dir());
+ std::stringstream ss;
+ ss << dir_string << "/binary_truncated_min_max.parquet";
+ auto path = ss.str();
+
+ // The file is generated by parquet-rs 55.1.0. It
+ // contains six columns of utf-8 and binary type. statistics_truncate_length
+ // is set to 2. Columns 0 and 1 will have truncation of min and max value,
+ // columns 2 and 3 will have truncation of min value only.
+ // Columns 4 and 5 will have no truncation where is_min_value_exact and
+ // is_max_value_exact are set to true.
+ // More file details in:
+ //
https://github.com/apache/parquet-testing/tree/master/data#binary-truncated-min-and-max-statistics
+ auto file_reader = ParquetFileReader::OpenFile(path);
+ auto rg_reader = file_reader->RowGroup(0);
+ auto metadata = rg_reader->metadata();
+ auto column_schema = metadata->schema()->Column(0);
+ ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());
+ ASSERT_EQ(6, metadata->num_columns());
+
+ for (int num_column = 0; num_column < metadata->num_columns(); ++num_column)
{
+ auto column_chunk = metadata->ColumnChunk(num_column);
+ ASSERT_TRUE(column_chunk->is_stats_set());
+
+ std::shared_ptr<EncodedStatistics> encoded_statistics =
+ column_chunk->encoded_statistics();
+ ASSERT_TRUE(encoded_statistics != NULL);
+ ASSERT_EQ(0, encoded_statistics->null_count);
+ EXPECT_EQ("Al", encoded_statistics->min());
+ ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value());
+ ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value());
+ switch (num_column) {
+ case 2:
+ // Max couldn't truncate the utf-8 string longer than 2 bytes
+ EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max());
+ ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
+ ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
+ break;
+ case 3:
+ // Max couldn't truncate 0xFFFF binary string
+ EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max());
+ ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
+ ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
+ break;
+ case 4:
+ case 5:
+ // Min and Max are not truncated, fit on 2 bytes
+ EXPECT_EQ("Ke", encoded_statistics->max());
+ ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
+ ASSERT_TRUE(encoded_statistics->is_min_value_exact.value());
+ break;
+ default:
+ // Max truncated to 2 bytes on columns 0 and 1
+ EXPECT_EQ("Kf", encoded_statistics->max());
+ ASSERT_FALSE(encoded_statistics->is_max_value_exact.value());
+ ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
+ }
+ }
}
TEST(TestEncodedStatistics, CopySafe) {
EncodedStatistics encoded_statistics;
encoded_statistics.set_max("abc");
- encoded_statistics.has_max = true;
+ ASSERT_TRUE(encoded_statistics.has_max);
+ encoded_statistics.is_max_value_exact = true;
+ ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value());
encoded_statistics.set_min("abc");
- encoded_statistics.has_min = true;
+ ASSERT_TRUE(encoded_statistics.has_min);
+ encoded_statistics.is_min_value_exact = true;
+ ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value());
EncodedStatistics copy_statistics = encoded_statistics;
copy_statistics.set_max("abcd");
copy_statistics.set_min("a");
+ copy_statistics.is_max_value_exact = false;
+ copy_statistics.is_min_value_exact = false;
EXPECT_EQ("abc", encoded_statistics.min());
EXPECT_EQ("abc", encoded_statistics.max());
+ EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true));
+ EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true));
}
TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
EncodedStatistics encoded_statistics;
encoded_statistics.set_min("a");
- encoded_statistics.has_min = true;
+ ASSERT_TRUE(encoded_statistics.has_min);
encoded_statistics.set_max("abc");
- encoded_statistics.has_max = true;
+ ASSERT_TRUE(encoded_statistics.has_max);
encoded_statistics.ApplyStatSizeLimits(2);
diff --git a/cpp/src/parquet/thrift_internal.h
b/cpp/src/parquet/thrift_internal.h
index 3055ef3f23..8f82adae92 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -270,9 +270,15 @@ static inline EncodedStatistics FromThrift(const
format::Statistics& stats) {
// TODO: check if the column_order is TYPE_DEFINED_ORDER.
if (stats.__isset.max_value) {
out.set_max(stats.max_value);
+ if (stats.__isset.is_max_value_exact) {
+ out.is_max_value_exact = stats.is_max_value_exact;
+ }
}
if (stats.__isset.min_value) {
out.set_min(stats.min_value);
+ if (stats.__isset.is_min_value_exact) {
+ out.is_min_value_exact = stats.is_min_value_exact;
+ }
}
} else if (stats.__isset.max || stats.__isset.min) {
// TODO: check created_by to see if it is corrupted for some types.
@@ -475,6 +481,9 @@ static inline format::Statistics ToThrift(const
EncodedStatistics& stats) {
format::Statistics statistics;
if (stats.has_min) {
statistics.__set_min_value(stats.min());
+ if (stats.is_min_value_exact.has_value()) {
+ statistics.__set_is_min_value_exact(stats.is_min_value_exact.value());
+ }
// If the order is SIGNED, then the old min value must be set too.
// This for backward compatibility
if (stats.is_signed()) {
@@ -483,6 +492,9 @@ static inline format::Statistics ToThrift(const
EncodedStatistics& stats) {
}
if (stats.has_max) {
statistics.__set_max_value(stats.max());
+ if (stats.is_max_value_exact.has_value()) {
+ statistics.__set_is_max_value_exact(stats.is_max_value_exact.value());
+ }
// If the order is SIGNED, then the old max value must be set too.
// This for backward compatibility
if (stats.is_signed()) {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index 18d1754009..4cf674fc58 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit 18d17540097fca7c40be3d42c167e6bfad90763c
+Subproject commit 4cf674fc589309ba8651ad676f3fb557582d9cab