This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7189472881 GH-46905: [C++][Parquet] Expose 
Statistics.is_{min/max}_value_exact and default set to true if min/max are set 
(#46992)
7189472881 is described below

commit 71894728810fa2c191a95315574fa5ea4546566f
Author: Raúl Cumplido <[email protected]>
AuthorDate: Wed Aug 27 16:14:31 2025 +0200

    GH-46905: [C++][Parquet] Expose Statistics.is_{min/max}_value_exact and 
default set to true if min/max are set (#46992)
    
    ### Rationale for this change
    
    The `is_{min/max}_value_exact` fields exist in the Thrift definition, and 
some implementations already use them and truncate min and max values. 
This PR exposes those fields and defaults them to true when writing files 
from C++, as no truncation is happening at the moment. If min/max statistics are 
generated, we can set `is_{min/max}_value_exact` to true.
    
    Truncation for string and binary min/max is out of scope for this PR; we 
can do this in a follow-up PR.
    
    ### What changes are included in this PR?
    
    - The fields have been added to EncodedStatistics and Statistics along with 
the Thrift integration.
    - Tests and validation with a newly generated parquet-testing file where the 
fields are present (https://github.com/apache/parquet-testing/pull/88)
    - Tests with existing files without the fields.
    - Update existing tests to validate the new fields.
    - Add new fields to `ParquetFilePrinter`
    
    ### Are these changes tested?
    
    Yes, on CI.
    
    ### Are there any user-facing changes?
    
    Yes, the new fields will be available to users through the API when reading 
Parquet files.
    
    * GitHub Issue: #46905
    
    Authored-by: Raúl Cumplido <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/metadata.cc        |  13 +++-
 cpp/src/parquet/printer.cc         |  28 ++++++-
 cpp/src/parquet/reader_test.cc     |  33 +++++++-
 cpp/src/parquet/statistics.cc      |  48 +++++++++++-
 cpp/src/parquet/statistics.h       |  53 ++++++++++++-
 cpp/src/parquet/statistics_test.cc | 153 ++++++++++++++++++++++++++++++++++---
 cpp/src/parquet/thrift_internal.h  |  12 +++
 cpp/submodules/parquet-testing     |   2 +-
 8 files changed, 322 insertions(+), 20 deletions(-)

diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 30d69f4db5..c606561ca9 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -92,6 +92,14 @@ std::string ParquetVersionToString(ParquetVersion::type ver) 
{
 template <typename DType>
 static std::shared_ptr<Statistics> MakeTypedColumnStats(
     const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
+  std::optional<bool> min_exact =
+      metadata.statistics.__isset.is_min_value_exact
+          ? std::optional<bool>(metadata.statistics.is_min_value_exact)
+          : std::nullopt;
+  std::optional<bool> max_exact =
+      metadata.statistics.__isset.is_max_value_exact
+          ? std::optional<bool>(metadata.statistics.is_max_value_exact)
+          : std::nullopt;
   // If ColumnOrder is defined, return max_value and min_value
   if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
     return MakeStatistics<DType>(
@@ -100,7 +108,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
         metadata.statistics.null_count, metadata.statistics.distinct_count,
         metadata.statistics.__isset.max_value && 
metadata.statistics.__isset.min_value,
         metadata.statistics.__isset.null_count,
-        metadata.statistics.__isset.distinct_count);
+        metadata.statistics.__isset.distinct_count, min_exact, max_exact);
   }
   // Default behavior
   return MakeStatistics<DType>(
@@ -108,7 +116,8 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
       metadata.num_values - metadata.statistics.null_count,
       metadata.statistics.null_count, metadata.statistics.distinct_count,
       metadata.statistics.__isset.max && metadata.statistics.__isset.min,
-      metadata.statistics.__isset.null_count, 
metadata.statistics.__isset.distinct_count);
+      metadata.statistics.__isset.null_count, 
metadata.statistics.__isset.distinct_count,
+      min_exact, max_exact);
 }
 
 namespace {
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index dfd1d85809..dfce57a00f 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -166,11 +166,19 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, 
std::list<int> selecte
       stream << "  Values: " << column_chunk->num_values();
       if (column_chunk->is_stats_set()) {
         std::string min = stats->min(), max = stats->max();
+        std::string max_exact =
+            stats->is_max_value_exact.has_value()
+                ? (stats->is_max_value_exact.value() ? "true" : "false")
+                : "unknown";
+        std::string min_exact =
+            stats->is_min_value_exact.has_value()
+                ? (stats->is_min_value_exact.value() ? "true" : "false")
+                : "unknown";
         stream << ", Null Values: " << stats->null_count
                << ", Distinct Values: " << stats->distinct_count << std::endl
-               << "  Max: "
+               << "  Max (exact: " << max_exact << "): "
                << FormatStatValue(descr->physical_type(), max, 
descr->logical_type())
-               << ", Min: "
+               << ", Min (exact: " << min_exact << "): "
                << FormatStatValue(descr->physical_type(), min, 
descr->logical_type());
       } else {
         stream << "  Statistics Not Set";
@@ -342,6 +350,22 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, 
std::list<int> selected
                  << R"("Min": ")"
                  << FormatStatValue(descr->physical_type(), min, 
descr->logical_type())
                  << "\"";
+          if (stats->is_max_value_exact().has_value()) {
+            stream << ", "
+                   << R"("IsMaxValueExact": ")"
+                   << (stats->is_max_value_exact().value() ? "True" : "False") 
<< "\"";
+          } else {
+            stream << ", "
+                   << R"("IsMaxValueExact": "unknown")";
+          }
+          if (stats->is_min_value_exact().has_value()) {
+            stream << ", "
+                   << R"("IsMinValueExact": ")"
+                   << (stats->is_min_value_exact().value() ? "True" : "False") 
<< "\"";
+          } else {
+            stream << ", "
+                   << R"("IsMinValueExact": "unknown")";
+          }
         }
         stream << " },";
       } else {
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 7093a5c116..7ae9021e35 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -1023,7 +1023,7 @@ Column 0
   Uncompressed Size: 103, Compressed Size: 104
 Column 1
   Values: 3, Null Values: 0, Distinct Values: 0
-  Max: 1, Min: 1
+  Max (exact: unknown): 1, Min (exact: unknown): 1
   Compression: SNAPPY, Encodings: PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY
   Uncompressed Size: 52, Compressed Size: 56
 )###";
@@ -1108,6 +1108,37 @@ class TestJSONWithLocalFile : public ::testing::Test {
   }
 };
 
+TEST_F(TestJSONWithLocalFile, JSONOutputWithStatistics) {
+  std::string json_output = R"###({
+  "FileName": "nested_lists.snappy.parquet",
+  "Version": "1.0",
+  "CreatedBy": "parquet-mr version 1.8.2 (build 
c6522788629e590a53eb79874b95f6c3ff11f16c)",
+  "TotalRows": "3",
+  "NumberOfRowGroups": "1",
+  "NumberOfRealColumns": "2",
+  "NumberOfColumns": "2",
+  "Columns": [
+     { "Id": "0", "Name": "a.list.element.list.element.list.element", 
"PhysicalType": "BYTE_ARRAY", "ConvertedType": "UTF8", "LogicalType": {"Type": 
"String"} },
+     { "Id": "1", "Name": "b", "PhysicalType": "INT32", "ConvertedType": 
"NONE", "LogicalType": {"Type": "None"} }
+  ],
+  "RowGroups": [
+     {
+       "Id": "0",  "TotalBytes": "155",  "TotalCompressedBytes": "0",  "Rows": 
"3",
+       "ColumnChunks": [
+          {"Id": "0", "Values": "18", "StatsSet": "False",
+           "Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE) 
PLAIN_DICTIONARY", "UncompressedSize": "103", "CompressedSize": "104" },
+          {"Id": "1", "Values": "3", "StatsSet": "True", "Stats": {"NumNulls": 
"0", "Max": "1", "Min": "1", "IsMaxValueExact": "unknown", "IsMinValueExact": 
"unknown" },
+           "Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE) 
PLAIN_DICTIONARY", "UncompressedSize": "52", "CompressedSize": "56" }
+        ]
+     }
+  ]
+}
+)###";
+
+  std::string json_content = ReadFromLocalFile("nested_lists.snappy.parquet");
+  ASSERT_EQ(json_output, json_content);
+}
+
 TEST_F(TestJSONWithLocalFile, JSONOutput) {
   std::string json_output = R"###({
   "FileName": "alltypes_plain.parquet",
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index c373d75cec..932fc66ef1 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -590,6 +590,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     Copy(min, &min_, min_buffer_.get());
     Copy(max, &max_, max_buffer_.get());
     has_min_max_ = true;
+    statistics_.is_min_value_exact = true;
+    statistics_.is_max_value_exact = true;
   }
 
   // Create stats from a thrift Statistics object.
@@ -597,6 +599,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
                       const std::string& encoded_max, int64_t num_values,
                       int64_t null_count, int64_t distinct_count, bool 
has_min_max,
                       bool has_null_count, bool has_distinct_count, 
MemoryPool* pool)
+      : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, 
null_count,
+                            distinct_count, has_min_max, has_null_count,
+                            has_distinct_count,
+                            /*is_min_value_exact=*/std::nullopt,
+                            /*is_max_value_exact=*/std::nullopt, pool) {}
+
+  TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& 
encoded_min,
+                      const std::string& encoded_max, int64_t num_values,
+                      int64_t null_count, int64_t distinct_count, bool 
has_min_max,
+                      bool has_null_count, bool has_distinct_count,
+                      std::optional<bool> is_min_value_exact,
+                      std::optional<bool> is_max_value_exact, MemoryPool* pool)
       : TypedStatisticsImpl(descr, pool) {
     TypedStatisticsImpl::IncrementNumValues(num_values);
     if (has_null_count) {
@@ -613,6 +627,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     if (has_min_max) {
       PlainDecode(encoded_min, &min_);
       PlainDecode(encoded_max, &max_);
+      statistics_.is_min_value_exact = is_min_value_exact;
+      statistics_.is_max_value_exact = is_max_value_exact;
     }
 
     has_min_max_ = has_min_max;
@@ -659,7 +675,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
     return null_count() == other.null_count() &&
            distinct_count() == other.distinct_count() &&
-           num_values() == other.num_values();
+           num_values() == other.num_values() &&
+           is_min_value_exact() == other.is_min_value_exact() &&
+           is_max_value_exact() == other.is_max_value_exact();
   }
 
   bool MinMaxEqual(const TypedStatisticsImpl& other) const;
@@ -742,6 +760,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     if (HasMinMax()) {
       s.set_min(this->EncodeMin());
       s.set_max(this->EncodeMax());
+      s.is_min_value_exact = this->is_min_value_exact();
+      s.is_max_value_exact = this->is_max_value_exact();
     }
     if (HasNullCount()) {
       s.set_null_count(this->null_count());
@@ -757,6 +777,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
   int64_t null_count() const override { return statistics_.null_count; }
   int64_t distinct_count() const override { return statistics_.distinct_count; 
}
   int64_t num_values() const override { return num_values_; }
+  std::optional<bool> is_min_value_exact() const override {
+    return statistics_.is_min_value_exact;
+  }
+  std::optional<bool> is_max_value_exact() const override {
+    return statistics_.is_max_value_exact;
+  }
 
  private:
   const ColumnDescriptor* descr_;
@@ -821,6 +847,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
       Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, 
min_buffer_.get());
       Copy(comparator_->Compare(max_, max) ? max : max_, &max_, 
max_buffer_.get());
     }
+    statistics_.is_min_value_exact = true;
+    statistics_.is_max_value_exact = true;
   }
 };
 
@@ -1042,7 +1070,8 @@ std::shared_ptr<Statistics> Statistics::Make(const 
ColumnDescriptor* descr,
   return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values,
               encoded_stats->null_count, encoded_stats->distinct_count,
               encoded_stats->has_min && encoded_stats->has_max,
-              encoded_stats->has_null_count, 
encoded_stats->has_distinct_count, pool);
+              encoded_stats->has_null_count, encoded_stats->has_distinct_count,
+              encoded_stats->is_min_value_exact, 
encoded_stats->is_max_value_exact, pool);
 }
 
 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
@@ -1052,11 +1081,24 @@ std::shared_ptr<Statistics> Statistics::Make(const 
ColumnDescriptor* descr,
                                              int64_t distinct_count, bool 
has_min_max,
                                              bool has_null_count, bool 
has_distinct_count,
                                              ::arrow::MemoryPool* pool) {
+  return Statistics::Make(descr, encoded_min, encoded_max, num_values, 
null_count,
+                          distinct_count, has_min_max, has_null_count, 
has_distinct_count,
+                          /*is_min_value_exact=*/std::nullopt,
+                          /*is_max_value_exact=*/std::nullopt, pool);
+}
+
+std::shared_ptr<Statistics> Statistics::Make(
+    const ColumnDescriptor* descr, const std::string& encoded_min,
+    const std::string& encoded_max, int64_t num_values, int64_t null_count,
+    int64_t distinct_count, bool has_min_max, bool has_null_count,
+    bool has_distinct_count, std::optional<bool> is_min_value_exact,
+    std::optional<bool> is_max_value_exact, ::arrow::MemoryPool* pool) {
 #define MAKE_STATS(CAP_TYPE, KLASS)                                            
  \
   case Type::CAP_TYPE:                                                         
  \
     return std::make_shared<TypedStatisticsImpl<KLASS>>(                       
  \
         descr, encoded_min, encoded_max, num_values, null_count, 
distinct_count, \
-        has_min_max, has_null_count, has_distinct_count, pool)
+        has_min_max, has_null_count, has_distinct_count, is_min_value_exact,   
  \
+        is_max_value_exact, pool)
 
   switch (descr->physical_type()) {
     MAKE_STATS(BOOLEAN, BooleanType);
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index c5da44a7b6..9b0dd8fab8 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -128,6 +128,9 @@ class PARQUET_EXPORT EncodedStatistics {
   const std::string& max() const { return max_; }
   const std::string& min() const { return min_; }
 
+  std::optional<bool> is_max_value_exact;
+  std::optional<bool> is_min_value_exact;
+
   int64_t null_count = 0;
   int64_t distinct_count = 0;
 
@@ -151,10 +154,12 @@ class PARQUET_EXPORT EncodedStatistics {
     if (max_.length() > length) {
       has_max = false;
       max_.clear();
+      is_max_value_exact = std::nullopt;
     }
     if (min_.length() > length) {
       has_min = false;
       min_.clear();
+      is_min_value_exact = std::nullopt;
     }
   }
 
@@ -223,6 +228,28 @@ class PARQUET_EXPORT Statistics {
       bool has_distinct_count,
       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
 
+  /// \brief Create a new statistics instance given a column schema
+  /// definition and preexisting state
+  /// \param[in] descr the column schema
+  /// \param[in] encoded_min the encoded minimum value
+  /// \param[in] encoded_max the encoded maximum value
+  /// \param[in] num_values total number of values
+  /// \param[in] null_count number of null values
+  /// \param[in] distinct_count number of distinct values
+  /// \param[in] has_min_max whether the min/max statistics are set
+  /// \param[in] has_null_count whether the null_count statistics are set
+  /// \param[in] has_distinct_count whether the distinct_count statistics are 
set
+  /// \param[in] is_min_value_exact whether the min value is exact
+  /// \param[in] is_max_value_exact whether the max value is exact
+  /// \param[in] pool a memory pool to use for any memory allocations, optional
+  static std::shared_ptr<Statistics> Make(
+      const ColumnDescriptor* descr, const std::string& encoded_min,
+      const std::string& encoded_max, int64_t num_values, int64_t null_count,
+      int64_t distinct_count, bool has_min_max, bool has_null_count,
+      bool has_distinct_count, std::optional<bool> is_min_value_exact,
+      std::optional<bool> is_max_value_exact,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
   // Helper function to convert EncodedStatistics to Statistics.
   // EncodedStatistics does not contain number of non-null values, and it can 
be
   // passed using the num_values parameter.
@@ -259,6 +286,14 @@ class PARQUET_EXPORT Statistics {
   /// \brief Plain-encoded maximum value
   virtual std::string EncodeMax() const = 0;
 
+  /// \brief Return the minimum value exact flag if set.
+  /// It will be true if there was no truncation.
+  virtual std::optional<bool> is_min_value_exact() const = 0;
+
+  /// \brief Return the maximum value exact flag if set.
+  /// It will be true if there was no truncation.
+  virtual std::optional<bool> is_max_value_exact() const = 0;
+
   /// \brief The finalized encoded form of the statistics for transport
   virtual EncodedStatistics Encode() = 0;
 
@@ -376,7 +411,23 @@ std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
     bool has_distinct_count, ::arrow::MemoryPool* pool = 
::arrow::default_memory_pool()) {
   return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
       descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
-      has_min_max, has_null_count, has_distinct_count, pool));
+      has_min_max, has_null_count, has_distinct_count,
+      /*is_min_value_exact=*/std::nullopt, 
/*is_max_value_exact=*/std::nullopt, pool));
+}
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+    const ColumnDescriptor* descr, const std::string& encoded_min,
+    const std::string& encoded_max, int64_t num_values, int64_t null_count,
+    int64_t distinct_count, bool has_min_max, bool has_null_count,
+    bool has_distinct_count, std::optional<bool> is_min_value_exact,
+    std::optional<bool> is_max_value_exact,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  return std::static_pointer_cast<TypedStatistics<DType>>(
+      Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
+                       distinct_count, has_min_max, has_null_count, 
has_distinct_count,
+                       is_min_value_exact, is_max_value_exact, pool));
 }
 
 }  // namespace parquet
diff --git a/cpp/src/parquet/statistics_test.cc 
b/cpp/src/parquet/statistics_test.cc
index 360c69c5b8..cfb6f49281 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -320,9 +320,11 @@ class TestStatistics : public PrimitiveTypedTest<TestType> 
{
     std::string encoded_min = statistics1->EncodeMin();
     std::string encoded_max = statistics1->EncodeMax();
 
-    auto statistics2 =
-        MakeStatistics<TestType>(this->schema_.Column(0), encoded_min, 
encoded_max,
-                                 this->values_.size(), 0, 0, true, true, true);
+    auto statistics2 = MakeStatistics<TestType>(
+        this->schema_.Column(0), encoded_min, encoded_max, 
this->values_.size(),
+        /*null_count=*/0, /*distinct_count=*/0,
+        /*has_min_max=*/true, /*has_null_count=*/true, 
/*has_distinct_count=*/true,
+        /*is_min_value_exact=*/true, /*is_max_value_exact=*/true);
 
     auto statistics3 = MakeStatistics<TestType>(this->schema_.Column(0));
     std::vector<uint8_t> valid_bits(
@@ -332,14 +334,29 @@ class TestStatistics : public 
PrimitiveTypedTest<TestType> {
     std::string encoded_min_spaced = statistics3->EncodeMin();
     std::string encoded_max_spaced = statistics3->EncodeMax();
 
+    // Use old API without is_{min/max}_value_exact
+    auto statistics4 = MakeStatistics<TestType>(
+        this->schema_.Column(0), encoded_min, encoded_max, 
this->values_.size(),
+        /*null_count=*/0, /*distinct_count=*/0,
+        /*has_min_max=*/true, /*has_null_count=*/true, 
/*has_distinct_count=*/true);
     ASSERT_EQ(encoded_min, statistics2->EncodeMin());
     ASSERT_EQ(encoded_max, statistics2->EncodeMax());
     ASSERT_EQ(statistics1->min(), statistics2->min());
     ASSERT_EQ(statistics1->max(), statistics2->max());
+    ASSERT_EQ(statistics1->is_min_value_exact(), std::make_optional(true));
+    ASSERT_EQ(statistics1->is_max_value_exact(), std::make_optional(true));
+    ASSERT_EQ(statistics2->is_min_value_exact(), std::make_optional(true));
+    ASSERT_EQ(statistics2->is_max_value_exact(), std::make_optional(true));
     ASSERT_EQ(encoded_min_spaced, statistics2->EncodeMin());
     ASSERT_EQ(encoded_max_spaced, statistics2->EncodeMax());
     ASSERT_EQ(statistics3->min(), statistics2->min());
     ASSERT_EQ(statistics3->max(), statistics2->max());
+    ASSERT_EQ(statistics3->is_min_value_exact(), std::make_optional(true));
+    ASSERT_EQ(statistics3->is_max_value_exact(), std::make_optional(true));
+    ASSERT_EQ(statistics4->min(), statistics2->min());
+    ASSERT_EQ(statistics4->max(), statistics2->max());
+    ASSERT_EQ(statistics4->is_min_value_exact(), std::nullopt);
+    ASSERT_EQ(statistics4->is_max_value_exact(), std::nullopt);
   }
 
   void TestReset() {
@@ -459,6 +476,8 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
     EXPECT_TRUE(enc_stats->has_max);
     EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min());
     EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max());
+    EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true));
+    EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true));
   }
 };
 
@@ -550,9 +569,12 @@ void TestStatistics<ByteArrayType>::TestMinMaxEncode() {
             std::string(reinterpret_cast<const char*>(statistics1->max().ptr),
                         statistics1->max().len));
 
-  auto statistics2 =
-      MakeStatistics<ByteArrayType>(this->schema_.Column(0), encoded_min, 
encoded_max,
-                                    this->values_.size(), 0, 0, true, true, 
true);
+  auto statistics2 = MakeStatistics<ByteArrayType>(
+      this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(),
+      /*null_count=*/0,
+      /*distinct_count=*/0, /*has_min_max=*/true, /*has_null_count=*/true,
+      /*has_distinct_count=*/true, /*is_min_value_exact=*/true,
+      /*is_max_value_exact=*/true);
 
   ASSERT_EQ(encoded_min, statistics2->EncodeMin());
   ASSERT_EQ(encoded_max, statistics2->EncodeMax());
@@ -693,6 +715,8 @@ class TestStatisticsHasFlag : public 
TestStatistics<TestType> {
       EXPECT_FALSE(statistics1->HasMinMax());
       EXPECT_FALSE(encoded_stats1.has_min);
       EXPECT_FALSE(encoded_stats1.has_max);
+      EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt);
+      EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt);
     }
     // Create a statistics object with min-max.
     std::shared_ptr<TypedStatistics<TestType>> statistics2;
@@ -703,12 +727,18 @@ class TestStatisticsHasFlag : public 
TestStatistics<TestType> {
       EXPECT_TRUE(statistics2->HasMinMax());
       EXPECT_TRUE(encoded_stats2.has_min);
       EXPECT_TRUE(encoded_stats2.has_max);
+      EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true));
+      EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true));
     }
     VerifyMergedStatistics(*statistics1, *statistics2,
                            [](TypedStatistics<TestType>* merged_statistics) {
                              EXPECT_TRUE(merged_statistics->HasMinMax());
                              EXPECT_TRUE(merged_statistics->Encode().has_min);
                              EXPECT_TRUE(merged_statistics->Encode().has_max);
+                             
EXPECT_EQ(merged_statistics->Encode().is_min_value_exact,
+                                       std::make_optional(true));
+                             
EXPECT_EQ(merged_statistics->Encode().is_max_value_exact,
+                                       std::make_optional(true));
                            });
   }
 
@@ -775,6 +805,8 @@ class TestStatisticsHasFlag : public 
TestStatistics<TestType> {
     EXPECT_FALSE(encoded.has_distinct_count);
     EXPECT_FALSE(encoded.has_min);
     EXPECT_FALSE(encoded.has_max);
+    EXPECT_FALSE(encoded.is_min_value_exact.has_value());
+    EXPECT_FALSE(encoded.is_max_value_exact.has_value());
   }
 };
 
@@ -971,6 +1003,8 @@ class TestStatisticsSortOrder : public ::testing::Test {
           rg_metadata->ColumnChunk(i);
       EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin());
       EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax());
+      EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true));
+      EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true));
     }
   }
 
@@ -1007,11 +1041,15 @@ void TestStatisticsSortOrder<Int32Type>::SetValues() {
   stats_[0]
       .set_min(std::string(reinterpret_cast<const char*>(&values_[5]), 
sizeof(c_type)))
       .set_max(std::string(reinterpret_cast<const char*>(&values_[4]), 
sizeof(c_type)));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 
   // Write INT32 min/max values
   stats_[1]
       .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), 
sizeof(c_type)))
       .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), 
sizeof(c_type)));
+  stats_[1].is_max_value_exact = true;
+  stats_[1].is_min_value_exact = true;
 }
 
 // TYPE::INT64
@@ -1035,11 +1073,15 @@ void TestStatisticsSortOrder<Int64Type>::SetValues() {
   stats_[0]
       .set_min(std::string(reinterpret_cast<const char*>(&values_[5]), 
sizeof(c_type)))
       .set_max(std::string(reinterpret_cast<const char*>(&values_[4]), 
sizeof(c_type)));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 
   // Write INT64 min/max values
   stats_[1]
       .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), 
sizeof(c_type)))
       .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), 
sizeof(c_type)));
+  stats_[1].is_max_value_exact = true;
+  stats_[1].is_min_value_exact = true;
 }
 
 // TYPE::FLOAT
@@ -1054,6 +1096,8 @@ void TestStatisticsSortOrder<FloatType>::SetValues() {
   stats_[0]
       .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), 
sizeof(c_type)))
       .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), 
sizeof(c_type)));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 }
 
 // TYPE::DOUBLE
@@ -1068,6 +1112,8 @@ void TestStatisticsSortOrder<DoubleType>::SetValues() {
   stats_[0]
       .set_min(std::string(reinterpret_cast<const char*>(&values_[0]), 
sizeof(c_type)))
       .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), 
sizeof(c_type)));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 }
 
 // TYPE::ByteArray
@@ -1100,6 +1146,8 @@ void TestStatisticsSortOrder<ByteArrayType>::SetValues() {
           std::string(reinterpret_cast<const char*>(vals[2].c_str()), 
vals[2].length()))
       .set_max(
           std::string(reinterpret_cast<const char*>(vals[9].c_str()), 
vals[9].length()));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 }
 
 // TYPE::FLBAArray
@@ -1129,6 +1177,8 @@ void TestStatisticsSortOrder<FLBAType>::SetValues() {
   stats_[0]
       .set_min(std::string(reinterpret_cast<const char*>(&vals[1][0]), 
FLBA_LENGTH))
       .set_max(std::string(reinterpret_cast<const char*>(&vals[8][0]), 
FLBA_LENGTH));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 }
 
 template <>
@@ -1162,6 +1212,8 @@ void 
TestStatisticsSortOrder<Float16LogicalType>::SetValues() {
   stats_[0]
       .set_min(std::string(reinterpret_cast<const char*>(values_[7].ptr), 
kValueLen))
       .set_max(std::string(reinterpret_cast<const char*>(values_[2].ptr), 
kValueLen));
+  stats_[0].is_max_value_exact = true;
+  stats_[0].is_min_value_exact = true;
 }
 
 TYPED_TEST_SUITE(TestStatisticsSortOrder, CompareTestTypes);
@@ -1243,6 +1295,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, T 
expected_min, T expecte
   ASSERT_TRUE(stats->HasMinMax());
   EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
   EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
+  EXPECT_EQ(stats->is_min_value_exact(), std::make_optional(true));
+  EXPECT_EQ(stats->is_max_value_exact(), std::make_optional(true));
 }
 
 template <typename Stats, typename Array, typename T = typename Stats::T>
@@ -1256,12 +1310,16 @@ void AssertMinMaxAre(Stats stats, const Array& values, 
const uint8_t* valid_bitm
   ASSERT_TRUE(stats->HasMinMax());
   EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
   EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
+  EXPECT_EQ(stats->is_min_value_exact(), std::make_optional(true));
+  EXPECT_EQ(stats->is_max_value_exact(), std::make_optional(true));
 }
 
 template <typename Stats, typename Array>
 void AssertUnsetMinMax(Stats stats, const Array& values) {
   stats->Update(values.data(), values.size(), 0);
   ASSERT_FALSE(stats->HasMinMax());
+  ASSERT_FALSE(stats->is_min_value_exact().has_value());
+  ASSERT_FALSE(stats->is_max_value_exact().has_value());
 }
 
 template <typename Stats, typename Array>
@@ -1272,6 +1330,8 @@ void AssertUnsetMinMax(Stats stats, const Array& values, 
const uint8_t* valid_bi
   stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count + 
null_count,
                       non_null_count, null_count);
   ASSERT_FALSE(stats->HasMinMax());
+  ASSERT_FALSE(stats->is_min_value_exact().has_value());
+  ASSERT_FALSE(stats->is_max_value_exact().has_value());
 }
 
 template <typename ParquetType, typename T = typename ParquetType::c_type>
@@ -1598,31 +1658,104 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) {
   ASSERT_EQ(12, stats->num_values());
   ASSERT_EQ(0x00, stats->EncodeMin()[0]);
   ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
+  std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics();
+  ASSERT_FALSE(enc_stats->is_max_value_exact.has_value());
+  ASSERT_FALSE(enc_stats->is_min_value_exact.has_value());
+}
+
+// Test statistics for binary column with truncated max and min values
+TEST(TestEncodedStatistics, TruncatedMinMax) {
+  std::string dir_string(test::get_data_dir());
+  std::stringstream ss;
+  ss << dir_string << "/binary_truncated_min_max.parquet";
+  auto path = ss.str();
+
+  // The file is generated by parquet-rs 55.1.0. It
+  // contains six columns of utf-8 and binary type. statistics_truncate_length
+  // is set to 2. Columns 0 and 1 will have truncation of min and max value,
+  // columns 2 and 3 will have truncation of min value only.
+  // Columns 4 and 5 will have no truncation where is_min_value_exact and
+  // is_max_value_exact are set to true.
+  // More file details in:
+  // https://github.com/apache/parquet-testing/tree/master/data#binary-truncated-min-and-max-statistics
+  auto file_reader = ParquetFileReader::OpenFile(path);
+  auto rg_reader = file_reader->RowGroup(0);
+  auto metadata = rg_reader->metadata();
+  auto column_schema = metadata->schema()->Column(0);
+  ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());
+  ASSERT_EQ(6, metadata->num_columns());
+
+  for (int num_column = 0; num_column < metadata->num_columns(); ++num_column) {
+    auto column_chunk = metadata->ColumnChunk(num_column);
+    ASSERT_TRUE(column_chunk->is_stats_set());
+
+    std::shared_ptr<EncodedStatistics> encoded_statistics =
+        column_chunk->encoded_statistics();
+    ASSERT_TRUE(encoded_statistics != NULL);
+    ASSERT_EQ(0, encoded_statistics->null_count);
+    EXPECT_EQ("Al", encoded_statistics->min());
+    ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value());
+    ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value());
+    switch (num_column) {
+      case 2:
+        // Max couldn't truncate the utf-8 string longer than 2 bytes
+        EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max());
+        ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
+        ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
+        break;
+      case 3:
+        // Max couldn't truncate 0xFFFF binary string
+        EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max());
+        ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
+        ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
+        break;
+      case 4:
+      case 5:
+        // Min and Max are not truncated, fit on 2 bytes
+        EXPECT_EQ("Ke", encoded_statistics->max());
+        ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
+        ASSERT_TRUE(encoded_statistics->is_min_value_exact.value());
+        break;
+      default:
+        // Max truncated to 2 bytes on columns 0 and 1
+        EXPECT_EQ("Kf", encoded_statistics->max());
+        ASSERT_FALSE(encoded_statistics->is_max_value_exact.value());
+        ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
+    }
+  }
 }
 
 TEST(TestEncodedStatistics, CopySafe) {
   EncodedStatistics encoded_statistics;
   encoded_statistics.set_max("abc");
-  encoded_statistics.has_max = true;
+  ASSERT_TRUE(encoded_statistics.has_max);
+  encoded_statistics.is_max_value_exact = true;
+  ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value());
 
   encoded_statistics.set_min("abc");
-  encoded_statistics.has_min = true;
+  ASSERT_TRUE(encoded_statistics.has_min);
+  encoded_statistics.is_min_value_exact = true;
+  ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value());
 
   EncodedStatistics copy_statistics = encoded_statistics;
   copy_statistics.set_max("abcd");
   copy_statistics.set_min("a");
+  copy_statistics.is_max_value_exact = false;
+  copy_statistics.is_min_value_exact = false;
 
   EXPECT_EQ("abc", encoded_statistics.min());
   EXPECT_EQ("abc", encoded_statistics.max());
+  EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true));
+  EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true));
 }
 
 TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
   EncodedStatistics encoded_statistics;
   encoded_statistics.set_min("a");
-  encoded_statistics.has_min = true;
+  ASSERT_TRUE(encoded_statistics.has_min);
 
   encoded_statistics.set_max("abc");
-  encoded_statistics.has_max = true;
+  ASSERT_TRUE(encoded_statistics.has_max);
 
   encoded_statistics.ApplyStatSizeLimits(2);
 
diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index 3055ef3f23..8f82adae92 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -270,9 +270,15 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
     // TODO: check if the column_order is TYPE_DEFINED_ORDER.
     if (stats.__isset.max_value) {
       out.set_max(stats.max_value);
+      if (stats.__isset.is_max_value_exact) {
+        out.is_max_value_exact = stats.is_max_value_exact;
+      }
     }
     if (stats.__isset.min_value) {
       out.set_min(stats.min_value);
+      if (stats.__isset.is_min_value_exact) {
+        out.is_min_value_exact = stats.is_min_value_exact;
+      }
     }
   } else if (stats.__isset.max || stats.__isset.min) {
     // TODO: check created_by to see if it is corrupted for some types.
@@ -475,6 +481,9 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
   format::Statistics statistics;
   if (stats.has_min) {
     statistics.__set_min_value(stats.min());
+    if (stats.is_min_value_exact.has_value()) {
+      statistics.__set_is_min_value_exact(stats.is_min_value_exact.value());
+    }
     // If the order is SIGNED, then the old min value must be set too.
     // This for backward compatibility
     if (stats.is_signed()) {
@@ -483,6 +492,9 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
   }
   if (stats.has_max) {
     statistics.__set_max_value(stats.max());
+    if (stats.is_max_value_exact.has_value()) {
+      statistics.__set_is_max_value_exact(stats.is_max_value_exact.value());
+    }
     // If the order is SIGNED, then the old max value must be set too.
     // This for backward compatibility
     if (stats.is_signed()) {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index 18d1754009..4cf674fc58 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit 18d17540097fca7c40be3d42c167e6bfad90763c
+Subproject commit 4cf674fc589309ba8651ad676f3fb557582d9cab


Reply via email to