This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 673823b5a4 GH-45639: [C++][Statistics] Add support for 
ARROW:average_byte_width:{exact,approximate} (#46385)
673823b5a4 is described below

commit 673823b5a4775a8eb0e3a67e4b3e549febf4b44a
Author: Arash Andishgar <[email protected]>
AuthorDate: Thu Jul 10 03:39:08 2025 +0330

    GH-45639: [C++][Statistics] Add support for 
ARROW:average_byte_width:{exact,approximate} (#46385)
    
    ### Rationale for this change
    
    `ARROW:average_byte_width:exact` and `ARROW:average_byte_width:approximate` 
statistics attributes are missing in `arrow::ArrayStatistics`.
    
    ### What changes are included in this PR?
    
    Add `average_byte_width` and `is_average_byte_width_exact` member 
variables to `arrow::ArrayStatistics`.
    
    ### Are these changes tested?
    Yes, I ran the relevant unit tests.
    ### Are there any user-facing changes?
    Yes
    * GitHub Issue: #45639
    
    Lead-authored-by: Arash Andishgar <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/array/array_test.cc      | 24 +++++++++++++
 cpp/src/arrow/array/statistics.h       |  6 ++++
 cpp/src/arrow/array/statistics_test.cc | 23 ++++++++++++-
 cpp/src/arrow/compare.cc               |  3 ++
 cpp/src/arrow/record_batch.cc          | 15 ++++++++
 cpp/src/arrow/record_batch_test.cc     | 62 ++++++++++++++++++++++++++++++++++
 6 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/array/array_test.cc 
b/cpp/src/arrow/array/array_test.cc
index e5a27d18d0..0dd75b01f6 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -3897,6 +3897,7 @@ class TestArrayDataStatistics : public ::testing::Test {
   void SetUp() {
     valids_ = {1, 0, 1, 1};
     null_count_ = std::count(valids_.begin(), valids_.end(), 0);
+    average_byte_width_ = 4.0;
     null_buffer_ = *internal::BytesToBits(valids_);
     values_ = {1, 0, 3, -4};
     min_ = *std::min_element(values_.begin(), values_.end());
@@ -3906,6 +3907,8 @@ class TestArrayDataStatistics : public ::testing::Test {
                             null_count_);
     data_->statistics = std::make_shared<ArrayStatistics>();
     data_->statistics->null_count = null_count_;
+    data_->statistics->average_byte_width = average_byte_width_;
+    data_->statistics->is_average_byte_width_exact = true;
     data_->statistics->min = min_;
     data_->statistics->is_min_exact = true;
     data_->statistics->max = max_;
@@ -3915,6 +3918,7 @@ class TestArrayDataStatistics : public ::testing::Test {
  protected:
   std::vector<uint8_t> valids_;
   size_t null_count_;
+  double average_byte_width_;
   std::shared_ptr<Buffer> null_buffer_;
   std::vector<int32_t> values_;
   int64_t min_;
@@ -3930,6 +3934,11 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
   ASSERT_TRUE(moved_data.statistics->null_count.has_value());
   ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
 
+  ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(average_byte_width_,
+                   moved_data.statistics->average_byte_width.value());
+  ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);
+
   ASSERT_TRUE(moved_data.statistics->min.has_value());
   
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
   ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
@@ -3947,6 +3956,11 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
   ASSERT_TRUE(copied_data.statistics->null_count.has_value());
   ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
 
+  ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(average_byte_width_,
+                   copied_data.statistics->average_byte_width.value());
+  ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);
+
   ASSERT_TRUE(copied_data.statistics->min.has_value());
   
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
   ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
@@ -3966,6 +3980,11 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
   ASSERT_TRUE(moved_data.statistics->null_count.has_value());
   ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
 
+  ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(average_byte_width_,
+                   moved_data.statistics->average_byte_width.value());
+  ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);
+
   ASSERT_TRUE(moved_data.statistics->min.has_value());
   
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
   ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
@@ -3984,6 +4003,11 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
   ASSERT_TRUE(copied_data.statistics->null_count.has_value());
   ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
 
+  ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(average_byte_width_,
+                   copied_data.statistics->average_byte_width.value());
+  ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);
+
   ASSERT_TRUE(copied_data.statistics->min.has_value());
   
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
   ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 6accd48af7..435c38e861 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -78,6 +78,12 @@ struct ARROW_EXPORT ArrayStatistics {
   /// \brief The number of distinct values, may not be set
   std::optional<int64_t> distinct_count = std::nullopt;
 
+  /// \brief The average size in bytes of a row in an array, may not be set.
+  std::optional<double> average_byte_width = std::nullopt;
+
+  /// \brief Whether the average size in bytes is exact or not.
+  bool is_average_byte_width_exact = false;
+
   /// \brief The minimum value, may not be set
   std::optional<ValueType> min = std::nullopt;
 
diff --git a/cpp/src/arrow/array/statistics_test.cc 
b/cpp/src/arrow/array/statistics_test.cc
index 250c4bb437..d7dbea7c0f 100644
--- a/cpp/src/arrow/array/statistics_test.cc
+++ b/cpp/src/arrow/array/statistics_test.cc
@@ -41,6 +41,17 @@ TEST(TestArrayStatistics, DistinctCount) {
   ASSERT_EQ(29, statistics.distinct_count.value());
 }
 
+TEST(TestArrayStatistics, AverageByteWidth) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.average_byte_width.has_value());
+  ASSERT_FALSE(statistics.is_average_byte_width_exact);
+  statistics.average_byte_width = 4.2;
+  ASSERT_TRUE(statistics.average_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(4.2, statistics.average_byte_width.value());
+  statistics.is_average_byte_width_exact = true;
+  ASSERT_TRUE(statistics.is_average_byte_width_exact);
+}
+
 TEST(TestArrayStatistics, Min) {
   ArrayStatistics statistics;
   ASSERT_FALSE(statistics.min.has_value());
@@ -65,7 +76,7 @@ TEST(TestArrayStatistics, Max) {
   ASSERT_FALSE(statistics.is_max_exact);
 }
 
-TEST(TestArrayStatistics, EqualityNonDoulbeValue) {
+TEST(TestArrayStatistics, Equals) {
   ArrayStatistics statistics1;
   ArrayStatistics statistics2;
 
@@ -81,6 +92,16 @@ TEST(TestArrayStatistics, EqualityNonDoulbeValue) {
   statistics2.distinct_count = 2929;
   ASSERT_EQ(statistics1, statistics2);
 
+  statistics1.average_byte_width = 2.9;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.average_byte_width = 2.9;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.is_average_byte_width_exact = true;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.is_average_byte_width_exact = true;
+  ASSERT_EQ(statistics1, statistics2);
+
   statistics1.min = std::string("world");
   ASSERT_NE(statistics1, statistics2);
   statistics2.min = std::string("world");
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 6ece1cb444..aa041a5bd5 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -1561,8 +1561,11 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& 
left, const ArrayStatistic
                                const EqualOptions& equal_options) {
   return left.null_count == right.null_count &&
          left.distinct_count == right.distinct_count &&
+         left.is_average_byte_width_exact == right.is_average_byte_width_exact 
&&
          left.is_min_exact == right.is_min_exact &&
          left.is_max_exact == right.is_max_exact &&
+         ArrayStatisticsValueTypeEquals(left.average_byte_width, 
right.average_byte_width,
+                                        equal_options) &&
          ArrayStatisticsValueTypeEquals(left.min, right.min, equal_options) &&
          ArrayStatisticsValueTypeEquals(left.max, right.max, equal_options);
 }
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 700e1bb2c9..04d6890d39 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -530,6 +530,19 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
       statistics.start_new_column = false;
     }
 
+    if (column_statistics->average_byte_width.has_value()) {
+      statistics.nth_statistics++;
+      if (column_statistics->is_average_byte_width_exact) {
+        statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT;
+      } else {
+        statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE;
+      }
+      statistics.type = float64();
+      statistics.value = column_statistics->average_byte_width.value();
+      RETURN_NOT_OK(on_statistics(statistics));
+      statistics.start_new_column = false;
+    }
+
     if (column_statistics->min.has_value()) {
       statistics.nth_statistics++;
       if (column_statistics->is_min_exact) {
@@ -671,8 +684,10 @@ Result<std::shared_ptr<Array>> 
RecordBatch::MakeStatisticsArray(
     if (statistics.start_new_column) {
       RETURN_NOT_OK(builder.Append());
       if (statistics.nth_column.has_value()) {
+        // Add Columns
         RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value()));
       } else {
+        // Add RecordBatch
         RETURN_NOT_OK(columns_builder->AppendNull());
       }
       RETURN_NOT_OK(values_builder->Append());
diff --git a/cpp/src/arrow/record_batch_test.cc 
b/cpp/src/arrow/record_batch_test.cc
index 0572883441..fab8137171 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -1345,6 +1345,68 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayDistinctCount) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
+TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("utf8", 
utf8())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto string_array = ArrayFromJSON(utf8(), R"(["aa", "bb", "ccc"])");
+  string_array->data()->statistics = std::make_shared<ArrayStatistics>();
+  string_array->data()->statistics->average_byte_width = 2.3;
+  auto batch = RecordBatch::Make(schema, string_array->length(),
+                                 {no_statistics_array, string_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(
+      auto expected_statistics_array,
+      MakeStatisticsArray("[null, 1]",
+                          {{
+                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                           },
+                           {
+                               
ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE,
+                           }},
+                          {{
+                               ArrayStatistics::ValueType{int64_t{3}},
+                           },
+                           {
+                               ArrayStatistics::ValueType{2.3},
+                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthExact) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("float64", 
float64())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto float_array = ArrayFromJSON(float64(), R"([1.0, 2.0, 3.0])");
+  float_array->data()->statistics = std::make_shared<ArrayStatistics>();
+  float_array->data()->statistics->average_byte_width = 8.0;
+  float_array->data()->statistics->is_average_byte_width_exact = true;
+
+  auto batch = RecordBatch::Make(schema, float_array->length(),
+                                 {no_statistics_array, float_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(
+      auto expected_statistics_array,
+      MakeStatisticsArray("[null, 1]",
+                          {{
+                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                           },
+                           {
+                               ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT,
+                           }},
+                          {{
+                               ArrayStatistics::ValueType{int64_t{3}},
+                           },
+                           {
+                               ArrayStatistics::ValueType{8.0},
+                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
 TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) {
   auto schema =
       ::arrow::schema({field("no-statistics", boolean()), field("uint32", 
uint32())});

Reply via email to