This is an automated email from the ASF dual-hosted git repository.

zanmato pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new abbcd53007 GH-46063: [C++][Compute] Fix the issue that MinMax kernel 
emits -inf/inf for all-NaN input (#48459)
abbcd53007 is described below

commit abbcd53007e613d8ece3e4abcdbae9ba2f4c618f
Author: Rossi Sun <[email protected]>
AuthorDate: Wed Jan 7 10:26:45 2026 +0800

    GH-46063: [C++][Compute] Fix the issue that MinMax kernel emits -inf/inf 
for all-NaN input (#48459)
    
    ### Rationale for this change
    
    Our MinMax kernels emit -inf/inf for all-NaN input array, which doesn't 
make sense.
    
    ### What changes are included in this PR?
    
    Initialize the running min/max value from -inf/inf to NaN, so we can 
leverage the nice property that:
    `std::fmin/fmax(all-NaN) = NaN`
    `std::fmin/fmax(NaN, non-NaN) = non-NaN`
    
    ### Are these changes tested?
    
    Test included.
    
    ### Are there any user-facing changes?
    
    None.
    * GitHub Issue: #46063
    
    Authored-by: Rossi Sun <[email protected]>
    Signed-off-by: Rossi Sun <[email protected]>
---
 cpp/src/arrow/acero/hash_aggregate_test.cc         | 68 ++++++++++++++++++
 .../arrow/compute/kernels/aggregate_basic.inc.cc   |  4 +-
 cpp/src/arrow/compute/kernels/aggregate_test.cc    | 22 ++++++
 cpp/src/arrow/compute/kernels/hash_aggregate.cc    | 82 +++++++++++-----------
 4 files changed, 134 insertions(+), 42 deletions(-)

diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc 
b/cpp/src/arrow/acero/hash_aggregate_test.cc
index de414219eb..12d24429cb 100644
--- a/cpp/src/arrow/acero/hash_aggregate_test.cc
+++ b/cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -2000,6 +2000,74 @@ TEST_P(GroupBy, MinMaxScalar) {
   }
 }
 
+TEST_P(GroupBy, MinMaxWithNaN) {
+  auto in_schema = schema({
+      field("argument1", float32()),
+      field("argument2", float64()),
+      field("key", int64()),
+  });
+  for (bool use_threads : {true, false}) {
+    SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+
+    auto table = TableFromJSON(in_schema, {R"([
+    [NaN,  NaN,  1],
+    [NaN,  NaN,  2],
+    [NaN,  NaN,  3]
+])",
+                                           R"([
+    [NaN,  NaN,  1],
+    [-Inf, -Inf, 2],
+    [Inf,  Inf,  3]
+])"});
+
+    ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped,
+                         GroupByTest(
+                             {
+                                 table->GetColumnByName("argument1"),
+                                 table->GetColumnByName("argument1"),
+                                 table->GetColumnByName("argument1"),
+                                 table->GetColumnByName("argument2"),
+                                 table->GetColumnByName("argument2"),
+                                 table->GetColumnByName("argument2"),
+                             },
+                             {table->GetColumnByName("key")},
+                             {
+                                 {"hash_min", nullptr},
+                                 {"hash_max", nullptr},
+                                 {"hash_min_max", nullptr},
+                                 {"hash_min", nullptr},
+                                 {"hash_max", nullptr},
+                                 {"hash_min_max", nullptr},
+                             },
+                             use_threads));
+    ValidateOutput(aggregated_and_grouped);
+    SortBy({"key_0"}, &aggregated_and_grouped);
+
+    AssertDatumsEqual(ArrayFromJSON(struct_({
+                                        field("key_0", int64()),
+                                        field("hash_min", float32()),
+                                        field("hash_max", float32()),
+                                        field("hash_min_max", struct_({
+                                                                  field("min", 
float32()),
+                                                                  field("max", 
float32()),
+                                                              })),
+                                        field("hash_min", float64()),
+                                        field("hash_max", float64()),
+                                        field("hash_min_max", struct_({
+                                                                  field("min", 
float64()),
+                                                                  field("max", 
float64()),
+                                                              })),
+                                    }),
+                                    R"([
+    [1, NaN,  NaN,  {"min": NaN,  "max": NaN},  NaN,  NaN,  {"min": NaN,  
"max": NaN}],
+    [2, -Inf, -Inf, {"min": -Inf, "max": -Inf}, -Inf, -Inf, {"min": -Inf, 
"max": -Inf}],
+    [3, Inf,  Inf,  {"min": Inf,  "max": Inf},  Inf,  Inf,  {"min": Inf,  
"max": Inf}]
+  ])"),
+                      aggregated_and_grouped,
+                      /*verbose=*/true);
+  }
+}
+
 TEST_P(GroupBy, AnyAndAll) {
   for (bool use_threads : {true, false}) {
     SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc 
b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
index 7f2bce4063..3733f415a0 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
@@ -694,8 +694,8 @@ struct MinMaxState<ArrowType, SimdLevel, 
enable_if_floating_point<ArrowType>> {
     this->max = std::fmax(this->max, value);
   }
 
-  T min = std::numeric_limits<T>::infinity();
-  T max = -std::numeric_limits<T>::infinity();
+  T min = std::numeric_limits<T>::quiet_NaN();
+  T max = std::numeric_limits<T>::quiet_NaN();
   bool has_nulls = false;
 };
 
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc 
b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index a2bf0b97fd..cdc62f946a 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -1841,6 +1841,24 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test 
{
     AssertMinMaxIsNull(array, options);
   }
 
+  void AssertMinMaxIsNaN(const Datum& array, const ScalarAggregateOptions& 
options) {
+    ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options));
+    for (const auto& val : out.scalar_as<StructScalar>().value) {
+      ASSERT_TRUE(std::isnan(checked_cast<const ScalarType&>(*val).value));
+    }
+  }
+
+  void AssertMinMaxIsNaN(const std::string& json, const 
ScalarAggregateOptions& options) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertMinMaxIsNaN(array, options);
+  }
+
+  void AssertMinMaxIsNaN(const std::vector<std::string>& json,
+                         const ScalarAggregateOptions& options) {
+    auto array = ChunkedArrayFromJSON(type_singleton(), json);
+    AssertMinMaxIsNaN(array, options);
+  }
+
   std::shared_ptr<DataType> type_singleton() {
     return default_type_instance<ArrowType>();
   }
@@ -1963,6 +1981,9 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
   this->AssertMinMaxIs("[5, Inf, 2, 3, 4]", 2.0, INFINITY, options);
   this->AssertMinMaxIs("[5, NaN, 2, 3, 4]", 2, 5, options);
   this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
+  this->AssertMinMaxIs("[NaN, null, 42]", 42, 42, options);
+  this->AssertMinMaxIsNaN("[NaN, NaN]", options);
+  this->AssertMinMaxIsNaN("[NaN, null]", options);
   this->AssertMinMaxIs(chunked_input1, 1, 9, options);
   this->AssertMinMaxIs(chunked_input2, 1, 9, options);
   this->AssertMinMaxIs(chunked_input3, 1, 9, options);
@@ -1980,6 +2001,7 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
   this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
   this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options);
   this->AssertMinMaxIsNull("[5, -Inf, null, 3, 4]", options);
+  this->AssertMinMaxIsNull("[NaN, null]", options);
   this->AssertMinMaxIsNull(chunked_input1, options);
   this->AssertMinMaxIsNull(chunked_input2, options);
   this->AssertMinMaxIsNull(chunked_input3, options);
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc 
b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 19f7fc2e5b..2ab5e574e2 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <cmath>
+#include <concepts>
 #include <functional>
 #include <memory>
 #include <string>
@@ -270,52 +271,53 @@ struct GroupedCountImpl : public GroupedAggregator {
 // ----------------------------------------------------------------------
 // MinMax implementation
 
+// XXX: Consider making these concepts complete and moving to public header.
+
+template <typename T>
+concept CBooleanConcept = std::same_as<T, bool>;
+
+// XXX: Ideally we want to have std::floating_point<Float16> = true.
+template <typename T>
+concept CFloatingPointConcept = std::floating_point<T> || std::same_as<T, 
util::Float16>;
+
+template <typename T>
+concept CDecimalConcept = std::same_as<T, Decimal32> || std::same_as<T, 
Decimal64> ||
+    std::same_as<T, Decimal128> || std::same_as<T, Decimal256>;
+
 template <typename CType>
 struct AntiExtrema {
   static constexpr CType anti_min() { return 
std::numeric_limits<CType>::max(); }
   static constexpr CType anti_max() { return 
std::numeric_limits<CType>::min(); }
 };
 
-template <>
-struct AntiExtrema<bool> {
-  static constexpr bool anti_min() { return true; }
-  static constexpr bool anti_max() { return false; }
-};
-
-template <>
-struct AntiExtrema<float> {
-  static constexpr float anti_min() { return 
std::numeric_limits<float>::infinity(); }
-  static constexpr float anti_max() { return 
-std::numeric_limits<float>::infinity(); }
+template <CBooleanConcept CType>
+struct AntiExtrema<CType> {
+  static constexpr CType anti_min() { return true; }
+  static constexpr CType anti_max() { return false; }
 };
 
-template <>
-struct AntiExtrema<double> {
-  static constexpr double anti_min() { return 
std::numeric_limits<double>::infinity(); }
-  static constexpr double anti_max() { return 
-std::numeric_limits<double>::infinity(); }
+template <CFloatingPointConcept CType>
+struct AntiExtrema<CType> {
+  static constexpr CType anti_min() { return 
std::numeric_limits<CType>::quiet_NaN(); }
+  static constexpr CType anti_max() { return 
std::numeric_limits<CType>::quiet_NaN(); }
 };
 
-template <>
-struct AntiExtrema<Decimal32> {
-  static constexpr Decimal32 anti_min() { return 
BasicDecimal32::GetMaxSentinel(); }
-  static constexpr Decimal32 anti_max() { return 
BasicDecimal32::GetMinSentinel(); }
+template <CDecimalConcept CType>
+struct AntiExtrema<CType> {
+  static constexpr CType anti_min() { return CType::GetMaxSentinel(); }
+  static constexpr CType anti_max() { return CType::GetMinSentinel(); }
 };
 
-template <>
-struct AntiExtrema<Decimal64> {
-  static constexpr Decimal64 anti_min() { return 
BasicDecimal64::GetMaxSentinel(); }
-  static constexpr Decimal64 anti_max() { return 
BasicDecimal64::GetMinSentinel(); }
-};
-
-template <>
-struct AntiExtrema<Decimal128> {
-  static constexpr Decimal128 anti_min() { return 
BasicDecimal128::GetMaxSentinel(); }
-  static constexpr Decimal128 anti_max() { return 
BasicDecimal128::GetMinSentinel(); }
+template <typename CType>
+struct MinMaxOp {
+  static constexpr CType min(CType a, CType b) { return std::min(a, b); }
+  static constexpr CType max(CType a, CType b) { return std::max(a, b); }
 };
 
-template <>
-struct AntiExtrema<Decimal256> {
-  static constexpr Decimal256 anti_min() { return 
BasicDecimal256::GetMaxSentinel(); }
-  static constexpr Decimal256 anti_max() { return 
BasicDecimal256::GetMinSentinel(); }
+template <CFloatingPointConcept CType>
+struct MinMaxOp<CType> {
+  static constexpr CType min(CType a, CType b) { return std::fmin(a, b); }
+  static constexpr CType max(CType a, CType b) { return std::fmax(a, b); }
 };
 
 template <typename Type, typename Enable = void>
@@ -352,8 +354,8 @@ struct GroupedMinMaxImpl final : public GroupedAggregator {
     VisitGroupedValues<Type>(
         batch,
         [&](uint32_t g, CType val) {
-          GetSet::Set(raw_mins, g, std::min(GetSet::Get(raw_mins, g), val));
-          GetSet::Set(raw_maxes, g, std::max(GetSet::Get(raw_maxes, g), val));
+          GetSet::Set(raw_mins, g, MinMaxOp<CType>::min(GetSet::Get(raw_mins, 
g), val));
+          GetSet::Set(raw_maxes, g, 
MinMaxOp<CType>::max(GetSet::Get(raw_maxes, g), val));
           bit_util::SetBit(has_values_.mutable_data(), g);
         },
         [&](uint32_t g) { bit_util::SetBit(has_nulls_.mutable_data(), g); });
@@ -373,12 +375,12 @@ struct GroupedMinMaxImpl final : public GroupedAggregator 
{
     auto g = group_id_mapping.GetValues<uint32_t>(1);
     for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < 
group_id_mapping.length;
          ++other_g, ++g) {
-      GetSet::Set(
-          raw_mins, *g,
-          std::min(GetSet::Get(raw_mins, *g), GetSet::Get(other_raw_mins, 
other_g)));
-      GetSet::Set(
-          raw_maxes, *g,
-          std::max(GetSet::Get(raw_maxes, *g), GetSet::Get(other_raw_maxes, 
other_g)));
+      GetSet::Set(raw_mins, *g,
+                  MinMaxOp<CType>::min(GetSet::Get(raw_mins, *g),
+                                       GetSet::Get(other_raw_mins, other_g)));
+      GetSet::Set(raw_maxes, *g,
+                  MinMaxOp<CType>::max(GetSet::Get(raw_maxes, *g),
+                                       GetSet::Get(other_raw_maxes, other_g)));
 
       if (bit_util::GetBit(other->has_values_.data(), other_g)) {
         bit_util::SetBit(has_values_.mutable_data(), *g);

Reply via email to