This is an automated email from the ASF dual-hosted git repository.
zanmato pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new abbcd53007 GH-46063: [C++][Compute] Fix the issue that MinMax kernel
emits -inf/inf for all-NaN input (#48459)
abbcd53007 is described below
commit abbcd53007e613d8ece3e4abcdbae9ba2f4c618f
Author: Rossi Sun <[email protected]>
AuthorDate: Wed Jan 7 10:26:45 2026 +0800
GH-46063: [C++][Compute] Fix the issue that MinMax kernel emits -inf/inf
for all-NaN input (#48459)
### Rationale for this change
Our MinMax kernels emit -inf/inf for all-NaN input array, which doesn't
make sense.
### What changes are included in this PR?
Initialize the running min/max value from -inf/inf to NaN, so we can
leverage the nice property that:
`std::fmin/fmax(all-NaN) = NaN`
`std::fmin/fmax(NaN, non-NaN) = non-NaN`
### Are these changes tested?
Test included.
### Are there any user-facing changes?
None.
* GitHub Issue: #46063
Authored-by: Rossi Sun <[email protected]>
Signed-off-by: Rossi Sun <[email protected]>
---
cpp/src/arrow/acero/hash_aggregate_test.cc | 68 ++++++++++++++++++
.../arrow/compute/kernels/aggregate_basic.inc.cc | 4 +-
cpp/src/arrow/compute/kernels/aggregate_test.cc | 22 ++++++
cpp/src/arrow/compute/kernels/hash_aggregate.cc | 82 +++++++++++-----------
4 files changed, 134 insertions(+), 42 deletions(-)
diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc
b/cpp/src/arrow/acero/hash_aggregate_test.cc
index de414219eb..12d24429cb 100644
--- a/cpp/src/arrow/acero/hash_aggregate_test.cc
+++ b/cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -2000,6 +2000,74 @@ TEST_P(GroupBy, MinMaxScalar) {
}
}
+TEST_P(GroupBy, MinMaxWithNaN) {
+ auto in_schema = schema({
+ field("argument1", float32()),
+ field("argument2", float64()),
+ field("key", int64()),
+ });
+ for (bool use_threads : {true, false}) {
+ SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+
+ auto table = TableFromJSON(in_schema, {R"([
+ [NaN, NaN, 1],
+ [NaN, NaN, 2],
+ [NaN, NaN, 3]
+])",
+ R"([
+ [NaN, NaN, 1],
+ [-Inf, -Inf, 2],
+ [Inf, Inf, 3]
+])"});
+
+ ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped,
+ GroupByTest(
+ {
+ table->GetColumnByName("argument1"),
+ table->GetColumnByName("argument1"),
+ table->GetColumnByName("argument1"),
+ table->GetColumnByName("argument2"),
+ table->GetColumnByName("argument2"),
+ table->GetColumnByName("argument2"),
+ },
+ {table->GetColumnByName("key")},
+ {
+ {"hash_min", nullptr},
+ {"hash_max", nullptr},
+ {"hash_min_max", nullptr},
+ {"hash_min", nullptr},
+ {"hash_max", nullptr},
+ {"hash_min_max", nullptr},
+ },
+ use_threads));
+ ValidateOutput(aggregated_and_grouped);
+ SortBy({"key_0"}, &aggregated_and_grouped);
+
+ AssertDatumsEqual(ArrayFromJSON(struct_({
+ field("key_0", int64()),
+ field("hash_min", float32()),
+ field("hash_max", float32()),
+ field("hash_min_max", struct_({
+ field("min",
float32()),
+ field("max",
float32()),
+ })),
+ field("hash_min", float64()),
+ field("hash_max", float64()),
+ field("hash_min_max", struct_({
+ field("min",
float64()),
+ field("max",
float64()),
+ })),
+ }),
+ R"([
+ [1, NaN, NaN, {"min": NaN, "max": NaN}, NaN, NaN, {"min": NaN,
"max": NaN}],
+ [2, -Inf, -Inf, {"min": -Inf, "max": -Inf}, -Inf, -Inf, {"min": -Inf,
"max": -Inf}],
+ [3, Inf, Inf, {"min": Inf, "max": Inf}, Inf, Inf, {"min": Inf,
"max": Inf}]
+ ])"),
+ aggregated_and_grouped,
+ /*verbose=*/true);
+ }
+}
+
TEST_P(GroupBy, AnyAndAll) {
for (bool use_threads : {true, false}) {
SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
index 7f2bce4063..3733f415a0 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
@@ -694,8 +694,8 @@ struct MinMaxState<ArrowType, SimdLevel,
enable_if_floating_point<ArrowType>> {
this->max = std::fmax(this->max, value);
}
- T min = std::numeric_limits<T>::infinity();
- T max = -std::numeric_limits<T>::infinity();
+ T min = std::numeric_limits<T>::quiet_NaN();
+ T max = std::numeric_limits<T>::quiet_NaN();
bool has_nulls = false;
};
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc
b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index a2bf0b97fd..cdc62f946a 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -1841,6 +1841,24 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test
{
AssertMinMaxIsNull(array, options);
}
+ void AssertMinMaxIsNaN(const Datum& array, const ScalarAggregateOptions&
options) {
+ ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options));
+ for (const auto& val : out.scalar_as<StructScalar>().value) {
+ ASSERT_TRUE(std::isnan(checked_cast<const ScalarType&>(*val).value));
+ }
+ }
+
+ void AssertMinMaxIsNaN(const std::string& json, const
ScalarAggregateOptions& options) {
+ auto array = ArrayFromJSON(type_singleton(), json);
+ AssertMinMaxIsNaN(array, options);
+ }
+
+ void AssertMinMaxIsNaN(const std::vector<std::string>& json,
+ const ScalarAggregateOptions& options) {
+ auto array = ChunkedArrayFromJSON(type_singleton(), json);
+ AssertMinMaxIsNaN(array, options);
+ }
+
std::shared_ptr<DataType> type_singleton() {
return default_type_instance<ArrowType>();
}
@@ -1963,6 +1981,9 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
this->AssertMinMaxIs("[5, Inf, 2, 3, 4]", 2.0, INFINITY, options);
this->AssertMinMaxIs("[5, NaN, 2, 3, 4]", 2, 5, options);
this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
+ this->AssertMinMaxIs("[NaN, null, 42]", 42, 42, options);
+ this->AssertMinMaxIsNaN("[NaN, NaN]", options);
+ this->AssertMinMaxIsNaN("[NaN, null]", options);
this->AssertMinMaxIs(chunked_input1, 1, 9, options);
this->AssertMinMaxIs(chunked_input2, 1, 9, options);
this->AssertMinMaxIs(chunked_input3, 1, 9, options);
@@ -1980,6 +2001,7 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options);
this->AssertMinMaxIsNull("[5, -Inf, null, 3, 4]", options);
+ this->AssertMinMaxIsNull("[NaN, null]", options);
this->AssertMinMaxIsNull(chunked_input1, options);
this->AssertMinMaxIsNull(chunked_input2, options);
this->AssertMinMaxIsNull(chunked_input3, options);
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 19f7fc2e5b..2ab5e574e2 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -16,6 +16,7 @@
// under the License.
#include <cmath>
+#include <concepts>
#include <functional>
#include <memory>
#include <string>
@@ -270,52 +271,53 @@ struct GroupedCountImpl : public GroupedAggregator {
// ----------------------------------------------------------------------
// MinMax implementation
+// XXX: Consider making these concepts complete and moving to public header.
+
+template <typename T>
+concept CBooleanConcept = std::same_as<T, bool>;
+
+// XXX: Ideally we want to have std::floating_point<Float16> = true.
+template <typename T>
+concept CFloatingPointConcept = std::floating_point<T> || std::same_as<T,
util::Float16>;
+
+template <typename T>
+concept CDecimalConcept = std::same_as<T, Decimal32> || std::same_as<T,
Decimal64> ||
+ std::same_as<T, Decimal128> || std::same_as<T, Decimal256>;
+
template <typename CType>
struct AntiExtrema {
static constexpr CType anti_min() { return
std::numeric_limits<CType>::max(); }
static constexpr CType anti_max() { return
std::numeric_limits<CType>::min(); }
};
-template <>
-struct AntiExtrema<bool> {
- static constexpr bool anti_min() { return true; }
- static constexpr bool anti_max() { return false; }
-};
-
-template <>
-struct AntiExtrema<float> {
- static constexpr float anti_min() { return
std::numeric_limits<float>::infinity(); }
- static constexpr float anti_max() { return
-std::numeric_limits<float>::infinity(); }
+template <CBooleanConcept CType>
+struct AntiExtrema<CType> {
+ static constexpr CType anti_min() { return true; }
+ static constexpr CType anti_max() { return false; }
};
-template <>
-struct AntiExtrema<double> {
- static constexpr double anti_min() { return
std::numeric_limits<double>::infinity(); }
- static constexpr double anti_max() { return
-std::numeric_limits<double>::infinity(); }
+template <CFloatingPointConcept CType>
+struct AntiExtrema<CType> {
+ static constexpr CType anti_min() { return
std::numeric_limits<CType>::quiet_NaN(); }
+ static constexpr CType anti_max() { return
std::numeric_limits<CType>::quiet_NaN(); }
};
-template <>
-struct AntiExtrema<Decimal32> {
- static constexpr Decimal32 anti_min() { return
BasicDecimal32::GetMaxSentinel(); }
- static constexpr Decimal32 anti_max() { return
BasicDecimal32::GetMinSentinel(); }
+template <CDecimalConcept CType>
+struct AntiExtrema<CType> {
+ static constexpr CType anti_min() { return CType::GetMaxSentinel(); }
+ static constexpr CType anti_max() { return CType::GetMinSentinel(); }
};
-template <>
-struct AntiExtrema<Decimal64> {
- static constexpr Decimal64 anti_min() { return
BasicDecimal64::GetMaxSentinel(); }
- static constexpr Decimal64 anti_max() { return
BasicDecimal64::GetMinSentinel(); }
-};
-
-template <>
-struct AntiExtrema<Decimal128> {
- static constexpr Decimal128 anti_min() { return
BasicDecimal128::GetMaxSentinel(); }
- static constexpr Decimal128 anti_max() { return
BasicDecimal128::GetMinSentinel(); }
+template <typename CType>
+struct MinMaxOp {
+ static constexpr CType min(CType a, CType b) { return std::min(a, b); }
+ static constexpr CType max(CType a, CType b) { return std::max(a, b); }
};
-template <>
-struct AntiExtrema<Decimal256> {
- static constexpr Decimal256 anti_min() { return
BasicDecimal256::GetMaxSentinel(); }
- static constexpr Decimal256 anti_max() { return
BasicDecimal256::GetMinSentinel(); }
+template <CFloatingPointConcept CType>
+struct MinMaxOp<CType> {
+ static constexpr CType min(CType a, CType b) { return std::fmin(a, b); }
+ static constexpr CType max(CType a, CType b) { return std::fmax(a, b); }
};
template <typename Type, typename Enable = void>
@@ -352,8 +354,8 @@ struct GroupedMinMaxImpl final : public GroupedAggregator {
VisitGroupedValues<Type>(
batch,
[&](uint32_t g, CType val) {
- GetSet::Set(raw_mins, g, std::min(GetSet::Get(raw_mins, g), val));
- GetSet::Set(raw_maxes, g, std::max(GetSet::Get(raw_maxes, g), val));
+ GetSet::Set(raw_mins, g, MinMaxOp<CType>::min(GetSet::Get(raw_mins,
g), val));
+ GetSet::Set(raw_maxes, g,
MinMaxOp<CType>::max(GetSet::Get(raw_maxes, g), val));
bit_util::SetBit(has_values_.mutable_data(), g);
},
[&](uint32_t g) { bit_util::SetBit(has_nulls_.mutable_data(), g); });
@@ -373,12 +375,12 @@ struct GroupedMinMaxImpl final : public GroupedAggregator
{
auto g = group_id_mapping.GetValues<uint32_t>(1);
for (uint32_t other_g = 0; static_cast<int64_t>(other_g) <
group_id_mapping.length;
++other_g, ++g) {
- GetSet::Set(
- raw_mins, *g,
- std::min(GetSet::Get(raw_mins, *g), GetSet::Get(other_raw_mins,
other_g)));
- GetSet::Set(
- raw_maxes, *g,
- std::max(GetSet::Get(raw_maxes, *g), GetSet::Get(other_raw_maxes,
other_g)));
+ GetSet::Set(raw_mins, *g,
+ MinMaxOp<CType>::min(GetSet::Get(raw_mins, *g),
+ GetSet::Get(other_raw_mins, other_g)));
+ GetSet::Set(raw_maxes, *g,
+ MinMaxOp<CType>::max(GetSet::Get(raw_maxes, *g),
+ GetSet::Get(other_raw_maxes, other_g)));
if (bit_util::GetBit(other->has_values_.data(), other_g)) {
bit_util::SetBit(has_values_.mutable_data(), *g);