This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new bb3ce18080d [Bug](agg) fix wrong is_trivial on min max (#61641)
bb3ce18080d is described below

commit bb3ce18080d5677fbdce0bca890b84822b708c88
Author: Pxl <[email protected]>
AuthorDate: Wed Mar 25 11:55:07 2026 +0800

    [Bug](agg) fix wrong is_trivial on min max (#61641)
    
    related with #60228
    
    This pull request addresses a critical bug in the aggregate functions
    for `min` and `max` when used with nullable columns in streaming
    aggregation mode. The main issue was that the aggregation state for
    `min`/`max` was incorrectly treated as "trivial" (safe to
    zero-initialize), leading to incorrect results for certain types (e.g.,
    numeric and datetime types) when using the streaming aggregation path.
    The changes clarify the contract for state initialization in
    `is_trivial()`, update its implementation, and add comprehensive
    regression tests to prevent similar issues in the future.
    
    **Aggregate function state initialization and correctness:**
    
    * Updated the documentation for `IAggregateFunction::is_trivial()` to
    clarify that only aggregate functions whose state can be safely
    zero-initialized (such as sum/count/avg) should return `true`. Functions
    like `min`/`max` that require sentinel values must not be treated as
    trivial.
    * Fixed the implementation of
    `AggregateFunctionsSingleValue::is_trivial()` so that only the
    `any_value` function with fixed-length types returns `true`. `min` and
    `max` now correctly return `false`, ensuring their state is always
    properly initialized with sentinel values.
    
    **Testing and regression coverage:**
    
    * Added extensive regression tests in `agg_min_max_test.cpp` to verify
    correct behavior of nullable `min`/`max` in streaming aggregation mode
    for various types, including `Int32`, `Int64`, and `DateTimeV2`. These
    tests check both non-null and null scenarios and confirm that results
    are as expected.
    * Included a test to confirm that `any_value` still works correctly with
    the trivial path, ensuring that the fix does not break this special
    case.
    * Added necessary test includes for nullable and datetime types to
    support the new tests.
---
 be/src/exprs/aggregate/aggregate_function.h        |   6 +-
 .../exprs/aggregate/aggregate_function_min_max.h   |   5 +-
 be/test/exprs/aggregate/agg_min_max_test.cpp       | 198 +++++++++++++++++++++
 3 files changed, 207 insertions(+), 2 deletions(-)

diff --git a/be/src/exprs/aggregate/aggregate_function.h 
b/be/src/exprs/aggregate/aggregate_function.h
index d7c97a3f944..f4ccccb52fd 100644
--- a/be/src/exprs/aggregate/aggregate_function.h
+++ b/be/src/exprs/aggregate/aggregate_function.h
@@ -108,7 +108,11 @@ public:
     /// Reset aggregation state
     virtual void reset(AggregateDataPtr place) const = 0;
 
-    /// It is not necessary to delete data.
+    /// Indicates that the aggregate state can be safely zero-initialized 
(memset to 0) instead of
+    /// calling create(). When true, callers may skip create() and destroy() 
for the state.
+    /// This is only valid when zero-initialized memory is a correct initial 
state for the function.
+    /// For example, sum/count/avg can use zero-init (identity element is 0), 
but min/max cannot
+    /// because they require sentinel values (MAX_VALUE for min, MIN_VALUE for 
max) set by create().
     virtual bool is_trivial() const = 0;
 
     /// Get `sizeof` of structure with data.
diff --git a/be/src/exprs/aggregate/aggregate_function_min_max.h 
b/be/src/exprs/aggregate/aggregate_function_min_max.h
index 6cfaa0b7d77..cd48b685362 100644
--- a/be/src/exprs/aggregate/aggregate_function_min_max.h
+++ b/be/src/exprs/aggregate/aggregate_function_min_max.h
@@ -768,7 +768,10 @@ public:
 
     DataTypePtr get_return_type() const override { return type; }
 
-    bool is_trivial() const override { return Data::IsFixedLength; }
+    // min/max require sentinel-initialized state (MAX_VALUE for min, 
MIN_VALUE for max) via
+    // create(), so they cannot use zero-init and must return false. any_value 
is safe with
+    // zero-init because it checks has_value before comparing 
(change_first_time).
+    bool is_trivial() const override { return Data::IsFixedLength && 
Data::IS_ANY; }
 
     void add(AggregateDataPtr __restrict place, const IColumn** columns, 
ssize_t row_num,
              Arena& arena) const override {
diff --git a/be/test/exprs/aggregate/agg_min_max_test.cpp 
b/be/test/exprs/aggregate/agg_min_max_test.cpp
index 501b16e8edc..3f2080fc830 100644
--- a/be/test/exprs/aggregate/agg_min_max_test.cpp
+++ b/be/test/exprs/aggregate/agg_min_max_test.cpp
@@ -25,10 +25,13 @@
 #include <vector>
 
 #include "core/column/column.h"
+#include "core/column/column_nullable.h"
 #include "core/column/column_string.h"
 #include "core/column/column_vector.h"
+#include "core/data_type/data_type_date_or_datetime_v2.h"
 #include "core/data_type/data_type_decimal.h"
 #include "core/data_type/data_type_jsonb.h"
+#include "core/data_type/data_type_nullable.h"
 #include "core/data_type/data_type_number.h"
 #include "core/data_type/data_type_string.h"
 #include "core/field.h"
@@ -196,4 +199,199 @@ TEST_P(AggMinMaxTest, any_json_test) {
 INSTANTIATE_TEST_SUITE_P(Params, AggMinMaxTest,
                          ::testing::ValuesIn(std::vector<std::string> {"min", 
"max"}));
 
+// Test that nullable min/max streaming_agg_serialize_to_column produces 
correct results.
+// This is a regression test for a bug where min/max with is_trivial()=true 
caused the
+// streaming aggregation path to skip create(), leaving states 
zero-initialized instead of
+// sentinel-initialized (MAX_VALUE for min, MIN_VALUE for max). This led to 
incorrect results
+// (e.g., 0 for numeric types, empty strings for datetime types).
+TEST_P(AggMinMaxTest, nullable_streaming_agg_int32_test) {
+    Arena arena;
+    std::string min_max_type = GetParam();
+
+    // Create nullable Int32 column with values [100, 200, 300, 400]
+    auto nested_col = ColumnInt32::create();
+    auto null_map = ColumnUInt8::create();
+    std::vector<int32_t> values = {100, 200, 300, 400};
+    for (auto v : values) {
+        nested_col->insert_value(v);
+        null_map->insert_value(0); // not null
+    }
+    auto nullable_col = ColumnNullable::create(std::move(nested_col), 
std::move(null_map));
+
+    // Create nullable aggregate function
+    AggregateFunctionSimpleFactory factory;
+    register_aggregate_function_minmax(factory);
+    DataTypes data_types = {make_nullable(std::make_shared<DataTypeInt32>())};
+    auto agg_function = factory.get(min_max_type, data_types, nullptr, true, 
-1);
+    ASSERT_NE(agg_function, nullptr);
+
+    // Call streaming_agg_serialize_to_column — this is the bug path where the 
V2 nullable
+    // wrapper allocates per-row states and may skip create() if is_trivial() 
returns true.
+    auto dst = agg_function->create_serialize_column();
+    const IColumn* columns[1] = {nullable_col.get()};
+    agg_function->streaming_agg_serialize_to_column(columns, dst, 
values.size(), arena);
+
+    // Deserialize each row and verify the value matches the original.
+    // In streaming mode, each row is independently aggregated (single value 
per state),
+    // so the result should be the original value itself.
+    for (size_t i = 0; i < values.size(); ++i) {
+        std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+        AggregateDataPtr place = memory.get();
+        agg_function->create(place);
+
+        agg_function->deserialize_and_merge_from_column_range(place, *dst, i, 
i, arena);
+
+        auto result_col = ColumnNullable::create(ColumnInt32::create(), 
ColumnUInt8::create());
+        agg_function->insert_result_into(place, *result_col);
+
+        ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should not 
be null";
+        const auto& result_nested =
+                assert_cast<const 
ColumnInt32&>(result_col->get_nested_column());
+        EXPECT_EQ(values[i], result_nested.get_element(0))
+                << "Row " << i << " mismatch for " << min_max_type;
+        agg_function->destroy(place);
+    }
+}
+
+// Test nullable min/max streaming with some null values
+TEST_P(AggMinMaxTest, nullable_streaming_agg_with_nulls_test) {
+    Arena arena;
+    std::string min_max_type = GetParam();
+
+    // Create nullable Int64 column: [10, NULL, 30, NULL, 50]
+    auto nested_col = ColumnInt64::create();
+    auto null_map = ColumnUInt8::create();
+    std::vector<int64_t> values = {10, 0, 30, 0, 50};
+    std::vector<uint8_t> nulls = {0, 1, 0, 1, 0};
+    for (size_t i = 0; i < values.size(); i++) {
+        nested_col->insert_value(values[i]);
+        null_map->insert_value(nulls[i]);
+    }
+    auto nullable_col = ColumnNullable::create(std::move(nested_col), 
std::move(null_map));
+
+    AggregateFunctionSimpleFactory factory;
+    register_aggregate_function_minmax(factory);
+    DataTypes data_types = {make_nullable(std::make_shared<DataTypeInt64>())};
+    auto agg_function = factory.get(min_max_type, data_types, nullptr, true, 
-1);
+    ASSERT_NE(agg_function, nullptr);
+
+    auto dst = agg_function->create_serialize_column();
+    const IColumn* columns[1] = {nullable_col.get()};
+    agg_function->streaming_agg_serialize_to_column(columns, dst, 
values.size(), arena);
+
+    for (size_t i = 0; i < values.size(); ++i) {
+        std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+        AggregateDataPtr place = memory.get();
+        agg_function->create(place);
+
+        agg_function->deserialize_and_merge_from_column_range(place, *dst, i, 
i, arena);
+
+        auto result_col = ColumnNullable::create(ColumnInt64::create(), 
ColumnUInt8::create());
+        agg_function->insert_result_into(place, *result_col);
+
+        if (nulls[i]) {
+            EXPECT_TRUE(result_col->is_null_at(0)) << "Row " << i << " should 
be null";
+        } else {
+            ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should 
not be null";
+            const auto& result_nested =
+                    assert_cast<const 
ColumnInt64&>(result_col->get_nested_column());
+            EXPECT_EQ(values[i], result_nested.get_element(0))
+                    << "Row " << i << " mismatch for " << min_max_type;
+        }
+        agg_function->destroy(place);
+    }
+}
+
+// Test nullable min/max streaming with DateTimeV2 type — this was the 
original symptom
+// where zero-initialized DateTimeV2 (0000-00-00 00:00:00) produced empty 
strings.
+TEST_P(AggMinMaxTest, nullable_streaming_agg_datetimev2_test) {
+    Arena arena;
+    std::string min_max_type = GetParam();
+
+    // Create nullable DateTimeV2 column with some timestamps.
+    // DateTimeV2 is stored as UInt64 internally via ColumnDateTimeV2.
+    auto nested_col = ColumnDateTimeV2::create();
+    auto null_map = ColumnUInt8::create();
+    // Use valid DateTimeV2 encoded values (year<<46 | month<<42 | day<<37 | 
hour<<32 | min<<26 |
+    // sec<<20 | usec). These are 2000-01-01, 2024-06-15 10:30:45, 2024-12-25 
23:59:59.
+    std::vector<uint64_t> values = {140742023840792576ULL, 
142454833089085440ULL,
+                                    142482653553098752ULL};
+    for (auto v : values) {
+        nested_col->insert_value(binary_cast<uint64_t, 
DateV2Value<DateTimeV2ValueType>>(v));
+        null_map->insert_value(0);
+    }
+    auto nullable_col = ColumnNullable::create(std::move(nested_col), 
std::move(null_map));
+
+    AggregateFunctionSimpleFactory factory;
+    register_aggregate_function_minmax(factory);
+    DataTypes data_types = 
{make_nullable(std::make_shared<DataTypeDateTimeV2>())};
+    auto agg_function = factory.get(min_max_type, data_types, nullptr, true, 
-1);
+    ASSERT_NE(agg_function, nullptr);
+
+    auto dst = agg_function->create_serialize_column();
+    const IColumn* columns[1] = {nullable_col.get()};
+    agg_function->streaming_agg_serialize_to_column(columns, dst, 
values.size(), arena);
+
+    for (size_t i = 0; i < values.size(); ++i) {
+        std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+        AggregateDataPtr place = memory.get();
+        agg_function->create(place);
+
+        agg_function->deserialize_and_merge_from_column_range(place, *dst, i, 
i, arena);
+
+        auto result_col = ColumnNullable::create(ColumnDateTimeV2::create(), 
ColumnUInt8::create());
+        agg_function->insert_result_into(place, *result_col);
+
+        ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should not 
be null";
+        const auto& result_nested =
+                assert_cast<const 
ColumnDateTimeV2&>(result_col->get_nested_column());
+        EXPECT_EQ(values[i], result_nested.get_element(0).to_date_int_val())
+                << "Row " << i << " mismatch for " << min_max_type;
+        agg_function->destroy(place);
+    }
+}
+
+// Test that any_value still works correctly with the trivial path 
(is_trivial() should
+// still return true for any_value with fixed-length types since it uses 
has_value guard).
+TEST(AggMinMaxTest, any_value_nullable_streaming_agg_test) {
+    Arena arena;
+
+    auto nested_col = ColumnInt32::create();
+    auto null_map = ColumnUInt8::create();
+    std::vector<int32_t> values = {42, 99, 7};
+    for (auto v : values) {
+        nested_col->insert_value(v);
+        null_map->insert_value(0);
+    }
+    auto nullable_col = ColumnNullable::create(std::move(nested_col), 
std::move(null_map));
+
+    AggregateFunctionSimpleFactory factory;
+    register_aggregate_function_minmax(factory);
+    DataTypes data_types = {make_nullable(std::make_shared<DataTypeInt32>())};
+    auto agg_function = factory.get("any", data_types, nullptr, true, -1);
+    ASSERT_NE(agg_function, nullptr);
+
+    auto dst = agg_function->create_serialize_column();
+    const IColumn* columns[1] = {nullable_col.get()};
+    agg_function->streaming_agg_serialize_to_column(columns, dst, 
values.size(), arena);
+
+    for (size_t i = 0; i < values.size(); ++i) {
+        std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+        AggregateDataPtr place = memory.get();
+        agg_function->create(place);
+
+        agg_function->deserialize_and_merge_from_column_range(place, *dst, i, 
i, arena);
+
+        auto result_col = ColumnNullable::create(ColumnInt32::create(), 
ColumnUInt8::create());
+        agg_function->insert_result_into(place, *result_col);
+
+        ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should not 
be null";
+        const auto& result_nested =
+                assert_cast<const 
ColumnInt32&>(result_col->get_nested_column());
+        EXPECT_EQ(values[i], result_nested.get_element(0))
+                << "Row " << i << " mismatch for any_value";
+        agg_function->destroy(place);
+    }
+}
+
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to