This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new ef934acc955 branch-4.1: [Bug](agg) fix wrong is_trivial on min max
#61641 (#61702)
ef934acc955 is described below
commit ef934acc955475ed0d628450a3e3df218d5a53db
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Mar 25 15:38:03 2026 +0800
branch-4.1: [Bug](agg) fix wrong is_trivial on min max #61641 (#61702)
Cherry-picked from #61641
Co-authored-by: Pxl <[email protected]>
---
be/src/exprs/aggregate/aggregate_function.h | 6 +-
.../exprs/aggregate/aggregate_function_min_max.h | 5 +-
be/test/exprs/aggregate/agg_min_max_test.cpp | 198 +++++++++++++++++++++
3 files changed, 207 insertions(+), 2 deletions(-)
diff --git a/be/src/exprs/aggregate/aggregate_function.h
b/be/src/exprs/aggregate/aggregate_function.h
index 475439cd39c..9186c9ad39b 100644
--- a/be/src/exprs/aggregate/aggregate_function.h
+++ b/be/src/exprs/aggregate/aggregate_function.h
@@ -108,7 +108,11 @@ public:
/// Reset aggregation state
virtual void reset(AggregateDataPtr place) const = 0;
- /// It is not necessary to delete data.
+ /// Indicates that the aggregate state can be safely zero-initialized
(memset to 0) instead of
+ /// calling create(). When true, callers may skip create() and destroy()
for the state.
+ /// This is only valid when zero-initialized memory is a correct initial
state for the function.
+ /// For example, sum/count/avg can use zero-init (identity element is 0),
but min/max cannot
+ /// because they require sentinel values (MAX_VALUE for min, MIN_VALUE for
max) set by create().
virtual bool is_trivial() const = 0;
/// Get `sizeof` of structure with data.
diff --git a/be/src/exprs/aggregate/aggregate_function_min_max.h
b/be/src/exprs/aggregate/aggregate_function_min_max.h
index 0cbbda24ef5..4f72a52629e 100644
--- a/be/src/exprs/aggregate/aggregate_function_min_max.h
+++ b/be/src/exprs/aggregate/aggregate_function_min_max.h
@@ -703,7 +703,10 @@ public:
DataTypePtr get_return_type() const override { return type; }
- bool is_trivial() const override { return Data::IsFixedLength; }
+ // min/max require sentinel-initialized state (MAX_VALUE for min,
MIN_VALUE for max) via
+ // create(), so they cannot use zero-init and must return false. any_value
is safe with
+ // zero-init because it checks has_value before comparing
(change_first_time).
+ bool is_trivial() const override { return Data::IsFixedLength &&
Data::IS_ANY; }
void add(AggregateDataPtr __restrict place, const IColumn** columns,
ssize_t row_num,
Arena& arena) const override {
diff --git a/be/test/exprs/aggregate/agg_min_max_test.cpp
b/be/test/exprs/aggregate/agg_min_max_test.cpp
index 501b16e8edc..3f2080fc830 100644
--- a/be/test/exprs/aggregate/agg_min_max_test.cpp
+++ b/be/test/exprs/aggregate/agg_min_max_test.cpp
@@ -25,10 +25,13 @@
#include <vector>
#include "core/column/column.h"
+#include "core/column/column_nullable.h"
#include "core/column/column_string.h"
#include "core/column/column_vector.h"
+#include "core/data_type/data_type_date_or_datetime_v2.h"
#include "core/data_type/data_type_decimal.h"
#include "core/data_type/data_type_jsonb.h"
+#include "core/data_type/data_type_nullable.h"
#include "core/data_type/data_type_number.h"
#include "core/data_type/data_type_string.h"
#include "core/field.h"
@@ -196,4 +199,199 @@ TEST_P(AggMinMaxTest, any_json_test) {
INSTANTIATE_TEST_SUITE_P(Params, AggMinMaxTest,
::testing::ValuesIn(std::vector<std::string> {"min",
"max"}));
+// Test that nullable min/max streaming_agg_serialize_to_column produces
correct results.
+// This is a regression test for a bug where min/max with is_trivial()=true
caused the
+// streaming aggregation path to skip create(), leaving states
zero-initialized instead of
+// sentinel-initialized (MAX_VALUE for min, MIN_VALUE for max). This led to
incorrect results
+// (e.g., 0 for numeric types, empty strings for datetime types).
+TEST_P(AggMinMaxTest, nullable_streaming_agg_int32_test) {
+ Arena arena;
+ std::string min_max_type = GetParam();
+
+ // Create nullable Int32 column with values [100, 200, 300, 400]
+ auto nested_col = ColumnInt32::create();
+ auto null_map = ColumnUInt8::create();
+ std::vector<int32_t> values = {100, 200, 300, 400};
+ for (auto v : values) {
+ nested_col->insert_value(v);
+ null_map->insert_value(0); // not null
+ }
+ auto nullable_col = ColumnNullable::create(std::move(nested_col),
std::move(null_map));
+
+ // Create nullable aggregate function
+ AggregateFunctionSimpleFactory factory;
+ register_aggregate_function_minmax(factory);
+ DataTypes data_types = {make_nullable(std::make_shared<DataTypeInt32>())};
+ auto agg_function = factory.get(min_max_type, data_types, nullptr, true,
-1);
+ ASSERT_NE(agg_function, nullptr);
+
+ // Call streaming_agg_serialize_to_column — this is the bug path where the
V2 nullable
+ // wrapper allocates per-row states and may skip create() if is_trivial()
returns true.
+ auto dst = agg_function->create_serialize_column();
+ const IColumn* columns[1] = {nullable_col.get()};
+ agg_function->streaming_agg_serialize_to_column(columns, dst,
values.size(), arena);
+
+ // Deserialize each row and verify the value matches the original.
+ // In streaming mode, each row is independently aggregated (single value
per state),
+ // so the result should be the original value itself.
+ for (size_t i = 0; i < values.size(); ++i) {
+ std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+ AggregateDataPtr place = memory.get();
+ agg_function->create(place);
+
+ agg_function->deserialize_and_merge_from_column_range(place, *dst, i,
i, arena);
+
+ auto result_col = ColumnNullable::create(ColumnInt32::create(),
ColumnUInt8::create());
+ agg_function->insert_result_into(place, *result_col);
+
+ ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should not
be null";
+ const auto& result_nested =
+ assert_cast<const
ColumnInt32&>(result_col->get_nested_column());
+ EXPECT_EQ(values[i], result_nested.get_element(0))
+ << "Row " << i << " mismatch for " << min_max_type;
+ agg_function->destroy(place);
+ }
+}
+
+// Test nullable min/max streaming with some null values
+TEST_P(AggMinMaxTest, nullable_streaming_agg_with_nulls_test) {
+ Arena arena;
+ std::string min_max_type = GetParam();
+
+ // Create nullable Int64 column: [10, NULL, 30, NULL, 50]
+ auto nested_col = ColumnInt64::create();
+ auto null_map = ColumnUInt8::create();
+ std::vector<int64_t> values = {10, 0, 30, 0, 50};
+ std::vector<uint8_t> nulls = {0, 1, 0, 1, 0};
+ for (size_t i = 0; i < values.size(); i++) {
+ nested_col->insert_value(values[i]);
+ null_map->insert_value(nulls[i]);
+ }
+ auto nullable_col = ColumnNullable::create(std::move(nested_col),
std::move(null_map));
+
+ AggregateFunctionSimpleFactory factory;
+ register_aggregate_function_minmax(factory);
+ DataTypes data_types = {make_nullable(std::make_shared<DataTypeInt64>())};
+ auto agg_function = factory.get(min_max_type, data_types, nullptr, true,
-1);
+ ASSERT_NE(agg_function, nullptr);
+
+ auto dst = agg_function->create_serialize_column();
+ const IColumn* columns[1] = {nullable_col.get()};
+ agg_function->streaming_agg_serialize_to_column(columns, dst,
values.size(), arena);
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+ AggregateDataPtr place = memory.get();
+ agg_function->create(place);
+
+ agg_function->deserialize_and_merge_from_column_range(place, *dst, i,
i, arena);
+
+ auto result_col = ColumnNullable::create(ColumnInt64::create(),
ColumnUInt8::create());
+ agg_function->insert_result_into(place, *result_col);
+
+ if (nulls[i]) {
+ EXPECT_TRUE(result_col->is_null_at(0)) << "Row " << i << " should
be null";
+ } else {
+ ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should
not be null";
+ const auto& result_nested =
+ assert_cast<const
ColumnInt64&>(result_col->get_nested_column());
+ EXPECT_EQ(values[i], result_nested.get_element(0))
+ << "Row " << i << " mismatch for " << min_max_type;
+ }
+ agg_function->destroy(place);
+ }
+}
+
+// Test nullable min/max streaming with DateTimeV2 type — this was the
original symptom
+// where zero-initialized DateTimeV2 (0000-00-00 00:00:00) produced empty
strings.
+TEST_P(AggMinMaxTest, nullable_streaming_agg_datetimev2_test) {
+ Arena arena;
+ std::string min_max_type = GetParam();
+
+ // Create nullable DateTimeV2 column with some timestamps.
+ // DateTimeV2 is stored as UInt64 internally via ColumnDateTimeV2.
+ auto nested_col = ColumnDateTimeV2::create();
+ auto null_map = ColumnUInt8::create();
+ // Use valid DateTimeV2 encoded values (year<<46 | month<<42 | day<<37 |
hour<<32 | min<<26 |
+ // sec<<20 | usec). These are 2000-01-01, 2024-06-15 10:30:45, 2024-12-25
23:59:59.
+ std::vector<uint64_t> values = {140742023840792576ULL,
142454833089085440ULL,
+ 142482653553098752ULL};
+ for (auto v : values) {
+ nested_col->insert_value(binary_cast<uint64_t,
DateV2Value<DateTimeV2ValueType>>(v));
+ null_map->insert_value(0);
+ }
+ auto nullable_col = ColumnNullable::create(std::move(nested_col),
std::move(null_map));
+
+ AggregateFunctionSimpleFactory factory;
+ register_aggregate_function_minmax(factory);
+ DataTypes data_types =
{make_nullable(std::make_shared<DataTypeDateTimeV2>())};
+ auto agg_function = factory.get(min_max_type, data_types, nullptr, true,
-1);
+ ASSERT_NE(agg_function, nullptr);
+
+ auto dst = agg_function->create_serialize_column();
+ const IColumn* columns[1] = {nullable_col.get()};
+ agg_function->streaming_agg_serialize_to_column(columns, dst,
values.size(), arena);
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+ AggregateDataPtr place = memory.get();
+ agg_function->create(place);
+
+ agg_function->deserialize_and_merge_from_column_range(place, *dst, i,
i, arena);
+
+ auto result_col = ColumnNullable::create(ColumnDateTimeV2::create(),
ColumnUInt8::create());
+ agg_function->insert_result_into(place, *result_col);
+
+ ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should not
be null";
+ const auto& result_nested =
+ assert_cast<const
ColumnDateTimeV2&>(result_col->get_nested_column());
+ EXPECT_EQ(values[i], result_nested.get_element(0).to_date_int_val())
+ << "Row " << i << " mismatch for " << min_max_type;
+ agg_function->destroy(place);
+ }
+}
+
+// Test that any_value still works correctly with the trivial path
(is_trivial() should
+// still return true for any_value with fixed-length types since it uses
has_value guard).
+TEST(AggMinMaxTest, any_value_nullable_streaming_agg_test) {
+ Arena arena;
+
+ auto nested_col = ColumnInt32::create();
+ auto null_map = ColumnUInt8::create();
+ std::vector<int32_t> values = {42, 99, 7};
+ for (auto v : values) {
+ nested_col->insert_value(v);
+ null_map->insert_value(0);
+ }
+ auto nullable_col = ColumnNullable::create(std::move(nested_col),
std::move(null_map));
+
+ AggregateFunctionSimpleFactory factory;
+ register_aggregate_function_minmax(factory);
+ DataTypes data_types = {make_nullable(std::make_shared<DataTypeInt32>())};
+ auto agg_function = factory.get("any", data_types, nullptr, true, -1);
+ ASSERT_NE(agg_function, nullptr);
+
+ auto dst = agg_function->create_serialize_column();
+ const IColumn* columns[1] = {nullable_col.get()};
+ agg_function->streaming_agg_serialize_to_column(columns, dst,
values.size(), arena);
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
+ AggregateDataPtr place = memory.get();
+ agg_function->create(place);
+
+ agg_function->deserialize_and_merge_from_column_range(place, *dst, i,
i, arena);
+
+ auto result_col = ColumnNullable::create(ColumnInt32::create(),
ColumnUInt8::create());
+ agg_function->insert_result_into(place, *result_col);
+
+ ASSERT_FALSE(result_col->is_null_at(0)) << "Row " << i << " should not
be null";
+ const auto& result_nested =
+ assert_cast<const
ColumnInt32&>(result_col->get_nested_column());
+ EXPECT_EQ(values[i], result_nested.get_element(0))
+ << "Row " << i << " mismatch for any_value";
+ agg_function->destroy(place);
+ }
+}
+
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]