Re: [PR] refactor(hash-aggr): Migrate ordered partial/final aggregation [datafusion]

via GitHub Thu, 25 Jun 2026 01:06:07 -0700


2010YOUY01 commented on code in PR #23181:
URL: https://github.com/apache/datafusion/pull/23181#discussion_r3472776309



##########
datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs:
##########
@@ -373,9 +378,6 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, 
group_by_columns: Vec<&str
         .await
         .unwrap();
     assert!(collected_running.len() > 2);
-    // Running should produce more chunk than the usual AggregateExec.
-    // Otherwise it means that we cannot generate result in running mode.
-    assert!(collected_running.len() > collected_usual.len());

Review Comment:
   This asserts that when the same query runs on `OrderedAggregateStream` and 
`AggregateStream`, the former returns more batches than the latter.
   
   That is implementation-dependent, and the test later compares the full 
results row by row, so the assertion is safe to delete.



##########
datafusion/physical-plan/src/aggregates/ordered_partial_stream.rs:
##########
@@ -0,0 +1,380 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Partial aggregate stream for ordered group input.
+
+use std::ops::ControlFlow;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use arrow::datatypes::SchemaRef;
+use arrow::record_batch::RecordBatch;
+use datafusion_common::Result;
+use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use futures::stream::{Stream, StreamExt};
+
+use super::AggregateExec;
+use super::aggregate_hash_table::{OrderedAggregateTable, OrderedPartialMarker};
+use crate::aggregates::AggregateMode;
+use crate::metrics::{BaselineMetrics, MetricBuilder, RecordOutput, 
SpillMetrics};
+use crate::stream::EmptyRecordBatchStream;
+use crate::{InputOrderMode, RecordBatchStream, SendableRecordBatchStream, 
metrics};
+
+/// Partial aggregate stream for `InputOrderMode::Sorted` and
+/// `InputOrderMode::PartiallySorted`.
+///
+/// # Example
+///
+/// SELECT k, AVG(v) FROM t GROUP BY k;
+///
+/// If the input is ordered by `k`, the aggregate can use ordered partial and
+/// final stages:
+///
+/// ## Plan
+/// AggregateExec(stage=final, ordered)
+/// -- RepartitionExec(hash(k), preserves_order=true)
+/// ---- AggregateExec(stage=partial, ordered)
+///
+/// ## Partial Stage Behavior
+/// Input: raw rows
+/// Output: partial states for all groups (for example, `AVG(v)` emits `SUM(v)`
+/// and `COUNT(v)`)
+///
+/// ## Final Stage Behavior
+/// Input: partial states
+/// Output: results for all groups (for example, `AVG(v)` calculated from the
+/// state)
+///
+/// # Order-based Optimization
+///
+/// For the aggregation work, the hash aggregation implementation is reused.
+///
+/// After each input batch, check whether any groups can be emitted eagerly to
+/// improve memory efficiency. For example, if the last group key seen is
+/// `k = 100`, it is safe to emit all groups with keys less than 100 because 
the
+/// input is ordered.
+///
+/// ## Implementation Note

Review Comment:
   Many optimizations are clearly applicable to this path; this implementation 
note explains why this PR keeps things simple.



##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -3319,6 +3381,236 @@ mod tests {
         Ok(())
     }
 
+    /// Ensures for ordered input, `OrderedPartialAggregateStream` is used.
+    #[tokio::test]
+    async fn ordered_partial_aggregate_planning() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("sort_col", DataType::Int32, false),
+            Field::new("group_col", DataType::Int32, false),
+            Field::new("value_col", DataType::Int64, false),
+        ]));
+
+        let input_batches = vec![
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int32Array::from(vec![1, 1, 1])),
+                    Arc::new(Int32Array::from(vec![10, 11, 10])),
+                    Arc::new(Int64Array::from(vec![1, 1, 1])),
+                ],
+            )?,
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int32Array::from(vec![2, 2])),
+                    Arc::new(Int32Array::from(vec![20, 21])),
+                    Arc::new(Int64Array::from(vec![1, 1])),
+                ],
+            )?,
+        ];
+        let ordering = 
LexOrdering::new([PhysicalSortExpr::new_default(Arc::new(
+            Column::new("sort_col", 0),
+        ))])
+        .unwrap();
+        let input = TestMemoryExec::try_new(&[input_batches], 
Arc::clone(&schema), None)?
+            .try_with_sort_information(vec![ordering])?;
+        let input = Arc::new(TestMemoryExec::update_cache(&Arc::new(input)));
+
+        let group_by = PhysicalGroupBy::new_single(vec![
+            (col("sort_col", &schema)?, "sort_col".to_string()),
+            (col("group_col", &schema)?, "group_col".to_string()),
+        ]);
+        let aggr_expr = vec![Arc::new(
+            AggregateExprBuilder::new(count_udaf(), vec![col("value_col", 
&schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("COUNT(value_col)")
+                .build()?,
+        )];
+        let aggregate = AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by,
+            aggr_expr,
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;
+        assert!(matches!(
+            aggregate.input_order_mode(),
+            InputOrderMode::PartiallySorted(_)
+        ));
+
+        let task_ctx = new_migrated_hash_ctx(2);
+        let stream = aggregate.execute_typed(0, &task_ctx)?;
+        assert!(matches!(stream, StreamType::OrderedPartialAggregate(_)));
+
+        let stream: SendableRecordBatchStream = stream.into();
+        let output = collect(stream).await?;
+        assert_snapshot!(batches_to_sort_string(&output), @r"
++----------+-----------+-------------------------+
+| sort_col | group_col | COUNT(value_col)[count] |
++----------+-----------+-------------------------+
+| 1        | 10        | 2                       |
+| 1        | 11        | 1                       |
+| 2        | 20        | 1                       |
+| 2        | 21        | 1                       |
++----------+-----------+-------------------------+
+");
+
+        Ok(())
+    }
+
+    /// Ensures for ordered input, `OrderedFinalAggregateStream` is used.
+    #[tokio::test]
+    async fn ordered_final_aggregate_planning() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("key", DataType::Int32, false),
+            Field::new("value", DataType::Int64, false),
+        ]));
+        let group_by =
+            PhysicalGroupBy::new_single(vec![(col("key", &schema)?, 
"key".to_string())]);
+        let aggr_expr = vec![Arc::new(
+            AggregateExprBuilder::new(count_udaf(), vec![col("value", 
&schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("COUNT(value)")
+                .build()?,
+        )];
+
+        let empty_input =
+            TestMemoryExec::try_new_exec(&[vec![]], Arc::clone(&schema), 
None)?;
+        let partial_aggregate = AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by.clone(),
+            aggr_expr.clone(),
+            vec![None],
+            empty_input,
+            Arc::clone(&schema),
+        )?;
+        let partial_schema = partial_aggregate.schema();
+        let partial_state_batch = RecordBatch::try_new(
+            Arc::clone(&partial_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 1, 2, 3])),
+                Arc::new(Int64Array::from(vec![2, 3, 5, 7])),
+            ],
+        )?;
+        let ordering = 
LexOrdering::new([PhysicalSortExpr::new_default(Arc::new(
+            Column::new("key", 0),
+        ))])
+        .unwrap();
+        let final_input =
+            TestMemoryExec::try_new(&[vec![partial_state_batch]], 
partial_schema, None)?
+                .try_with_sort_information(vec![ordering])?;
+        let final_input = 
Arc::new(TestMemoryExec::update_cache(&Arc::new(final_input)));
+
+        let final_aggregate = AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by.as_final(),
+            aggr_expr,
+            vec![None],
+            final_input,
+            Arc::clone(&schema),
+        )?;
+        assert_eq!(final_aggregate.input_order_mode(), 
&InputOrderMode::Sorted);
+
+        let task_ctx = new_migrated_hash_ctx(2);
+        let stream = final_aggregate.execute_typed(0, &task_ctx)?;
+        assert!(matches!(stream, StreamType::OrderedFinalAggregate(_)));
+
+        let stream: SendableRecordBatchStream = stream.into();
+        let output = collect(stream).await?;
+        assert_snapshot!(batches_to_sort_string(&output), @r"
++-----+--------------+
+| key | COUNT(value) |
++-----+--------------+
+| 1   | 5            |
+| 2   | 5            |
+| 3   | 7            |
++-----+--------------+
+");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn ordered_partial_aggregate_partially_sorted_no_emit_panic() -> 
Result<()> {

Review Comment:
   This test case is migrated from an existing unit test in `row_hash.rs`. A 
comment was left on the original test so that it is easier to cross-check when 
the old implementation is eventually deleted.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] refactor(hash-aggr): Migrate ordered partial/final aggregation [datafusion]

Reply via email to