Re: [PR] [Bug](function) fix wrong result of window funnel v2 + deduplication/fixed mode [doris]

via GitHub Fri, 03 Apr 2026 21:32:13 -0700


github-actions[bot] commented on code in PR #62043:
URL: https://github.com/apache/doris/pull/62043#discussion_r3035107765



##########
be/src/exprs/aggregate/aggregate_function_window_funnel_v2.h:
##########
@@ -393,115 +393,199 @@ struct WindowFunnelStateV2 {
                     if (matched) {
                         events_timestamp[event_idx] = 
{events_timestamp[event_idx - 1].first_ts,
                                                        evt.timestamp, i};
-                        if (event_idx > curr_level) {
-                            curr_level = event_idx;
-                        }
+                        curr_level = std::max(event_idx, curr_level);
                         if (event_idx + 1 == event_count) {
                             return event_count;
                         }
                     }
                 }
             }
 
-            if (curr_level + 1 > max_level) {
-                max_level = curr_level + 1;
-            }
+            max_level = std::max(curr_level + 1, max_level);
         }
         return max_level;
     }
 
-    /// DEDUPLICATION mode: if a previously matched event level appears again,
-    /// the current chain is terminated and max_level is updated.
-    /// This preserves V1 semantics where duplicate events break the chain.
+    /// DEDUPLICATION mode (multi-pass, matching V1 semantics):
+    ///
+    /// V1 tries each E0-matching row as a chain starting point and searches 
for
+    /// conditions one at a time. Before accepting a match for condition K, V1 
checks
+    /// the "gap" (rows between the previous match and the current match) for 
any
+    /// re-occurrence of already-matched conditions 0..K-1. If found, the 
chain breaks.
+    ///
+    /// A single-pass approach cannot correctly implement this because V2's 
greedy
+    /// multi-condition-per-row advancement creates premature higher-level 
matches
+    /// that then conflict with later events. The multi-pass approach iterates 
over
+    /// each event-0 occurrence, building chains one level at a time with 
gap-based
+    /// duplicate detection.
     int _get_deduplication() const {
-        std::vector<TimestampPair> events_timestamp(event_count);
-        int max_level = -1;
-        int curr_level = -1;
+        int max_level = 0;
+        const size_t list_size = events_list.size();
 
-        for (size_t i = 0; i < events_list.size(); ++i) {
-            const auto& evt = events_list[i];
-            int event_idx = get_event_idx(evt.event_idx) - 1;
+        for (size_t start = 0; start < list_size; ++start) {
+            // Remaining-events pruning: if remaining events can't beat 
max_level, stop.
+            if (static_cast<int>(list_size - start) <= max_level) {
+                break;
+            }
 
-            if (event_idx == 0) {
-                // Duplicate of event 0: terminate current chain first
-                if (events_timestamp[0].has_value()) {
-                    if (curr_level > max_level) {
-                        max_level = curr_level;
+            int start_event = get_event_idx(events_list[start].event_idx) - 1;
+            if (start_event != 0) {
+                continue;
+            }
+
+            // Build a chain from this event-0
+            UInt64 first_ts = events_list[start].timestamp;
+            int curr_level = 0;
+            size_t last_advance_idx = start;
+
+            for (size_t i = start + 1; i < list_size; ++i) {
+                int event_idx = get_event_idx(events_list[i].event_idx) - 1;
+
+                // Only search for the next expected level (matches V1's 
sequential scan)
+                if (event_idx != curr_level + 1) {
+                    continue;
+                }
+
+                // Must be from a different row than the last chain event
+                if (_is_same_row(last_advance_idx, i)) {
+                    continue;
+                }
+
+                // Window check
+                if (!_within_window(first_ts, events_list[i].timestamp)) {
+                    break;
+                }
+
+                // Gap-based dedup check (V1 semantics):
+                // Check if any event at a level below the target 
(0..event_idx-1)
+                // fired in the gap between the last chain event's row and the
+                // current event's row.
+
+                // Find gap boundaries, excluding same-row events at endpoints.
+                // gap_start: first event from a new row after last_advance_idx
+                size_t gap_start = last_advance_idx + 1;
+                while (gap_start < i && 
is_continuation(events_list[gap_start].event_idx)) {
+                    ++gap_start;
+                }
+                // i_row_start: first event of the current event's row
+                size_t i_row_start = i;
+                while (i_row_start > 0 && 
is_continuation(events_list[i_row_start].event_idx)) {
+                    --i_row_start;
+                }
+
+                bool dup = false;
+                if (gap_start < i_row_start) {
+                    for (size_t j = gap_start; j < i_row_start; ++j) {
+                        int j_level = get_event_idx(events_list[j].event_idx) 
- 1;
+                        if (j_level < event_idx) {
+                            dup = true;
+                            break;
+                        }
                     }
-                    _eliminate_chain(curr_level, events_timestamp);
                 }
-                // Start a new chain
-                events_timestamp[0] = {evt.timestamp, evt.timestamp, i};
-                curr_level = 0;
-            } else if (events_timestamp[event_idx].has_value()) {
-                // Duplicate event detected: this level was already matched
-                if (curr_level > max_level) {
-                    max_level = curr_level;
+
+                if (dup) {
+                    break;
                 }
-                // Eliminate current chain
-                _eliminate_chain(curr_level, events_timestamp);
-            } else if (events_timestamp[event_idx - 1].has_value() &&
-                       !_is_same_row(events_timestamp[event_idx - 
1].last_list_idx, i)) {
-                // Must be from a DIFFERENT row than the predecessor level
-                if (_promote_level(events_timestamp, evt.timestamp, i, 
event_idx, curr_level,
-                                   false)) {
+
+                // Accept this match and advance the chain
+                curr_level = event_idx;
+                last_advance_idx = i;
+
+                if (curr_level + 1 == event_count) {
                     return event_count;
                 }
             }
-        }
 
-        if (curr_level > max_level) {
-            return curr_level + 1;
+            max_level = std::max(curr_level + 1, max_level);
         }
-        return max_level + 1;
+        return max_level;
     }
 
-    /// FIXED mode (StarRocks-style semantics): if a matched event appears 
whose
+    /// FIXED mode: if a matched event appears whose
     /// predecessor level has NOT been matched, the chain is broken (event 
level jumped).
-    /// Note: V2 semantics differ from V1. V1 checks physical row adjacency;
-    /// V2 checks event level continuity (unmatched rows don't break the 
chain).
+    /// FIXED mode (multi-pass, row-based).
+    ///
+    /// Mirrors V1's row-by-row semantics: after matching c_K at a row, the 
NEXT
+    /// event-producing row must match c_{K+1} or the chain breaks. Truly
+    /// irrelevant rows (matching no condition) are absent from events_list and
+    /// implicitly skipped — this is the documented 4.1 behavior change.
+    ///

Review Comment:
   This comment says the new algorithm mirrors V1 row-by-row semantics, but the 
code below still skips physical rows that match no condition because they never 
enter `events_list`. That changes results in `FIXED` mode.
   
   Concrete case with conditions `E0, E1, E2` and a large window:
   - `r0`: `E0`
   - `r1`: matches no condition
   - `r2`: `E1`
   - `r3`: `E2`
   
   `window_funnel`/V1 returns `1` here, because after matching `E0` it checks 
the next physical row (`r1`) and breaks immediately when that row is not `E1` 
(`aggregate_function_window_funnel.h`, 
`_match_event_list<WindowFunnelMode::FIXED>()`, lines 181-191 in the current 
tree). This V2 implementation skips `r1` entirely and continues at `r2`, so it 
returns `3` instead.
   
   That means the implementation still follows the documented V2 semantic 
difference from V1, rather than fully restoring V1 behavior. At minimum this 
comment needs to be corrected, and if V1 parity is the intent the algorithm 
also needs a way to account for intervening non-matching rows plus a regression 
test for that case.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [Bug](function) fix wrong result of window funnel v2 + deduplication/fixed mode [doris]

Reply via email to