This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b7ae7a07c7c [fix](join) incorrect result of left semi/anti join with empty build side (#28898)
b7ae7a07c7c is described below

commit b7ae7a07c7caaf69ad188c1905407dba5fb34811
Author: Jerry Hu <mrh...@gmail.com>
AuthorDate: Mon Dec 25 09:07:38 2023 +0800

    [fix](join) incorrect result of left semi/anti join with empty build side (#28898)
---
 be/src/vec/common/hash_table/hash_map.h            | 35 ++++++++++++++++++++++
 .../test_null_aware_left_anti_join.out             |  7 +++++
 .../test_null_aware_left_anti_join.groovy          | 18 ++++++++---
 3 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h
index 6efbdbb3e94..cb2809492ae 100644
--- a/be/src/vec/common/hash_table/hash_map.h
+++ b/be/src/vec/common/hash_table/hash_map.h
@@ -226,6 +226,9 @@ public:
     template <int JoinOpType>
     void prepare_build(size_t num_elem, int batch_size, bool has_null_key) {
         _has_null_key = has_null_key;
+
+        // the first row in build side is not really from build side table
+        _empty_build_side = num_elem <= 1;
         max_batch_size = batch_size;
         bucket_size = calc_bucket_size(num_elem + 1);
         first.resize(bucket_size + 1);
@@ -262,6 +265,14 @@ public:
                     uint32_t* __restrict probe_idxs, bool& probe_visited,
                     uint32_t* __restrict build_idxs,
                     doris::vectorized::ColumnFilterHelper* mark_column) {
+        if constexpr (JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) 
{
+            if (_empty_build_side) {
+                return _process_null_aware_left_anti_join_for_empty_build_side<
+                        JoinOpType, with_other_conjuncts, is_mark_join>(
+                        probe_idx, probe_rows, probe_idxs, build_idxs, 
mark_column);
+            }
+        }
+
         if constexpr (is_mark_join) {
             return _find_batch_mark<JoinOpType, with_other_conjuncts>(
                     keys, build_idx_map, probe_idx, probe_rows, probe_idxs, 
build_idxs,
@@ -367,6 +378,29 @@ private:
         return std::tuple {probe_idx, 0U, matched_cnt};
     }
 
+    template <int JoinOpType, bool with_other_conjuncts, bool is_mark_join>
+    auto _process_null_aware_left_anti_join_for_empty_build_side(
+            int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs,
+            uint32_t* __restrict build_idxs, 
doris::vectorized::ColumnFilterHelper* mark_column) {
+        static_assert(JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN);
+        auto matched_cnt = 0;
+        const auto batch_size = max_batch_size;
+
+        while (probe_idx < probe_rows && matched_cnt < batch_size) {
+            probe_idxs[matched_cnt] = probe_idx++;
+            if constexpr (is_mark_join) {
+                build_idxs[matched_cnt] = 0;
+            }
+            ++matched_cnt;
+        }
+
+        if constexpr (is_mark_join && !with_other_conjuncts) {
+            mark_column->resize_fill(matched_cnt, 1);
+        }
+
+        return std::tuple {probe_idx, 0U, matched_cnt};
+    }
+
     auto _find_batch_right_semi_anti(const Key* __restrict keys,
                                      const uint32_t* __restrict build_idx_map, 
int probe_idx,
                                      int probe_rows) {
@@ -532,6 +566,7 @@ private:
     Cell cell;
     doris::vectorized::Arena* pool;
     bool _has_null_key = false;
+    bool _empty_build_side = true;
 };
 
 template <typename Key, typename Mapped, typename Hash = DefaultHash<Key>,
diff --git a/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out b/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
index d33e4e2947f..09d7d231709 100644
--- a/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
+++ b/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
@@ -9,3 +9,10 @@
 
 -- !select --
 
+-- !anti_emtpy_right --
+\N
+1
+3
+
+-- !semi_emtpy_right --
+
diff --git a/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy b/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
index f732b6bda58..6083290b2e5 100644
--- a/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
+++ b/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
@@ -60,11 +60,21 @@ suite("test_null_aware_left_anti_join") {
     sql """ set parallel_pipeline_task_num=2; """
     qt_select """ select ${tableName2}.k1 from ${tableName2} where k1 not in 
(select ${tableName1}.k1 from ${tableName1}) order by ${tableName2}.k1; """
 
-    sql """
-        drop table if exists ${tableName2};
+    // In left anti join, if right side is empty, all rows(null included) of 
left should be output.
+    qt_anti_emtpy_right """
+        select
+            *
+        from ${tableName1} t1 where k1 not in (
+            select k1 from ${tableName2} t2 where t2.k1 > 2
+        ) order by 1;
     """
 
-    sql """
-        drop table if exists ${tableName1};
+    // In left semi join, if right side is empty, no row should be output.
+    qt_semi_emtpy_right """
+        select
+            *
+        from ${tableName1} t1 where k1 in (
+            select k1 from ${tableName2} t2 where t2.k1 > 2
+        ) order by 1;
     """
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to