This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c03a19ea23  [improvement](bitmap) Using set to store a small number of 
elements to improve performance (#19973)
c03a19ea23 is described below

commit c03a19ea23cd5ac1b75a0d130709a7bc6303073c
Author: Jerry Hu <mrh...@gmail.com>
AuthorDate: Wed May 31 16:13:42 2023 +0800

     [improvement](bitmap) Using set to store a small number of elements to 
improve performance (#19973)
    
    Test on SSB 100g:
    
    select lo_suppkey, count(distinct lo_linenumber) from lineorder group by 
lo_suppkey;
    exec time: 4.388s
    
    create materialized view:
    
    create materialized view customer_uv as select lo_suppkey, 
bitmap_union(to_bitmap(lo_linenumber)) from lineorder group by lo_suppkey;
    select lo_suppkey, count(distinct lo_linenumber) from lineorder group by 
lo_suppkey;
    exec time: 12.908s
    
    test with the patch, exec time: 5.790s
---
 be/src/common/config.cpp                 |   3 +
 be/src/common/config.h                   |   3 +
 be/src/util/bitmap_value.h               | 738 ++++++++++++++++++++++++++++---
 be/src/vec/functions/function_bitmap.cpp |  10 +-
 be/test/util/bitmap_value_test.cpp       |  22 -
 be/test/vec/core/column_complex_test.cpp |   5 +-
 regression-test/pipeline/p0/conf/be.conf |   1 +
 regression-test/pipeline/p1/conf/be.conf |   1 +
 8 files changed, 688 insertions(+), 95 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 90478001dc..04c29e8dd7 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1010,6 +1010,9 @@ DEFINE_mInt32(schema_cache_sweep_time_sec, "100");
 // enable feature binlog, default false
 DEFINE_Bool(enable_feature_binlog, "false");
 
+// enable set in BitmapValue
+DEFINE_Bool(enable_set_in_bitmap_value, "false");
+
 #ifdef BE_TEST
 // test s3
 DEFINE_String(test_s3_resource, "resource");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index e06c40afbe..c1d51cfb3b 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1025,6 +1025,9 @@ DECLARE_mInt32(schema_cache_sweep_time_sec);
 // enable binlog
 DECLARE_Bool(enable_feature_binlog);
 
+// enable set in BitmapValue
+DECLARE_Bool(enable_set_in_bitmap_value);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h
index 6b660b8caf..5ada50fb59 100644
--- a/be/src/util/bitmap_value.h
+++ b/be/src/util/bitmap_value.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <parallel_hashmap/btree.h>
+#include <parallel_hashmap/phmap.h>
 
 #include <algorithm>
 #include <cstdarg>
@@ -28,10 +29,12 @@
 #include <new>
 #include <numeric>
 #include <roaring/roaring.hh>
+#include <set>
 #include <stdexcept>
 #include <string>
 #include <utility>
 
+#include "common/config.h"
 #include "common/logging.h"
 #include "gutil/integral_types.h"
 #include "udf/udf.h"
@@ -72,10 +75,11 @@ struct BitmapTypeCode {
         // - MapValue := the standard RoaringBitmap format
         //
         // added in 0.12
-        BITMAP64 = 4
+        BITMAP64 = 4,
+        SET = 5
     };
     Status static inline validate(int bitmap_type) {
-        if (UNLIKELY(bitmap_type < type::EMPTY || bitmap_type > 
type::BITMAP64)) {
+        if (UNLIKELY(bitmap_type < type::EMPTY || bitmap_type > type::SET)) {
             std::string err_msg =
                     fmt::format("BitmapTypeCode invalid, should between: {} 
and {} actrual is {}",
                                 BitmapTypeCode::EMPTY, 
BitmapTypeCode::BITMAP64, bitmap_type);
@@ -1143,6 +1147,9 @@ inline Roaring64MapSetBitForwardIterator 
Roaring64Map::end() const {
 class BitmapValueIterator;
 class BitmapValue {
 public:
+    template <typename T>
+    using SetContainer = phmap::flat_hash_set<T>;
+
     // Construct an empty bitmap.
     BitmapValue() : _type(EMPTY), _is_shared(false) {}
 
@@ -1150,7 +1157,7 @@ public:
     explicit BitmapValue(uint64_t value) : _sv(value), _type(SINGLE), 
_is_shared(false) {}
 
     // Construct a bitmap from serialized data.
-    explicit BitmapValue(const char* src) {
+    explicit BitmapValue(const char* src) : _is_shared(false) {
         bool res = deserialize(src);
         DCHECK(res);
     }
@@ -1159,6 +1166,7 @@ public:
         _type = other._type;
         _sv = other._sv;
         _bitmap = other._bitmap;
+        _set = other._set;
         _is_shared = true;
         // should also set other's state to shared, so that other bitmap value 
will
         // create a new bitmap when it wants to modify it.
@@ -1170,6 +1178,7 @@ public:
         _sv = other._sv;
         _is_shared = other._is_shared;
         _bitmap = std::move(other._bitmap);
+        _set = std::move(other._set);
     }
 
     BitmapValue& operator=(const BitmapValue& other) {
@@ -1177,6 +1186,7 @@ public:
         _sv = other._sv;
         _bitmap = other._bitmap;
         _is_shared = true;
+        _set = other._set;
         // should also set other's state to shared, so that other bitmap value 
will
         // create a new bitmap when it wants to modify it.
         const_cast<BitmapValue&>(other)._is_shared = true;
@@ -1191,47 +1201,71 @@ public:
         _type = other._type;
         _sv = other._sv;
         _bitmap = std::move(other._bitmap);
+        _set = std::move(other._set);
         _is_shared = other._is_shared;
         return *this;
     }
 
     // Construct a bitmap from given elements.
     explicit BitmapValue(const std::vector<uint64_t>& bits) : 
_is_shared(false) {
-        switch (bits.size()) {
-        case 0:
+        if (bits.size() == 0) {
             _type = EMPTY;
-            break;
-        case 1:
+            return;
+        }
+
+        if (bits.size() == 1 && !config::enable_set_in_bitmap_value) {
             _type = SINGLE;
             _sv = bits[0];
-            break;
-        default:
+            return;
+        }
+
+        if (!config::enable_set_in_bitmap_value || bits.size() > 
SET_TYPE_THRESHOLD) {
             _type = BITMAP;
             _prepare_bitmap_for_write();
             _bitmap->addMany(bits.size(), &bits[0]);
+        } else {
+            _type = SET;
+            for (auto v : bits) {
+                _set.insert(v);
+            }
         }
     }
 
     void add(uint64_t value) {
         switch (_type) {
         case EMPTY:
-            _sv = value;
-            _type = SINGLE;
+            if (!config::enable_set_in_bitmap_value) {
+                _sv = value;
+                _type = SINGLE;
+            } else {
+                _set.insert(value);
+                _type = SET;
+            }
             break;
         case SINGLE:
             //there is no need to convert the type if two variables are equal
             if (_sv == value) {
                 break;
             }
-            _prepare_bitmap_for_write();
-            _bitmap->add(_sv);
-            _bitmap->add(value);
-            _type = BITMAP;
+            if (config::enable_set_in_bitmap_value) {
+                _set.insert(_sv);
+                _set.insert(value);
+                _type = SET;
+            } else {
+                _prepare_bitmap_for_write();
+                _bitmap->add(_sv);
+                _bitmap->add(value);
+                _type = BITMAP;
+            }
             break;
         case BITMAP:
             _prepare_bitmap_for_write();
             _bitmap->add(value);
             break;
+        case SET:
+            _set.insert(value);
+            _convert_to_bitmap_if_need();
+            break;
         }
     }
 
@@ -1249,6 +1283,11 @@ public:
             _prepare_bitmap_for_write();
             _bitmap->remove(value);
             _convert_to_smaller_type();
+            break;
+        case SET:
+            _set.erase(value);
+            _convert_to_smaller_type();
+            break;
         }
     }
 
@@ -1274,8 +1313,50 @@ public:
                 *_bitmap -= *rhs._bitmap;
                 _convert_to_smaller_type();
                 break;
+            case SET: {
+                for (auto it = _set.begin(); it != _set.end();) {
+                    if (rhs.contains(*it)) {
+                        it = _set.erase(it);
+                    } else {
+                        ++it;
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
+            }
             }
             break;
+        case SET: {
+            switch (_type) {
+            case EMPTY:
+                break;
+            case SINGLE:
+                if (rhs._set.contains(_sv)) {
+                    _type = EMPTY;
+                }
+                break;
+            case BITMAP:
+                _prepare_bitmap_for_write();
+                for (auto v : rhs._set) {
+                    if (_bitmap->contains(v)) {
+                        _bitmap->remove(v);
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
+            case SET: {
+                for (auto it = _set.begin(); it != _set.end();) {
+                    if (rhs.contains(*it)) {
+                        it = _set.erase(it);
+                    } else {
+                        ++it;
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
+            }
+            }
+        }
         }
         return *this;
     }
@@ -1311,6 +1392,53 @@ public:
             case BITMAP:
                 _prepare_bitmap_for_write();
                 *_bitmap |= *rhs._bitmap;
+                break;
+            case SET: {
+                _prepare_bitmap_for_write();
+                *_bitmap = *rhs._bitmap;
+
+                for (auto v : _set) {
+                    _bitmap->add(v);
+                }
+                _type = BITMAP;
+                break;
+            }
+            }
+            break;
+        case SET:
+            switch (_type) {
+            case EMPTY:
+                _set = rhs._set;
+                _type = SET;
+                break;
+            case SINGLE: {
+                if ((rhs._set.size() + rhs._set.contains(_sv) > 
SET_TYPE_THRESHOLD)) {
+                    _prepare_bitmap_for_write();
+                    _bitmap->add(_sv);
+                    for (auto v : rhs._set) {
+                        _bitmap->add(v);
+                    }
+                    _type = BITMAP;
+                } else {
+                    _set = rhs._set;
+                    _set.insert(_sv);
+                    _type = SET;
+                }
+                break;
+            }
+            case BITMAP:
+                _prepare_bitmap_for_write();
+                for (auto v : rhs._set) {
+                    _bitmap->add(v);
+                }
+                break;
+            case SET: {
+                for (auto v : rhs._set) {
+                    _set.insert(v);
+                }
+                _convert_to_bitmap_if_need();
+                break;
+            }
             }
             break;
         }
@@ -1320,6 +1448,7 @@ public:
     BitmapValue& fastunion(const std::vector<const BitmapValue*>& values) {
         std::vector<const detail::Roaring64Map*> bitmaps;
         std::vector<uint64_t> single_values;
+        std::vector<const SetContainer<uint64_t>*> sets;
         for (int i = 0; i < values.size(); ++i) {
             auto* value = values[i];
             switch (value->_type) {
@@ -1331,6 +1460,9 @@ public:
             case BITMAP:
                 bitmaps.push_back(value->_bitmap.get());
                 break;
+            case SET:
+                sets.push_back(&value->_set);
+                break;
             }
         }
 
@@ -1339,29 +1471,95 @@ public:
             switch (_type) {
             case EMPTY:
                 *_bitmap = detail::Roaring64Map::fastunion(bitmaps.size(), 
bitmaps.data());
-                _type = BITMAP;
                 break;
             case SINGLE:
                 *_bitmap = detail::Roaring64Map::fastunion(bitmaps.size(), 
bitmaps.data());
                 _bitmap->add(_sv);
-                _type = BITMAP;
                 break;
             case BITMAP:
                 *_bitmap |= detail::Roaring64Map::fastunion(bitmaps.size(), 
bitmaps.data());
                 break;
+            case SET: {
+                *_bitmap = detail::Roaring64Map::fastunion(bitmaps.size(), 
bitmaps.data());
+                for (auto v : _set) {
+                    _bitmap->add(v);
+                }
+                _set.clear();
+                break;
+            }
+            }
+            _type = BITMAP;
+        }
+
+        if (!sets.empty()) {
+            for (auto& set : sets) {
+                for (auto v : *set) {
+                    _set.insert(v);
+                }
+            }
+            switch (_type) {
+            case EMPTY:
+                _type = SET;
+                break;
+            case SINGLE: {
+                _set.insert(_sv);
+                _type = SET;
+                _convert_to_bitmap_if_need();
+                break;
+            }
+            case BITMAP:
+                _prepare_bitmap_for_write();
+                for (auto v : _set) {
+                    _bitmap->add(v);
+                }
+                _type = BITMAP;
+                break;
+            case SET: {
+                _convert_to_bitmap_if_need();
+                break;
+            }
             }
         }
 
         if (_type == EMPTY && single_values.size() == 1) {
-            _sv = single_values[0];
-            _type = SINGLE;
+            if (config::enable_set_in_bitmap_value) {
+                _type = SET;
+                _set.insert(single_values[0]);
+            } else {
+                _sv = single_values[0];
+                _type = SINGLE;
+            }
         } else if (!single_values.empty()) {
-            _prepare_bitmap_for_write();
-            _bitmap->addMany(single_values.size(), single_values.data());
-            if (_type == SINGLE) {
-                _bitmap->add(_sv);
+            switch (_type) {
+            case EMPTY:
+            case SINGLE:
+                if (config::enable_set_in_bitmap_value) {
+                    _set.insert(single_values.cbegin(), single_values.cend());
+                    if (_type == SINGLE) {
+                        _set.insert(_sv);
+                    }
+                    _type = SET;
+                    _convert_to_bitmap_if_need();
+                } else {
+                    _prepare_bitmap_for_write();
+                    _bitmap->addMany(single_values.size(), 
single_values.data());
+                    if (_type == SINGLE) {
+                        _bitmap->add(_sv);
+                    }
+                    _type = BITMAP;
+                    _convert_to_smaller_type();
+                }
+                break;
+            case BITMAP: {
+                _prepare_bitmap_for_write();
+                _bitmap->addMany(single_values.size(), single_values.data());
+                break;
+            }
+            case SET:
+                _set.insert(single_values.cbegin(), single_values.cend());
+                _convert_to_bitmap_if_need();
+                break;
             }
-            _type = BITMAP;
         }
 
         return *this;
@@ -1396,6 +1594,15 @@ public:
                 }
                 _bitmap.reset();
                 break;
+            case SET:
+                if (!_set.contains(rhs._sv)) {
+                    _type = EMPTY;
+                } else {
+                    _type = SINGLE;
+                    _sv = rhs._sv;
+                }
+                _set.clear();
+                break;
             }
             break;
         case BITMAP:
@@ -1412,6 +1619,48 @@ public:
                 *_bitmap &= *rhs._bitmap;
                 _convert_to_smaller_type();
                 break;
+            case SET:
+                for (auto it = _set.begin(); it != _set.end();) {
+                    if (!rhs._bitmap->contains(*it)) {
+                        it = _set.erase(it);
+                    } else {
+                        ++it;
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
+            }
+            break;
+        case SET:
+            switch (_type) {
+            case EMPTY:
+                break;
+            case SINGLE:
+                if (!rhs._set.contains(_sv)) {
+                    _type = EMPTY;
+                }
+                break;
+            case BITMAP:
+                _prepare_bitmap_for_write();
+                for (auto v : rhs._set) {
+                    if (_bitmap->contains(v)) {
+                        _set.insert(v);
+                    }
+                }
+                _type = SET;
+                _bitmap.reset();
+                _convert_to_smaller_type();
+                break;
+            case SET:
+                for (auto it = _set.begin(); it != _set.end();) {
+                    if (!rhs._set.contains(*it)) {
+                        it = _set.erase(it);
+                    } else {
+                        ++it;
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
             }
             break;
         }
@@ -1448,6 +1697,13 @@ public:
                     _bitmap->remove(rhs._sv);
                 }
                 break;
+            case SET:
+                if (!_set.contains(rhs._sv)) {
+                    _set.insert(rhs._sv);
+                } else {
+                    _set.erase(rhs._sv);
+                }
+                break;
             }
             break;
         case BITMAP:
@@ -1475,6 +1731,57 @@ public:
                 *_bitmap ^= *rhs._bitmap;
                 _convert_to_smaller_type();
                 break;
+            case SET:
+                _prepare_bitmap_for_write();
+                *_bitmap = *rhs._bitmap;
+                for (auto v : _set) {
+                    if (_bitmap->contains(v)) {
+                        _bitmap->remove(v);
+                    } else {
+                        _bitmap->add(v);
+                    }
+                }
+                _type = BITMAP;
+                _convert_to_smaller_type();
+                break;
+            }
+            break;
+        case SET:
+            switch (_type) {
+            case EMPTY:
+                _set = rhs._set;
+                _type = SET;
+                break;
+            case SINGLE:
+                _set = rhs._set;
+                if (!rhs._set.contains(_sv)) {
+                    _set.insert(_sv);
+                } else {
+                    _set.erase(_sv);
+                }
+                _type = SET;
+                break;
+            case BITMAP:
+                _prepare_bitmap_for_write();
+                for (auto v : rhs._set) {
+                    if (_bitmap->contains(v)) {
+                        _bitmap->remove(v);
+                    } else {
+                        _bitmap->add(v);
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
+            case SET:
+                for (auto v : rhs._set) {
+                    if (_set.contains(v)) {
+                        _set.erase(v);
+                    } else {
+                        _set.insert(v);
+                    }
+                }
+                _convert_to_smaller_type();
+                break;
             }
             break;
         }
@@ -1490,6 +1797,8 @@ public:
             return _sv == x;
         case BITMAP:
             return _bitmap->contains(x);
+        case SET:
+            return _set.contains(x);
         }
         return false;
     }
@@ -1505,6 +1814,8 @@ public:
             return 1;
         case BITMAP:
             return _bitmap->cardinality();
+        case SET:
+            return _set.size();
         }
         return 0;
     }
@@ -1521,7 +1832,10 @@ public:
                 return _sv == rhs._sv;
             case BITMAP:
                 return _bitmap->contains(rhs._sv);
+            case SET:
+                return _set.contains(rhs._sv);
             }
+            break;
         case BITMAP:
             switch (_type) {
             case EMPTY:
@@ -1530,6 +1844,41 @@ public:
                 return rhs._bitmap->contains(_sv);
             case BITMAP:
                 return _bitmap->andCardinality(*rhs._bitmap);
+            case SET: {
+                uint64_t cardinality = 0;
+                for (auto v : _set) {
+                    if (_bitmap->contains(v)) {
+                        ++cardinality;
+                    }
+                }
+                return cardinality;
+            }
+            }
+            break;
+        case SET:
+            switch (_type) {
+            case EMPTY:
+                return 0;
+            case SINGLE:
+                return rhs._set.contains(_sv);
+            case BITMAP: {
+                uint64_t cardinality = 0;
+                for (auto v : rhs._set) {
+                    if (_bitmap->contains(v)) {
+                        ++cardinality;
+                    }
+                }
+                return cardinality;
+            }
+            case SET: {
+                uint64_t cardinality = 0;
+                for (auto v : _set) {
+                    if (rhs._set.contains(v)) {
+                        ++cardinality;
+                    }
+                }
+                return cardinality;
+            }
             }
         }
         return 0;
@@ -1547,7 +1896,10 @@ public:
                 return 1 + (_sv != rhs._sv);
             case BITMAP:
                 return cardinality() + !_bitmap->contains(rhs._sv);
+            case SET:
+                return _set.size() + !_set.contains(rhs._sv);
             }
+            break;
         case BITMAP:
             switch (_type) {
             case EMPTY:
@@ -1556,33 +1908,41 @@ public:
                 return rhs.cardinality() + !rhs._bitmap->contains(_sv);
             case BITMAP:
                 return _bitmap->orCardinality(*rhs._bitmap);
+            case SET: {
+                uint64_t cardinality = rhs._bitmap->cardinality();
+                for (auto v : _set) {
+                    if (!rhs._bitmap->contains(v)) {
+                        ++cardinality;
+                    }
+                }
+                return cardinality;
             }
-        }
-        return 0;
-    }
-
-    uint64_t xor_cardinality(const BitmapValue& rhs) const {
-        switch (rhs._type) {
-        case EMPTY:
-            return cardinality();
-        case SINGLE:
-            switch (_type) {
-            case EMPTY:
-                return 1;
-            case SINGLE:
-                return 2 - 2 * (_sv == rhs._sv);
-            case BITMAP:
-                return cardinality() + 1 - 2 * (_bitmap->contains(rhs._sv));
             }
             break;
-        case BITMAP:
+        case SET:
             switch (_type) {
             case EMPTY:
                 return rhs.cardinality();
             case SINGLE:
-                return rhs.cardinality() + 1 - 2 * 
(rhs._bitmap->contains(_sv));
-            case BITMAP:
-                return _bitmap->xorCardinality(*rhs._bitmap);
+                return rhs.cardinality() + !rhs._set.contains(_sv);
+            case BITMAP: {
+                uint64_t cardinality = _bitmap->cardinality();
+                for (auto v : rhs._set) {
+                    if (!_bitmap->contains(v)) {
+                        ++cardinality;
+                    }
+                }
+                return cardinality;
+            }
+            case SET: {
+                uint64_t cardinality = _set.size();
+                for (auto v : _set) {
+                    if (!rhs._set.contains(v)) {
+                        ++cardinality;
+                    }
+                }
+                return cardinality;
+            }
             }
         }
         return 0;
@@ -1600,6 +1960,8 @@ public:
                 return 1 - _sv == rhs._sv;
             case BITMAP:
                 return cardinality() - _bitmap->contains(rhs._sv);
+            case SET:
+                return cardinality() - _set.contains(rhs._sv);
             }
             break;
         case BITMAP:
@@ -1610,6 +1972,41 @@ public:
                 return !rhs._bitmap->contains(_sv);
             case BITMAP:
                 return _bitmap->andnotCardinality(*rhs._bitmap);
+            case SET: {
+                uint64_t cardinality = _set.size();
+                for (auto v : _set) {
+                    if (rhs._bitmap->contains(v)) {
+                        cardinality -= 1;
+                    }
+                }
+                return cardinality;
+            }
+            }
+            break;
+        case SET:
+            switch (_type) {
+            case EMPTY:
+                return 0;
+            case SINGLE:
+                return !rhs._set.contains(_sv);
+            case BITMAP: {
+                uint64_t cardinality = _bitmap->cardinality();
+                for (auto v : rhs._set) {
+                    if (_bitmap->contains(v)) {
+                        cardinality -= 1;
+                    }
+                }
+                return cardinality;
+            }
+            case SET: {
+                uint64_t cardinality = _set.size();
+                for (auto v : rhs._set) {
+                    if (_set.contains(v)) {
+                        cardinality -= 1;
+                    }
+                }
+                return cardinality;
+            }
             }
         }
         return 0;
@@ -1635,6 +2032,10 @@ public:
             _bitmap->shrinkToFit();
             res = _bitmap->getSizeInBytes();
             break;
+        case SET:
+            /// 1 byte for type, 1 byte for count
+            res = 2 + sizeof(uint64_t) * _set.size();
+            break;
         }
         return res;
     }
@@ -1655,6 +2056,17 @@ public:
                 encode_fixed64_le(reinterpret_cast<uint8_t*>(dst), _sv);
             }
             break;
+        case SET:
+            DCHECK(config::enable_set_in_bitmap_value);
+            *dst = BitmapTypeCode::SET;
+            ++dst;
+            *dst = static_cast<uint8_t>(_set.size());
+            ++dst;
+            for (auto v : _set) {
+                encode_fixed64_le(reinterpret_cast<uint8_t*>(dst), v);
+                dst += sizeof(uint64_t);
+            }
+            break;
         case BITMAP:
             _bitmap->write(dst);
             break;
@@ -1671,10 +2083,18 @@ public:
         case BitmapTypeCode::SINGLE32:
             _type = SINGLE;
             _sv = decode_fixed32_le(reinterpret_cast<const uint8_t*>(src + 1));
+            if (config::enable_set_in_bitmap_value) {
+                _type = SET;
+                _set.insert(_sv);
+            }
             break;
         case BitmapTypeCode::SINGLE64:
             _type = SINGLE;
             _sv = decode_fixed64_le(reinterpret_cast<const uint8_t*>(src + 1));
+            if (config::enable_set_in_bitmap_value) {
+                _type = SET;
+                _set.insert(_sv);
+            }
             break;
         case BitmapTypeCode::BITMAP32:
         case BitmapTypeCode::BITMAP64:
@@ -1682,9 +2102,30 @@ public:
             _prepare_bitmap_for_write();
             *_bitmap = detail::Roaring64Map::read(src);
             break;
+        case BitmapTypeCode::SET: {
+            _type = SET;
+            ++src;
+            uint8_t count = *src;
+            ++src;
+            CHECK(count <= SET_TYPE_THRESHOLD) << "bitmap value with incorrect 
set count";
+            for (uint8_t i = 0; i != count; ++i, src += sizeof(uint64_t)) {
+                _set.insert(decode_fixed64_le(reinterpret_cast<const 
uint8_t*>(src)));
+            }
+            CHECK_EQ(count, _set.size()) << "bitmap value with incorrect set 
count";
+
+            if (!config::enable_set_in_bitmap_value) {
+                _prepare_bitmap_for_write();
+                for (auto v : _set) {
+                    _bitmap->add(v);
+                }
+                _type = BITMAP;
+                _set.clear();
+            }
+            break;
+        }
         default:
             LOG(ERROR) << "BitmapTypeCode invalid, should between: " << 
BitmapTypeCode::EMPTY
-                       << " and " << BitmapTypeCode::BITMAP64 << " actrual is "
+                       << " and " << BitmapTypeCode::BITMAP64 << " actual is "
                        << static_cast<int>(*src);
             return false;
         }
@@ -1697,6 +2138,8 @@ public:
             return _sv;
         case BITMAP:
             return _bitmap->minimum();
+        case SET:
+            return _min_in_set();
         default:
             return 0;
         }
@@ -1732,6 +2175,26 @@ public:
                     &iter_ctx);
             break;
         }
+        case SET: {
+            struct IterCtx {
+                std::stringstream* ss = nullptr;
+                bool first = true;
+            } iter_ctx;
+            iter_ctx.ss = &ss;
+
+            std::vector<uint64_t> values(_set.begin(), _set.end());
+            std::sort(values.begin(), values.end());
+
+            for (auto v : values) {
+                if (iter_ctx.first) {
+                    iter_ctx.first = false;
+                } else {
+                    (*iter_ctx.ss) << ",";
+                }
+                (*iter_ctx.ss) << v;
+            }
+            break;
+        }
         }
         return ss.str();
     }
@@ -1742,17 +2205,29 @@ public:
             return _sv;
         case BITMAP:
             return _bitmap->maximum();
+        case SET:
+            return _max_in_set();
         default:
             return 0;
         }
     }
 
     uint64_t max(bool* empty) const {
-        return min_or_max(empty, [&]() { return _bitmap->maximum(); });
+        return min_or_max(empty, [&]() { return maximum(); });
     }
 
     uint64_t min(bool* empty) const {
-        return min_or_max(empty, [&]() { return _bitmap->minimum(); });
+        return min_or_max(empty, [&]() { return minimum(); });
+    }
+
+    uint64_t _min_in_set() const {
+        DCHECK_EQ(_type, SET);
+        return *std::min_element(_set.begin(), _set.end());
+    }
+
+    uint64_t _max_in_set() const {
+        DCHECK_EQ(_type, SET);
+        return *std::max_element(_set.begin(), _set.end());
     }
 
     bool empty() const { return _type == EMPTY; }
@@ -1789,6 +2264,19 @@ public:
             }
             return count;
         }
+        case SET: {
+            int64_t count = 0;
+            std::vector<uint64_t> values(_set.begin(), _set.end());
+            std::sort(values.begin(), values.end());
+            for (auto it = values.begin(); it != values.end(); ++it) {
+                if (*it < range_start || *it >= range_end) {
+                    continue;
+                }
+                ret_bitmap->add(*it);
+                ++count;
+            }
+            return count;
+        }
         }
         return 0;
     }
@@ -1828,6 +2316,24 @@ public:
             }
             return count;
         }
+        case SET: {
+            int64_t count = 0;
+
+            std::vector<uint64_t> values(_set.begin(), _set.end());
+            std::sort(values.begin(), values.end());
+            for (auto it = values.begin(); it != values.end(); ++it) {
+                if (*it < range_start) {
+                    continue;
+                }
+                if (count < cardinality_limit) {
+                    ret_bitmap->add(*it);
+                    ++count;
+                } else {
+                    break;
+                }
+            }
+            return count;
+        }
         }
         return 0;
     }
@@ -1850,27 +2356,57 @@ public:
                 return 0;
             }
         }
-        case BITMAP: {
+        default:
+            break;
+        }
+        if (_type == BITMAP) {
             if (std::abs(offset) >= _bitmap->cardinality()) {
                 return 0;
             }
-        }
-        }
-        int64_t abs_offset = offset;
-        if (offset < 0) {
-            abs_offset = _bitmap->cardinality() + offset;
-        }
+            int64_t abs_offset = offset;
+            if (offset < 0) {
+                abs_offset = _bitmap->cardinality() + offset;
+            }
 
-        int64_t count = 0;
-        int64_t offset_count = 0;
-        auto it = _bitmap->begin();
-        for (; it != _bitmap->end() && offset_count < abs_offset; ++it) {
-            ++offset_count;
-        }
-        for (; it != _bitmap->end() && count < limit; ++it, ++count) {
-            ret_bitmap->add(*it);
+            int64_t count = 0;
+            int64_t offset_count = 0;
+            auto it = _bitmap->begin();
+            for (; it != _bitmap->end() && offset_count < abs_offset; ++it) {
+                ++offset_count;
+            }
+            for (; it != _bitmap->end() && count < limit; ++it, ++count) {
+                ret_bitmap->add(*it);
+            }
+            return count;
+        } else {
+            if (std::abs(offset) > _set.size()) {
+                return 0;
+            }
+
+            int64_t abs_offset = offset;
+            if (offset < 0) {
+                abs_offset = _set.size() + offset;
+            }
+
+            std::vector<uint64_t> values(_set.begin(), _set.end());
+            std::sort(values.begin(), values.end());
+
+            int64_t count = 0;
+            size_t index = 0;
+            for (auto v : values) {
+                if (index < abs_offset) {
+                    ++index;
+                    continue;
+                }
+                if (count == limit || index == values.size()) {
+                    break;
+                }
+                ++count;
+                ++index;
+                ret_bitmap->add(v);
+            }
+            return count;
         }
-        return count;
     }
 
     //for function bitmap_to_array
@@ -1888,6 +2424,14 @@ public:
             }
             break;
         }
+        case SET: {
+            std::vector<uint64_t> values(_set.begin(), _set.end());
+            std::sort(values.begin(), values.end());
+            for (auto v : values) {
+                data.emplace_back(v);
+            }
+            break;
+        }
         }
     }
 
@@ -1909,14 +2453,29 @@ private:
     void _convert_to_smaller_type() {
         if (_type == BITMAP) {
             uint64_t c = _bitmap->cardinality();
-            if (c > 1) return;
+            if (config::enable_set_in_bitmap_value && c > SET_TYPE_THRESHOLD) {
+                return;
+            } else if (c > 1) {
+                return;
+            }
             if (c == 0) {
                 _type = EMPTY;
-            } else {
+            } else if (c == 1 && !config::enable_set_in_bitmap_value) {
                 _type = SINGLE;
                 _sv = _bitmap->minimum();
+            } else {
+                _type = SET;
+                for (auto v : *_bitmap) {
+                    _set.insert(v);
+                }
             }
             _bitmap.reset();
+        } else if (_type == SET) {
+            if (_set.size() == 1 && !config::enable_set_in_bitmap_value) {
+                _type = SINGLE;
+                _sv = *_set.begin();
+                _set.clear();
+            }
         }
     }
 
@@ -1928,6 +2487,7 @@ private:
             result = _sv;
             break;
         case BITMAP:
+        case SET:
             result = func();
             break;
         default:
@@ -1959,16 +2519,31 @@ private:
         _is_shared = false;
     }
 
+    void _convert_to_bitmap_if_need() {
+        if (_type != SET || _set.size() <= SET_TYPE_THRESHOLD) {
+            return;
+        }
+        _prepare_bitmap_for_write();
+        for (auto v : _set) {
+            _bitmap->add(v);
+        }
+        _type = BITMAP;
+        _set.clear();
+    }
+
     enum BitmapDataType {
         EMPTY = 0,
         SINGLE = 1, // single element
-        BITMAP = 2  // more than one elements
+        BITMAP = 2, // more than one elements
+        SET = 3     // elements count less or equal than 32
     };
     uint64_t _sv = 0;                              // store the single value 
when _type == SINGLE
     std::shared_ptr<detail::Roaring64Map> _bitmap; // used when _type == BITMAP
+    SetContainer<uint64_t> _set;
     BitmapDataType _type {EMPTY};
     // Indicate whether the state is shared among multi BitmapValue object
     bool _is_shared = true;
+    static constexpr uint64_t SET_TYPE_THRESHOLD = 32;
 };
 
 // A simple implement of bitmap value iterator(Read only)
@@ -1992,6 +2567,9 @@ public:
         case BitmapValue::BitmapDataType::BITMAP:
             _iter = new 
detail::Roaring64MapSetBitForwardIterator(*_bitmap._bitmap, _end);
             break;
+        case BitmapValue::BitmapDataType::SET:
+            _set_iter = _end ? _bitmap._set.end() : _bitmap._set.begin();
+            break;
         default:
             CHECK(false) << _bitmap._type;
         }
@@ -2000,6 +2578,9 @@ public:
     BitmapValueIterator(const BitmapValueIterator& other)
             : _bitmap(other._bitmap), _sv(other._sv), _end(other._end) {
         _iter = other._iter ? new 
detail::Roaring64MapSetBitForwardIterator(*other._iter) : nullptr;
+        if (_bitmap._type == BitmapValue::BitmapDataType::SET) {
+            _set_iter = other._set_iter;
+        }
     }
 
     ~BitmapValueIterator() {
@@ -2016,6 +2597,9 @@ public:
             return _sv;
         case BitmapValue::BitmapDataType::BITMAP:
             return *(*_iter);
+        case BitmapValue::BitmapDataType::SET: {
+            return *_set_iter;
+        }
         default:
             CHECK(false) << _bitmap._type;
         }
@@ -2031,6 +2615,9 @@ public:
         case BitmapValue::BitmapDataType::BITMAP:
             ++(*_iter);
             break;
+        case BitmapValue::BitmapDataType::SET:
+            ++_set_iter;
+            break;
         default:
             CHECK(false) << _bitmap._type;
         }
@@ -2047,6 +2634,9 @@ public:
         case BitmapValue::BitmapDataType::BITMAP:
             ++(*_iter);
             break;
+        case BitmapValue::BitmapDataType::SET:
+            ++_set_iter;
+            break;
         default:
             CHECK(false) << _bitmap._type;
         }
@@ -2065,6 +2655,8 @@ public:
             return _end == other._end && _sv == other._sv;
         case BitmapValue::BitmapDataType::BITMAP:
             return *_iter == *(other._iter);
+        case BitmapValue::BitmapDataType::SET:
+            return _set_iter == other._set_iter;
         default:
             CHECK(false) << _bitmap._type;
         }
@@ -2088,6 +2680,10 @@ public:
                 _end = true;
             }
             break;
+        case BitmapValue::BitmapDataType::SET: {
+            LOG(FATAL) << "BitmapValue with set do not support move";
+            break;
+        }
         default:
             break;
         }
@@ -2097,6 +2693,7 @@ public:
 private:
     const BitmapValue& _bitmap;
     detail::Roaring64MapSetBitForwardIterator* _iter = nullptr;
+    BitmapValue::SetContainer<uint64_t>::const_iterator _set_iter;
     uint64_t _sv = 0;
     bool _end = false;
 };
@@ -2117,6 +2714,15 @@ inline bool BitmapValue::contains_any(uint64_t left, 
uint64_t right) const {
     if (left > right) {
         return false;
     }
+
+    if (_type == SET) {
+        for (auto v : _set) {
+            if (v >= left && v <= right) {
+                return true;
+            }
+        }
+        return false;
+    }
     auto it = lower_bound(left);
     return it != end() && *it <= right;
 }
diff --git a/be/src/vec/functions/function_bitmap.cpp 
b/be/src/vec/functions/function_bitmap.cpp
index 74035f6f7d..8d252f77f2 100644
--- a/be/src/vec/functions/function_bitmap.cpp
+++ b/be/src/vec/functions/function_bitmap.cpp
@@ -127,14 +127,12 @@ struct ToBitmap {
             size_t size = col->size();
 
             for (size_t i = 0; i < size; ++i) {
-                if (arg_is_nullable && ((*nullmap)[i])) {
-                    continue;
-                } else {
-                    int64_t int_value = col->get_data()[i];
-                    if (LIKELY(int_value >= 0)) {
-                        res_data[i].add(int_value);
+                if constexpr (arg_is_nullable) {
+                    if ((*nullmap)[i]) {
+                        continue;
                     }
                 }
+                res_data[i].add(col->get_data()[i]);
             }
         }
     }
diff --git a/be/test/util/bitmap_value_test.cpp 
b/be/test/util/bitmap_value_test.cpp
index 15b275e5aa..dc9908a582 100644
--- a/be/test/util/bitmap_value_test.cpp
+++ b/be/test/util/bitmap_value_test.cpp
@@ -221,14 +221,6 @@ TEST(BitmapValueTest, bitmap_serde) {
         BitmapValue bitmap32({0, UINT32_MAX});
         std::string buffer = convert_bitmap_to_string(bitmap32);
 
-        Roaring roaring;
-        roaring.add(0);
-        roaring.add(UINT32_MAX);
-        std::string expect_buffer(1, BitmapTypeCode::BITMAP32);
-        expect_buffer.resize(1 + roaring.getSizeInBytes());
-        roaring.write(&expect_buffer[1]);
-        EXPECT_EQ(expect_buffer, buffer);
-
         BitmapValue out(buffer.data());
         EXPECT_EQ(2, out.cardinality());
         EXPECT_TRUE(out.contains(0));
@@ -250,20 +242,6 @@ TEST(BitmapValueTest, bitmap_serde) {
         BitmapValue bitmap64({0, static_cast<uint64_t>(UINT32_MAX) + 1});
         std::string buffer = convert_bitmap_to_string(bitmap64);
 
-        Roaring roaring;
-        roaring.add(0);
-        std::string expect_buffer(1, BitmapTypeCode::BITMAP64);
-        put_varint64(&expect_buffer, 2); // map size
-        for (uint32_t i = 0; i < 2; ++i) {
-            std::string map_entry;
-            put_fixed32_le(&map_entry, i); // map key
-            map_entry.resize(sizeof(uint32_t) + roaring.getSizeInBytes());
-            roaring.write(&map_entry[4]); // map value
-
-            expect_buffer.append(map_entry);
-        }
-        EXPECT_EQ(expect_buffer, buffer);
-
         BitmapValue out(buffer.data());
         EXPECT_EQ(2, out.cardinality());
         EXPECT_TRUE(out.contains(0));
diff --git a/be/test/vec/core/column_complex_test.cpp 
b/be/test/vec/core/column_complex_test.cpp
index 1be5804a39..2a45eb1b04 100644
--- a/be/test/vec/core/column_complex_test.cpp
+++ b/be/test/vec/core/column_complex_test.cpp
@@ -64,7 +64,10 @@ public:
         for (size_t i = 0; i < l_col.size(); ++i) {
             auto& l_bitmap = const_cast<BitmapValue&>(l_col.get_element(i));
             auto& r_bitmap = const_cast<BitmapValue&>(r_col.get_element(i));
-            ASSERT_EQ(l_bitmap.xor_cardinality(r_bitmap), 0);
+            ASSERT_EQ(l_bitmap.and_cardinality(r_bitmap), 
r_bitmap.cardinality());
+            auto or_cardinality = l_bitmap.or_cardinality(r_bitmap);
+            ASSERT_EQ(or_cardinality, l_bitmap.cardinality());
+            ASSERT_EQ(or_cardinality, r_bitmap.cardinality());
         }
     }
 
diff --git a/regression-test/pipeline/p0/conf/be.conf 
b/regression-test/pipeline/p0/conf/be.conf
index 06c244600d..9e9525d25a 100644
--- a/regression-test/pipeline/p0/conf/be.conf
+++ b/regression-test/pipeline/p0/conf/be.conf
@@ -70,3 +70,4 @@ priority_networks=172.19.0.0/24
 fragment_pool_thread_num_max=5000
 enable_fuzzy_mode=true
 max_depth_of_expr_tree=200
+enable_set_in_bitmap_value=true
diff --git a/regression-test/pipeline/p1/conf/be.conf 
b/regression-test/pipeline/p1/conf/be.conf
index 46e09df1eb..002ded1d55 100644
--- a/regression-test/pipeline/p1/conf/be.conf
+++ b/regression-test/pipeline/p1/conf/be.conf
@@ -67,3 +67,4 @@ disable_auto_compaction=true
 tablet_map_shard_size=256
 fragment_pool_thread_num_max=5000
 enable_fuzzy_mode=true
+enable_set_in_bitmap_value=true


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org


Reply via email to