This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch dev-1.0.1
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git

commit d2a8a399be2024e0429fff677f38a11198f215e3
Author: Luwei <814383...@qq.com>
AuthorDate: Fri May 27 20:44:26 2022 +0800

    [fix] Fix bug of bloom filter hash value calculation error (#9802)
    
    * Fix bug of bloom filter hash value calculation error
    
    * fix code style
---
 be/src/olap/bloom_filter_predicate.h   |  8 +++++---
 be/src/olap/schema.cpp                 |  2 +-
 be/src/vec/columns/column_dictionary.h | 24 ++++++++++++++++++++----
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h
index 01851cfd96..c8fbdab94c 100644
--- a/be/src/olap/bloom_filter_predicate.h
+++ b/be/src/olap/bloom_filter_predicate.h
@@ -119,8 +119,10 @@ void 
BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16
         auto& null_map_data = nullable_col->get_null_map_column().get_data();
         // deal ColumnDict
         if (nullable_col->get_nested_column().is_column_dictionary()) {
-            auto* dict_col = 
vectorized::check_and_get_column<vectorized::ColumnDictI32>(nullable_col->get_nested_column());
-            
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values();
+            auto* dict_col = 
vectorized::check_and_get_column<vectorized::ColumnDictI32>(
+                    nullable_col->get_nested_column());
+            const_cast<vectorized::ColumnDictI32*>(dict_col)
+                    ->generate_hash_values_for_runtime_filter();
             for (uint16_t i = 0; i < *size; i++) {
                 uint16_t idx = sel[i];
                 sel[new_size] = idx;
@@ -139,7 +141,7 @@ void 
BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16
         }
     } else if (column.is_column_dictionary()) {
         auto* dict_col = 
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
-        
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values();
+        
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values_for_runtime_filter();
         for (uint16_t i = 0; i < *size; i++) {
             uint16_t idx = sel[i];
             sel[new_size] = idx;
diff --git a/be/src/olap/schema.cpp b/be/src/olap/schema.cpp
index 6b8354f26a..e30389b7d3 100644
--- a/be/src/olap/schema.cpp
+++ b/be/src/olap/schema.cpp
@@ -205,7 +205,7 @@ vectorized::IColumn::MutablePtr 
Schema::get_predicate_column_ptr(FieldType type)
     case OLAP_FIELD_TYPE_VARCHAR:
     case OLAP_FIELD_TYPE_STRING:
         if (config::enable_low_cardinality_optimize) {
-            return 
doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create();
+            return 
doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create(type);
         }
         return doris::vectorized::PredicateColumnType<StringValue>::create();
 
diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h
index cc27ca1cdb..97052cfb7c 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -61,6 +61,7 @@ private:
     ColumnDictionary() {}
     ColumnDictionary(const size_t n) : _codes(n) {}
     ColumnDictionary(const ColumnDictionary& src) : _codes(src._codes.begin(), 
src._codes.end()) {}
+    ColumnDictionary(FieldType type) : _type(type) {}
 
 public:
     using Self = ColumnDictionary;
@@ -250,8 +251,8 @@ public:
         return _dict.find_code_by_bound(value, greater, eq);
     }
 
-    void generate_hash_values() {
-        _dict.generate_hash_values();
+    void generate_hash_values_for_runtime_filter() {
+        _dict.generate_hash_values_for_runtime_filter(_type);
     }
 
     uint32_t get_hash_value(uint32_t idx) const {
@@ -308,12 +309,26 @@ public:
             return code >= _dict_data.size() ? _null_value : _dict_data[code];
         }
 
-        inline void generate_hash_values() {
+        // The function is only used in the runtime filter feature
+        inline void generate_hash_values_for_runtime_filter(FieldType type) {
             if (_hash_values.empty()) {
                 _hash_values.resize(_dict_data.size());
                 for (size_t i = 0; i < _dict_data.size(); i++) {
                     auto& sv = _dict_data[i];
-                    uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr, 
sv.len, 0);
+                    // The char data is stored in the disk with the schema length,
+                    // and zeros are filled if the length is insufficient
+
+                    // When reading data, use shrink_char_type_column_suffix_zero(_char_type_idx)
+                    // Remove the suffix 0
+                    // When writing data, use the CharField::consume function to fill in the trailing 0.
+
+                    // For dictionary data of char type, sv.len is the schema length,
+                    // so use strnlen to remove the 0 at the end to get the actual length.
+                    int32_t len = sv.len;
+                    if (type == OLAP_FIELD_TYPE_CHAR) {
+                        len = strnlen(sv.ptr, sv.len);
+                    }
+                    uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr, len, 
0);
                     _hash_values[i] = hash_val;
                 }
             }
@@ -411,6 +426,7 @@ private:
     bool _dict_code_converted = false;
     Dictionary _dict;
     Container _codes;
+    FieldType _type;
 };
 
 template class ColumnDictionary<int32_t>;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to