This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.1 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit d2a8a399be2024e0429fff677f38a11198f215e3 Author: Luwei <814383...@qq.com> AuthorDate: Fri May 27 20:44:26 2022 +0800 [fix] Fix bug of bloom filter hash value calculation error (#9802) * Fix bug of bloom filter hash value calculation error * fix code style --- be/src/olap/bloom_filter_predicate.h | 8 +++++--- be/src/olap/schema.cpp | 2 +- be/src/vec/columns/column_dictionary.h | 24 ++++++++++++++++++++---- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 01851cfd96..c8fbdab94c 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -119,8 +119,10 @@ void BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16 auto& null_map_data = nullable_col->get_null_map_column().get_data(); // deal ColumnDict if (nullable_col->get_nested_column().is_column_dictionary()) { - auto* dict_col = vectorized::check_and_get_column<vectorized::ColumnDictI32>(nullable_col->get_nested_column()); - const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values(); + auto* dict_col = vectorized::check_and_get_column<vectorized::ColumnDictI32>( + nullable_col->get_nested_column()); + const_cast<vectorized::ColumnDictI32*>(dict_col) + ->generate_hash_values_for_runtime_filter(); for (uint16_t i = 0; i < *size; i++) { uint16_t idx = sel[i]; sel[new_size] = idx; @@ -139,7 +141,7 @@ void BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16 } } else if (column.is_column_dictionary()) { auto* dict_col = vectorized::check_and_get_column<vectorized::ColumnDictI32>(column); - const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values(); + const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values_for_runtime_filter(); for (uint16_t i = 0; i < *size; i++) { uint16_t idx = sel[i]; sel[new_size] = idx; diff --git a/be/src/olap/schema.cpp b/be/src/olap/schema.cpp index 6b8354f26a..e30389b7d3 100644 --- a/be/src/olap/schema.cpp +++ b/be/src/olap/schema.cpp @@ -205,7 +205,7 @@ vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(FieldType type) case OLAP_FIELD_TYPE_VARCHAR: case OLAP_FIELD_TYPE_STRING: if (config::enable_low_cardinality_optimize) { - return doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create(); + return doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create(type); } return doris::vectorized::PredicateColumnType<StringValue>::create(); diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index cc27ca1cdb..97052cfb7c 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -61,6 +61,7 @@ private: ColumnDictionary() {} ColumnDictionary(const size_t n) : _codes(n) {} ColumnDictionary(const ColumnDictionary& src) : _codes(src._codes.begin(), src._codes.end()) {} + ColumnDictionary(FieldType type) : _type(type) {} public: using Self = ColumnDictionary; @@ -250,8 +251,8 @@ public: return _dict.find_code_by_bound(value, greater, eq); } - void generate_hash_values() { - _dict.generate_hash_values(); + void generate_hash_values_for_runtime_filter() { + _dict.generate_hash_values_for_runtime_filter(_type); } uint32_t get_hash_value(uint32_t idx) const { @@ -308,12 +309,26 @@ public: return code >= _dict_data.size() ? _null_value : _dict_data[code]; } - inline void generate_hash_values() { + // The function is only used in the runtime filter feature + inline void generate_hash_values_for_runtime_filter(FieldType type) { if (_hash_values.empty()) { _hash_values.resize(_dict_data.size()); for (size_t i = 0; i < _dict_data.size(); i++) { auto& sv = _dict_data[i]; - uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr, sv.len, 0); + // The char data is stored in the disk with the schema length, + // and zeros are filled if the length is insufficient + + // When reading data, use shrink_char_type_column_suffix_zero(_char_type_idx) + // Remove the suffix 0 + // When writing data, use the CharField::consume function to fill in the trailing 0. + + // For dictionary data of char type, sv.len is the schema length, + // so use strnlen to remove the 0 at the end to get the actual length. + int32_t len = sv.len; + if (type == OLAP_FIELD_TYPE_CHAR) { + len = strnlen(sv.ptr, sv.len); + } + uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr, len, 0); _hash_values[i] = hash_val; } } @@ -411,6 +426,7 @@ private: bool _dict_code_converted = false; Dictionary _dict; Container _codes; + FieldType _type; }; template class ColumnDictionary<int32_t>; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org