This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new bad22dd4e2 [Fix](orc-reader) Fix orc dict filter null value issue in `_convert_dict_cols_to_string_cols` which caused incorrect result. (#21047) bad22dd4e2 is described below commit bad22dd4e25a8e382d6ada5b161910aabce88bd5 Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Wed Jun 21 14:54:01 2023 +0800 [Fix](orc-reader) Fix orc dict filter null value issue in `_convert_dict_cols_to_string_cols` which caused incorrect result. (#21047) Query results should not have empty values. ``` use regresssion.multi_catalog; select commit_id from github_events_orc WHERE (event_type = 'CommitCommentEvent') AND commit_id != "" limit 10; ``` ``` +------------------------------------------+ | commit_id | +------------------------------------------+ | 685c1fd8dbbdc10c042932f9a9f88be00ff96c75 | | 685c1fd8dbbdc10c042932f9a9f88be00ff96c75 | | 4e3ab2ff2d2474f5d51334b9b0fdf17e9845a166 | | | | | | | | | | | | | | 7191c20cb49da07a7fc16aa32dc0de4faff528b2 | +------------------------------------------+ 10 rows in set (0.54 sec) ``` --- be/src/vec/columns/column_string.h | 2 + be/src/vec/exec/format/orc/vorc_reader.cpp | 47 ++++++++------ be/src/vec/exec/format/orc/vorc_reader.h | 1 + .../format/parquet/byte_array_dict_decoder.cpp | 4 +- .../hive/test_external_github.out | 75 ++++++++++++++++++++++ .../hive/test_external_github.groovy | 9 +++ 6 files changed, 116 insertions(+), 22 deletions(-) diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index eb27cdc4ed..efd90fd844 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -64,6 +64,8 @@ public: using Char = UInt8; using Chars = PaddedPODArray<UInt8>; + static constexpr size_t MAX_STRINGS_OVERFLOW_SIZE = 128; + void static check_chars_length(size_t total_length, size_t element_number) { if (UNLIKELY(total_length > MAX_STRING_SIZE)) { throw Exception(ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE, diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 007aff91ca..8b1ccd8ccc 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -94,6 +94,7 @@ namespace doris::vectorized { // TODO: we need to determine it by test. static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits<uint32_t>::max(); +static constexpr char EMPTY_STRING_FOR_OVERFLOW[ColumnString::MAX_STRINGS_OVERFLOW_SIZE] = ""; #define FOR_FLAT_ORC_COLUMNS(M) \ M(TypeIndex::Int8, Int8, orc::LongVectorBatch) \ @@ -1039,7 +1040,6 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name const orc::TypeKind& type_kind, orc::EncodedStringVectorBatch* cvb, size_t num_values) { - const static std::string empty_string; std::vector<StringRef> string_values; size_t max_value_length = 0; string_values.reserve(num_values); @@ -1054,7 +1054,7 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (cvb->notNull[i]) { if constexpr (is_filter) { if (!filter_data[i]) { - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); continue; } } @@ -1070,14 +1070,14 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, // so other types do not need to be handled separately like string. - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } } } else { for (int i = 0; i < num_values; ++i) { if constexpr (is_filter) { if (!filter_data[i]) { - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); continue; } } @@ -1097,23 +1097,26 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (cvb->notNull[i]) { if constexpr (is_filter) { if (!filter_data[i]) { - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); continue; } } char* val_ptr; int64_t length; cvb->dictionary->getValueByIndex(cvb->index.data()[i], val_ptr, length); + if (length > max_value_length) { + max_value_length = length; + } string_values.emplace_back(val_ptr, length); } else { - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } } } else { for (int i = 0; i < num_values; ++i) { if constexpr (is_filter) { if (!filter_data[i]) { - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); continue; } } @@ -1127,7 +1130,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name } } } - data_column->insert_many_strings(&string_values[0], string_values.size()); + data_column->insert_many_strings_overflow(&string_values[0], string_values.size(), + max_value_length); return Status::OK(); } @@ -1938,11 +1942,12 @@ Status OrcReader::_convert_dict_cols_to_string_cols( const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); const ColumnInt32* dict_column = assert_cast<const ColumnInt32*>(nested_column.get()); DCHECK(dict_column); + const NullMap& null_map = nullable_column->get_null_map_data(); MutableColumnPtr string_column; if (batch_vec != nullptr) { string_column = _convert_dict_column_to_string_column( - dict_column, (*batch_vec)[orc_col_idx->second], + dict_column, &null_map, (*batch_vec)[orc_col_idx->second], _col_orc_type[orc_col_idx->second]); } else { string_column = ColumnString::create(); @@ -1958,7 +1963,7 @@ Status OrcReader::_convert_dict_cols_to_string_cols( MutableColumnPtr string_column; if (batch_vec != nullptr) { string_column = _convert_dict_column_to_string_column( - dict_column, (*batch_vec)[orc_col_idx->second], + dict_column, nullptr, (*batch_vec)[orc_col_idx->second], _col_orc_type[orc_col_idx->second]); } else { string_column = ColumnString::create(); @@ -1971,12 +1976,15 @@ Status OrcReader::_convert_dict_cols_to_string_cols( return Status::OK(); } +// TODO: Possible optimization points. +// After filtering the dict column, the null_map for the null dict column should always not be null. +// Then it can avoid checking null_map. However, currently when inert materialization is enabled, +// the filter column will not be filtered first, but will be filtered together at the end. MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( - const ColumnInt32* dict_column, orc::ColumnVectorBatch* cvb, + const ColumnInt32* dict_column, const NullMap* null_map, orc::ColumnVectorBatch* cvb, const orc::Type* orc_column_type) { SCOPED_RAW_TIMER(&_statistics.decode_value_time); auto res = ColumnString::create(); - const static std::string empty_string; auto* encoded_string_vector_batch = static_cast<orc::EncodedStringVectorBatch*>(cvb); DCHECK(encoded_string_vector_batch); std::vector<StringRef> string_values; @@ -1984,11 +1992,12 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( const int* dict_data = dict_column->get_data().data(); string_values.reserve(num_values); size_t max_value_length = 0; + auto* null_map_data = null_map->data(); if (orc_column_type->getKind() == orc::TypeKind::CHAR) { // Possibly there are some zero padding characters in CHAR type, we have to strip them off. - if (cvb->hasNulls) { + if (null_map) { for (int i = 0; i < num_values; ++i) { - if (cvb->notNull[i]) { + if (!null_map_data[i]) { char* val_ptr; int64_t length; encoded_string_vector_batch->dictionary->getValueByIndex(dict_data[i], val_ptr, @@ -2002,7 +2011,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, // so other types do not need to be handled separately like string. - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } } } else { @@ -2019,9 +2028,9 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( } } } else { - if (cvb->hasNulls) { + if (null_map) { for (int i = 0; i < num_values; ++i) { - if (cvb->notNull[i]) { + if (!null_map_data[i]) { char* val_ptr; int64_t length; encoded_string_vector_batch->dictionary->getValueByIndex(dict_data[i], val_ptr, @@ -2031,7 +2040,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( } string_values.emplace_back(val_ptr, length); } else { - string_values.emplace_back(empty_string.data(), 0); + string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } } } else { @@ -2047,7 +2056,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( } } } - res->insert_many_strings(&string_values[0], num_values); + res->insert_many_strings_overflow(&string_values[0], num_values, max_value_length); return res; } diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index de7a8d182f..3068d55681 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -475,6 +475,7 @@ private: const std::vector<orc::ColumnVectorBatch*>* batch_vec); MutableColumnPtr _convert_dict_column_to_string_column(const ColumnInt32* dict_column, + const NullMap* null_map, orc::ColumnVectorBatch* cvb, const orc::Type* orc_column_typ); diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp index 76bd89929a..1e09890a98 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp @@ -29,8 +29,6 @@ namespace doris::vectorized { -constexpr int MAX_STRINGS_OVERFLOW_SIZE = 128; - Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t length, size_t num_values) { _dict = std::move(dict); @@ -48,7 +46,7 @@ Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t _dict_value_to_code.reserve(num_values); // For insert_many_strings_overflow - _dict_data.resize(total_length + MAX_STRINGS_OVERFLOW_SIZE); + _dict_data.resize(total_length + ColumnString::MAX_STRINGS_OVERFLOW_SIZE); _max_value_length = 0; size_t offset = 0; offset_cursor = 0; diff --git a/regression-test/data/external_table_emr_p2/hive/test_external_github.out b/regression-test/data/external_table_emr_p2/hive/test_external_github.out index fa53b491ff..54f2f21081 100644 --- a/regression-test/data/external_table_emr_p2/hive/test_external_github.out +++ b/regression-test/data/external_table_emr_p2/hive/test_external_github.out @@ -2401,6 +2401,44 @@ centaurean/density 988 google/boringssl 976 woboq/woboq_codebrowser 916 +-- !60 -- +00000000000004a9b86e267361898666ede4eaaa +00000000000004a9b86e267361898666ede4eaaa +00000000000004a9b86e267361898666ede4eaaa +00000000000004a9b86e267361898666ede4eaaa +0000000000009c012a4fcfe3b1f1d683163b0768 +0000000000009c012a4fcfe3b1f1d683163b0768 +0000000001f2ffbd4641882b1b460e0ec21abc42 +000000076395383d824ec8a468a88ee2baa6f121 +000000076395383d824ec8a468a88ee2baa6f121 +0000000bab11354f9a759332065be5f066c3398f + +-- !61 -- +fffffe630c2e9b0404d9f3838315e42284d033d6 +fffffa26fa725dbc5e01dd52f18eb63ae5575b8d +fffff77486c5866ceef27c274849de55ec49e9c0 +fffff600c5b5272a9c40c2f11beb512429b57eed +fffff427a6fe2f8d73c2ef137570ef93544a8ef5 +fffff23093876194fab64a3fdc182e2c63b49644 +fffff06aa80c33f247249df38a2b3cacecd51653 +ffffe8bc3f37233ebb90f51b68d96cad89d132e6 +ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6 +ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6 + +-- !62 -- +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N + +-- !63 -- + -- !01 -- Homebrew/homebrew-core 33 15 pocoproject/poco 28 14 @@ -4803,3 +4841,40 @@ centaurean/density 988 google/boringssl 976 woboq/woboq_codebrowser 916 +-- !60 -- +00000000000004a9b86e267361898666ede4eaaa +00000000000004a9b86e267361898666ede4eaaa +00000000000004a9b86e267361898666ede4eaaa +00000000000004a9b86e267361898666ede4eaaa +0000000000009c012a4fcfe3b1f1d683163b0768 +0000000000009c012a4fcfe3b1f1d683163b0768 +0000000001f2ffbd4641882b1b460e0ec21abc42 +000000076395383d824ec8a468a88ee2baa6f121 +000000076395383d824ec8a468a88ee2baa6f121 +0000000bab11354f9a759332065be5f066c3398f + +-- !61 -- +fffffe630c2e9b0404d9f3838315e42284d033d6 +fffffa26fa725dbc5e01dd52f18eb63ae5575b8d +fffff77486c5866ceef27c274849de55ec49e9c0 +fffff600c5b5272a9c40c2f11beb512429b57eed +fffff427a6fe2f8d73c2ef137570ef93544a8ef5 +fffff23093876194fab64a3fdc182e2c63b49644 +fffff06aa80c33f247249df38a2b3cacecd51653 +ffffe8bc3f37233ebb90f51b68d96cad89d132e6 +ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6 +ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6 + +-- !62 -- +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N + +-- !63 -- diff --git a/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy b/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy index 4d3e636836..362f1e5f61 100644 --- a/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy +++ b/regression-test/suites/external_table_emr_p2/hive/test_external_github.groovy @@ -495,6 +495,11 @@ suite("test_external_github", "p2") { ORDER BY stars DESC LIMIT 50""" + def detail_test1 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id != "" order by commit_id asc limit 10""" + def detail_test2 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id != "" order by commit_id desc limit 10""" + def detail_test3 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id is null order by commit_id limit 10""" + def detail_test4 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id is null AND commit_id != "" order by commit_id limit 10""" + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") if (enabled != null && enabled.equalsIgnoreCase("true")) { String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") @@ -580,6 +585,10 @@ suite("test_external_github", "p2") { qt_57 whoAreAllThosePeopleGivingStars1.replace("SUFFIX", format) qt_58 whoAreAllThosePeopleGivingStars2.replace("SUFFIX", format) qt_59 whoAreAllThosePeopleGivingStars3.replace("SUFFIX", format) + qt_60 detail_test1.replace("SUFFIX", format) + qt_61 detail_test2.replace("SUFFIX", format) + qt_62 detail_test3.replace("SUFFIX", format) + qt_63 detail_test4.replace("SUFFIX", format) } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org