This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9ae716554d2 [Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (#41308) 9ae716554d2 is described below commit 9ae716554d24a2bee9563b233e4f0e1e1aee0d58 Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Thu Sep 26 17:01:09 2024 +0800 [Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (#41308) ## Proposed changes Backport #40857. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 42 ++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index da3ef608c5f..d6982624aab 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1136,8 +1136,9 @@ Status OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_ if (cvb->hasNulls) { for (int i = 0; i < num_values; ++i) { if (cvb->notNull[i]) { - string_values.emplace_back(cvb->data[i], - trim_right(cvb->data[i], cvb->length[i])); + size_t length = trim_right(cvb->data[i], cvb->length[i]); + string_values.emplace_back((length > 0) ? cvb->data[i] : empty_string.data(), + length); } else { // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, @@ -1147,21 +1148,26 @@ Status OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_ } } else { for (int i = 0; i < num_values; ++i) { - string_values.emplace_back(cvb->data[i], trim_right(cvb->data[i], cvb->length[i])); + size_t length = trim_right(cvb->data[i], cvb->length[i]); + string_values.emplace_back((length > 0) ? 
cvb->data[i] : empty_string.data(), + length); } } } else { if (cvb->hasNulls) { for (int i = 0; i < num_values; ++i) { if (cvb->notNull[i]) { - string_values.emplace_back(cvb->data[i], cvb->length[i]); + string_values.emplace_back( + (cvb->length[i] > 0) ? cvb->data[i] : empty_string.data(), + cvb->length[i]); } else { string_values.emplace_back(empty_string.data(), 0); } } } else { for (int i = 0; i < num_values; ++i) { - string_values.emplace_back(cvb->data[i], cvb->length[i]); + string_values.emplace_back( + (cvb->length[i] > 0) ? cvb->data[i] : empty_string.data(), cvb->length[i]); } } } @@ -1200,7 +1206,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, @@ -1223,7 +1230,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } else { @@ -1242,7 +1250,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? 
val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } @@ -1261,7 +1270,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } @@ -2065,7 +2075,7 @@ Status OrcReader::on_string_dicts_loaded( char* val_ptr; int64_t length; dict->getValueByIndex(i, val_ptr, length); - StringRef dict_value(val_ptr, length); + StringRef dict_value((length > 0) ? val_ptr : "", length); if (length > max_value_length) { max_value_length = length; } @@ -2337,7 +2347,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { // Orc doesn't fill null values in new batch, but the former batch has been release. // Other types like int/long/timestamp... are flat types without pointer in them, @@ -2355,7 +2366,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } else { @@ -2370,7 +2382,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? 
val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } else { string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0); } @@ -2384,7 +2397,8 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column( if (length > max_value_length) { max_value_length = length; } - string_values.emplace_back(val_ptr, length); + string_values.emplace_back((length > 0) ? val_ptr : EMPTY_STRING_FOR_OVERFLOW, + length); } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org