This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d5133be4b8e [Fix](orc-reader) Fix StringRef nullptr data in 
orc-reader. (#40857)
d5133be4b8e is described below

commit d5133be4b8ebb1586b96c3e2de1a5fc23e606aa6
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Wed Sep 18 14:13:30 2024 +0800

    [Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (#40857)
    
    ## Proposed changes
    
    ### Issue
    ```
    
/var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:9:
 runtime error: reference binding to null pointer of type 'doris::StringRef'
        #0 0x55ee63eb0418 in std::vector<doris::StringRef, 
std::allocator<doris::StringRef>>::operator[](unsigned long) 
/var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:2
        #1 0x55ee63eb0418 in doris::Status 
doris::vectorized::OrcReader::_decode_string_non_dict_encoded_column<false>(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char>> const&, 
COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn> 
const&, orc::TypeKind const&, orc::EncodedStringVectorBatch*, unsigned long) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1172:39
        #2 0x55ee63ea2685 in doris::Status 
doris::vectorized::OrcReader::_decode_string_column<false>(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char>> const&, 
COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn> 
const&, orc::TypeKind const&, orc::ColumnVectorBatch*, unsigned long) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1124:16
        #3 0x55ee63e97e7a in doris::Status 
doris::vectorized::OrcReader::_fill_doris_data_column<false>(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char>> const&, 
COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>&, 
std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, 
orc::ColumnVectorBatch*, unsigned long) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1365:16
        #4 0x55ee63b0e450 in doris::Status 
doris::vectorized::OrcReader::_orc_column_to_doris_column<false>(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char>> const&, 
COW<doris::vectorized::IColumn>::immutable_ptr<doris::vectorized::IColumn>&, 
std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, 
orc::ColumnVectorBatch*, unsigned long) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1532:5
        #5 0x55ee63e99622 in doris::Status 
doris::vectorized::OrcReader::_fill_doris_data_column<false>(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char>> const&, 
COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>&, 
std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, 
orc::ColumnVectorBatch*, unsigned long) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1410:9
        #6 0x55ee63b0e450 in doris::Status 
doris::vectorized::OrcReader::_orc_column_to_doris_column<false>(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char>> const&, 
COW<doris::vectorized::IColumn>::immutable_ptr<doris::vectorized::IColumn>&, 
std::shared_ptr<doris::vectorized::IDataType const> const&, orc::Type const*, 
orc::ColumnVectorBatch*, unsigned long) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1532:5
        #7 0x55ee63ad4f86 in 
doris::vectorized::OrcReader::get_next_block_impl(doris::vectorized::Block*, 
unsigned long*, bool*) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1714:13
        #8 0x55ee63ad093b in 
doris::vectorized::OrcReader::get_next_block(doris::vectorized::Block*, 
unsigned long*, bool*) 
/home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1547:5
    ```
    ### Solution
    [Fix] (orc-reader) Fix StringRef nullptr data in orc-reader. When a string
    is empty in an ORC row batch, its data pointer can point to anything,
    possibly nullptr, and StringRef has undefined behavior when data is nullptr.
    
    Related to #37845.
---
 be/src/vec/exec/format/orc/vorc_reader.cpp | 42 ++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index cffa934cc2c..16a3c1254c6 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1140,8 +1140,9 @@ Status 
OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_
         if (cvb->hasNulls) {
             for (int i = 0; i < num_values; ++i) {
                 if (cvb->notNull[i]) {
-                    string_values.emplace_back(cvb->data[i],
-                                               trim_right(cvb->data[i], 
cvb->length[i]));
+                    size_t length = trim_right(cvb->data[i], cvb->length[i]);
+                    string_values.emplace_back((length > 0) ? cvb->data[i] : 
empty_string.data(),
+                                               length);
                 } else {
                     // Orc doesn't fill null values in new batch, but the 
former batch has been release.
                     // Other types like int/long/timestamp... are flat types 
without pointer in them,
@@ -1151,21 +1152,26 @@ Status 
OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_
             }
         } else {
             for (int i = 0; i < num_values; ++i) {
-                string_values.emplace_back(cvb->data[i], 
trim_right(cvb->data[i], cvb->length[i]));
+                size_t length = trim_right(cvb->data[i], cvb->length[i]);
+                string_values.emplace_back((length > 0) ? cvb->data[i] : 
empty_string.data(),
+                                           length);
             }
         }
     } else {
         if (cvb->hasNulls) {
             for (int i = 0; i < num_values; ++i) {
                 if (cvb->notNull[i]) {
-                    string_values.emplace_back(cvb->data[i], cvb->length[i]);
+                    string_values.emplace_back(
+                            (cvb->length[i] > 0) ? cvb->data[i] : 
empty_string.data(),
+                            cvb->length[i]);
                 } else {
                     string_values.emplace_back(empty_string.data(), 0);
                 }
             }
         } else {
             for (int i = 0; i < num_values; ++i) {
-                string_values.emplace_back(cvb->data[i], cvb->length[i]);
+                string_values.emplace_back(
+                        (cvb->length[i] > 0) ? cvb->data[i] : 
empty_string.data(), cvb->length[i]);
             }
         }
     }
@@ -1204,7 +1210,8 @@ Status 
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                     if (length > max_value_length) {
                         max_value_length = length;
                     }
-                    string_values.emplace_back(val_ptr, length);
+                    string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                               length);
                 } else {
                     // Orc doesn't fill null values in new batch, but the 
former batch has been release.
                     // Other types like int/long/timestamp... are flat types 
without pointer in them,
@@ -1227,7 +1234,8 @@ Status 
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                 if (length > max_value_length) {
                     max_value_length = length;
                 }
-                string_values.emplace_back(val_ptr, length);
+                string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                           length);
             }
         }
     } else {
@@ -1246,7 +1254,8 @@ Status 
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                     if (length > max_value_length) {
                         max_value_length = length;
                     }
-                    string_values.emplace_back(val_ptr, length);
+                    string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                               length);
                 } else {
                     string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
@@ -1265,7 +1274,8 @@ Status 
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                 if (length > max_value_length) {
                     max_value_length = length;
                 }
-                string_values.emplace_back(val_ptr, length);
+                string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                           length);
             }
         }
     }
@@ -2068,7 +2078,7 @@ Status OrcReader::on_string_dicts_loaded(
             char* val_ptr;
             int64_t length;
             dict->getValueByIndex(i, val_ptr, length);
-            StringRef dict_value(val_ptr, length);
+            StringRef dict_value((length > 0) ? val_ptr : "", length);
             if (length > max_value_length) {
                 max_value_length = length;
             }
@@ -2328,7 +2338,8 @@ MutableColumnPtr 
OrcReader::_convert_dict_column_to_string_column(
                     if (length > max_value_length) {
                         max_value_length = length;
                     }
-                    string_values.emplace_back(val_ptr, length);
+                    string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                               length);
                 } else {
                     // Orc doesn't fill null values in new batch, but the 
former batch has been release.
                     // Other types like int/long/timestamp... are flat types 
without pointer in them,
@@ -2346,7 +2357,8 @@ MutableColumnPtr 
OrcReader::_convert_dict_column_to_string_column(
                 if (length > max_value_length) {
                     max_value_length = length;
                 }
-                string_values.emplace_back(val_ptr, length);
+                string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                           length);
             }
         }
     } else {
@@ -2361,7 +2373,8 @@ MutableColumnPtr 
OrcReader::_convert_dict_column_to_string_column(
                     if (length > max_value_length) {
                         max_value_length = length;
                     }
-                    string_values.emplace_back(val_ptr, length);
+                    string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                               length);
                 } else {
                     string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
@@ -2375,7 +2388,8 @@ MutableColumnPtr 
OrcReader::_convert_dict_column_to_string_column(
                 if (length > max_value_length) {
                     max_value_length = length;
                 }
-                string_values.emplace_back(val_ptr, length);
+                string_values.emplace_back((length > 0) ? val_ptr : 
EMPTY_STRING_FOR_OVERFLOW,
+                                           length);
             }
         }
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to