This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f211eb58c7f [fix](ubsan) reinterpret_cast fix length types to int8 is not safe (#35912)
f211eb58c7f is described below

commit f211eb58c7f825fda220e35e416849bd243474cf
Author: Ashin Gau <ashin...@users.noreply.github.com>
AuthorDate: Sun Jun 9 21:53:03 2024 +0800

    [fix](ubsan) reinterpret_cast fix length types to int8 is not safe (#35912)
    
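    Fix a type-check failure reported by UBSan: the fixed-length Parquet
    decoders used reinterpret_cast to treat every destination column as
    ColumnVector<Int8>&, regardless of the column's actual type.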
    ```
    
    /root/doris/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h:75:78: runtime error: member call on address 0x5582f35db5c0 which does not point to an object of type 'doris::vectorized::ColumnVector<signed char>'
    0x5582f35db5c0: note: object is of type 'doris::vectorized::ColumnVector<int>'
     83 55 00 00  78 c0 b0 5a 82 55 00 00  02 00 00 00 00 00 00 00  10 a0 00 d7 83 55 00 00  10 a0 00 d7
                  ^~~~~~~~~~~~~~~~~~~~~~~
                  vptr for 'doris::vectorized::ColumnVector<int>'
    doris::Status doris::vectorized::FixLengthPlainDecoder::_decode_values<false>(COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>&, std::shared_ptr<doris::vectorized::IDataType const>&, doris::vectorized::ColumnSelectVector&, bool) at fix_length_plain_decoder.h:75:78
    ```
---
 .../vec/exec/format/parquet/fix_length_dict_decoder.hpp | 17 ++++++++++-------
 .../vec/exec/format/parquet/fix_length_plain_decoder.h  | 13 +++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp 
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index 115ca68bc1e..65e329ae89b 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -60,23 +60,26 @@ public:
             return _decode_dict_values<has_filter>(doris_column, 
select_vector, is_dict_filter);
         }
 
-        return _decode_fixed_values<has_filter>(doris_column, select_vector);
+        return _decode_fixed_values<has_filter>(doris_column, data_type, 
select_vector);
     }
 
 protected:
     template <bool has_filter>
-    Status _decode_fixed_values(MutableColumnPtr& doris_column, 
ColumnSelectVector& select_vector) {
-        auto& column_data = 
reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
-        size_t data_index = column_data.size();
-        column_data.resize(data_index + _type_length * 
(select_vector.num_values() -
-                                                        
select_vector.num_filtered()));
+    Status _decode_fixed_values(MutableColumnPtr& doris_column, DataTypePtr& 
data_type,
+                                ColumnSelectVector& select_vector) {
+        size_t primitive_length = 
remove_nullable(data_type)->get_size_of_value_in_memory();
+        size_t data_index = doris_column->size() * primitive_length;
+        size_t scale_size = (select_vector.num_values() - 
select_vector.num_filtered()) *
+                            (_type_length / primitive_length);
+        doris_column->resize(doris_column->size() + scale_size);
+        char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
         size_t dict_index = 0;
         ColumnSelectVector::DataReadType read_type;
         while (size_t run_length = 
select_vector.get_next_run<has_filter>(&read_type)) {
             switch (read_type) {
             case ColumnSelectVector::CONTENT: {
                 for (size_t i = 0; i < run_length; ++i) {
-                    memcpy(column_data.data() + data_index, 
_dict_items[_indexes[dict_index++]],
+                    memcpy(raw_data + data_index, 
_dict_items[_indexes[dict_index++]],
                            _type_length);
                     data_index += _type_length;
                 }
diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h 
b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h
index 72cb283f3f9..40e4c54a822 100644
--- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h
+++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h
@@ -72,16 +72,17 @@ Status 
FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
         return Status::IOError("Out-of-bounds access in parquet data decoder");
     }
 
-    auto& column_data = 
reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
-    size_t data_index = column_data.size();
-    column_data.resize(data_index +
-                       _type_length * (select_vector.num_values() - 
select_vector.num_filtered()));
+    size_t primitive_length = 
remove_nullable(data_type)->get_size_of_value_in_memory();
+    size_t data_index = doris_column->size() * primitive_length;
+    size_t scale_size = (select_vector.num_values() - 
select_vector.num_filtered()) *
+                        (_type_length / primitive_length);
+    doris_column->resize(doris_column->size() + scale_size);
+    char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
     ColumnSelectVector::DataReadType read_type;
     while (size_t run_length = 
select_vector.get_next_run<has_filter>(&read_type)) {
         switch (read_type) {
         case ColumnSelectVector::CONTENT: {
-            memcpy(column_data.data() + data_index, _data->data + _offset,
-                   run_length * _type_length);
+            memcpy(raw_data + data_index, _data->data + _offset, run_length * 
_type_length);
             _offset += run_length * _type_length;
             data_index += run_length * _type_length;
             break;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to