This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d259770b86 [Fix] avoid core dump cause by malformed bitmap type data 
(#10458)
d259770b86 is described below

commit d259770b86955a4cb34102d4cc95ae1a731326af
Author: Zhengguo Yang <yangz...@gmail.com>
AuthorDate: Thu Jun 30 11:27:22 2022 +0800

    [Fix] avoid core dump cause by malformed bitmap type data (#10458)
---
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp   |  8 ++++----
 be/src/olap/rowset/segment_v2/binary_dict_page.h     |  4 ++--
 be/src/olap/rowset/segment_v2/binary_plain_page.h    | 20 +++++++++++++++++++-
 be/src/olap/rowset/segment_v2/column_reader.cpp      |  6 ++++--
 be/src/olap/rowset/segment_v2/encoding_info.cpp      |  4 ++--
 .../olap/rowset/segment_v2/indexed_column_reader.cpp |  4 +++-
 .../olap/rowset/segment_v2/indexed_column_writer.cpp |  6 ++++--
 be/src/olap/rowset/segment_v2/options.h              |  8 +++++---
 be/src/olap/rowset/segment_v2/parsed_page.h          |  4 ++--
 be/src/tools/meta_tool.cpp                           |  1 -
 be/src/util/bitmap_value.h                           | 14 +++++++++++++-
 .../olap/rowset/segment_v2/binary_dict_page_test.cpp | 10 ++++++----
 .../rowset/segment_v2/binary_plain_page_test.cpp     |  3 ++-
 13 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp 
b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index b53ff52ade..ea9fe66dfc 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -45,7 +45,7 @@ BinaryDictPageBuilder::BinaryDictPageBuilder(const 
PageBuilderOptions& options)
     _data_page_builder.reset(new 
BitshufflePageBuilder<OLAP_FIELD_TYPE_INT>(options));
     PageBuilderOptions dict_builder_options;
     dict_builder_options.data_page_size = _options.dict_page_size;
-    _dict_builder.reset(new BinaryPlainPageBuilder(dict_builder_options));
+    _dict_builder.reset(new 
BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>(dict_builder_options));
     reset();
 }
 
@@ -135,7 +135,7 @@ void BinaryDictPageBuilder::reset() {
     _buffer.resize(BINARY_DICT_PAGE_HEADER_SIZE);
 
     if (_encoding_type == DICT_ENCODING && _dict_builder->is_page_full()) {
-        _data_page_builder.reset(new BinaryPlainPageBuilder(_options));
+        _data_page_builder.reset(new 
BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>(_options));
         _encoding_type = PLAIN_ENCODING;
     } else {
         _data_page_builder->reset();
@@ -206,7 +206,7 @@ Status BinaryDictPageDecoder::init() {
                 _bit_shuffle_ptr = new 
BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
     } else if (_encoding_type == PLAIN_ENCODING) {
         DCHECK_EQ(_encoding_type, PLAIN_ENCODING);
-        _data_page_decoder.reset(new BinaryPlainPageDecoder(_data, _options));
+        _data_page_decoder.reset(new 
BinaryPlainPageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
     } else {
         LOG(WARNING) << "invalid encoding type:" << _encoding_type;
         return Status::Corruption(strings::Substitute("invalid encoding 
type:$0", _encoding_type));
@@ -228,7 +228,7 @@ bool BinaryDictPageDecoder::is_dict_encoding() const {
 }
 
 void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, 
StringRef* dict_word_info) {
-    _dict_decoder = (BinaryPlainPageDecoder*)dict_decoder;
+    _dict_decoder = 
(BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>*)dict_decoder;
     _dict_word_info = dict_word_info;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h 
b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index e1dcdba49b..d57f7cf974 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -78,7 +78,7 @@ private:
 
     std::unique_ptr<PageBuilder> _data_page_builder;
 
-    std::unique_ptr<BinaryPlainPageBuilder> _dict_builder;
+    std::unique_ptr<BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>> 
_dict_builder;
 
     EncodingTypePB _encoding_type;
     struct HashOfSlice {
@@ -122,7 +122,7 @@ private:
     Slice _data;
     PageDecoderOptions _options;
     std::unique_ptr<PageDecoder> _data_page_decoder;
-    BinaryPlainPageDecoder* _dict_decoder = nullptr;
+    BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>* _dict_decoder = nullptr;
     BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>* _bit_shuffle_ptr = nullptr;
     bool _parsed;
     EncodingTypePB _encoding_type;
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h 
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index cac9da3709..1919efcba8 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -44,6 +44,7 @@
 namespace doris {
 namespace segment_v2 {
 
+template <FieldType Type>
 class BinaryPlainPageBuilder : public PageBuilder {
 public:
     BinaryPlainPageBuilder(const PageBuilderOptions& options)
@@ -64,6 +65,11 @@ public:
         // If the page is full, should stop adding more items.
         while (!is_page_full() && i < *count) {
             auto src = reinterpret_cast<const Slice*>(vals);
+            if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+                if (_options.need_check_bitmap) {
+                    RETURN_IF_ERROR(BitmapTypeCode::validate(*(src->data)));
+                }
+            }
             size_t offset = _buffer.size();
             _offsets.push_back(offset);
             _buffer.append(src->data, src->size);
@@ -154,6 +160,7 @@ private:
     faststring _last_value;
 };
 
+template <FieldType Type>
 class BinaryPlainPageDecoder : public PageDecoder {
 public:
     BinaryPlainPageDecoder(Slice data) : BinaryPlainPageDecoder(data, 
PageDecoderOptions()) {}
@@ -204,6 +211,11 @@ public:
         size_t mem_len[max_fetch];
         for (size_t i = 0; i < max_fetch; i++, out++, _cur_idx++) {
             *out = string_at_index(_cur_idx);
+            if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+                if (_options.need_check_bitmap) {
+                    RETURN_IF_ERROR(BitmapTypeCode::validate(*(out->data)));
+                }
+            }
             mem_len[i] = out->size;
         }
 
@@ -246,6 +258,11 @@ public:
             uint32_t len = offset(_cur_idx + 1) - start_offset;
             len_array[i] = len;
             start_offset_array[i] = start_offset;
+            if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+                if (_options.need_check_bitmap) {
+                    RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + 
start_offset)));
+                }
+            }
         }
         dst->insert_many_binary_data(_data.mutable_data(), len_array, 
start_offset_array,
                                      max_fetch);
@@ -271,8 +288,9 @@ public:
     }
 
     void get_dict_word_info(StringRef* dict_word_info) {
-        if (_num_elems <= 0) [[unlikely]]
+        if (UNLIKELY(_num_elems <= 0)) {
             return;
+        }
 
         char* data_begin = (char*)&_data[0];
         char* offset_ptr = (char*)&_data[_offsets_pos];
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7243736281..a7d8c99899 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -747,10 +747,12 @@ Status FileColumnIterator::_read_data_page(const 
OrdinalPageIndexIterator& iter)
                                                    _compress_codec.get()));
                 // ignore dict_footer.dict_page_footer().encoding() due to only
                 // PLAIN_ENCODING is supported for dict page right now
-                _dict_decoder = 
std::make_unique<BinaryPlainPageDecoder>(dict_data);
+                _dict_decoder = 
std::make_unique<BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>>(
+                        dict_data);
                 RETURN_IF_ERROR(_dict_decoder->init());
 
-                auto* pd_decoder = 
(BinaryPlainPageDecoder*)_dict_decoder.get();
+                auto* pd_decoder =
+                        
(BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>*)_dict_decoder.get();
                 _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
                 pd_decoder->get_dict_word_info(_dict_word_info.get());
             }
diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp 
b/be/src/olap/rowset/segment_v2/encoding_info.cpp
index ac3a1a624f..4f9e2d5d14 100644
--- a/be/src/olap/rowset/segment_v2/encoding_info.cpp
+++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp
@@ -58,12 +58,12 @@ struct TypeEncodingTraits<type, PLAIN_ENCODING, CppType> {
 template <FieldType type>
 struct TypeEncodingTraits<type, PLAIN_ENCODING, Slice> {
     static Status create_page_builder(const PageBuilderOptions& opts, 
PageBuilder** builder) {
-        *builder = new BinaryPlainPageBuilder(opts);
+        *builder = new BinaryPlainPageBuilder<type>(opts);
         return Status::OK();
     }
     static Status create_page_decoder(const Slice& data, const 
PageDecoderOptions& opts,
                                       PageDecoder** decoder) {
-        *decoder = new BinaryPlainPageDecoder(data, opts);
+        *decoder = new BinaryPlainPageDecoder<type>(data, opts);
         return Status::OK();
     }
 };
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp 
b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp
index 6a0b2ca81e..682f620ea8 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp
@@ -110,8 +110,10 @@ Status IndexedColumnIterator::_read_data_page(const 
PagePointer& pp) {
                                        _compress_codec.get()));
     // parse data page
     // note that page_index is not used in IndexedColumnIterator, so we pass 0
+    PageDecoderOptions opts;
+    opts.need_check_bitmap = false;
     return ParsedPage::create(std::move(handle), body, 
footer.data_page_footer(),
-                              _reader->encoding_info(), pp, 0, &_data_page);
+                              _reader->encoding_info(), pp, 0, &_data_page, 
opts);
 }
 
 Status IndexedColumnIterator::seek_to_ordinal(ordinal_t idx) {
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp 
b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
index 4c7259c90c..979481adbc 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
@@ -59,8 +59,10 @@ Status IndexedColumnWriter::init() {
     // because the default encoding of a data type can be changed in the future
     DCHECK_NE(_options.encoding, DEFAULT_ENCODING);
 
-    PageBuilder* data_page_builder;
-    RETURN_IF_ERROR(encoding_info->create_page_builder(PageBuilderOptions(), 
&data_page_builder));
+    PageBuilder* data_page_builder = nullptr;
+    PageBuilderOptions builder_option;
+    builder_option.need_check_bitmap = false;
+    RETURN_IF_ERROR(encoding_info->create_page_builder(builder_option, 
&data_page_builder));
     _data_page_builder.reset(data_page_builder);
 
     if (_options.write_ordinal_index) {
diff --git a/be/src/olap/rowset/segment_v2/options.h 
b/be/src/olap/rowset/segment_v2/options.h
index 0cb2dafa39..9405eb19cf 100644
--- a/be/src/olap/rowset/segment_v2/options.h
+++ b/be/src/olap/rowset/segment_v2/options.h
@@ -22,17 +22,19 @@
 namespace doris {
 namespace segment_v2 {
 
-class BinaryPlainPageDecoder;
-
 static constexpr size_t DEFAULT_PAGE_SIZE = 1024 * 1024; // default size: 1M
 
 struct PageBuilderOptions {
     size_t data_page_size = DEFAULT_PAGE_SIZE;
 
     size_t dict_page_size = DEFAULT_PAGE_SIZE;
+
+    bool need_check_bitmap = true;
 };
 
-struct PageDecoderOptions {};
+struct PageDecoderOptions {
+    bool need_check_bitmap = true;
+};
 
 } // namespace segment_v2
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h 
b/be/src/olap/rowset/segment_v2/parsed_page.h
index c70ebd62a6..232c347c38 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -38,7 +38,8 @@ namespace segment_v2 {
 struct ParsedPage {
     static Status create(PageHandle handle, const Slice& body, const 
DataPageFooterPB& footer,
                          const EncodingInfo* encoding, const PagePointer& 
page_pointer,
-                         uint32_t page_index, ParsedPage* result) {
+                         uint32_t page_index, ParsedPage* result,
+                         PageDecoderOptions opts = PageDecoderOptions()) {
         result->~ParsedPage();
         ParsedPage* page = new (result)(ParsedPage);
         page->page_handle = std::move(handle);
@@ -53,7 +54,6 @@ struct ParsedPage {
         }
 
         Slice data_slice(body.data, body.size - null_size);
-        PageDecoderOptions opts;
         RETURN_IF_ERROR(encoding->create_page_decoder(data_slice, opts, 
&page->data_decoder));
         RETURN_IF_ERROR(page->data_decoder->init());
 
diff --git a/be/src/tools/meta_tool.cpp b/be/src/tools/meta_tool.cpp
index 60b56b35cb..f4e5e6087c 100644
--- a/be/src/tools/meta_tool.cpp
+++ b/be/src/tools/meta_tool.cpp
@@ -56,7 +56,6 @@ using doris::RandomAccessFile;
 using strings::Substitute;
 using doris::segment_v2::SegmentFooterPB;
 using doris::segment_v2::ColumnReader;
-using doris::segment_v2::BinaryPlainPageDecoder;
 using doris::segment_v2::PageHandle;
 using doris::segment_v2::PagePointer;
 using doris::segment_v2::ColumnReaderOptions;
diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h
index 4b1445ba59..7eaeb11e79 100644
--- a/be/src/util/bitmap_value.h
+++ b/be/src/util/bitmap_value.h
@@ -71,6 +71,16 @@ struct BitmapTypeCode {
         // added in 0.12
         BITMAP64 = 4
     };
+    Status static inline validate(int bitmap_type) {
+        if (UNLIKELY(bitmap_type < type::EMPTY || bitmap_type > 
type::BITMAP64)) {
+            std::string err_msg =
+                    fmt::format("BitmapTypeCode invalid, should between: {} 
and {} actrual is {}",
+                                BitmapTypeCode::EMPTY, 
BitmapTypeCode::BITMAP64, bitmap_type);
+            LOG(ERROR) << err_msg;
+            return Status::IOError(err_msg);
+        }
+        return Status::OK();
+    }
 };
 
 namespace detail {
@@ -1536,7 +1546,6 @@ public:
     // Deserialize a bitmap value from `src`.
     // Return false if `src` begins with unknown type code, true otherwise.
     bool deserialize(const char* src) {
-        DCHECK(*src >= BitmapTypeCode::EMPTY && *src <= 
BitmapTypeCode::BITMAP64);
         switch (*src) {
         case BitmapTypeCode::EMPTY:
             _type = EMPTY;
@@ -1555,6 +1564,9 @@ public:
             _bitmap = detail::Roaring64Map::read(src);
             break;
         default:
+            LOG(ERROR) << "BitmapTypeCode invalid, should between: " << 
BitmapTypeCode::EMPTY
+                       << " and " << BitmapTypeCode::BITMAP64 << " actrual is "
+                       << static_cast<int>(*src);
             return false;
         }
         return true;
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp 
b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index 8c59da6491..bb3493efd6 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -68,8 +68,9 @@ public:
         Status status = page_builder.get_dictionary_page(&dict_slice);
         EXPECT_TRUE(status.ok());
         PageDecoderOptions dict_decoder_options;
-        std::unique_ptr<BinaryPlainPageDecoder> dict_page_decoder(
-                new BinaryPlainPageDecoder(dict_slice.slice(), 
dict_decoder_options));
+        std::unique_ptr<BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>> 
dict_page_decoder(
+                new 
BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>(dict_slice.slice(),
+                                                                    
dict_decoder_options));
         status = dict_page_decoder->init();
         EXPECT_TRUE(status.ok());
         // because every slice is unique
@@ -175,8 +176,9 @@ public:
             int slice_index = random() % results.size();
             //int slice_index = 1;
             PageDecoderOptions dict_decoder_options;
-            std::unique_ptr<BinaryPlainPageDecoder> dict_page_decoder(
-                    new BinaryPlainPageDecoder(dict_slice.slice(), 
dict_decoder_options));
+            std::unique_ptr<BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>> 
dict_page_decoder(
+                    new 
BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>(dict_slice.slice(),
+                                                                        
dict_decoder_options));
             status = dict_page_decoder->init();
             EXPECT_TRUE(status.ok());
 
diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp 
b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
index 0e0d34606d..1da1db55e1 100644
--- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
@@ -108,7 +108,8 @@ public:
 };
 
 TEST_F(BinaryPlainPageTest, TestBinaryPlainPageBuilderSeekByValueSmallPage) {
-    TestBinarySeekByValueSmallPage<BinaryPlainPageBuilder, 
BinaryPlainPageDecoder>();
+    
TestBinarySeekByValueSmallPage<BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>,
+                                   
BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>>();
 }
 
 } // namespace segment_v2


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to