This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 55bd2845902e90ea72fa54eae061bf060adcd668
Author: Mingyu Chen <morning...@163.com>
AuthorDate: Thu Aug 31 18:23:05 2023 +0800

    [fix](parquet) fix potential heap-use-after-free issue and cache issue 
(#23638)
    
    1. When file meta cache is disabled (by setting 
`max_external_file_meta_cache_num=0` in be.conf),
    the parquet's meta info is owned by parquet reader and will be released 
when calling `reader->close()`.
    
    But the underlying file reader of this parquet reader will be released 
after `reader->close()`,
    this may causing `heap-use-after-free` bug because some part of meta info 
may be referenced by file reader.
    
    This PR fix it by making sure that meta info is released after file reader 
released.
    
    2. Add modification time for file meta cache in BE, to avoid parquet read 
error like:
    `Failed to deserialize parquet page header`
---
 be/src/vec/exec/format/parquet/vparquet_reader.cpp | 18 ++++++++----------
 be/src/vec/exec/format/parquet/vparquet_reader.h   | 19 +++++++++++++------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index e627a96e0d..49798ed4f1 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -206,10 +206,6 @@ void ParquetReader::_close_internal() {
         }
         _closed = true;
     }
-
-    if (_is_file_metadata_owned && _file_metadata != nullptr) {
-        delete _file_metadata;
-    }
 }
 
 Status ParquetReader::_open_file() {
@@ -230,23 +226,25 @@ Status ParquetReader::_open_file() {
         }
         size_t meta_size = 0;
         if (_meta_cache == nullptr) {
-            _is_file_metadata_owned = true;
             RETURN_IF_ERROR(
                     parse_thrift_footer(_file_reader, &_file_metadata, 
&meta_size, _io_ctx));
+            // wrap it with unique ptr, so that it can be released finally.
+            _file_metadata_ptr.reset(_file_metadata);
+            _file_metadata = _file_metadata_ptr.get();
+
             _column_statistics.read_bytes += meta_size;
             // parse magic number & parse meta data
             _column_statistics.meta_read_calls += 1;
         } else {
-            _is_file_metadata_owned = false;
-            RETURN_IF_ERROR(_meta_cache->get_parquet_footer(_file_reader, 
_io_ctx, 0, &meta_size,
-                                                            &_cache_handle));
-
+            RETURN_IF_ERROR(_meta_cache->get_parquet_footer(_file_reader, 
_io_ctx,
+                                                            
_file_description.mtime, &meta_size,
+                                                            
&_meta_cache_handle));
             _column_statistics.read_bytes += meta_size;
             if (meta_size > 0) {
                 _column_statistics.meta_read_calls += 1;
             }
 
-            _file_metadata = (FileMetaData*)_cache_handle.data();
+            _file_metadata = (FileMetaData*)_meta_cache_handle.data();
         }
 
         if (_file_metadata == nullptr) {
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index b2f8a2bd19..5ceca55d7e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -214,14 +214,21 @@ private:
     const TFileRangeDesc& _scan_range;
     io::FileSystemProperties _system_properties;
     io::FileDescription _file_description;
-    std::shared_ptr<io::FileSystem> _file_system = nullptr;
-    io::FileReaderSPtr _file_reader = nullptr;
-    ObjLRUCache::CacheHandle _cache_handle;
+
+    // the following fields are for parquet meta data cache.
+    // if _meta_cache is not null, the _file_metadata will be got from 
_meta_cache,
+    // and it is owned by _meta_cache_handle.
+    // if _meta_cache is null, _file_metadata will be managed by 
_file_metadata_ptr,
+    // which will be released when deconstructing.
+    // ATTN: these fields must be before _file_reader, to make sure they will 
be released
+    // after _file_reader. Otherwise, there may be heap-use-after-free bug.
+    ObjLRUCache::CacheHandle _meta_cache_handle;
+    std::unique_ptr<FileMetaData> _file_metadata_ptr;
     FileMetaData* _file_metadata = nullptr;
-    // set to true if _file_metadata is owned by this reader.
-    // otherwise, it is owned by someone else, such as _meta_cache
-    bool _is_file_metadata_owned = false;
     const tparquet::FileMetaData* _t_metadata;
+
+    std::shared_ptr<io::FileSystem> _file_system = nullptr;
+    io::FileReaderSPtr _file_reader = nullptr;
     std::unique_ptr<RowGroupReader> _current_group_reader = nullptr;
     // read to the end of current reader
     bool _row_group_eof = true;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to