eldenmoon commented on code in PR #56159:
URL: https://github.com/apache/doris/pull/56159#discussion_r2357338989


##########
be/src/olap/rowset/segment_v2/variant/sparse_column_extract_iterator.h:
##########
@@ -105,20 +91,27 @@ class BaseSparseColumnProcessor : public ColumnIterator {
     Status _process_batch(ReadMethod&& read_method, size_t nrows,
                           vectorized::MutableColumnPtr& dst) {
         // Cache check and population logic
-        if (has_sparse_column_cache()) {
-            _sparse_column =
-                    
_read_opts->sparse_column_cache[_col.parent_unique_id()]->assume_mutable();
-        } else {
-            _sparse_column->clear();
-            RETURN_IF_ERROR(read_method());
-
-            // cache the sparse column
-            if (_read_opts) {
-                _read_opts->sparse_column_cache[_col.parent_unique_id()] =
-                        _sparse_column->get_ptr();
-            }
-        }
-
+        // if (has_sparse_column_cache()) {

Review Comment:
   remove useless code 



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -700,6 +701,80 @@ class DefaultValueColumnIterator : public ColumnIterator {
     // current rowid
     ordinal_t _current_rowid = 0;
 };
+struct SharedColumnCache {

Review Comment:
   add comment for this struct, and what does shared mean



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -700,6 +701,80 @@ class DefaultValueColumnIterator : public ColumnIterator {
     // current rowid
     ordinal_t _current_rowid = 0;
 };
+struct SharedColumnCache {
+    ColumnIteratorUPtr _column_iterator = nullptr;
+    mutable vectorized::MutableColumnPtr _column = nullptr;
+    // used for seek_to_ordinal
+    mutable int64_t _offset = -1;
+    // used for next_batch, read_by_rowids
+    mutable int64_t _length = -1;

Review Comment:
   同上



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp:
##########
@@ -296,7 +310,8 @@ Status 
VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator,
 Status VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator,
                                          const TabletColumn* target_col,
                                          const StorageReadOptions* opt,
-                                         ColumnReaderCache* 
column_reader_cache) {
+                                         ColumnReaderCache* 
column_reader_cache,
+                                         PathToSharedColumnCacheUPtr* 
column_cache) {

Review Comment:
   rename to `column_path_data_cache`



##########
be/src/olap/rowset/segment_v2/segment_iterator.h:
##########
@@ -496,6 +496,8 @@ class SegmentIterator : public RowwiseIterator {
     std::map<ColumnId, size_t> _vir_cid_to_idx_in_block;
 
     IndexQueryContextPtr _index_query_context;
+
+    std::unordered_map<ColumnId, PathToSharedColumnCacheUPtr> _column_cache;

Review Comment:
   _column_cache-> `_column_path_data_cache`



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.h:
##########
@@ -59,7 +59,8 @@ class VariantColumnReader : public ColumnReader {
                         const StorageReadOptions* opt) override;
 
     Status new_iterator(ColumnIteratorUPtr* iterator, const TabletColumn* col,
-                        const StorageReadOptions* opt, ColumnReaderCache* 
column_reader_cache);
+                        const StorageReadOptions* opt, ColumnReaderCache* 
column_reader_cache,
+                        PathToSharedColumnCacheUPtr* column_cache = nullptr);

Review Comment:
   `column_path_data_cache`



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -700,6 +701,80 @@ class DefaultValueColumnIterator : public ColumnIterator {
     // current rowid
     ordinal_t _current_rowid = 0;
 };
+struct SharedColumnCache {
+    ColumnIteratorUPtr _column_iterator = nullptr;
+    mutable vectorized::MutableColumnPtr _column = nullptr;

Review Comment:
   struct里面变量名别用下划线`_`



##########
be/src/olap/rowset/segment_v2/segment_iterator.h:
##########
@@ -496,6 +496,8 @@ class SegmentIterator : public RowwiseIterator {
     std::map<ColumnId, size_t> _vir_cid_to_idx_in_block;
 
     IndexQueryContextPtr _index_query_context;
+
+    std::unordered_map<ColumnId, PathToSharedColumnCacheUPtr> _column_cache;

Review Comment:
   _opt里的`sparse_column_cache`可以删了



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -700,6 +701,80 @@ class DefaultValueColumnIterator : public ColumnIterator {
     // current rowid
     ordinal_t _current_rowid = 0;
 };
+struct SharedColumnCache {
+    ColumnIteratorUPtr _column_iterator = nullptr;

Review Comment:
   rename to `_sparse_column_iterator`



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -700,6 +701,80 @@ class DefaultValueColumnIterator : public ColumnIterator {
     // current rowid
     ordinal_t _current_rowid = 0;
 };
+struct SharedColumnCache {
+    ColumnIteratorUPtr _column_iterator = nullptr;
+    mutable vectorized::MutableColumnPtr _column = nullptr;
+    // used for seek_to_ordinal
+    mutable int64_t _offset = -1;
+    // used for next_batch, read_by_rowids
+    mutable int64_t _length = -1;
+    // used for read_by_rowids
+    std::unique_ptr<rowid_t[]> _rowids = nullptr;
+    bool _inited = false;
+
+    SharedColumnCache() = default;
+    SharedColumnCache(ColumnIteratorUPtr column_iterator, 
vectorized::MutableColumnPtr column)
+            : _column_iterator(std::move(column_iterator)), 
_column(std::move(column)) {}
+
+    Status init(const ColumnIteratorOptions& opts) {
+        if (_inited) {
+            return Status::OK();
+        } else {
+            _inited = true;
+            return _column_iterator->init(opts);
+        }
+    }
+
+    Status seek_to_ordinal(ordinal_t ord) const {

Review Comment:
   为什么是const的



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -700,6 +701,80 @@ class DefaultValueColumnIterator : public ColumnIterator {
     // current rowid
     ordinal_t _current_rowid = 0;
 };
+struct SharedColumnCache {
+    ColumnIteratorUPtr _column_iterator = nullptr;
+    mutable vectorized::MutableColumnPtr _column = nullptr;
+    // used for seek_to_ordinal
+    mutable int64_t _offset = -1;

Review Comment:
   -1是什么含义,注释



##########
be/src/olap/rowset/segment_v2/variant/sparse_column_extract_iterator.h:
##########
@@ -105,20 +91,27 @@ class BaseSparseColumnProcessor : public ColumnIterator {
     Status _process_batch(ReadMethod&& read_method, size_t nrows,
                           vectorized::MutableColumnPtr& dst) {
         // Cache check and population logic
-        if (has_sparse_column_cache()) {
-            _sparse_column =
-                    
_read_opts->sparse_column_cache[_col.parent_unique_id()]->assume_mutable();
-        } else {
-            _sparse_column->clear();
-            RETURN_IF_ERROR(read_method());
-
-            // cache the sparse column
-            if (_read_opts) {
-                _read_opts->sparse_column_cache[_col.parent_unique_id()] =
-                        _sparse_column->get_ptr();
-            }
-        }
-
+        // if (has_sparse_column_cache()) {
+        //     _sparse_column =
+        //             
_read_opts->sparse_column_cache[_col.parent_unique_id()]->assume_mutable();
+        // } else {
+        //     _sparse_column->clear();
+        //     {
+        //         
SCOPED_RAW_TIMER(&_read_opts->stats->variant_scan_sparse_column_timer_ns);
+        //         int64_t before_size = 
_read_opts->stats->uncompressed_bytes_read;
+        //         RETURN_IF_ERROR(read_method());
+        //         _read_opts->stats->variant_scan_sparse_column_bytes +=
+        //                 _read_opts->stats->uncompressed_bytes_read - 
before_size;
+        //     }
+
+        //     // cache the sparse column
+        //     if (_read_opts) {
+        //         _read_opts->sparse_column_cache[_col.parent_unique_id()] =
+        //                 _sparse_column->get_ptr();
+        //     }
+        // }
+        RETURN_IF_ERROR(read_method());
+        _sparse_column = _shared_column_cache->_column->assume_mutable();

Review Comment:
   `_sparse_column` 这个参数是不是可以去掉了,直接用_shared_column_cache->_column



##########
be/src/olap/rowset/segment_v2/segment_iterator.h:
##########
@@ -496,6 +496,8 @@ class SegmentIterator : public RowwiseIterator {
     std::map<ColumnId, size_t> _vir_cid_to_idx_in_block;
 
     IndexQueryContextPtr _index_query_context;
+
+    std::unordered_map<ColumnId, PathToSharedColumnCacheUPtr> _column_cache;

Review Comment:
   这不不应该是ColumnId应该是int32_t 表示unique_id,然后加上注释



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp:
##########
@@ -225,12 +225,28 @@ Status 
VariantColumnReader::_new_default_iter_with_same_nested(
     return Status::OK();
 }
 
-Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIteratorUPtr* 
iterator,
-                                                           const TabletColumn& 
target_col,
-                                                           const 
StorageReadOptions* opts,
-                                                           bool 
exceeded_sparse_column_limit,
-                                                           bool 
existed_in_sparse_column,
-                                                           ColumnReaderCache* 
column_reader_cache) {
+Result<SharedColumnCacheSPtr> VariantColumnReader::_get_shared_column_cache(
+        PathToSharedColumnCacheUPtr* column_cache, const std::string& path) {
+    if (!column_cache || (*column_cache)->find(path) == 
(*column_cache)->end()) {
+        ColumnIteratorUPtr inner_iter;
+        
RETURN_IF_ERROR_RESULT(_sparse_column_reader->new_iterator(&inner_iter, 
nullptr));
+        vectorized::MutableColumnPtr sparse_column =
+                vectorized::ColumnVariant::create_sparse_column_fn();
+        auto shared_column_cache = 
std::make_shared<SharedColumnCache>(std::move(inner_iter),
+                                                                       
std::move(sparse_column));
+        if (column_cache) {
+            (*column_cache)->emplace(path, shared_column_cache);
+        }
+        return shared_column_cache;
+    }
+    return (*column_cache)->at(path);
+}
+
+Status VariantColumnReader::_new_iterator_with_flat_leaves(
+        ColumnIteratorUPtr* iterator, const TabletColumn& target_col,
+        const StorageReadOptions* opts, bool exceeded_sparse_column_limit,
+        bool existed_in_sparse_column, ColumnReaderCache* column_reader_cache,
+        PathToSharedColumnCacheUPtr* column_cache) {

Review Comment:
   这个应该叫`column_path_data_cache`



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp:
##########
@@ -225,12 +225,28 @@ Status 
VariantColumnReader::_new_default_iter_with_same_nested(
     return Status::OK();
 }
 
-Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIteratorUPtr* 
iterator,
-                                                           const TabletColumn& 
target_col,
-                                                           const 
StorageReadOptions* opts,
-                                                           bool 
exceeded_sparse_column_limit,
-                                                           bool 
existed_in_sparse_column,
-                                                           ColumnReaderCache* 
column_reader_cache) {
+Result<SharedColumnCacheSPtr> VariantColumnReader::_get_shared_column_cache(
+        PathToSharedColumnCacheUPtr* column_cache, const std::string& path) {
+    if (!column_cache || (*column_cache)->find(path) == 
(*column_cache)->end()) {

Review Comment:
   `(*column_cache)->find(path) == (*column_cache)->end()` 
->`!(*column_cache)->contains(path)`



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.h:
##########
@@ -100,7 +101,8 @@ class VariantColumnReader : public ColumnReader {
                                           const StorageReadOptions* opts,
                                           bool exceeded_sparse_column_limit,
                                           bool existed_in_sparse_column,
-                                          ColumnReaderCache* 
column_reader_cache);
+                                          ColumnReaderCache* 
column_reader_cache,

Review Comment:
   column_path_data_cache



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp:
##########
@@ -225,12 +225,28 @@ Status 
VariantColumnReader::_new_default_iter_with_same_nested(
     return Status::OK();
 }
 
-Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIteratorUPtr* 
iterator,
-                                                           const TabletColumn& 
target_col,
-                                                           const 
StorageReadOptions* opts,
-                                                           bool 
exceeded_sparse_column_limit,
-                                                           bool 
existed_in_sparse_column,
-                                                           ColumnReaderCache* 
column_reader_cache) {
+Result<SharedColumnCacheSPtr> VariantColumnReader::_get_shared_column_cache(
+        PathToSharedColumnCacheUPtr* column_cache, const std::string& path) {
+    if (!column_cache || (*column_cache)->find(path) == 
(*column_cache)->end()) {

Review Comment:
   `column_cache` -> `column_path_data_cache`



##########
be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp:
##########
@@ -225,12 +225,28 @@ Status 
VariantColumnReader::_new_default_iter_with_same_nested(
     return Status::OK();
 }
 
-Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIteratorUPtr* 
iterator,
-                                                           const TabletColumn& 
target_col,
-                                                           const 
StorageReadOptions* opts,
-                                                           bool 
exceeded_sparse_column_limit,
-                                                           bool 
existed_in_sparse_column,
-                                                           ColumnReaderCache* 
column_reader_cache) {
+Result<SharedColumnCacheSPtr> VariantColumnReader::_get_shared_column_cache(
+        PathToSharedColumnCacheUPtr* column_cache, const std::string& path) {
+    if (!column_cache || (*column_cache)->find(path) == 
(*column_cache)->end()) {
+        ColumnIteratorUPtr inner_iter;
+        
RETURN_IF_ERROR_RESULT(_sparse_column_reader->new_iterator(&inner_iter, 
nullptr));
+        vectorized::MutableColumnPtr sparse_column =
+                vectorized::ColumnVariant::create_sparse_column_fn();
+        auto shared_column_cache = 
std::make_shared<SharedColumnCache>(std::move(inner_iter),
+                                                                       
std::move(sparse_column));
+        if (column_cache) {
+            (*column_cache)->emplace(path, shared_column_cache);

Review Comment:
   new_column_iterator会被并发访问, 这个`column_cache` 可能不是线程安全的



##########
be/src/olap/rowset/segment_v2/segment.h:
##########
@@ -110,12 +110,10 @@ class Segment : public 
std::enable_shared_from_this<Segment>, public MetadataAdd
 
     uint32_t num_rows() const { return _num_rows; }
 
-    Status new_column_iterator(const TabletColumn& tablet_column,
-                               std::unique_ptr<ColumnIterator>* iter,
-                               const StorageReadOptions* opt);
-
-    Status new_column_iterator(int32_t unique_id, const StorageReadOptions* 
opt,
-                               std::unique_ptr<ColumnIterator>* iter);
+    Status new_column_iterator(
+            const TabletColumn& tablet_column, 
std::unique_ptr<ColumnIterator>* iter,
+            const StorageReadOptions* opt,
+            std::unordered_map<ColumnId, PathToSharedColumnCacheUPtr>* 
column_cache = nullptr);

Review Comment:
   `column_cache`这里有必要是指针吗



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to