Re: [PR] [Feature](Row store) support column group with store row format for partial columns of table [doris]

via GitHub Sun, 12 May 2024 21:17:09 -0700


xiaokang commented on code in PR #34089:
URL: https://github.com/apache/doris/pull/34089#discussion_r1597834122



##########
be/src/olap/tablet_schema.h:
##########
@@ -213,6 +215,10 @@ class TabletColumn {
     // Use shared_ptr for reuse and reducing column memory usage
     std::vector<TabletColumnPtr> _sparse_cols;
     size_t _num_sparse_columns = 0;
+
+    // groups of this column belongs to
+    // contains column ids of group, and encode the group of columns into row 
store
+    std::vector<int32_t> _group_ids;

Review Comment:
   _row_store_group_ids



##########
be/src/olap/rowset/segment_v2/segment_writer.cpp:
##########
@@ -800,9 +792,8 @@ Status SegmentWriter::append_block(const vectorized::Block* 
block, size_t row_po
             << ", _column_writers.size()=" << _column_writers.size();
     // Row column should be filled here when it's a directly write from 
memtable
     // or it's schema change write(since column data type maybe changed, so we 
should reubild)
-    if (_tablet_schema->store_row_column() &&
-        (_opts.write_type == DataWriteType::TYPE_DIRECT ||
-         _opts.write_type == DataWriteType::TYPE_SCHEMA_CHANGE)) {
+    if (_opts.write_type == DataWriteType::TYPE_DIRECT ||

Review Comment:
   check partial



##########
be/src/service/point_query_executor.cpp:
##########
@@ -47,16 +52,103 @@
 #include "vec/data_types/serde/data_type_serde.h"
 #include "vec/exprs/vexpr.h"
 #include "vec/exprs/vexpr_context.h"
+#include "vec/exprs/vexpr_fwd.h"
+#include "vec/exprs/vslot_ref.h"
 #include "vec/jsonb/serialize.h"
 #include "vec/sink/vmysql_result_writer.cpp"
 #include "vec/sink/vmysql_result_writer.h"
 
 namespace doris {
 
-Reusable::~Reusable() {}
+Reusable::~Reusable() = default;
+
+// Function to find the best matching row store column from the input tuple 
descriptor
+// It returns the row store column's unique id with column groups that matches 
the most slots from the input tuple descriptor.
+// If no match return -1
+static int pick_row_store(const TabletSchema& schema, const 
std::vector<SlotDescriptor*>& slots) {
+    std::unordered_set<int> input_slots_cid_set;
+    for (auto* slot : slots) {
+        input_slots_cid_set.insert(slot->col_unique_id());
+    }
+    int max_match = 0;
+    int result_col_unique_id = -1;
+    int smallest_size = INT_MAX;
+
+    for (const auto& col : schema.columns()) {
+        if (!col->is_row_store_column()) {
+            continue;
+        }
+        int current_match = 0;
+        // The full column group is considered a full match.
+        if (col->group_ids().empty()) {
+            current_match = input_slots_cid_set.size();
+        }
+        // Count how many elements in the current column group are in the 
input tuple slots.
+        for (int num : col->group_ids()) {
+            if (input_slots_cid_set.contains(num)) {
+                current_match++;
+            }
+        }
+
+        // Update the best matching set if the current set has more matches,
+        // or if it has the same number of matches but is smaller.
+        // The smaller set is preferred because it needs read less data.
+        if (current_match > 0 &&
+            (current_match > max_match ||
+             (current_match == max_match && col->group_ids().size() < 
smallest_size))) {
+            max_match = current_match;
+            result_col_unique_id = col->unique_id();
+            smallest_size = col->group_ids().empty() ? INT_MAX : 
col->group_ids().size();
+        }
+    }
+    return result_col_unique_id;
+}
+
+static void get_missing_and_include_cids(const TabletSchema& schema,
+                                         const std::vector<SlotDescriptor*>& 
slots,
+                                         int target_rs_column_id,
+                                         std::unordered_set<int>& missing_cids,
+                                         std::unordered_set<int>& 
include_cids) {
+    missing_cids.clear();
+    include_cids.clear();
+    for (auto* slot : slots) {
+        missing_cids.insert(slot->col_unique_id());
+    }
+    if (target_rs_column_id == -1) {
+        // no row store columns
+        return;
+    }
+    const TabletColumn& target_rs_column = 
schema.column_by_uid(target_rs_column_id);
+    DCHECK(target_rs_column.is_row_store_column());
+    // The full column group is considered a full match, thus no missing cids
+    if (target_rs_column.group_ids().empty()) {

Review Comment:
   consider that if it's right after light schema chage



##########
be/src/olap/rowset/segment_v2/segment_writer.cpp:
##########
@@ -576,11 +571,8 @@ Status 
SegmentWriter::append_block_with_partial_content(const vectorized::Block*
     RETURN_IF_ERROR(fill_missing_columns(mutable_full_columns, 
use_default_or_null_flag,
                                          has_default_or_nullable, 
segment_start_pos));
     full_block.set_columns(std::move(mutable_full_columns));
-    // row column should be filled here
-    if (_tablet_schema->store_row_column()) {
-        // convert block to row store format
-        _serialize_block_to_row_column(full_block);
-    }
+    // convert block to row store format
+    _serialize_block_to_row_column(full_block);

Review Comment:
   check partial row store



##########
be/src/vec/jsonb/serialize.h:
##########
@@ -34,15 +36,20 @@ namespace doris::vectorized {
 // use jsonb codec to store row format
 class JsonbSerializeUtil {
 public:
+    // encode partial columns into jsonb
+    // empty group_cids means encode full schema columns for compability
     static void block_to_jsonb(const TabletSchema& schema, const Block& block, 
ColumnString& dst,
-                               int num_cols, const DataTypeSerDeSPtrs& serdes);
+                               int num_cols, const DataTypeSerDeSPtrs& serdes,
+                               const std::unordered_set<int32_t>& group_cids);

Review Comment:
   consistent name include_cids is better



##########
be/src/olap/base_tablet.cpp:
##########
@@ -909,7 +908,7 @@ Status 
BaseTablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset,
         serdes[i] = type->get_serde();
     }
     vectorized::JsonbSerializeUtil::jsonb_to_block(serdes, *string_column, 
col_uid_to_idx, block,
-                                                   default_values);
+                                                   default_values, {});

Review Comment:
   Do you mean use all columns from row column? Can it be optimized to only 
copy needed columns?



##########
be/src/olap/tablet_schema.h:
##########
@@ -323,8 +329,13 @@ class TabletSchema {
         _enable_single_replica_compaction = enable_single_replica_compaction;
     }
     bool enable_single_replica_compaction() const { return 
_enable_single_replica_compaction; }
-    void set_store_row_column(bool store_row_column) { _store_row_column = 
store_row_column; }
-    bool store_row_column() const { return _store_row_column; }
+    // indicate if full row store column(all the columns encodes as row) exists
+    bool has_full_row_store_column() const { return _store_row_column; }

Review Comment:
   consistent name is better



##########
be/src/exec/rowid_fetcher.cpp:
##########
@@ -405,12 +405,14 @@ Status RowIdStorageReader::read_by_rowids(const 
PMultiGetRequest& request,
                                         row_loc.segment_id(), 
row_loc.ordinal_id());
         // fetch by row store, more effcient way
         if (request.fetch_row_store()) {
-            CHECK(tablet->tablet_schema()->store_row_column());
+            CHECK(tablet->tablet_schema()->has_full_row_store_column());
             RowLocation loc(rowset_id, segment->id(), row_loc.ordinal_id());
             string* value = response->add_binary_row_data();
             RETURN_IF_ERROR(scope_timer_run(
                     [&]() {
-                        return tablet->lookup_row_data({}, loc, rowset, &desc, 
stats, *value);
+                        return tablet->lookup_row_data(
+                                {}, loc, rowset, &desc, stats, *value,
+                                
tablet->tablet_schema()->column(BeConsts::ROW_STORE_COL));

Review Comment:
   Why not specific row column xxx?



##########
be/src/service/point_query_executor.h:
##########
@@ -103,6 +111,12 @@ class Reusable {
     vectorized::DataTypeSerDeSPtrs _data_type_serdes;
     std::unordered_map<uint32_t, uint32_t> _col_uid_to_idx;
     std::vector<std::string> _col_default_values;
+    // picked rowstore(column group) column unique id
+    int32_t _rs_column_uid;

Review Comment:
   what's the meaning of name rs?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] [Feature](Row store) support column group with store row format for partial columns of table [doris]

Reply via email to