xiaokang commented on code in PR #34089: URL: https://github.com/apache/doris/pull/34089#discussion_r1597834122
########## be/src/olap/tablet_schema.h: ########## @@ -213,6 +215,10 @@ class TabletColumn { // Use shared_ptr for reuse and reducing column memory usage std::vector<TabletColumnPtr> _sparse_cols; size_t _num_sparse_columns = 0; + + // groups of this column belongs to + // contains column ids of group, and encode the group of columns into row store + std::vector<int32_t> _group_ids; Review Comment: _row_store_group_ids ########## be/src/olap/rowset/segment_v2/segment_writer.cpp: ########## @@ -800,9 +792,8 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po << ", _column_writers.size()=" << _column_writers.size(); // Row column should be filled here when it's a directly write from memtable // or it's schema change write(since column data type maybe changed, so we should reubild) - if (_tablet_schema->store_row_column() && - (_opts.write_type == DataWriteType::TYPE_DIRECT || - _opts.write_type == DataWriteType::TYPE_SCHEMA_CHANGE)) { + if (_opts.write_type == DataWriteType::TYPE_DIRECT || Review Comment: check partial ########## be/src/service/point_query_executor.cpp: ########## @@ -47,16 +52,103 @@ #include "vec/data_types/serde/data_type_serde.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" +#include "vec/exprs/vexpr_fwd.h" +#include "vec/exprs/vslot_ref.h" #include "vec/jsonb/serialize.h" #include "vec/sink/vmysql_result_writer.cpp" #include "vec/sink/vmysql_result_writer.h" namespace doris { -Reusable::~Reusable() {} +Reusable::~Reusable() = default; + +// Function to find the best matching row store column from the input tuple descriptor +// It returns the row store column's unique id with column groups that matches the most slots from the input tuple descriptor. +// If no match return -1 +static int pick_row_store(const TabletSchema& schema, const std::vector<SlotDescriptor*>& slots) { + std::unordered_set<int> input_slots_cid_set; + for (auto* slot : slots) { + input_slots_cid_set.insert(slot->col_unique_id()); + } + int max_match = 0; + int result_col_unique_id = -1; + int smallest_size = INT_MAX; + + for (const auto& col : schema.columns()) { + if (!col->is_row_store_column()) { + continue; + } + int current_match = 0; + // The full column group is considered a full match. + if (col->group_ids().empty()) { + current_match = input_slots_cid_set.size(); + } + // Count how many elements in the current column group are in the input tuple slots. + for (int num : col->group_ids()) { + if (input_slots_cid_set.contains(num)) { + current_match++; + } + } + + // Update the best matching set if the current set has more matches, + // or if it has the same number of matches but is smaller. + // The smaller set is preferred because it needs read less data. + if (current_match > 0 && + (current_match > max_match || + (current_match == max_match && col->group_ids().size() < smallest_size))) { + max_match = current_match; + result_col_unique_id = col->unique_id(); + smallest_size = col->group_ids().empty() ? INT_MAX : col->group_ids().size(); + } + } + return result_col_unique_id; +} + +static void get_missing_and_include_cids(const TabletSchema& schema, + const std::vector<SlotDescriptor*>& slots, + int target_rs_column_id, + std::unordered_set<int>& missing_cids, + std::unordered_set<int>& include_cids) { + missing_cids.clear(); + include_cids.clear(); + for (auto* slot : slots) { + missing_cids.insert(slot->col_unique_id()); + } + if (target_rs_column_id == -1) { + // no row store columns + return; + } + const TabletColumn& target_rs_column = schema.column_by_uid(target_rs_column_id); + DCHECK(target_rs_column.is_row_store_column()); + // The full column group is considered a full match, thus no missing cids + if (target_rs_column.group_ids().empty()) { Review Comment: consider that if it's right after light schema chage ########## be/src/olap/rowset/segment_v2/segment_writer.cpp: ########## @@ -576,11 +571,8 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* RETURN_IF_ERROR(fill_missing_columns(mutable_full_columns, use_default_or_null_flag, has_default_or_nullable, segment_start_pos)); full_block.set_columns(std::move(mutable_full_columns)); - // row column should be filled here - if (_tablet_schema->store_row_column()) { - // convert block to row store format - _serialize_block_to_row_column(full_block); - } + // convert block to row store format + _serialize_block_to_row_column(full_block); Review Comment: check partial row store ########## be/src/vec/jsonb/serialize.h: ########## @@ -34,15 +36,20 @@ namespace doris::vectorized { // use jsonb codec to store row format class JsonbSerializeUtil { public: + // encode partial columns into jsonb + // empty group_cids means encode full schema columns for compability static void block_to_jsonb(const TabletSchema& schema, const Block& block, ColumnString& dst, - int num_cols, const DataTypeSerDeSPtrs& serdes); + int num_cols, const DataTypeSerDeSPtrs& serdes, + const std::unordered_set<int32_t>& group_cids); Review Comment: consistent name include_cids is better ########## be/src/olap/base_tablet.cpp: ########## @@ -909,7 +908,7 @@ Status BaseTablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset, serdes[i] = type->get_serde(); } vectorized::JsonbSerializeUtil::jsonb_to_block(serdes, *string_column, col_uid_to_idx, block, - default_values); + default_values, {}); Review Comment: Do you mean use all columns from row column? Can it be optimized to only copy needed columns? ########## be/src/olap/tablet_schema.h: ########## @@ -323,8 +329,13 @@ class TabletSchema { _enable_single_replica_compaction = enable_single_replica_compaction; } bool enable_single_replica_compaction() const { return _enable_single_replica_compaction; } - void set_store_row_column(bool store_row_column) { _store_row_column = store_row_column; } - bool store_row_column() const { return _store_row_column; } + // indicate if full row store column(all the columns encodes as row) exists + bool has_full_row_store_column() const { return _store_row_column; } Review Comment: consistent name is better ########## be/src/exec/rowid_fetcher.cpp: ########## @@ -405,12 +405,14 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequest& request, row_loc.segment_id(), row_loc.ordinal_id()); // fetch by row store, more effcient way if (request.fetch_row_store()) { - CHECK(tablet->tablet_schema()->store_row_column()); + CHECK(tablet->tablet_schema()->has_full_row_store_column()); RowLocation loc(rowset_id, segment->id(), row_loc.ordinal_id()); string* value = response->add_binary_row_data(); RETURN_IF_ERROR(scope_timer_run( [&]() { - return tablet->lookup_row_data({}, loc, rowset, &desc, stats, *value); + return tablet->lookup_row_data( + {}, loc, rowset, &desc, stats, *value, + tablet->tablet_schema()->column(BeConsts::ROW_STORE_COL)); Review Comment: Why not specific row column xxx? ########## be/src/service/point_query_executor.h: ########## @@ -103,6 +111,12 @@ class Reusable { vectorized::DataTypeSerDeSPtrs _data_type_serdes; std::unordered_map<uint32_t, uint32_t> _col_uid_to_idx; std::vector<std::string> _col_default_values; + // picked rowstore(column group) column unique id + int32_t _rs_column_uid; Review Comment: what's the meaning of name rs? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org