xiaokang commented on code in PR #26749: URL: https://github.com/apache/doris/pull/26749#discussion_r1403825647
########## be/src/vec/exec/scan/new_olap_scan_node.cpp: ########## @@ -414,6 +419,49 @@ std::string NewOlapScanNode::get_name() { return fmt::format("VNewOlapScanNode({0})", _olap_scan_node.table_name); } +void NewOlapScanNode::_filter_and_collect_cast_type_for_variant( + const VExpr* expr, + phmap::flat_hash_map<std::string, std::vector<PrimitiveType>>& colname_to_cast_types) { + auto* cast_expr = dynamic_cast<const VCastExpr*>(expr); + if (cast_expr != nullptr) { + auto* src_slot = cast_expr->get_child(0)->node_type() == TExprNodeType::SLOT_REF + ? dynamic_cast<const VSlotRef*>(cast_expr->get_child(0).get()) + : nullptr; + if (src_slot == nullptr) { + return; + } + std::vector<SlotDescriptor*> slots = _output_tuple_desc->slots(); + SlotDescriptor* src_slot_desc = slots[_slot_id_to_slot_idx[src_slot->slot_id()]]; + PrimitiveType cast_dst_type = + cast_expr->get_target_type()->get_type_as_type_descriptor().type; + if (src_slot_desc->type().is_variant_type()) { + colname_to_cast_types[src_slot_desc->col_name()].push_back(cast_dst_type); + } + } + for (const auto& child : expr->children()) { + _filter_and_collect_cast_type_for_variant(child.get(), colname_to_cast_types); + } +} + +void NewOlapScanNode::get_cast_types_for_variants() { + phmap::flat_hash_map<std::string, std::vector<PrimitiveType>> colname_to_cast_types; + for (auto it = _conjuncts.begin(); it != _conjuncts.end();) { + auto& conjunct = *it; + if (conjunct->root()) { + _filter_and_collect_cast_type_for_variant(conjunct->root().get(), + colname_to_cast_types); + } + ++it; + } + // cast to one certain type for variant could utilize fully predicates performance + // when storage layer type equals to cast type + for (const auto& [slotid, types] : colname_to_cast_types) { + if (types.size() == 1) { + _cast_types_for_variants[slotid] = types[0]; + } Review Comment: What about types.size() > 1? ########## be/src/vec/exec/scan/new_olap_scanner.cpp: ########## @@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params( return Status::OK(); } +vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) { + PathInDataBuilder path_builder; + const std::string& col_name = slot->col_name_lower_case(); + auto delimeter_index = col_name.find("."); + std::string_view root_name = delimeter_index == std::string::npos + ? col_name + : std::string_view(col_name.data(), delimeter_index); + path_builder = path_builder.append(root_name, false); + for (const std::string& path : slot->column_paths()) { + path_builder.append(path, false); + } + return path_builder.build(); +} + +Status NewOlapScanner::_init_variant_columns() { + auto& tablet_schema = _tablet_reader_params.tablet_schema; + // Parent column has path info to distinction from each other + for (auto slot : _output_tuple_desc->slots()) { + if (!slot->is_materialized()) { + continue; + } + if (!slot->need_materialize()) { + continue; + } + if (slot->type().is_variant_type()) { + // Such columns are not exist in frontend schema info, so we need to + // add them into tablet_schema for later column indexing. + TabletColumn subcol; + subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcol.set_is_nullable(true); + subcol.set_unique_id(slot->col_unique_id()); + subcol.set_parent_unique_id(slot->col_unique_id()); + PathInData path = _build_path(slot); + subcol.set_path_info(path); + subcol.set_name(path.get_path()); + if (tablet_schema->field_index(path) < 0) { + tablet_schema->append_column(subcol, TabletSchema::ColumnType::VARIANT); + } + } else if (!slot->column_paths().empty()) { Review Comment: Why check both slot->type().is_variant_type() and !slot->column_paths().empty() ? ########## be/src/vec/exec/scan/vscan_node.h: ########## @@ -271,10 +276,15 @@ class VScanNode : public ExecNode, public RuntimeFilterConsumer { // Save all function predicates which may be pushed down to data source. std::vector<FunctionFilter> _push_down_functions; + // colname -> cast dst type + std::map<std::string, PrimitiveType> _cast_types_for_variants; + // slot id -> ColumnValueRange // Parsed from conjuncts phmap::flat_hash_map<int, std::pair<SlotDescriptor*, ColumnValueRangeType>> _slot_id_to_value_range; + // slot id -> slot idx in TupleDescriptor + phmap::flat_hash_map<int, int> _slot_id_to_slot_idx; Review Comment: It's always used as slots[_slot_id_to_slot_idx[slotid]]. So it's better to change it to map<int, SlotDescriptor*> _slot_id_to_slot . ########## be/src/vec/exec/scan/new_olap_scan_node.cpp: ########## @@ -414,6 +419,49 @@ std::string NewOlapScanNode::get_name() { return fmt::format("VNewOlapScanNode({0})", _olap_scan_node.table_name); } +void NewOlapScanNode::_filter_and_collect_cast_type_for_variant( + const VExpr* expr, + phmap::flat_hash_map<std::string, std::vector<PrimitiveType>>& colname_to_cast_types) { + auto* cast_expr = dynamic_cast<const VCastExpr*>(expr); + if (cast_expr != nullptr) { + auto* src_slot = cast_expr->get_child(0)->node_type() == TExprNodeType::SLOT_REF + ? dynamic_cast<const VSlotRef*>(cast_expr->get_child(0).get()) + : nullptr; + if (src_slot == nullptr) { + return; + } + std::vector<SlotDescriptor*> slots = _output_tuple_desc->slots(); + SlotDescriptor* src_slot_desc = slots[_slot_id_to_slot_idx[src_slot->slot_id()]]; + PrimitiveType cast_dst_type = + cast_expr->get_target_type()->get_type_as_type_descriptor().type; + if (src_slot_desc->type().is_variant_type()) { + colname_to_cast_types[src_slot_desc->col_name()].push_back(cast_dst_type); + } + } + for (const auto& child : expr->children()) { + _filter_and_collect_cast_type_for_variant(child.get(), colname_to_cast_types); + } +} + +void NewOlapScanNode::get_cast_types_for_variants() { + phmap::flat_hash_map<std::string, std::vector<PrimitiveType>> colname_to_cast_types; + for (auto it = _conjuncts.begin(); it != _conjuncts.end();) { + auto& conjunct = *it; + if (conjunct->root()) { + _filter_and_collect_cast_type_for_variant(conjunct->root().get(), + colname_to_cast_types); + } + ++it; + } + // cast to one certain type for variant could utilize fully predicates performance + // when storage layer type equals to cast type + for (const auto& [slotid, types] : colname_to_cast_types) { Review Comment: slotid -> colname ########## be/src/vec/exec/scan/scanner_scheduler.cpp: ########## @@ -414,6 +413,7 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler, ScannerContext LOG(WARNING) << "Scan thread read VScanner failed: " << status.to_string(); break; } + VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << eos; Review Comment: why change it? ########## fe/fe-core/src/main/java/org/apache/doris/analysis/Analyzer.java: ########## @@ -1004,11 +1011,49 @@ public SlotDescriptor registerColumnRef(TableName tblName, String colName) throw newTblName == null ? d.getTable().getName() : newTblName.toString()); } + LOG.debug("register column ref table {}, colName {}, col {}", tblName, colName, col.toSql()); + if (col.getType().isVariantType() || (subColNames != null && !subColNames.isEmpty())) { + if (!col.getType().isVariantType()) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_ILLEGAL_COLUMN_REFERENCE_ERROR, + Joiner.on(".").join(tblName.getTbl(), colName)); + } + if (subColNames == null) { + // Root + subColNames = new ArrayList<String>(); + } + String key = d.getAlias() + "." + col.getName(); + if (subColumnSlotRefMap.get(key) == null) { + subColumnSlotRefMap.put(key, Maps.newTreeMap( + new Comparator<List<String>>() { + public int compare(List<String> lst1, List<String> lst2) { + String str1 = String.join(".", lst1); + String str2 = String.join(".", lst2); + return str1.compareTo(str2); + } + })); + } + SlotDescriptor result = subColumnSlotRefMap.get(key).get(subColNames); + if (result != null) { + // avoid duplicate slots + return result; + } + result = globalState.descTbl.addSlotDescriptor(d); + LOG.debug("register slot descriptor {}", result); + result.setSubColLables(subColNames); + result.setColumn(col); + if (!subColNames.isEmpty()) { + result.setMaterializedColumnName(col.getName() + "." + String.join(".", subColNames)); + } + result.setIsMaterialized(true); + result.setIsNullable(col.isAllowNull()); + subColumnSlotRefMap.get(key).put(subColNames, result); Review Comment: can we reuse slotRefMap and use key v.a.b.c ########## be/src/vec/exec/scan/vscan_node.cpp: ########## @@ -78,6 +80,10 @@ static bool ignore_cast(SlotDescriptor* slot, VExpr* expr) { if (slot->type().is_string_type() && expr->type().is_string_type()) { return true; } + // Variant slot cast could be eliminated + if (slot->type().is_variant_type()) { + return true; Review Comment: Do you need to check the expr type? ########## be/src/vec/exec/scan/new_olap_scanner.cpp: ########## @@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params( return Status::OK(); } +vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) { + PathInDataBuilder path_builder; + const std::string& col_name = slot->col_name_lower_case(); + auto delimeter_index = col_name.find("."); + std::string_view root_name = delimeter_index == std::string::npos + ? col_name + : std::string_view(col_name.data(), delimeter_index); + path_builder = path_builder.append(root_name, false); + for (const std::string& path : slot->column_paths()) { + path_builder.append(path, false); + } + return path_builder.build(); +} + +Status NewOlapScanner::_init_variant_columns() { + auto& tablet_schema = _tablet_reader_params.tablet_schema; + // Parent column has path info to distinction from each other + for (auto slot : _output_tuple_desc->slots()) { + if (!slot->is_materialized()) { + continue; + } + if (!slot->need_materialize()) { + continue; + } + if (slot->type().is_variant_type()) { + // Such columns are not exist in frontend schema info, so we need to + // add them into tablet_schema for later column indexing. + TabletColumn subcol; + subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcol.set_is_nullable(true); + subcol.set_unique_id(slot->col_unique_id()); Review Comment: Use the same unique id as parent? ########## be/src/vec/exec/scan/new_olap_scanner.cpp: ########## @@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params( return Status::OK(); } +vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) { + PathInDataBuilder path_builder; + const std::string& col_name = slot->col_name_lower_case(); + auto delimeter_index = col_name.find("."); Review Comment: It's not safe to determin root_name by "." for escaped name `a.b` ########## be/src/vec/exec/scan/new_olap_scanner.cpp: ########## @@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params( return Status::OK(); } +vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) { + PathInDataBuilder path_builder; + const std::string& col_name = slot->col_name_lower_case(); + auto delimeter_index = col_name.find("."); + std::string_view root_name = delimeter_index == std::string::npos + ? col_name + : std::string_view(col_name.data(), delimeter_index); + path_builder = path_builder.append(root_name, false); + for (const std::string& path : slot->column_paths()) { Review Comment: Does the slot data like this? slot->col_name 'v.a.b.c' slot->column_paths [a, b, c] ########## fe/fe-core/src/main/java/org/apache/doris/analysis/SlotDescriptor.java: ########## @@ -44,6 +44,9 @@ public class SlotDescriptor { // for SlotRef.toSql() in the absence of a path private String label; + // for variant column's sub column lables + private List<String> subColLabels; + private String materializedColumnnName; Review Comment: materializedColumnnName -> materializedColumnName ```suggestion private String materializedColumnName; ``` ########## be/src/vec/exec/scan/new_olap_scanner.cpp: ########## @@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params( return Status::OK(); } +vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) { + PathInDataBuilder path_builder; + const std::string& col_name = slot->col_name_lower_case(); + auto delimeter_index = col_name.find("."); + std::string_view root_name = delimeter_index == std::string::npos + ? col_name + : std::string_view(col_name.data(), delimeter_index); + path_builder = path_builder.append(root_name, false); + for (const std::string& path : slot->column_paths()) { + path_builder.append(path, false); + } + return path_builder.build(); +} + +Status NewOlapScanner::_init_variant_columns() { + auto& tablet_schema = _tablet_reader_params.tablet_schema; + // Parent column has path info to distinction from each other + for (auto slot : _output_tuple_desc->slots()) { + if (!slot->is_materialized()) { + continue; + } + if (!slot->need_materialize()) { + continue; + } + if (slot->type().is_variant_type()) { + // Such columns are not exist in frontend schema info, so we need to + // add them into tablet_schema for later column indexing. + TabletColumn subcol; + subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcol.set_is_nullable(true); + subcol.set_unique_id(slot->col_unique_id()); + subcol.set_parent_unique_id(slot->col_unique_id()); + PathInData path = _build_path(slot); + subcol.set_path_info(path); + subcol.set_name(path.get_path()); + if (tablet_schema->field_index(path) < 0) { + tablet_schema->append_column(subcol, TabletSchema::ColumnType::VARIANT); + } + } else if (!slot->column_paths().empty()) { + // Extracted materialized columns update it's path info + PathInData path = _build_path(slot); + int index = tablet_schema->field_index(slot->col_unique_id()); Review Comment: Is any chance slot->col_unique_id() is not in tablet_schema? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org