xiaokang commented on code in PR #26749:
URL: https://github.com/apache/doris/pull/26749#discussion_r1403825647


##########
be/src/vec/exec/scan/new_olap_scan_node.cpp:
##########
@@ -414,6 +419,49 @@ std::string NewOlapScanNode::get_name() {
     return fmt::format("VNewOlapScanNode({0})", _olap_scan_node.table_name);
 }
 
+void NewOlapScanNode::_filter_and_collect_cast_type_for_variant(
+        const VExpr* expr,
+        phmap::flat_hash_map<std::string, std::vector<PrimitiveType>>& 
colname_to_cast_types) {
+    auto* cast_expr = dynamic_cast<const VCastExpr*>(expr);
+    if (cast_expr != nullptr) {
+        auto* src_slot = cast_expr->get_child(0)->node_type() == 
TExprNodeType::SLOT_REF
+                                 ? dynamic_cast<const 
VSlotRef*>(cast_expr->get_child(0).get())
+                                 : nullptr;
+        if (src_slot == nullptr) {
+            return;
+        }
+        std::vector<SlotDescriptor*> slots = _output_tuple_desc->slots();
+        SlotDescriptor* src_slot_desc = 
slots[_slot_id_to_slot_idx[src_slot->slot_id()]];
+        PrimitiveType cast_dst_type =
+                
cast_expr->get_target_type()->get_type_as_type_descriptor().type;
+        if (src_slot_desc->type().is_variant_type()) {
+            
colname_to_cast_types[src_slot_desc->col_name()].push_back(cast_dst_type);
+        }
+    }
+    for (const auto& child : expr->children()) {
+        _filter_and_collect_cast_type_for_variant(child.get(), 
colname_to_cast_types);
+    }
+}
+
+void NewOlapScanNode::get_cast_types_for_variants() {
+    phmap::flat_hash_map<std::string, std::vector<PrimitiveType>> 
colname_to_cast_types;
+    for (auto it = _conjuncts.begin(); it != _conjuncts.end();) {
+        auto& conjunct = *it;
+        if (conjunct->root()) {
+            _filter_and_collect_cast_type_for_variant(conjunct->root().get(),
+                                                      colname_to_cast_types);
+        }
+        ++it;
+    }
+    // cast to one certain type for variant could utilize fully predicates 
performance
+    // when storage layer type equals to cast type
+    for (const auto& [slotid, types] : colname_to_cast_types) {
+        if (types.size() == 1) {
+            _cast_types_for_variants[slotid] = types[0];
+        }

Review Comment:
   What about types.size() > 1?



##########
be/src/vec/exec/scan/new_olap_scanner.cpp:
##########
@@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params(
     return Status::OK();
 }
 
+vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) {
+    PathInDataBuilder path_builder;
+    const std::string& col_name = slot->col_name_lower_case();
+    auto delimeter_index = col_name.find(".");
+    std::string_view root_name = delimeter_index == std::string::npos
+                                         ? col_name
+                                         : std::string_view(col_name.data(), 
delimeter_index);
+    path_builder = path_builder.append(root_name, false);
+    for (const std::string& path : slot->column_paths()) {
+        path_builder.append(path, false);
+    }
+    return path_builder.build();
+}
+
+Status NewOlapScanner::_init_variant_columns() {
+    auto& tablet_schema = _tablet_reader_params.tablet_schema;
+    // Parent column has path info to distinction from each other
+    for (auto slot : _output_tuple_desc->slots()) {
+        if (!slot->is_materialized()) {
+            continue;
+        }
+        if (!slot->need_materialize()) {
+            continue;
+        }
+        if (slot->type().is_variant_type()) {
+            // Such columns are not exist in frontend schema info, so we need 
to
+            // add them into tablet_schema for later column indexing.
+            TabletColumn subcol;
+            subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+            subcol.set_is_nullable(true);
+            subcol.set_unique_id(slot->col_unique_id());
+            subcol.set_parent_unique_id(slot->col_unique_id());
+            PathInData path = _build_path(slot);
+            subcol.set_path_info(path);
+            subcol.set_name(path.get_path());
+            if (tablet_schema->field_index(path) < 0) {
+                tablet_schema->append_column(subcol, 
TabletSchema::ColumnType::VARIANT);
+            }
+        } else if (!slot->column_paths().empty()) {

Review Comment:
   Why check both slot->type().is_variant_type() and 
!slot->column_paths().empty() ?



##########
be/src/vec/exec/scan/vscan_node.h:
##########
@@ -271,10 +276,15 @@ class VScanNode : public ExecNode, public 
RuntimeFilterConsumer {
     // Save all function predicates which may be pushed down to data source.
     std::vector<FunctionFilter> _push_down_functions;
 
+    // colname -> cast dst type
+    std::map<std::string, PrimitiveType> _cast_types_for_variants;
+
     // slot id -> ColumnValueRange
     // Parsed from conjuncts
     phmap::flat_hash_map<int, std::pair<SlotDescriptor*, ColumnValueRangeType>>
             _slot_id_to_value_range;
+    // slot id -> slot idx in TupleDescriptor
+    phmap::flat_hash_map<int, int> _slot_id_to_slot_idx;

Review Comment:
   It's always used as slots[_slot_id_to_slot_idx[slotid]]. So it's better to 
change it to map<int, SlotDescriptor*> _slot_id_to_slot .



##########
be/src/vec/exec/scan/new_olap_scan_node.cpp:
##########
@@ -414,6 +419,49 @@ std::string NewOlapScanNode::get_name() {
     return fmt::format("VNewOlapScanNode({0})", _olap_scan_node.table_name);
 }
 
+void NewOlapScanNode::_filter_and_collect_cast_type_for_variant(
+        const VExpr* expr,
+        phmap::flat_hash_map<std::string, std::vector<PrimitiveType>>& 
colname_to_cast_types) {
+    auto* cast_expr = dynamic_cast<const VCastExpr*>(expr);
+    if (cast_expr != nullptr) {
+        auto* src_slot = cast_expr->get_child(0)->node_type() == 
TExprNodeType::SLOT_REF
+                                 ? dynamic_cast<const 
VSlotRef*>(cast_expr->get_child(0).get())
+                                 : nullptr;
+        if (src_slot == nullptr) {
+            return;
+        }
+        std::vector<SlotDescriptor*> slots = _output_tuple_desc->slots();
+        SlotDescriptor* src_slot_desc = 
slots[_slot_id_to_slot_idx[src_slot->slot_id()]];
+        PrimitiveType cast_dst_type =
+                
cast_expr->get_target_type()->get_type_as_type_descriptor().type;
+        if (src_slot_desc->type().is_variant_type()) {
+            
colname_to_cast_types[src_slot_desc->col_name()].push_back(cast_dst_type);
+        }
+    }
+    for (const auto& child : expr->children()) {
+        _filter_and_collect_cast_type_for_variant(child.get(), 
colname_to_cast_types);
+    }
+}
+
+void NewOlapScanNode::get_cast_types_for_variants() {
+    phmap::flat_hash_map<std::string, std::vector<PrimitiveType>> 
colname_to_cast_types;
+    for (auto it = _conjuncts.begin(); it != _conjuncts.end();) {
+        auto& conjunct = *it;
+        if (conjunct->root()) {
+            _filter_and_collect_cast_type_for_variant(conjunct->root().get(),
+                                                      colname_to_cast_types);
+        }
+        ++it;
+    }
+    // cast to one certain type for variant could utilize fully predicates 
performance
+    // when storage layer type equals to cast type
+    for (const auto& [slotid, types] : colname_to_cast_types) {

Review Comment:
   slotid -> colname



##########
be/src/vec/exec/scan/scanner_scheduler.cpp:
##########
@@ -414,6 +413,7 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* 
scheduler, ScannerContext
             LOG(WARNING) << "Scan thread read VScanner failed: " << 
status.to_string();
             break;
         }
+        VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << 
eos;

Review Comment:
   why change it?



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/Analyzer.java:
##########
@@ -1004,11 +1011,49 @@ public SlotDescriptor registerColumnRef(TableName 
tblName, String colName) throw
                                                 newTblName == null ? 
d.getTable().getName() : newTblName.toString());
         }
 
+        LOG.debug("register column ref table {}, colName {}, col {}", tblName, 
colName, col.toSql());
+        if (col.getType().isVariantType() || (subColNames != null && 
!subColNames.isEmpty())) {
+            if (!col.getType().isVariantType()) {
+                
ErrorReport.reportAnalysisException(ErrorCode.ERR_ILLEGAL_COLUMN_REFERENCE_ERROR,
+                        Joiner.on(".").join(tblName.getTbl(), colName));
+            }
+            if (subColNames == null) {
+                // Root
+                subColNames = new ArrayList<String>();
+            }
+            String key = d.getAlias() + "." + col.getName();
+            if (subColumnSlotRefMap.get(key) == null) {
+                subColumnSlotRefMap.put(key, Maps.newTreeMap(
+                    new Comparator<List<String>>() {
+                        public int compare(List<String> lst1, List<String> 
lst2) {
+                            String str1 = String.join(".", lst1);
+                            String str2 = String.join(".", lst2);
+                            return str1.compareTo(str2);
+                        }
+                    }));
+            }
+            SlotDescriptor result = 
subColumnSlotRefMap.get(key).get(subColNames);
+            if (result != null) {
+                // avoid duplicate slots
+                return result;
+            }
+            result = globalState.descTbl.addSlotDescriptor(d);
+            LOG.debug("register slot descriptor {}", result);
+            result.setSubColLables(subColNames);
+            result.setColumn(col);
+            if (!subColNames.isEmpty()) {
+                result.setMaterializedColumnName(col.getName() + "." + 
String.join(".", subColNames));
+            }
+            result.setIsMaterialized(true);
+            result.setIsNullable(col.isAllowNull());
+            subColumnSlotRefMap.get(key).put(subColNames, result);

Review Comment:
   can we reuse slotRefMap and use key v.a.b.c



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -78,6 +80,10 @@ static bool ignore_cast(SlotDescriptor* slot, VExpr* expr) {
     if (slot->type().is_string_type() && expr->type().is_string_type()) {
         return true;
     }
+    // Variant slot cast could be eliminated
+    if (slot->type().is_variant_type()) {
+        return true;

Review Comment:
   Do you need to check the expr type?



##########
be/src/vec/exec/scan/new_olap_scanner.cpp:
##########
@@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params(
     return Status::OK();
 }
 
+vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) {
+    PathInDataBuilder path_builder;
+    const std::string& col_name = slot->col_name_lower_case();
+    auto delimeter_index = col_name.find(".");
+    std::string_view root_name = delimeter_index == std::string::npos
+                                         ? col_name
+                                         : std::string_view(col_name.data(), 
delimeter_index);
+    path_builder = path_builder.append(root_name, false);
+    for (const std::string& path : slot->column_paths()) {
+        path_builder.append(path, false);
+    }
+    return path_builder.build();
+}
+
+Status NewOlapScanner::_init_variant_columns() {
+    auto& tablet_schema = _tablet_reader_params.tablet_schema;
+    // Parent column has path info to distinction from each other
+    for (auto slot : _output_tuple_desc->slots()) {
+        if (!slot->is_materialized()) {
+            continue;
+        }
+        if (!slot->need_materialize()) {
+            continue;
+        }
+        if (slot->type().is_variant_type()) {
+            // Such columns are not exist in frontend schema info, so we need 
to
+            // add them into tablet_schema for later column indexing.
+            TabletColumn subcol;
+            subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+            subcol.set_is_nullable(true);
+            subcol.set_unique_id(slot->col_unique_id());

Review Comment:
   Use the same unique id as parent?



##########
be/src/vec/exec/scan/new_olap_scanner.cpp:
##########
@@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params(
     return Status::OK();
 }
 
+vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) {
+    PathInDataBuilder path_builder;
+    const std::string& col_name = slot->col_name_lower_case();
+    auto delimeter_index = col_name.find(".");

Review Comment:
   It's not safe to determin root_name by "." for escaped name `a.b` 



##########
be/src/vec/exec/scan/new_olap_scanner.cpp:
##########
@@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params(
     return Status::OK();
 }
 
+vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) {
+    PathInDataBuilder path_builder;
+    const std::string& col_name = slot->col_name_lower_case();
+    auto delimeter_index = col_name.find(".");
+    std::string_view root_name = delimeter_index == std::string::npos
+                                         ? col_name
+                                         : std::string_view(col_name.data(), 
delimeter_index);
+    path_builder = path_builder.append(root_name, false);
+    for (const std::string& path : slot->column_paths()) {

Review Comment:
   Does the slot data like this?
   
   slot->col_name  'v.a.b.c'
   slot->column_paths [a, b, c]



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/SlotDescriptor.java:
##########
@@ -44,6 +44,9 @@ public class SlotDescriptor {
 
     // for SlotRef.toSql() in the absence of a path
     private String label;
+    // for variant column's sub column lables
+    private List<String> subColLabels;
+    private String materializedColumnnName;

Review Comment:
   materializedColumnnName -> materializedColumnName
   ```suggestion
       private String materializedColumnName;
   ```



##########
be/src/vec/exec/scan/new_olap_scanner.cpp:
##########
@@ -406,6 +411,55 @@ Status NewOlapScanner::_init_tablet_reader_params(
     return Status::OK();
 }
 
+vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot) {
+    PathInDataBuilder path_builder;
+    const std::string& col_name = slot->col_name_lower_case();
+    auto delimeter_index = col_name.find(".");
+    std::string_view root_name = delimeter_index == std::string::npos
+                                         ? col_name
+                                         : std::string_view(col_name.data(), 
delimeter_index);
+    path_builder = path_builder.append(root_name, false);
+    for (const std::string& path : slot->column_paths()) {
+        path_builder.append(path, false);
+    }
+    return path_builder.build();
+}
+
+Status NewOlapScanner::_init_variant_columns() {
+    auto& tablet_schema = _tablet_reader_params.tablet_schema;
+    // Parent column has path info to distinction from each other
+    for (auto slot : _output_tuple_desc->slots()) {
+        if (!slot->is_materialized()) {
+            continue;
+        }
+        if (!slot->need_materialize()) {
+            continue;
+        }
+        if (slot->type().is_variant_type()) {
+            // Such columns are not exist in frontend schema info, so we need 
to
+            // add them into tablet_schema for later column indexing.
+            TabletColumn subcol;
+            subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+            subcol.set_is_nullable(true);
+            subcol.set_unique_id(slot->col_unique_id());
+            subcol.set_parent_unique_id(slot->col_unique_id());
+            PathInData path = _build_path(slot);
+            subcol.set_path_info(path);
+            subcol.set_name(path.get_path());
+            if (tablet_schema->field_index(path) < 0) {
+                tablet_schema->append_column(subcol, 
TabletSchema::ColumnType::VARIANT);
+            }
+        } else if (!slot->column_paths().empty()) {
+            // Extracted materialized columns update it's path info
+            PathInData path = _build_path(slot);
+            int index = tablet_schema->field_index(slot->col_unique_id());

Review Comment:
   Is any chance slot->col_unique_id() is not in tablet_schema?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to