github-actions[bot] commented on code in PR #45252:
URL: https://github.com/apache/doris/pull/45252#discussion_r1877721869


##########
be/src/vec/columns/column_object.cpp:
##########
@@ -1062,42 +1100,190 @@
     }
 }
 
+bool ColumnObject::try_add_new_subcolumn(std::string_view path) {
+    if (subcolumns.size() == MAX_SUBCOLUMNS) return false;
+
+    return add_sub_column(path, num_rows);
+}
+
 void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t 
length) {
 #ifndef NDEBUG
     check_consistency();
 #endif
     const auto& src_object = assert_cast<const ColumnObject&>(src);
+
+    // First, insert src subcolumns
+    // We can reach the limit of subcolumns, and in this case
+    // the rest of subcolumns from src will be inserted into sparse column.
+    std::map<std::string_view, Subcolumn> 
src_path_and_subcoumn_for_sparse_column;
     for (const auto& entry : src_object.subcolumns) {
-        if (!has_subcolumn(entry->path)) {
-            if (entry->path.has_nested_part()) {
-                FieldInfo field_info {
-                        .scalar_type_id = 
entry->data.least_common_type.get_base_type_id(),
-                        .have_nulls = false,
-                        .need_convert = false,
-                        .num_dimensions = entry->data.get_dimensions()};
-                add_nested_subcolumn(entry->path, field_info, num_rows);
-            } else {
-                add_sub_column(entry->path, num_rows);
-            }
+        // Check if we already have such dense column path.
+        if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != 
nullptr) {
+            subcolumn->insert_range_from(entry->data, start, length);
+        } else if (try_add_new_subcolumn(entry->path.get_path())) {
+            subcolumn = get_subcolumn(entry->path);
+            DCHECK(subcolumn != nullptr);
+            subcolumn->insert_range_from(entry->data, start, length);
+        } else {
+            
src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), 
entry->data);
         }
-        auto* subcolumn = get_subcolumn(entry->path);
-        subcolumn->insert_range_from(entry->data, start, length);
     }
-    for (auto& entry : subcolumns) {
-        if (!src_object.has_subcolumn(entry->path)) {
-            bool inserted = try_insert_many_defaults_from_nested(entry);
-            if (!inserted) {
-                entry->data.insert_many_defaults(length);
-            }
-        }
+
+    // Paths in sparse column are sorted, so paths from 
src_dense_column_path_for_sparse_column should be inserted properly
+    // to keep paths sorted. Let's sort them in advance.
+    std::vector<std::pair<std::string_view, Subcolumn>> 
sorted_src_subcolumn_for_sparse_column;
+    auto it = src_path_and_subcoumn_for_sparse_column.begin();
+    auto end = src_path_and_subcoumn_for_sparse_column.end();
+    while (it != end) {
+        sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, 
it->second);
+        ++it;
     }
+
+    insert_from_sparse_column_and_fill_remaing_dense_column(
+            src_object, std::move(sorted_src_subcolumn_for_sparse_column), 
start, length);
+
     num_rows += length;
     finalize();
 #ifndef NDEBUG
     check_consistency();
 #endif
 }
 
+// std::map<std::string_view, Subcolumn>
+void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column(

Review Comment:
   warning: function 'insert_from_sparse_column_and_fill_remaing_dense_column' 
exceeds recommended size/complexity thresholds [readability-function-size]
   ```cpp
   void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column(
                      ^
   ```
   <details>
   <summary>Additional context</summary>
   
   **be/src/vec/columns/column_object.cpp:1152:** 128 lines including 
whitespace and comments (threshold 80)
   ```cpp
   void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column(
                      ^
   ```
   
   </details>
   



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -1062,42 +1100,190 @@
     }
 }
 
+bool ColumnObject::try_add_new_subcolumn(std::string_view path) {
+    if (subcolumns.size() == MAX_SUBCOLUMNS) return false;
+
+    return add_sub_column(path, num_rows);
+}
+
 void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t 
length) {
 #ifndef NDEBUG
     check_consistency();
 #endif
     const auto& src_object = assert_cast<const ColumnObject&>(src);
+
+    // First, insert src subcolumns
+    // We can reach the limit of subcolumns, and in this case
+    // the rest of subcolumns from src will be inserted into sparse column.
+    std::map<std::string_view, Subcolumn> 
src_path_and_subcoumn_for_sparse_column;
     for (const auto& entry : src_object.subcolumns) {
-        if (!has_subcolumn(entry->path)) {
-            if (entry->path.has_nested_part()) {
-                FieldInfo field_info {
-                        .scalar_type_id = 
entry->data.least_common_type.get_base_type_id(),
-                        .have_nulls = false,
-                        .need_convert = false,
-                        .num_dimensions = entry->data.get_dimensions()};
-                add_nested_subcolumn(entry->path, field_info, num_rows);
-            } else {
-                add_sub_column(entry->path, num_rows);
-            }
+        // Check if we already have such dense column path.
+        if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != 
nullptr) {
+            subcolumn->insert_range_from(entry->data, start, length);
+        } else if (try_add_new_subcolumn(entry->path.get_path())) {
+            subcolumn = get_subcolumn(entry->path);
+            DCHECK(subcolumn != nullptr);
+            subcolumn->insert_range_from(entry->data, start, length);
+        } else {
+            
src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), 
entry->data);
         }
-        auto* subcolumn = get_subcolumn(entry->path);
-        subcolumn->insert_range_from(entry->data, start, length);
     }
-    for (auto& entry : subcolumns) {
-        if (!src_object.has_subcolumn(entry->path)) {
-            bool inserted = try_insert_many_defaults_from_nested(entry);
-            if (!inserted) {
-                entry->data.insert_many_defaults(length);
-            }
-        }
+
+    // Paths in sparse column are sorted, so paths from 
src_dense_column_path_for_sparse_column should be inserted properly
+    // to keep paths sorted. Let's sort them in advance.
+    std::vector<std::pair<std::string_view, Subcolumn>> 
sorted_src_subcolumn_for_sparse_column;
+    auto it = src_path_and_subcoumn_for_sparse_column.begin();
+    auto end = src_path_and_subcoumn_for_sparse_column.end();
+    while (it != end) {
+        sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, 
it->second);
+        ++it;
     }
+
+    insert_from_sparse_column_and_fill_remaing_dense_column(
+            src_object, std::move(sorted_src_subcolumn_for_sparse_column), 
start, length);
+
     num_rows += length;
     finalize();
 #ifndef NDEBUG
     check_consistency();
 #endif
 }
 
+// std::map<std::string_view, Subcolumn>
+void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column(

Review Comment:
   warning: method 'insert_from_sparse_column_and_fill_remaing_dense_column' 
can be made const [readability-make-member-function-const]
   
   be/src/vec/columns/column_object.cpp:1156:
   ```diff
   -         size_t start, size_t length) {
   +         size_t start, size_t length) const {
   ```
   



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -1062,42 +1100,190 @@
     }
 }
 
+bool ColumnObject::try_add_new_subcolumn(std::string_view path) {
+    if (subcolumns.size() == MAX_SUBCOLUMNS) return false;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
       if (subcolumns.size() == MAX_SUBCOLUMNS) { return false;
   }
   ```
   



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -1062,42 +1100,190 @@
     }
 }
 
+bool ColumnObject::try_add_new_subcolumn(std::string_view path) {
+    if (subcolumns.size() == MAX_SUBCOLUMNS) return false;
+
+    return add_sub_column(path, num_rows);
+}
+
 void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t 
length) {
 #ifndef NDEBUG
     check_consistency();
 #endif
     const auto& src_object = assert_cast<const ColumnObject&>(src);
+
+    // First, insert src subcolumns
+    // We can reach the limit of subcolumns, and in this case
+    // the rest of subcolumns from src will be inserted into sparse column.
+    std::map<std::string_view, Subcolumn> 
src_path_and_subcoumn_for_sparse_column;
     for (const auto& entry : src_object.subcolumns) {
-        if (!has_subcolumn(entry->path)) {
-            if (entry->path.has_nested_part()) {
-                FieldInfo field_info {
-                        .scalar_type_id = 
entry->data.least_common_type.get_base_type_id(),
-                        .have_nulls = false,
-                        .need_convert = false,
-                        .num_dimensions = entry->data.get_dimensions()};
-                add_nested_subcolumn(entry->path, field_info, num_rows);
-            } else {
-                add_sub_column(entry->path, num_rows);
-            }
+        // Check if we already have such dense column path.
+        if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != 
nullptr) {
+            subcolumn->insert_range_from(entry->data, start, length);
+        } else if (try_add_new_subcolumn(entry->path.get_path())) {
+            subcolumn = get_subcolumn(entry->path);
+            DCHECK(subcolumn != nullptr);
+            subcolumn->insert_range_from(entry->data, start, length);
+        } else {
+            
src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), 
entry->data);
         }
-        auto* subcolumn = get_subcolumn(entry->path);
-        subcolumn->insert_range_from(entry->data, start, length);
     }
-    for (auto& entry : subcolumns) {
-        if (!src_object.has_subcolumn(entry->path)) {
-            bool inserted = try_insert_many_defaults_from_nested(entry);
-            if (!inserted) {
-                entry->data.insert_many_defaults(length);
-            }
-        }
+
+    // Paths in sparse column are sorted, so paths from 
src_dense_column_path_for_sparse_column should be inserted properly
+    // to keep paths sorted. Let's sort them in advance.
+    std::vector<std::pair<std::string_view, Subcolumn>> 
sorted_src_subcolumn_for_sparse_column;
+    auto it = src_path_and_subcoumn_for_sparse_column.begin();
+    auto end = src_path_and_subcoumn_for_sparse_column.end();
+    while (it != end) {
+        sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, 
it->second);
+        ++it;
     }
+
+    insert_from_sparse_column_and_fill_remaing_dense_column(
+            src_object, std::move(sorted_src_subcolumn_for_sparse_column), 
start, length);
+
     num_rows += length;
     finalize();
 #ifndef NDEBUG
     check_consistency();
 #endif
 }
 
+// std::map<std::string_view, Subcolumn>
+void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column(
+        const ColumnObject& src,
+        std::vector<std::pair<std::string_view, Subcolumn>>&&
+                sorted_src_subcolumn_for_sparse_column,
+        size_t start, size_t length) {
+    /// Check if src object doesn't have any paths in serialized sparse column.
+    const auto& src_serialized_sparse_column_offsets = 
src.serialized_sparse_column_offsets();
+    if (src_serialized_sparse_column_offsets[start - 1] ==
+        src_serialized_sparse_column_offsets[start + length - 1]) {
+        size_t current_size = size();
+
+        /// If no src subcolumns should be inserted into sparse column, insert 
defaults.
+        if (sorted_src_subcolumn_for_sparse_column.empty()) {
+            serialized_sparse_column->insert_many_defaults(length);
+        } else {
+            // Otherwise insert required src dense columns into sparse column.
+            auto [sparse_column_keys, sparse_column_values] = 
get_sparse_data_paths_and_values();
+            auto& sparse_column_offsets = serialized_sparse_column_offsets();
+            for (size_t i = start; i != start + length; ++i) {
+                int null_count = 0;
+                // Paths in sorted_src_subcolumn_for_sparse_column are already 
sorted.
+                for (auto& [path, subcolumn] : 
sorted_src_subcolumn_for_sparse_column) {
+                    bool is_null = false;
+                    subcolumn.serialize_to_sparse_column(sparse_column_keys, 
path,
+                                                         sparse_column_values, 
i, is_null);
+                    if (is_null) {
+                        ++null_count;
+                    }
+                }
+
+                // All the sparse columns in this row are null.
+                if (null_count == 
sorted_src_subcolumn_for_sparse_column.size()) {
+                    serialized_sparse_column->insert_default();
+                } else {
+                    DCHECK_EQ(sparse_column_keys->size(),
+                              sparse_column_offsets[i - 1] +
+                                      
sorted_src_subcolumn_for_sparse_column.size() - null_count);
+                    DCHECK_EQ(sparse_column_values->size(), 
sparse_column_keys->size());
+                    
sparse_column_offsets.push_back(sparse_column_keys->size());
+                }
+            }
+        }
+
+        // Insert default values in all remaining dense columns.
+        for (const auto& entry : subcolumns) {
+            if (entry->data.size() == current_size) {
+                entry->data.insert_many_defaults(length);
+            }
+        }
+        return;
+    }
+
+    // Src object column contains some paths in serialized sparse column in 
specified range.
+    // Iterate over this range and insert all required paths into serialized 
sparse column or subcolumns.
+    const auto& [src_sparse_column_path, src_sparse_column_values] =
+            src.get_sparse_data_paths_and_values();
+    auto [sparse_column_path, sparse_column_values] = 
get_sparse_data_paths_and_values();
+
+    auto& sparse_column_offsets = serialized_sparse_column_offsets();
+    for (size_t row = start; row != start + length; ++row) {
+        size_t current_size = sparse_column_offsets.size();
+
+        // Use separate index to iterate over sorted 
sorted_src_subcolumn_for_sparse_column.
+        size_t sorted_src_subcolumn_for_sparse_column_idx = 0;
+        size_t sorted_src_subcolumn_for_sparse_column_size =
+                sorted_src_subcolumn_for_sparse_column.size();
+        int null_count = 0;
+
+        size_t offset = src_serialized_sparse_column_offsets[row - 1];
+        size_t end = src_serialized_sparse_column_offsets[row];
+        // Iterator over [path, binary value]
+        for (size_t i = offset; i != end; ++i) {
+            const StringRef src_sparse_path_string = 
src_sparse_column_path->get_data_at(i);
+            const std::string_view src_sparse_path(src_sparse_path_string);
+            // Check if we have this path in subcolumns.
+            const PathInData column_path(src_sparse_path);
+            if (auto* subcolumn = get_subcolumn(column_path); subcolumn != 
nullptr) {
+                // Deserialize binary value into subcolumn from src serialized 
sparse column data.
+                
subcolumn->deserialize_from_sparse_column(src_sparse_column_values, i);
+            } else {
+                // Before inserting this path into sparse column check if we 
need to
+                // insert suibcolumns from 
sorted_src_subcolumn_for_sparse_column before.
+                while (sorted_src_subcolumn_for_sparse_column_idx <
+                               sorted_src_subcolumn_for_sparse_column_size &&
+                       sorted_src_subcolumn_for_sparse_column
+                                       
[sorted_src_subcolumn_for_sparse_column_idx]
+                                               .first < src_sparse_path) {
+                    auto& [src_path, src_subcolumn] = 
sorted_src_subcolumn_for_sparse_column
+                            [sorted_src_subcolumn_for_sparse_column_idx++];
+                    bool is_null = false;
+                    
src_subcolumn.serialize_to_sparse_column(sparse_column_path, src_path,
+                                                             
sparse_column_values, row, is_null);
+                    if (is_null) {
+                        ++null_count;
+                    }
+                }
+
+                /// Insert path and value from src sparse column to our sparse 
column.
+                sparse_column_path->insert_from(*src_sparse_column_path, i);
+                sparse_column_values->insert_from(*src_sparse_column_values, 
i);
+            }
+        }
+
+        // Insert remaining dynamic paths from 
src_dynamic_paths_for_shared_data.
+        while (sorted_src_subcolumn_for_sparse_column_idx <
+               sorted_src_subcolumn_for_sparse_column_size) {
+            auto& [src_path, src_subcolumn] = 
sorted_src_subcolumn_for_sparse_column
+                    [sorted_src_subcolumn_for_sparse_column_idx++];
+            bool is_null = false;
+            src_subcolumn.serialize_to_sparse_column(sparse_column_path, 
src_path,
+                                                     sparse_column_values, 
row, is_null);
+            if (is_null) {
+                ++null_count;
+            }
+        }
+
+        // All the sparse columns in this row are null.
+        if (null_count == sorted_src_subcolumn_for_sparse_column.size()) {
+            serialized_sparse_column->insert_default();
+        } else {
+            sparse_column_offsets.push_back(sparse_column_path->size());
+        }
+
+        // Insert default values in all remaining dense columns.
+        for (const auto& entry : subcolumns) {
+            if (entry->data.size() == current_size) {
+                entry->data.insert_default();
+            }
+        }
+    }
+
+    return;
+}

Review Comment:
   warning: redundant return statement at the end of a function with a void 
return type [readability-redundant-control-flow]
   
   ```suggestion
       }
   ```
   



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -982,6 +983,43 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) 
const {
                            n);
 }
 
+void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, 
std::string_view path,

Review Comment:
   warning: method 'serialize_to_sparse_column' can be made const 
[readability-make-member-function-const]
   
   be/src/vec/columns/column_object.cpp:987:
   ```diff
   -                                                          bool& is_null) {
   +                                                          bool& is_null) 
const {
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to