github-actions[bot] commented on code in PR #45252: URL: https://github.com/apache/doris/pull/45252#discussion_r1877721869
########## be/src/vec/columns/column_object.cpp: ########## @@ -1062,42 +1100,190 @@ } } +bool ColumnObject::try_add_new_subcolumn(std::string_view path) { + if (subcolumns.size() == MAX_SUBCOLUMNS) return false; + + return add_sub_column(path, num_rows); +} + void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) { #ifndef NDEBUG check_consistency(); #endif const auto& src_object = assert_cast<const ColumnObject&>(src); + + // First, insert src subcolumns + // We can reach the limit of subcolumns, and in this case + // the rest of subcolumns from src will be inserted into sparse column. + std::map<std::string_view, Subcolumn> src_path_and_subcoumn_for_sparse_column; for (const auto& entry : src_object.subcolumns) { - if (!has_subcolumn(entry->path)) { - if (entry->path.has_nested_part()) { - FieldInfo field_info { - .scalar_type_id = entry->data.least_common_type.get_base_type_id(), - .have_nulls = false, - .need_convert = false, - .num_dimensions = entry->data.get_dimensions()}; - add_nested_subcolumn(entry->path, field_info, num_rows); - } else { - add_sub_column(entry->path, num_rows); - } + // Check if we already have such dense column path. + if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != nullptr) { + subcolumn->insert_range_from(entry->data, start, length); + } else if (try_add_new_subcolumn(entry->path.get_path())) { + subcolumn = get_subcolumn(entry->path); + DCHECK(subcolumn != nullptr); + subcolumn->insert_range_from(entry->data, start, length); + } else { + src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), entry->data); } - auto* subcolumn = get_subcolumn(entry->path); - subcolumn->insert_range_from(entry->data, start, length); } - for (auto& entry : subcolumns) { - if (!src_object.has_subcolumn(entry->path)) { - bool inserted = try_insert_many_defaults_from_nested(entry); - if (!inserted) { - entry->data.insert_many_defaults(length); - } - } + + // Paths in sparse column are sorted, so paths from src_dense_column_path_for_sparse_column should be inserted properly + // to keep paths sorted. Let's sort them in advance. + std::vector<std::pair<std::string_view, Subcolumn>> sorted_src_subcolumn_for_sparse_column; + auto it = src_path_and_subcoumn_for_sparse_column.begin(); + auto end = src_path_and_subcoumn_for_sparse_column.end(); + while (it != end) { + sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, it->second); + ++it; } + + insert_from_sparse_column_and_fill_remaing_dense_column( + src_object, std::move(sorted_src_subcolumn_for_sparse_column), start, length); + num_rows += length; finalize(); #ifndef NDEBUG check_consistency(); #endif } +// std::map<std::string_view, Subcolumn> +void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column( Review Comment: warning: function 'insert_from_sparse_column_and_fill_remaing_dense_column' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column( ^ ``` <details> <summary>Additional context</summary> **be/src/vec/columns/column_object.cpp:1152:** 128 lines including whitespace and comments (threshold 80) ```cpp void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column( ^ ``` </details> ########## be/src/vec/columns/column_object.cpp: ########## @@ -1062,42 +1100,190 @@ } } +bool ColumnObject::try_add_new_subcolumn(std::string_view path) { + if (subcolumns.size() == MAX_SUBCOLUMNS) return false; + + return add_sub_column(path, num_rows); +} + void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) { #ifndef NDEBUG check_consistency(); #endif const auto& src_object = assert_cast<const ColumnObject&>(src); + + // First, insert src subcolumns + // We can reach the limit of subcolumns, and in this case + // the rest of subcolumns from src will be inserted into sparse column. + std::map<std::string_view, Subcolumn> src_path_and_subcoumn_for_sparse_column; for (const auto& entry : src_object.subcolumns) { - if (!has_subcolumn(entry->path)) { - if (entry->path.has_nested_part()) { - FieldInfo field_info { - .scalar_type_id = entry->data.least_common_type.get_base_type_id(), - .have_nulls = false, - .need_convert = false, - .num_dimensions = entry->data.get_dimensions()}; - add_nested_subcolumn(entry->path, field_info, num_rows); - } else { - add_sub_column(entry->path, num_rows); - } + // Check if we already have such dense column path. + if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != nullptr) { + subcolumn->insert_range_from(entry->data, start, length); + } else if (try_add_new_subcolumn(entry->path.get_path())) { + subcolumn = get_subcolumn(entry->path); + DCHECK(subcolumn != nullptr); + subcolumn->insert_range_from(entry->data, start, length); + } else { + src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), entry->data); } - auto* subcolumn = get_subcolumn(entry->path); - subcolumn->insert_range_from(entry->data, start, length); } - for (auto& entry : subcolumns) { - if (!src_object.has_subcolumn(entry->path)) { - bool inserted = try_insert_many_defaults_from_nested(entry); - if (!inserted) { - entry->data.insert_many_defaults(length); - } - } + + // Paths in sparse column are sorted, so paths from src_dense_column_path_for_sparse_column should be inserted properly + // to keep paths sorted. Let's sort them in advance. + std::vector<std::pair<std::string_view, Subcolumn>> sorted_src_subcolumn_for_sparse_column; + auto it = src_path_and_subcoumn_for_sparse_column.begin(); + auto end = src_path_and_subcoumn_for_sparse_column.end(); + while (it != end) { + sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, it->second); + ++it; } + + insert_from_sparse_column_and_fill_remaing_dense_column( + src_object, std::move(sorted_src_subcolumn_for_sparse_column), start, length); + num_rows += length; finalize(); #ifndef NDEBUG check_consistency(); #endif } +// std::map<std::string_view, Subcolumn> +void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column( Review Comment: warning: method 'insert_from_sparse_column_and_fill_remaing_dense_column' can be made const [readability-make-member-function-const] be/src/vec/columns/column_object.cpp:1156: ```diff - size_t start, size_t length) { + size_t start, size_t length) const { ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -1062,42 +1100,190 @@ } } +bool ColumnObject::try_add_new_subcolumn(std::string_view path) { + if (subcolumns.size() == MAX_SUBCOLUMNS) return false; Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (subcolumns.size() == MAX_SUBCOLUMNS) { return false; } ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -1062,42 +1100,190 @@ } } +bool ColumnObject::try_add_new_subcolumn(std::string_view path) { + if (subcolumns.size() == MAX_SUBCOLUMNS) return false; + + return add_sub_column(path, num_rows); +} + void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) { #ifndef NDEBUG check_consistency(); #endif const auto& src_object = assert_cast<const ColumnObject&>(src); + + // First, insert src subcolumns + // We can reach the limit of subcolumns, and in this case + // the rest of subcolumns from src will be inserted into sparse column. + std::map<std::string_view, Subcolumn> src_path_and_subcoumn_for_sparse_column; for (const auto& entry : src_object.subcolumns) { - if (!has_subcolumn(entry->path)) { - if (entry->path.has_nested_part()) { - FieldInfo field_info { - .scalar_type_id = entry->data.least_common_type.get_base_type_id(), - .have_nulls = false, - .need_convert = false, - .num_dimensions = entry->data.get_dimensions()}; - add_nested_subcolumn(entry->path, field_info, num_rows); - } else { - add_sub_column(entry->path, num_rows); - } + // Check if we already have such dense column path. + if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != nullptr) { + subcolumn->insert_range_from(entry->data, start, length); + } else if (try_add_new_subcolumn(entry->path.get_path())) { + subcolumn = get_subcolumn(entry->path); + DCHECK(subcolumn != nullptr); + subcolumn->insert_range_from(entry->data, start, length); + } else { + src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), entry->data); } - auto* subcolumn = get_subcolumn(entry->path); - subcolumn->insert_range_from(entry->data, start, length); } - for (auto& entry : subcolumns) { - if (!src_object.has_subcolumn(entry->path)) { - bool inserted = try_insert_many_defaults_from_nested(entry); - if (!inserted) { - entry->data.insert_many_defaults(length); - } - } + + // Paths in sparse column are sorted, so paths from src_dense_column_path_for_sparse_column should be inserted properly + // to keep paths sorted. Let's sort them in advance. + std::vector<std::pair<std::string_view, Subcolumn>> sorted_src_subcolumn_for_sparse_column; + auto it = src_path_and_subcoumn_for_sparse_column.begin(); + auto end = src_path_and_subcoumn_for_sparse_column.end(); + while (it != end) { + sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, it->second); + ++it; } + + insert_from_sparse_column_and_fill_remaing_dense_column( + src_object, std::move(sorted_src_subcolumn_for_sparse_column), start, length); + num_rows += length; finalize(); #ifndef NDEBUG check_consistency(); #endif } +// std::map<std::string_view, Subcolumn> +void ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column( + const ColumnObject& src, + std::vector<std::pair<std::string_view, Subcolumn>>&& + sorted_src_subcolumn_for_sparse_column, + size_t start, size_t length) { + /// Check if src object doesn't have any paths in serialized sparse column. + const auto& src_serialized_sparse_column_offsets = src.serialized_sparse_column_offsets(); + if (src_serialized_sparse_column_offsets[start - 1] == + src_serialized_sparse_column_offsets[start + length - 1]) { + size_t current_size = size(); + + /// If no src subcolumns should be inserted into sparse column, insert defaults. + if (sorted_src_subcolumn_for_sparse_column.empty()) { + serialized_sparse_column->insert_many_defaults(length); + } else { + // Otherwise insert required src dense columns into sparse column. + auto [sparse_column_keys, sparse_column_values] = get_sparse_data_paths_and_values(); + auto& sparse_column_offsets = serialized_sparse_column_offsets(); + for (size_t i = start; i != start + length; ++i) { + int null_count = 0; + // Paths in sorted_src_subcolumn_for_sparse_column are already sorted. + for (auto& [path, subcolumn] : sorted_src_subcolumn_for_sparse_column) { + bool is_null = false; + subcolumn.serialize_to_sparse_column(sparse_column_keys, path, + sparse_column_values, i, is_null); + if (is_null) { + ++null_count; + } + } + + // All the sparse columns in this row are null. + if (null_count == sorted_src_subcolumn_for_sparse_column.size()) { + serialized_sparse_column->insert_default(); + } else { + DCHECK_EQ(sparse_column_keys->size(), + sparse_column_offsets[i - 1] + + sorted_src_subcolumn_for_sparse_column.size() - null_count); + DCHECK_EQ(sparse_column_values->size(), sparse_column_keys->size()); + sparse_column_offsets.push_back(sparse_column_keys->size()); + } + } + } + + // Insert default values in all remaining dense columns. + for (const auto& entry : subcolumns) { + if (entry->data.size() == current_size) { + entry->data.insert_many_defaults(length); + } + } + return; + } + + // Src object column contains some paths in serialized sparse column in specified range. + // Iterate over this range and insert all required paths into serialized sparse column or subcolumns. + const auto& [src_sparse_column_path, src_sparse_column_values] = + src.get_sparse_data_paths_and_values(); + auto [sparse_column_path, sparse_column_values] = get_sparse_data_paths_and_values(); + + auto& sparse_column_offsets = serialized_sparse_column_offsets(); + for (size_t row = start; row != start + length; ++row) { + size_t current_size = sparse_column_offsets.size(); + + // Use separate index to iterate over sorted sorted_src_subcolumn_for_sparse_column. + size_t sorted_src_subcolumn_for_sparse_column_idx = 0; + size_t sorted_src_subcolumn_for_sparse_column_size = + sorted_src_subcolumn_for_sparse_column.size(); + int null_count = 0; + + size_t offset = src_serialized_sparse_column_offsets[row - 1]; + size_t end = src_serialized_sparse_column_offsets[row]; + // Iterator over [path, binary value] + for (size_t i = offset; i != end; ++i) { + const StringRef src_sparse_path_string = src_sparse_column_path->get_data_at(i); + const std::string_view src_sparse_path(src_sparse_path_string); + // Check if we have this path in subcolumns. + const PathInData column_path(src_sparse_path); + if (auto* subcolumn = get_subcolumn(column_path); subcolumn != nullptr) { + // Deserialize binary value into subcolumn from src serialized sparse column data. + subcolumn->deserialize_from_sparse_column(src_sparse_column_values, i); + } else { + // Before inserting this path into sparse column check if we need to + // insert suibcolumns from sorted_src_subcolumn_for_sparse_column before. + while (sorted_src_subcolumn_for_sparse_column_idx < + sorted_src_subcolumn_for_sparse_column_size && + sorted_src_subcolumn_for_sparse_column + [sorted_src_subcolumn_for_sparse_column_idx] + .first < src_sparse_path) { + auto& [src_path, src_subcolumn] = sorted_src_subcolumn_for_sparse_column + [sorted_src_subcolumn_for_sparse_column_idx++]; + bool is_null = false; + src_subcolumn.serialize_to_sparse_column(sparse_column_path, src_path, + sparse_column_values, row, is_null); + if (is_null) { + ++null_count; + } + } + + /// Insert path and value from src sparse column to our sparse column. + sparse_column_path->insert_from(*src_sparse_column_path, i); + sparse_column_values->insert_from(*src_sparse_column_values, i); + } + } + + // Insert remaining dynamic paths from src_dynamic_paths_for_shared_data. + while (sorted_src_subcolumn_for_sparse_column_idx < + sorted_src_subcolumn_for_sparse_column_size) { + auto& [src_path, src_subcolumn] = sorted_src_subcolumn_for_sparse_column + [sorted_src_subcolumn_for_sparse_column_idx++]; + bool is_null = false; + src_subcolumn.serialize_to_sparse_column(sparse_column_path, src_path, + sparse_column_values, row, is_null); + if (is_null) { + ++null_count; + } + } + + // All the sparse columns in this row are null. + if (null_count == sorted_src_subcolumn_for_sparse_column.size()) { + serialized_sparse_column->insert_default(); + } else { + sparse_column_offsets.push_back(sparse_column_path->size()); + } + + // Insert default values in all remaining dense columns. + for (const auto& entry : subcolumns) { + if (entry->data.size() == current_size) { + entry->data.insert_default(); + } + } + } + + return; +} Review Comment: warning: redundant return statement at the end of a function with a void return type [readability-redundant-control-flow] ```suggestion } ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -982,6 +983,43 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const { n); } +void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std::string_view path, Review Comment: warning: method 'serialize_to_sparse_column' can be made const [readability-make-member-function-const] be/src/vec/columns/column_object.cpp:987: ```diff - bool& is_null) { + bool& is_null) const { ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org