This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/variant-sparse by this push:
     new 74344538c6f [fix] (variant) remove nullable type in sparse column and implement replicate (#45502)
74344538c6f is described below

commit 74344538c6fb5da3e3fcb1cbc2bf8224d731de8e
Author: Sun Chenyang <suncheny...@selectdb.com>
AuthorDate: Tue Dec 17 11:55:35 2024 +0800

    [fix] (variant) remove nullable type in sparse column and implement replicate (#45502)
---
 be/src/vec/columns/column_object.cpp               | 122 ++++++++-------------
 be/src/vec/columns/column_object.h                 |   6 +-
 .../data_types/serde/data_type_nullable_serde.h    |   2 +
 3 files changed, 52 insertions(+), 78 deletions(-)

diff --git a/be/src/vec/columns/column_object.cpp 
b/be/src/vec/columns/column_object.cpp
index 11130e628e9..568ed7f8bbc 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -632,6 +632,7 @@ MutableColumnPtr ColumnObject::apply_for_columns(Func&& 
func) const {
     }
     auto sparse_column = func(serialized_sparse_column);
     res->serialized_sparse_column = sparse_column->assume_mutable();
+    res->set_num_rows(serialized_sparse_column->size());
     check_consistency();
     return res;
 }
@@ -823,9 +824,6 @@ ColumnObject::ColumnObject(size_t num_rows) : 
is_nullable(true) {
 }
 
 void ColumnObject::check_consistency() const {
-    if (subcolumns.empty() && serialized_sparse_column->empty()) {
-        return;
-    }
     for (const auto& leaf : subcolumns) {
         if (num_rows != leaf->data.size()) {
             throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
@@ -851,14 +849,8 @@ MutableColumnPtr ColumnObject::clone_resized(size_t 
new_size) const {
     if (new_size == 0) {
         return ColumnObject::create(is_nullable);
     }
-    // If subcolumns are empty, then res will be empty but new_size > 0
-    if (subcolumns.empty()) {
-        auto res = ColumnObject::create(new_size);
-        return res;
-    }
-    auto res = apply_for_columns(
+    return apply_for_columns(
             [&](const ColumnPtr column) { return 
column->clone_resized(new_size); });
-    return res;
 }
 
 size_t ColumnObject::byte_size() const {
@@ -919,38 +911,38 @@ void ColumnObject::try_insert(const Field& field) {
             root = get_subcolumn({});
         }
         root->insert(field);
-        ++num_rows;
-        return;
-    }
-    const auto& object = field.get<const VariantMap&>();
-    size_t old_size = size();
-    for (const auto& [key_str, value] : object) {
-        PathInData key;
-        if (!key_str.empty()) {
-            key = PathInData(key_str);
-        }
-        if (!has_subcolumn(key)) {
-            bool succ = add_sub_column(key, old_size);
-            if (!succ) {
-                throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
-                                       "Failed to add sub column {}", 
key.get_path());
+    } else {
+        const auto& object = field.get<const VariantMap&>();
+        size_t old_size = size();
+        for (const auto& [key_str, value] : object) {
+            PathInData key;
+            if (!key_str.empty()) {
+                key = PathInData(key_str);
             }
+            if (!has_subcolumn(key)) {
+                bool succ = add_sub_column(key, old_size);
+                if (!succ) {
+                    throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
+                                           "Failed to add sub column {}", 
key.get_path());
+                }
+            }
+            auto* subcolumn = get_subcolumn(key);
+            if (!subcolumn) {
+                doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
+                                 fmt::format("Failed to find sub column {}", 
key.get_path()));
+            }
+            subcolumn->insert(value);
         }
-        auto* subcolumn = get_subcolumn(key);
-        if (!subcolumn) {
-            doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
-                             fmt::format("Failed to find sub column {}", 
key.get_path()));
-        }
-        subcolumn->insert(value);
-    }
-    for (auto& entry : subcolumns) {
-        if (old_size == entry->data.size()) {
-            bool inserted = try_insert_default_from_nested(entry);
-            if (!inserted) {
-                entry->data.insert_default();
+        for (auto& entry : subcolumns) {
+            if (old_size == entry->data.size()) {
+                bool inserted = try_insert_default_from_nested(entry);
+                if (!inserted) {
+                    entry->data.insert_default();
+                }
             }
         }
     }
+    serialized_sparse_column->insert_default();
     ++num_rows;
 }
 
@@ -1043,15 +1035,22 @@ void 
ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
     for (size_t i = 0; i < data.size(); ++i) {
         const auto& part = data[i];
         if (row < part->size()) {
+            // no need null in sparse column
             if (assert_cast<const ColumnNullable&>(*part).is_null_at(row)) {
                 is_null = true;
             } else {
                 is_null = false;
                 // insert key
                 key->insert_data(path.data(), path.size());
-                const auto& part_type_serde = data_serdes[i];
+
+                // every subcolumn is always Nullable
+                auto nullable_serde =
+                        
std::static_pointer_cast<DataTypeNullableSerDe>(data_types[i]->get_serde());
+                auto& nullable_col = assert_cast<const ColumnNullable&>(*part);
+
                 // insert value
-                part_type_serde->write_one_cell_to_binary(*part, value, row);
+                nullable_serde->get_nested_serde()->write_one_cell_to_binary(
+                        nullable_col.get_nested_column(), value, row);
             }
             return;
         }
@@ -1118,11 +1117,6 @@ const char* parse_binary_from_sparse_column(TypeIndex 
type, const char* data, Fi
         auto& array = res.get<Array>();
         info_res.num_dimensions++;
         for (size_t i = 0; i < size; ++i) {
-            const uint8_t is_null = *reinterpret_cast<const uint8_t*>(data++);
-            if (is_null) {
-                array.emplace_back(Null());
-                continue;
-            }
             Field nested_field;
             const auto nested_type =
                     assert_cast<const TypeIndex>(*reinterpret_cast<const 
uint8_t*>(data++));
@@ -1143,25 +1137,15 @@ std::pair<Field, FieldInfo> 
ColumnObject::deserialize_from_sparse_column(const C
                                                                          
size_t row) {
     const auto& data_ref = value->get_data_at(row);
     const char* data = data_ref.data;
-    DCHECK(data_ref.size > 0);
-
+    DCHECK(data_ref.size > 1);
+    const TypeIndex type = assert_cast<const 
TypeIndex>(*reinterpret_cast<const uint8_t*>(data++));
+    Field res;
     FieldInfo info_res = {
-            .scalar_type_id = TypeIndex::Nothing,
+            .scalar_type_id = type,
             .have_nulls = false,
             .need_convert = false,
             .num_dimensions = 1,
     };
-    // 0 is null
-    const uint8_t is_null = *reinterpret_cast<const uint8_t*>(data++);
-    if (is_null) {
-        DCHECK(data_ref.size == 1);
-        return {Null(), info_res};
-    }
-
-    DCHECK(data_ref.size > 1);
-    const auto type = assert_cast<const TypeIndex>(*reinterpret_cast<const 
uint8_t*>(data++));
-    info_res.scalar_type_id = type;
-    Field res;
     const char* end = parse_binary_from_sparse_column(type, data, res, 
info_res);
     DCHECK_EQ(end - data_ref.data, data_ref.size);
     return {std::move(res), std::move(info_res)};
@@ -1446,20 +1430,6 @@ void 
ColumnObject::insert_from_sparse_column_and_fill_remaing_dense_column(
 }
 
 ColumnPtr ColumnObject::permute(const Permutation& perm, size_t limit) const {
-    if (subcolumns.empty()) {
-        if (limit == 0) {
-            limit = num_rows;
-        } else {
-            limit = std::min(num_rows, limit);
-        }
-
-        if (perm.size() < limit) {
-            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
-                                   "Size of permutation is less than 
required.");
-        }
-        auto res = ColumnObject::create(limit);
-        return res;
-    }
     return apply_for_columns([&](const ColumnPtr column) { return 
column->permute(perm, limit); });
 }
 
@@ -2177,7 +2147,6 @@ ColumnPtr get_base_column_of_array(const ColumnPtr& 
column) {
     return column;
 }
 
-// ----
 ColumnPtr ColumnObject::filter(const Filter& filter, ssize_t count) const {
     if (!is_finalized()) {
         auto finalized = clone_finalized();
@@ -2194,10 +2163,15 @@ ColumnPtr ColumnObject::filter(const Filter& filter, 
ssize_t count) const {
         new_column->add_sub_column(entry->path, subcolumn->assume_mutable(),
                                    entry->data.get_least_common_type());
     }
-    // filter
+
     return new_column;
 }
 
+ColumnPtr ColumnObject::replicate(const IColumn::Offsets& offsets) const {
+    column_match_offsets_size(num_rows, offsets.size());
+    return apply_for_columns([&](const ColumnPtr column) { return 
column->replicate(offsets); });
+}
+
 size_t ColumnObject::filter(const Filter& filter) {
     if (!is_finalized()) {
         finalize();
diff --git a/be/src/vec/columns/column_object.h 
b/be/src/vec/columns/column_object.h
index b63c0c5c0d8..86ba60fffce 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -441,6 +441,8 @@ public:
 
     ColumnPtr permute(const Permutation&, size_t) const override;
 
+    ColumnPtr replicate(const IColumn::Offsets& offsets) const override;
+
     bool is_variable_length() const override { return true; }
 
     template <typename Func>
@@ -469,10 +471,6 @@ public:
     void update_crc_with_value(size_t start, size_t end, uint32_t& hash,
                                const uint8_t* __restrict null_data) const 
override;
 
-    ColumnPtr replicate(const Offsets& offsets) const override {
-        throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, "replicate" + 
get_name());
-    }
-
     Int64 get_int(size_t /*n*/) const override {
         throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, "get_int" + 
get_name());
     }
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h 
b/be/src/vec/data_types/serde/data_type_nullable_serde.h
index 51cbf54eaed..d0f46ce7cbc 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.h
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h
@@ -102,6 +102,8 @@ public:
     void write_one_cell_to_binary(const IColumn& src_column, ColumnString* 
dst_column,
                                   int64_t row_num) const override;
 
+    DataTypeSerDeSPtr get_nested_serde() { return nested_serde; }
+
 private:
     template <bool is_binary_format>
     Status _write_column_to_mysql(const IColumn& column, 
MysqlRowBuffer<is_binary_format>& result,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to