This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/variant-sparse by this push:
     new 00b4042dab1 [enhance](variant)Add ut writer var (#49624)
00b4042dab1 is described below

commit 00b4042dab1103f96c9546b2fc101423be0a7739
Author: Sun Chenyang <suncheny...@selectdb.com>
AuthorDate: Fri Mar 28 16:02:40 2025 +0800

    [enhance](variant)Add ut writer var (#49624)
---
 .../rowset/segment_v2/hierarchical_data_reader.cpp |   2 +-
 .../segment_v2/variant_column_writer_impl.cpp      |  17 +-
 .../rowset/segment_v2/variant_column_writer_impl.h |  12 +-
 be/src/vec/columns/column_object.cpp               |  16 +-
 be/src/vec/columns/column_object.h                 |   2 +
 be/src/vec/common/schema_util.cpp                  |  42 +-
 be/src/vec/common/schema_util.h                    |  16 +-
 be/src/vec/olap/olap_data_convertor.cpp            |   8 +-
 be/src/vec/olap/olap_data_convertor.h              |   6 +
 be/test/common/schema_util_test.cpp                | 121 ----
 .../compaction/util/index_compaction_utils.cpp     |   2 +-
 .../variant_column_writer_reader_test.cpp          | 663 +++++++++++++++++++++
 .../olap/rowset/variant_with_compaction_test.cpp   |   0
 be/test/testutil/schema_utils.h                    |  48 ++
 be/test/testutil/variant_util.h                    | 137 +++++
 be/test/vec/common/schema_util_rowset_test.cpp     | 265 ++++++++
 be/test/vec/common/schema_util_test.cpp            | 357 +++++++++++
 17 files changed, 1532 insertions(+), 182 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
index 2367a38821b..185a6d82422 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
@@ -382,7 +382,7 @@ Status HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject&
                         // from "" which is empty path and root
                         if (container_variant.is_null_root()) {
                             // root was created with nrows with Nothing type, 
resize it to fit the size of sparse column
-                            
container_variant.get_root()->resize(sparse_data_offsets.size());
+                            
container_variant.get_subcolumn({})->resize(sparse_data_offsets.size());
                             // bool added = 
container_variant.add_sub_column({}, sparse_data_offsets.size());
                             // if (!added) {
                             //     return Status::InternalError("Failed to add 
subcolumn for sparse column");
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 34fe6e085ec..ae1f2a8de61 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -403,9 +403,9 @@ Status VariantColumnWriterImpl::_process_sparse_column(
         return status;
     }
     VLOG_DEBUG << "dump sparse "
-               << vectorized::schema_util::dump_column(
-                          vectorized::ColumnObject::get_sparse_column_type(),
-                          ptr->get_sparse_column());
+               << vectorized::Block::dump_column(
+                          ptr->get_sparse_column(),
+                          vectorized::ColumnObject::get_sparse_column_type());
     RETURN_IF_ERROR(
             _sparse_column_writer->append(column->get_nullmap(), 
column->get_data(), num_rows));
     ++column_id;
@@ -498,10 +498,11 @@ bool VariantColumnWriterImpl::is_finalized() const {
 
 Status VariantColumnWriterImpl::append_data(const uint8_t** ptr, size_t 
num_rows) {
     DCHECK(!is_finalized());
-    const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(*ptr);
+    const auto* column = reinterpret_cast<const 
vectorized::VariantColumnData*>(*ptr);
+    const auto& src = *reinterpret_cast<const 
vectorized::ColumnObject*>(column->column_data);
     auto* dst_ptr = assert_cast<vectorized::ColumnObject*>(_column.get());
     // TODO: if direct write we could avoid copy
-    dst_ptr->insert_range_from(src, 0, num_rows);
+    dst_ptr->insert_range_from(src, column->row_pos, num_rows);
     return Status::OK();
 }
 
@@ -629,9 +630,11 @@ Status VariantSubcolumnWriter::init() {
 }
 
 Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t 
num_rows) {
-    const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(*ptr);
+    const auto* column = reinterpret_cast<const 
vectorized::VariantColumnData*>(*ptr);
+    const auto& src = *reinterpret_cast<const 
vectorized::ColumnObject*>(column->column_data);
     auto* dst_ptr = assert_cast<vectorized::ColumnObject*>(_column.get());
-    dst_ptr->insert_range_from(src, 0, num_rows);
+    // TODO: if direct write we could avoid copy
+    dst_ptr->insert_range_from(src, column->row_pos, num_rows);
     return Status::OK();
 }
 
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
index d9974dd6f2e..9f67cf04505 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
@@ -36,8 +36,13 @@ class ColumnWriter;
 class ScalarColumnWriter;
 
 struct VariantStatistics {
-    // If reached the size of this, we should stop writing statistics for 
sparse data
-    constexpr static size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
+    // #ifdef BE_TEST
+    //     static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10;
+    // #else
+    //     // If reached the size of this, we should stop writing statistics 
for sparse data
+    //     static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
+    // #endif
+    static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
     std::map<std::string, size_t> subcolumns_non_null_size;
     std::map<std::string, size_t> sparse_column_non_null_size;
 
@@ -96,5 +101,8 @@ private:
     // hold the references of subcolumns indexes
     std::vector<std::unique_ptr<TabletIndex>> _subcolumns_indexes;
 };
+
+void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const 
TabletColumn& column,
+                       CompressionTypePB compression_type);
 } // namespace segment_v2
 } // namespace doris
\ No newline at end of file
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index 896fad0795d..448ff7df4c1 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -640,6 +640,17 @@ bool ColumnObject::Subcolumn::is_finalized() const {
            (data.empty() || (data.size() == 1));
 }
 
+void ColumnObject::Subcolumn::resize(size_t n) {
+    if (n == num_rows) {
+        return;
+    }
+    if (n > num_rows) {
+        insert_many_defaults(n - num_rows);
+    } else {
+        pop_back(num_rows - n);
+    }
+}
+
 template <typename Func>
 MutableColumnPtr ColumnObject::apply_for_columns(Func&& func) const {
     if (!is_finalized()) {
@@ -1107,7 +1118,8 @@ void 
ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
     row -= num_of_defaults_in_prefix;
     for (size_t i = 0; i < data.size(); ++i) {
         const auto& part = data[i];
-        if (row < part->size()) {
+        size_t current_column_size = part->size();
+        if (row < current_column_size) {
             // no need null in sparse column
             if (!assert_cast<const ColumnNullable&, 
TypeCheckOnRelease::DISABLE>(*part).is_null_at(
                         row)) {
@@ -1129,7 +1141,7 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
             return;
         }
 
-        row -= part->size();
+        row -= current_column_size;
     }
 
     throw doris::Exception(ErrorCode::OUT_OF_BOUND,
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index 10078c0ede5..f7c42843f7b 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -204,6 +204,8 @@ public:
 
         bool is_empty_nested(size_t row) const;
 
+        void resize(size_t n);
+
     private:
         class LeastCommonType {
         public:
diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp
index 2f8a4de72e7..6298daffd8f 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -534,7 +534,8 @@ Status _parse_variant_columns(Block& block, const std::vector<int>& variant_pos,
         }
 
         if (scalar_root_column->is_column_string()) {
-            variant_column = ColumnObject::create(var.max_subcolumns_count());
+            // now, subcolumns have not been set, so we set it to 0
+            variant_column = ColumnObject::create(0);
             parse_json_to_variant(*variant_column.get(),
                                   assert_cast<const 
ColumnString&>(*scalar_root_column), config);
         } else {
@@ -577,43 +578,6 @@ vectorized::ColumnObject::Subcolumns get_sorted_subcolumns(
     return sorted;
 }
 
-// ---------------------------
-
-std::string dump_column(DataTypePtr type, const ColumnPtr& col) {
-    Block tmp;
-    tmp.insert(ColumnWithTypeAndName {col, type, col->get_name()});
-    return tmp.dump_data(0, tmp.rows());
-}
-
-// ---------------------------
-Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& 
dst) {
-    auto type_string = std::make_shared<DataTypeString>();
-    std::string jsonpath = path.to_jsonpath();
-    bool is_nullable = source->is_nullable();
-    auto json_type = is_nullable ? 
make_nullable(std::make_shared<DataTypeJsonb>())
-                                 : std::make_shared<DataTypeJsonb>();
-    ColumnsWithTypeAndName arguments {
-            {source, json_type, ""},
-            {type_string->create_column_const(1, Field(String(jsonpath.data(), 
jsonpath.size()))),
-             type_string, ""}};
-    auto function =
-            SimpleFunctionFactory::instance().get_function("jsonb_extract", 
arguments, json_type);
-    if (!function) {
-        return Status::InternalError("Not found function jsonb_extract");
-    }
-    Block tmp_block {arguments};
-    vectorized::ColumnNumbers argnum;
-    argnum.emplace_back(0);
-    argnum.emplace_back(1);
-    uint32_t result_column = cast_set<uint32_t>(tmp_block.columns());
-    tmp_block.insert({nullptr, json_type, ""});
-    RETURN_IF_ERROR(function->execute(nullptr, tmp_block, argnum, 
result_column, source->size()));
-    dst = tmp_block.get_by_position(result_column)
-                  .column->convert_to_full_column_if_const()
-                  ->assume_mutable();
-    return Status::OK();
-}
-
 bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* 
old_schema,
                            int32_t new_col_idx, int32_t old_col_idx) {
     const auto& column_new = new_schema->column(new_col_idx);
@@ -645,8 +609,6 @@ TabletColumn create_sparse_column(const TabletColumn& variant) {
     return res;
 }
 
-using PathToNoneNullValues = std::unordered_map<std::string, size_t>;
-
 Status collect_path_stats(const RowsetSharedPtr& rs,
                           std::unordered_map<int32_t, PathToNoneNullValues>& 
uid_to_path_stats) {
     SegmentCacheHandle segment_cache;
diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h
index 4c1ee876254..8281cdec7b6 100644
--- a/be/src/vec/common/schema_util.h
+++ b/be/src/vec/common/schema_util.h
@@ -54,6 +54,8 @@ struct ColumnWithTypeAndName;
 
 const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
 namespace doris::vectorized::schema_util {
+using PathToNoneNullValues = std::unordered_map<std::string, size_t>;
+
 /// Returns number of dimensions in Array type. 0 if type is not array.
 size_t get_number_of_dimensions(const IDataType& type);
 
@@ -122,17 +124,21 @@ void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
 vectorized::ColumnObject::Subcolumns get_sorted_subcolumns(
         const vectorized::ColumnObject::Subcolumns& subcolumns);
 
-// Extract json data from source with path
-Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& 
dst);
-
-std::string dump_column(DataTypePtr type, const ColumnPtr& col);
-
 bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* 
old_schema,
                            int32_t new_col_idx, int32_t old_col_idx);
 
 // create ColumnMap<String, String>
 TabletColumn create_sparse_column(const TabletColumn& variant);
 
+// get the subpaths and sparse paths for the variant column
+void get_subpaths(const TabletColumn& variant,
+                  const std::unordered_map<int32_t, PathToNoneNullValues>& 
path_stats,
+                  std::unordered_map<int32_t, TabletSchema::PathsSetInfo>& 
uid_to_paths_set_info);
+
+// collect path stats from the rowset
+Status collect_path_stats(const RowsetSharedPtr& rs,
+                          std::unordered_map<int32_t, PathToNoneNullValues>& 
uid_to_path_stats);
+
 // Build the temporary schema for compaction, this will reduce the memory 
usage of compacting variant columns
 Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, 
TabletSchemaSPtr& target);
 
diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp
index c2887c41934..c286127d54e 100644
--- a/be/src/vec/olap/olap_data_convertor.cpp
+++ b/be/src/vec/olap/olap_data_convertor.cpp
@@ -1205,6 +1205,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVariant::convert_to_olap()
     }
     // Do nothing, the column writer will finally do finalize and write 
subcolumns one by one
     // since we are not sure the final column(type and columns) until the end 
of the last block
+    // need to return the position of the column data
+    _variant_column_data = std::make_unique<VariantColumnData>(_value_ptr, 
_row_pos);
     return Status::OK();
 }
 
@@ -1212,9 +1214,9 @@ const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data() c
     if (!_value_ptr) {
         return _root_data_convertor->get_data();
     }
-    // return the ptr of original column, see 
VariantColumnWriterImpl::append_data
-    // which will cast to ColumnObject
-    return _value_ptr;
+    // return the ptr of VariantColumnData, see 
VariantColumnWriterImpl::append_data
+    // which will cast to VariantColumnData
+    return _variant_column_data.get();
 }
 const void* 
OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data_at(
         size_t offset) const {
diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h
index c2a8e6ace47..e0f1184021e 100644
--- a/be/src/vec/olap/olap_data_convertor.h
+++ b/be/src/vec/olap/olap_data_convertor.h
@@ -72,6 +72,11 @@ public:
     virtual ~IOlapColumnDataAccessor() = default;
 };
 
+struct VariantColumnData {
+    const void* column_data;
+    size_t row_pos;
+};
+
 class OlapBlockDataConvertor {
 public:
     OlapBlockDataConvertor() = default;
@@ -536,6 +541,7 @@ private:
     private:
         const void* _value_ptr;
         std::unique_ptr<OlapColumnDataConvertorVarChar> _root_data_convertor;
+        std::unique_ptr<VariantColumnData> _variant_column_data;
     };
 
 private:
diff --git a/be/test/common/schema_util_test.cpp b/be/test/common/schema_util_test.cpp
deleted file mode 100644
index fb8b23c10cb..00000000000
--- a/be/test/common/schema_util_test.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "vec/common/schema_util.h"
-
-#include <gtest/gtest.h>
-
-namespace doris {
-
-class SchemaUtilTest : public testing::Test {};
-
-void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, 
int64_t index_id,
-                      const std::string& index_name, int32_t col_unique_id,
-                      const std::string& column_type, const std::string& 
column_name,
-                      const IndexType& index_type) {
-    column_pb->set_unique_id(col_unique_id);
-    column_pb->set_name(column_name);
-    column_pb->set_type(column_type);
-    column_pb->set_is_nullable(true);
-    column_pb->set_is_bf_column(true);
-    tablet_index->set_index_id(index_id);
-    tablet_index->set_index_name(index_name);
-    tablet_index->set_index_type(index_type);
-    tablet_index->add_col_unique_id(col_unique_id);
-}
-
-void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, 
int32_t col_unique_id,
-                         std::string_view path, std::vector<TabletColumn>* 
subcolumns) {
-    TabletColumn subcol;
-    subcol.set_type(type);
-    subcol.set_is_nullable(true);
-    subcol.set_unique_id(-1);
-    subcol.set_parent_unique_id(col_unique_id);
-    vectorized::PathInData col_path(path);
-    subcol.set_path_info(col_path);
-    subcol.set_name(col_path.get_path());
-
-    if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
-        TabletColumn array_item_col;
-        // double not support inverted index
-        array_item_col.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE);
-        array_item_col.set_is_nullable(true);
-        array_item_col.set_unique_id(-1);
-        array_item_col.set_parent_unique_id(col_unique_id);
-
-        subcol.add_sub_column(array_item_col);
-    }
-
-    schema->append_column(subcol);
-    subcolumns->emplace_back(std::move(subcol));
-}
-
-TEST_F(SchemaUtilTest, inherit_column_attributes) {
-    TabletSchemaPB schema_pb;
-    schema_pb.set_keys_type(KeysType::DUP_KEYS);
-    
schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
-
-    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, 
"key_index", 0, "INT",
-                     "key", IndexType::INVERTED);
-    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, 
"v1_index", 1, "VARIANT",
-                     "v1", IndexType::INVERTED);
-    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, 
"v3_index", 3, "VARIANT",
-                     "v3", IndexType::INVERTED);
-
-    TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
-    tablet_schema->init_from_pb(schema_pb);
-    std::vector<TabletColumn> subcolumns;
-
-    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, 
"v1.b", &subcolumns);
-    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, 
"v1.c", &subcolumns);
-
-    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, 
"v3.d", &subcolumns);
-    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, 
"v3.a", &subcolumns);
-
-    vectorized::schema_util::inherit_column_attributes(tablet_schema);
-    for (const auto& col : subcolumns) {
-        switch (col._parent_col_unique_id) {
-        case 1:
-            EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr);
-            break;
-        case 3:
-            EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr);
-            break;
-        default:
-            EXPECT_TRUE(false);
-        }
-    }
-    EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7);
-
-    for (const auto& col : tablet_schema->_cols) {
-        if (!col->is_extracted_column()) {
-            continue;
-        }
-        switch (col->_parent_col_unique_id) {
-        case 1:
-            EXPECT_TRUE(col->is_bf_column());
-            break;
-        case 3:
-            EXPECT_TRUE(!col->is_bf_column());
-            break;
-        default:
-            EXPECT_TRUE(false);
-        }
-    }
-}
-
-} // namespace doris
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
index 02353fc5441..4f4a601e63c 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
@@ -416,7 +416,7 @@ class IndexCompactionUtils {
         // only base compaction can handle delete predicate
         BaseCompaction compaction(*engine_ref, tablet);
         compaction._input_rowsets = std::move(rowsets);
-        compaction.build_basic_info();
+        RETURN_IF_ERROR(compaction.build_basic_info());
 
         std::vector<RowsetReaderSharedPtr> input_rs_readers;
         create_input_rowsets_readers(compaction, input_rs_readers);
diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
new file mode 100644
index 00000000000..df8428ac877
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -0,0 +1,663 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+#include "olap/storage_engine.h"
+#include "testutil/schema_utils.h"
+#include "testutil/variant_util.h"
+
+using namespace doris::vectorized;
+
+namespace doris {
+
+constexpr static uint32_t MAX_PATH_LEN = 1024;
+constexpr static std::string_view dest_dir = 
"/ut_dir/variant_column_writer_test";
+constexpr static std::string_view tmp_dir = "./ut_dir/tmp";
+
+class VariantColumnWriterReaderTest : public testing::Test {
+public:
+    void SetUp() override {
+        // absolute dir
+        char buffer[MAX_PATH_LEN];
+        EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        _current_dir = std::string(buffer);
+        _absolute_dir = _current_dir + std::string(dest_dir);
+        
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok());
+
+        // tmp dir
+        
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        
EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        Status st = tmp_file_dirs->init();
+        EXPECT_TRUE(st.ok()) << st.to_json();
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        // storage engine
+        doris::EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        _engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir);
+        static_cast<void>(_data_dir->update_capacity());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+    }
+
+    void TearDown() override {
+        
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        _engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+    }
+
+    VariantColumnWriterReaderTest() = default;
+    ~VariantColumnWriterReaderTest() override = default;
+
+private:
+    TabletSchemaSPtr _tablet_schema = nullptr;
+    StorageEngine* _engine_ref = nullptr;
+    std::unique_ptr<DataDir> _data_dir = nullptr;
+    TabletSharedPtr _tablet = nullptr;
+    std::string _absolute_dir;
+    std::string _current_dir;
+};
+
+void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) {
+    EXPECT_TRUE(column_meta.has_column_path_info());
+    auto path = std::make_shared<vectorized::PathInData>();
+    path->from_protobuf(column_meta.column_path_info());
+    EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1);
+    EXPECT_EQ(column_meta.none_null_size(), 
path_with_size[path->copy_pop_front().get_path()]);
+}
+
+void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto& 
path_with_size) {
+    EXPECT_TRUE(column_meta.has_column_path_info());
+    auto path = std::make_shared<vectorized::PathInData>();
+    path->from_protobuf(column_meta.column_path_info());
+    EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1);
+    for (const auto& [path, size] :
+         column_meta.variant_statistics().sparse_column_non_null_size()) {
+        EXPECT_EQ(size, path_with_size[path]);
+    }
+    EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__");
+}
+
+TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) {
+    // 1. create tablet_schema
+    TabletSchemaPB schema_pb;
+    schema_pb.set_keys_type(KeysType::DUP_KEYS);
+    SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1");
+    _tablet_schema = std::make_shared<TabletSchema>();
+    _tablet_schema->init_from_pb(schema_pb);
+
+    // 2. create tablet
+    TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+    tablet_meta->_tablet_id = 10000;
+    _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, 
_data_dir.get());
+
+    EXPECT_TRUE(_tablet->init().ok());
+    
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+    
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+    // 3. create file_writer
+    io::FileWriterPtr file_writer;
+    auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+    auto st = io::global_local_filesystem()->create_file(file_path, 
&file_writer);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    // 4. create column_writer
+    SegmentFooterPB footer;
+    ColumnWriterOptions opts;
+    opts.meta = footer.add_columns();
+    opts.compression_type = CompressionTypePB::LZ4;
+    opts.file_writer = file_writer.get();
+    opts.footer = &footer;
+    RowsetWriterContext rowset_ctx;
+    rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+    opts.rowset_ctx = &rowset_ctx;
+    opts.rowset_ctx->tablet_schema = _tablet_schema;
+    TabletColumn column = _tablet_schema->column(0);
+    _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+    std::unique_ptr<ColumnWriter> writer;
+    EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), 
&writer).ok());
+    EXPECT_TRUE(writer->init().ok());
+    EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr);
+
+    // 5. write data
+    auto olap_data_convertor = 
std::make_unique<vectorized::OlapBlockDataConvertor>();
+    auto block = _tablet_schema->create_block();
+    auto column_object = 
(*std::move(block.get_by_position(0).column)).mutate();
+    std::unordered_map<int, std::string> inserted_jsonstr;
+    auto path_with_size =
+            VariantUtil::fill_object_column_with_test_data(column_object, 
1000, &inserted_jsonstr);
+    olap_data_convertor->add_column_data_convertor(column);
+    olap_data_convertor->set_source_content(&block, 0, 1000);
+    auto [result, accessor] = olap_data_convertor->convert_column_data(0);
+    EXPECT_TRUE(result.ok());
+    EXPECT_TRUE(accessor != nullptr);
+    EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 
1000).ok());
+    st = writer->finish();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_data();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_ordinal_index();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_zone_map();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(file_writer->close().ok());
+    footer.set_num_rows(1000);
+
+    // 6. check footer
+    EXPECT_EQ(footer.columns_size(), 5);
+    auto column_meta = footer.columns(0);
+    EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+    for (int i = 1; i < footer.columns_size() - 1; ++i) {
+        auto column_meta = footer.columns(i);
+        check_column_meta(column_meta, path_with_size);
+    }
+    check_sparse_column_meta(footer.columns(footer.columns_size() - 1), 
path_with_size);
+
+    // 7. check variant reader
+    io::FileReaderSPtr file_reader;
+    st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    ColumnReaderOptions read_opts;
+    std::unique_ptr<ColumnReader> column_reader;
+    st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, 
&column_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    auto variant_column_reader = 
assert_cast<VariantColumnReader*>(column_reader.get());
+    EXPECT_TRUE(variant_column_reader != nullptr);
+
+    auto subcolumn_reader = 
variant_column_reader->get_reader_by_path(PathInData("key0"));
+    EXPECT_TRUE(subcolumn_reader != nullptr);
+    subcolumn_reader = 
variant_column_reader->get_reader_by_path(PathInData("key1"));
+    EXPECT_TRUE(subcolumn_reader != nullptr);
+    subcolumn_reader = 
variant_column_reader->get_reader_by_path(PathInData("key2"));
+    EXPECT_TRUE(subcolumn_reader != nullptr);
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3")));
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4")));
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5")));
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6")));
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7")));
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8")));
+    
EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9")));
+    auto size = variant_column_reader->get_metadata_size();
+    EXPECT_GT(size, 0);
+
+    // 8. check statistics
+    auto statistics = variant_column_reader->get_stats();
+    for (const auto& [path, size] : statistics->subcolumns_non_null_size) {
+        EXPECT_EQ(path_with_size[path], size);
+    }
+    for (const auto& [path, size] : statistics->sparse_column_non_null_size) {
+        EXPECT_EQ(path_with_size[path], size);
+    }
+
+    // 9. check hier reader
+    ColumnIterator* it;
+    TabletColumn parent_column = _tablet_schema->column(0);
+    StorageReadOptions storage_read_opts;
+    storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY;
+    st = variant_column_reader->new_iterator(&it, parent_column, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr);
+    ColumnIteratorOptions column_iter_opts;
+    OlapReaderStatistics stats;
+    column_iter_opts.stats = &stats;
+    column_iter_opts.file_reader = file_reader.get();
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    MutableColumnPtr new_column_object = ColumnObject::create(3);
+    size_t nrows = 1000;
+    st = it->seek_to_ordinal(0);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = it->next_batch(&nrows, new_column_object);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(stats.bytes_read > 0);
+
+    for (int i = 0; i < 1000; ++i) {
+        std::string value;
+        st = assert_cast<ColumnObject*>(new_column_object.get())
+                     ->serialize_one_row_to_string(i, &value);
+
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_EQ(value, inserted_jsonstr[i]);
+    }
+
+    std::vector<rowid_t> row_ids;
+    for (int i = 0; i < 1000; ++i) {
+        if (i % 7 == 0) {
+            row_ids.push_back(i);
+        }
+    }
+    new_column_object = ColumnObject::create(3);
+    st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    for (int i = 0; i < row_ids.size(); ++i) {
+        std::string value;
+        st = assert_cast<ColumnObject*>(new_column_object.get())
+                     ->serialize_one_row_to_string(i, &value);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]);
+    }
+
+    auto read_to_column_object = [&]() {
+        new_column_object = ColumnObject::create(3);
+        nrows = 1000;
+        st = it->seek_to_ordinal(0);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        st = it->next_batch(&nrows, new_column_object);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_TRUE(stats.bytes_read > 0);
+        EXPECT_EQ(nrows, 1000);
+    };
+
+    // 10. check sparse extract reader
+    for (int i = 3; i < 10; ++i) {
+        std::string key = ".key" + std::to_string(i);
+        TabletColumn subcolumn_in_sparse;
+        subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key);
+        subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+        subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id());
+        
subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + 
key));
+        subcolumn_in_sparse.set_variant_max_subcolumns_count(
+                parent_column.variant_max_subcolumns_count());
+        subcolumn_in_sparse.set_is_nullable(true);
+
+        st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse, 
&storage_read_opts);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr);
+        st = it->init(column_iter_opts);
+        EXPECT_TRUE(st.ok()) << st.msg();
+
+        read_to_column_object();
+
+        for (int row = 0; row < 1000; ++row) {
+            std::string value;
+            st = assert_cast<ColumnObject*>(new_column_object.get())
+                         ->serialize_one_row_to_string(row, &value);
+            EXPECT_TRUE(st.ok()) << st.msg();
+            if (inserted_jsonstr[row].find(key) != std::string::npos) {
+                if (i % 2 == 0) {
+                    EXPECT_EQ(value, "88");
+                } else {
+                    EXPECT_EQ(value, "str99");
+                }
+            }
+        }
+    }
+
+    // 11. check leaf reader
+    auto check_leaf_reader = [&]() {
+        for (int i = 0; i < 3; ++i) {
+            std::string key = ".key" + std::to_string(i);
+            TabletColumn subcolumn;
+            subcolumn.set_name(parent_column.name_lower_case() + key);
+            subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type());
+            subcolumn.set_parent_unique_id(parent_column.unique_id());
+            subcolumn.set_path_info(PathInData(parent_column.name_lower_case() 
+ key));
+            subcolumn.set_variant_max_subcolumns_count(
+                    parent_column.variant_max_subcolumns_count());
+            subcolumn.set_is_nullable(true);
+
+            st = variant_column_reader->new_iterator(&it, subcolumn, 
&storage_read_opts);
+            EXPECT_TRUE(st.ok()) << st.msg();
+            EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr);
+            st = it->init(column_iter_opts);
+            EXPECT_TRUE(st.ok()) << st.msg();
+
+            auto column_type = 
DataTypeFactory::instance().create_data_type(subcolumn, false);
+            auto read_column = column_type->create_column();
+            nrows = 1000;
+            st = it->seek_to_ordinal(0);
+            EXPECT_TRUE(st.ok()) << st.msg();
+            st = it->next_batch(&nrows, read_column);
+            EXPECT_TRUE(st.ok()) << st.msg();
+            EXPECT_TRUE(stats.bytes_read > 0);
+
+            for (int row = 0; row < 1000; ++row) {
+                const std::string& value = 
column_type->to_string(*read_column, row);
+                if (inserted_jsonstr[row].find(key) != std::string::npos) {
+                    if (i % 2 == 0) {
+                        EXPECT_EQ(value, "88");
+                    } else {
+                        EXPECT_EQ(value, "str99");
+                    }
+                }
+            }
+        }
+    };
+    check_leaf_reader();
+
+    // 12. check empty
+    TabletColumn subcolumn;
+    subcolumn.set_name(parent_column.name_lower_case() + ".key10");
+    subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+    subcolumn.set_parent_unique_id(parent_column.unique_id());
+    subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + 
".key10"));
+    subcolumn.set_is_nullable(true);
+    st = variant_column_reader->new_iterator(&it, subcolumn, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr);
+
+    // 13. check statistics size == limit
+    auto& variant_stats = variant_column_reader->_statistics;
+    EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() <
+                VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+    auto limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE -
+                 variant_stats->sparse_column_non_null_size.size();
+    for (int i = 0; i < limit; ++i) {
+        std::string key = parent_column.name_lower_case() + ".key10" + 
std::to_string(i);
+        variant_stats->sparse_column_non_null_size[key] = 10000;
+    }
+    EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() ==
+                VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+
+    st = variant_column_reader->new_iterator(&it, subcolumn, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr);
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    auto check_empty_column = [&]() {
+        for (int row = 0; row < 1000; ++row) {
+            std::string value;
+            st = assert_cast<ColumnObject*>(new_column_object.get())
+                         ->serialize_one_row_to_string(row, &value);
+
+            EXPECT_TRUE(st.ok()) << st.msg();
+            EXPECT_EQ(value, "{}");
+        }
+    };
+
+    read_to_column_object();
+    check_empty_column();
+
+    // construct tablet schema for compaction
+    storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION;
+    storage_read_opts.tablet_schema = _tablet_schema;
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> 
uid_to_paths_set_info;
+    TabletSchema::PathsSetInfo paths_set_info;
+    paths_set_info.sub_path_set.insert("key0");
+    paths_set_info.sub_path_set.insert("key3");
+    paths_set_info.sub_path_set.insert("key4");
+    paths_set_info.sparse_path_set.insert("key1");
+    paths_set_info.sparse_path_set.insert("key2");
+    paths_set_info.sparse_path_set.insert("key5");
+    paths_set_info.sparse_path_set.insert("key6");
+    paths_set_info.sparse_path_set.insert("key7");
+    paths_set_info.sparse_path_set.insert("key8");
+    paths_set_info.sparse_path_set.insert("key9");
+    uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info;
+    _tablet_schema->set_path_set_info(uid_to_paths_set_info);
+
+    // 14. check compaction subcolumn reader
+    check_leaf_reader();
+
+    // 15. check compaction root reader
+    st = variant_column_reader->new_iterator(&it, parent_column, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr);
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    // 16. check compaction sparse column
+    TabletColumn sparse_column = 
schema_util::create_sparse_column(parent_column);
+    st = variant_column_reader->new_iterator(&it, sparse_column, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr);
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    auto column_type = 
DataTypeFactory::instance().create_data_type(sparse_column, false);
+    auto read_column = column_type->create_column();
+    nrows = 1000;
+    st = it->seek_to_ordinal(0);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = it->next_batch(&nrows, read_column);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(stats.bytes_read > 0);
+
+    for (int row = 0; row < 1000; ++row) {
+        const std::string& value = column_type->to_string(*read_column, row);
+        EXPECT_TRUE(value.find("key0") == std::string::npos)
+                << "row: " << row << ", value: " << value;
+        EXPECT_TRUE(value.find("key3") == std::string::npos)
+                << "row: " << row << ", value: " << value;
+        EXPECT_TRUE(value.find("key4") == std::string::npos)
+                << "row: " << row << ", value: " << value;
+    }
+
+    // 17. check limit = 10000
+    subcolumn.set_name(parent_column.name_lower_case() + ".key10");
+    subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + 
".key10"));
+    st = variant_column_reader->new_iterator(&it, subcolumn, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr);
+
+    for (int i = 0; i < limit; ++i) {
+        std::string key = parent_column.name_lower_case() + ".key10" + 
std::to_string(i);
+        variant_stats->sparse_column_non_null_size.erase(key);
+    }
+
+    // 18. check compaction sparse extract column
+    subcolumn.set_name(parent_column.name_lower_case() + ".key3");
+    subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + 
".key3"));
+    st = variant_column_reader->new_iterator(&it, subcolumn, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr);
+
+    // 19. check compaction default column
+    subcolumn.set_name(parent_column.name_lower_case() + ".key10");
+    subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + 
".key10"));
+    st = variant_column_reader->new_iterator(&it, subcolumn, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr);
+    
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+}
+
+TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) {
+    /* Round-trip test for nested variant data: 1000 generated JSON rows are written through a
+     * VariantColumnWriter into a local segment file, then the footer metas, the reader
+     * statistics and the values read back through the variant iterators are checked.
+     *
+     * 1. Create a tablet schema with a single VARIANT column "V1" (unique id 1) whose
+     *    variant_max_subcolumns_count is 10. */
+    TabletSchemaPB schema_pb;
+    schema_pb.set_keys_type(KeysType::DUP_KEYS);
+    SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 
10);
+    _tablet_schema = std::make_shared<TabletSchema>();
+    _tablet_schema->init_from_pb(schema_pb);
+
+    // 2. Create tablet 10000 on the fixture's data dir and start from an empty tablet directory.
+    TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+    tablet_meta->_tablet_id = 10000;
+    _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, 
_data_dir.get());
+    EXPECT_TRUE(_tablet->init().ok());
+    
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+    
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+    // 3. Create the file writer for the path returned by local_segment_path(tablet_path, "0", 0).
+    io::FileWriterPtr file_writer;
+    auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+    auto st = io::global_local_filesystem()->create_file(file_path, 
&file_writer);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    /* 4. Create the column writer. The meta of schema column 0 becomes the first footer entry;
+     *    LZ4 compression and a TYPE_DIRECT rowset context are used. The created writer is
+     *    expected to be a VariantColumnWriter. */
+    SegmentFooterPB footer;
+    ColumnWriterOptions opts;
+    opts.meta = footer.add_columns();
+    opts.compression_type = CompressionTypePB::LZ4;
+    opts.file_writer = file_writer.get();
+    opts.footer = &footer;
+    RowsetWriterContext rowset_ctx;
+    rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+    opts.rowset_ctx = &rowset_ctx;
+    opts.rowset_ctx->tablet_schema = _tablet_schema;
+    TabletColumn column = _tablet_schema->column(0);
+    _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+    std::unique_ptr<ColumnWriter> writer;
+    EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), 
&writer).ok());
+    EXPECT_TRUE(writer->init().ok());
+    EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr);
+
+    /* 5. Fill the block's variant column with 1000 nested JSON rows, convert it and write it.
+     *    path_with_size maps every generated path to the number of rows containing it;
+     *    inserted_jsonstr keeps the generated JSON text per row index. */
+    auto olap_data_convertor = 
std::make_unique<vectorized::OlapBlockDataConvertor>();
+    auto block = _tablet_schema->create_block();
+    auto column_object = 
(*std::move(block.get_by_position(0).column)).mutate();
+    std::unordered_map<int, std::string> inserted_jsonstr;
+    auto path_with_size = 
VariantUtil::fill_object_column_with_nested_test_data(column_object, 1000,
+                                                                               
 &inserted_jsonstr);
+    olap_data_convertor->add_column_data_convertor(column);
+    olap_data_convertor->set_source_content(&block, 0, 1000);
+    auto [result, accessor] = olap_data_convertor->convert_column_data(0);
+    EXPECT_TRUE(result.ok());
+    EXPECT_TRUE(accessor != nullptr);
+    EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 
1000).ok());
+    st = writer->finish();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_data();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_ordinal_index();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_zone_map();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(file_writer->close().ok());
+    footer.set_num_rows(1000);
+
+    /* 6. Check the footer: 12 column metas are expected. Entry 0 is the VARIANT root, entries
+     *    1..10 go through check_column_meta and the last entry through
+     *    check_sparse_column_meta. */
+    EXPECT_EQ(footer.columns_size(), 12);
+    auto column_meta = footer.columns(0);
+    EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+    for (int i = 1; i < footer.columns_size() - 1; ++i) {
+        auto column_meta = footer.columns(i);
+        check_column_meta(column_meta, path_with_size);
+    }
+    check_sparse_column_meta(footer.columns(footer.columns_size() - 1), 
path_with_size);
+
+    // 7. Re-open the file and build a ColumnReader for footer column 0 with 1000 rows; it is expected to be a VariantColumnReader.
+    io::FileReaderSPtr file_reader;
+    st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    ColumnReaderOptions read_opts;
+    std::unique_ptr<ColumnReader> column_reader;
+    st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, 
&column_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    auto variant_column_reader = 
assert_cast<VariantColumnReader*>(column_reader.get());
+    EXPECT_TRUE(variant_column_reader != nullptr);
+
+    // 8. Every per-path non-null count reported by the reader must equal the count recorded while generating the data.
+    auto statistics = variant_column_reader->get_stats();
+    for (const auto& [path, size] : statistics->subcolumns_non_null_size) {
+        std::cout << "path: " << path << ", size: " << size << std::endl;
+        EXPECT_EQ(path_with_size[path], size);
+    }
+    for (const auto& [path, size] : statistics->sparse_column_non_null_size) {
+        std::cout << "sparse path: " << path << ", size: " << size << 
std::endl;
+        EXPECT_EQ(path_with_size[path], size);
+    }
+
+    /* 9. Read the root column back with reader_type READER_QUERY. The iterator is expected to be
+     *    a HierarchicalDataReader and every row must serialize to the JSON text recorded for it.
+     *
+     *    NOTE(review): `it` is a raw pointer that every new_iterator() call below overwrites and
+     *    that this test never deletes. If new_iterator() hands ownership to the caller these
+     *    iterators leak - confirm against the reader API. */
+    ColumnIterator* it;
+    TabletColumn parent_column = _tablet_schema->column(0);
+    StorageReadOptions storage_read_opts;
+    storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY;
+    st = variant_column_reader->new_iterator(&it, parent_column, 
&storage_read_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr);
+    ColumnIteratorOptions column_iter_opts;
+    OlapReaderStatistics stats;
+    column_iter_opts.stats = &stats;
+    column_iter_opts.file_reader = file_reader.get();
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    MutableColumnPtr new_column_object = ColumnObject::create(3); // NOTE(review): 3 here, but 10 in read_to_column_object() and in the schema - confirm this is intended.
+    size_t nrows = 1000;
+    st = it->seek_to_ordinal(0);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = it->next_batch(&nrows, new_column_object);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(stats.bytes_read > 0);
+
+    for (int i = 0; i < 1000; ++i) {
+        std::string value;
+        st = assert_cast<ColumnObject*>(new_column_object.get())
+                     ->serialize_one_row_to_string(i, &value);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_EQ(value, inserted_jsonstr[i]);
+    }
+
+     auto read_to_column_object = [&]() { // Replaces new_column_object with a fresh ColumnObject and reads 1000 rows from ordinal 0 through the current `it`.
+        new_column_object = ColumnObject::create(10);
+        nrows = 1000;
+        st = it->seek_to_ordinal(0);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        st = it->next_batch(&nrows, new_column_object);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_TRUE(stats.bytes_read > 0);
+        EXPECT_EQ(nrows, 1000);
+    };
+
+    auto check_key_stats = [&](const std::string& key_num) { // Reads sub-path "<parent>.key<key_num>" as a VARIANT column and compares row counts with path_with_size.
+        std::string key = ".key" + key_num;
+        TabletColumn subcolumn_in_nested;
+        subcolumn_in_nested.set_name(parent_column.name_lower_case() + key);
+        subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+        subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id());
+        
subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + 
key));
+        subcolumn_in_nested.set_variant_max_subcolumns_count(
+                parent_column.variant_max_subcolumns_count());
+        subcolumn_in_nested.set_is_nullable(true);
+
+        st = variant_column_reader->new_iterator(&it, subcolumn_in_nested, 
&storage_read_opts);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr);
+        st = it->init(column_iter_opts);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        read_to_column_object();
+
+        size_t key_count = 0;
+        size_t key_nested_count = 0;
+        for (int row = 0; row < 1000; ++row) {
+            std::string value;
+            st = assert_cast<ColumnObject*>(new_column_object.get())
+                        ->serialize_one_row_to_string(row, &value);
+            EXPECT_TRUE(st.ok()) << st.msg();
+            if (value.find("nested" + key_num) != std::string::npos) { // serialized row mentions "nested<key_num>"
+                key_nested_count++;
+            } else if (value.find("88") != std::string::npos) { // otherwise: serialized row contains the text 88
+                key_count++;
+            }
+        }
+        EXPECT_EQ(key_count, path_with_size["key" + key_num]); // rows generated with the plain "key<N>" path
+        EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" 
+ key_num]);
+    };
+
+    for (int i = 3; i < 10; ++i) { // key3..key9: the keys the nested generator emits either as a plain value or as a nested object
+        check_key_stats(std::to_string(i));
+    }
+
+    
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/test/olap/rowset/variant_with_compaction_test.cpp 
b/be/test/olap/rowset/variant_with_compaction_test.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/be/test/testutil/schema_utils.h b/be/test/testutil/schema_utils.h
new file mode 100644
index 00000000000..fdb8354f975
--- /dev/null
+++ b/be/test/testutil/schema_utils.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "vec/common/schema_util.h"
+
+namespace doris {
+
+// Test helper for populating tablet-schema protobuf messages.
+class SchemaUtils {
+public:
+    // Fills `column_pb` with the given unique id, name, type and key flag and marks it
+    // non-nullable. `variant_max_subcolumns_count` is applied only when `column_type` is
+    // exactly "VARIANT"; for every other type the field is left untouched.
+    static void construct_column(ColumnPB* column_pb, int32_t col_unique_id,
+                                 const std::string& column_type, const std::string& column_name,
+                                 int variant_max_subcolumns_count = 3, bool is_key = false) {
+        ColumnPB& col = *column_pb;
+        col.set_unique_id(col_unique_id);
+        col.set_name(column_name);
+        col.set_type(column_type);
+        col.set_is_key(is_key);
+        col.set_is_nullable(false);
+
+        const bool is_variant = (column_type == "VARIANT");
+        if (!is_variant) {
+            return;
+        }
+        col.set_variant_max_subcolumns_count(variant_max_subcolumns_count);
+    }
+
+    // Fills `tablet_index` as an INVERTED index with the given id and name and appends
+    // `col_unique_id` to its list of column unique ids.
+    static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id,
+                                       const std::string& index_name, int32_t col_unique_id) {
+        TabletIndexPB& index = *tablet_index;
+        index.set_index_id(index_id);
+        index.set_index_name(index_name);
+        index.set_index_type(IndexType::INVERTED);
+        index.add_col_unique_id(col_unique_id);
+    }
+};
+
+} // namespace doris
diff --git a/be/test/testutil/variant_util.h b/be/test/testutil/variant_util.h
new file mode 100644
index 00000000000..27b9000ce55
--- /dev/null
+++ b/be/test/testutil/variant_util.h
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "vec/columns/column_object.h"
+#include "vec/columns/column_string.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/parse2column.h"
+
+namespace doris {
+
+using namespace vectorized;
+
+// Test-only generators of deterministic JSON rows for string and variant columns.
+//
+// Every generator reseeds std::rand() with 42 before producing rows, so the same `size` yields
+// the same rows on a given std::rand() implementation.
+class VariantUtil {
+public:
+    // Appends `size` flat JSON objects to `column_string` through insert_data().
+    //
+    // Row i contains the keys "key0".."key<k-1>" with k = std::rand() % 10 + 1. Even-numbered
+    // keys hold the integer 88 and odd-numbered keys hold the string "str99". The generated text
+    // of row i is stored in (*inserted_jsonstr)[i].
+    //
+    // Returns, for every key, the number of rows that contain it.
+    static schema_util::PathToNoneNullValues fill_string_column_with_test_data(
+            auto& column_string, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        schema_util::PathToNoneNullValues all_path_stats;
+        std::srand(42);
+        for (int i = 0; i < size; i++) {
+            std::string json_str = "{";
+            const int num_pairs = std::rand() % 10 + 1;
+            for (int j = 0; j < num_pairs; j++) {
+                const std::string key = "key" + std::to_string(j);
+                if (j % 2 == 0) {
+                    const int value = 88;
+                    json_str += "\"" + key + "\":" + std::to_string(value);
+                } else {
+                    const std::string value = "str" + std::to_string(99);
+                    json_str += "\"" + key + "\":\"" + value + "\"";
+                }
+                if (j < num_pairs - 1) {
+                    json_str += ",";
+                }
+                all_path_stats[key] += 1;
+            }
+            json_str += "}";
+            column_string->insert_data(json_str.data(), json_str.size());
+            (*inserted_jsonstr)[i] = json_str;
+        }
+        return all_path_stats;
+    }
+
+    // Appends `size` nested JSON objects to `column_string` through insert_data().
+    //
+    // Every row starts with the fixed prefix
+    //     "key0":{"key1":{"key2":88,"key3":"88"},"key4":88}
+    // which accounts for the paths key0.key1.key2, key0.key1.key3 and key0.key4. After that,
+    // for n = 3 .. (std::rand() % 9 + 2) - 1, a coin flip (std::rand() % 2) decides whether
+    // "key<n>" is emitted as the nested object {"nested<n>":88} (path key<n>.nested<n>) or as the
+    // plain string "88" (path key<n>). The generated text of row i is stored in
+    // (*inserted_jsonstr)[i].
+    //
+    // Returns, for every generated path, the number of rows that contain it.
+    static schema_util::PathToNoneNullValues fill_string_column_with_nested_test_data(
+            auto& column_string, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        schema_util::PathToNoneNullValues all_path_stats;
+        std::srand(42);
+        for (int i = 0; i < size; i++) {
+            std::string json_str = "{";
+
+            const int num_paths = std::rand() % 9 + 2;
+            int current_path = 0;
+
+            json_str += "\"key0\":{";
+
+            json_str += "\"key1\":{";
+
+            json_str += "\"key2\":" + std::to_string(88) + ",";
+            json_str += "\"key3\":\"" + std::to_string(88) + "\"";
+            json_str += "},";
+            json_str += "\"key4\":" + std::to_string(88);
+            json_str += "},";
+
+            all_path_stats["key0.key1.key2"] += 1;
+            all_path_stats["key0.key1.key3"] += 1;
+            all_path_stats["key0.key4"] += 1;
+            current_path += 3;
+
+            while (current_path < num_paths) {
+                const std::string key = "key" + std::to_string(current_path);
+                if (std::rand() % 2 == 0) {
+                    // Add a nested path: "key<n>":{"nested<n>":88}
+                    json_str += "\"" + key + "\":{";
+                    json_str +=
+                            "\"nested" + std::to_string(current_path) + "\":" + std::to_string(88);
+                    json_str += "},";
+                    all_path_stats[key + ".nested" + std::to_string(current_path)] += 1;
+                } else {
+                    // Add a simple path: "key<n>":"88"
+                    json_str += "\"" + key + "\":\"" + std::to_string(88) + "\",";
+                    all_path_stats[key] += 1;
+                }
+                current_path++;
+            }
+
+            // Every fragment above ends with ',', so the last character is always the trailing
+            // comma that has to go before the object is closed.
+            json_str.pop_back();
+            json_str += "}";
+
+            column_string->insert_data(json_str.data(), json_str.size());
+            (*inserted_jsonstr)[i] = json_str;
+        }
+        return all_path_stats;
+    }
+
+    // Generates `size` flat rows with fill_string_column_with_test_data() into a temporary
+    // string column and parses that column into `*column_object` with parse_json_to_variant()
+    // (enable_flatten_nested = false). Returns the generator's per-path row counts.
+    static schema_util::PathToNoneNullValues fill_object_column_with_test_data(
+            auto& column_object, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        auto type_string = std::make_shared<vectorized::DataTypeString>();
+        auto column = type_string->create_column();
+        auto column_string = assert_cast<ColumnString*>(column.get());
+        auto res = fill_string_column_with_test_data(column_string, size, inserted_jsonstr);
+        vectorized::ParseConfig config;
+        config.enable_flatten_nested = false;
+        parse_json_to_variant(*column_object, *column_string, config);
+        return res;
+    }
+
+    // Same as fill_object_column_with_test_data(), but the rows come from
+    // fill_string_column_with_nested_test_data().
+    static schema_util::PathToNoneNullValues fill_object_column_with_nested_test_data(
+            auto& column_object, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        auto type_string = std::make_shared<vectorized::DataTypeString>();
+        auto column = type_string->create_column();
+        auto column_string = assert_cast<ColumnString*>(column.get());
+        auto res = fill_string_column_with_nested_test_data(column_string, size, inserted_jsonstr);
+        vectorized::ParseConfig config;
+        config.enable_flatten_nested = false;
+        parse_json_to_variant(*column_object, *column_string, config);
+        return res;
+    }
+};
+
+} // namespace doris
diff --git a/be/test/vec/common/schema_util_rowset_test.cpp 
b/be/test/vec/common/schema_util_rowset_test.cpp
new file mode 100644
index 00000000000..eb4b2ad4c39
--- /dev/null
+++ b/be/test/vec/common/schema_util_rowset_test.cpp
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gmock/gmock-more-matchers.h>
+#include <gtest/gtest.h>
+
+#include "olap/rowset/beta_rowset_writer.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+#include "olap/storage_engine.h"
+#include "olap/tablet_schema.h"
+#include "vec/common/schema_util.h"
+#include "vec/json/parse2column.h"
+
+using namespace doris::vectorized;
+
+using namespace doris::segment_v2;
+
+using namespace doris;
+
+constexpr static uint32_t MAX_PATH_LEN = 1024;
+constexpr static std::string_view dest_dir = "/ut_dir/schema_util_test";
+constexpr static std::string_view tmp_dir = "./ut_dir/tmp";
+
+// Fixture for rowset-level schema_util tests.
+//
+// SetUp() recreates two scratch directories (`dest_dir` below the current working directory and
+// `tmp_dir`), registers the tmp-file dirs with ExecEnv and installs a fresh StorageEngine together
+// with a DataDir rooted at the absolute scratch directory. TearDown() removes both directories
+// and clears the engine registered with ExecEnv.
+class SchemaUtilRowsetTest : public testing::Test {
+protected:
+    void SetUp() override {
+        // absolute dir
+        // The buffer content is unspecified when getcwd() fails, so that case is fatal:
+        // ASSERT_* leaves SetUp() right away instead of building paths from garbage.
+        char buffer[MAX_PATH_LEN] = {};
+        ASSERT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        _curreent_dir = std::string(buffer);
+        _absolute_dir = _curreent_dir + std::string(dest_dir);
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok());
+
+        // tmp dir
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        EXPECT_TRUE(tmp_file_dirs->init().ok());
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        // storage engine
+        // Ownership of the engine moves to ExecEnv below; `_engine_ref` is only a non-owning
+        // handle for the fixture and the tests.
+        doris::EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        _engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir);
+        static_cast<void>(_data_dir->update_capacity());
+        EXPECT_TRUE(_data_dir->init(true).ok());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+    }
+    void TearDown() override {
+        //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        // Drop the non-owning handle before ExecEnv is told to release the engine.
+        _engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+    }
+
+public:
+    SchemaUtilRowsetTest() = default;
+    ~SchemaUtilRowsetTest() override = default;
+
+private:
+    // Non-owning pointer to the engine installed into ExecEnv by SetUp().
+    StorageEngine* _engine_ref = nullptr;
+    std::unique_ptr<DataDir> _data_dir = nullptr;
+    // Not assigned anywhere in this class.
+    TabletSharedPtr _tablet = nullptr;
+    // Current working directory + `dest_dir`.
+    std::string _absolute_dir;
+    // Current working directory at SetUp() time. The spelling is kept as-is because code outside
+    // this class may refer to the member by this name.
+    std::string _curreent_dir;
+};
+
+// Fills `column_pb` with the given unique id, name, type and key flag. The column is
+// always marked non-nullable; columns of type "VARIANT" additionally get a
+// variant_max_subcolumns_count of 3.
+static void construct_column(ColumnPB* column_pb, int32_t col_unique_id,
+                             const std::string& column_type, const std::string& column_name,
+                             bool is_key = false) {
+    const bool is_variant = (column_type == "VARIANT");
+
+    column_pb->set_unique_id(col_unique_id);
+    column_pb->set_name(column_name);
+    column_pb->set_type(column_type);
+    column_pb->set_is_key(is_key);
+    column_pb->set_is_nullable(false);
+
+    if (!is_variant) {
+        return;
+    }
+    column_pb->set_variant_max_subcolumns_count(3);
+}
+
+// static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t 
index_id, const std::string& index_name, int32_t col_unique_id) {
+//     tablet_index->set_index_id(index_id);
+//     tablet_index->set_index_name(index_name);
+//     tablet_index->set_index_type(IndexType::INVERTED);
+//     tablet_index->add_col_unique_id(col_unique_id);
+// }
+
+// Expected per-path occurrence counts, keyed by the `uid` passed to the generator
+// below. Counts accumulate across every call made during the process lifetime.
+static std::unordered_map<int32_t, schema_util::PathToNoneNullValues> all_path_stats;
+
+// Appends `size` generated JSON object strings to `column_string` and records, in
+// all_path_stats[uid], how many times each key was emitted.
+//
+// Every object holds 1..10 pairs named "key0", "key1", ...; each value is either an
+// integer in [0, 100) or a string "str<N>" with N in [0, 100). The generator is
+// re-seeded with a fixed seed on every call, so each call emits the same sequence.
+static void fill_string_column_with_test_data(auto& column_string, int size, int uid) {
+    std::srand(42);
+    for (int i = 0; i < size; i++) {
+        std::string json_str = "{";
+        int num_pairs = std::rand() % 10 + 1;
+        for (int j = 0; j < num_pairs; j++) {
+            std::string key = "key" + std::to_string(j);
+            if (std::rand() % 2 == 0) {
+                int value = std::rand() % 100;
+                json_str += "\"" + key + "\" : " + std::to_string(value);
+            } else {
+                std::string value = "str" + std::to_string(std::rand() % 100);
+                json_str += "\"" + key + "\" : \"" + value + "\"";
+            }
+            if (j < num_pairs - 1) {
+                json_str += ", ";
+            }
+            all_path_stats[uid][key] += 1;
+        }
+        json_str += "}";
+        column_string->insert_data(json_str.data(), json_str.size());
+    }
+}
+
+// Generates `size` JSON strings into a temporary string column and parses them into
+// `variant_column` with nested flattening disabled. `uid` is forwarded to the
+// generator, which uses it as the key for the expected path statistics.
+static void fill_varaint_column(auto& variant_column, int size, int uid) {
+    // Temporary string column that holds the raw JSON text.
+    auto string_type = std::make_shared<vectorized::DataTypeString>();
+    auto json_holder = string_type->create_column();
+    auto json_column = assert_cast<ColumnString*>(json_holder.get());
+    fill_string_column_with_test_data(json_column, size, uid);
+
+    vectorized::ParseConfig parse_config;
+    parse_config.enable_flatten_nested = false;
+    parse_json_to_variant(*variant_column, *json_column, parse_config);
+}
+
+// Appends `size` rows to the five columns of `block`:
+//   column 0 (key): the row index
+//   column 1 (v1) : generated variant data recorded under uid 1
+//   column 2 (v2) : the constant string "V2"
+//   column 3 (v3) : generated variant data recorded under uid 3
+//   column 4 (v4) : the row index
+static void fill_block_with_test_data(vectorized::Block* block, int size) {
+    auto columns = block->mutate_columns();
+
+    // Scalar columns: key, v2 and v4 are filled row by row.
+    for (int row = 0; row < size; ++row) {
+        columns[0]->insert(vectorized::Field(row));
+        columns[2]->insert(vectorized::Field("V2"));
+        columns[4]->insert(vectorized::Field(row));
+    }
+
+    // Variant columns: v1 first, then v3.
+    fill_varaint_column(columns[1], size, 1);
+    fill_varaint_column(columns[3], size, 3);
+}
+// Monotonic counter that gives every writer context its own rowset id and version.
+static int64_t inc_id = 1000;
+
+// Builds a RowsetWriterContext for a VISIBLE BETA_ROWSET at `tablet_path`, using
+// `schema` and `data_dir`, with at most 200 rows per segment. Each call consumes one
+// value of inc_id, used both as the rowset id and as the single-point version [id, id].
+static RowsetWriterContext rowset_writer_context(const std::unique_ptr<DataDir>& data_dir,
+                                                 const TabletSchemaSPtr& schema,
+                                                 const std::string& tablet_path) {
+    const int64_t current_id = inc_id++;
+
+    RowsetId rowset_id;
+    rowset_id.init(current_id);
+
+    RowsetWriterContext context;
+    context.rowset_id = rowset_id;
+    context.version = Version(current_id, current_id);
+    context.rowset_type = BETA_ROWSET;
+    context.rowset_state = VISIBLE;
+    context.data_dir = data_dir.get();
+    context.tablet_schema = schema;
+    context.tablet_path = tablet_path;
+    context.max_rows_per_segment = 200;
+    return context;
+}
+
+// Writes one generated 1000-row block through `rowset_writer`, flushes it and builds
+// the rowset, expecting it to contain 5 segments. Every failure is reported as a
+// non-fatal gtest failure. The returned pointer is null when build() produced no
+// rowset.
+static RowsetSharedPtr create_rowset(auto& rowset_writer, const TabletSchemaSPtr& tablet_schema) {
+    vectorized::Block block = tablet_schema->create_block();
+    fill_block_with_test_data(&block, 1000);
+    auto st = rowset_writer->add_block(&block);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = rowset_writer->flush();
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    RowsetSharedPtr rowset;
+    st = rowset_writer->build(rowset);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    if (rowset == nullptr) {
+        // A failed build leaves nothing to inspect; do not dereference a null rowset.
+        ADD_FAILURE() << "rowset writer did not produce a rowset";
+        return rowset;
+    }
+    // EXPECT_EQ prints the actual segment count on mismatch.
+    EXPECT_EQ(rowset->num_segments(), 5);
+    return rowset;
+}
+
+// Writes five rowsets of generated data into a tablet with two variant columns, then
+// checks that schema_util::collect_path_stats matches the counts recorded while
+// generating and that schema_util::get_compaction_schema yields, for each variant
+// column, the sparse column plus key0, key1 and key2.
+TEST_F(SchemaUtilRowsetTest, collect_path_stats_and_get_compaction_schema) {
+    // 1.create tablet schema
+    TabletSchemaPB schema_pb;
+    construct_column(schema_pb.add_column(), 0, "INT", "key", true);
+    construct_column(schema_pb.add_column(), 1, "VARIANT", "v1");
+    construct_column(schema_pb.add_column(), 2, "STRING", "v2");
+    construct_column(schema_pb.add_column(), 3, "VARIANT", "v3");
+    construct_column(schema_pb.add_column(), 4, "INT", "v4");
+    TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+    tablet_schema->init_from_pb(schema_pb);
+
+    // 2. create tablet
+    TabletMetaSharedPtr tablet_meta(new TabletMeta(tablet_schema));
+    _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get());
+    EXPECT_TRUE(_tablet->init().ok());
+    EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+    // 3. create rowset
+    std::vector<RowsetSharedPtr> rowsets;
+    for (int i = 0; i < 5; i++) {
+        const auto& res = RowsetFactory::create_rowset_writer(
+                *_engine_ref,
+                rowset_writer_context(_data_dir, tablet_schema, _tablet->tablet_path()), false);
+        // Fatal: res.value() must not be reached when the factory returned an error.
+        ASSERT_TRUE(res.has_value()) << res.error();
+        const auto& rowset_writer = res.value();
+        auto rowset = create_rowset(rowset_writer, tablet_schema);
+        EXPECT_TRUE(_tablet->add_rowset(rowset).ok());
+        rowsets.push_back(rowset);
+    }
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    for (const auto& rowset : rowsets) {
+        auto st = schema_util::collect_path_stats(rowset, path_stats);
+        EXPECT_TRUE(st.ok()) << st.msg();
+    }
+
+    // Every collected (uid, path) count must equal the count recorded at generation time.
+    for (const auto& [uid, collected_paths] : path_stats) {
+        for (const auto& [path, size] : collected_paths) {
+            EXPECT_EQ(all_path_stats[uid][path], size);
+        }
+    }
+
+    // 4. get compaction schema
+    TabletSchemaSPtr compaction_schema = tablet_schema;
+    auto st = schema_util::get_compaction_schema(rowsets, compaction_schema);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    // 5. check compaction schema
+    // Group the names of columns that have a parent by that parent's unique id.
+    std::unordered_map<int32_t, std::vector<std::string>> compaction_schema_map;
+    for (const auto& column : compaction_schema->columns()) {
+        if (column->parent_unique_id() > 0) {
+            compaction_schema_map[column->parent_unique_id()].push_back(column->name());
+        }
+    }
+    for (auto& [uid, paths] : compaction_schema_map) {
+        // Fatal: the indexed checks below read paths[0..3].
+        ASSERT_EQ(paths.size(), 4);
+        std::sort(paths.begin(), paths.end());
+        EXPECT_TRUE(paths[0].ends_with("__DORIS_VARIANT_SPARSE__"));
+        EXPECT_TRUE(paths[1].ends_with("key0"));
+        EXPECT_TRUE(paths[2].ends_with("key1"));
+        EXPECT_TRUE(paths[3].ends_with("key2"));
+    }
+}
diff --git a/be/test/vec/common/schema_util_test.cpp 
b/be/test/vec/common/schema_util_test.cpp
new file mode 100644
index 00000000000..ffa790ddf49
--- /dev/null
+++ b/be/test/vec/common/schema_util_test.cpp
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/common/schema_util.h"
+
+#include <gmock/gmock-more-matchers.h>
+#include <gtest/gtest.h>
+
+#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+
+using namespace doris::vectorized;
+
+using namespace doris::segment_v2;
+
+using namespace doris;
+
+// Stateless fixture for the schema_util unit tests in this file.
+class SchemaUtilTest : public testing::Test {
+public:
+    SchemaUtilTest() = default;
+    ~SchemaUtilTest() override = default;
+};
+
+// Fills `column_pb` as a nullable bloom-filter column with the given unique id, name
+// and type, and fills `tablet_index` as an index of `index_type` with the given id
+// and name that covers that column.
+void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id,
+                      const std::string& index_name, int32_t col_unique_id,
+                      const std::string& column_type, const std::string& column_name,
+                      const IndexType& index_type) {
+    // Index definition.
+    tablet_index->set_index_id(index_id);
+    tablet_index->set_index_name(index_name);
+    tablet_index->set_index_type(index_type);
+    tablet_index->add_col_unique_id(col_unique_id);
+
+    // Column definition.
+    column_pb->set_unique_id(col_unique_id);
+    column_pb->set_name(column_name);
+    column_pb->set_type(column_type);
+    column_pb->set_is_nullable(true);
+    column_pb->set_is_bf_column(true);
+}
+
+// Builds a nullable sub-column of `type` at `path` whose parent is `col_unique_id`
+// (its own unique id is -1), appends it to `schema` and stores it in `subcolumns`.
+// An ARRAY sub-column gets a nullable DOUBLE item column.
+void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id,
+                         std::string_view path, std::vector<TabletColumn>* subcolumns) {
+    const vectorized::PathInData path_info(path);
+
+    TabletColumn extracted;
+    extracted.set_type(type);
+    extracted.set_is_nullable(true);
+    extracted.set_unique_id(-1);
+    extracted.set_parent_unique_id(col_unique_id);
+    extracted.set_path_info(path_info);
+    extracted.set_name(path_info.get_path());
+
+    const bool is_array = (type == FieldType::OLAP_FIELD_TYPE_ARRAY);
+    if (is_array) {
+        // double not support inverted index
+        TabletColumn item;
+        item.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE);
+        item.set_is_nullable(true);
+        item.set_unique_id(-1);
+        item.set_parent_unique_id(col_unique_id);
+        extracted.add_sub_column(item);
+    }
+
+    // Append to the schema first: the column is moved into `subcolumns` afterwards.
+    schema->append_column(extracted);
+    subcolumns->emplace_back(std::move(extracted));
+}
+
+// void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type,
+//                                 int32_t col_unique_id, std::string_view 
path,
+//                                 std::vector<TabletColumn>* subcolumns) {
+//     TabletColumn subcol;
+//     subcol.set_type(type);
+//     subcol.set_is_nullable(true);
+//     subcol.set_unique_id(-1);
+//     subcol.set_parent_unique_id(col_unique_id);
+//     vectorized::PathInData col_path(path);
+//     subcol.set_path_info(col_path);
+//     subcol.set_name(col_path.get_path());
+//     schema->append_column(subcol);
+//     subcolumns->emplace_back(std::move(subcol));
+// }
+
+// Builds a schema with one INT column and two VARIANT columns (v1, v3), each with an
+// inverted index, adds extracted sub-columns under v1 and v3, and runs
+// schema_util::inherit_column_attributes. Expected afterwards: sub-columns of v1 have
+// an inverted index and are bloom-filter columns, sub-columns of v3 have neither, and
+// the schema holds 7 inverted indexes.
+TEST_F(SchemaUtilTest, inherit_column_attributes) {
+    TabletSchemaPB schema_pb;
+    schema_pb.set_keys_type(KeysType::DUP_KEYS);
+    schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
+
+    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT",
+                     "key", IndexType::INVERTED);
+    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1,
+                     "VARIANT", "v1", IndexType::INVERTED);
+    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3,
+                     "VARIANT", "v3", IndexType::INVERTED);
+
+    TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+    tablet_schema->init_from_pb(schema_pb);
+
+    // Extracted sub-columns: two under v1, two under v3.
+    std::vector<TabletColumn> subcolumns;
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns);
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns);
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns);
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns);
+
+    schema_util::inherit_column_attributes(tablet_schema);
+
+    // Inverted index presence per parent column.
+    for (const auto& col : subcolumns) {
+        const auto parent_uid = col._parent_col_unique_id;
+        if (parent_uid == 1) {
+            EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr);
+        } else if (parent_uid == 3) {
+            EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr);
+        } else {
+            EXPECT_TRUE(false);
+        }
+    }
+    EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7);
+
+    // Bloom-filter flag per parent column, checked on extracted columns only.
+    for (const auto& col : tablet_schema->_cols) {
+        if (col->is_extracted_column()) {
+            const auto parent_uid = col->_parent_col_unique_id;
+            if (parent_uid == 1) {
+                EXPECT_TRUE(col->is_bf_column());
+            } else if (parent_uid == 3) {
+                EXPECT_TRUE(!col->is_bf_column());
+            } else {
+                EXPECT_TRUE(false);
+            }
+        }
+    }
+}
+
+// Appends `key_size` rows to `column_map`. Row i holds the key "<prefix><i>" repeated
+// k times, where k is drawn from [1, value_size], with values "<prefix>0" ..
+// "<prefix><k-1>". The generator is seeded with a fixed seed on every call.
+// Returns, for each key, how many entries were written for it.
+static std::unordered_map<std::string, int> construct_column_map_with_random_values(
+        auto& column_map, int key_size, int value_size, const std::string& prefix) {
+    auto& keys_column = assert_cast<ColumnString&>(column_map->get_keys());
+    auto& values_column = assert_cast<ColumnString&>(column_map->get_values());
+    auto& row_offsets = column_map->get_offsets();
+
+    std::unordered_map<std::string, int> expected_counts;
+    std::srand(42);
+
+    for (int row = 0; row < key_size; ++row) {
+        const std::string path = prefix + std::to_string(row);
+        const int repeat = std::rand() % value_size + 1;
+        expected_counts[path] = repeat;
+
+        for (int n = 0; n < repeat; ++n) {
+            const std::string payload = prefix + std::to_string(n);
+            keys_column.insert_data(path.data(), path.size());
+            values_column.insert_data(payload.data(), payload.size());
+        }
+        // Close the current map row at the running number of entries.
+        row_offsets.push_back(keys_column.size());
+    }
+
+    return expected_counts;
+}
+
+// Feeds three generated map columns into schema_util::calculate_variant_stats using
+// the same VariantStatisticsPB, and checks after each round that the per-path counts
+// equal the sum of what the rounds so far wrote for that path.
+TEST_F(SchemaUtilTest, calculate_variant_stats) {
+    // Count recorded for `path` in `counts`, or 0 when the path is absent.
+    const auto count_of = [](const std::unordered_map<std::string, int>& counts,
+                             const std::string& path) {
+        const auto found = counts.find(path);
+        return found == counts.end() ? 0 : found->second;
+    };
+
+    VariantStatisticsPB stats;
+    auto column_map = ColumnMap::create(ColumnString::create(), ColumnString::create(),
+                                        ColumnArray::ColumnOffsets::create());
+
+    const auto& key_value_counts =
+            construct_column_map_with_random_values(column_map, 200, 100, "key_");
+
+    // calculate stats
+    schema_util::calculate_variant_stats(*column_map, &stats, 0, 200);
+    EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size());
+
+    for (const auto& kv : key_value_counts) {
+        auto it = stats.sparse_column_non_null_size().find(kv.first);
+        // Fatal: `it` is dereferenced on the next line.
+        ASSERT_NE(it, stats.sparse_column_non_null_size().end());
+        EXPECT_EQ(it->second, kv.second);
+    }
+
+    // test with different key size
+    column_map->clear();
+    const auto& key_value_counts2 =
+            construct_column_map_with_random_values(column_map, 3000, 100, "key_");
+    schema_util::calculate_variant_stats(*column_map, &stats, 0, 3000);
+    EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000);
+
+    for (const auto& [path, size] : stats.sparse_column_non_null_size()) {
+        auto first_size = count_of(key_value_counts, path);
+        auto second_size = count_of(key_value_counts2, path);
+        EXPECT_EQ(size, first_size + second_size);
+    }
+
+    // test with max size
+    column_map->clear();
+    const auto& key_value_counts3 = construct_column_map_with_random_values(
+            column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_");
+    schema_util::calculate_variant_stats(*column_map, &stats, 0,
+                                         VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+    EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE,
+              stats.sparse_column_non_null_size_size());
+
+    for (const auto& [path, size] : stats.sparse_column_non_null_size()) {
+        auto first_size = count_of(key_value_counts, path);
+        auto second_size = count_of(key_value_counts2, path);
+        auto third_size = count_of(key_value_counts3, path);
+        EXPECT_EQ(size, first_size + second_size + third_size);
+    }
+}
+
+// Five paths with a limit of three sub-columns: path1..path3 are expected in the
+// sub-path set and path4, path5 in the sparse-path set.
+TEST_F(SchemaUtilTest, get_subpaths) {
+    const auto has_path = [](const auto& path_set, const char* path) {
+        return path_set.find(path) != path_set.end();
+    };
+
+    TabletColumn variant;
+    variant.set_unique_id(1);
+    variant.set_variant_max_subcolumns_count(3);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[1] = {
+            {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}};
+
+    // get subpaths
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info);
+
+    auto& info = uid_to_paths_set_info[1];
+    EXPECT_EQ(info.sub_path_set.size(), 3);
+    EXPECT_EQ(info.sparse_path_set.size(), 2);
+
+    for (const char* path : {"path1", "path2", "path3"}) {
+        EXPECT_TRUE(has_path(info.sub_path_set, path));
+    }
+    for (const char* path : {"path4", "path5"}) {
+        EXPECT_TRUE(has_path(info.sparse_path_set, path));
+    }
+}
+
+// Exactly as many paths as the sub-column limit: all three are expected in the
+// sub-path set and the sparse-path set is expected to be empty.
+TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) {
+    const auto has_path = [](const auto& path_set, const char* path) {
+        return path_set.find(path) != path_set.end();
+    };
+
+    TabletColumn variant;
+    variant.set_unique_id(1);
+    variant.set_variant_max_subcolumns_count(3);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[1] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}};
+
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info);
+
+    auto& info = uid_to_paths_set_info[1];
+    EXPECT_EQ(info.sub_path_set.size(), 3);
+    EXPECT_EQ(info.sparse_path_set.size(), 0);
+
+    for (const char* path : {"path1", "path2", "path3"}) {
+        EXPECT_TRUE(has_path(info.sub_path_set, path));
+    }
+}
+
+// Three variant columns with limits 3, 2 and 4 share one statistics map, which also
+// holds an entry for uid 4 that is never queried. Each variant is expected to get its
+// own sub-path / sparse-path split.
+TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) {
+    const auto has_path = [](const auto& path_set, const char* path) {
+        return path_set.find(path) != path_set.end();
+    };
+
+    TabletColumn variant1;
+    variant1.set_unique_id(1);
+    variant1.set_variant_max_subcolumns_count(3);
+
+    TabletColumn variant2;
+    variant2.set_unique_id(2);
+    variant2.set_variant_max_subcolumns_count(2);
+
+    TabletColumn variant3;
+    variant3.set_unique_id(3);
+    variant3.set_variant_max_subcolumns_count(4);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[1] = {
+            {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}};
+    path_stats[2] = {{"path1", 1000}, {"path2", 800}};
+    path_stats[3] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}};
+    path_stats[4] = {
+            {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}};
+
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant1, path_stats, uid_to_paths_set_info);
+    schema_util::get_subpaths(variant2, path_stats, uid_to_paths_set_info);
+    schema_util::get_subpaths(variant3, path_stats, uid_to_paths_set_info);
+
+    // Set sizes for all three variants.
+    EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3);
+    EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2);
+
+    EXPECT_EQ(uid_to_paths_set_info[2].sub_path_set.size(), 2);
+    EXPECT_EQ(uid_to_paths_set_info[2].sparse_path_set.size(), 0);
+
+    EXPECT_EQ(uid_to_paths_set_info[3].sub_path_set.size(), 4);
+    EXPECT_EQ(uid_to_paths_set_info[3].sparse_path_set.size(), 0);
+
+    // Set membership for variant 1.
+    for (const char* path : {"path1", "path2", "path3"}) {
+        EXPECT_TRUE(has_path(uid_to_paths_set_info[1].sub_path_set, path));
+    }
+    for (const char* path : {"path4", "path5"}) {
+        EXPECT_TRUE(has_path(uid_to_paths_set_info[1].sparse_path_set, path));
+    }
+
+    // Set membership for variant 2.
+    for (const char* path : {"path1", "path2"}) {
+        EXPECT_TRUE(has_path(uid_to_paths_set_info[2].sub_path_set, path));
+    }
+
+    // Set membership for variant 3.
+    for (const char* path : {"path1", "path2", "path3", "path4"}) {
+        EXPECT_TRUE(has_path(uid_to_paths_set_info[3].sub_path_set, path));
+    }
+}
+
+// The statistics map has an entry only for uid 2, while the variant under test has
+// uid 1: both of its path sets are expected to be empty.
+TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) {
+    TabletColumn variant;
+    variant.set_unique_id(1);
+    variant.set_variant_max_subcolumns_count(3);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[2] = {{"path1", 1000}, {"path2", 800}};
+
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info);
+
+    auto& info = uid_to_paths_set_info[1];
+    EXPECT_EQ(info.sub_path_set.size(), 0);
+    EXPECT_EQ(info.sparse_path_set.size(), 0);
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to