This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/variant-sparse by this push:
     new 89b3d4178f7 fix 7 (#45680)
89b3d4178f7 is described below

commit 89b3d4178f7004fa5c67a8cabd14f9e4e1b8cc35
Author: lihangyu <lihan...@selectdb.com>
AuthorDate: Fri Dec 20 01:00:55 2024 +0800

    fix 7 (#45680)
---
 .../rowset/segment_v2/hierarchical_data_reader.cpp |  8 +++++
 be/src/olap/rowset/segment_v2/segment.cpp          |  6 +++-
 be/src/olap/rowset/segment_v2/segment.h            |  2 +-
 be/src/olap/rowset/segment_v2/segment_writer.cpp   |  1 +
 .../segment_v2/variant_column_writer_impl.cpp      | 39 ++++++++++++++--------
 .../rowset/segment_v2/vertical_segment_writer.cpp  |  2 +-
 be/src/vec/columns/column_object.cpp               |  7 +++-
 be/src/vec/columns/column_object.h                 |  4 +--
 be/src/vec/functions/function_cast.h               |  1 -
 be/src/vec/json/parse2column.cpp                   |  7 ++++
 10 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp 
b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
index a0e8b3fd0ee..de0123a330a 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
@@ -104,6 +104,10 @@ Status HierarchicalDataReader::seek_to_ordinal(ordinal_t 
ord) {
         DCHECK(_root_reader->inited);
         RETURN_IF_ERROR(_root_reader->iterator->seek_to_ordinal(ord));
     }
+    if (_sparse_column_reader) {
+        DCHECK(_sparse_column_reader->inited);
+        RETURN_IF_ERROR(_sparse_column_reader->iterator->seek_to_ordinal(ord));
+    }
     return Status::OK();
 }
 
@@ -424,6 +428,10 @@ void 
SparseColumnExtractReader::_fill_path_column(vectorized::MutableColumnPtr&
             *var.get_subcolumn({}) /*root*/, null_map, StringRef 
{_path.data(), _path.size()},
             _sparse_column->get_ptr(), 0, _sparse_column->size());
     var.incr_num_rows(_sparse_column->size());
+    
var.get_sparse_column()->assume_mutable()->insert_many_defaults(_sparse_column->size());
+#ifndef NDEBUG
+    var.check_consistency();
+#endif
     _sparse_column->clear();
 }
 
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp 
b/be/src/olap/rowset/segment_v2/segment.cpp
index 238898a74ec..9b505e4a4a5 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -812,7 +812,11 @@ Status Segment::new_column_iterator(const TabletColumn& 
tablet_column,
     return Status::OK();
 }
 
-ColumnReader* Segment::get_column_reader(int32_t col_unique_id) {
+Result<ColumnReader*> Segment::get_column_reader(int32_t col_unique_id) {
+    auto status = _create_column_readers_once();
+    if (!status) {
+        return ResultError(std::move(status));
+    }
     if (_column_readers.contains(col_unique_id)) {
         return _column_readers[col_unique_id].get();
     }
diff --git a/be/src/olap/rowset/segment_v2/segment.h 
b/be/src/olap/rowset/segment_v2/segment.h
index 5b88e60e37a..9fe545006e3 100644
--- a/be/src/olap/rowset/segment_v2/segment.h
+++ b/be/src/olap/rowset/segment_v2/segment.h
@@ -209,7 +209,7 @@ public:
 
     const TabletSchemaSPtr& tablet_schema() { return _tablet_schema; }
 
-    ColumnReader* get_column_reader(int32_t col_unique_id);
+    Result<ColumnReader*> get_column_reader(int32_t col_unique_id);
 
 private:
     DISALLOW_COPY_AND_ASSIGN(Segment);
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index b76acf68978..60fd7cea28a 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -1151,6 +1151,7 @@ Status SegmentWriter::_write_footer() {
 
     // Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), 
MagicNumber(4)
     std::string footer_buf;
+    VLOG_DEBUG << "footer " << _footer.DebugString();
     if (!_footer.SerializeToString(&footer_buf)) {
         return Status::InternalError("failed to serialize segment footer");
     }
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp 
b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 6588e7dbe4f..33499a8e7e2 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -30,6 +30,7 @@
 #include "vec/columns/column_object.h"
 #include "vec/columns/columns_number.h"
 #include "vec/common/schema_util.h"
+#include "vec/json/path_in_data.h"
 #include "vec/olap/olap_data_convertor.h"
 
 namespace doris::segment_v2 {
@@ -47,12 +48,12 @@ Status VariantColumnWriterImpl::init() {
     if (dynamic_paths.empty()) {
         _column = vectorized::ColumnObject::create(true, false);
     } else {
-        vectorized::ColumnObject::Subcolumns dynamic_subcolumns;
-        for (const auto& path : dynamic_paths) {
-            dynamic_subcolumns.add(vectorized::PathInData(path),
-                                   vectorized::ColumnObject::Subcolumn {0, 
true});
+        // create root
+        auto col = vectorized::ColumnObject::create(true, true);
+        for (const auto& str_path : dynamic_paths) {
+            DCHECK(col->add_sub_column(vectorized::PathInData(str_path), 0));
         }
-        _column = 
vectorized::ColumnObject::create(std::move(dynamic_subcolumns), true);
+        _column = std::move(col);
     }
     if (_tablet_column->is_nullable()) {
         _null_column = vectorized::ColumnUInt8::create(0);
@@ -69,7 +70,8 @@ Status 
VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
         RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
                 std::static_pointer_cast<BetaRowset>(reader->rowset()), 
&segment_cache));
         for (const auto& segment : segment_cache.get_segments()) {
-            ColumnReader* column_reader = 
segment->get_column_reader(_tablet_column->unique_id());
+            ColumnReader* column_reader =
+                    
DORIS_TRY(segment->get_column_reader(_tablet_column->unique_id()));
             if (!column_reader) {
                 continue;
             }
@@ -104,10 +106,10 @@ Status 
VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
             paths_with_sizes.emplace_back(size, path);
         }
         std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), 
std::greater());
-
         // Fill dynamic_paths with first max_dynamic_paths paths in sorted 
list.
+        // reserve 1 for root column
         for (const auto& [size, path] : paths_with_sizes) {
-            if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS) {
+            if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS - 1) {
                 paths.emplace(path);
             }
             // // todo : Add all remaining paths into shared data statistics 
until we reach its max size;
@@ -141,6 +143,7 @@ Status 
VariantColumnWriterImpl::_process_root_column(vectorized::ColumnObject* p
     ptr->ensure_root_node_type(expected_root_type);
 
     converter->add_column_data_convertor(*_tablet_column);
+    DCHECK_EQ(ptr->get_root()->get_ptr()->size(), num_rows);
     RETURN_IF_ERROR(converter->set_source_content_with_specifid_column(
             {ptr->get_root()->get_ptr(), nullptr, ""}, 0, num_rows, 
column_id));
     auto [status, column] = converter->convert_column_data(column_id);
@@ -228,12 +231,17 @@ Status VariantColumnWriterImpl::_process_sparse_column(
 
     // convert root column data from engine format to storage layer format
     converter->add_column_data_convertor(sparse_column);
+    DCHECK_EQ(ptr->get_sparse_column()->size(), num_rows);
     RETURN_IF_ERROR(converter->set_source_content_with_specifid_column(
-            {ptr->get_sparse_column()->get_ptr(), nullptr, ""}, 0, num_rows, 
column_id));
+            {ptr->get_sparse_column(), nullptr, ""}, 0, num_rows, column_id));
     auto [status, column] = converter->convert_column_data(column_id);
     if (!status.ok()) {
         return status;
     }
+    VLOG_DEBUG << "dump sparse "
+               << vectorized::schema_util::dump_column(
+                          vectorized::ColumnObject::get_sparse_column_type(),
+                          ptr->get_sparse_column());
     RETURN_IF_ERROR(
             _sparse_column_writer->append(column->get_nullmap(), 
column->get_data(), num_rows));
     ++column_id;
@@ -253,7 +261,6 @@ Status VariantColumnWriterImpl::_process_sparse_column(
             _statistics.sparse_column_non_null_size.emplace(path, 1);
         }
     }
-
     sparse_writer_opts.meta->set_num_rows(num_rows);
     return Status::OK();
 }
@@ -294,6 +301,10 @@ Status VariantColumnWriterImpl::finalize() {
         ptr->create_root(root_type, std::move(root_col));
     }
 
+#ifndef NDEBUG
+    ptr->check_consistency();
+#endif
+
     size_t num_rows = _column->size();
     int column_id = 0;
 
@@ -333,10 +344,10 @@ uint64_t VariantColumnWriterImpl::estimate_buffer_size() {
         return _column->byte_size();
     }
     uint64_t size = 0;
+    size += _root_writer->estimate_buffer_size();
     for (auto& column_writer : _subcolumn_writers) {
         size += column_writer->estimate_buffer_size();
     }
-    size += _root_writer->estimate_buffer_size();
     size += _sparse_column_writer->estimate_buffer_size();
     return size;
 }
@@ -346,10 +357,10 @@ Status VariantColumnWriterImpl::finish() {
         RETURN_IF_ERROR(finalize());
     }
     RETURN_IF_ERROR(_root_writer->finish());
-    RETURN_IF_ERROR(_sparse_column_writer->finish());
     for (auto& column_writer : _subcolumn_writers) {
         RETURN_IF_ERROR(column_writer->finish());
     }
+    RETURN_IF_ERROR(_sparse_column_writer->finish());
     return Status::OK();
 }
 Status VariantColumnWriterImpl::write_data() {
@@ -357,10 +368,10 @@ Status VariantColumnWriterImpl::write_data() {
         RETURN_IF_ERROR(finalize());
     }
     RETURN_IF_ERROR(_root_writer->write_data());
-    RETURN_IF_ERROR(_sparse_column_writer->write_data());
     for (auto& column_writer : _subcolumn_writers) {
         RETURN_IF_ERROR(column_writer->write_data());
     }
+    RETURN_IF_ERROR(_sparse_column_writer->write_data());
     return Status::OK();
 }
 Status VariantColumnWriterImpl::write_ordinal_index() {
@@ -368,10 +379,10 @@ Status VariantColumnWriterImpl::write_ordinal_index() {
         RETURN_IF_ERROR(finalize());
     }
     RETURN_IF_ERROR(_root_writer->write_ordinal_index());
-    RETURN_IF_ERROR(_sparse_column_writer->write_ordinal_index());
     for (auto& column_writer : _subcolumn_writers) {
         RETURN_IF_ERROR(column_writer->write_ordinal_index());
     }
+    RETURN_IF_ERROR(_sparse_column_writer->write_ordinal_index());
     return Status::OK();
 }
 
diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
index 714ed6b7d6e..1b6da1dbf4c 100644
--- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
@@ -1506,7 +1506,7 @@ Status VerticalSegmentWriter::_write_footer() {
     _footer.set_num_rows(_row_count);
 
     // Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), 
MagicNumber(4)
-    LOG(INFO) << "footer " << _footer.DebugString();
+    VLOG_DEBUG << "footer " << _footer.DebugString();
     std::string footer_buf;
     if (!_footer.SerializeToString(&footer_buf)) {
         return Status::InternalError("failed to serialize segment footer");
diff --git a/be/src/vec/columns/column_object.cpp 
b/be/src/vec/columns/column_object.cpp
index 31b77d549fb..91a0936673f 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -701,6 +701,7 @@ void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
     }
     data = {std::move(result_column)};
     data_types = {std::move(to_type)};
+    data_serdes = {data_types[0]->get_serde()};
     num_of_defaults_in_prefix = 0;
 }
 
@@ -1253,10 +1254,11 @@ bool ColumnObject::try_add_new_subcolumn(const 
PathInData& path) {
 }
 
 void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t 
length) {
+    const auto& src_object = assert_cast<const ColumnObject&>(src);
 #ifndef NDEBUG
     check_consistency();
+    src_object.check_consistency();
 #endif
-    const auto& src_object = assert_cast<const ColumnObject&>(src);
 
     // First, insert src subcolumns
     // We can reach the limit of subcolumns, and in this case
@@ -2224,6 +2226,9 @@ void ColumnObject::create_root(const DataTypePtr& type, 
MutableColumnPtr&& colum
         num_rows = column->size();
     }
     add_sub_column({}, std::move(column), type);
+    if (serialized_sparse_column->empty()) {
+        serialized_sparse_column->insert_many_defaults(num_rows);
+    }
 }
 
 const DataTypePtr& ColumnObject::get_most_common_type() {
diff --git a/be/src/vec/columns/column_object.h 
b/be/src/vec/columns/column_object.h
index 9d9d40f9075..c7859ab4b93 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -363,9 +363,7 @@ public:
 
     Subcolumns& get_subcolumns() { return subcolumns; }
 
-    ColumnPtr get_sparse_column() const {
-        return serialized_sparse_column->convert_to_full_column_if_const();
-    }
+    ColumnPtr get_sparse_column() const { return serialized_sparse_column; }
 
     // use sparse_subcolumns_schema to record sparse column's path info and 
type
     static MutableColumnPtr create_sparse_column_fn() {
diff --git a/be/src/vec/functions/function_cast.h 
b/be/src/vec/functions/function_cast.h
index 0e7a8c495d3..5de820dfa3a 100644
--- a/be/src/vec/functions/function_cast.h
+++ b/be/src/vec/functions/function_cast.h
@@ -1933,7 +1933,6 @@ private:
             // set variant root column/type to from column/type
             auto variant = ColumnObject::create(true /*always nullable*/);
             variant->create_root(from_type, col_from->assume_mutable());
-            
variant->get_sparse_column()->assume_mutable()->insert_many_defaults(input_rows_count);
             block.replace_by_position(result, std::move(variant));
             return Status::OK();
         }
diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp
index ba18083a95c..0e8472928a5 100644
--- a/be/src/vec/json/parse2column.cpp
+++ b/be/src/vec/json/parse2column.cpp
@@ -191,6 +191,13 @@ void parse_json_to_variant(IColumn& column, const char* 
src, size_t length,
         }
     }
     column_object.incr_num_rows();
+    auto sparse_column = column_object.get_sparse_column();
+    if (sparse_column->size() == old_num_rows) {
+        sparse_column->assume_mutable()->insert_default();
+    }
+#ifndef NDEBUG
+    column_object.check_consistency();
+#endif
 }
 
 // exposed interfaces


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to