This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 6b5102d0b14 branch-3.1: [fix](variant) fix read from variant sparse
column #55036 (#55235)
6b5102d0b14 is described below
commit 6b5102d0b14e230cf39d325adbe8d8dd385b5877
Author: Sun Chenyang <[email protected]>
AuthorDate: Tue Aug 26 09:05:02 2025 +0800
branch-3.1: [fix](variant) fix read from variant sparse column #55036
(#55235)
pick from master #55036
---
.../variant/hierarchical_data_iterator.cpp | 51 +++++++-
be/src/vec/columns/column_object.h | 21 ++++
.../segment_v2/hierarchical_data_iterator_test.cpp | 138 +++++++++++++++++++++
.../data/variant_p0/variant_hirachinal.out | Bin 623 -> 697 bytes
.../suites/variant_p0/variant_hirachinal.groovy | 36 ++++++
5 files changed, 244 insertions(+), 2 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/variant/hierarchical_data_iterator.cpp
b/be/src/olap/rowset/segment_v2/variant/hierarchical_data_iterator.cpp
index 699efdb080e..77e23d264fb 100644
--- a/be/src/olap/rowset/segment_v2/variant/hierarchical_data_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/hierarchical_data_iterator.cpp
@@ -348,6 +348,13 @@ Status
HierarchicalDataIterator::_process_sparse_column(vectorized::ColumnObject
container_variant.get_sparse_data_paths_and_values();
StringRef prefix_ref(_path.get_path());
std::string_view path_prefix(prefix_ref.data, prefix_ref.size);
+
+ // Collect subcolumns materialized from sparse data. We try to
densify frequent
+ // subpaths into real subcolumns under the capacity constraint of
the container.
+ std::unordered_map<std::string_view, ColumnObject::Subcolumn>
+ subcolumns_from_sparse_column;
+ // How many more subcolumns we are allowed to add into the
container.
+ size_t count = container_variant.can_add_subcolumns_count();
for (size_t i = 0; i != src_sparse_data_offsets.size(); ++i) {
size_t start = src_sparse_data_offsets[ssize_t(i) - 1];
size_t end = src_sparse_data_offsets[ssize_t(i)];
@@ -363,8 +370,29 @@ Status
HierarchicalDataIterator::_process_sparse_column(vectorized::ColumnObject
// Don't include path that is equal to the prefix.
if (path.size() != path_prefix.size()) {
auto sub_path = get_sub_path(path, path_prefix);
- sparse_data_paths->insert_data(sub_path.data(),
sub_path.size());
-
sparse_data_values->insert_from(src_sparse_data_values, lower_bound_index);
+ // Case 1: subcolumn already created, append this
row's value into it.
+ if (auto it =
subcolumns_from_sparse_column.find(sub_path);
+ it != subcolumns_from_sparse_column.end()) {
+ const auto& data =
ColumnObject::deserialize_from_sparse_column(
+ &src_sparse_data_values,
lower_bound_index);
+ it->second.insert(data.first, data.second);
+ }
+ // Case 2: subcolumn not created yet and we still have
quota → create it and insert.
+ else if (subcolumns_from_sparse_column.size() < count)
{
+ // Initialize subcolumn with current logical row
index i to align sizes.
+ ColumnObject::Subcolumn subcolumn(/*size*/ i,
/*is_nullable*/ true,
+ false);
+ const auto& data =
ColumnObject::deserialize_from_sparse_column(
+ &src_sparse_data_values,
lower_bound_index);
+ subcolumn.insert(data.first, data.second);
+ subcolumns_from_sparse_column.emplace(sub_path,
std::move(subcolumn));
+ }
+ // Case 3: quota exhausted → keep the key/value in
container's sparse column.
+ else {
+ sparse_data_paths->insert_data(sub_path.data(),
sub_path.size());
+
sparse_data_values->insert_from(src_sparse_data_values,
+ lower_bound_index);
+ }
} else {
// insert into root column, example: access v['b']
and b is in sparse column
// data example:
@@ -391,6 +419,25 @@ Status
HierarchicalDataIterator::_process_sparse_column(vectorized::ColumnObject
container_variant.get_subcolumn({})->insert_default();
}
sparse_data_offsets.push_back(sparse_data_paths->size());
+
+ // all subcolumns keep the same number of rows (i + 1 after
this iteration).
+ for (auto& entry : subcolumns_from_sparse_column) {
+ if (entry.second.size() == i) {
+ entry.second.insert_default();
+ }
+ }
+ }
+
+ // Finalize materialized subcolumns and attach them into the
container variant.
+ for (auto& entry : subcolumns_from_sparse_column) {
+ entry.second.finalize();
+ if (!container_variant.add_sub_column(
+ PathInData(entry.first),
+
IColumn::mutate(entry.second.get_finalized_column_ptr()),
+ entry.second.get_least_common_type())) {
+ return Status::InternalError(
+ "Failed to add subcolumn {}, which is from sparse
column", entry.first);
+ }
}
}
}
diff --git a/be/src/vec/columns/column_object.h
b/be/src/vec/columns/column_object.h
index 319e5569e95..235fdf996ff 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -301,10 +301,13 @@ private:
WrappedPtr serialized_sparse_column = ColumnMap::create(
ColumnString::create(), ColumnString::create(),
ColumnArray::ColumnOffsets::create());
+ // if `_max_subcolumns_count == 0`, all subcolumns are materialized.
int32_t _max_subcolumns_count = 0;
+ // subcolumns count materialized from typed paths
size_t typed_path_count = 0;
+ // subcolumns count materialized from nested paths
size_t nested_path_count = 0;
public:
@@ -602,6 +605,24 @@ public:
_max_subcolumns_count = max_subcolumns_count;
}
+ // Returns how many dynamic subcolumns are still allowed to be appended.
+ // The remaining quota is `max - current`.
+ size_t can_add_subcolumns_count() const {
+ // When `_max_subcolumns_count == 0`, appending dynamic subcolumns is
disabled.
+ // In this case, all subcolumns are materialized.
+ if (_max_subcolumns_count == 0) {
+ return 0;
+ }
+
+ // `current_subcolumns_count` excludes:
+ // 1) subcolumns materialized from typed paths (`typed_path_count`),
+ // 2) subcolumns materialized from nested paths
(`nested_path_count`),
+ // 3) the implicit root holder node in `subcolumns` (hence the `-
1`).
+ size_t current_subcolumns_count =
+ subcolumns.size() - typed_path_count - nested_path_count - 1;
+ return _max_subcolumns_count - current_subcolumns_count;
+ }
+
private:
// May throw exception
void try_insert(const Field& field);
diff --git a/be/test/olap/rowset/segment_v2/hierarchical_data_iterator_test.cpp
b/be/test/olap/rowset/segment_v2/hierarchical_data_iterator_test.cpp
new file mode 100644
index 00000000000..6094d3b5dd2
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/hierarchical_data_iterator_test.cpp
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant/hierarchical_data_iterator.h"
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_object.h"
+#include "vec/columns/column_string.h"
+#include "vec/data_types/data_type_nothing.h"
+#include "vec/json/path_in_data.h"
+
+using doris::Status;
+using doris::segment_v2::ColumnIterator;
+using doris::segment_v2::ColumnIteratorOptions;
+using doris::segment_v2::HierarchicalDataIterator;
+using doris::vectorized::ColumnMap;
+using doris::vectorized::ColumnString;
+using doris::vectorized::ColumnObject;
+using doris::vectorized::MutableColumnPtr;
+using doris::vectorized::PathInData;
+
+class DummySparseIterator final : public ColumnIterator {
+public:
+ Status init(const ColumnIteratorOptions&) override { return Status::OK(); }
+ Status seek_to_ordinal(ordinal_t) override { return Status::OK(); }
+ ordinal_t get_current_ordinal() const override { return 0; }
+ Status next_batch(size_t*, MutableColumnPtr&, bool*) override { return
Status::OK(); }
+ Status read_by_rowids(const doris::segment_v2::rowid_t*, const size_t,
+ MutableColumnPtr&) override {
+ return Status::OK();
+ }
+ Status seek_to_first() override { return Status::OK(); }
+};
+
+TEST(HierarchicalDataIteratorTest, ProcessSparseExtractSubpaths) {
+ std::unique_ptr<ColumnIterator> sparse_reader =
std::make_unique<DummySparseIterator>();
+ doris::segment_v2::ColumnIteratorUPtr iter;
+ auto sparse_iter = std::make_unique<SubstreamIterator>(
+ doris::vectorized::ColumnObject::create_sparse_column_fn(),
std::move(sparse_reader),
+ nullptr);
+ ASSERT_TRUE(HierarchicalDataIterator::create(
+ &iter, /*col_uid*/ 0, PathInData("a.b"), /*node*/
nullptr,
+ /*root*/ std::move(sparse_iter), nullptr, nullptr,
nullptr)
+ .ok());
+
+ ColumnIteratorOptions opts;
+ ASSERT_TRUE(iter->init(opts).ok());
+ ASSERT_TRUE(iter->seek_to_ordinal(0).ok());
+
+ auto* hiter = static_cast<HierarchicalDataIterator*>(iter.get());
+ auto& map = assert_cast<ColumnMap&>(*hiter->_sparse_column_reader->column);
+ auto& keys = assert_cast<ColumnString&>(map.get_keys());
+ auto& vals = assert_cast<ColumnString&>(map.get_values());
+ auto& offs = map.get_offsets();
+
+ doris::vectorized::DataTypePtr str_type =
std::make_shared<doris::vectorized::DataTypeString>();
+ auto str_col = str_type->create_column();
+ auto serde = str_type->get_serde();
+ str_col->insert_data("abcvalues", strlen("abcvalues"));
+ str_col->insert_data("abdvalues", strlen("abdvalues"));
+ str_col->insert_data("abcvalues", strlen("abcvalues"));
+ str_col->insert_data("abevalues", strlen("abevalues"));
+ str_col->insert_data("axvalues", strlen("axvalues"));
+ ColumnString::Chars& chars = vals.get_chars();
+ for (size_t i = 0; i < 5; ++i) {
+ serde->write_one_cell_to_binary(*str_col, chars, i);
+ vals.get_offsets().push_back(chars.size());
+ }
+
+ // row0: {"a.b.c": "abcvalues", "a.b.d": "abdvalues"}
+ keys.insert_data("a.b.c", strlen("a.b.c"));
+ keys.insert_data("a.b.d", strlen("a.b.d"));
+ offs.push_back(keys.size());
+
+ // row1: {"a.b.c": "abcvalues", "a.b.e": "abevalues", "a.x": "axvalues"}
+ keys.insert_data("a.b.c", strlen("a.b.c"));
+ keys.insert_data("a.b.e", strlen("a.b.e"));
+ keys.insert_data("a.x", strlen("a.x"));
+ offs.push_back(keys.size());
+
+ const size_t nrows = 2;
+ MutableColumnPtr dst = ColumnObject::create(/*max_subcolumns_count*/ 2,
nrows);
+
+ auto& variant = assert_cast<ColumnObject&>(*dst);
+ ASSERT_TRUE(hiter->_process_sparse_column(variant, nrows).ok());
+
+ // root column + 2 subcolumns
+ EXPECT_EQ(variant.get_subcolumns().size(), 3);
+
+ auto* abc_subcolumn = variant.get_subcolumn(PathInData("c"));
+ auto* abd_subcolumn = variant.get_subcolumn(PathInData("d"));
+
+ EXPECT_TRUE(abc_subcolumn);
+ EXPECT_TRUE(abd_subcolumn);
+
+ EXPECT_EQ(abc_subcolumn->get_non_null_value_size(), 2);
+ EXPECT_EQ(abd_subcolumn->get_non_null_value_size(), 1);
+
+ const auto& abc_subcolumn_data = assert_cast<const
doris::vectorized::ColumnNullable&>(
+ *abc_subcolumn->get_finalized_column_ptr());
+ const auto& abd_subcolumn_data = assert_cast<const
doris::vectorized::ColumnNullable&>(
+ *abd_subcolumn->get_finalized_column_ptr());
+
EXPECT_EQ(abc_subcolumn_data.get_nested_column_ptr()->get_data_at(0).to_string(),
"abcvalues");
+
EXPECT_EQ(abc_subcolumn_data.get_nested_column_ptr()->get_data_at(1).to_string(),
"abcvalues");
+
EXPECT_EQ(abd_subcolumn_data.get_nested_column_ptr()->get_data_at(0).to_string(),
"abdvalues");
+
+ const auto& read_map = assert_cast<const
ColumnMap&>(*variant.get_sparse_column());
+ const auto& read_keys = assert_cast<const
ColumnString&>(read_map.get_keys());
+ const auto& read_vals = assert_cast<const
ColumnString&>(read_map.get_values());
+ const auto& read_offs = read_map.get_offsets();
+
+ EXPECT_EQ(read_offs.size(), 2);
+
+ EXPECT_EQ(read_keys.get_data_at(0).to_string(), "e");
+ auto val = read_vals.get_data_at(0).to_string();
+ EXPECT_EQ(val.substr(val.size() - 9, 9), "abevalues");
+
+ EXPECT_EQ(read_offs[0], 0);
+ EXPECT_EQ(read_offs[1], 1);
+}
diff --git a/regression-test/data/variant_p0/variant_hirachinal.out
b/regression-test/data/variant_p0/variant_hirachinal.out
index a3b4f28e286..48ed2967326 100644
Binary files a/regression-test/data/variant_p0/variant_hirachinal.out and
b/regression-test/data/variant_p0/variant_hirachinal.out differ
diff --git a/regression-test/suites/variant_p0/variant_hirachinal.groovy
b/regression-test/suites/variant_p0/variant_hirachinal.groovy
index b3eaaccd2b6..3336ca4d23f 100644
--- a/regression-test/suites/variant_p0/variant_hirachinal.groovy
+++ b/regression-test/suites/variant_p0/variant_hirachinal.groovy
@@ -37,4 +37,40 @@ suite("regression_test_variant_hirachinal", "variant_type"){
sql """insert into ${table_name} values (-3, '{"c" : 12345}')"""
order_qt_sql1 "select cast(v['c'] as string) from var_rs where k = -3 or k
= -2 or k = -4 or (k = 1 and v['c'] = 1024) order by k"
order_qt_sql2 "select cast(v['c'] as string) from var_rs where k = -3 or k
= -2 or k = 1 order by k, cast(v['c'] as text) limit 3"
+
+
+ sql "DROP TABLE IF EXISTS ${table_name}"
+
+ sql """
+ CREATE TABLE IF NOT EXISTS ${table_name} (
+ k bigint,
+ v variant<properties("variant_max_subcolumns_count" = "2")>
+ )
+ DUPLICATE KEY(`k`)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ properties("replication_num" = "1", "disable_auto_compaction" =
"false");
+ """
+
+ sql """insert into ${table_name} values (1, '{"a": 1, "b": 2, "c" : {"d" :
2}}'), (2, '{"a": 3, "b": 4}');"""
+ sql """insert into ${table_name} values (3, '{"c": {"d": 6}}');"""
+
+ qt_sql """select v['c'] from ${table_name} order by k;"""
+
+ sql "DROP TABLE IF EXISTS ${table_name}"
+
+ sql """
+ CREATE TABLE IF NOT EXISTS ${table_name} (
+ k bigint,
+ v variant<'c.d' : decimal(10, 5),
properties("variant_max_subcolumns_count" = "2",
"variant_enable_typed_paths_to_sparse" = "true")>
+ )
+ DUPLICATE KEY(`k`)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ properties("replication_num" = "1", "disable_auto_compaction" =
"false");
+ """
+
+ sql """insert into ${table_name} values (1, '{"a": 1, "b": 2, "c" : {"d" :
2}}'), (2, '{"a": 3, "b": 4}');"""
+ sql """insert into ${table_name} values (3, '{"c": {"d": 6}}');"""
+
+ qt_sql """select v['c'] from ${table_name} order by k;"""
+
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]