This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7e8f1b21ea8 [fix](variant) Preserve json object when casting jsonb to
variant (#63792)
7e8f1b21ea8 is described below
commit 7e8f1b21ea8a43f5179f3cdd791aef2fadc69a7d
Author: lihangyu <[email protected]>
AuthorDate: Wed Jun 17 10:56:01 2026 +0800
[fix](variant) Preserve json object when casting jsonb to variant (#63792)
Problem Summary: Inserting JSONB-producing expressions, such as
`json_object` or a `JSON` column, into a `VARIANT` column could
materialize the root JSONB through the generic JSONB-to-string cast
path. That path unescaped string contents before variant parsing, so
JSON objects containing quoted strings could be parsed incorrectly and
subpath extraction could fail.
### What is changed?
Convert scalar JSONB variant roots directly back to JSON text with
`JsonbToJson::jsonb_to_json_string` before parsing into `ColumnVariant`,
instead of using the generic JSONB-to-string cast path. This preserves
embedded quotes and backslashes in object string values and lets variant
path extraction work correctly.
The regression test covers:
- `INSERT INTO variant SELECT json_object(...)` with `Willie "The Lion"
Smith`, verifying stored object text and path extraction.
- `INSERT INTO variant SELECT json_column` with escaped quotes, a
Windows-style path, nested object, and array, verifying `ok`, `msg`,
`path`, `nested.x`, and array size.
The test intentionally avoids asserting `variant_type(ch)` because the
physical variant layout can be reported either as a materialized
subcolumn shape or as `__DORIS_VARIANT_DOC_VALUE__` map while preserving
the same user-visible object and path extraction behavior.
---
be/src/exec/common/variant_util.cpp | 44 +++++++---
be/test/exec/common/schema_util_test.cpp | 95 ++++++++++++++++++++-
.../test_insert_json_object_to_variant.out | 7 ++
.../test_insert_json_object_to_variant.groovy | 98 ++++++++++++++++++++++
4 files changed, 231 insertions(+), 13 deletions(-)
diff --git a/be/src/exec/common/variant_util.cpp
b/be/src/exec/common/variant_util.cpp
index 5314b38e480..15fd0b48527 100644
--- a/be/src/exec/common/variant_util.cpp
+++ b/be/src/exec/common/variant_util.cpp
@@ -100,6 +100,7 @@
#include "util/json/json_parser.h"
#include "util/json/path_in_data.h"
#include "util/json/simd_json_parser.h"
+#include "util/jsonb_utils.h"
namespace doris::variant_util {
@@ -398,6 +399,37 @@ Status cast_column(const ColumnWithTypeAndName& arg, const
DataTypePtr& type, Co
return Status::OK();
}
+ColumnPtr jsonb_root_to_json_string_column(const IColumn& root) {
+ auto root_column = root.convert_to_full_column_if_const();
+ const IColumn* jsonb_column = root_column.get();
+ const NullMap* null_map = nullptr;
+ if (root_column->is_nullable()) {
+ const auto& nullable = assert_cast<const
ColumnNullable&>(*root_column);
+ jsonb_column = &nullable.get_nested_column();
+ null_map = &nullable.get_null_map_data();
+ }
+
+ const auto& column = assert_cast<const ColumnString&>(*jsonb_column);
+ auto result = ColumnString::create();
+ result->reserve(column.size());
+ for (size_t i = 0; i < column.size(); ++i) {
+ if (null_map != nullptr && (*null_map)[i]) {
+ result->insert_default();
+ continue;
+ }
+
+ const auto jsonb = column.get_data_at(i);
+ if (jsonb.size == 0) {
+ result->insert_default();
+ continue;
+ }
+
+ const auto json = JsonbToJson::jsonb_to_json_string(jsonb.data,
jsonb.size);
+ result->insert_data(json.data(), json.size());
+ }
+ return result->get_ptr();
+}
+
void get_column_by_type(const DataTypePtr& data_type, const std::string& name,
TabletColumn& column,
const ExtraInfo& ext_info) {
column.set_name(name);
@@ -2174,17 +2206,7 @@ Status _parse_and_materialize_variant_columns(Block&
block,
VLOG_DEBUG << "parse scalar variant column: " <<
var.get_root_type()->get_name();
ColumnPtr scalar_root_column;
if (var.get_root_type()->get_primitive_type() == TYPE_JSONB) {
- // TODO more efficient way to parse jsonb type, currently we just
convert jsonb to
- // json str and parse them into variant
- RETURN_IF_ERROR(cast_column({var.get_root(), var.get_root_type(),
""},
- var.get_root()->is_nullable()
- ?
make_nullable(std::make_shared<DataTypeString>())
- :
std::make_shared<DataTypeString>(),
- &scalar_root_column));
- if (is_column_nullable(*scalar_root_column)) {
- scalar_root_column = assert_cast<const
ColumnNullable*>(scalar_root_column.get())
- ->get_nested_column_ptr();
- }
+ scalar_root_column =
jsonb_root_to_json_string_column(*var.get_root());
} else {
const auto& root = *var.get_root();
scalar_root_column =
diff --git a/be/test/exec/common/schema_util_test.cpp
b/be/test/exec/common/schema_util_test.cpp
index 05b9f14ab8c..c5d83e7ce64 100644
--- a/be/test/exec/common/schema_util_test.cpp
+++ b/be/test/exec/common/schema_util_test.cpp
@@ -18,6 +18,10 @@
#include <gmock/gmock-more-matchers.h>
#include <gtest/gtest.h>
+#include <initializer_list>
+#include <string>
+#include <string_view>
+
#include "core/column/column_nothing.h"
#include "core/column/column_variant.h"
#include "core/data_type/data_type_array.h"
@@ -25,9 +29,12 @@
#include "core/data_type/data_type_date_time.h"
#include "core/data_type/data_type_decimal.h"
#include "core/data_type/data_type_ipv4.h"
+#include "core/data_type/data_type_jsonb.h"
#include "core/data_type/data_type_nothing.h"
+#include "core/data_type/data_type_string.h"
#include "core/data_type/data_type_time.h"
#include "core/data_type/data_type_variant.h"
+#include "core/data_type_serde/data_type_jsonb_serde.h"
#include "exec/common/variant_util.h"
#include "storage/rowset/beta_rowset.h"
#include "storage/rowset/rowset_fwd.h"
@@ -46,6 +53,52 @@ public:
~SchemaUtilTest() override = default;
};
+ColumnString::MutablePtr
make_jsonb_column(std::initializer_list<std::string_view> jsons) {
+ auto jsonb_column = ColumnString::create();
+ DataTypeJsonbSerDe jsonb_serde;
+ DataTypeSerDe::FormatOptions options;
+ options.converted_from_string = true;
+ options.escape_char = '\\';
+
+ for (auto json : jsons) {
+ std::string json_text(json);
+ Slice slice(json_text.data(), json_text.size());
+ auto status =
jsonb_serde.deserialize_one_cell_from_json(*jsonb_column, slice, options);
+ EXPECT_TRUE(status.ok()) << status.to_string();
+ }
+ return jsonb_column;
+}
+
+void expect_variant_string_subcolumn(const ColumnVariant& variant,
std::string_view path,
+ std::string_view expected) {
+ const auto* subcolumn =
variant.get_subcolumn(PathInData(std::string(path)));
+ ASSERT_NE(subcolumn, nullptr);
+
+ FieldWithDataType field;
+ subcolumn->get(0, field);
+ ASSERT_EQ(field.field.get_type(), PrimitiveType::TYPE_STRING);
+ EXPECT_EQ(field.field.get<PrimitiveType::TYPE_STRING>(), expected);
+}
+
+void expect_variant_int_field(const Field& field, int64_t expected) {
+ switch (field.get_type()) {
+ case PrimitiveType::TYPE_TINYINT:
+ EXPECT_EQ(field.get<PrimitiveType::TYPE_TINYINT>(), expected);
+ break;
+ case PrimitiveType::TYPE_SMALLINT:
+ EXPECT_EQ(field.get<PrimitiveType::TYPE_SMALLINT>(), expected);
+ break;
+ case PrimitiveType::TYPE_INT:
+ EXPECT_EQ(field.get<PrimitiveType::TYPE_INT>(), expected);
+ break;
+ case PrimitiveType::TYPE_BIGINT:
+ EXPECT_EQ(field.get<PrimitiveType::TYPE_BIGINT>(), expected);
+ break;
+ default:
+ FAIL() << "unexpected field type: " << field.get_type_name();
+ }
+}
+
void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index,
int64_t index_id,
const std::string& index_name, int32_t col_unique_id,
const std::string& column_type, const std::string&
column_name,
@@ -1294,8 +1347,7 @@ TEST_F(SchemaUtilTest, TestParseVariantColumnsEdgeCases) {
// Test parsing from JSONB to variant
auto jsonb_type = std::make_shared<DataTypeJsonb>();
- auto jsonb_column = ColumnString::create();
-
jsonb_column->insert(Field::create_field<PrimitiveType::TYPE_STRING>("{'x':
1}"));
+ auto jsonb_column = make_jsonb_column({R"({"x":1})"});
auto variant_column2 = ColumnVariant::create(10, false);
variant_column2->create_root(jsonb_type, jsonb_column->get_ptr());
@@ -1317,6 +1369,45 @@ TEST_F(SchemaUtilTest, TestParseVariantColumnsEdgeCases)
{
EXPECT_TRUE(status.ok());
}
+TEST_F(SchemaUtilTest, TestParseJsonbRootVariantMaterializesDocument) {
+ auto variant_type = std::make_shared<DataTypeVariant>(10, false);
+ auto jsonb_type = std::make_shared<DataTypeJsonb>();
+ auto jsonb_column = make_jsonb_column(
+ {R"({"ok":"abc","msg":"he said
\"hi\"","path":"C:\\tmp","nested":{"x":1},"arr":[1,2]})"});
+
+ auto variant_column = ColumnVariant::create(10, false);
+ variant_column->create_root(jsonb_type, jsonb_column->get_ptr());
+
+ Block block;
+ block.insert({variant_column->get_ptr(), variant_type, "variant_col"});
+
+ ParseConfig config;
+ auto status = variant_util::parse_and_materialize_variant_columns(block,
{0}, {config});
+ ASSERT_TRUE(status.ok()) << status.to_string();
+
+ const auto& result = assert_cast<const
ColumnVariant&>(*block.get_by_position(0).column);
+ EXPECT_FALSE(result.is_scalar_variant());
+ expect_variant_string_subcolumn(result, "ok", "abc");
+ expect_variant_string_subcolumn(result, "msg", R"(he said "hi")");
+ expect_variant_string_subcolumn(result, "path", R"(C:\tmp)");
+
+ FieldWithDataType nested_x;
+ const auto* nested_x_subcolumn =
result.get_subcolumn(PathInData("nested.x"));
+ ASSERT_NE(nested_x_subcolumn, nullptr);
+ nested_x_subcolumn->get(0, nested_x);
+ expect_variant_int_field(nested_x.field, 1);
+
+ FieldWithDataType arr;
+ const auto* arr_subcolumn = result.get_subcolumn(PathInData("arr"));
+ ASSERT_NE(arr_subcolumn, nullptr);
+ arr_subcolumn->get(0, arr);
+ ASSERT_EQ(arr.field.get_type(), PrimitiveType::TYPE_ARRAY);
+ const auto& arr_value = arr.field.get<PrimitiveType::TYPE_ARRAY>();
+ ASSERT_EQ(arr_value.size(), 2);
+ expect_variant_int_field(arr_value[0], 1);
+ expect_variant_int_field(arr_value[1], 2);
+}
+
TEST_F(SchemaUtilTest, TestParseVariantColumnsWithNulls) {
Block block;
diff --git
a/regression-test/data/variant_p0/test_insert_json_object_to_variant.out
b/regression-test/data/variant_p0/test_insert_json_object_to_variant.out
new file mode 100644
index 00000000000..c8e9ad3135b
--- /dev/null
+++ b/regression-test/data/variant_p0/test_insert_json_object_to_variant.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !insert_json_object_to_variant --
+17075 {"ch":"Willie \\"The Lion\\" Smith"} Willie "The Lion" Smith
+
+-- !insert_json_to_variant --
+1 abc he said "hi" 433A5C746D70 1 2
+
diff --git
a/regression-test/suites/variant_p0/test_insert_json_object_to_variant.groovy
b/regression-test/suites/variant_p0/test_insert_json_object_to_variant.groovy
new file mode 100644
index 00000000000..8753809e737
--- /dev/null
+++
b/regression-test/suites/variant_p0/test_insert_json_object_to_variant.groovy
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_insert_json_object_to_variant", "variant_type") {
+ sql """DROP TABLE IF EXISTS test_insert_json_object_to_variant_dst"""
+ sql """DROP TABLE IF EXISTS test_insert_json_object_to_variant_src"""
+ sql """DROP TABLE IF EXISTS test_insert_json_to_variant_dst"""
+ sql """DROP TABLE IF EXISTS test_insert_json_to_variant_src"""
+
+ sql """
+ CREATE TABLE test_insert_json_object_to_variant_src (
+ a bigint NOT NULL AUTO_INCREMENT(1),
+ ch text NULL
+ )
+ DUPLICATE KEY(a)
+ DISTRIBUTED BY HASH(a) BUCKETS 1
+ PROPERTIES ("replication_num" = "1", "disable_auto_compaction" =
"true");
+ """
+
+ sql """
+ CREATE TABLE test_insert_json_object_to_variant_dst (
+ a bigint NOT NULL AUTO_INCREMENT(1),
+ ch variant NULL
+ )
+ DUPLICATE KEY(a)
+ DISTRIBUTED BY HASH(a) BUCKETS 1
+ PROPERTIES ("replication_num" = "1", "disable_auto_compaction" =
"true");
+ """
+
+ sql """INSERT INTO test_insert_json_object_to_variant_src VALUES (17075,
'Willie "The Lion" Smith')"""
+ sql """
+ INSERT INTO test_insert_json_object_to_variant_dst
+ SELECT a, json_object("ch", ch)
+ FROM test_insert_json_object_to_variant_src
+ WHERE a = 17075
+ """
+
+ order_qt_insert_json_object_to_variant """
+ SELECT a, ch, CAST(ch['ch'] AS string)
+ FROM test_insert_json_object_to_variant_dst
+ ORDER BY a
+ """
+
+ sql """
+ CREATE TABLE test_insert_json_to_variant_src (
+ id int,
+ j JSON
+ )
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1", "disable_auto_compaction" =
"true");
+ """
+
+ sql """
+ CREATE TABLE test_insert_json_to_variant_dst (
+ id int,
+ v VARIANT
+ )
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1", "disable_auto_compaction" =
"true");
+ """
+
+ sql $/INSERT INTO test_insert_json_to_variant_src VALUES
+ (1, '{"ok":"abc","msg":"he said
\\\"hi\\\"","path":"C:\\\\tmp","nested":{"x":1},"arr":[1,2]}')/$
+
+ sql """
+ INSERT INTO test_insert_json_to_variant_dst
+ SELECT id, j
+ FROM test_insert_json_to_variant_src
+ WHERE id = 1
+ """
+
+ order_qt_insert_json_to_variant """
+ SELECT id,
+ CAST(v['ok'] AS string),
+ CAST(v['msg'] AS string),
+ HEX(CAST(v['path'] AS string)),
+ CAST(v['nested']['x'] AS int),
+ size(CAST(v['arr'] AS array<int>))
+ FROM test_insert_json_to_variant_dst
+ ORDER BY id
+ """
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]