This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 7e8f1b21ea8 [fix](variant) Preserve json object when casting jsonb to 
variant (#63792)
7e8f1b21ea8 is described below

commit 7e8f1b21ea8a43f5179f3cdd791aef2fadc69a7d
Author: lihangyu <[email protected]>
AuthorDate: Wed Jun 17 10:56:01 2026 +0800

    [fix](variant) Preserve json object when casting jsonb to variant (#63792)
    
    Problem Summary: Inserting the result of a JSONB-producing expression,
    such as `json_object(...)` or a `JSON` column reference, into a `VARIANT`
    column could materialize the root JSONB through the generic
    JSONB-to-string cast path. That path unescaped string contents before
    variant parsing, so JSON objects containing quoted strings could be
    parsed incorrectly, causing subpath extraction to fail.
    
    ### What is changed?
    
    Convert scalar JSONB variant roots directly back to JSON text with
    `JsonbToJson::jsonb_to_json_string` before parsing into `ColumnVariant`,
    instead of using the generic JSONB-to-string cast path. This preserves
    embedded quotes and backslashes in object string values and lets variant
    path extraction work correctly.
    
    The regression covers:
    
    - `INSERT INTO variant SELECT json_object(...)` with the value
    `Willie "The Lion" Smith`, verifying stored object text and path extraction.
    - `INSERT INTO variant SELECT json_column` with escaped quotes, a
    Windows-style path, nested object, and array, verifying `ok`, `msg`,
    `path`, `nested.x`, and array size.
    
    The test intentionally avoids asserting `variant_type(ch)` because the
    physical variant layout can be reported either as a materialized
    subcolumn shape or as a `__DORIS_VARIANT_DOC_VALUE__` map while preserving
    the same user-visible object and path extraction behavior.
---
 be/src/exec/common/variant_util.cpp                | 44 +++++++---
 be/test/exec/common/schema_util_test.cpp           | 95 ++++++++++++++++++++-
 .../test_insert_json_object_to_variant.out         |  7 ++
 .../test_insert_json_object_to_variant.groovy      | 98 ++++++++++++++++++++++
 4 files changed, 231 insertions(+), 13 deletions(-)

diff --git a/be/src/exec/common/variant_util.cpp 
b/be/src/exec/common/variant_util.cpp
index 5314b38e480..15fd0b48527 100644
--- a/be/src/exec/common/variant_util.cpp
+++ b/be/src/exec/common/variant_util.cpp
@@ -100,6 +100,7 @@
 #include "util/json/json_parser.h"
 #include "util/json/path_in_data.h"
 #include "util/json/simd_json_parser.h"
+#include "util/jsonb_utils.h"
 
 namespace doris::variant_util {
 
@@ -398,6 +399,37 @@ Status cast_column(const ColumnWithTypeAndName& arg, const 
DataTypePtr& type, Co
     return Status::OK();
 }
 
+ColumnPtr jsonb_root_to_json_string_column(const IColumn& root) {
+    auto root_column = root.convert_to_full_column_if_const();
+    const IColumn* jsonb_column = root_column.get();
+    const NullMap* null_map = nullptr;
+    if (root_column->is_nullable()) {
+        const auto& nullable = assert_cast<const 
ColumnNullable&>(*root_column);
+        jsonb_column = &nullable.get_nested_column();
+        null_map = &nullable.get_null_map_data();
+    }
+
+    const auto& column = assert_cast<const ColumnString&>(*jsonb_column);
+    auto result = ColumnString::create();
+    result->reserve(column.size());
+    for (size_t i = 0; i < column.size(); ++i) {
+        if (null_map != nullptr && (*null_map)[i]) {
+            result->insert_default();
+            continue;
+        }
+
+        const auto jsonb = column.get_data_at(i);
+        if (jsonb.size == 0) {
+            result->insert_default();
+            continue;
+        }
+
+        const auto json = JsonbToJson::jsonb_to_json_string(jsonb.data, 
jsonb.size);
+        result->insert_data(json.data(), json.size());
+    }
+    return result->get_ptr();
+}
+
 void get_column_by_type(const DataTypePtr& data_type, const std::string& name, 
TabletColumn& column,
                         const ExtraInfo& ext_info) {
     column.set_name(name);
@@ -2174,17 +2206,7 @@ Status _parse_and_materialize_variant_columns(Block& 
block,
         VLOG_DEBUG << "parse scalar variant column: " << 
var.get_root_type()->get_name();
         ColumnPtr scalar_root_column;
         if (var.get_root_type()->get_primitive_type() == TYPE_JSONB) {
-            // TODO more efficient way to parse jsonb type, currently we just 
convert jsonb to
-            // json str and parse them into variant
-            RETURN_IF_ERROR(cast_column({var.get_root(), var.get_root_type(), 
""},
-                                        var.get_root()->is_nullable()
-                                                ? 
make_nullable(std::make_shared<DataTypeString>())
-                                                : 
std::make_shared<DataTypeString>(),
-                                        &scalar_root_column));
-            if (is_column_nullable(*scalar_root_column)) {
-                scalar_root_column = assert_cast<const 
ColumnNullable*>(scalar_root_column.get())
-                                             ->get_nested_column_ptr();
-            }
+            scalar_root_column = 
jsonb_root_to_json_string_column(*var.get_root());
         } else {
             const auto& root = *var.get_root();
             scalar_root_column =
diff --git a/be/test/exec/common/schema_util_test.cpp 
b/be/test/exec/common/schema_util_test.cpp
index 05b9f14ab8c..c5d83e7ce64 100644
--- a/be/test/exec/common/schema_util_test.cpp
+++ b/be/test/exec/common/schema_util_test.cpp
@@ -18,6 +18,10 @@
 #include <gmock/gmock-more-matchers.h>
 #include <gtest/gtest.h>
 
+#include <initializer_list>
+#include <string>
+#include <string_view>
+
 #include "core/column/column_nothing.h"
 #include "core/column/column_variant.h"
 #include "core/data_type/data_type_array.h"
@@ -25,9 +29,12 @@
 #include "core/data_type/data_type_date_time.h"
 #include "core/data_type/data_type_decimal.h"
 #include "core/data_type/data_type_ipv4.h"
+#include "core/data_type/data_type_jsonb.h"
 #include "core/data_type/data_type_nothing.h"
+#include "core/data_type/data_type_string.h"
 #include "core/data_type/data_type_time.h"
 #include "core/data_type/data_type_variant.h"
+#include "core/data_type_serde/data_type_jsonb_serde.h"
 #include "exec/common/variant_util.h"
 #include "storage/rowset/beta_rowset.h"
 #include "storage/rowset/rowset_fwd.h"
@@ -46,6 +53,52 @@ public:
     ~SchemaUtilTest() override = default;
 };
 
+ColumnString::MutablePtr 
make_jsonb_column(std::initializer_list<std::string_view> jsons) {
+    auto jsonb_column = ColumnString::create();
+    DataTypeJsonbSerDe jsonb_serde;
+    DataTypeSerDe::FormatOptions options;
+    options.converted_from_string = true;
+    options.escape_char = '\\';
+
+    for (auto json : jsons) {
+        std::string json_text(json);
+        Slice slice(json_text.data(), json_text.size());
+        auto status = 
jsonb_serde.deserialize_one_cell_from_json(*jsonb_column, slice, options);
+        EXPECT_TRUE(status.ok()) << status.to_string();
+    }
+    return jsonb_column;
+}
+
+void expect_variant_string_subcolumn(const ColumnVariant& variant, 
std::string_view path,
+                                     std::string_view expected) {
+    const auto* subcolumn = 
variant.get_subcolumn(PathInData(std::string(path)));
+    ASSERT_NE(subcolumn, nullptr);
+
+    FieldWithDataType field;
+    subcolumn->get(0, field);
+    ASSERT_EQ(field.field.get_type(), PrimitiveType::TYPE_STRING);
+    EXPECT_EQ(field.field.get<PrimitiveType::TYPE_STRING>(), expected);
+}
+
+void expect_variant_int_field(const Field& field, int64_t expected) {
+    switch (field.get_type()) {
+    case PrimitiveType::TYPE_TINYINT:
+        EXPECT_EQ(field.get<PrimitiveType::TYPE_TINYINT>(), expected);
+        break;
+    case PrimitiveType::TYPE_SMALLINT:
+        EXPECT_EQ(field.get<PrimitiveType::TYPE_SMALLINT>(), expected);
+        break;
+    case PrimitiveType::TYPE_INT:
+        EXPECT_EQ(field.get<PrimitiveType::TYPE_INT>(), expected);
+        break;
+    case PrimitiveType::TYPE_BIGINT:
+        EXPECT_EQ(field.get<PrimitiveType::TYPE_BIGINT>(), expected);
+        break;
+    default:
+        FAIL() << "unexpected field type: " << field.get_type_name();
+    }
+}
+
 void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, 
int64_t index_id,
                       const std::string& index_name, int32_t col_unique_id,
                       const std::string& column_type, const std::string& 
column_name,
@@ -1294,8 +1347,7 @@ TEST_F(SchemaUtilTest, TestParseVariantColumnsEdgeCases) {
 
     // Test parsing from JSONB to variant
     auto jsonb_type = std::make_shared<DataTypeJsonb>();
-    auto jsonb_column = ColumnString::create();
-    
jsonb_column->insert(Field::create_field<PrimitiveType::TYPE_STRING>("{'x': 
1}"));
+    auto jsonb_column = make_jsonb_column({R"({"x":1})"});
 
     auto variant_column2 = ColumnVariant::create(10, false);
     variant_column2->create_root(jsonb_type, jsonb_column->get_ptr());
@@ -1317,6 +1369,45 @@ TEST_F(SchemaUtilTest, TestParseVariantColumnsEdgeCases) 
{
     EXPECT_TRUE(status.ok());
 }
 
+TEST_F(SchemaUtilTest, TestParseJsonbRootVariantMaterializesDocument) {
+    auto variant_type = std::make_shared<DataTypeVariant>(10, false);
+    auto jsonb_type = std::make_shared<DataTypeJsonb>();
+    auto jsonb_column = make_jsonb_column(
+            {R"({"ok":"abc","msg":"he said 
\"hi\"","path":"C:\\tmp","nested":{"x":1},"arr":[1,2]})"});
+
+    auto variant_column = ColumnVariant::create(10, false);
+    variant_column->create_root(jsonb_type, jsonb_column->get_ptr());
+
+    Block block;
+    block.insert({variant_column->get_ptr(), variant_type, "variant_col"});
+
+    ParseConfig config;
+    auto status = variant_util::parse_and_materialize_variant_columns(block, 
{0}, {config});
+    ASSERT_TRUE(status.ok()) << status.to_string();
+
+    const auto& result = assert_cast<const 
ColumnVariant&>(*block.get_by_position(0).column);
+    EXPECT_FALSE(result.is_scalar_variant());
+    expect_variant_string_subcolumn(result, "ok", "abc");
+    expect_variant_string_subcolumn(result, "msg", R"(he said "hi")");
+    expect_variant_string_subcolumn(result, "path", R"(C:\tmp)");
+
+    FieldWithDataType nested_x;
+    const auto* nested_x_subcolumn = 
result.get_subcolumn(PathInData("nested.x"));
+    ASSERT_NE(nested_x_subcolumn, nullptr);
+    nested_x_subcolumn->get(0, nested_x);
+    expect_variant_int_field(nested_x.field, 1);
+
+    FieldWithDataType arr;
+    const auto* arr_subcolumn = result.get_subcolumn(PathInData("arr"));
+    ASSERT_NE(arr_subcolumn, nullptr);
+    arr_subcolumn->get(0, arr);
+    ASSERT_EQ(arr.field.get_type(), PrimitiveType::TYPE_ARRAY);
+    const auto& arr_value = arr.field.get<PrimitiveType::TYPE_ARRAY>();
+    ASSERT_EQ(arr_value.size(), 2);
+    expect_variant_int_field(arr_value[0], 1);
+    expect_variant_int_field(arr_value[1], 2);
+}
+
 TEST_F(SchemaUtilTest, TestParseVariantColumnsWithNulls) {
     Block block;
 
diff --git 
a/regression-test/data/variant_p0/test_insert_json_object_to_variant.out 
b/regression-test/data/variant_p0/test_insert_json_object_to_variant.out
new file mode 100644
index 00000000000..c8e9ad3135b
--- /dev/null
+++ b/regression-test/data/variant_p0/test_insert_json_object_to_variant.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !insert_json_object_to_variant --
+17075  {"ch":"Willie \\"The Lion\\" Smith"}    Willie "The Lion" Smith
+
+-- !insert_json_to_variant --
+1      abc     he said "hi"    433A5C746D70    1       2
+
diff --git 
a/regression-test/suites/variant_p0/test_insert_json_object_to_variant.groovy 
b/regression-test/suites/variant_p0/test_insert_json_object_to_variant.groovy
new file mode 100644
index 00000000000..8753809e737
--- /dev/null
+++ 
b/regression-test/suites/variant_p0/test_insert_json_object_to_variant.groovy
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_insert_json_object_to_variant", "variant_type") {
+    sql """DROP TABLE IF EXISTS test_insert_json_object_to_variant_dst"""
+    sql """DROP TABLE IF EXISTS test_insert_json_object_to_variant_src"""
+    sql """DROP TABLE IF EXISTS test_insert_json_to_variant_dst"""
+    sql """DROP TABLE IF EXISTS test_insert_json_to_variant_src"""
+
+    sql """
+        CREATE TABLE test_insert_json_object_to_variant_src (
+            a bigint NOT NULL AUTO_INCREMENT(1),
+            ch text NULL
+        )
+        DUPLICATE KEY(a)
+        DISTRIBUTED BY HASH(a) BUCKETS 1
+        PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = 
"true");
+    """
+
+    sql """
+        CREATE TABLE test_insert_json_object_to_variant_dst (
+            a bigint NOT NULL AUTO_INCREMENT(1),
+            ch variant NULL
+        )
+        DUPLICATE KEY(a)
+        DISTRIBUTED BY HASH(a) BUCKETS 1
+        PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = 
"true");
+    """
+
+    sql """INSERT INTO test_insert_json_object_to_variant_src VALUES (17075, 
'Willie "The Lion" Smith')"""
+    sql """
+        INSERT INTO test_insert_json_object_to_variant_dst
+        SELECT a, json_object("ch", ch)
+        FROM test_insert_json_object_to_variant_src
+        WHERE a = 17075
+    """
+
+    order_qt_insert_json_object_to_variant """
+        SELECT a, ch, CAST(ch['ch'] AS string)
+        FROM test_insert_json_object_to_variant_dst
+        ORDER BY a
+    """
+
+    sql """
+        CREATE TABLE test_insert_json_to_variant_src (
+            id int,
+            j JSON
+        )
+        DUPLICATE KEY(id)
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = 
"true");
+    """
+
+    sql """
+        CREATE TABLE test_insert_json_to_variant_dst (
+            id int,
+            v VARIANT
+        )
+        DUPLICATE KEY(id)
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = 
"true");
+    """
+
+    sql $/INSERT INTO test_insert_json_to_variant_src VALUES
+        (1, '{"ok":"abc","msg":"he said 
\\\"hi\\\"","path":"C:\\\\tmp","nested":{"x":1},"arr":[1,2]}')/$
+
+    sql """
+        INSERT INTO test_insert_json_to_variant_dst
+        SELECT id, j
+        FROM test_insert_json_to_variant_src
+        WHERE id = 1
+    """
+
+    order_qt_insert_json_to_variant """
+        SELECT id,
+               CAST(v['ok'] AS string),
+               CAST(v['msg'] AS string),
+               HEX(CAST(v['path'] AS string)),
+               CAST(v['nested']['x'] AS int),
+               size(CAST(v['arr'] AS array<int>))
+        FROM test_insert_json_to_variant_dst
+        ORDER BY id
+    """
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to