This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b43120b514e [Fix](Serde) Support hive compatible output format (#49036)
b43120b514e is described below

commit b43120b514edf2284755a026f02488975d15719c
Author: Tiewei Fang <fangtie...@selectdb.com>
AuthorDate: Mon Apr 7 15:12:34 2025 +0800

    [Fix](Serde) Support hive compatible output format (#49036)
    
    Problem Summary:
    
    The output format of complex data types are different between Hive and
    Doris, such as array, map and struct.
    When user migrate from Hive to Doris, they expect the same format so
    that they don't need to modify their business code.
    
    This PR mainly changes:
    
    Add a new option to  session variable `serde_dialect`: If set to hive,
    the output format returned to MySQL client of some datatypes will be
    changed:
    
    Array
    Doris: ["abc", "def", "", null, 1]
    Hive: ["abc","def","",null,true]
    
    Map
    Doris: {"k1":null, "k2":"v3"}
    Hive: {"k1":null,"k2":"v3"}
    
    Struct
    Doris: {"s_id":100, "s_name":"abc , "", "s_address":null}
    Hive: {"s_id":100,"s_name":"abc ,"","s_address":null}
    
    Related #37039
---
 .../vec/data_types/serde/data_type_array_serde.cpp |   5 +-
 .../vec/data_types/serde/data_type_map_serde.cpp   |   7 +-
 .../data_types/serde/data_type_number_serde.cpp    |   9 +-
 be/src/vec/data_types/serde/data_type_serde.h      |  20 ++++
 .../data_types/serde/data_type_struct_serde.cpp    |   5 +-
 be/src/vec/sink/vmysql_result_writer.cpp           |  16 +++
 .../org/apache/doris/nereids/NereidsPlanner.java   |   1 +
 .../java/org/apache/doris/qe/SessionVariable.java  |  10 +-
 gensrc/thrift/PaloInternalService.thrift           |   3 +-
 .../datatype_p0/serde/test_serde_dialect_hive.out  | Bin 0 -> 2029 bytes
 .../serde/test_serde_dialect_hive.groovy           | 107 +++++++++++++++++++++
 11 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp 
b/be/src/vec/data_types/serde/data_type_array_serde.cpp
index d654e3ae22d..a98d9622720 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp
@@ -331,7 +331,8 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const 
IColumn& column,
     const auto end_arr_element = offsets[row_idx_of_col_arr];
     for (auto j = begin_arr_element; j < end_arr_element; ++j) {
         if (j != begin_arr_element) {
-            if (0 != result.push_string(", ", 2)) {
+            if (0 != result.push_string(options.mysql_collection_delim.c_str(),
+                                        
options.mysql_collection_delim.size())) {
                 return Status::InternalError("pack mysql buffer failed.");
             }
         }
@@ -340,6 +341,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const 
IColumn& column,
                 return Status::InternalError("pack mysql buffer failed.");
             }
         } else {
+            ++options.level;
             if (is_nested_string && options.wrapper_len > 0) {
                 if (0 != result.push_string(options.nested_string_wrapper, 
options.wrapper_len)) {
                     return Status::InternalError("pack mysql buffer failed.");
@@ -353,6 +355,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const 
IColumn& column,
                 RETURN_IF_ERROR(
                         nested_serde->write_column_to_mysql(data, result, j, 
false, options));
             }
+            --options.level;
         }
     }
     if (0 != result.push_string("]", 1)) {
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp 
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index 7a4921623f3..118a2841f5f 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -419,7 +419,8 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const 
IColumn& column,
     const auto& offsets = map_column.get_offsets();
     for (auto j = offsets[col_index - 1]; j < offsets[col_index]; ++j) {
         if (j != offsets[col_index - 1]) {
-            if (0 != result.push_string(", ", 2)) {
+            if (0 != result.push_string(options.mysql_collection_delim.c_str(),
+                                        
options.mysql_collection_delim.size())) {
                 return Status::InternalError("pack mysql buffer failed.");
             }
         }
@@ -428,6 +429,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const 
IColumn& column,
                 return Status::InternalError("pack mysql buffer failed.");
             }
         } else {
+            ++options.level;
             if (is_key_string && options.wrapper_len > 0) {
                 if (0 != result.push_string(options.nested_string_wrapper, 
options.wrapper_len)) {
                     return Status::InternalError("pack mysql buffer failed.");
@@ -441,6 +443,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const 
IColumn& column,
                 
RETURN_IF_ERROR(key_serde->write_column_to_mysql(nested_keys_column, result, j,
                                                                  false, 
options));
             }
+            --options.level;
         }
         if (0 != result.push_string(&options.map_key_delim, 1)) {
             return Status::InternalError("pack mysql buffer failed.");
@@ -450,6 +453,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const 
IColumn& column,
                 return Status::InternalError("pack mysql buffer failed.");
             }
         } else {
+            ++options.level;
             if (is_val_string && options.wrapper_len > 0) {
                 if (0 != result.push_string(options.nested_string_wrapper, 
options.wrapper_len)) {
                     return Status::InternalError("pack mysql buffer failed.");
@@ -463,6 +467,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const 
IColumn& column,
                 
RETURN_IF_ERROR(value_serde->write_column_to_mysql(nested_values_column, 
result, j,
                                                                    false, 
options));
             }
+            --options.level;
         }
     }
     if (0 != result.push_string("}", 1)) {
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp 
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index d58439c9dd8..2eb972cbf1a 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -277,8 +277,15 @@ Status 
DataTypeNumberSerDe<T>::_write_column_to_mysql(const IColumn& column,
     int buf_ret = 0;
     auto& data = assert_cast<const ColumnType&>(column).get_data();
     const auto col_index = index_check_const(row_idx, col_const);
-    if constexpr (std::is_same_v<T, Int8> || std::is_same_v<T, UInt8>) {
+    if constexpr (std::is_same_v<T, Int8>) {
         buf_ret = result.push_tinyint(data[col_index]);
+    } else if constexpr (std::is_same_v<T, UInt8>) {
+        if (options.level > 0 && !options.is_bool_value_num) {
+            std::string bool_value = data[col_index] ? "true" : "false";
+            result.push_string(bool_value.c_str(), bool_value.size());
+        } else {
+            buf_ret = result.push_tinyint(data[col_index]);
+        }
     } else if constexpr (std::is_same_v<T, Int16> || std::is_same_v<T, 
UInt16>) {
         buf_ret = result.push_smallint(data[col_index]);
     } else if constexpr (std::is_same_v<T, Int32> || std::is_same_v<T, 
UInt32>) {
diff --git a/be/src/vec/data_types/serde/data_type_serde.h 
b/be/src/vec/data_types/serde/data_type_serde.h
index f54c4604499..569b3cc7b2d 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -180,6 +180,26 @@ public:
         const char* nested_string_wrapper;
         int wrapper_len;
 
+        /**
+         * mysql_collection_delim is used to separate elements in collection, 
such as array, map, struct
+         * It is used to write to mysql.
+         */
+        std::string mysql_collection_delim = ", ";
+
+        /**
+         * is_bool_value_num is used to display bool value in collection, such 
as array, map, struct
+         * eg, if set to true, the array<true> will be:
+         *      [1]
+         *     if set to false, the array<true> will be:
+         *      [true]
+         */
+        bool is_bool_value_num = true;
+
+        /**
+         * Indicate the nested level of column. It is used to control some 
behavior of serde
+         */
+        mutable int level = 0;
+
         [[nodiscard]] char get_collection_delimiter(
                 int hive_text_complex_type_delimiter_level) const {
             CHECK(0 <= hive_text_complex_type_delimiter_level &&
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp 
b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index ee59138a372..2dcaa5ab000 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -351,7 +351,8 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const 
IColumn& column,
     bool begin = true;
     for (size_t j = 0; j < elem_serdes_ptrs.size(); ++j) {
         if (!begin) {
-            if (0 != result.push_string(", ", 2)) {
+            if (0 != result.push_string(options.mysql_collection_delim.c_str(),
+                                        
options.mysql_collection_delim.size())) {
                 return Status::InternalError("pack mysql buffer failed.");
             }
         }
@@ -375,6 +376,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const 
IColumn& column,
                 return Status::InternalError("pack mysql buffer failed.");
             }
         } else {
+            ++options.level;
             if (remove_nullable(col.get_column_ptr(j))->is_column_string() &&
                 options.wrapper_len > 0) {
                 if (0 != result.push_string(options.nested_string_wrapper, 
options.wrapper_len)) {
@@ -389,6 +391,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const 
IColumn& column,
                 RETURN_IF_ERROR(elem_serdes_ptrs[j]->write_column_to_mysql(
                         col.get_column(j), result, col_index, false, options));
             }
+            --options.level;
         }
         begin = false;
     }
diff --git a/be/src/vec/sink/vmysql_result_writer.cpp 
b/be/src/vec/sink/vmysql_result_writer.cpp
index 1fc2aed3477..3f331eb9012 100644
--- a/be/src/vec/sink/vmysql_result_writer.cpp
+++ b/be/src/vec/sink/vmysql_result_writer.cpp
@@ -141,6 +141,8 @@ Status VMysqlResultWriter<is_binary_format>::_set_options(
         _options.map_key_delim = ':';
         _options.null_format = "null";
         _options.null_len = 4;
+        _options.mysql_collection_delim = ", ";
+        _options.is_bool_value_num = true;
         break;
     case TSerdeDialect::PRESTO:
         // eg:
@@ -151,6 +153,20 @@ Status VMysqlResultWriter<is_binary_format>::_set_options(
         _options.map_key_delim = '=';
         _options.null_format = "NULL";
         _options.null_len = 4;
+        _options.mysql_collection_delim = ", ";
+        _options.is_bool_value_num = true;
+        break;
+    case TSerdeDialect::HIVE:
+        // eg:
+        //  array: ["abc","def","",null]
+        //  map: {"k1":null,"k2":"v3"}
+        _options.nested_string_wrapper = "\"";
+        _options.wrapper_len = 1;
+        _options.map_key_delim = ':';
+        _options.null_format = "null";
+        _options.null_len = 4;
+        _options.mysql_collection_delim = ",";
+        _options.is_bool_value_num = false;
         break;
     default:
         return Status::InternalError("unknown serde dialect: {}", 
serde_dialect);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
index 56adf5f2f82..5f6b74a597b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
@@ -810,6 +810,7 @@ public class NereidsPlanner extends Planner {
                 
statementContext.setFormatOptions(FormatOptions.getForPresto());
                 break;
             case "doris":
+            case "hive":
                 statementContext.setFormatOptions(FormatOptions.getDefault());
                 break;
             default:
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index b7985bcfa53..86c47de612f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -4470,9 +4470,11 @@ public class SessionVariable implements Serializable, 
Writable {
             throw new UnsupportedOperationException("serdeDialect value is 
empty");
         }
 
-        if (!serdeDialect.equalsIgnoreCase("doris") && 
!serdeDialect.equalsIgnoreCase("presto")
-                && !serdeDialect.equalsIgnoreCase("trino")) {
-            LOG.warn("serdeDialect value is invalid, the invalid value is {}", 
serdeDialect);
+        if (!serdeDialect.equalsIgnoreCase("doris")
+                && !serdeDialect.equalsIgnoreCase("presto")
+                && !serdeDialect.equalsIgnoreCase("trino")
+                && !serdeDialect.equalsIgnoreCase("hive")) {
+            LOG.warn("serde dialect value is invalid, the invalid value is 
{}", serdeDialect);
             throw new UnsupportedOperationException(
                     "sqlDialect value is invalid, the invalid value is " + 
serdeDialect);
         }
@@ -4644,6 +4646,8 @@ public class SessionVariable implements Serializable, 
Writable {
             case "presto":
             case "trino":
                 return TSerdeDialect.PRESTO;
+            case "hive":
+                return TSerdeDialect.HIVE;
             default:
                 throw new IllegalArgumentException("Unknown serde dialect: " + 
serdeDialect);
         }
diff --git a/gensrc/thrift/PaloInternalService.thrift 
b/gensrc/thrift/PaloInternalService.thrift
index f6248f9ad62..624d756966d 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -82,7 +82,8 @@ struct TResourceLimit {
 
 enum TSerdeDialect {
   DORIS = 0,
-  PRESTO = 1
+  PRESTO = 1,
+  HIVE = 2
 }
 
 // Query options that correspond to PaloService.PaloQueryOptions,
diff --git a/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out 
b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out
new file mode 100644
index 00000000000..3ea1043cdf6
Binary files /dev/null and 
b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out differ
diff --git 
a/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy 
b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy
new file mode 100644
index 00000000000..b8e3037d770
--- /dev/null
+++ b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_serde_dialect_hive", "p0") {
+
+    sql """create database if not exists test_serde_dialect_hive;"""
+    sql """use test_serde_dialect_hive;"""
+    sql """drop table if exists test_serde_dialect_hive_tbl"""
+    sql """
+        create table if not exists test_serde_dialect_hive_tbl (
+            c1 tinyint,
+            c2 smallint,
+            c3 int,
+            c4 bigint,
+            c5 largeint,
+            c6 float,
+            c7 double,
+            c8 decimal(27, 9),
+            c9 date,
+            c10 datetime,
+            c11 datetime(6),
+            c12 ipv4,
+            c13 ipv6,
+            c14 string,
+            c15 char(6),
+            c16 varchar(1024),
+            c17 boolean,
+            c18 json,
+            c19 array<int>,
+            c20 array<double>,
+            c21 array<decimal(10, 5)>,
+            c22 array<string>,
+            c23 array<map<string, string>>,
+            c24 array<array<string>>,
+            c25 array<struct<s_id:int(11), s_name:string, s_address:string>>,
+            c26 array<struct<s_id:struct<k1:string, k2:decimal(10,2)>, 
s_name:array<ipv4>, s_address:map<string, ipv6>>>,
+            c27 map<string, string>,
+            c28 map<string, array<array<string>>>,
+            c29 map<int, map<string, array<array<string>>>>,
+            c30 map<decimal(5, 3), array<struct<s_id:struct<k1:string, 
k2:decimal(10,2)>, s_name:array<string>, s_address:map<string, string>>>>,
+            c31 struct<s_id:int(11), s_name:string, s_address:string>,
+            c32 struct<s_id:int(11), s_name:array<string>, s_address:string>,
+            c33 array<date>,
+            c34 array<datetime(3)>,
+            c35 array<boolean>,
+            c36 struct<s_id:int(11), s_name:string, s_gender:boolean>,
+            c37 map<string, boolean>
+        )
+        distributed by random buckets 1
+        properties("replication_num" = "1");
+    """
+
+    sql """
+        insert into test_serde_dialect_hive_tbl
+        (c1, c2,c3, 
c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c27,c28,c29,c31,c32,c33,c34,c35,c36,c37)
+        values(
+        1,2,3,4,5,1.1,2.0000,123456.123456789,"2024-06-30", "2024-06-30 
10:10:11", "2024-06-30 10:10:11.123456",
+        '59.50.185.152',
+        'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff',
+        'this is a string with , and "',
+        'abc ef',
+        ' 123ndedwdw',
+        true,
+        '[1, 2, 3, 4, 5]',
+        [1,2,3,null,5],
+        [1.1,2.1,3.1,null,5.00],
+        [1.1,2.1,3.00000,null,5.12345],
+        ['abc', 'de, f"', null, ''],
+        [{'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, {'k1': 'v1', 'k2': 
null, 'k3 , "abc':'', 'k4':'a , "a'}],
+        [['abc', 'de, f"', null, ''],[],null],
+        {'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'},
+        {'k1': [['abc', 'de, f"', null, ''],[],null], 'k2': null},
+        {10: {'k1': [['abc', 'de, f"', null, ''],[],null]}, 11: null},
+        named_struct('s_id', 100, 's_name', 'abc , "', 's_address', null),
+        named_struct('s_id', null, 's_name', ['abc', 'de, f"', null, ''], 
's_address', ''),
+        ['2024-06-01',null,'2024-06-03'],
+        ['2024-06-01 10:10:10',null,'2024-06-03 01:11:23.123'],
+        [true, true, false, false, true, false, false],
+        named_struct('s_id', 100, 's_name', 'abc , "', 's_gender', true),
+        {'k1': false, 'k2': true, 'k3':true, 'k4': false}
+        );
+    """
+
+    sql """set serde_dialect="doris";"""
+    qt_sql01 """select * from test_serde_dialect_hive_tbl"""
+    sql """set serde_dialect="hive";"""
+    qt_sql01 """select * from test_serde_dialect_hive_tbl"""
+
+    test {
+        sql """set serde_dialect="invalid""""
+        exception "sqlDialect value is invalid"
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to