This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new ba1ad12a210 branch-2.1: [Fix](Serde) Support hive compatible output format #49036 (#49831) ba1ad12a210 is described below commit ba1ad12a21060f9835650be63821da91b2255bcb Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> AuthorDate: Fri Apr 11 15:13:02 2025 +0800 branch-2.1: [Fix](Serde) Support hive compatible output format #49036 (#49831) Cherry-picked from #49036 Co-authored-by: Tiewei Fang <fangtie...@selectdb.com> --- .../vec/data_types/serde/data_type_array_serde.cpp | 5 +- .../vec/data_types/serde/data_type_map_serde.cpp | 7 +- .../data_types/serde/data_type_number_serde.cpp | 9 +- be/src/vec/data_types/serde/data_type_serde.h | 20 ++++ .../data_types/serde/data_type_struct_serde.cpp | 5 +- be/src/vec/sink/vmysql_result_writer.cpp | 16 +++ .../org/apache/doris/nereids/NereidsPlanner.java | 1 + .../java/org/apache/doris/qe/SessionVariable.java | 10 +- gensrc/thrift/PaloInternalService.thrift | 3 +- .../datatype_p0/serde/test_serde_dialect_hive.out | Bin 0 -> 2029 bytes .../serde/test_serde_dialect_hive.groovy | 107 +++++++++++++++++++++ 11 files changed, 175 insertions(+), 8 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp b/be/src/vec/data_types/serde/data_type_array_serde.cpp index 872dd84d8c7..e5fc7461e45 100644 --- a/be/src/vec/data_types/serde/data_type_array_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp @@ -336,7 +336,8 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, const auto end_arr_element = offsets[row_idx_of_col_arr]; for (int j = begin_arr_element; j < end_arr_element; ++j) { if (j != begin_arr_element) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -345,6 +346,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_nested_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -358,6 +360,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR( nested_serde->write_column_to_mysql(data, result, j, false, options)); } + --options.level; } } if (0 != result.push_string("]", 1)) { diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp b/be/src/vec/data_types/serde/data_type_map_serde.cpp index 2140885942d..bf018ce3a80 100644 --- a/be/src/vec/data_types/serde/data_type_map_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp @@ -418,7 +418,8 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, auto& offsets = map_column.get_offsets(); for (auto j = offsets[col_index - 1]; j < offsets[col_index]; ++j) { if (j != offsets[col_index - 1]) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -427,6 +428,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_key_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -440,6 +442,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(key_serde->write_column_to_mysql(nested_keys_column, result, j, false, options)); } + --options.level; } if (0 != result.push_string(&options.map_key_delim, 1)) { return Status::InternalError("pack mysql buffer failed."); @@ -449,6 +452,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_val_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -462,6 +466,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(value_serde->write_column_to_mysql(nested_values_column, result, j, false, options)); } + --options.level; } } if (0 != result.push_string("}", 1)) { diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp b/be/src/vec/data_types/serde/data_type_number_serde.cpp index 522cf02c75f..cd8b3d567e9 100644 --- a/be/src/vec/data_types/serde/data_type_number_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp @@ -277,8 +277,15 @@ Status DataTypeNumberSerDe<T>::_write_column_to_mysql(const IColumn& column, int buf_ret = 0; auto& data = assert_cast<const ColumnType&>(column).get_data(); const auto col_index = index_check_const(row_idx, col_const); - if constexpr (std::is_same_v<T, Int8> || std::is_same_v<T, UInt8>) { + if constexpr (std::is_same_v<T, Int8>) { buf_ret = result.push_tinyint(data[col_index]); + } else if constexpr (std::is_same_v<T, UInt8>) { + if (options.level > 0 && !options.is_bool_value_num) { + std::string bool_value = data[col_index] ? "true" : "false"; + result.push_string(bool_value.c_str(), bool_value.size()); + } else { + buf_ret = result.push_tinyint(data[col_index]); + } } else if constexpr (std::is_same_v<T, Int16> || std::is_same_v<T, UInt16>) { buf_ret = result.push_smallint(data[col_index]); } else if constexpr (std::is_same_v<T, Int32> || std::is_same_v<T, UInt32>) { diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 7dedf30ac32..b23a6a21501 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -165,6 +165,26 @@ public: const char* nested_string_wrapper; int wrapper_len; + /** + * mysql_collection_delim is used to separate elements in collection, such as array, map, struct + * It is used to write to mysql. + */ + std::string mysql_collection_delim = ", "; + + /** + * is_bool_value_num is used to display bool value in collection, such as array, map, struct + * eg, if set to true, the array<true> will be: + * [1] + * if set to false, the array<true> will be: + * [true] + */ + bool is_bool_value_num = true; + + /** + * Indicate the nested level of column. It is used to control some behavior of serde + */ + mutable int level = 0; + [[nodiscard]] char get_collection_delimiter( int hive_text_complex_type_delimiter_level) const { CHECK(0 <= hive_text_complex_type_delimiter_level && diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp b/be/src/vec/data_types/serde/data_type_struct_serde.cpp index d48f42e2227..d95682e604c 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp @@ -348,7 +348,8 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, bool begin = true; for (size_t j = 0; j < elem_serdes_ptrs.size(); ++j) { if (!begin) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -372,6 +373,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (remove_nullable(col.get_column_ptr(j))->is_column_string() && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { @@ -386,6 +388,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(elem_serdes_ptrs[j]->write_column_to_mysql( col.get_column(j), result, col_index, false, options)); } + --options.level; } begin = false; } diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index 0cdf1b34034..8ad1c276025 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -123,6 +123,8 @@ Status VMysqlResultWriter<is_binary_format>::_set_options( _options.map_key_delim = ':'; _options.null_format = "null"; _options.null_len = 4; + _options.mysql_collection_delim = ", "; + _options.is_bool_value_num = true; break; case TSerdeDialect::PRESTO: // eg: @@ -133,6 +135,20 @@ Status VMysqlResultWriter<is_binary_format>::_set_options( _options.map_key_delim = '='; _options.null_format = "NULL"; _options.null_len = 4; + _options.mysql_collection_delim = ", "; + _options.is_bool_value_num = true; + break; + case TSerdeDialect::HIVE: + // eg: + // array: ["abc","def","",null] + // map: {"k1":null,"k2":"v3"} + _options.nested_string_wrapper = "\""; + _options.wrapper_len = 1; + _options.map_key_delim = ':'; + _options.null_format = "null"; + _options.null_len = 4; + _options.mysql_collection_delim = ","; + _options.is_bool_value_num = false; break; default: return Status::InternalError("unknown serde dialect: {}", serde_dialect); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java index 89a9d220be3..57d88de7a4e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -677,6 +677,7 @@ public class NereidsPlanner extends Planner { statementContext.setFormatOptions(FormatOptions.getForPresto()); break; case "doris": + case "hive": statementContext.setFormatOptions(FormatOptions.getDefault()); break; default: diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 3f874077f51..7d9b8416618 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -4348,9 +4348,11 @@ public class SessionVariable implements Serializable, Writable { throw new UnsupportedOperationException("serdeDialect value is empty"); } - if (!serdeDialect.equalsIgnoreCase("doris") && !serdeDialect.equalsIgnoreCase("presto") - && !serdeDialect.equalsIgnoreCase("trino")) { - LOG.warn("serdeDialect value is invalid, the invalid value is {}", serdeDialect); + if (!serdeDialect.equalsIgnoreCase("doris") + && !serdeDialect.equalsIgnoreCase("presto") + && !serdeDialect.equalsIgnoreCase("trino") + && !serdeDialect.equalsIgnoreCase("hive")) { + LOG.warn("serde dialect value is invalid, the invalid value is {}", serdeDialect); throw new UnsupportedOperationException( "sqlDialect value is invalid, the invalid value is " + serdeDialect); } @@ -4512,6 +4514,8 @@ public class SessionVariable implements Serializable, Writable { case "presto": case "trino": return TSerdeDialect.PRESTO; + case "hive": + return TSerdeDialect.HIVE; default: throw new IllegalArgumentException("Unknown serde dialect: " + serdeDialect); } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index c612826836e..ac43d3a3dee 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -83,7 +83,8 @@ struct TResourceLimit { enum TSerdeDialect { DORIS = 0, - PRESTO = 1 + PRESTO = 1, + HIVE = 2 } // Query options that correspond to PaloService.PaloQueryOptions, diff --git a/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out new file mode 100644 index 00000000000..3ea1043cdf6 Binary files /dev/null and b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out differ diff --git a/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy new file mode 100644 index 00000000000..b8e3037d770 --- /dev/null +++ b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_serde_dialect_hive", "p0") { + + sql """create database if not exists test_serde_dialect_hive;""" + sql """use test_serde_dialect_hive;""" + sql """drop table if exists test_serde_dialect_hive_tbl""" + sql """ + create table if not exists test_serde_dialect_hive_tbl ( + c1 tinyint, + c2 smallint, + c3 int, + c4 bigint, + c5 largeint, + c6 float, + c7 double, + c8 decimal(27, 9), + c9 date, + c10 datetime, + c11 datetime(6), + c12 ipv4, + c13 ipv6, + c14 string, + c15 char(6), + c16 varchar(1024), + c17 boolean, + c18 json, + c19 array<int>, + c20 array<double>, + c21 array<decimal(10, 5)>, + c22 array<string>, + c23 array<map<string, string>>, + c24 array<array<string>>, + c25 array<struct<s_id:int(11), s_name:string, s_address:string>>, + c26 array<struct<s_id:struct<k1:string, k2:decimal(10,2)>, s_name:array<ipv4>, s_address:map<string, ipv6>>>, + c27 map<string, string>, + c28 map<string, array<array<string>>>, + c29 map<int, map<string, array<array<string>>>>, + c30 map<decimal(5, 3), array<struct<s_id:struct<k1:string, k2:decimal(10,2)>, s_name:array<string>, s_address:map<string, string>>>>, + c31 struct<s_id:int(11), s_name:string, s_address:string>, + c32 struct<s_id:int(11), s_name:array<string>, s_address:string>, + c33 array<date>, + c34 array<datetime(3)>, + c35 array<boolean>, + c36 struct<s_id:int(11), s_name:string, s_gender:boolean>, + c37 map<string, boolean> + ) + distributed by random buckets 1 + properties("replication_num" = "1"); + """ + + sql """ + insert into test_serde_dialect_hive_tbl + (c1, c2,c3, c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c27,c28,c29,c31,c32,c33,c34,c35,c36,c37) + values( + 1,2,3,4,5,1.1,2.0000,123456.123456789,"2024-06-30", "2024-06-30 10:10:11", "2024-06-30 10:10:11.123456", + '59.50.185.152', + 'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff', + 'this is a string with , and "', + 'abc ef', + ' 123ndedwdw', + true, + '[1, 2, 3, 4, 5]', + [1,2,3,null,5], + [1.1,2.1,3.1,null,5.00], + [1.1,2.1,3.00000,null,5.12345], + ['abc', 'de, f"', null, ''], + [{'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, {'k1': 'v1', 'k2': null, 'k3 , "abc':'', 'k4':'a , "a'}], + [['abc', 'de, f"', null, ''],[],null], + {'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, + {'k1': [['abc', 'de, f"', null, ''],[],null], 'k2': null}, + {10: {'k1': [['abc', 'de, f"', null, ''],[],null]}, 11: null}, + named_struct('s_id', 100, 's_name', 'abc , "', 's_address', null), + named_struct('s_id', null, 's_name', ['abc', 'de, f"', null, ''], 's_address', ''), + ['2024-06-01',null,'2024-06-03'], + ['2024-06-01 10:10:10',null,'2024-06-03 01:11:23.123'], + [true, true, false, false, true, false, false], + named_struct('s_id', 100, 's_name', 'abc , "', 's_gender', true), + {'k1': false, 'k2': true, 'k3':true, 'k4': false} + ); + """ + + sql """set serde_dialect="doris";""" + qt_sql01 """select * from test_serde_dialect_hive_tbl""" + sql """set serde_dialect="hive";""" + qt_sql01 """select * from test_serde_dialect_hive_tbl""" + + test { + sql """set serde_dialect="invalid"""" + exception "sqlDialect value is invalid" + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org