This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 358ed8caac2 [feature](hive)support hive catalog read json table. (#43469)
358ed8caac2 is described below

commit 358ed8caac231f88f0ce40d484ac5c2fdc626f8b
Author: daidai <changyu...@selectdb.com>
AuthorDate: Thu Nov 28 22:13:27 2024 +0800

[feature](hive)support hive catalog read json table. (#43469)

### What problem does this PR solve?

Problem Summary:

Support reading JSON-format Hive tables, such as:

```mysql
mysql> show create table basic_json_table;
CREATE TABLE `basic_json_table`(
  `id` int,
  `name` string,
  `age` tinyint,
  `salary` float,
  `is_active` boolean,
  `join_date` date,
  `last_login` timestamp,
  `height` double,
  `profile` binary,
  `rating` decimal(10,2))
ROW FORMAT SERDE
  'org.apache.hive.hcatalog.data.JsonSerDe'
```

Behavior changed:

To implement this feature, this PR modifies `new_json_reader`. Previously, `new_json_reader` could only insert data into `ColumnString`. To support inserting data into columns of other types, `DataTypeSerDe` is introduced to write values into the columns. To maintain compatibility with previous versions, the changes in this PR take effect only when reading Hive JSON tables.

Limitations:

1. Currently only queries are supported; writing is not supported.
2. Currently only the `ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'` scenario is supported. Properties specified in `WITH SERDEPROPERTIES` do not take effect in Doris.
3. Since Hive does not allow columns whose names differ only in case when creating a table in JSON format (including inside a struct), we convert the field names in the JSON data to lowercase when reading the JSON data file, and then match on the lowercase field names. For field names that collide after lowercasing, the value of the last field is used (consistent with Hive behavior); see the sketch after this message. Example:

```
create table json_table(
    column int
)ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';

a.json:
{"column":1,"COLumn":2,"COLUMN":3}
{"column":10,"COLumn":20}
{"column":100}

in Hive: load a.json into table json_table
in Doris query:
---
3
20
100
---
```

TODO (in the next PR): merge `serde` and `json_reader`, because they have logical conflicts.

### Release note

The Hive catalog supports reading JSON-format tables.
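To make the matching rule in limitation 3 concrete, here is a minimal, self-contained sketch (a hypothetical illustration only, not the PR's actual reader code; `build_name_map` and its inputs are invented for this example). It folds every incoming field name to lowercase and keeps the index of the last occurrence, so duplicate names resolve to the final value, as Hive does:

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Hypothetical sketch of the case-folding rule: lowercase each JSON field
// name and record the index of its LAST occurrence, so duplicated names
// resolve to the final value (consistent with Hive behavior).
std::unordered_map<std::string, size_t> build_name_map(
        const std::vector<std::pair<std::string, std::string>>& fields) {
    std::unordered_map<std::string, size_t> name_map;
    for (size_t i = 0; i < fields.size(); ++i) {
        std::string key = fields[i].first;
        std::transform(key.begin(), key.end(), key.begin(),
                       [](unsigned char c) { return std::tolower(c); });
        name_map[key] = i; // operator[] overwrites, so the last index wins
    }
    return name_map;
}

int main() {
    // {"column":1,"COLumn":2,"COLUMN":3} -> the value 3 is used
    std::vector<std::pair<std::string, std::string>> row = {
            {"column", "1"}, {"COLumn", "2"}, {"COLUMN", "3"}};
    auto name_map = build_name_map(row);
    std::cout << row[name_map.at("column")].second << std::endl; // prints 3
}
```

In the reader itself the lookup goes through `_name_map`, alongside the existing `_fuzzy_parse` path (see the `new_json_reader.cpp` diff below); the sketch above only demonstrates the lowercase, last-one-wins resolution rule.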
--- .../vec/data_types/serde/data_type_array_serde.h | 2 + be/src/vec/data_types/serde/data_type_map_serde.h | 4 + .../data_types/serde/data_type_nullable_serde.h | 2 + be/src/vec/data_types/serde/data_type_serde.h | 12 +- .../vec/data_types/serde/data_type_struct_serde.h | 2 + be/src/vec/exec/format/json/new_json_reader.cpp | 559 +++++++++++++++------ be/src/vec/exec/format/json/new_json_reader.h | 32 +- be/src/vec/exec/scan/vfile_scanner.cpp | 4 +- .../scripts/create_preinstalled_scripts/run69.hql | 35 ++ .../scripts/create_preinstalled_scripts/run70.hql | 73 +++ .../scripts/create_preinstalled_scripts/run71.hql | 13 + .../json/json_all_complex_types/dt=dt1/000000_0 | 3 + .../json/json_all_complex_types/dt=dt2/000000_0 | 1 + .../json/json_all_complex_types/dt=dt3/000000_0 | 2 + .../preinstalled_data/json/json_load_data_table/1 | 13 + .../json/json_nested_complex_table/1 | 2 + .../json/json_nested_complex_table/2 | 1 + .../json/json_nested_complex_table/modify_2 | 2 + .../datasource/hive/HiveMetaStoreClientHelper.java | 3 + .../doris/datasource/hive/source/HiveScanNode.java | 41 +- .../hive/hive_json_basic_test.out | 115 +++++ .../hive/hive_json_basic_test.groovy | 71 +++ 22 files changed, 832 insertions(+), 160 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_array_serde.h b/be/src/vec/data_types/serde/data_type_array_serde.h index 5b15f48f502..cdd21155760 100644 --- a/be/src/vec/data_types/serde/data_type_array_serde.h +++ b/be/src/vec/data_types/serde/data_type_array_serde.h @@ -101,6 +101,8 @@ public: nested_serde->set_return_object_as_string(value); } + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return {nested_serde}; } + private: template <bool is_binary_format> Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result, diff --git a/be/src/vec/data_types/serde/data_type_map_serde.h b/be/src/vec/data_types/serde/data_type_map_serde.h index 599533eb4d0..51c11300d44 100644 --- a/be/src/vec/data_types/serde/data_type_map_serde.h +++ b/be/src/vec/data_types/serde/data_type_map_serde.h @@ -96,6 +96,10 @@ public: value_serde->set_return_object_as_string(value); } + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { + return {key_serde, value_serde}; + } + private: template <bool is_binary_format> Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result, diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h b/be/src/vec/data_types/serde/data_type_nullable_serde.h index 6051c7f722d..159db890540 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.h +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h @@ -99,6 +99,8 @@ public: int64_t row_num) const override; Status read_one_cell_from_json(IColumn& column, const rapidjson::Value& result) const override; + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return {nested_serde}; } + private: template <bool is_binary_format> Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result, diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index f0e9eb27961..bec0fabfcca 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -100,6 +100,10 @@ class IColumn; class Arena; class IDataType; +class DataTypeSerDe; +using DataTypeSerDeSPtr = std::shared_ptr<DataTypeSerDe>; +using DataTypeSerDeSPtrs = std::vector<DataTypeSerDeSPtr>; + // 
Deserialize means read from different file format or memory format, // for example read from arrow, read from parquet. // Serialize means write the column cell or the total column into another @@ -337,6 +341,11 @@ public: Arena& mem_pool, int64_t row_num) const; virtual Status read_one_cell_from_json(IColumn& column, const rapidjson::Value& result) const; + virtual DataTypeSerDeSPtrs get_nested_serdes() const { + throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, + "Method get_nested_serdes is not supported for this serde"); + } + protected: bool _return_object_as_string = false; // This parameter indicates what level the serde belongs to and is mainly used for complex types @@ -379,9 +388,6 @@ inline void checkArrowStatus(const arrow::Status& status, const std::string& col } } -using DataTypeSerDeSPtr = std::shared_ptr<DataTypeSerDe>; -using DataTypeSerDeSPtrs = std::vector<DataTypeSerDeSPtr>; - DataTypeSerDeSPtrs create_data_type_serdes( const std::vector<std::shared_ptr<const IDataType>>& types); DataTypeSerDeSPtrs create_data_type_serdes(const std::vector<SlotDescriptor*>& slots); diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.h b/be/src/vec/data_types/serde/data_type_struct_serde.h index 2ec871838d7..fc77e26b71f 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.h +++ b/be/src/vec/data_types/serde/data_type_struct_serde.h @@ -172,6 +172,8 @@ public: } } + virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return elem_serdes_ptrs; } + private: std::optional<size_t> try_get_position_by_name(const String& name) const; diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index d726de3d998..d79e8652074 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -54,8 +54,11 @@ #include "util/slice.h" #include "util/uid_util.h" #include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_map.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" #include "vec/common/assert_cast.h" #include "vec/common/typeid_cast.h" #include "vec/core/block.h" @@ -164,10 +167,18 @@ void NewJsonReader::_init_file_description() { } Status NewJsonReader::init_reader( - const std::unordered_map<std::string, VExprContextSPtr>& col_default_value_ctx) { + const std::unordered_map<std::string, VExprContextSPtr>& col_default_value_ctx, + bool is_load) { + _is_load = is_load; + // generate _col_default_value_map RETURN_IF_ERROR(_get_column_default_value(_file_slot_descs, col_default_value_ctx)); + //use serde insert data to column. + for (auto* slot_desc : _file_slot_descs) { + _serdes.emplace_back(slot_desc->get_data_type_ptr()->get_serde()); + } + // create decompressor. 
// _decompressor may be nullptr if this is not a compressed file RETURN_IF_ERROR(Decompressor::create_decompressor(_file_compress_type, &_decompressor)); @@ -390,6 +401,9 @@ Status NewJsonReader::_get_range_params() { if (_params.file_attributes.__isset.fuzzy_parse) { _fuzzy_parse = _params.file_attributes.fuzzy_parse; } + if (_range.table_format_params.table_format_type == "hive") { + _is_hive_table = true; + } return Status::OK(); } @@ -477,8 +491,8 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block bool valid = false; if (_next_row >= _total_rows) { // parse json and generic document Status st = _parse_json(is_empty_row, eof); - if (st.is<DATA_QUALITY_ERROR>()) { - continue; // continue to read next + if (_is_load && st.is<DATA_QUALITY_ERROR>()) { + continue; // continue to read next (for load, after this , already append error to file.) } RETURN_IF_ERROR(st); if (*is_empty_row) { @@ -761,7 +775,20 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl _append_empty_skip_bitmap_value(block, cur_row_count); } - for (auto* slot_desc : slot_descs) { + if (_is_hive_table) { + //don't like _fuzzy_parse,each line read in must modify name_map once. + + for (int i = 0; i < objectValue.MemberCount(); ++i) { + auto it = objectValue.MemberBegin() + i; + std::string field_name(it->name.GetString(), it->name.GetStringLength()); + std::transform(field_name.begin(), field_name.end(), field_name.begin(), ::tolower); + + //Use the last value with the same name. + _name_map.emplace(field_name, i); + } + } + for (size_t slot_idx = 0; slot_idx < slot_descs.size(); ++slot_idx) { + auto* slot_desc = slot_descs[slot_idx]; if (!slot_desc->is_materialized()) { continue; } @@ -773,7 +800,7 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl auto* column_ptr = block.get_by_position(dest_index).column->assume_mutable().get(); rapidjson::Value::ConstMemberIterator it = objectValue.MemberEnd(); - if (_fuzzy_parse) { + if (_fuzzy_parse || _is_hive_table) { auto idx_it = _name_map.find(slot_desc->col_name()); if (idx_it != _name_map.end() && idx_it->second < objectValue.MemberCount()) { it = objectValue.MemberBegin() + idx_it->second; @@ -785,7 +812,8 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl if (it != objectValue.MemberEnd()) { const rapidjson::Value& value = it->value; - RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc->type(), column_ptr, + slot_desc->col_name(), _serdes[slot_idx], valid)); if (!(*valid)) { return Status::OK(); } @@ -814,14 +842,15 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl column_ptr->insert_default(); } else { // not found, filling with default value - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR( + _fill_missing_column(slot_desc, _serdes[slot_idx], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } } } } - if (!has_valid_value) { + if (!has_valid_value && _is_load) { // there is no valid value in json line but has filled with default value before // so remove this line in block string col_names; @@ -845,79 +874,188 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + 
vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + + bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType); + + if (column_ptr->is_nullable()) { nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. - if (value->GetType() != rapidjson::Type::kNullType) { + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + if (value_is_null) { + nullable_column->insert_default(); + *valid = true; + return Status::OK(); + } else { nullable_column->get_null_map_data().push_back(0); + } + + } else if (value_is_null) [[unlikely]] { + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + *value, "Json value is null, but the column `{}` is not nullable.", column_name, + valid)); + return Status::OK(); + } else { - nullable_column->insert_default(); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - column_ptr = &nullable_column->get_nested_column(); } - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = value->GetStringLength(); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else if (value->IsFloat() || value->IsDouble()) { - auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); - wbytes = end - tmp_buf; + if (_is_load || !type_desc.is_complex_type()) { + if (value->IsString()) { + Slice slice {value->GetString(), value->GetStringLength()}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); + } else { - return Status::InternalError<false>("It should not here."); + // We can `switch (value->GetType()) case: kNumberType`. + // Note that `if (value->IsInt())`, but column is FloatColumn. + // Or for any type, use `NewJsonReader::_print_json_value(*value)`. 
+ + const char* str_value = nullptr; + char tmp_buf[128] = {0}; + size_t wbytes = 0; + std::string json_str; + + switch (value->GetType()) { + case rapidjson::Type::kStringType: + str_value = value->GetString(); + wbytes = value->GetStringLength(); + break; + case rapidjson::Type::kNumberType: + if (value->IsUint()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); + } else if (value->IsInt()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); + } else if (value->IsUint64()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); + } else if (value->IsInt64()) { + wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); + } else if (value->IsFloat() || value->IsDouble()) { + auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); + wbytes = end - tmp_buf; + } else { + return Status::InternalError<false>("It should not here."); + } + str_value = tmp_buf; + break; + case rapidjson::Type::kFalseType: + wbytes = 1; + str_value = (char*)"0"; + break; + case rapidjson::Type::kTrueType: + wbytes = 1; + str_value = (char*)"1"; + break; + default: + // for other type, we convert it to string to save + json_str = NewJsonReader::_print_json_value(*value); + wbytes = json_str.size(); + str_value = json_str.c_str(); + break; + } + Slice slice {str_value, wbytes}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); } - str_value = tmp_buf; - break; - case rapidjson::Type::kFalseType: - wbytes = 1; - str_value = (char*)"0"; - break; - case rapidjson::Type::kTrueType: - wbytes = 1; - str_value = (char*)"1"; - break; - case rapidjson::Type::kNullType: - if (!slot_desc->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - *value, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); - return Status::OK(); + } else if (type_desc.type == TYPE_STRUCT) { + if (!value->IsObject()) [[unlikely]] { + return Status::DataQualityError( + "Json value isn't object, but the column `{}` is struct.", column_name); } - // return immediately to prevent from repeatedly insert_data - *valid = true; - return Status::OK(); - default: - // for other type like array or object. we convert it to string to save - json_str = NewJsonReader::_print_json_value(*value); - wbytes = json_str.size(); - str_value = json_str.c_str(); - break; - } + auto sub_col_size = type_desc.children.size(); + const auto& struct_value = value->GetObject(); - // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR', - // we need use a function to support these types to insert data in columns. 
- DCHECK(slot_desc->type().type == TYPE_VARCHAR || slot_desc->type().type == TYPE_STRING) - << slot_desc->type().type << ", query id: " << print_id(_state->query_id()); - assert_cast<ColumnString*>(column_ptr)->insert_data(str_value, wbytes); + auto sub_serdes = data_serde->get_nested_serdes(); + auto struct_column_ptr = assert_cast<ColumnStruct*>(data_column_ptr); + + std::map<std::string, size_t> sub_col_name_to_idx; + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + sub_col_name_to_idx.emplace(type_desc.field_names[sub_col_idx], sub_col_idx); + } + + std::vector<rapidjson::Value::ConstValueIterator> sub_values(sub_col_size, nullptr); + for (const auto& sub : struct_value) { + if (!sub.name.IsString()) [[unlikely]] { + return Status::DataQualityError( + "Json file struct column `{}` subfield name isn't a String", column_name); + } + + auto sub_key_char = sub.name.GetString(); + auto sub_key_length = sub.name.GetStringLength(); + + std::string sub_key(sub_key_char, sub_key_length); + std::transform(sub_key.begin(), sub_key.end(), sub_key.begin(), ::tolower); + + if (sub_col_name_to_idx.find(sub_key) == sub_col_name_to_idx.end()) [[unlikely]] { + continue; + } + size_t sub_column_idx = sub_col_name_to_idx[sub_key]; + sub_values[sub_column_idx] = &sub.value; + } + + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + auto sub_value = sub_values[sub_col_idx]; + + const auto& sub_col_type = type_desc.children[sub_col_idx]; + + RETURN_IF_ERROR(_write_data_to_column( + sub_value, sub_col_type, struct_column_ptr->get_column(sub_col_idx).get_ptr(), + column_name + "." + type_desc.field_names[sub_col_idx], sub_serdes[sub_col_idx], + valid)); + } + } else if (type_desc.type == TYPE_MAP) { + if (!value->IsObject()) [[unlikely]] { + return Status::DataQualityError("Json value isn't object, but the column `{}` is map.", + column_name); + } + const auto& object_value = value->GetObject(); + auto sub_serdes = data_serde->get_nested_serdes(); + auto map_column_ptr = assert_cast<ColumnMap*>(data_column_ptr); + + for (const auto& member_value : object_value) { + RETURN_IF_ERROR(_write_data_to_column( + &member_value.name, type_desc.children[0], + map_column_ptr->get_keys_ptr()->assume_mutable()->get_ptr(), + column_name + ".key", sub_serdes[0], valid)); + + RETURN_IF_ERROR(_write_data_to_column( + &member_value.value, type_desc.children[1], + map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(), + column_name + ".value", sub_serdes[1], valid)); + } + + auto& offsets = map_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + object_value.MemberCount()); + } else if (type_desc.type == TYPE_ARRAY) { + if (!value->IsArray()) [[unlikely]] { + return Status::DataQualityError("Json value isn't array, but the column `{}` is array.", + column_name); + } + const auto& array_value = value->GetArray(); + auto sub_serdes = data_serde->get_nested_serdes(); + auto array_column_ptr = assert_cast<ColumnArray*>(data_column_ptr); + + for (const auto& sub_value : array_value) { + RETURN_IF_ERROR(_write_data_to_column(&sub_value, type_desc.children[0], + array_column_ptr->get_data().get_ptr(), + column_name + ".element", sub_serdes[0], valid)); + } + auto& offsets = array_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + array_value.Size()); + } else { + return Status::InternalError("Not support load to complex column."); + } *valid = true; return Status::OK(); @@ -949,20 +1087,21 @@ Status 
NewJsonReader::_write_columns_by_jsonpath(rapidjson::Value& objectValue, // if json_values' size > 1, it means we just match an array, not a wrapped one, so no need to unwrap. json_values = &((*json_values)[0]); } - RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i], column_ptr, valid)); + RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i]->type(), column_ptr, + slot_descs[i]->col_name(), _serdes[i], valid)); if (!(*valid)) { return Status::OK(); } has_valid_value = true; } else { // not found, filling with default value - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } } } - if (!has_valid_value) { + if (!has_valid_value && _is_load) { // there is no valid value in json line but has filled with default value before // so remove this line in block for (int i = 0; i < block.columns(); ++i) { @@ -1112,7 +1251,7 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc // step2: get json value by json doc Status st = _get_json_value(&size, eof, &error, is_empty_row); - if (st.is<DATA_QUALITY_ERROR>()) { + if (_is_load && st.is<DATA_QUALITY_ERROR>()) { return Status::OK(); } RETURN_IF_ERROR(st); @@ -1386,28 +1525,42 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val for (auto field : *value) { std::string_view key = field.unescaped_key(); StringRef name_ref(key.data(), key.size()); + std::string key_string; + if (_is_hive_table) { + key_string = name_ref.to_string(); + std::transform(key_string.begin(), key_string.end(), key_string.begin(), ::tolower); + name_ref = StringRef(key_string); + } const size_t column_index = _column_index(name_ref, key_index++); if (UNLIKELY(ssize_t(column_index) < 0)) { // This key is not exist in slot desc, just ignore continue; } - if (_seen_columns[column_index]) { - continue; - } if (column_index == skip_bitmap_col_idx) { continue; } + if (_seen_columns[column_index]) { + if (_is_hive_table) { + //Since value can only be traversed once, + // we can only insert the original value first, then delete it, and then reinsert the new value + block.get_by_position(column_index).column->assume_mutable()->pop_back(1); + } else { + continue; + } + } simdjson::ondemand::value val = field.value(); auto* column_ptr = block.get_by_position(column_index).column->assume_mutable().get(); - RETURN_IF_ERROR( - _simdjson_write_data_to_column(val, slot_descs[column_index], column_ptr, valid)); + RETURN_IF_ERROR(_simdjson_write_data_to_column( + val, slot_descs[column_index]->type(), column_ptr, + slot_descs[column_index]->col_name(), _serdes[column_index], valid)); if (!(*valid)) { return Status::OK(); } _seen_columns[column_index] = true; has_valid_value = true; } - if (!has_valid_value) { + + if (!has_valid_value && _is_load) { string col_names; for (auto* slot_desc : slot_descs) { col_names.append(slot_desc->col_name() + ", "); @@ -1472,7 +1625,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val _process_skip_bitmap_mark(slot_desc, column_ptr, block, cur_row_count, valid); column_ptr->insert_default(); } else { - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } @@ -1482,12 +1635,6 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val 
DCHECK(column_ptr->size() == cur_row_count + 1); } -#ifndef NDEBUG - // Check all columns rows matched - for (size_t i = 0; i < block.columns(); ++i) { - DCHECK_EQ(block.get_by_position(i).column->size(), cur_row_count + 1); - } -#endif // There is at least one valid value here DCHECK(nullcount < block.columns()); *valid = true; @@ -1495,54 +1642,180 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val } Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& value, - SlotDescriptor* slot_desc, IColumn* column, - bool* valid) { - // write + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, + DataTypeSerDeSPtr serde, bool* valid) { ColumnNullable* nullable_column = nullptr; - IColumn* column_ptr = nullptr; - if (slot_desc->is_nullable()) { - nullable_column = assert_cast<ColumnNullable*>(column); - column_ptr = &nullable_column->get_nested_column(); - } - // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR', - // we need use a function to support these types to insert data in columns. - auto* column_string = assert_cast<ColumnString*>(column_ptr); - switch (value.type()) { - case simdjson::ondemand::json_type::null: { - if (column->is_nullable()) { - // insert_default already push 1 to null_map - nullable_column->insert_default(); + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + + if (column_ptr->is_nullable()) { + nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); + + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. + if (value.type() != simdjson::ondemand::json_type::null) { + nullable_column->get_null_map_data().push_back(0); } else { + nullable_column->insert_default(); + *valid = true; + return Status::OK(); + } + } else if (value.type() == simdjson::ondemand::json_type::null) [[unlikely]] { + if (_is_load) { RETURN_IF_ERROR(_append_error_msg( nullptr, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); + column_name, valid)); return Status::OK(); - } - break; - } - case simdjson::ondemand::json_type::boolean: { - nullable_column->get_null_map_data().push_back(0); - if (value.get_bool()) { - column_string->insert_data("1", 1); } else { - column_string->insert_data("0", 1); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - break; } - default: { + + if (_is_load || !type_desc.is_complex_type()) { if (value.type() == simdjson::ondemand::json_type::string) { - auto* unescape_buffer = - reinterpret_cast<uint8_t*>(_simdjson_ondemand_unscape_padding_buffer.data()); - std::string_view unescaped_value = - _ondemand_json_parser->unescape(value.get_raw_json_string(), unescape_buffer); - nullable_column->get_null_map_data().push_back(0); - column_string->insert_data(unescaped_value.data(), unescaped_value.length()); - break; + std::string_view value_string = value.get_string(); + Slice slice {value_string.data(), value_string.size()}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); + + } else { + // Maybe we can `switch (value->GetType()) case: kNumberType`. + // Note that `if (value->IsInt())`, but column is FloatColumn. 
+ std::string_view json_str = simdjson::to_json_string(value); + Slice slice {json_str.data(), json_str.size()}; + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + _serde_options)); + } + } else if (type_desc.type == TYPE_STRUCT) { + if (value.type() != simdjson::ondemand::json_type::object) [[unlikely]] { + return Status::DataQualityError( + "Json value isn't object, but the column `{}` is struct.", column_name); + } + + auto sub_col_size = type_desc.children.size(); + simdjson::ondemand::object struct_value = value.get_object(); + auto sub_serdes = data_serde->get_nested_serdes(); + auto struct_column_ptr = assert_cast<ColumnStruct*>(data_column_ptr); + + std::map<std::string, size_t> sub_col_name_to_idx; + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + sub_col_name_to_idx.emplace(type_desc.field_names[sub_col_idx], sub_col_idx); + } + vector<bool> has_value(sub_col_size, false); + for (simdjson::ondemand::field sub : struct_value) { + std::string_view sub_key_view = sub.unescaped_key(); + std::string sub_key(sub_key_view.data(), sub_key_view.length()); + std::transform(sub_key.begin(), sub_key.end(), sub_key.begin(), ::tolower); + + if (sub_col_name_to_idx.find(sub_key) == sub_col_name_to_idx.end()) [[unlikely]] { + continue; + } + size_t sub_column_idx = sub_col_name_to_idx[sub_key]; + auto sub_column_ptr = struct_column_ptr->get_column(sub_column_idx).get_ptr(); + + if (has_value[sub_column_idx]) [[unlikely]] { + // Since struct_value can only be traversed once, we can only insert + // the original value first, then delete it, and then reinsert the new value. + sub_column_ptr->pop_back(1); + } + has_value[sub_column_idx] = true; + + const auto& sub_col_type = type_desc.children[sub_column_idx]; + RETURN_IF_ERROR(_simdjson_write_data_to_column( + sub.value(), sub_col_type, sub_column_ptr, column_name + "." + sub_key, + sub_serdes[sub_column_idx], valid)); } - auto value_str = simdjson::to_json_string(value).value(); - nullable_column->get_null_map_data().push_back(0); - column_string->insert_data(value_str.data(), value_str.length()); - } + + //fill missing subcolumn + for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) { + if (has_value[sub_col_idx] == true) { + continue; + } + + auto sub_column_ptr = struct_column_ptr->get_column(sub_col_idx).get_ptr(); + if (sub_column_ptr->is_nullable()) { + sub_column_ptr->insert_default(); + continue; + } else [[unlikely]] { + return Status::DataQualityError( + "Json file structColumn miss field {} and this column isn't nullable.", + column_name + "." 
+ type_desc.field_names[sub_col_idx]); + } + } + } else if (type_desc.type == TYPE_MAP) { + if (value.type() != simdjson::ondemand::json_type::object) [[unlikely]] { + return Status::DataQualityError("Json value isn't object, but the column `{}` is map.", + column_name); + } + simdjson::ondemand::object object_value = value.get_object(); + + auto sub_serdes = data_serde->get_nested_serdes(); + auto map_column_ptr = assert_cast<ColumnMap*>(data_column_ptr); + + size_t field_count = 0; + for (simdjson::ondemand::field member_value : object_value) { + auto f = [](std::string_view key_view, const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, DataTypeSerDeSPtr serde, + vectorized::DataTypeSerDe::FormatOptions serde_options, bool* valid) { + auto data_column_ptr = column_ptr; + auto data_serde = serde; + if (column_ptr->is_nullable()) { + auto nullable_column = static_cast<ColumnNullable*>(column_ptr); + + nullable_column->get_null_map_data().push_back(0); + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + } + Slice slice(key_view.data(), key_view.length()); + + RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, + serde_options)); + return Status::OK(); + }; + + RETURN_IF_ERROR(f(member_value.unescaped_key(), type_desc.children[0], + map_column_ptr->get_keys_ptr()->assume_mutable()->get_ptr(), + sub_serdes[0], _serde_options, valid)); + + simdjson::ondemand::value field_value = member_value.value(); + RETURN_IF_ERROR(_simdjson_write_data_to_column( + field_value, type_desc.children[1], + map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(), + column_name + ".value", sub_serdes[1], valid)); + field_count++; + } + + auto& offsets = map_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + field_count); + + } else if (type_desc.type == TYPE_ARRAY) { + if (value.type() != simdjson::ondemand::json_type::array) [[unlikely]] { + return Status::DataQualityError("Json value isn't array, but the column `{}` is array.", + column_name); + } + + simdjson::ondemand::array array_value = value.get_array(); + + auto sub_serdes = data_serde->get_nested_serdes(); + auto array_column_ptr = assert_cast<ColumnArray*>(data_column_ptr); + + int field_count = 0; + for (simdjson::ondemand::value sub_value : array_value) { + RETURN_IF_ERROR(_simdjson_write_data_to_column( + sub_value, type_desc.children[0], array_column_ptr->get_data().get_ptr(), + column_name + ".element", sub_serdes[0], valid)); + field_count++; + } + auto& offsets = array_column_ptr->get_offsets(); + offsets.emplace_back(offsets.back() + field_count); + + } else { + return Status::InternalError("Not support load to complex column."); } *valid = true; return Status::OK(); @@ -1750,13 +2023,14 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( has_valid_value = true; } else if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) { // not match in jsondata, filling with default value - RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid)); if (!(*valid)) { return Status::OK(); } } else { - RETURN_IF_ERROR( - _simdjson_write_data_to_column(json_value, slot_desc, column_ptr, valid)); + RETURN_IF_ERROR(_simdjson_write_data_to_column(json_value, slot_desc->type(), + column_ptr, slot_desc->col_name(), + _serdes[i], valid)); if (!(*valid)) { return Status::OK(); } @@ -1814,25 +2088,30 @@ Status 
NewJsonReader::_get_column_default_value( return Status::OK(); } -Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, IColumn* column_ptr, - bool* valid) { - if (slot_desc->is_nullable()) { - auto* nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); - column_ptr = &nullable_column->get_nested_column(); - auto col_value = _col_default_value_map.find(slot_desc->col_name()); - if (col_value == _col_default_value_map.end()) { +Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde, + IColumn* column_ptr, bool* valid) { + auto col_value = _col_default_value_map.find(slot_desc->col_name()); + if (col_value == _col_default_value_map.end()) { + if (slot_desc->is_nullable()) { + auto* nullable_column = static_cast<ColumnNullable*>(column_ptr); nullable_column->insert_default(); } else { - const std::string& v_str = col_value->second; - nullable_column->get_null_map_data().push_back(0); - assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), v_str.size()); + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + nullptr, "The column `{}` is not nullable, but it's not found in jsondata.", + slot_desc->col_name(), valid)); + } else { + return Status::DataQualityError( + "The column `{}` is not nullable, but it's not found in jsondata.", + slot_desc->col_name()); + } } } else { - RETURN_IF_ERROR(_append_error_msg( - nullptr, "The column `{}` is not nullable, but it's not found in jsondata.", - slot_desc->col_name(), valid)); + const std::string& v_str = col_value->second; + Slice column_default_value {v_str}; + RETURN_IF_ERROR(serde->deserialize_one_cell_from_json(*column_ptr, column_default_value, + _serde_options)); } - *valid = true; return Status::OK(); } diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h index 985bf999676..482a1ced747 100644 --- a/be/src/vec/exec/format/json/new_json_reader.h +++ b/be/src/vec/exec/format/json/new_json_reader.h @@ -87,7 +87,8 @@ public: ~NewJsonReader() override = default; Status init_reader(const std::unordered_map<std::string, vectorized::VExprContextSPtr>& - col_default_value_ctx); + col_default_value_ctx, + bool is_load); Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; Status get_columns(std::unordered_map<std::string, TypeDescriptor>* name_to_type, std::unordered_set<std::string>* missing_cols) override; @@ -128,7 +129,8 @@ private: const std::vector<SlotDescriptor*>& slot_descs, bool* valid); Status _write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr, + const TypeDescriptor& type_desc, vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid); Status _write_columns_by_jsonpath(rapidjson::Value& objectValue, @@ -177,8 +179,10 @@ private: const std::vector<SlotDescriptor*>& slot_descs, bool* valid); Status _simdjson_write_data_to_column(simdjson::ondemand::value& value, - SlotDescriptor* slot_desc, - vectorized::IColumn* column_ptr, bool* valid); + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, + bool* valid); Status _simdjson_write_columns_by_jsonpath(simdjson::ondemand::object* value, const std::vector<SlotDescriptor*>& slot_descs, @@ -196,8 +200,8 @@ private: const std::unordered_map<std::string, vectorized::VExprContextSPtr>& col_default_value_ctx); - Status _fill_missing_column(SlotDescriptor* 
slot_desc, vectorized::IColumn* column_ptr, - bool* valid); + Status _fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde, + vectorized::IColumn* column_ptr, bool* valid); // fe will add skip_bitmap_col to _file_slot_descs iff the target olap table has skip_bitmap_col // and the current load is a flexible partial update @@ -292,6 +296,22 @@ private: std::unordered_map<std::string, std::string> _col_default_value_map; int32_t skip_bitmap_col_idx {-1}; + + bool _is_load = true; + //Used to indicate whether it is a stream load. When loading, only data will be inserted into columnString. + //If an illegal value is encountered during the load process, `_append_error_msg` should be called + //instead of directly returning `Status::DataQualityError` + + bool _is_hive_table = false; + // In hive : create table xxx ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; + // Hive will not allow you to create columns with the same name but different case, including field names inside + // structs, and will automatically convert uppercase names in create sql to lowercase.However, when Hive loads data + // to table, the column names in the data may be uppercase,and there may be multiple columns with + // the same name but different capitalization.We refer to the behavior of hive, convert all column names + // in the data to lowercase,and use the last one as the insertion value + + DataTypeSerDeSPtrs _serdes; + vectorized::DataTypeSerDe::FormatOptions _serde_options; }; } // namespace vectorized diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 982ac6072dc..3053adebbb5 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -965,8 +965,8 @@ Status VFileScanner::_get_next_reader() { _cur_reader = NewJsonReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, &_scanner_eof, _io_ctx.get()); - init_status = - ((NewJsonReader*)(_cur_reader.get()))->init_reader(_col_default_value_ctx); + init_status = ((NewJsonReader*)(_cur_reader.get())) + ->init_reader(_col_default_value_ctx, _is_load); break; } case TFileFormatType::FORMAT_AVRO: { diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql new file mode 100644 index 00000000000..adf0f7d56b2 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql @@ -0,0 +1,35 @@ +use `default`; + + +CREATE TABLE json_nested_complex_table ( + user_ID STRING, + user_PROFILE STRUCT< + name: STRING, + AGE: INT, + preferences: MAP< + STRING, + STRUCT< + preference_ID: INT, + preference_VALUES: ARRAY<STRING> + > + > + >, + activity_LOG ARRAY< + STRUCT< + activity_DATE: STRING, + activities: MAP< + STRING, + STRUCT< + `DETAILS`: STRING, + metrics: MAP<STRING, float> + > + > + > + > +) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' + +LOCATION + '/user/doris/preinstalled_data/json/json_nested_complex_table'; + + +msck repair table json_nested_complex_table; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql new file mode 100644 index 00000000000..73df8cba557 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql @@ -0,0 +1,73 @@ +use `default`; + + +CREATE TABLE 
json_all_complex_types ( + `id` int, + `boolean_col` boolean, + `tinyint_col` tinyint, + `smallint_col` smallint, + `int_col` int, + `bigint_col` bigint, + `float_col` float, + `double_col` double, + `decimal_col1` decimal(9,0), + `decimal_col2` decimal(8,4), + `decimal_col3` decimal(18,6), + `decimal_col4` decimal(38,12), + `string_col` string, + `binary_col` binary, + `date_col` date, + `timestamp_col1` timestamp, + `timestamp_col2` timestamp, + `timestamp_col3` timestamp, + `char_col1` char(50), + `char_col2` char(100), + `char_col3` char(255), + `varchar_col1` varchar(50), + `varchar_col2` varchar(100), + `varchar_col3` varchar(255), + `t_map_string` map<string,string>, + `t_map_varchar` map<varchar(65535),varchar(65535)>, + `t_map_char` map<char(10),char(10)>, + `t_map_int` map<int,int>, + `t_map_bigint` map<bigint,bigint>, + `t_map_float` map<float,float>, + `t_map_double` map<double,double>, + `t_map_boolean` map<boolean,boolean>, + `t_map_decimal_precision_2` map<decimal(2,1),decimal(2,1)>, + `t_map_decimal_precision_4` map<decimal(4,2),decimal(4,2)>, + `t_map_decimal_precision_8` map<decimal(8,4),decimal(8,4)>, + `t_map_decimal_precision_17` map<decimal(17,8),decimal(17,8)>, + `t_map_decimal_precision_18` map<decimal(18,8),decimal(18,8)>, + `t_map_decimal_precision_38` map<decimal(38,16),decimal(38,16)>, + `t_array_string` array<string>, + `t_array_int` array<int>, + `t_array_bigint` array<bigint>, + `t_array_float` array<float>, + `t_array_double` array<double>, + `t_array_boolean` array<boolean>, + `t_array_varchar` array<varchar(65535)>, + `t_array_char` array<char(10)>, + `t_array_decimal_precision_2` array<decimal(2,1)>, + `t_array_decimal_precision_4` array<decimal(4,2)>, + `t_array_decimal_precision_8` array<decimal(8,4)>, + `t_array_decimal_precision_17` array<decimal(17,8)>, + `t_array_decimal_precision_18` array<decimal(18,8)>, + `t_array_decimal_precision_38` array<decimal(38,16)>, + `t_struct_bigint` struct<s_bigint:bigint>, + `t_complex` map<string,array<struct<s_int:int>>>, + `t_struct_nested` struct<struct_field:array<string>>, + `t_struct_null` struct<struct_field_null:string,struct_field_null2:string>, + `t_struct_non_nulls_after_nulls` struct<struct_non_nulls_after_nulls1:int,struct_non_nulls_after_nulls2:string>, + `t_nested_struct_non_nulls_after_nulls` struct<struct_field1:int,struct_field2:string,strict_field3:struct<nested_struct_field1:int,nested_struct_field2:string>>, + `t_map_null_value` map<string,string>, + `t_array_string_starting_with_nulls` array<string>, + `t_array_string_with_nulls_in_between` array<string>, + `t_array_string_ending_with_nulls` array<string>, + `t_array_string_all_nulls` array<string> + ) PARTITIONED BY (`dt` string) +ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' +LOCATION + '/user/doris/preinstalled_data/json/json_all_complex_types'; + +msck repair table json_all_complex_types; diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql new file mode 100644 index 00000000000..ec99e72d2f5 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql @@ -0,0 +1,13 @@ +use `default`; + + +CREATE TABLE json_load_data_table ( + `id` int, + `col1` int, + `col2` struct< col2a:int, col2b:string>, + `col3` map<int,string> +) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' +LOCATION + '/user/doris/preinstalled_data/json/json_load_data_table'; + 
+msck repair table json_load_data_table; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 new file mode 100644 index 00000000000..5fe37cbc6f0 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 @@ -0,0 +1,3 @@ +{"id":1,"boolean_col":true,"tinyint_col":127,"smallint_col":32767,"int_col":2147483647,"bigint_col":9223372036854775807,"float_col":123.45,"double_col":123456.789,"decimal_col1":123456789,"decimal_col2":1234.5678,"decimal_col3":123456.789012,"decimal_col4":123456789.012345678901,"string_col":"string_value","binary_col":"binary_value","date_col":"2024-03-20","timestamp_col1":"2024-03-20 12:00:00","timestamp_col2":"2024-03-20 12:00:00.123456789","timestamp_col3":"2024-03-20 12:00:00.123456 [...] +{"id":2,"boolean_col":false,"tinyint_col":58,"smallint_col":12345,"int_col":2147483000,"bigint_col":null,"float_col":789.56,"double_col":654321.123,"decimal_col1":987654321,"decimal_col2":5678.1234,"decimal_col3":987654.321098,"decimal_col4":987654321.098765432109,"string_col":"changed_string","binary_col":"new_binary_value","date_col":"2025-05-25","timestamp_col1":"2025-05-25 15:30:00","timestamp_col2":"2025-05-25 15:30:00.654321987","timestamp_col3":"2025-05-25 15:30:00.654321987","cha [...] +{"id":3,"boolean_col":false,"tinyint_col":-128,"smallint_col":-32768,"int_col":-2147483648,"bigint_col":-9223372036854775808,"float_col":-3.4028235E38,"double_col":-1.7976931348623157E308,"decimal_col1":-999999999,"decimal_col2":-9999.9999,"decimal_col3":-999999999.999999,"decimal_col4":null,"string_col":"min_string_value","binary_col":"xxxx","date_col":"2001-01-01","timestamp_col1":"2001-01-01 00:00:00","timestamp_col2":"2001-01-01 00:00:00","timestamp_col3":"2001-01-01 00:00:00","char_ [...] diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 new file mode 100644 index 00000000000..0a823bee693 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 @@ -0,0 +1 @@ +{"id":4,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":123.45,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map [...] 
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 new file mode 100644 index 00000000000..a5e46399fdd --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 @@ -0,0 +1,2 @@ +{"id":5,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":null,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_c [...] +{"id":6,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":null,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_c [...] diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 new file mode 100644 index 00000000000..70d1265f98d --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 @@ -0,0 +1,13 @@ +{"id":1,"col1":10,"col2":{"col2a":10,"col2b":"string1"},"col3":{"1":"string10"}} +{"id":2,"col1":10,"col1":20,"col2":{"col2b":"string2","col2a":0,"Col2A":20},"col3":{"2":"string2"}} +{"id":3,"col1":10,"col1":20,"COL1":30,"COL2":{"col2a":30,"col2b":"string3"}} +{"id":4,"COL1":40,"col2":{"col2a":10,"col2b":"string4","new_col":"new_val","col2a":40},"col3":{"4":"string4"}} +{"id":5} +{"id":6,"col1":60,"col2":{"COL2a":60,"col2b":600},"col3":{"6":600}} +{"id":7,"col1":70,"col3":{"7":"string7"},"col2":{"col2b":"string7","col2a":70}} + + + + +{} +{"a":5} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 new file mode 100644 index 00000000000..11342c441bc --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 @@ -0,0 +1,2 @@ +{"user_id":"user1","user_profile":{"name":"Alice","age":28,"preferences":{"sports":{"preference_id":101,"preference_values":["soccer","tennis"]},"music":{"preference_id":102,"preference_values":["rock","classical"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"workout":{"details":"Morning run","metrics":{"duration":30.5,"calories":200.0}},"reading":{"details":"Read book on Hive","metrics":{"pages":50.0,"time":2.0}}}},{"activity_date":"2024-08-02","activities":{"travel": [...] 
+{"user_id":"user2","user_profile":{"name":"Bob","age":32,"preferences":{"books":{"preference_id":201,"preference_values":["fiction","non-fiction"]},"travel":{"preference_id":202,"preference_values":["beaches","mountains"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"hiking":{"details":"Mountain trail","metrics":{"distance":10.0,"elevation":500.0}},"photography":{"details":"Wildlife photoshoot","metrics":{"photos_taken":100.0,"time":4.0}}}},{"activity_date":"2024-08-02" [...] diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 new file mode 100644 index 00000000000..e1b0befc7bc --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 @@ -0,0 +1 @@ +{"user_id":"user3","user_profile":{"name":"Carol","age":24,"preferences":{"food":{"preference_id":301,"preference_values":["vegan","desserts"]},"movies":{"preference_id":302,"preference_values":["action","comedy"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"cooking":{"details":"Made vegan meal","metrics":{"time_spent":1.5,"calories":500.0}},"movie":{"details":"Watched action movie","metrics":{"duration":2.0,"rating":8.5}}}},{"activity_date":"2024-08-02","activities":{ [...] diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 new file mode 100644 index 00000000000..08f1586f3aa --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 @@ -0,0 +1,2 @@ +{"user_ID":"user4","user_PROFILE":{"name":"Carol","age":24,"preferences":{"food":{"preference_ID":301,"preference_VALUES":["vegan","desserts"]},"movies":{"preference_ID":302,"preference_VALUES":["action","comedy"]}}},"activity_LOG":[{"activity_DATE":"2024-08-01","activities":{"cooking":{"DETAILS":"Made vegan meal","metrics":{"time_spent":1.5,"calories":500.0}},"movie":{"DETAILS":"Watched action movie","metrics":{"duration":2.0,"rating":8.5}}}},{"activity_DATE":"2024-08-02","activities":{ [...] 
+ diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 97032467cec..0f839d238b2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -94,6 +94,9 @@ public class HiveMetaStoreClientHelper { private static final Pattern digitPattern = Pattern.compile("(\\d+)"); + public static final String HIVE_JSON_SERDE = "org.apache.hive.hcatalog.data.JsonSerDe"; + public static final String LEGACY_HIVE_JSON_SERDE = "org.apache.hadoop.hive.serde2.JsonSerDe"; + public enum HiveFileFormat { TEXT_FILE(0, "text"), PARQUET(1, "parquet"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index e710bdb935d..3a2a4d3eb5c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -364,14 +364,21 @@ public class HiveScanNode extends FileQueryScanNode { @Override public TFileFormatType getFileFormatType() throws UserException { TFileFormatType type = null; - String inputFormatName = hmsTable.getRemoteTable().getSd().getInputFormat(); + Table table = hmsTable.getRemoteTable(); + String inputFormatName = table.getSd().getInputFormat(); String hiveFormat = HiveMetaStoreClientHelper.HiveFileFormat.getFormat(inputFormatName); if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.PARQUET.getDesc())) { type = TFileFormatType.FORMAT_PARQUET; } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.ORC.getDesc())) { type = TFileFormatType.FORMAT_ORC; } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.TEXT_FILE.getDesc())) { - type = TFileFormatType.FORMAT_CSV_PLAIN; + String serDeLib = table.getSd().getSerdeInfo().getSerializationLib(); + if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE) + || serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) { + type = TFileFormatType.FORMAT_JSON; + } else { + type = TFileFormatType.FORMAT_CSV_PLAIN; + } } return type; } @@ -383,11 +390,12 @@ public class HiveScanNode extends FileQueryScanNode { @Override protected TFileAttributes getFileAttributes() throws UserException { - TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); + TFileAttributes fileAttributes = new TFileAttributes(); Table table = hmsTable.getRemoteTable(); // TODO: separate hive text table and OpenCsv table String serDeLib = table.getSd().getSerdeInfo().getSerializationLib(); if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) { + TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); // set properties of LazySimpleSerDe // 1. set column separator textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table)); @@ -401,7 +409,10 @@ public class HiveScanNode extends FileQueryScanNode { HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0])); // 6. 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index e710bdb935d..3a2a4d3eb5c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -364,14 +364,21 @@ public class HiveScanNode extends FileQueryScanNode {
     @Override
     public TFileFormatType getFileFormatType() throws UserException {
         TFileFormatType type = null;
-        String inputFormatName = hmsTable.getRemoteTable().getSd().getInputFormat();
+        Table table = hmsTable.getRemoteTable();
+        String inputFormatName = table.getSd().getInputFormat();
         String hiveFormat = HiveMetaStoreClientHelper.HiveFileFormat.getFormat(inputFormatName);
         if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.PARQUET.getDesc())) {
             type = TFileFormatType.FORMAT_PARQUET;
         } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.ORC.getDesc())) {
             type = TFileFormatType.FORMAT_ORC;
         } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.TEXT_FILE.getDesc())) {
-            type = TFileFormatType.FORMAT_CSV_PLAIN;
+            String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
+            if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE)
+                    || serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) {
+                type = TFileFormatType.FORMAT_JSON;
+            } else {
+                type = TFileFormatType.FORMAT_CSV_PLAIN;
+            }
         }
         return type;
     }
@@ -383,11 +390,12 @@ public class HiveScanNode extends FileQueryScanNode {
 
     @Override
     protected TFileAttributes getFileAttributes() throws UserException {
-        TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
+        TFileAttributes fileAttributes = new TFileAttributes();
         Table table = hmsTable.getRemoteTable();
         // TODO: separate hive text table and OpenCsv table
         String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
         if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
+            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
             // set properties of LazySimpleSerDe
             // 1. set column separator
             textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
@@ -401,7 +409,10 @@
             HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
             // 6. set null format
             textParams.setNullFormat(HiveProperties.getNullFormat(table));
+            fileAttributes.setTextParams(textParams);
+            fileAttributes.setHeaderType("");
         } else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
+            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
             // set properties of OpenCSVSerde
             // 1. set column separator
             textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
@@ -411,17 +422,29 @@
             textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
             // 4. set escape char
             textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
+            fileAttributes.setTextParams(textParams);
+            fileAttributes.setHeaderType("");
+            if (textParams.isSetEnclose()) {
+                fileAttributes.setTrimDoubleQuotes(true);
+            }
+        } else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe")) {
+            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
+            textParams.setColumnSeparator("\t");
+            textParams.setLineDelimiter("\n");
+            fileAttributes.setTextParams(textParams);
+
+            fileAttributes.setJsonpaths("");
+            fileAttributes.setJsonRoot("");
+            fileAttributes.setNumAsString(true);
+            fileAttributes.setFuzzyParse(false);
+            fileAttributes.setReadJsonByLine(true);
+            fileAttributes.setStripOuterArray(false);
+            fileAttributes.setHeaderType("");
         } else {
             throw new UserException(
                     "unsupported hive table serde: " + serDeLib);
         }
-        TFileAttributes fileAttributes = new TFileAttributes();
-        fileAttributes.setTextParams(textParams);
-        fileAttributes.setHeaderType("");
-        if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) {
-            fileAttributes.setTrimDoubleQuotes(true);
-        }
         return fileAttributes;
     }
diff --git a/regression-test/data/external_table_p0/hive/hive_json_basic_test.out b/regression-test/data/external_table_p0/hive/hive_json_basic_test.out
new file mode 100644
index 00000000000..9023f5d72b1
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/hive_json_basic_test.out
@@ -0,0 +1,115 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !q1 --
+1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 [...]
+2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 [...]
+3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 [...]
+4 \N \N \N \N \N 123.45 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N {1:10} \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N [1.2345, 2.3456] \N \N \N \N \N \N \N \N \N \N [null, "value1", "value2"] \N \N \N dt2 +5 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 +6 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 + +-- !q2 -- +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 [...] + +-- !q3 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 [...] + +-- !q4 -- +123.45 +789.56 +-3.4028235e+38 +123.45 + +-- !q5 -- +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 [...] + +-- !q6 -- +user1 {"name":"Alice", "age":28, "preferences":{"sports":{"preference_id":101, "preference_values":["soccer", "tennis"]}, "music":{"preference_id":102, "preference_values":["rock", "classical"]}}} [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metric [...] +user2 {"name":"Bob", "age":32, "preferences":{"books":{"preference_id":201, "preference_values":["fiction", "non-fiction"]}, "travel":{"preference_id":202, "preference_values":["beaches", "mountains"]}}} [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details" [...] +user3 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...] +user4 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...] 
+ +-- !q7 -- +user1 [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}] +user2 [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}] +user3 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] +user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}] + +-- !q8 -- +\N \N \N \N +\N \N \N \N +1 10 {"col2a":10, "col2b":"string1"} {1:"string10"} +2 20 {"col2a":20, "col2b":"string2"} {2:"string2"} +3 30 {"col2a":30, "col2b":"string3"} \N +4 40 {"col2a":40, "col2b":"string4"} {4:"string4"} +5 \N \N \N +6 60 {"col2a":60, "col2b":"600"} {6:"600"} +7 70 {"col2a":70, "col2b":"string7"} {7:"string7"} + +-- !q9 -- +\N \N +\N \N +\N 5 +10 1 +20 2 +30 3 +40 4 +60 6 +70 7 + +-- !q1 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 [...] +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 [...] +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 [...] 
+4 \N \N \N \N \N 123.45 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N {1:10} \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N [1.2345, 2.3456] \N \N \N \N \N \N \N \N \N \N [null, "value1", "value2"] \N \N \N dt2 +5 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 +6 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N dt3 + +-- !q2 -- +3 false -128 -32768 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157E308 -999999999 -9999.9999 -999999999.999999 \N min_string_value xxxx 2001-01-01 2001-01-01T00:00 2001-01-01T00:00 2001-01-01T00:00 char_min_value1 char_min_value2 char_min_value3 [...] + +-- !q3 -- +1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 [...] + +-- !q4 -- +123.45 +789.56 +-3.4028235e+38 +123.45 + +-- !q5 -- +2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 [...] + +-- !q6 -- +user1 {"name":"Alice", "age":28, "preferences":{"sports":{"preference_id":101, "preference_values":["soccer", "tennis"]}, "music":{"preference_id":102, "preference_values":["rock", "classical"]}}} [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metric [...] +user2 {"name":"Bob", "age":32, "preferences":{"books":{"preference_id":201, "preference_values":["fiction", "non-fiction"]}, "travel":{"preference_id":202, "preference_values":["beaches", "mountains"]}}} [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details" [...] +user3 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...] +user4 {"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}} [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...] 
+
+-- !q7 --
+user1 [{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}]
+user2 [{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}]
+user3 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]
+user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]
+
+-- !q8 --
+\N \N \N \N
+\N \N \N \N
+1 10 {"col2a":10, "col2b":"string1"} {1:"string10"}
+2 20 {"col2a":20, "col2b":"string2"} {2:"string2"}
+3 30 {"col2a":30, "col2b":"string3"} \N
+4 40 {"col2a":40, "col2b":"string4"} {4:"string4"}
+5 \N \N \N
+6 60 {"col2a":60, "col2b":"600"} {6:"600"}
+7 70 {"col2a":70, "col2b":"string7"} {7:"string7"}
+
+-- !q9 --
+\N \N
+\N \N
+\N 5
+10 1
+20 2
+30 3
+40 4
+60 6
+70 7
+
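Each `-- !qN --` block above is the expected result of the correspondingly named `qt_qN` statement in the suite below; the whole block appears twice because the suite loops over both hive2 and hive3. Outside the regression framework, the same tables can be queried over Doris's MySQL-protocol endpoint with plain JDBC. A hedged sketch, assuming a local FE on the default query port 9030, a passwordless root user, MySQL Connector/J on the classpath, and the catalog name the suite creates for the hive2 run:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

// Ad-hoc check of the json_load_data_table results via JDBC; connection
// details are assumptions, the catalog and table names come from the suite.
public class HiveJsonAdHocQuery {
    public static void main(String[] args) throws Exception {
        String url = "jdbc:mysql://127.0.0.1:9030/"; // assumed FE query port
        try (Connection conn = DriverManager.getConnection(url, "root", "");
             Statement stmt = conn.createStatement()) {
            stmt.execute("SWITCH hive2_hive_json_basic_test"); // catalog created by the suite
            stmt.execute("USE `default`");
            try (ResultSet rs = stmt.executeQuery(
                    "SELECT col1, id FROM json_load_data_table ORDER BY id")) {
                while (rs.next()) {
                    System.out.println(rs.getString(1) + "\t" + rs.getString(2));
                }
            }
        }
    }
}
```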
diff --git a/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy b/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy
new file mode 100644
index 00000000000..9d05e1a4c74
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("hive_json_basic_test", "p0,external,hive,external_docker,external_docker_hive") {
+
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return
+    }
+
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        String catalog_name = "${hivePrefix}_hive_json_basic_test"
+        try {
+            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+            String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+            String broker_name = "hdfs"
+
+            sql """drop catalog if exists ${catalog_name}"""
+            sql """create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hive.metastore.uris'='thrift://${externalEnvIp}:${hms_port}'
+            );"""
+            sql """use `${catalog_name}`.`default`"""
+
+            String tb1 = """json_all_complex_types"""
+            String tb2 = """json_nested_complex_table"""
+            String tb3 = """json_load_data_table"""
+
+            def tables = sql """ show tables """
+            logger.info("tables = ${tables}")
+
+            qt_q1 """ select * from ${tb1} order by id """
+            qt_q2 """ select * from ${tb1} where tinyint_col < 0 order by id """
+            qt_q3 """ select * from ${tb1} where bigint_col > 0 order by id """
+            qt_q4 """ select float_col from ${tb1} where float_col is not null order by id """
+            qt_q5 """ select * from ${tb1} where id = 2 order by id """
+
+            qt_q6 """ select * from ${tb2} order by user_id"""
+            qt_q7 """ select user_id,activity_log from ${tb2} order by user_id"""
+
+            order_qt_q8 """ select * from ${tb3} order by id """
+            order_qt_q9 """ select col1,id from ${tb3} order by id """
+        } finally {
+            // clean up the catalog even when a statement above fails
+            sql """drop catalog if exists ${catalog_name}"""
+        }
+    }
+}

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org