hubgeter commented on code in PR #43469: URL: https://github.com/apache/doris/pull/43469#discussion_r1836001942
########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -756,7 +770,23 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl _append_empty_skip_bitmap_value(block, cur_row_count); } - for (auto* slot_desc : slot_descs) { + if (_is_hive_table) { + //don't like _fuzzy_parse,each line read in must modify name_map once. + for (auto* v : slot_descs) { Review Comment: Here we not only need to deal with the same column names, but also the order of the table column names and the json data. I think I can simplify this double for loop by using a layer of for loop plus map. ########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -840,79 +871,145 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + vectorized::DataTypeSerDe::FormatOptions _options; Review Comment: yeah , i will fix it. ########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -840,79 +871,145 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + vectorized::DataTypeSerDe::FormatOptions _options; + + bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType); + + if (column_ptr->is_nullable()) { nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. - if (value->GetType() != rapidjson::Type::kNullType) { + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + if (value_is_null) { + nullable_column->insert_default(); + return Status::OK(); + } else { nullable_column->get_null_map_data().push_back(0); + } + + } else if (value_is_null) [[unlikely]] { + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + *value, "Json value is null, but the column `{}` is not nullable.", column_name, + valid)); + return Status::OK(); + } else { - nullable_column->insert_default(); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - column_ptr = &nullable_column->get_nested_column(); } - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = value->GetStringLength(); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else if (value->IsFloat() || value->IsDouble()) { - auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); - wbytes = end - tmp_buf; + if (_is_load || !type_desc.is_complex_type()) { + if (value->IsString()) { + Slice slice {value->GetString(), value->GetStringLength()}; + RETURN_IF_ERROR( + data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options)); + } else { - return Status::InternalError<false>("It should not here."); + // Maybe we can `switch (value->GetType()) case: kNumberType`. + // Note that `if (value->IsInt())`, but column is FloatColumn. + auto json_str = NewJsonReader::_print_json_value(*value); + Slice slice {json_str.c_str(), json_str.size()}; + RETURN_IF_ERROR( + data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options)); } - str_value = tmp_buf; - break; - case rapidjson::Type::kFalseType: - wbytes = 1; - str_value = (char*)"0"; - break; - case rapidjson::Type::kTrueType: - wbytes = 1; - str_value = (char*)"1"; - break; - case rapidjson::Type::kNullType: - if (!slot_desc->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - *value, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); - return Status::OK(); + } else if (type_desc.type == TYPE_STRUCT) { Review Comment: i need check `_is_load` and `type_desc` , It seems that there is no way to use `switch`. ########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -840,79 +871,145 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + vectorized::DataTypeSerDe::FormatOptions _options; + + bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType); + + if (column_ptr->is_nullable()) { nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. - if (value->GetType() != rapidjson::Type::kNullType) { + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + if (value_is_null) { + nullable_column->insert_default(); + return Status::OK(); + } else { nullable_column->get_null_map_data().push_back(0); + } + + } else if (value_is_null) [[unlikely]] { + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + *value, "Json value is null, but the column `{}` is not nullable.", column_name, + valid)); + return Status::OK(); + } else { - nullable_column->insert_default(); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - column_ptr = &nullable_column->get_nested_column(); } - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = value->GetStringLength(); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else if (value->IsFloat() || value->IsDouble()) { - auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); - wbytes = end - tmp_buf; + if (_is_load || !type_desc.is_complex_type()) { + if (value->IsString()) { + Slice slice {value->GetString(), value->GetStringLength()}; + RETURN_IF_ERROR( + data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options)); + } else { - return Status::InternalError<false>("It should not here."); + // Maybe we can `switch (value->GetType()) case: kNumberType`. Review Comment: I'm not sure if using `switch (value->GetType())` to get the string behavior would improve performance, but I might give it a try. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org