morningman commented on code in PR #43469: URL: https://github.com/apache/doris/pull/43469#discussion_r1835707840
########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -840,79 +871,145 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + vectorized::DataTypeSerDe::FormatOptions _options; Review Comment: Looks like this `_options` can be created only once at reuse it in each call. ########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -840,79 +871,145 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + vectorized::DataTypeSerDe::FormatOptions _options; + + bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType); + + if (column_ptr->is_nullable()) { nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. - if (value->GetType() != rapidjson::Type::kNullType) { + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + if (value_is_null) { + nullable_column->insert_default(); + return Status::OK(); + } else { nullable_column->get_null_map_data().push_back(0); + } + + } else if (value_is_null) [[unlikely]] { + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + *value, "Json value is null, but the column `{}` is not nullable.", column_name, + valid)); + return Status::OK(); + } else { - nullable_column->insert_default(); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - column_ptr = &nullable_column->get_nested_column(); } - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = value->GetStringLength(); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else if (value->IsFloat() || value->IsDouble()) { - auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); - wbytes = end - tmp_buf; + if (_is_load || !type_desc.is_complex_type()) { + if (value->IsString()) { + Slice slice {value->GetString(), value->GetStringLength()}; + RETURN_IF_ERROR( + data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options)); + } else { - return Status::InternalError<false>("It should not here."); + // Maybe we can `switch (value->GetType()) case: kNumberType`. + // Note that `if (value->IsInt())`, but column is FloatColumn. + auto json_str = NewJsonReader::_print_json_value(*value); + Slice slice {json_str.c_str(), json_str.size()}; + RETURN_IF_ERROR( + data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options)); } - str_value = tmp_buf; - break; - case rapidjson::Type::kFalseType: - wbytes = 1; - str_value = (char*)"0"; - break; - case rapidjson::Type::kTrueType: - wbytes = 1; - str_value = (char*)"1"; - break; - case rapidjson::Type::kNullType: - if (!slot_desc->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - *value, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); - return Status::OK(); + } else if (type_desc.type == TYPE_STRUCT) { Review Comment: Can we use `switch ... case` for following logic? ########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -756,7 +770,23 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl _append_empty_skip_bitmap_value(block, cur_row_count); } - for (auto* slot_desc : slot_descs) { + if (_is_hive_table) { + //don't like _fuzzy_parse,each line read in must modify name_map once. + for (auto* v : slot_descs) { Review Comment: I think it will seriously impact the performance. What if we set a session variable to control it? And by default, in most cases, there is no duplicate column name in json value. ########## be/src/vec/exec/format/json/new_json_reader.cpp: ########## @@ -840,79 +871,145 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl } Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, IColumn* column_ptr, + const TypeDescriptor& type_desc, + vectorized::IColumn* column_ptr, + const std::string& column_name, DataTypeSerDeSPtr serde, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { + vectorized::IColumn* data_column_ptr = column_ptr; + DataTypeSerDeSPtr data_serde = serde; + vectorized::DataTypeSerDe::FormatOptions _options; + + bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType); + + if (column_ptr->is_nullable()) { nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. - if (value->GetType() != rapidjson::Type::kNullType) { + data_column_ptr = nullable_column->get_nested_column().get_ptr(); + data_serde = serde->get_nested_serdes()[0]; + + if (value_is_null) { + nullable_column->insert_default(); + return Status::OK(); + } else { nullable_column->get_null_map_data().push_back(0); + } + + } else if (value_is_null) [[unlikely]] { + if (_is_load) { + RETURN_IF_ERROR(_append_error_msg( + *value, "Json value is null, but the column `{}` is not nullable.", column_name, + valid)); + return Status::OK(); + } else { - nullable_column->insert_default(); + return Status::DataQualityError( + "Json value is null, but the column `{}` is not nullable.", column_name); } - column_ptr = &nullable_column->get_nested_column(); } - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = value->GetStringLength(); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else if (value->IsFloat() || value->IsDouble()) { - auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble()); - wbytes = end - tmp_buf; + if (_is_load || !type_desc.is_complex_type()) { + if (value->IsString()) { + Slice slice {value->GetString(), value->GetStringLength()}; + RETURN_IF_ERROR( + data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options)); + } else { - return Status::InternalError<false>("It should not here."); + // Maybe we can `switch (value->GetType()) case: kNumberType`. Review Comment: Why not using `switch (value->GetType())`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org